split/join redesign

This commit is contained in:
Blaise Tine 2023-07-02 16:50:59 -04:00
parent ebf104de1b
commit 34206598e7
23 changed files with 555 additions and 483 deletions

View file

@ -37,11 +37,11 @@ then
#make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean
CONFIGS="-DFLEN_64 -DFPU_FPNEW" make -C sim/rtlsim
CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean
CONFIGS="-DFLEN_64 -DFPU_DPI" make -C sim/rtlsim
CONFIGS="-DEXT_D_ENABLE -DFPU_DPI" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64d
fi
@ -170,7 +170,7 @@ CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABL
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-ttriangle.cgltrace -rtriangle_ref_8.png -w8 -h8" --warps=1 --threads=2 --debug=3
CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-ttriangle.cgltrace -rtriangle_ref_8.png -w8 -h8" --warps=1 --threads=2 --debug=3
CONFIGS="-DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-tvase.cgltrace -rvase_ref_32.png -w32 -h32" --threads=1
CONFIGS="-DEXT_GFX_ENABLE -DIPDOM_STACK_SIZE=128" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-x -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE -DPD_STACK_SIZE=128" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-x -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-y -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-z -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-tvase.cgltrace -rvase_ref_32.png -w32 -h32" --threads=2 || true

View file

@ -26,13 +26,6 @@
`endif
`endif
// 32 bit FLEN as default.
`ifndef FLEN_32
`ifndef FLEN_64
`define FLEN_32
`endif
`endif
`ifdef XLEN_64
`define XLEN 64
`endif
@ -41,14 +34,6 @@
`define XLEN 32
`endif
`ifdef FLEN_64
`define FLEN 64
`endif
`ifdef FLEN_32
`define FLEN 32
`endif
`ifndef NUM_CLUSTERS
`define NUM_CLUSTERS 1
`endif
@ -185,6 +170,20 @@
`define EXT_F_ENABLE
`endif
`ifdef EXT_D_ENABLE
`define FLEN_64
`else
`define FLEN_32
`endif
`ifdef FLEN_64
`define FLEN 64
`endif
`ifdef FLEN_32
`define FLEN 32
`endif
`ifdef EXT_GFX_ENABLE
`define EXT_TEX_ENABLE
`define EXT_RASTER_ENABLE
@ -401,11 +400,6 @@
`define LSUQ_SIZE `MAX(2, `NUM_WARPS * 2)
`endif
// Size of divergence Stack
`ifndef IPDOM_STACK_SIZE
`define IPDOM_STACK_SIZE 32
`endif
// Floating-Point Units ///////////////////////////////////////////////////////
// Number of FPU units

View file

@ -27,6 +27,9 @@
`define NR_BITS `CLOG2(`NUM_REGS)
`define PD_STACK_SIZE `UP(`NT_BITS)
`define PD_STACK_SIZEW `CLOG2(`PD_STACK_SIZE)
`define PERF_CTR_BITS 44
`ifndef NDEBUG
@ -215,6 +218,7 @@
`define INST_GPU_JOIN 4'h3
`define INST_GPU_BAR 4'h4
`define INST_GPU_PRED 4'h5
`define INST_GPU_IS_WCTL(op) (op <= 5)
`define INST_GPU_TEX 4'h6
`define INST_GPU_RASTER 4'h7

View file

@ -17,13 +17,17 @@ typedef struct packed {
} gpu_wspawn_t;
typedef struct packed {
logic valid;
logic diverged;
logic [`NUM_THREADS-1:0] then_tmask;
logic [`NUM_THREADS-1:0] else_tmask;
logic [`XLEN-1:0] pc;
logic valid;
logic [`NUM_THREADS-1:0] taken;
logic [`NUM_THREADS-1:0] tmask;
logic [`XLEN-1:0] next_pc;
} gpu_split_t;
typedef struct packed {
logic valid;
logic [`PD_STACK_SIZEW-1:0] stack_ptr;
} gpu_join_t;
typedef struct packed {
logic valid;
logic [`NB_BITS-1:0] id;
@ -32,8 +36,8 @@ typedef struct packed {
} gpu_barrier_t;
typedef struct packed {
logic [`XLEN-1:0] startup_addr;
logic [7:0] mpm_class;
logic [`XLEN-1:0] startup_addr;
logic [7:0] mpm_class;
} base_dcrs_t;
/* verilator lint_off UNUSED */
@ -265,6 +269,7 @@ endpackage
`define GPU_TMC_BITS $bits(VX_gpu_types::gpu_tmc_t)
`define GPU_WSPAWN_BITS $bits(VX_gpu_types::gpu_wspawn_t)
`define GPU_SPLIT_BITS $bits(VX_gpu_types::gpu_split_t)
`define GPU_JOIN_BITS $bits(VX_gpu_types::gpu_join_t)
`define GPU_BARRIER_BITS $bits(VX_gpu_types::gpu_barrier_t)
`endif // VX_GPU_TYPES_VH

View file

@ -37,7 +37,7 @@ module VX_decode #(
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`XLEN-1:0] imm;
reg use_rd, use_PC, use_imm;
reg is_join, is_wstall;
reg is_wstall;
wire [31:0] instr = fetch_if.data;
wire [6:0] opcode = instr[6:0];
@ -137,7 +137,6 @@ module VX_decode #(
use_imm = 0;
use_PC = 0;
use_rd = 0;
is_join = 0;
is_wstall = 0;
case (opcode)
@ -437,11 +436,14 @@ module VX_decode #(
3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
is_wstall = 1;
`USED_IREG (rs1);
use_rd = 1;
`USED_IREG (rs1);
`USED_IREG (rd);
end
3'h3: begin // JOIN
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
is_join = 1;
is_wstall = 1;
`USED_IREG (rs1);
end
3'h4: begin // BAR
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
@ -551,7 +553,6 @@ module VX_decode #(
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.wid;
assign decode_sched_if.is_wstall = is_wstall;
assign decode_sched_if.is_join = is_join;
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
assign fetch_if.ready = decode_if.ready;

View file

@ -194,10 +194,10 @@ module VX_execute #(
.reset (fpu_reset),
.fpu_exe_if (fpu_exe_if),
.fpu_bus_if (fpu_bus_if),
.fpu_to_csr_if (fpu_to_csr_if),
.fpu_commit_if (fpu_commit_if),
.fpu_to_csr_if (fpu_to_csr_if),
.csr_pending (csr_pending),
.req_pending (fpu_pending)
.req_pending (fpu_pending),
.commit_if (fpu_commit_if)
);
`endif

View file

@ -44,13 +44,11 @@ module VX_gpu_unit #(
localparam UUID_WIDTH = `UP(`UUID_BITS);
localparam NW_WIDTH = `UP(`NW_BITS);
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
localparam RSP_DATAW = `MAX(`NUM_THREADS * `XLEN, WCTL_DATAW);
localparam RSP_ARB_DATAW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + RSP_DATAW + 1 + 1;
localparam RSP_ARB_DATAW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + (`NUM_THREADS * `XLEN) + `NR_BITS + 1 + `XLEN + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_TEX_ENABLED + `EXT_RASTER_ENABLED + `EXT_ROP_ENABLED + `EXT_IMADD_ENABLED;
localparam RSP_ARB_IDX_GPU = 0;
localparam RSP_ARB_IDX_RASTER = RSP_ARB_IDX_GPU + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_RASTER = RSP_ARB_IDX_WCTL + 1;
localparam RSP_ARB_IDX_ROP = RSP_ARB_IDX_RASTER + `EXT_RASTER_ENABLED;
localparam RSP_ARB_IDX_TEX = RSP_ARB_IDX_ROP + `EXT_ROP_ENABLED;
localparam RSP_ARB_IDX_IMADD = RSP_ARB_IDX_TEX + `EXT_TEX_ENABLED;
@ -63,89 +61,46 @@ module VX_gpu_unit #(
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
wire [RSP_DATAW-1:0] rsp_data;
wire rsp_is_wctl;
wire gpu_req_valid;
reg gpu_req_ready;
wire csr_ready = ~csr_pending;
assign gpu_req_valid = gpu_exe_if.valid && csr_ready;
// Warp control block
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_barrier_t barrier;
gpu_split_t split;
wire is_wspawn = (gpu_exe_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_exe_if.op_type == `INST_GPU_TMC);
wire is_split = (gpu_exe_if.op_type == `INST_GPU_SPLIT);
wire is_join = (gpu_exe_if.op_type == `INST_GPU_JOIN);
wire is_bar = (gpu_exe_if.op_type == `INST_GPU_BAR);
wire is_pred = (gpu_exe_if.op_type == `INST_GPU_PRED);
wire [`XLEN-1:0] rs1_data = gpu_exe_if.rs1_data[gpu_exe_if.tid];
wire [`XLEN-1:0] rs2_data = gpu_exe_if.rs2_data[gpu_exe_if.tid];
wire [`NUM_THREADS-1:0] taken_tmask;
wire [`NUM_THREADS-1:0] not_taken_tmask;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
wire taken = (gpu_exe_if.rs1_data[i] != 0);
assign taken_tmask[i] = gpu_exe_if.tmask[i] && taken;
assign not_taken_tmask[i] = gpu_exe_if.tmask[i] && ~taken;
end
// tmc
wire [`NUM_THREADS-1:0] pred_mask = (taken_tmask != 0) ? taken_tmask : gpu_exe_if.tmask;
assign tmc.valid = is_tmc || is_pred;
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// wspawn
wire [`XLEN-1:0] wspawn_pc = rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign wspawn_wmask[i] = (i < rs1_data);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = wspawn_pc;
// split
assign split.valid = is_split;
assign split.diverged = (| taken_tmask) && (| not_taken_tmask);
assign split.then_tmask = taken_tmask;
assign split.else_tmask = not_taken_tmask;
assign split.pc = gpu_exe_if.next_PC;
// barrier
assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_BITS-1:0];
assign barrier.is_global = rs1_data[31];
assign barrier.size_m1 = $bits(barrier.size_m1)'(rs2_data - 1);
// Warp control response
wire wctl_req_valid = gpu_req_valid && (is_wspawn || is_tmc || is_split || is_join || is_bar || is_pred);
wire wctl_rsp_valid = wctl_req_valid;
wire [WCTL_DATAW-1:0] wctl_rsp_data = {tmc, wspawn, split, barrier};
wire wctl_rsp_ready;
wire wctl_req_ready = wctl_rsp_ready;
assign rsp_arb_valid_in[RSP_ARB_IDX_GPU] = wctl_rsp_valid;
assign rsp_arb_data_in[RSP_ARB_IDX_GPU] = {gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, `NR_BITS'(0), 1'b0, RSP_DATAW'(wctl_rsp_data), 1'b1, ~is_join};
assign wctl_rsp_ready = rsp_arb_ready_in[RSP_ARB_IDX_GPU];
`UNUSED_VAR (gpu_exe_if.op_mod)
`UNUSED_VAR (gpu_exe_if.rs3_data)
`UNUSED_VAR (gpu_exe_if.wb)
`UNUSED_VAR (gpu_exe_if.rd)
// Warp control block
VX_gpu_exe_if wctl_exe_if();
VX_commit_if wctl_commit_if();
assign wctl_exe_if.valid = gpu_req_valid && `INST_GPU_IS_WCTL(gpu_exe_if.op_type);
assign wctl_exe_if.op_type = gpu_exe_if.op_type;
assign wctl_exe_if.uuid = gpu_exe_if.uuid;
assign wctl_exe_if.wid = gpu_exe_if.wid;
assign wctl_exe_if.tmask = gpu_exe_if.tmask;
assign wctl_exe_if.tid = gpu_exe_if.tid;
assign wctl_exe_if.PC = gpu_exe_if.PC;
assign wctl_exe_if.next_PC = gpu_exe_if.next_PC;
assign wctl_exe_if.rd = gpu_exe_if.rd;
assign wctl_exe_if.wb = gpu_exe_if.wb;
assign wctl_exe_if.rs1_data = gpu_exe_if.rs1_data;
assign wctl_exe_if.rs2_data = gpu_exe_if.rs2_data;
VX_wctl_unit #(
.OUTPUT_REG (RSP_ARB_SIZE > 1)
) wctl_unit (
.clk (clk),
.reset (reset),
.gpu_exe_if (wctl_exe_if),
.warp_ctl_if(warp_ctl_if),
.commit_if (wctl_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = {wctl_commit_if.uuid, wctl_commit_if.wid, wctl_commit_if.tmask, wctl_commit_if.PC, wctl_commit_if.rd, wctl_commit_if.wb, wctl_commit_if.data, 1'b1};
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
`ifdef EXT_TEX_ENABLE
@ -171,16 +126,16 @@ module VX_gpu_unit #(
VX_tex_agent #(
.CORE_ID (CORE_ID)
) tex_agent (
.clk (clk),
.reset (tex_reset),
.tex_csr_if (tex_csr_if),
.tex_exe_if (tex_exe_if),
.tex_commit_if (tex_commit_if),
.tex_bus_if (tex_bus_if)
.clk (clk),
.reset (tex_reset),
.tex_csr_if (tex_csr_if),
.tex_exe_if (tex_exe_if),
.tex_bus_if (tex_bus_if),
.commit_if (tex_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_TEX] = tex_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_TEX] = {tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, tex_commit_if.wb, RSP_DATAW'(tex_commit_if.data), tex_commit_if.eop, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_TEX] = {tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, tex_commit_if.wb, tex_commit_if.data, tex_commit_if.eop};
assign tex_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_TEX];
`endif
@ -202,16 +157,16 @@ module VX_gpu_unit #(
VX_raster_agent #(
.CORE_ID (CORE_ID)
) raster_agent (
.clk (clk),
.reset (raster_reset),
.raster_csr_if (raster_csr_if),
.raster_bus_if (raster_bus_if),
.raster_exe_if (raster_exe_if),
.raster_commit_if (raster_commit_if)
.clk (clk),
.reset (raster_reset),
.raster_csr_if (raster_csr_if),
.raster_bus_if (raster_bus_if),
.raster_exe_if (raster_exe_if),
.commit_if (raster_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_RASTER] = raster_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_RASTER] = {raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, raster_commit_if.wb, RSP_DATAW'(raster_commit_if.data), raster_commit_if.eop, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_RASTER] = {raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, raster_commit_if.wb, raster_commit_if.data, raster_commit_if.eop};
assign raster_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_RASTER];
`endif
@ -240,16 +195,16 @@ module VX_gpu_unit #(
VX_rop_agent #(
.CORE_ID (CORE_ID)
) rop_agent (
.clk (clk),
.reset (rop_reset),
.rop_csr_if (rop_csr_if),
.rop_exe_if (rop_exe_if),
.rop_commit_if (rop_commit_if),
.rop_bus_if (rop_bus_if)
.clk (clk),
.reset (rop_reset),
.rop_csr_if (rop_csr_if),
.rop_exe_if (rop_exe_if),
.rop_bus_if (rop_bus_if),
.commit_if (rop_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_ROP] = rop_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_ROP] = {rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC, rop_commit_if.rd, rop_commit_if.wb, RSP_DATAW'(rop_commit_if.data), rop_commit_if.eop, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_ROP] = {rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC, rop_commit_if.rd, rop_commit_if.wb, rop_commit_if.data, rop_commit_if.eop};
assign rop_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_ROP];
`endif
@ -311,13 +266,11 @@ module VX_gpu_unit #(
end
assign rsp_arb_valid_in[RSP_ARB_IDX_IMADD] = imadd_valid_out;
assign rsp_arb_data_in[RSP_ARB_IDX_IMADD] = {imadd_uuid_out, imadd_wid_out, imadd_tmask_out, imadd_PC_out, imadd_rd_out, 1'b1, RSP_DATAW'(imadd_data_out_x), 1'b1, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_IMADD] = {imadd_uuid_out, imadd_wid_out, imadd_tmask_out, imadd_PC_out, imadd_rd_out, 1'b1, imadd_data_out_x, 1'b1};
assign imadd_ready_out = rsp_arb_ready_in[RSP_ARB_IDX_IMADD];
`endif
// can accept new request?
always @(*) begin
@ -334,7 +287,7 @@ module VX_gpu_unit #(
`ifdef EXT_IMADD_ENABLE
`INST_GPU_IMADD: gpu_req_ready = imadd_ready_in;
`endif
default: gpu_req_ready = wctl_req_ready;
default: gpu_req_ready = wctl_exe_if.ready;
endcase
end
assign gpu_exe_if.ready = gpu_req_ready && csr_ready;
@ -352,21 +305,13 @@ module VX_gpu_unit #(
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),
.data_out ({gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data, gpu_commit_if.eop, rsp_is_wctl}),
.data_out ({gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, gpu_commit_if.data, gpu_commit_if.eop}),
.valid_out (gpu_commit_if.valid),
.ready_out (gpu_commit_if.ready)
);
assign gpu_commit_if.data = rsp_data[(`NUM_THREADS * `XLEN)-1:0];
// warp control reponse
wire gpu_req_fire = gpu_exe_if.valid && gpu_exe_if.ready;
wire gpu_commit_fire = gpu_commit_if.valid && gpu_commit_if.ready;
assign warp_ctl_if.valid = gpu_commit_fire && rsp_is_wctl;
assign warp_ctl_if.wid = gpu_commit_if.wid;
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data[WCTL_DATAW-1:0];
// pending request

View file

@ -1,32 +1,32 @@
`include "VX_platform.vh"
module VX_ipdom #(
module VX_ipdom_stack #(
parameter WIDTH = 1,
parameter DEPTH = 1
parameter DEPTH = 1,
parameter ADDRW = $clog2(DEPTH)
) (
input wire clk,
input wire reset,
input wire pair,
input wire [WIDTH - 1:0] q0,
input wire [WIDTH - 1:0] q1,
input wire [WIDTH - 1:0] q2,
output wire [WIDTH - 1:0] d,
output wire d_idx,
output wire [ADDRW-1:0] q_ptr,
output wire [ADDRW-1:0] d_ptr,
input wire push,
input wire pop,
output wire index,
input wire pop,
output wire empty,
output wire full
);
`STATIC_ASSERT(`ISPOW2(DEPTH), ("depth must be a power of 2!"))
`STATIC_ASSERT(`ISPOW2(DEPTH), ("depth must be a power of 2!"))
localparam ADDRW = $clog2(DEPTH);
reg is_part [DEPTH-1:0];
reg slot_idx [DEPTH-1:0];
reg [ADDRW-1:0] rd_ptr, wr_ptr;
reg empty_r, full_r;
wire [WIDTH-1:0] d1, d2;
wire [WIDTH-1:0] d0, d1;
always @(posedge clk) begin
if (reset) begin
@ -44,9 +44,9 @@ module VX_ipdom #(
empty_r <= 0;
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
end else if (pop) begin
wr_ptr <= wr_ptr - ADDRW'(is_part[rd_ptr]);
rd_ptr <= rd_ptr - ADDRW'(is_part[rd_ptr]);
empty_r <= is_part[rd_ptr] && (0 == rd_ptr);
wr_ptr <= wr_ptr - ADDRW'(d_idx);
rd_ptr <= rd_ptr - ADDRW'(d_idx);
empty_r <= (rd_ptr == 0) && (d_idx == 1);
full_r <= 0;
end
end
@ -61,21 +61,23 @@ module VX_ipdom #(
.write (push),
`UNUSED_PIN (wren),
.waddr (wr_ptr),
.wdata ({q2, q1}),
.wdata ({q1, q0}),
.raddr (rd_ptr),
.rdata ({d2, d1})
.rdata ({d1, d0})
);
always @(posedge clk) begin
if (push) begin
is_part[wr_ptr] <= ~pair;
slot_idx[wr_ptr] <= 0;
end else if (pop) begin
is_part[rd_ptr] <= 1;
slot_idx[rd_ptr] <= 1;
end
end
assign index = is_part[rd_ptr];
assign d = index ? d1 : d2;
assign d = d_idx ? d1 : d0;
assign d_idx = slot_idx[rd_ptr];
assign d_ptr = rd_ptr;
assign q_ptr = wr_ptr;
assign empty = empty_r;
assign full = full_r;

View file

@ -34,10 +34,6 @@ module VX_schedule #(
localparam NC_WIDTH = `UP(`NC_BITS);
localparam NW_WIDTH = `UP(`NW_BITS);
wire join_else;
wire [`XLEN-1:0] join_pc;
wire [`NUM_THREADS-1:0] join_tmask;
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
reg [`NUM_WARPS-1:0] stalled_warps; // set when branch/gpgpu instructions are issued
@ -63,6 +59,14 @@ module VX_schedule #(
wire schedule_valid;
wire schedule_ready;
// split/join
wire split_is_divergent;
wire [`NUM_THREADS-1:0] split_tmask0;
wire join_is_divergent;
wire join_is_else;
wire [`NUM_THREADS-1:0] join_tmask;
wire [`XLEN-1:0] join_pc;
reg [`PERF_CTR_BITS-1:0] cycles;
reg [`NUM_WARPS-1:0][UUID_WIDTH-1:0] issued_instrs;
@ -103,11 +107,14 @@ module VX_schedule #(
thread_masks[0] <= 1;
end else begin
// join handling
if (decode_sched_if.valid && decode_sched_if.is_join) begin
if (join_else) begin
warp_pcs[decode_sched_if.wid] <= `XLEN'(join_pc);
if (warp_ctl_if.valid && warp_ctl_if.sjoin.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (join_is_divergent) begin
if (join_is_else) begin
warp_pcs[warp_ctl_if.wid] <= `XLEN'(join_pc);
end
thread_masks[warp_ctl_if.wid] <= join_tmask;
end
thread_masks[decode_sched_if.wid] <= join_tmask;
end
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
@ -145,8 +152,8 @@ module VX_schedule #(
// split handling
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (warp_ctl_if.split.diverged) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_tmask;
if (split_is_divergent) begin
thread_masks[warp_ctl_if.wid] <= split_tmask0;
end
end
@ -216,47 +223,62 @@ module VX_schedule #(
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_core_id = NC_WIDTH'(CORE_ID % `NUM_CORES);
// split/join stack management
// split/join handling
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [`PD_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
wire ipdom_index [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0] then_tmask;
wire [`NUM_THREADS-1:0] else_tmask;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign then_tmask[i] = warp_ctl_if.split.tmask[i] && warp_ctl_if.split.taken[i];
assign else_tmask[i] = warp_ctl_if.split.tmask[i] && ~warp_ctl_if.split.taken[i];
end
wire [`CLOG2(`NUM_THREADS+1)-1:0] then_tmask_cnt, else_tmask_cnt;
`POP_COUNT(then_tmask_cnt, then_tmask);
`POP_COUNT(else_tmask_cnt, else_tmask);
wire then_first = (then_tmask_cnt >= else_tmask_cnt);
assign split_is_divergent = (then_tmask != 0) && (else_tmask != 0);
assign split_tmask0 = then_first ? then_tmask : else_tmask;
assign warp_ctl_if.split_ret = ipdom_q_ptr[warp_ctl_if.wid];
assign join_is_divergent = (warp_ctl_if.sjoin.stack_ptr != ipdom_q_ptr[warp_ctl_if.wid]);
assign {join_pc, join_tmask} = ipdom_data[warp_ctl_if.wid];
assign join_is_else = (ipdom_index[warp_ctl_if.wid] == 0);
wire [`NUM_THREADS-1:0] split_tmask1 = then_first ? else_tmask : then_tmask;
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q0 = {warp_ctl_if.split.next_pc, split_tmask1};
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q1 = {`XLEN'(0), warp_ctl_if.split.tmask};
wire ipdom_push = warp_ctl_if.valid && warp_ctl_if.split.valid && split_is_divergent;
wire ipdom_pop = warp_ctl_if.valid && warp_ctl_if.sjoin.valid && join_is_divergent;
`RESET_RELAY (ipdom_reset, reset);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
wire push = warp_ctl_if.valid
&& warp_ctl_if.split.valid
&& (i == warp_ctl_if.wid);
wire pop = decode_sched_if.valid && decode_sched_if.is_join && (i == decode_sched_if.wid);
wire [`NUM_THREADS-1:0] else_tmask = warp_ctl_if.split.else_tmask;
wire [`NUM_THREADS-1:0] orig_tmask = thread_masks[warp_ctl_if.wid];
wire [(`XLEN+`NUM_THREADS)-1:0] q_else = {warp_ctl_if.split.pc, else_tmask};
wire [(`XLEN+`NUM_THREADS)-1:0] q_end = {`XLEN'(0), orig_tmask};
VX_ipdom #(
VX_ipdom_stack #(
.WIDTH (`XLEN+`NUM_THREADS),
.DEPTH (`IPDOM_STACK_SIZE)
) ipdom (
.DEPTH (`PD_STACK_SIZE)
) ipdom_stack (
.clk (clk),
.reset (ipdom_reset),
.push (push),
.pop (pop),
.pair (warp_ctl_if.split.diverged),
.q1 (q_end),
.q2 (q_else),
.push (ipdom_push && (i == warp_ctl_if.wid)),
.pop (ipdom_pop && (i == warp_ctl_if.wid)),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),
.index (ipdom_index[i]),
.d_idx (ipdom_index[i]),
.q_ptr (ipdom_q_ptr[i]),
`UNUSED_PIN (d_ptr),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
assign {join_pc, join_tmask} = ipdom_data[decode_sched_if.wid];
assign join_else = ~ipdom_index[decode_sched_if.wid];
// schedule the next ready warp
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);

108
hw/rtl/core/VX_wctl_unit.sv Normal file
View file

@ -0,0 +1,108 @@
`include "VX_define.vh"
// Warp-control unit: decodes the warp-control GPU ops (wspawn, tmc, pred,
// split, join, bar) for one in-flight instruction, drives the decoded
// control fields out on warp_ctl_if, and produces a writeback commit whose
// data is the divergence-stack pointer (used as the value returned by SPLIT
// and consumed by a later JOIN).
module VX_wctl_unit #(
parameter OUTPUT_REG = 0  // nonzero: insert a pipeline (skid) stage on the commit path
) (
input wire clk,
input wire reset,
// Inputs
VX_gpu_exe_if.slave gpu_exe_if,
// Outputs
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master commit_if
);
localparam UUID_WIDTH = `UP(`UUID_BITS);
localparam NW_WIDTH = `UP(`NW_BITS);
// Decoded per-op control payloads; exactly one of these has .valid set
// for a given op_type (see the is_* one-hot decodes below).
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_split_t split;
gpu_join_t sjoin;
gpu_barrier_t barrier;
// Scalar operands are taken from the issuing thread's lane (tid).
wire [`XLEN-1:0] rs1_data = gpu_exe_if.rs1_data[gpu_exe_if.tid];
wire [`XLEN-1:0] rs2_data = gpu_exe_if.rs2_data[gpu_exe_if.tid];
// Per-thread branch predicate for SPLIT/PRED: LSB of each lane's rs1 value.
wire [`NUM_THREADS-1:0] taken;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign taken[i] = gpu_exe_if.rs1_data[i][0];
end
// One-hot decode of the warp-control opcode.
wire is_wspawn = (gpu_exe_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_exe_if.op_type == `INST_GPU_TMC);
wire is_pred = (gpu_exe_if.op_type == `INST_GPU_PRED);
wire is_split = (gpu_exe_if.op_type == `INST_GPU_SPLIT);
wire is_join = (gpu_exe_if.op_type == `INST_GPU_JOIN);
wire is_bar = (gpu_exe_if.op_type == `INST_GPU_BAR);
// warp_ctl_if pulses on the request fire (valid && ready), i.e. the same
// cycle the skid buffer below accepts the instruction.
assign warp_ctl_if.valid = gpu_exe_if.valid && gpu_exe_if.ready;
assign warp_ctl_if.wid = gpu_exe_if.wid;
assign warp_ctl_if.tmc = tmc;
assign warp_ctl_if.wspawn = wspawn;
assign warp_ctl_if.split = split;
assign warp_ctl_if.sjoin = sjoin;
assign warp_ctl_if.barrier = barrier;
// tmc
// PRED keeps only the taken lanes, but falls back to the current tmask
// when no lane takes (avoids disabling the whole warp); TMC takes the
// new mask directly from the low bits of rs1.
wire [`NUM_THREADS-1:0] then_tmask = gpu_exe_if.tmask & taken;
wire [`NUM_THREADS-1:0] pred_mask = (then_tmask != 0) ? then_tmask : gpu_exe_if.tmask;
assign tmc.valid = is_tmc || is_pred;
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// wspawn
// rs1 = number of warps to activate (mask of warps with index < rs1),
// rs2 = start PC for the spawned warps.
wire [`XLEN-1:0] wspawn_pc = rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign wspawn_wmask[i] = (i < rs1_data[31:0]);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = wspawn_pc;
// split
// The scheduler derives then/else masks from taken & tmask and pushes the
// divergence entry; next_pc is the reconvergence continuation address.
assign split.valid = is_split;
assign split.taken = taken;
assign split.tmask = gpu_exe_if.tmask;
assign split.next_pc = gpu_exe_if.next_PC;
// join
// rs1 carries the stack pointer previously returned by the matching SPLIT;
// it is truncated/resized to the divergence-stack pointer width.
assign sjoin.valid = is_join;
assign sjoin.stack_ptr = `PD_STACK_SIZEW'(rs1_data);
// barrier
// rs1[31] selects a global (multi-core) barrier; rs2 holds the warp count
// (stored as count-1).
assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_BITS-1:0];
assign barrier.is_global = rs1_data[31];
assign barrier.size_m1 = $bits(barrier.size_m1)'(rs2_data[31:0] - 1);
// response
// split_ret (driven back by the scheduler on warp_ctl_if) is captured here
// on the request fire and carried to commit as the instruction's result.
wire [`PD_STACK_SIZEW-1:0] rsp_data;
VX_skid_buffer #(
.DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + `PD_STACK_SIZEW),
.PASSTHRU (OUTPUT_REG == 0)
) rsp_sbuf (
.clk (clk),
.reset (reset),
.valid_in (gpu_exe_if.valid),
.ready_in (gpu_exe_if.ready),
.data_in ({gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, gpu_exe_if.rd, gpu_exe_if.wb, warp_ctl_if.split_ret}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_if.wb, rsp_data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
// Broadcast the stack pointer to every thread lane as the writeback value.
// NOTE(review): commit_if.eop is not assigned in this module, unlike the
// other commit producers in this design — confirm it is tied off by the
// parent (VX_gpu_unit packs a constant 1'b1 into its arbiter data) or the
// interface default.
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign commit_if.data[i] = `XLEN'(rsp_data);
end
endmodule

View file

@ -13,12 +13,13 @@ module VX_fpu_agent #(
VX_fpu_exe_if.slave fpu_exe_if,
VX_fpu_to_csr_if.master fpu_to_csr_if,
VX_commit_if.master fpu_commit_if,
VX_fpu_bus_if.master fpu_bus_if,
input wire csr_pending,
output wire req_pending
output wire req_pending,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -115,18 +116,18 @@ module VX_fpu_agent #(
.reset (reset),
.valid_in (fpu_bus_if.rsp_valid),
.ready_in (fpu_bus_if.rsp_ready),
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, fpu_bus_if.rsp_result}),
.data_out ({fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.data}),
.valid_out (fpu_commit_if.valid),
.ready_out (fpu_commit_if.ready)
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, fpu_bus_if.rsp_result}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_if.data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
assign fpu_commit_if.wb = 1'b1;
assign fpu_commit_if.eop = 1'b1;
assign commit_if.wb = 1'b1;
assign commit_if.eop = 1'b1;
// pending request
wire fpu_commit_fire = fpu_commit_if.valid && fpu_commit_if.ready;
wire fpu_commit_fire = commit_if.valid && commit_if.ready;
reg req_pending_r;
always @(posedge clk) begin

View file

@ -4,20 +4,17 @@ interface VX_decode_sched_if ();
wire valid;
wire is_wstall;
wire is_join;
wire [`UP(`NW_BITS)-1:0] wid;
modport master (
output valid,
output is_wstall,
output is_join,
output wid
);
modport slave (
input valid,
input is_wstall,
input is_join,
input wid
);

View file

@ -11,25 +11,31 @@ interface VX_warp_ctl_if ();
wire [`UP(`NW_BITS)-1:0] wid;
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_barrier_t barrier;
gpu_split_t split;
gpu_join_t sjoin;
gpu_barrier_t barrier;
wire [`PD_STACK_SIZEW-1:0] split_ret;
modport master (
output valid,
output wid,
output tmc,
output wspawn,
output tmc,
output split,
output sjoin,
output barrier,
output split
input split_ret
);
modport slave (
input valid,
input wid,
input tmc,
input wspawn,
input barrier,
input split
input valid,
input wid,
input wspawn,
input tmc,
input split,
input sjoin,
input barrier,
output split_ret
);
endinterface

View file

@ -10,9 +10,9 @@ module VX_raster_agent #(
VX_raster_exe_if.slave raster_exe_if,
VX_raster_bus_if.slave raster_bus_if,
// Outputs
VX_commit_if.master raster_commit_if,
VX_gpu_csr_if.slave raster_csr_if
// Outputs
VX_gpu_csr_if.slave raster_csr_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -41,7 +41,7 @@ module VX_raster_agent #(
);
// it is possible to have ready = f(valid) when using arbiters,
// because of that we need to decouple raster_exe_if and raster_commit_if handshake with a pipe register
// because of that we need to decouple raster_exe_if and commit_if handshake with a pipe register
assign raster_exe_if.ready = raster_bus_if.req_valid && raster_rsp_ready;
@ -62,18 +62,18 @@ module VX_raster_agent #(
.reset (reset),
.valid_in (raster_rsp_valid),
.ready_in (raster_rsp_ready),
.data_in ({raster_exe_if.uuid, raster_exe_if.wid, raster_exe_if.tmask, raster_exe_if.PC, raster_exe_if.rd, response_data}),
.data_out ({raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, commit_data}),
.valid_out (raster_commit_if.valid),
.ready_out (raster_commit_if.ready)
.data_in ({raster_exe_if.uuid, raster_exe_if.wid, raster_exe_if.tmask, raster_exe_if.PC, raster_exe_if.rd, response_data}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign raster_commit_if.data[i] = `XLEN'(commit_data[i]);
assign commit_if.data[i] = `XLEN'(commit_data[i]);
end
assign raster_commit_if.wb = 1'b1;
assign raster_commit_if.eop = 1'b1;
assign commit_if.wb = 1'b1;
assign commit_if.eop = 1'b1;
`ifdef DBG_TRACE_RASTER
always @(posedge clk) begin

View file

@ -11,8 +11,8 @@ module VX_rop_agent #(
VX_gpu_csr_if.slave rop_csr_if,
// Outputs
VX_commit_if.master rop_commit_if,
VX_rop_bus_if.master rop_bus_if
VX_rop_bus_if.master rop_bus_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -42,7 +42,7 @@ module VX_rop_agent #(
wire rop_rsp_valid, rop_rsp_ready;
// it is possible to have ready = f(valid) when using arbiters,
// because of that we need to decouple rop_exe_if and rop_commit_if handshake with a pipe register
// because of that we need to decouple rop_exe_if and commit_if handshake with a pipe register
VX_skid_buffer #(
.DATAW (UUID_WIDTH + `NUM_THREADS * (1 + 2 * `VX_ROP_DIM_BITS + 32 + `VX_ROP_DEPTH_BITS + 1)),
@ -69,16 +69,16 @@ module VX_rop_agent #(
.reset (reset),
.valid_in (rop_rsp_valid),
.ready_in (rop_rsp_ready),
.data_in ({rop_exe_if.uuid, rop_exe_if.wid, rop_exe_if.tmask, rop_exe_if.PC}),
.data_out ({rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC}),
.valid_out (rop_commit_if.valid),
.ready_out (rop_commit_if.ready)
.data_in ({rop_exe_if.uuid, rop_exe_if.wid, rop_exe_if.tmask, rop_exe_if.PC}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
assign rop_commit_if.data = '0;
assign rop_commit_if.rd = '0;
assign rop_commit_if.wb = 0;
assign rop_commit_if.eop = 1;
assign commit_if.data = '0;
assign commit_if.rd = '0;
assign commit_if.wb = 0;
assign commit_if.eop = 1;
`ifdef DBG_TRACE_ROP
always @(posedge clk) begin

View file

@ -12,7 +12,7 @@ module VX_tex_agent #(
// Outputs
VX_tex_bus_if.master tex_bus_if,
VX_commit_if.master tex_commit_if
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -107,18 +107,18 @@ module VX_tex_agent #(
.reset (reset),
.valid_in (tex_bus_if.rsp_valid),
.ready_in (tex_bus_if.rsp_ready),
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, tex_bus_if.rsp_texels}),
.data_out ({tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, commit_data}),
.valid_out (tex_commit_if.valid),
.ready_out (tex_commit_if.ready)
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, tex_bus_if.rsp_texels}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign tex_commit_if.data[i] = `XLEN'(commit_data[i]);
assign commit_if.data[i] = `XLEN'(commit_data[i]);
end
assign tex_commit_if.wb = 1'b1;
assign tex_commit_if.eop = 1'b1;
assign commit_if.wb = 1'b1;
assign commit_if.eop = 1'b1;
`ifdef DBG_TRACE_TEX
always @(posedge clk) begin
@ -131,10 +131,10 @@ module VX_tex_agent #(
`TRACE_ARRAY1D(1, tex_exe_if.lod, `NUM_THREADS);
`TRACE(1, (", stage=%0d, tag=0x%0h (#%0d)\n", tex_exe_if.stage, req_tag, tex_exe_if.uuid));
end
if (tex_commit_if.valid && tex_commit_if.ready) begin
`TRACE(1, ("%d: core%0d-tex-rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, texels=", $time, CORE_ID, tex_commit_if.wid, tex_commit_if.PC, tex_commit_if.tmask, tex_commit_if.rd));
`TRACE_ARRAY1D(1, tex_commit_if.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", tex_commit_if.uuid));
if (commit_if.valid && commit_if.ready) begin
`TRACE(1, ("%d: core%0d-tex-rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, texels=", $time, CORE_ID, commit_if.wid, commit_if.PC, commit_if.tmask, commit_if.rd));
`TRACE_ARRAY1D(1, commit_if.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", commit_if.uuid));
end
end
`endif

View file

@ -137,13 +137,15 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
}
// Split on a predicate
inline void vx_split(unsigned predicate) {
asm volatile (".insn r %0, 2, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(predicate));
inline unsigned vx_split(unsigned predicate) {
unsigned ret;
asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
return ret;
}
// Join
inline void vx_join() {
asm volatile (".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
inline void vx_join(unsigned stack_ptr) {
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
}
// Warp Barrier
@ -153,72 +155,72 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
// Return current thread identifier
inline int vx_thread_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_THREAD_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
return ret;
}
// Return current warp identifier
inline int vx_warp_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_WARP_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
return ret;
}
// Return current core identifier
inline int vx_core_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_CORE_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
return ret;
}
// Return current cluster identifier
inline int vx_cluster_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_CLUSTER_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CLUSTER_ID));
return ret;
}
// Return current threadk mask
inline int vx_thread_mask() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_TMASK));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_TMASK));
return ret;
}
// Return the number of threads per warp
inline int vx_num_threads() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_THREADS));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
return ret;
}
// Return the number of warps per core
inline int vx_num_warps() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_WARPS));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
return ret;
}
// Return the number of cores per cluster
inline int vx_num_cores() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_CORES));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
return ret;
}
// Return the number of clusters
inline int vx_num_clusters() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_CLUSTERS));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CLUSTERS));
return ret;
}
// Return the hart identifier (thread id accross the processor)
inline int vx_hart_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_MHARTID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
return ret;
}
inline void vx_fence() {

View file

@ -31,7 +31,7 @@ public:
, num_regs_(32)
, num_csrs_(4096)
, num_barriers_(NUM_BARRIERS)
, ipdom_size_(IPDOM_STACK_SIZE)
, ipdom_size_(log2ceil(num_threads) * 2)
{}
uint16_t vsize() const {

View file

@ -28,29 +28,6 @@ union reg_data_t {
int64_t i64;
};
static bool HasDivergentThreads(const ThreadMask &thread_mask,
const std::vector<std::vector<Word>> &reg_file,
unsigned reg) {
bool cond;
size_t thread_idx = 0;
size_t num_threads = reg_file.size();
for (; thread_idx < num_threads; ++thread_idx) {
if (thread_mask[thread_idx]) {
cond = bool(reg_file[thread_idx][reg]);
break;
}
}
assert(thread_idx != num_threads);
for (; thread_idx < num_threads; ++thread_idx) {
if (thread_mask[thread_idx]) {
if (cond != (bool(reg_file[thread_idx][reg]))) {
return true;
}
}
}
return false;
}
inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) {
return (func3 == 0x7) ? core->get_csr(VX_CSR_FRM, tid, wid) : func3;
}
@ -80,7 +57,8 @@ inline int64_t check_boxing(int64_t a) {
void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
assert(tmask_.any());
auto nextPC = PC_ + 4;
auto next_pc = PC_ + 4;
auto next_tmask = tmask_;
auto func2 = instr.getFunc2();
auto func3 = instr.getFunc3();
@ -98,6 +76,12 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
auto num_threads = arch_.num_threads();
uint32_t thread_start = 0;
for (; thread_start < num_threads; ++thread_start) {
if (tmask_.test(thread_start))
break;
}
std::vector<reg_data_t[3]> rsdata(num_threads);
std::vector<reg_data_t> rddata(num_threads);
@ -149,7 +133,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// RV32I: LUI
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = immsrc << 12;
@ -161,7 +145,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// RV32I: AUIPC
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = (immsrc << 12) + PC_;
@ -174,7 +158,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
if (func7 & 0x1) {
@ -334,7 +318,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
@ -395,7 +379,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
if (func7 & 0x1) {
@ -521,7 +505,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
@ -565,49 +549,49 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
case 0: {
// RV32I: BEQ
if (rsdata[t][0].i == rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 1: {
// RV32I: BNE
if (rsdata[t][0].i != rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 4: {
// RV32I: BLT
if (rsdata[t][0].i < rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 5: {
// RV32I: BGE
if (rsdata[t][0].i >= rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 6: {
// RV32I: BLTU
if (rsdata[t][0].u < rsdata[t][1].u) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 7: {
// RV32I: BGEU
if (rsdata[t][0].u >= rsdata[t][1].u) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
@ -623,11 +607,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// RV32I: JAL
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::BRANCH;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = nextPC;
nextPC = PC_ + immsrc;
rddata[t].i = next_pc;
next_pc = PC_ + immsrc;
trace->fetch_stall = true;
break; // runonce
}
@ -639,11 +623,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = nextPC;
nextPC = rsdata[t][0].i + immsrc;
rddata[t].i = next_pc;
next_pc = rsdata[t][0].i + immsrc;
trace->fetch_stall = true;
break; // runOnce
}
@ -662,7 +646,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|| (opcode == FL && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
@ -726,7 +710,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|| (opcode == FS && func3 == 2)
|| (opcode == FS && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
@ -769,7 +753,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].u;
@ -834,7 +818,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
break;
}
case SYS_INST: {
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t csr_addr = immsrc;
@ -931,7 +915,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
case FCI: {
trace->exe_type = ExeType::FPU;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
@ -1264,7 +1248,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
trace->used_fregs.set(rsrc2);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
@ -1312,14 +1296,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
case EXT1: {
switch (func7) {
case 0: {
uint32_t ts = 0;
for (uint32_t t = 0; t < num_threads; ++t) {
if (tmask_.test(t)) {
ts = t;
break;
}
}
case 0: {
switch (func3) {
case 0: {
// TMC
@ -1334,22 +1311,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
next_tmask &= pred;
}
} else {
tmask_.reset();
next_tmask.reset();
for (uint32_t t = 0; t < num_threads; ++t) {
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
next_tmask.set(t, rsdata.at(thread_start)[0].i & (1 << t));
}
}
DPH(3, "*** New TMC: ");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, tmask_.test(i));
DPN(3, std::endl);
if (!tmask_.any()) {
core_->active_warps_.reset(warp_id_);
}
} break;
case 1: {
// WSPAWN
@ -1358,70 +1327,70 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i);
core_->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
} break;
case 2: {
// SPLIT
if (ipdom_stack_.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! (size=" << std::dec << ipdom_stack_.size() << ")\n" << std::flush;
std::abort();
}
// SPLIT
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
ThreadMask tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
ipdom_stack_.push(tmask_);
ipdom_stack_.push(e);
for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
}
auto stack_size = ipdom_stack_.size();
DPH(3, "*** Split: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(t));
DPN(3, ", Pushed TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(t));
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
ThreadMask then_tmask, else_tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
auto cond = ireg_file_.at(t).at(rsrc0);
then_tmask[t] = tmask_.test(t) && cond;
else_tmask[t] = tmask_.test(t) && !cond;
}
if (then_tmask.count() != tmask_.count()
&& else_tmask.count() != tmask_.count()) {
if (ipdom_stack_.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! (size=" << std::dec << ipdom_stack_.size() << ")\n" << std::flush;
std::abort();
}
if (then_tmask.count() >= else_tmask.count()) {
next_tmask = then_tmask;
} else {
next_tmask = else_tmask;
}
// push reconvergence thread mask
ipdom_stack_.emplace(tmask_);
// push flipped thread mask
auto join_tmask = ~next_tmask & tmask_;
ipdom_stack_.emplace(join_tmask, next_pc);
} else {
DP(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
ipdom_stack_.push(e);
}
// Uniform control-flow
}
for (uint32_t t = thread_start; t < num_threads; ++t) {
rddata[t].i = stack_size;
}
rd_write = true;
} break;
case 3: {
// JOIN
if (ipdom_stack_.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
std::abort();
}
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::JOIN;
trace->fetch_stall = true;
if (!ipdom_stack_.empty() && ipdom_stack_.top().unanimous) {
DP(3, "*** Unanimous branch at join");
tmask_ = ipdom_stack_.top().tmask;
trace->gpu_type = GpuType::JOIN;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
uint32_t stack_ptr = ireg_file_.at(thread_start).at(rsrc0);
if (stack_ptr != ipdom_stack_.size()) {
if (ipdom_stack_.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
std::abort();
}
next_tmask = ipdom_stack_.top().tmask;
if (!ipdom_stack_.top().fallthrough) {
next_pc = ipdom_stack_.top().PC;
}
ipdom_stack_.pop();
} else {
if (!ipdom_stack_.top().fallThrough) {
nextPC = ipdom_stack_.top().PC;
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = ipdom_stack_.top().tmask;
DPH(3, "*** Join: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(t));
DPN(3, "\n");
ipdom_stack_.pop();
}
// Uniform control-flow
}
} break;
case 4: {
// BAR
@ -1430,7 +1399,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
trace->data = std::make_shared<GPUTraceData>(rsdata[ts][0].i, rsdata[ts][1].i);
trace->data = std::make_shared<GPUTraceData>(rsdata[thread_start][0].i, rsdata[thread_start][1].i);
} break;
default:
std::abort();
@ -1446,7 +1415,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
for (uint32_t ri = 0, rn = core_->raster_units_.size(); ri < rn; ++ri) {
trace_data->raster_idx = core_->raster_idx();
bool has_stamps = false;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto result = core_->raster_units_.at(trace_data->raster_idx)->fetch(
@ -1478,7 +1447,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
auto trace_data = std::make_shared<TexUnit::TraceData>(num_threads);
trace->data = trace_data;
trace_data->tex_idx = core_->tex_idx();
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto u = rsdata[t][0].i;
@ -1499,7 +1468,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = rsdata[t][0].i ? rsdata[t][1].i : rsdata[t][2].i;
@ -1515,7 +1484,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
auto trace_data = std::make_shared<RopUnit::TraceData>();
trace->data = trace_data;
trace_data->rop_idx = core_->rop_idx();
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto pos_face = rsdata[t][0].i;
@ -1539,7 +1508,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
uint32_t shift = func2 * 8;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = (int32_t)(((int64_t)rsdata[t][0].i32 * (int64_t)rsdata[t][1].i32) >> shift) + rsdata[t][2].i32;
@ -2444,8 +2413,18 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
PC_ += 4;
if (PC_ != nextPC) {
DP(3, "*** Next PC: " << std::hex << nextPC << std::dec);
PC_ = nextPC;
if (PC_ != next_pc) {
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
PC_ = next_pc;
}
if (tmask_ != next_tmask) {
DPH(3, "*** New Tmask=");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, next_tmask.test(i));
DPN(3, std::endl);
tmask_ = next_tmask;
if (!next_tmask.any()) {
core_->active_warps_.reset(warp_id_);
}
}
}

View file

@ -16,21 +16,17 @@ struct DomStackEntry {
DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask)
, PC(PC)
, fallThrough(false)
, unanimous(false)
, fallthrough(false)
{}
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, PC(0)
, fallThrough(true)
, unanimous(false)
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, fallthrough(true)
{}
ThreadMask tmask;
Word PC;
bool fallThrough;
bool unanimous;
bool fallthrough;
};
struct vtype {

View file

@ -29,9 +29,9 @@ int main() {
errors += test_tls();
if (0 == errors) {
vx_printf("Passed!\n");
PRINTF("Passed!\n");
} else {
vx_printf("Failed!\n");
PRINTF("Failed!\n");
}
return errors;

View file

@ -5,31 +5,26 @@
#include <vx_print.h>
#include <vx_spawn.h>
#define __if(b) vx_split(b); \
if (b)
#define __else else
#define __endif vx_join();
int __attribute__ ((noinline)) check_error(const int* buffer, int offset, int size) {
int __attribute__((noinline)) check_error(const int* buffer, int offset, int size) {
int errors = 0;
for (int i = offset; i < size; i++) {
int value = buffer[i];
int ref_value = 65 + i;
if (value == ref_value) {
//vx_printf("[%d] %c\n", i, value);
//PRINTF("[%d] %c\n", i, value);
} else {
vx_printf("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
PRINTF("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
++errors;
}
}
return errors;
}
int __attribute__ ((noinline)) make_select_tmask(int tid) {
int __attribute__((noinline)) make_select_tmask(int tid) {
return (1 << tid);
}
int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int __attribute__((noinline)) make_full_tmask(int num_threads) {
return (1 << num_threads) - 1;
}
@ -39,7 +34,7 @@ int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int global_buffer[GLOBAL_MEM_SZ];
int test_global_memory() {
vx_printf("Global Memory Test\n");
PRINTF("Global Memory Test\n");
for (int i = 0; i < GLOBAL_MEM_SZ; i++) {
global_buffer[i] = 65 + i;
@ -51,7 +46,7 @@ int test_global_memory() {
///////////////////////////////////////////////////////////////////////////////
int test_stack_memory() {
vx_printf("Stack Memory Test\n");
PRINTF("Stack Memory Test\n");
static const int STACK_MEM_SZ = 8;
int stack_buffer[STACK_MEM_SZ];
@ -69,7 +64,7 @@ int test_shared_memory() {
static const int SHARED_MEM_SZ = 8;
int* shared_buffer = (int*)(STACK_BASE_ADDR-(128*4)-SHARED_MEM_SZ*4);
vx_printf("Shared Memory Test\n");
PRINTF("Shared Memory Test\n");
for (int i = 0; i < SHARED_MEM_SZ; i++) {
shared_buffer[i] = 65 + i;
@ -82,13 +77,13 @@ int test_shared_memory() {
int tmc_buffer[8];
void __attribute__ ((noinline)) do_tmc() {
void __attribute__((noinline)) do_tmc() {
unsigned tid = vx_thread_id();
tmc_buffer[tid] = 65 + tid;
}
int test_tmc() {
vx_printf("TMC Test\n");
PRINTF("TMC Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
@ -103,13 +98,13 @@ int test_tmc() {
int pred_buffer[8];
void __attribute__ ((noinline)) do_pred() {
void __attribute__((noinline)) do_pred() {
unsigned tid = vx_thread_id();
pred_buffer[tid] = 65 + tid;
}
int test_pred() {
vx_printf("PRED Test\n");
PRINTF("PRED Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
@ -138,7 +133,7 @@ void wspawn_kernel() {
}
int test_wsapwn() {
vx_printf("Wspawn Test\n");
PRINTF("Wspawn Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, wspawn_kernel);
wspawn_kernel();
@ -150,33 +145,46 @@ int test_wsapwn() {
int dvg_buffer[4];
void __attribute__ ((noinline)) do_divergence() {
unsigned tid = vx_thread_id();
__if (tid < 2) {
__if (tid < 1) {
dvg_buffer[tid] = 65;
void __attribute__((noinline)) do_divergence() {
int tid = vx_thread_id();
int cond1 = tid < 2;
int sp1 = vx_split(cond1);
if (cond1) {
{
int cond2 = tid < 1;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 65; // A
} else {
dvg_buffer[tid] = 66; // B
}
vx_join(sp2);
}
__else {
dvg_buffer[tid] = 66;
{
int cond3 = tid < 0;
int sp3 = vx_split(cond3);
if (cond3) {
dvg_buffer[tid] = 67; // C
}
vx_join(sp3);
}
} else {
{
int cond2 = tid < 3;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 67; // C
} else {
dvg_buffer[tid] = 68; // D
}
vx_join(sp2);
}
__endif
}
__else {
__if (tid < 3) {
dvg_buffer[tid] = 67;
}
__else {
dvg_buffer[tid] = 68;
}
__endif
}
__endif
vx_join(sp1);
}
int test_divergence() {
vx_printf("Control Divergence Test\n");
PRINTF("Control Divergence Test\n");
int num_threads = std::min(vx_num_threads(), 4);
int tmask = make_full_tmask(num_threads);
@ -203,7 +211,7 @@ void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
}
int test_spawn_tasks() {
vx_printf("SpawnTasks Test\n");
PRINTF("SpawnTasks Test\n");
st_args_t arg;
arg.src = st_buffer_src;
@ -232,14 +240,14 @@ void sr_kernel(const sr_args_t * arg) {
arg->buf[tid] = 65 + tid;
}
void __attribute__ ((noinline)) do_serial() {
void __attribute__((noinline)) do_serial() {
sr_args_t arg;
arg.buf = sr_buffer;
vx_serial((vx_serial_cb)sr_kernel, &arg);
}
int test_serial() {
vx_printf("Serial Test\n");
PRINTF("Serial Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
@ -253,7 +261,7 @@ int test_serial() {
int tmask_buffer[8];
int __attribute__ ((noinline)) do_tmask() {
int __attribute__((noinline)) do_tmask() {
int tid = vx_thread_id();
int tmask = make_select_tmask(tid);
int cur_tmask = vx_thread_mask();
@ -262,7 +270,7 @@ int __attribute__ ((noinline)) do_tmask() {
}
int test_tmask() {
vx_printf("Thread Mask Test\n");
PRINTF("Thread Mask Test\n");
// activate all thread to populate shared variables
vx_tmc(-1);
@ -298,7 +306,7 @@ void barrier_kernel() {
}
int test_barrier() {
vx_printf("Barrier Test\n");
PRINTF("Barrier Test\n");
int num_warps = std::min(vx_num_warps(), 8);
barrier_ctr = num_warps;
barrier_stall = 0;
@ -312,7 +320,7 @@ int test_barrier() {
int tls_buffer[8];
__thread int tls_var;
__attribute__ ((noinline)) void print_tls_var() {
__attribute__((noinline)) void print_tls_var() {
unsigned wid = vx_warp_id();
tls_buffer[wid] = 65 + tls_var;
}
@ -325,7 +333,7 @@ void tls_kernel() {
}
int test_tls() {
vx_printf("TLS Test\n");
PRINTF("TLS Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, tls_kernel);
tls_kernel();

View file

@ -1,6 +1,8 @@
#ifndef TESTS
#define TESTS
#define PRINTF vx_printf
int test_global_memory();
int test_stack_memory();