mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
split/join redesign
This commit is contained in:
parent
ebf104de1b
commit
34206598e7
23 changed files with 555 additions and 483 deletions
|
@ -37,11 +37,11 @@ then
|
|||
#make -C tests/riscv/isa run-rtlsim-64f
|
||||
|
||||
make -C sim/rtlsim clean
|
||||
CONFIGS="-DFLEN_64 -DFPU_FPNEW" make -C sim/rtlsim
|
||||
CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-64d
|
||||
|
||||
make -C sim/rtlsim clean
|
||||
CONFIGS="-DFLEN_64 -DFPU_DPI" make -C sim/rtlsim
|
||||
CONFIGS="-DEXT_D_ENABLE -DFPU_DPI" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-64d
|
||||
fi
|
||||
|
||||
|
@ -170,7 +170,7 @@ CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABL
|
|||
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-ttriangle.cgltrace -rtriangle_ref_8.png -w8 -h8" --warps=1 --threads=2 --debug=3
|
||||
CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-ttriangle.cgltrace -rtriangle_ref_8.png -w8 -h8" --warps=1 --threads=2 --debug=3
|
||||
CONFIGS="-DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-tvase.cgltrace -rvase_ref_32.png -w32 -h32" --threads=1
|
||||
CONFIGS="-DEXT_GFX_ENABLE -DIPDOM_STACK_SIZE=128" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-x -ttriangle.cgltrace -rtriangle_ref_128.png"
|
||||
CONFIGS="-DEXT_GFX_ENABLE -DPD_STACK_SIZE=128" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-x -ttriangle.cgltrace -rtriangle_ref_128.png"
|
||||
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-y -ttriangle.cgltrace -rtriangle_ref_128.png"
|
||||
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-z -ttriangle.cgltrace -rtriangle_ref_128.png"
|
||||
CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-tvase.cgltrace -rvase_ref_32.png -w32 -h32" --threads=2 || true
|
||||
|
|
|
@ -26,13 +26,6 @@
|
|||
`endif
|
||||
`endif
|
||||
|
||||
// 32 bit FLEN as default.
|
||||
`ifndef FLEN_32
|
||||
`ifndef FLEN_64
|
||||
`define FLEN_32
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
`define XLEN 64
|
||||
`endif
|
||||
|
@ -41,14 +34,6 @@
|
|||
`define XLEN 32
|
||||
`endif
|
||||
|
||||
`ifdef FLEN_64
|
||||
`define FLEN 64
|
||||
`endif
|
||||
|
||||
`ifdef FLEN_32
|
||||
`define FLEN 32
|
||||
`endif
|
||||
|
||||
`ifndef NUM_CLUSTERS
|
||||
`define NUM_CLUSTERS 1
|
||||
`endif
|
||||
|
@ -185,6 +170,20 @@
|
|||
`define EXT_F_ENABLE
|
||||
`endif
|
||||
|
||||
`ifdef EXT_D_ENABLE
|
||||
`define FLEN_64
|
||||
`else
|
||||
`define FLEN_32
|
||||
`endif
|
||||
|
||||
`ifdef FLEN_64
|
||||
`define FLEN 64
|
||||
`endif
|
||||
|
||||
`ifdef FLEN_32
|
||||
`define FLEN 32
|
||||
`endif
|
||||
|
||||
`ifdef EXT_GFX_ENABLE
|
||||
`define EXT_TEX_ENABLE
|
||||
`define EXT_RASTER_ENABLE
|
||||
|
@ -401,11 +400,6 @@
|
|||
`define LSUQ_SIZE `MAX(2, `NUM_WARPS * 2)
|
||||
`endif
|
||||
|
||||
// Size of divergence Stack
|
||||
`ifndef IPDOM_STACK_SIZE
|
||||
`define IPDOM_STACK_SIZE 32
|
||||
`endif
|
||||
|
||||
// Floating-Point Units ///////////////////////////////////////////////////////
|
||||
|
||||
// Number of FPU units
|
||||
|
|
|
@ -27,6 +27,9 @@
|
|||
|
||||
`define NR_BITS `CLOG2(`NUM_REGS)
|
||||
|
||||
`define PD_STACK_SIZE `UP(`NT_BITS)
|
||||
`define PD_STACK_SIZEW `CLOG2(`PD_STACK_SIZE)
|
||||
|
||||
`define PERF_CTR_BITS 44
|
||||
|
||||
`ifndef NDEBUG
|
||||
|
@ -215,6 +218,7 @@
|
|||
`define INST_GPU_JOIN 4'h3
|
||||
`define INST_GPU_BAR 4'h4
|
||||
`define INST_GPU_PRED 4'h5
|
||||
`define INST_GPU_IS_WCTL(op) (op <= 5)
|
||||
|
||||
`define INST_GPU_TEX 4'h6
|
||||
`define INST_GPU_RASTER 4'h7
|
||||
|
|
|
@ -17,13 +17,17 @@ typedef struct packed {
|
|||
} gpu_wspawn_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic diverged;
|
||||
logic [`NUM_THREADS-1:0] then_tmask;
|
||||
logic [`NUM_THREADS-1:0] else_tmask;
|
||||
logic [`XLEN-1:0] pc;
|
||||
logic valid;
|
||||
logic [`NUM_THREADS-1:0] taken;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`XLEN-1:0] next_pc;
|
||||
} gpu_split_t;
|
||||
|
||||
// Join request carried on the warp-control interface.
typedef struct packed {
|
||||
logic valid; // asserted by VX_wctl_unit when the executing op is INST_GPU_JOIN
|
||||
logic [`PD_STACK_SIZEW-1:0] stack_ptr; // low PD_STACK_SIZEW bits of the join's rs1 operand; the scheduler compares it against the warp's IPDOM stack write pointer
|
||||
} gpu_join_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [`NB_BITS-1:0] id;
|
||||
|
@ -32,8 +36,8 @@ typedef struct packed {
|
|||
} gpu_barrier_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`XLEN-1:0] startup_addr;
|
||||
logic [7:0] mpm_class;
|
||||
logic [`XLEN-1:0] startup_addr;
|
||||
logic [7:0] mpm_class;
|
||||
} base_dcrs_t;
|
||||
|
||||
/* verilator lint_off UNUSED */
|
||||
|
@ -265,6 +269,7 @@ endpackage
|
|||
`define GPU_TMC_BITS $bits(VX_gpu_types::gpu_tmc_t)
|
||||
`define GPU_WSPAWN_BITS $bits(VX_gpu_types::gpu_wspawn_t)
|
||||
`define GPU_SPLIT_BITS $bits(VX_gpu_types::gpu_split_t)
|
||||
`define GPU_JOIN_BITS $bits(VX_gpu_types::gpu_join_t)
|
||||
`define GPU_BARRIER_BITS $bits(VX_gpu_types::gpu_barrier_t)
|
||||
|
||||
`endif // VX_GPU_TYPES_VH
|
||||
|
|
|
@ -37,7 +37,7 @@ module VX_decode #(
|
|||
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
|
||||
reg [`XLEN-1:0] imm;
|
||||
reg use_rd, use_PC, use_imm;
|
||||
reg is_join, is_wstall;
|
||||
reg is_wstall;
|
||||
|
||||
wire [31:0] instr = fetch_if.data;
|
||||
wire [6:0] opcode = instr[6:0];
|
||||
|
@ -137,7 +137,6 @@ module VX_decode #(
|
|||
use_imm = 0;
|
||||
use_PC = 0;
|
||||
use_rd = 0;
|
||||
is_join = 0;
|
||||
is_wstall = 0;
|
||||
|
||||
case (opcode)
|
||||
|
@ -437,11 +436,14 @@ module VX_decode #(
|
|||
3'h2: begin // SPLIT
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
3'h3: begin // JOIN
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
|
||||
is_join = 1;
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h4: begin // BAR
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
|
||||
|
@ -551,7 +553,6 @@ module VX_decode #(
|
|||
assign decode_sched_if.valid = fetch_fire;
|
||||
assign decode_sched_if.wid = fetch_if.wid;
|
||||
assign decode_sched_if.is_wstall = is_wstall;
|
||||
assign decode_sched_if.is_join = is_join;
|
||||
|
||||
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
|
||||
assign fetch_if.ready = decode_if.ready;
|
||||
|
|
|
@ -194,10 +194,10 @@ module VX_execute #(
|
|||
.reset (fpu_reset),
|
||||
.fpu_exe_if (fpu_exe_if),
|
||||
.fpu_bus_if (fpu_bus_if),
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
.csr_pending (csr_pending),
|
||||
.req_pending (fpu_pending)
|
||||
.req_pending (fpu_pending),
|
||||
.commit_if (fpu_commit_if)
|
||||
);
|
||||
`endif
|
||||
|
||||
|
|
|
@ -44,13 +44,11 @@ module VX_gpu_unit #(
|
|||
|
||||
localparam UUID_WIDTH = `UP(`UUID_BITS);
|
||||
localparam NW_WIDTH = `UP(`NW_BITS);
|
||||
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
|
||||
localparam RSP_DATAW = `MAX(`NUM_THREADS * `XLEN, WCTL_DATAW);
|
||||
localparam RSP_ARB_DATAW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + RSP_DATAW + 1 + 1;
|
||||
localparam RSP_ARB_DATAW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + (`NUM_THREADS * `XLEN) + `NR_BITS + 1 + `XLEN + 1;
|
||||
localparam RSP_ARB_SIZE = 1 + `EXT_TEX_ENABLED + `EXT_RASTER_ENABLED + `EXT_ROP_ENABLED + `EXT_IMADD_ENABLED;
|
||||
|
||||
localparam RSP_ARB_IDX_GPU = 0;
|
||||
localparam RSP_ARB_IDX_RASTER = RSP_ARB_IDX_GPU + 1;
|
||||
localparam RSP_ARB_IDX_WCTL = 0;
|
||||
localparam RSP_ARB_IDX_RASTER = RSP_ARB_IDX_WCTL + 1;
|
||||
localparam RSP_ARB_IDX_ROP = RSP_ARB_IDX_RASTER + `EXT_RASTER_ENABLED;
|
||||
localparam RSP_ARB_IDX_TEX = RSP_ARB_IDX_ROP + `EXT_ROP_ENABLED;
|
||||
localparam RSP_ARB_IDX_IMADD = RSP_ARB_IDX_TEX + `EXT_TEX_ENABLED;
|
||||
|
@ -63,89 +61,46 @@ module VX_gpu_unit #(
|
|||
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
||||
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
||||
|
||||
wire [RSP_DATAW-1:0] rsp_data;
|
||||
wire rsp_is_wctl;
|
||||
|
||||
wire gpu_req_valid;
|
||||
reg gpu_req_ready;
|
||||
|
||||
wire csr_ready = ~csr_pending;
|
||||
assign gpu_req_valid = gpu_exe_if.valid && csr_ready;
|
||||
|
||||
// Warp control block
|
||||
|
||||
gpu_tmc_t tmc;
|
||||
gpu_wspawn_t wspawn;
|
||||
gpu_barrier_t barrier;
|
||||
gpu_split_t split;
|
||||
|
||||
wire is_wspawn = (gpu_exe_if.op_type == `INST_GPU_WSPAWN);
|
||||
wire is_tmc = (gpu_exe_if.op_type == `INST_GPU_TMC);
|
||||
wire is_split = (gpu_exe_if.op_type == `INST_GPU_SPLIT);
|
||||
wire is_join = (gpu_exe_if.op_type == `INST_GPU_JOIN);
|
||||
wire is_bar = (gpu_exe_if.op_type == `INST_GPU_BAR);
|
||||
wire is_pred = (gpu_exe_if.op_type == `INST_GPU_PRED);
|
||||
|
||||
wire [`XLEN-1:0] rs1_data = gpu_exe_if.rs1_data[gpu_exe_if.tid];
|
||||
wire [`XLEN-1:0] rs2_data = gpu_exe_if.rs2_data[gpu_exe_if.tid];
|
||||
|
||||
wire [`NUM_THREADS-1:0] taken_tmask;
|
||||
wire [`NUM_THREADS-1:0] not_taken_tmask;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
wire taken = (gpu_exe_if.rs1_data[i] != 0);
|
||||
assign taken_tmask[i] = gpu_exe_if.tmask[i] && taken;
|
||||
assign not_taken_tmask[i] = gpu_exe_if.tmask[i] && ~taken;
|
||||
end
|
||||
|
||||
// tmc
|
||||
|
||||
wire [`NUM_THREADS-1:0] pred_mask = (taken_tmask != 0) ? taken_tmask : gpu_exe_if.tmask;
|
||||
|
||||
assign tmc.valid = is_tmc || is_pred;
|
||||
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
|
||||
|
||||
// wspawn
|
||||
|
||||
wire [`XLEN-1:0] wspawn_pc = rs2_data;
|
||||
wire [`NUM_WARPS-1:0] wspawn_wmask;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
assign wspawn_wmask[i] = (i < rs1_data);
|
||||
end
|
||||
assign wspawn.valid = is_wspawn;
|
||||
assign wspawn.wmask = wspawn_wmask;
|
||||
assign wspawn.pc = wspawn_pc;
|
||||
|
||||
// split
|
||||
|
||||
assign split.valid = is_split;
|
||||
assign split.diverged = (| taken_tmask) && (| not_taken_tmask);
|
||||
assign split.then_tmask = taken_tmask;
|
||||
assign split.else_tmask = not_taken_tmask;
|
||||
assign split.pc = gpu_exe_if.next_PC;
|
||||
|
||||
// barrier
|
||||
|
||||
assign barrier.valid = is_bar;
|
||||
assign barrier.id = rs1_data[`NB_BITS-1:0];
|
||||
assign barrier.is_global = rs1_data[31];
|
||||
assign barrier.size_m1 = $bits(barrier.size_m1)'(rs2_data - 1);
|
||||
|
||||
// Warp control response
|
||||
wire wctl_req_valid = gpu_req_valid && (is_wspawn || is_tmc || is_split || is_join || is_bar || is_pred);
|
||||
wire wctl_rsp_valid = wctl_req_valid;
|
||||
wire [WCTL_DATAW-1:0] wctl_rsp_data = {tmc, wspawn, split, barrier};
|
||||
wire wctl_rsp_ready;
|
||||
wire wctl_req_ready = wctl_rsp_ready;
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_GPU] = wctl_rsp_valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_GPU] = {gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, `NR_BITS'(0), 1'b0, RSP_DATAW'(wctl_rsp_data), 1'b1, ~is_join};
|
||||
assign wctl_rsp_ready = rsp_arb_ready_in[RSP_ARB_IDX_GPU];
|
||||
|
||||
`UNUSED_VAR (gpu_exe_if.op_mod)
|
||||
`UNUSED_VAR (gpu_exe_if.rs3_data)
|
||||
`UNUSED_VAR (gpu_exe_if.wb)
|
||||
`UNUSED_VAR (gpu_exe_if.rd)
|
||||
|
||||
// Warp control block
|
||||
|
||||
VX_gpu_exe_if wctl_exe_if();
|
||||
VX_commit_if wctl_commit_if();
|
||||
|
||||
assign wctl_exe_if.valid = gpu_req_valid && `INST_GPU_IS_WCTL(gpu_exe_if.op_type);
|
||||
assign wctl_exe_if.op_type = gpu_exe_if.op_type;
|
||||
assign wctl_exe_if.uuid = gpu_exe_if.uuid;
|
||||
assign wctl_exe_if.wid = gpu_exe_if.wid;
|
||||
assign wctl_exe_if.tmask = gpu_exe_if.tmask;
|
||||
assign wctl_exe_if.tid = gpu_exe_if.tid;
|
||||
assign wctl_exe_if.PC = gpu_exe_if.PC;
|
||||
assign wctl_exe_if.next_PC = gpu_exe_if.next_PC;
|
||||
assign wctl_exe_if.rd = gpu_exe_if.rd;
|
||||
assign wctl_exe_if.wb = gpu_exe_if.wb;
|
||||
assign wctl_exe_if.rs1_data = gpu_exe_if.rs1_data;
|
||||
assign wctl_exe_if.rs2_data = gpu_exe_if.rs2_data;
|
||||
|
||||
VX_wctl_unit #(
|
||||
.OUTPUT_REG (RSP_ARB_SIZE > 1)
|
||||
) wctl_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.gpu_exe_if (wctl_exe_if),
|
||||
.warp_ctl_if(warp_ctl_if),
|
||||
.commit_if (wctl_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = {wctl_commit_if.uuid, wctl_commit_if.wid, wctl_commit_if.tmask, wctl_commit_if.PC, wctl_commit_if.rd, wctl_commit_if.wb, wctl_commit_if.data, 1'b1};
|
||||
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
|
||||
|
@ -171,16 +126,16 @@ module VX_gpu_unit #(
|
|||
VX_tex_agent #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tex_agent (
|
||||
.clk (clk),
|
||||
.reset (tex_reset),
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.tex_exe_if (tex_exe_if),
|
||||
.tex_commit_if (tex_commit_if),
|
||||
.tex_bus_if (tex_bus_if)
|
||||
.clk (clk),
|
||||
.reset (tex_reset),
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.tex_exe_if (tex_exe_if),
|
||||
.tex_bus_if (tex_bus_if),
|
||||
.commit_if (tex_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_TEX] = tex_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_TEX] = {tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, tex_commit_if.wb, RSP_DATAW'(tex_commit_if.data), tex_commit_if.eop, 1'b0};
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_TEX] = {tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, tex_commit_if.wb, tex_commit_if.data, tex_commit_if.eop};
|
||||
assign tex_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_TEX];
|
||||
|
||||
`endif
|
||||
|
@ -202,16 +157,16 @@ module VX_gpu_unit #(
|
|||
VX_raster_agent #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) raster_agent (
|
||||
.clk (clk),
|
||||
.reset (raster_reset),
|
||||
.raster_csr_if (raster_csr_if),
|
||||
.raster_bus_if (raster_bus_if),
|
||||
.raster_exe_if (raster_exe_if),
|
||||
.raster_commit_if (raster_commit_if)
|
||||
.clk (clk),
|
||||
.reset (raster_reset),
|
||||
.raster_csr_if (raster_csr_if),
|
||||
.raster_bus_if (raster_bus_if),
|
||||
.raster_exe_if (raster_exe_if),
|
||||
.commit_if (raster_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_RASTER] = raster_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_RASTER] = {raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, raster_commit_if.wb, RSP_DATAW'(raster_commit_if.data), raster_commit_if.eop, 1'b0};
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_RASTER] = {raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, raster_commit_if.wb, raster_commit_if.data, raster_commit_if.eop};
|
||||
assign raster_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_RASTER];
|
||||
|
||||
`endif
|
||||
|
@ -240,16 +195,16 @@ module VX_gpu_unit #(
|
|||
VX_rop_agent #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) rop_agent (
|
||||
.clk (clk),
|
||||
.reset (rop_reset),
|
||||
.rop_csr_if (rop_csr_if),
|
||||
.rop_exe_if (rop_exe_if),
|
||||
.rop_commit_if (rop_commit_if),
|
||||
.rop_bus_if (rop_bus_if)
|
||||
.clk (clk),
|
||||
.reset (rop_reset),
|
||||
.rop_csr_if (rop_csr_if),
|
||||
.rop_exe_if (rop_exe_if),
|
||||
.rop_bus_if (rop_bus_if),
|
||||
.commit_if (rop_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_ROP] = rop_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_ROP] = {rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC, rop_commit_if.rd, rop_commit_if.wb, RSP_DATAW'(rop_commit_if.data), rop_commit_if.eop, 1'b0};
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_ROP] = {rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC, rop_commit_if.rd, rop_commit_if.wb, rop_commit_if.data, rop_commit_if.eop};
|
||||
assign rop_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_ROP];
|
||||
|
||||
`endif
|
||||
|
@ -311,13 +266,11 @@ module VX_gpu_unit #(
|
|||
end
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_IMADD] = imadd_valid_out;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_IMADD] = {imadd_uuid_out, imadd_wid_out, imadd_tmask_out, imadd_PC_out, imadd_rd_out, 1'b1, RSP_DATAW'(imadd_data_out_x), 1'b1, 1'b0};
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_IMADD] = {imadd_uuid_out, imadd_wid_out, imadd_tmask_out, imadd_PC_out, imadd_rd_out, 1'b1, imadd_data_out_x, 1'b1};
|
||||
assign imadd_ready_out = rsp_arb_ready_in[RSP_ARB_IDX_IMADD];
|
||||
|
||||
`endif
|
||||
|
||||
|
||||
|
||||
// can accept new request?
|
||||
|
||||
always @(*) begin
|
||||
|
@ -334,7 +287,7 @@ module VX_gpu_unit #(
|
|||
`ifdef EXT_IMADD_ENABLE
|
||||
`INST_GPU_IMADD: gpu_req_ready = imadd_ready_in;
|
||||
`endif
|
||||
default: gpu_req_ready = wctl_req_ready;
|
||||
default: gpu_req_ready = wctl_exe_if.ready;
|
||||
endcase
|
||||
end
|
||||
assign gpu_exe_if.ready = gpu_req_ready && csr_ready;
|
||||
|
@ -352,21 +305,13 @@ module VX_gpu_unit #(
|
|||
.valid_in (rsp_arb_valid_in),
|
||||
.ready_in (rsp_arb_ready_in),
|
||||
.data_in (rsp_arb_data_in),
|
||||
.data_out ({gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data, gpu_commit_if.eop, rsp_is_wctl}),
|
||||
.data_out ({gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, gpu_commit_if.data, gpu_commit_if.eop}),
|
||||
.valid_out (gpu_commit_if.valid),
|
||||
.ready_out (gpu_commit_if.ready)
|
||||
);
|
||||
|
||||
assign gpu_commit_if.data = rsp_data[(`NUM_THREADS * `XLEN)-1:0];
|
||||
|
||||
// warp control response
|
||||
|
||||
wire gpu_req_fire = gpu_exe_if.valid && gpu_exe_if.ready;
|
||||
wire gpu_commit_fire = gpu_commit_if.valid && gpu_commit_if.ready;
|
||||
|
||||
assign warp_ctl_if.valid = gpu_commit_fire && rsp_is_wctl;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data[WCTL_DATAW-1:0];
|
||||
|
||||
// pending request
|
||||
|
||||
|
|
|
@ -1,32 +1,32 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
module VX_ipdom #(
|
||||
module VX_ipdom_stack #(
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1
|
||||
parameter DEPTH = 1,
|
||||
parameter ADDRW = $clog2(DEPTH)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire pair,
|
||||
input wire [WIDTH - 1:0] q0,
|
||||
input wire [WIDTH - 1:0] q1,
|
||||
input wire [WIDTH - 1:0] q2,
|
||||
output wire [WIDTH - 1:0] d,
|
||||
output wire d_idx,
|
||||
output wire [ADDRW-1:0] q_ptr,
|
||||
output wire [ADDRW-1:0] d_ptr,
|
||||
input wire push,
|
||||
input wire pop,
|
||||
output wire index,
|
||||
input wire pop,
|
||||
output wire empty,
|
||||
output wire full
|
||||
);
|
||||
`STATIC_ASSERT(`ISPOW2(DEPTH), ("depth must be a power of 2!"))
|
||||
`STATIC_ASSERT(`ISPOW2(DEPTH), ("depth must be a power of 2!"))
|
||||
|
||||
localparam ADDRW = $clog2(DEPTH);
|
||||
|
||||
reg is_part [DEPTH-1:0];
|
||||
reg slot_idx [DEPTH-1:0];
|
||||
|
||||
reg [ADDRW-1:0] rd_ptr, wr_ptr;
|
||||
|
||||
reg empty_r, full_r;
|
||||
|
||||
wire [WIDTH-1:0] d1, d2;
|
||||
wire [WIDTH-1:0] d0, d1;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -44,9 +44,9 @@ module VX_ipdom #(
|
|||
empty_r <= 0;
|
||||
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
|
||||
end else if (pop) begin
|
||||
wr_ptr <= wr_ptr - ADDRW'(is_part[rd_ptr]);
|
||||
rd_ptr <= rd_ptr - ADDRW'(is_part[rd_ptr]);
|
||||
empty_r <= is_part[rd_ptr] && (0 == rd_ptr);
|
||||
wr_ptr <= wr_ptr - ADDRW'(d_idx);
|
||||
rd_ptr <= rd_ptr - ADDRW'(d_idx);
|
||||
empty_r <= (rd_ptr == 0) && (d_idx == 1);
|
||||
full_r <= 0;
|
||||
end
|
||||
end
|
||||
|
@ -61,21 +61,23 @@ module VX_ipdom #(
|
|||
.write (push),
|
||||
`UNUSED_PIN (wren),
|
||||
.waddr (wr_ptr),
|
||||
.wdata ({q2, q1}),
|
||||
.wdata ({q1, q0}),
|
||||
.raddr (rd_ptr),
|
||||
.rdata ({d2, d1})
|
||||
.rdata ({d1, d0})
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (push) begin
|
||||
is_part[wr_ptr] <= ~pair;
|
||||
slot_idx[wr_ptr] <= 0;
|
||||
end else if (pop) begin
|
||||
is_part[rd_ptr] <= 1;
|
||||
slot_idx[rd_ptr] <= 1;
|
||||
end
|
||||
end
|
||||
|
||||
assign index = is_part[rd_ptr];
|
||||
assign d = index ? d1 : d2;
|
||||
assign d = d_idx ? d1 : d0;
|
||||
assign d_idx = slot_idx[rd_ptr];
|
||||
assign d_ptr = rd_ptr;
|
||||
assign q_ptr = wr_ptr;
|
||||
assign empty = empty_r;
|
||||
assign full = full_r;
|
||||
|
|
@ -34,10 +34,6 @@ module VX_schedule #(
|
|||
localparam NC_WIDTH = `UP(`NC_BITS);
|
||||
localparam NW_WIDTH = `UP(`NW_BITS);
|
||||
|
||||
wire join_else;
|
||||
wire [`XLEN-1:0] join_pc;
|
||||
wire [`NUM_THREADS-1:0] join_tmask;
|
||||
|
||||
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
|
||||
reg [`NUM_WARPS-1:0] stalled_warps; // set when branch/gpgpu instructions are issued
|
||||
|
||||
|
@ -63,6 +59,14 @@ module VX_schedule #(
|
|||
wire schedule_valid;
|
||||
wire schedule_ready;
|
||||
|
||||
// split/join
|
||||
wire split_is_divergent;
|
||||
wire [`NUM_THREADS-1:0] split_tmask0;
|
||||
wire join_is_divergent;
|
||||
wire join_is_else;
|
||||
wire [`NUM_THREADS-1:0] join_tmask;
|
||||
wire [`XLEN-1:0] join_pc;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] cycles;
|
||||
|
||||
reg [`NUM_WARPS-1:0][UUID_WIDTH-1:0] issued_instrs;
|
||||
|
@ -103,11 +107,14 @@ module VX_schedule #(
|
|||
thread_masks[0] <= 1;
|
||||
end else begin
|
||||
// join handling
|
||||
if (decode_sched_if.valid && decode_sched_if.is_join) begin
|
||||
if (join_else) begin
|
||||
warp_pcs[decode_sched_if.wid] <= `XLEN'(join_pc);
|
||||
if (warp_ctl_if.valid && warp_ctl_if.sjoin.valid) begin
|
||||
stalled_warps[warp_ctl_if.wid] <= 0;
|
||||
if (join_is_divergent) begin
|
||||
if (join_is_else) begin
|
||||
warp_pcs[warp_ctl_if.wid] <= `XLEN'(join_pc);
|
||||
end
|
||||
thread_masks[warp_ctl_if.wid] <= join_tmask;
|
||||
end
|
||||
thread_masks[decode_sched_if.wid] <= join_tmask;
|
||||
end
|
||||
|
||||
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
|
||||
|
@ -145,8 +152,8 @@ module VX_schedule #(
|
|||
// split handling
|
||||
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
|
||||
stalled_warps[warp_ctl_if.wid] <= 0;
|
||||
if (warp_ctl_if.split.diverged) begin
|
||||
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_tmask;
|
||||
if (split_is_divergent) begin
|
||||
thread_masks[warp_ctl_if.wid] <= split_tmask0;
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -216,47 +223,62 @@ module VX_schedule #(
|
|||
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
|
||||
assign gbar_bus_if.req_core_id = NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
|
||||
// split/join stack management
|
||||
// split/join handling
|
||||
|
||||
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
|
||||
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
|
||||
wire [`PD_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
|
||||
wire ipdom_index [`NUM_WARPS-1:0];
|
||||
|
||||
wire [`NUM_THREADS-1:0] then_tmask;
|
||||
wire [`NUM_THREADS-1:0] else_tmask;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign then_tmask[i] = warp_ctl_if.split.tmask[i] && warp_ctl_if.split.taken[i];
|
||||
assign else_tmask[i] = warp_ctl_if.split.tmask[i] && ~warp_ctl_if.split.taken[i];
|
||||
end
|
||||
|
||||
wire [`CLOG2(`NUM_THREADS+1)-1:0] then_tmask_cnt, else_tmask_cnt;
|
||||
`POP_COUNT(then_tmask_cnt, then_tmask);
|
||||
`POP_COUNT(else_tmask_cnt, else_tmask);
|
||||
wire then_first = (then_tmask_cnt >= else_tmask_cnt);
|
||||
|
||||
assign split_is_divergent = (then_tmask != 0) && (else_tmask != 0);
|
||||
assign split_tmask0 = then_first ? then_tmask : else_tmask;
|
||||
assign warp_ctl_if.split_ret = ipdom_q_ptr[warp_ctl_if.wid];
|
||||
|
||||
assign join_is_divergent = (warp_ctl_if.sjoin.stack_ptr != ipdom_q_ptr[warp_ctl_if.wid]);
|
||||
assign {join_pc, join_tmask} = ipdom_data[warp_ctl_if.wid];
|
||||
assign join_is_else = (ipdom_index[warp_ctl_if.wid] == 0);
|
||||
|
||||
wire [`NUM_THREADS-1:0] split_tmask1 = then_first ? else_tmask : then_tmask;
|
||||
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q0 = {warp_ctl_if.split.next_pc, split_tmask1};
|
||||
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q1 = {`XLEN'(0), warp_ctl_if.split.tmask};
|
||||
|
||||
wire ipdom_push = warp_ctl_if.valid && warp_ctl_if.split.valid && split_is_divergent;
|
||||
wire ipdom_pop = warp_ctl_if.valid && warp_ctl_if.sjoin.valid && join_is_divergent;
|
||||
|
||||
`RESET_RELAY (ipdom_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
wire push = warp_ctl_if.valid
|
||||
&& warp_ctl_if.split.valid
|
||||
&& (i == warp_ctl_if.wid);
|
||||
|
||||
wire pop = decode_sched_if.valid && decode_sched_if.is_join && (i == decode_sched_if.wid);
|
||||
|
||||
wire [`NUM_THREADS-1:0] else_tmask = warp_ctl_if.split.else_tmask;
|
||||
wire [`NUM_THREADS-1:0] orig_tmask = thread_masks[warp_ctl_if.wid];
|
||||
|
||||
wire [(`XLEN+`NUM_THREADS)-1:0] q_else = {warp_ctl_if.split.pc, else_tmask};
|
||||
wire [(`XLEN+`NUM_THREADS)-1:0] q_end = {`XLEN'(0), orig_tmask};
|
||||
|
||||
VX_ipdom #(
|
||||
VX_ipdom_stack #(
|
||||
.WIDTH (`XLEN+`NUM_THREADS),
|
||||
.DEPTH (`IPDOM_STACK_SIZE)
|
||||
) ipdom (
|
||||
.DEPTH (`PD_STACK_SIZE)
|
||||
) ipdom_stack (
|
||||
.clk (clk),
|
||||
.reset (ipdom_reset),
|
||||
.push (push),
|
||||
.pop (pop),
|
||||
.pair (warp_ctl_if.split.diverged),
|
||||
.q1 (q_end),
|
||||
.q2 (q_else),
|
||||
.push (ipdom_push && (i == warp_ctl_if.wid)),
|
||||
.pop (ipdom_pop && (i == warp_ctl_if.wid)),
|
||||
.q0 (ipdom_q0),
|
||||
.q1 (ipdom_q1),
|
||||
.d (ipdom_data[i]),
|
||||
.index (ipdom_index[i]),
|
||||
.d_idx (ipdom_index[i]),
|
||||
.q_ptr (ipdom_q_ptr[i]),
|
||||
`UNUSED_PIN (d_ptr),
|
||||
`UNUSED_PIN (empty),
|
||||
`UNUSED_PIN (full)
|
||||
);
|
||||
end
|
||||
|
||||
assign {join_pc, join_tmask} = ipdom_data[decode_sched_if.wid];
|
||||
assign join_else = ~ipdom_index[decode_sched_if.wid];
|
||||
|
||||
// schedule the next ready warp
|
||||
|
||||
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
|
||||
|
|
108
hw/rtl/core/VX_wctl_unit.sv
Normal file
108
hw/rtl/core/VX_wctl_unit.sv
Normal file
|
@ -0,0 +1,108 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Warp-control unit.
//
// Decodes the warp-control operations (TMC, WSPAWN, SPLIT, JOIN, BAR, PRED)
// arriving on gpu_exe_if, publishes the matching request on warp_ctl_if in
// the cycle the instruction is accepted, and emits a commit response whose
// data is the value returned on warp_ctl_if.split_ret.
//
// Parameters:
//   OUTPUT_REG - when 0 the response skid buffer is a pass-through;
//                any other value enables the buffer.
module VX_wctl_unit #(
    parameter OUTPUT_REG = 0
) (
    input wire              clk,
    input wire              reset,

    // Inputs
    VX_gpu_exe_if.slave     gpu_exe_if,

    // Outputs
    VX_warp_ctl_if.master   warp_ctl_if,
    VX_commit_if.master     commit_if
);

    localparam UUID_WIDTH = `UP(`UUID_BITS);
    localparam NW_WIDTH   = `UP(`NW_BITS);

    gpu_tmc_t       tmc;
    gpu_wspawn_t    wspawn;
    gpu_split_t     split;
    gpu_join_t      sjoin;
    gpu_barrier_t   barrier;

    // Scalar operands are read from the lane selected by gpu_exe_if.tid.
    wire [`XLEN-1:0] rs1_data = gpu_exe_if.rs1_data[gpu_exe_if.tid];
    wire [`XLEN-1:0] rs2_data = gpu_exe_if.rs2_data[gpu_exe_if.tid];

    // Per-thread predicate: bit 0 of each lane's rs1 value.
    wire [`NUM_THREADS-1:0] taken;
    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        assign taken[i] = gpu_exe_if.rs1_data[i][0];
    end

    wire is_wspawn = (gpu_exe_if.op_type == `INST_GPU_WSPAWN);
    wire is_tmc    = (gpu_exe_if.op_type == `INST_GPU_TMC);
    wire is_pred   = (gpu_exe_if.op_type == `INST_GPU_PRED);
    wire is_split  = (gpu_exe_if.op_type == `INST_GPU_SPLIT);
    wire is_join   = (gpu_exe_if.op_type == `INST_GPU_JOIN);
    wire is_bar    = (gpu_exe_if.op_type == `INST_GPU_BAR);

    // The warp-control request is valid only in the cycle the instruction
    // handshakes (valid && ready).
    assign warp_ctl_if.valid   = gpu_exe_if.valid && gpu_exe_if.ready;
    assign warp_ctl_if.wid     = gpu_exe_if.wid;
    assign warp_ctl_if.tmc     = tmc;
    assign warp_ctl_if.wspawn  = wspawn;
    assign warp_ctl_if.split   = split;
    assign warp_ctl_if.sjoin   = sjoin;
    assign warp_ctl_if.barrier = barrier;

    // tmc

    // PRED keeps the active threads whose predicate is set; if none is set
    // the current thread mask is left unchanged.
    wire [`NUM_THREADS-1:0] then_tmask = gpu_exe_if.tmask & taken;
    wire [`NUM_THREADS-1:0] pred_mask  = (then_tmask != 0) ? then_tmask : gpu_exe_if.tmask;

    assign tmc.valid = is_tmc || is_pred;
    assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];

    // wspawn

    // Warp i is included in the mask when i is below the count in rs1[31:0];
    // rs2 supplies the PC.
    wire [`XLEN-1:0] wspawn_pc = rs2_data;
    wire [`NUM_WARPS-1:0] wspawn_wmask;
    for (genvar i = 0; i < `NUM_WARPS; ++i) begin
        assign wspawn_wmask[i] = (i < rs1_data[31:0]);
    end
    assign wspawn.valid = is_wspawn;
    assign wspawn.wmask = wspawn_wmask;
    assign wspawn.pc    = wspawn_pc;

    // split

    assign split.valid   = is_split;
    assign split.taken   = taken;
    assign split.tmask   = gpu_exe_if.tmask;
    assign split.next_pc = gpu_exe_if.next_PC;

    // join

    // The join operand is truncated to the stack-pointer width.
    assign sjoin.valid     = is_join;
    assign sjoin.stack_ptr = `PD_STACK_SIZEW'(rs1_data);

    // barrier

    // rs1 carries the barrier id (low bits) and the global flag (bit 31);
    // rs2[31:0] minus one is sent as size_m1.
    assign barrier.valid     = is_bar;
    assign barrier.id        = rs1_data[`NB_BITS-1:0];
    assign barrier.is_global = rs1_data[31];
    assign barrier.size_m1   = $bits(barrier.size_m1)'(rs2_data[31:0] - 1);

    // response

    wire [`PD_STACK_SIZEW-1:0] rsp_data;

    VX_skid_buffer #(
        .DATAW    (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + `PD_STACK_SIZEW),
        .PASSTHRU (OUTPUT_REG == 0)
    ) rsp_sbuf (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (gpu_exe_if.valid),
        .ready_in  (gpu_exe_if.ready),
        .data_in   ({gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, gpu_exe_if.rd, gpu_exe_if.wb, warp_ctl_if.split_ret}),
        .data_out  ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_if.wb, rsp_data}),
        .valid_out (commit_if.valid),
        .ready_out (commit_if.ready)
    );

    // Every lane receives the same response value, widened to XLEN.
    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        assign commit_if.data[i] = `XLEN'(rsp_data);
    end

    // A warp-control response is always a complete packet. Drive eop so this
    // master-side output is never left undriven.
    assign commit_if.eop = 1'b1;

endmodule
|
|
@ -13,12 +13,13 @@ module VX_fpu_agent #(
|
|||
|
||||
VX_fpu_exe_if.slave fpu_exe_if,
|
||||
VX_fpu_to_csr_if.master fpu_to_csr_if,
|
||||
VX_commit_if.master fpu_commit_if,
|
||||
|
||||
|
||||
VX_fpu_bus_if.master fpu_bus_if,
|
||||
|
||||
input wire csr_pending,
|
||||
output wire req_pending
|
||||
output wire req_pending,
|
||||
|
||||
VX_commit_if.master commit_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
|
@ -115,18 +116,18 @@ module VX_fpu_agent #(
|
|||
.reset (reset),
|
||||
.valid_in (fpu_bus_if.rsp_valid),
|
||||
.ready_in (fpu_bus_if.rsp_ready),
|
||||
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, fpu_bus_if.rsp_result}),
|
||||
.data_out ({fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.data}),
|
||||
.valid_out (fpu_commit_if.valid),
|
||||
.ready_out (fpu_commit_if.ready)
|
||||
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, fpu_bus_if.rsp_result}),
|
||||
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_if.data}),
|
||||
.valid_out (commit_if.valid),
|
||||
.ready_out (commit_if.ready)
|
||||
);
|
||||
|
||||
assign fpu_commit_if.wb = 1'b1;
|
||||
assign fpu_commit_if.eop = 1'b1;
|
||||
assign commit_if.wb = 1'b1;
|
||||
assign commit_if.eop = 1'b1;
|
||||
|
||||
// pending request
|
||||
|
||||
wire fpu_commit_fire = fpu_commit_if.valid && fpu_commit_if.ready;
|
||||
wire fpu_commit_fire = commit_if.valid && commit_if.ready;
|
||||
|
||||
reg req_pending_r;
|
||||
always @(posedge clk) begin
|
||||
|
|
|
@ -4,20 +4,17 @@ interface VX_decode_sched_if ();
|
|||
|
||||
wire valid;
|
||||
wire is_wstall;
|
||||
wire is_join;
|
||||
wire [`UP(`NW_BITS)-1:0] wid;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output is_wstall,
|
||||
output is_join,
|
||||
output wid
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input is_wstall,
|
||||
input is_join,
|
||||
input wid
|
||||
);
|
||||
|
||||
|
|
|
@ -11,25 +11,31 @@ interface VX_warp_ctl_if ();
|
|||
wire [`UP(`NW_BITS)-1:0] wid;
|
||||
gpu_tmc_t tmc;
|
||||
gpu_wspawn_t wspawn;
|
||||
gpu_barrier_t barrier;
|
||||
gpu_split_t split;
|
||||
gpu_join_t sjoin;
|
||||
gpu_barrier_t barrier;
|
||||
wire [`PD_STACK_SIZEW-1:0] split_ret;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output wid,
|
||||
output tmc,
|
||||
output wspawn,
|
||||
output tmc,
|
||||
output split,
|
||||
output sjoin,
|
||||
output barrier,
|
||||
output split
|
||||
input split_ret
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input wid,
|
||||
input tmc,
|
||||
input wspawn,
|
||||
input barrier,
|
||||
input split
|
||||
input valid,
|
||||
input wid,
|
||||
input wspawn,
|
||||
input tmc,
|
||||
input split,
|
||||
input sjoin,
|
||||
input barrier,
|
||||
output split_ret
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
|
|
@ -10,9 +10,9 @@ module VX_raster_agent #(
|
|||
VX_raster_exe_if.slave raster_exe_if,
|
||||
VX_raster_bus_if.slave raster_bus_if,
|
||||
|
||||
// Outputs
|
||||
VX_commit_if.master raster_commit_if,
|
||||
VX_gpu_csr_if.slave raster_csr_if
|
||||
// Outputs
|
||||
VX_gpu_csr_if.slave raster_csr_if,
|
||||
VX_commit_if.master commit_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
|
@ -41,7 +41,7 @@ module VX_raster_agent #(
|
|||
);
|
||||
|
||||
// it is possible to have ready = f(valid) when using arbiters,
|
||||
// because of that we need to decouple raster_exe_if and raster_commit_if handshake with a pipe register
|
||||
// because of that we need to decouple raster_exe_if and commit_if handshake with a pipe register
|
||||
|
||||
assign raster_exe_if.ready = raster_bus_if.req_valid && raster_rsp_ready;
|
||||
|
||||
|
@ -62,18 +62,18 @@ module VX_raster_agent #(
|
|||
.reset (reset),
|
||||
.valid_in (raster_rsp_valid),
|
||||
.ready_in (raster_rsp_ready),
|
||||
.data_in ({raster_exe_if.uuid, raster_exe_if.wid, raster_exe_if.tmask, raster_exe_if.PC, raster_exe_if.rd, response_data}),
|
||||
.data_out ({raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, commit_data}),
|
||||
.valid_out (raster_commit_if.valid),
|
||||
.ready_out (raster_commit_if.ready)
|
||||
.data_in ({raster_exe_if.uuid, raster_exe_if.wid, raster_exe_if.tmask, raster_exe_if.PC, raster_exe_if.rd, response_data}),
|
||||
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_data}),
|
||||
.valid_out (commit_if.valid),
|
||||
.ready_out (commit_if.ready)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign raster_commit_if.data[i] = `XLEN'(commit_data[i]);
|
||||
assign commit_if.data[i] = `XLEN'(commit_data[i]);
|
||||
end
|
||||
|
||||
assign raster_commit_if.wb = 1'b1;
|
||||
assign raster_commit_if.eop = 1'b1;
|
||||
assign commit_if.wb = 1'b1;
|
||||
assign commit_if.eop = 1'b1;
|
||||
|
||||
`ifdef DBG_TRACE_RASTER
|
||||
always @(posedge clk) begin
|
||||
|
|
|
@ -11,8 +11,8 @@ module VX_rop_agent #(
|
|||
VX_gpu_csr_if.slave rop_csr_if,
|
||||
|
||||
// Outputs
|
||||
VX_commit_if.master rop_commit_if,
|
||||
VX_rop_bus_if.master rop_bus_if
|
||||
VX_rop_bus_if.master rop_bus_if,
|
||||
VX_commit_if.master commit_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
|
@ -42,7 +42,7 @@ module VX_rop_agent #(
|
|||
wire rop_rsp_valid, rop_rsp_ready;
|
||||
|
||||
// it is possible to have ready = f(valid) when using arbiters,
|
||||
// because of that we need to decouple rop_exe_if and rop_commit_if handshake with a pipe register
|
||||
// because of that we need to decouple rop_exe_if and commit_if handshake with a pipe register
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (UUID_WIDTH + `NUM_THREADS * (1 + 2 * `VX_ROP_DIM_BITS + 32 + `VX_ROP_DEPTH_BITS + 1)),
|
||||
|
@ -69,16 +69,16 @@ module VX_rop_agent #(
|
|||
.reset (reset),
|
||||
.valid_in (rop_rsp_valid),
|
||||
.ready_in (rop_rsp_ready),
|
||||
.data_in ({rop_exe_if.uuid, rop_exe_if.wid, rop_exe_if.tmask, rop_exe_if.PC}),
|
||||
.data_out ({rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC}),
|
||||
.valid_out (rop_commit_if.valid),
|
||||
.ready_out (rop_commit_if.ready)
|
||||
.data_in ({rop_exe_if.uuid, rop_exe_if.wid, rop_exe_if.tmask, rop_exe_if.PC}),
|
||||
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC}),
|
||||
.valid_out (commit_if.valid),
|
||||
.ready_out (commit_if.ready)
|
||||
);
|
||||
|
||||
assign rop_commit_if.data = '0;
|
||||
assign rop_commit_if.rd = '0;
|
||||
assign rop_commit_if.wb = 0;
|
||||
assign rop_commit_if.eop = 1;
|
||||
assign commit_if.data = '0;
|
||||
assign commit_if.rd = '0;
|
||||
assign commit_if.wb = 0;
|
||||
assign commit_if.eop = 1;
|
||||
|
||||
`ifdef DBG_TRACE_ROP
|
||||
always @(posedge clk) begin
|
||||
|
|
|
@ -12,7 +12,7 @@ module VX_tex_agent #(
|
|||
|
||||
// Outputs
|
||||
VX_tex_bus_if.master tex_bus_if,
|
||||
VX_commit_if.master tex_commit_if
|
||||
VX_commit_if.master commit_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
|
@ -107,18 +107,18 @@ module VX_tex_agent #(
|
|||
.reset (reset),
|
||||
.valid_in (tex_bus_if.rsp_valid),
|
||||
.ready_in (tex_bus_if.rsp_ready),
|
||||
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, tex_bus_if.rsp_texels}),
|
||||
.data_out ({tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, commit_data}),
|
||||
.valid_out (tex_commit_if.valid),
|
||||
.ready_out (tex_commit_if.ready)
|
||||
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, tex_bus_if.rsp_texels}),
|
||||
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_data}),
|
||||
.valid_out (commit_if.valid),
|
||||
.ready_out (commit_if.ready)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign tex_commit_if.data[i] = `XLEN'(commit_data[i]);
|
||||
assign commit_if.data[i] = `XLEN'(commit_data[i]);
|
||||
end
|
||||
|
||||
assign tex_commit_if.wb = 1'b1;
|
||||
assign tex_commit_if.eop = 1'b1;
|
||||
assign commit_if.wb = 1'b1;
|
||||
assign commit_if.eop = 1'b1;
|
||||
|
||||
`ifdef DBG_TRACE_TEX
|
||||
always @(posedge clk) begin
|
||||
|
@ -131,10 +131,10 @@ module VX_tex_agent #(
|
|||
`TRACE_ARRAY1D(1, tex_exe_if.lod, `NUM_THREADS);
|
||||
`TRACE(1, (", stage=%0d, tag=0x%0h (#%0d)\n", tex_exe_if.stage, req_tag, tex_exe_if.uuid));
|
||||
end
|
||||
if (tex_commit_if.valid && tex_commit_if.ready) begin
|
||||
`TRACE(1, ("%d: core%0d-tex-rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, texels=", $time, CORE_ID, tex_commit_if.wid, tex_commit_if.PC, tex_commit_if.tmask, tex_commit_if.rd));
|
||||
`TRACE_ARRAY1D(1, tex_commit_if.data, `NUM_THREADS);
|
||||
`TRACE(1, (" (#%0d)\n", tex_commit_if.uuid));
|
||||
if (commit_if.valid && commit_if.ready) begin
|
||||
`TRACE(1, ("%d: core%0d-tex-rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, texels=", $time, CORE_ID, commit_if.wid, commit_if.PC, commit_if.tmask, commit_if.rd));
|
||||
`TRACE_ARRAY1D(1, commit_if.data, `NUM_THREADS);
|
||||
`TRACE(1, (" (#%0d)\n", commit_if.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -137,13 +137,15 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
|
|||
}
|
||||
|
||||
// Split on a predicate
|
||||
inline void vx_split(unsigned predicate) {
|
||||
asm volatile (".insn r %0, 2, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(predicate));
|
||||
inline unsigned vx_split(unsigned predicate) {
|
||||
unsigned ret;
|
||||
asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Join
|
||||
inline void vx_join() {
|
||||
asm volatile (".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
|
||||
inline void vx_join(unsigned stack_ptr) {
|
||||
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
|
||||
}
|
||||
|
||||
// Warp Barrier
|
||||
|
@ -153,72 +155,72 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
|||
|
||||
// Return current thread identifier
|
||||
inline int vx_thread_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_THREAD_ID));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return current warp identifier
|
||||
inline int vx_warp_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_WARP_ID));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return current core identifier
|
||||
inline int vx_core_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_CORE_ID));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return current cluster identifier
|
||||
inline int vx_cluster_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_CLUSTER_ID));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CLUSTER_ID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return current thread mask
|
||||
inline int vx_thread_mask() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_TMASK));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_TMASK));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of threads per warp
|
||||
inline int vx_num_threads() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_THREADS));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of warps per core
|
||||
inline int vx_num_warps() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_WARPS));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of cores per cluster
|
||||
inline int vx_num_cores() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_CORES));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the number of clusters
|
||||
inline int vx_num_clusters() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_CLUSTERS));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CLUSTERS));
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Return the hart identifier (thread id across the processor)
|
||||
inline int vx_hart_id() {
|
||||
int result;
|
||||
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_MHARTID));
|
||||
return result;
|
||||
int ret;
|
||||
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
|
||||
return ret;
|
||||
}
|
||||
|
||||
inline void vx_fence() {
|
||||
|
|
|
@ -31,7 +31,7 @@ public:
|
|||
, num_regs_(32)
|
||||
, num_csrs_(4096)
|
||||
, num_barriers_(NUM_BARRIERS)
|
||||
, ipdom_size_(IPDOM_STACK_SIZE)
|
||||
, ipdom_size_(log2ceil(num_threads) * 2)
|
||||
{}
|
||||
|
||||
uint16_t vsize() const {
|
||||
|
|
|
@ -28,29 +28,6 @@ union reg_data_t {
|
|||
int64_t i64;
|
||||
};
|
||||
|
||||
static bool HasDivergentThreads(const ThreadMask &thread_mask,
|
||||
const std::vector<std::vector<Word>> &reg_file,
|
||||
unsigned reg) {
|
||||
bool cond;
|
||||
size_t thread_idx = 0;
|
||||
size_t num_threads = reg_file.size();
|
||||
for (; thread_idx < num_threads; ++thread_idx) {
|
||||
if (thread_mask[thread_idx]) {
|
||||
cond = bool(reg_file[thread_idx][reg]);
|
||||
break;
|
||||
}
|
||||
}
|
||||
assert(thread_idx != num_threads);
|
||||
for (; thread_idx < num_threads; ++thread_idx) {
|
||||
if (thread_mask[thread_idx]) {
|
||||
if (cond != (bool(reg_file[thread_idx][reg]))) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) {
|
||||
return (func3 == 0x7) ? core->get_csr(VX_CSR_FRM, tid, wid) : func3;
|
||||
}
|
||||
|
@ -80,7 +57,8 @@ inline int64_t check_boxing(int64_t a) {
|
|||
void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
||||
assert(tmask_.any());
|
||||
|
||||
auto nextPC = PC_ + 4;
|
||||
auto next_pc = PC_ + 4;
|
||||
auto next_tmask = tmask_;
|
||||
|
||||
auto func2 = instr.getFunc2();
|
||||
auto func3 = instr.getFunc3();
|
||||
|
@ -98,6 +76,12 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
|
||||
auto num_threads = arch_.num_threads();
|
||||
|
||||
uint32_t thread_start = 0;
|
||||
for (; thread_start < num_threads; ++thread_start) {
|
||||
if (tmask_.test(thread_start))
|
||||
break;
|
||||
}
|
||||
|
||||
std::vector<reg_data_t[3]> rsdata(num_threads);
|
||||
std::vector<reg_data_t> rddata(num_threads);
|
||||
|
||||
|
@ -149,7 +133,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
// RV32I: LUI
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t].i = immsrc << 12;
|
||||
|
@ -161,7 +145,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
// RV32I: AUIPC
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t].i = (immsrc << 12) + PC_;
|
||||
|
@ -174,7 +158,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
if (func7 & 0x1) {
|
||||
|
@ -334,7 +318,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
switch (func3) {
|
||||
|
@ -395,7 +379,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
if (func7 & 0x1) {
|
||||
|
@ -521,7 +505,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu_type = AluType::ARITH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
switch (func3) {
|
||||
|
@ -565,49 +549,49 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->alu_type = AluType::BRANCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// RV32I: BEQ
|
||||
if (rsdata[t][0].i == rsdata[t][1].i) {
|
||||
nextPC = PC_ + immsrc;
|
||||
next_pc = PC_ + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 1: {
|
||||
// RV32I: BNE
|
||||
if (rsdata[t][0].i != rsdata[t][1].i) {
|
||||
nextPC = PC_ + immsrc;
|
||||
next_pc = PC_ + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 4: {
|
||||
// RV32I: BLT
|
||||
if (rsdata[t][0].i < rsdata[t][1].i) {
|
||||
nextPC = PC_ + immsrc;
|
||||
next_pc = PC_ + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 5: {
|
||||
// RV32I: BGE
|
||||
if (rsdata[t][0].i >= rsdata[t][1].i) {
|
||||
nextPC = PC_ + immsrc;
|
||||
next_pc = PC_ + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 6: {
|
||||
// RV32I: BLTU
|
||||
if (rsdata[t][0].u < rsdata[t][1].u) {
|
||||
nextPC = PC_ + immsrc;
|
||||
next_pc = PC_ + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case 7: {
|
||||
// RV32I: BGEU
|
||||
if (rsdata[t][0].u >= rsdata[t][1].u) {
|
||||
nextPC = PC_ + immsrc;
|
||||
next_pc = PC_ + immsrc;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -623,11 +607,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
// RV32I: JAL
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu_type = AluType::BRANCH;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t].i = nextPC;
|
||||
nextPC = PC_ + immsrc;
|
||||
rddata[t].i = next_pc;
|
||||
next_pc = PC_ + immsrc;
|
||||
trace->fetch_stall = true;
|
||||
break; // runonce
|
||||
}
|
||||
|
@ -639,11 +623,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->exe_type = ExeType::ALU;
|
||||
trace->alu_type = AluType::BRANCH;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t].i = nextPC;
|
||||
nextPC = rsdata[t][0].i + immsrc;
|
||||
rddata[t].i = next_pc;
|
||||
next_pc = rsdata[t][0].i + immsrc;
|
||||
trace->fetch_stall = true;
|
||||
break; // runOnce
|
||||
}
|
||||
|
@ -662,7 +646,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
|| (opcode == FL && func3 == 3)) {
|
||||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
uint32_t data_width = 8 * data_bytes;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
uint64_t mem_addr = rsdata[t][0].i + immsrc;
|
||||
|
@ -726,7 +710,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
|| (opcode == FS && func3 == 2)
|
||||
|| (opcode == FS && func3 == 3)) {
|
||||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
uint64_t mem_addr = rsdata[t][0].i + immsrc;
|
||||
|
@ -769,7 +753,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->data = trace_data;
|
||||
uint32_t data_bytes = 1 << (func3 & 0x3);
|
||||
uint32_t data_width = 8 * data_bytes;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
uint64_t mem_addr = rsdata[t][0].u;
|
||||
|
@ -834,7 +818,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
break;
|
||||
}
|
||||
case SYS_INST: {
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
uint32_t csr_addr = immsrc;
|
||||
|
@ -931,7 +915,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
}
|
||||
case FCI: {
|
||||
trace->exe_type = ExeType::FPU;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
|
||||
|
@ -1264,7 +1248,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->used_fregs.set(rsrc0);
|
||||
trace->used_fregs.set(rsrc1);
|
||||
trace->used_fregs.set(rsrc2);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
|
||||
|
@ -1312,14 +1296,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
}
|
||||
case EXT1: {
|
||||
switch (func7) {
|
||||
case 0: {
|
||||
uint32_t ts = 0;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
if (tmask_.test(t)) {
|
||||
ts = t;
|
||||
break;
|
||||
}
|
||||
}
|
||||
case 0: {
|
||||
switch (func3) {
|
||||
case 0: {
|
||||
// TMC
|
||||
|
@ -1334,22 +1311,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
|
||||
}
|
||||
if (pred.any()) {
|
||||
tmask_ &= pred;
|
||||
next_tmask &= pred;
|
||||
}
|
||||
} else {
|
||||
tmask_.reset();
|
||||
next_tmask.reset();
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
|
||||
next_tmask.set(t, rsdata.at(thread_start)[0].i & (1 << t));
|
||||
}
|
||||
}
|
||||
DPH(3, "*** New TMC: ");
|
||||
for (uint32_t i = 0; i < num_threads; ++i)
|
||||
DPN(3, tmask_.test(i));
|
||||
DPN(3, std::endl);
|
||||
|
||||
if (!tmask_.any()) {
|
||||
core_->active_warps_.reset(warp_id_);
|
||||
}
|
||||
} break;
|
||||
case 1: {
|
||||
// WSPAWN
|
||||
|
@ -1358,70 +1327,70 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i);
|
||||
core_->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
if (ipdom_stack_.size() == arch_.ipdom_size()) {
|
||||
std::cout << "IPDOM stack is full! (size=" << std::dec << ipdom_stack_.size() << ")\n" << std::flush;
|
||||
std::abort();
|
||||
}
|
||||
// SPLIT
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::SPLIT;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
|
||||
ThreadMask tmask;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
|
||||
}
|
||||
|
||||
DomStackEntry e(tmask, nextPC);
|
||||
ipdom_stack_.push(tmask_);
|
||||
ipdom_stack_.push(e);
|
||||
for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
|
||||
tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
|
||||
}
|
||||
auto stack_size = ipdom_stack_.size();
|
||||
|
||||
DPH(3, "*** Split: New TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(t));
|
||||
DPN(3, ", Pushed TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(t));
|
||||
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
|
||||
ThreadMask then_tmask, else_tmask;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
auto cond = ireg_file_.at(t).at(rsrc0);
|
||||
then_tmask[t] = tmask_.test(t) && cond;
|
||||
else_tmask[t] = tmask_.test(t) && !cond;
|
||||
}
|
||||
|
||||
if (then_tmask.count() != tmask_.count()
|
||||
&& else_tmask.count() != tmask_.count()) {
|
||||
if (ipdom_stack_.size() == arch_.ipdom_size()) {
|
||||
std::cout << "IPDOM stack is full! (size=" << std::dec << ipdom_stack_.size() << ")\n" << std::flush;
|
||||
std::abort();
|
||||
}
|
||||
if (then_tmask.count() >= else_tmask.count()) {
|
||||
next_tmask = then_tmask;
|
||||
} else {
|
||||
next_tmask = else_tmask;
|
||||
}
|
||||
// push reconvergence thread mask
|
||||
ipdom_stack_.emplace(tmask_);
|
||||
// push flipped thread mask
|
||||
auto join_tmask = ~next_tmask & tmask_;
|
||||
ipdom_stack_.emplace(join_tmask, next_pc);
|
||||
} else {
|
||||
DP(3, "*** Unanimous pred");
|
||||
DomStackEntry e(tmask_);
|
||||
e.unanimous = true;
|
||||
ipdom_stack_.push(e);
|
||||
}
|
||||
// Uniform control-flow
|
||||
}
|
||||
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
rddata[t].i = stack_size;
|
||||
}
|
||||
rd_write = true;
|
||||
} break;
|
||||
case 3: {
|
||||
// JOIN
|
||||
if (ipdom_stack_.empty()) {
|
||||
std::cout << "IPDOM stack is empty!\n" << std::flush;
|
||||
std::abort();
|
||||
}
|
||||
trace->exe_type = ExeType::GPU;
|
||||
trace->gpu_type = GpuType::JOIN;
|
||||
trace->fetch_stall = true;
|
||||
if (!ipdom_stack_.empty() && ipdom_stack_.top().unanimous) {
|
||||
DP(3, "*** Unanimous branch at join");
|
||||
tmask_ = ipdom_stack_.top().tmask;
|
||||
trace->gpu_type = GpuType::JOIN;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->fetch_stall = true;
|
||||
|
||||
uint32_t stack_ptr = ireg_file_.at(thread_start).at(rsrc0);
|
||||
if (stack_ptr != ipdom_stack_.size()) {
|
||||
if (ipdom_stack_.empty()) {
|
||||
std::cout << "IPDOM stack is empty!\n" << std::flush;
|
||||
std::abort();
|
||||
}
|
||||
next_tmask = ipdom_stack_.top().tmask;
|
||||
if (!ipdom_stack_.top().fallthrough) {
|
||||
next_pc = ipdom_stack_.top().PC;
|
||||
}
|
||||
ipdom_stack_.pop();
|
||||
} else {
|
||||
if (!ipdom_stack_.top().fallThrough) {
|
||||
nextPC = ipdom_stack_.top().PC;
|
||||
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
|
||||
}
|
||||
|
||||
tmask_ = ipdom_stack_.top().tmask;
|
||||
|
||||
DPH(3, "*** Join: New TM=");
|
||||
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(t));
|
||||
DPN(3, "\n");
|
||||
|
||||
ipdom_stack_.pop();
|
||||
}
|
||||
// Uniform control-flow
|
||||
}
|
||||
} break;
|
||||
case 4: {
|
||||
// BAR
|
||||
|
@ -1430,7 +1399,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
trace->data = std::make_shared<GPUTraceData>(rsdata[ts][0].i, rsdata[ts][1].i);
|
||||
trace->data = std::make_shared<GPUTraceData>(rsdata[thread_start][0].i, rsdata[thread_start][1].i);
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
|
@ -1446,7 +1415,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
for (uint32_t ri = 0, rn = core_->raster_units_.size(); ri < rn; ++ri) {
|
||||
trace_data->raster_idx = core_->raster_idx();
|
||||
bool has_stamps = false;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
auto result = core_->raster_units_.at(trace_data->raster_idx)->fetch(
|
||||
|
@ -1478,7 +1447,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
auto trace_data = std::make_shared<TexUnit::TraceData>(num_threads);
|
||||
trace->data = trace_data;
|
||||
trace_data->tex_idx = core_->tex_idx();
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
auto u = rsdata[t][0].i;
|
||||
|
@ -1499,7 +1468,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->used_iregs.set(rsrc2);
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t].i = rsdata[t][0].i ? rsdata[t][1].i : rsdata[t][2].i;
|
||||
|
@ -1515,7 +1484,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
auto trace_data = std::make_shared<RopUnit::TraceData>();
|
||||
trace->data = trace_data;
|
||||
trace_data->rop_idx = core_->rop_idx();
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
auto pos_face = rsdata[t][0].i;
|
||||
|
@ -1539,7 +1508,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc1);
|
||||
trace->used_iregs.set(rsrc2);
|
||||
uint32_t shift = func2 * 8;
|
||||
for (uint32_t t = 0; t < num_threads; ++t) {
|
||||
for (uint32_t t = thread_start; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
rddata[t].i = (int32_t)(((int64_t)rsdata[t][0].i32 * (int64_t)rsdata[t][1].i32) >> shift) + rsdata[t][2].i32;
|
||||
|
@ -2444,8 +2413,18 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
}
|
||||
|
||||
PC_ += 4;
|
||||
if (PC_ != nextPC) {
|
||||
DP(3, "*** Next PC: " << std::hex << nextPC << std::dec);
|
||||
PC_ = nextPC;
|
||||
if (PC_ != next_pc) {
|
||||
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
|
||||
PC_ = next_pc;
|
||||
}
|
||||
if (tmask_ != next_tmask) {
|
||||
DPH(3, "*** New Tmask=");
|
||||
for (uint32_t i = 0; i < num_threads; ++i)
|
||||
DPN(3, next_tmask.test(i));
|
||||
DPN(3, std::endl);
|
||||
tmask_ = next_tmask;
|
||||
if (!next_tmask.any()) {
|
||||
core_->active_warps_.reset(warp_id_);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -16,21 +16,17 @@ struct DomStackEntry {
|
|||
DomStackEntry(const ThreadMask &tmask, Word PC)
|
||||
: tmask(tmask)
|
||||
, PC(PC)
|
||||
, fallThrough(false)
|
||||
, unanimous(false)
|
||||
, fallthrough(false)
|
||||
{}
|
||||
|
||||
DomStackEntry(const ThreadMask &tmask)
|
||||
: tmask(tmask)
|
||||
, PC(0)
|
||||
, fallThrough(true)
|
||||
, unanimous(false)
|
||||
DomStackEntry(const ThreadMask &tmask)
|
||||
: tmask(tmask)
|
||||
, fallthrough(true)
|
||||
{}
|
||||
|
||||
ThreadMask tmask;
|
||||
Word PC;
|
||||
bool fallThrough;
|
||||
bool unanimous;
|
||||
bool fallthrough;
|
||||
};
|
||||
|
||||
struct vtype {
|
||||
|
|
|
@ -29,9 +29,9 @@ int main() {
|
|||
errors += test_tls();
|
||||
|
||||
if (0 == errors) {
|
||||
vx_printf("Passed!\n");
|
||||
PRINTF("Passed!\n");
|
||||
} else {
|
||||
vx_printf("Failed!\n");
|
||||
PRINTF("Failed!\n");
|
||||
}
|
||||
|
||||
return errors;
|
||||
|
|
|
@ -5,31 +5,26 @@
|
|||
#include <vx_print.h>
|
||||
#include <vx_spawn.h>
|
||||
|
||||
#define __if(b) vx_split(b); \
|
||||
if (b)
|
||||
#define __else else
|
||||
#define __endif vx_join();
|
||||
|
||||
int __attribute__ ((noinline)) check_error(const int* buffer, int offset, int size) {
|
||||
int __attribute__((noinline)) check_error(const int* buffer, int offset, int size) {
|
||||
int errors = 0;
|
||||
for (int i = offset; i < size; i++) {
|
||||
int value = buffer[i];
|
||||
int ref_value = 65 + i;
|
||||
if (value == ref_value) {
|
||||
//vx_printf("[%d] %c\n", i, value);
|
||||
//PRINTF("[%d] %c\n", i, value);
|
||||
} else {
|
||||
vx_printf("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
|
||||
PRINTF("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
return errors;
|
||||
}
|
||||
|
||||
int __attribute__ ((noinline)) make_select_tmask(int tid) {
|
||||
int __attribute__((noinline)) make_select_tmask(int tid) {
|
||||
return (1 << tid);
|
||||
}
|
||||
|
||||
int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
|
||||
int __attribute__((noinline)) make_full_tmask(int num_threads) {
|
||||
return (1 << num_threads) - 1;
|
||||
}
|
||||
|
||||
|
@ -39,7 +34,7 @@ int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
|
|||
int global_buffer[GLOBAL_MEM_SZ];
|
||||
|
||||
int test_global_memory() {
|
||||
vx_printf("Global Memory Test\n");
|
||||
PRINTF("Global Memory Test\n");
|
||||
|
||||
for (int i = 0; i < GLOBAL_MEM_SZ; i++) {
|
||||
global_buffer[i] = 65 + i;
|
||||
|
@ -51,7 +46,7 @@ int test_global_memory() {
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
int test_stack_memory() {
|
||||
vx_printf("Stack Memory Test\n");
|
||||
PRINTF("Stack Memory Test\n");
|
||||
|
||||
static const int STACK_MEM_SZ = 8;
|
||||
int stack_buffer[STACK_MEM_SZ];
|
||||
|
@ -69,7 +64,7 @@ int test_shared_memory() {
|
|||
static const int SHARED_MEM_SZ = 8;
|
||||
int* shared_buffer = (int*)(STACK_BASE_ADDR-(128*4)-SHARED_MEM_SZ*4);
|
||||
|
||||
vx_printf("Shared Memory Test\n");
|
||||
PRINTF("Shared Memory Test\n");
|
||||
|
||||
for (int i = 0; i < SHARED_MEM_SZ; i++) {
|
||||
shared_buffer[i] = 65 + i;
|
||||
|
@ -82,13 +77,13 @@ int test_shared_memory() {
|
|||
|
||||
int tmc_buffer[8];
|
||||
|
||||
void __attribute__ ((noinline)) do_tmc() {
|
||||
void __attribute__((noinline)) do_tmc() {
|
||||
unsigned tid = vx_thread_id();
|
||||
tmc_buffer[tid] = 65 + tid;
|
||||
}
|
||||
|
||||
int test_tmc() {
|
||||
vx_printf("TMC Test\n");
|
||||
PRINTF("TMC Test\n");
|
||||
|
||||
int num_threads = std::min(vx_num_threads(), 8);
|
||||
int tmask = make_full_tmask(num_threads);
|
||||
|
@ -103,13 +98,13 @@ int test_tmc() {
|
|||
|
||||
int pred_buffer[8];
|
||||
|
||||
void __attribute__ ((noinline)) do_pred() {
|
||||
void __attribute__((noinline)) do_pred() {
|
||||
unsigned tid = vx_thread_id();
|
||||
pred_buffer[tid] = 65 + tid;
|
||||
}
|
||||
|
||||
int test_pred() {
|
||||
vx_printf("PRED Test\n");
|
||||
PRINTF("PRED Test\n");
|
||||
|
||||
int num_threads = std::min(vx_num_threads(), 8);
|
||||
int tmask = make_full_tmask(num_threads);
|
||||
|
@ -138,7 +133,7 @@ void wspawn_kernel() {
|
|||
}
|
||||
|
||||
int test_wsapwn() {
|
||||
vx_printf("Wspawn Test\n");
|
||||
PRINTF("Wspawn Test\n");
|
||||
int num_warps = std::min(vx_num_warps(), 8);
|
||||
vx_wspawn(num_warps, wspawn_kernel);
|
||||
wspawn_kernel();
|
||||
|
@ -150,33 +145,46 @@ int test_wsapwn() {
|
|||
|
||||
int dvg_buffer[4];
|
||||
|
||||
void __attribute__ ((noinline)) do_divergence() {
|
||||
|
||||
unsigned tid = vx_thread_id();
|
||||
|
||||
__if (tid < 2) {
|
||||
__if (tid < 1) {
|
||||
dvg_buffer[tid] = 65;
|
||||
void __attribute__((noinline)) do_divergence() {
|
||||
int tid = vx_thread_id();
|
||||
int cond1 = tid < 2;
|
||||
int sp1 = vx_split(cond1);
|
||||
if (cond1) {
|
||||
{
|
||||
int cond2 = tid < 1;
|
||||
int sp2 = vx_split(cond2);
|
||||
if (cond2) {
|
||||
dvg_buffer[tid] = 65; // A
|
||||
} else {
|
||||
dvg_buffer[tid] = 66; // B
|
||||
}
|
||||
vx_join(sp2);
|
||||
}
|
||||
__else {
|
||||
dvg_buffer[tid] = 66;
|
||||
{
|
||||
int cond3 = tid < 0;
|
||||
int sp3 = vx_split(cond3);
|
||||
if (cond3) {
|
||||
dvg_buffer[tid] = 67; // C
|
||||
}
|
||||
vx_join(sp3);
|
||||
}
|
||||
} else {
|
||||
{
|
||||
int cond2 = tid < 3;
|
||||
int sp2 = vx_split(cond2);
|
||||
if (cond2) {
|
||||
dvg_buffer[tid] = 67; // C
|
||||
} else {
|
||||
dvg_buffer[tid] = 68; // D
|
||||
}
|
||||
vx_join(sp2);
|
||||
}
|
||||
__endif
|
||||
}
|
||||
__else {
|
||||
__if (tid < 3) {
|
||||
dvg_buffer[tid] = 67;
|
||||
}
|
||||
__else {
|
||||
dvg_buffer[tid] = 68;
|
||||
}
|
||||
__endif
|
||||
}
|
||||
__endif
|
||||
vx_join(sp1);
|
||||
}
|
||||
|
||||
int test_divergence() {
|
||||
vx_printf("Control Divergence Test\n");
|
||||
PRINTF("Control Divergence Test\n");
|
||||
|
||||
int num_threads = std::min(vx_num_threads(), 4);
|
||||
int tmask = make_full_tmask(num_threads);
|
||||
|
@ -203,7 +211,7 @@ void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
|
|||
}
|
||||
|
||||
int test_spawn_tasks() {
|
||||
vx_printf("SpawnTasks Test\n");
|
||||
PRINTF("SpawnTasks Test\n");
|
||||
|
||||
st_args_t arg;
|
||||
arg.src = st_buffer_src;
|
||||
|
@ -232,14 +240,14 @@ void sr_kernel(const sr_args_t * arg) {
|
|||
arg->buf[tid] = 65 + tid;
|
||||
}
|
||||
|
||||
void __attribute__ ((noinline)) do_serial() {
|
||||
void __attribute__((noinline)) do_serial() {
|
||||
sr_args_t arg;
|
||||
arg.buf = sr_buffer;
|
||||
vx_serial((vx_serial_cb)sr_kernel, &arg);
|
||||
}
|
||||
|
||||
int test_serial() {
|
||||
vx_printf("Serial Test\n");
|
||||
PRINTF("Serial Test\n");
|
||||
int num_threads = std::min(vx_num_threads(), 8);
|
||||
int tmask = make_full_tmask(num_threads);
|
||||
vx_tmc(tmask);
|
||||
|
@ -253,7 +261,7 @@ int test_serial() {
|
|||
|
||||
int tmask_buffer[8];
|
||||
|
||||
int __attribute__ ((noinline)) do_tmask() {
|
||||
int __attribute__((noinline)) do_tmask() {
|
||||
int tid = vx_thread_id();
|
||||
int tmask = make_select_tmask(tid);
|
||||
int cur_tmask = vx_thread_mask();
|
||||
|
@ -262,7 +270,7 @@ int __attribute__ ((noinline)) do_tmask() {
|
|||
}
|
||||
|
||||
int test_tmask() {
|
||||
vx_printf("Thread Mask Test\n");
|
||||
PRINTF("Thread Mask Test\n");
|
||||
|
||||
// activate all threads to populate shared variables
|
||||
vx_tmc(-1);
|
||||
|
@ -298,7 +306,7 @@ void barrier_kernel() {
|
|||
}
|
||||
|
||||
int test_barrier() {
|
||||
vx_printf("Barrier Test\n");
|
||||
PRINTF("Barrier Test\n");
|
||||
int num_warps = std::min(vx_num_warps(), 8);
|
||||
barrier_ctr = num_warps;
|
||||
barrier_stall = 0;
|
||||
|
@ -312,7 +320,7 @@ int test_barrier() {
|
|||
int tls_buffer[8];
|
||||
__thread int tls_var;
|
||||
|
||||
__attribute__ ((noinline)) void print_tls_var() {
|
||||
__attribute__((noinline)) void print_tls_var() {
|
||||
unsigned wid = vx_warp_id();
|
||||
tls_buffer[wid] = 65 + tls_var;
|
||||
}
|
||||
|
@ -325,7 +333,7 @@ void tls_kernel() {
|
|||
}
|
||||
|
||||
int test_tls() {
|
||||
vx_printf("TLS Test\n");
|
||||
PRINTF("TLS Test\n");
|
||||
int num_warps = std::min(vx_num_warps(), 8);
|
||||
vx_wspawn(num_warps, tls_kernel);
|
||||
tls_kernel();
|
||||
|
|
|
@ -1,6 +1,8 @@
|
|||
#ifndef TESTS
|
||||
#define TESTS
|
||||
|
||||
#define PRINTF vx_printf
|
||||
|
||||
int test_global_memory();
|
||||
|
||||
int test_stack_memory();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue