split/join redesign

This commit is contained in:
Blaise Tine 2023-07-02 16:50:59 -04:00
parent ebf104de1b
commit 34206598e7
23 changed files with 555 additions and 483 deletions

View file

@ -37,11 +37,11 @@ then
#make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean
CONFIGS="-DFLEN_64 -DFPU_FPNEW" make -C sim/rtlsim
CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean
CONFIGS="-DFLEN_64 -DFPU_DPI" make -C sim/rtlsim
CONFIGS="-DEXT_D_ENABLE -DFPU_DPI" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64d
fi
@ -170,7 +170,7 @@ CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABL
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-ttriangle.cgltrace -rtriangle_ref_8.png -w8 -h8" --warps=1 --threads=2 --debug=3
CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-ttriangle.cgltrace -rtriangle_ref_8.png -w8 -h8" --warps=1 --threads=2 --debug=3
CONFIGS="-DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-tvase.cgltrace -rvase_ref_32.png -w32 -h32" --threads=1
CONFIGS="-DEXT_GFX_ENABLE -DIPDOM_STACK_SIZE=128" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-x -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE -DPD_STACK_SIZE=128" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-x -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-y -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-z -ttriangle.cgltrace -rtriangle_ref_128.png"
CONFIGS="-DENABLE_DPI -DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-tvase.cgltrace -rvase_ref_32.png -w32 -h32" --threads=2 || true

View file

@ -26,13 +26,6 @@
`endif
`endif
// 32 bit FLEN as default.
`ifndef FLEN_32
`ifndef FLEN_64
`define FLEN_32
`endif
`endif
`ifdef XLEN_64
`define XLEN 64
`endif
@ -41,14 +34,6 @@
`define XLEN 32
`endif
`ifdef FLEN_64
`define FLEN 64
`endif
`ifdef FLEN_32
`define FLEN 32
`endif
`ifndef NUM_CLUSTERS
`define NUM_CLUSTERS 1
`endif
@ -185,6 +170,20 @@
`define EXT_F_ENABLE
`endif
`ifdef EXT_D_ENABLE
`define FLEN_64
`else
`define FLEN_32
`endif
`ifdef FLEN_64
`define FLEN 64
`endif
`ifdef FLEN_32
`define FLEN 32
`endif
`ifdef EXT_GFX_ENABLE
`define EXT_TEX_ENABLE
`define EXT_RASTER_ENABLE
@ -401,11 +400,6 @@
`define LSUQ_SIZE `MAX(2, `NUM_WARPS * 2)
`endif
// Size of divergence Stack
`ifndef IPDOM_STACK_SIZE
`define IPDOM_STACK_SIZE 32
`endif
// Floating-Point Units ///////////////////////////////////////////////////////
// Number of FPU units

View file

@ -27,6 +27,9 @@
`define NR_BITS `CLOG2(`NUM_REGS)
`define PD_STACK_SIZE `UP(`NT_BITS)
`define PD_STACK_SIZEW `CLOG2(`PD_STACK_SIZE)
`define PERF_CTR_BITS 44
`ifndef NDEBUG
@ -215,6 +218,7 @@
`define INST_GPU_JOIN 4'h3
`define INST_GPU_BAR 4'h4
`define INST_GPU_PRED 4'h5
`define INST_GPU_IS_WCTL(op) (op <= 5)
`define INST_GPU_TEX 4'h6
`define INST_GPU_RASTER 4'h7

View file

@ -17,13 +17,17 @@ typedef struct packed {
} gpu_wspawn_t;
typedef struct packed {
logic valid;
logic diverged;
logic [`NUM_THREADS-1:0] then_tmask;
logic [`NUM_THREADS-1:0] else_tmask;
logic [`XLEN-1:0] pc;
logic valid;
logic [`NUM_THREADS-1:0] taken;
logic [`NUM_THREADS-1:0] tmask;
logic [`XLEN-1:0] next_pc;
} gpu_split_t;
typedef struct packed {
logic valid;
logic [`PD_STACK_SIZEW-1:0] stack_ptr;
} gpu_join_t;
typedef struct packed {
logic valid;
logic [`NB_BITS-1:0] id;
@ -32,8 +36,8 @@ typedef struct packed {
} gpu_barrier_t;
typedef struct packed {
logic [`XLEN-1:0] startup_addr;
logic [7:0] mpm_class;
logic [`XLEN-1:0] startup_addr;
logic [7:0] mpm_class;
} base_dcrs_t;
/* verilator lint_off UNUSED */
@ -265,6 +269,7 @@ endpackage
`define GPU_TMC_BITS $bits(VX_gpu_types::gpu_tmc_t)
`define GPU_WSPAWN_BITS $bits(VX_gpu_types::gpu_wspawn_t)
`define GPU_SPLIT_BITS $bits(VX_gpu_types::gpu_split_t)
`define GPU_JOIN_BITS $bits(VX_gpu_types::gpu_join_t)
`define GPU_BARRIER_BITS $bits(VX_gpu_types::gpu_barrier_t)
`endif // VX_GPU_TYPES_VH

View file

@ -37,7 +37,7 @@ module VX_decode #(
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`XLEN-1:0] imm;
reg use_rd, use_PC, use_imm;
reg is_join, is_wstall;
reg is_wstall;
wire [31:0] instr = fetch_if.data;
wire [6:0] opcode = instr[6:0];
@ -137,7 +137,6 @@ module VX_decode #(
use_imm = 0;
use_PC = 0;
use_rd = 0;
is_join = 0;
is_wstall = 0;
case (opcode)
@ -437,11 +436,14 @@ module VX_decode #(
3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
is_wstall = 1;
`USED_IREG (rs1);
use_rd = 1;
`USED_IREG (rs1);
`USED_IREG (rd);
end
3'h3: begin // JOIN
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
is_join = 1;
is_wstall = 1;
`USED_IREG (rs1);
end
3'h4: begin // BAR
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
@ -551,7 +553,6 @@ module VX_decode #(
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.wid;
assign decode_sched_if.is_wstall = is_wstall;
assign decode_sched_if.is_join = is_join;
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
assign fetch_if.ready = decode_if.ready;

View file

@ -194,10 +194,10 @@ module VX_execute #(
.reset (fpu_reset),
.fpu_exe_if (fpu_exe_if),
.fpu_bus_if (fpu_bus_if),
.fpu_to_csr_if (fpu_to_csr_if),
.fpu_commit_if (fpu_commit_if),
.fpu_to_csr_if (fpu_to_csr_if),
.csr_pending (csr_pending),
.req_pending (fpu_pending)
.req_pending (fpu_pending),
.commit_if (fpu_commit_if)
);
`endif

View file

@ -44,13 +44,11 @@ module VX_gpu_unit #(
localparam UUID_WIDTH = `UP(`UUID_BITS);
localparam NW_WIDTH = `UP(`NW_BITS);
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
localparam RSP_DATAW = `MAX(`NUM_THREADS * `XLEN, WCTL_DATAW);
localparam RSP_ARB_DATAW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + RSP_DATAW + 1 + 1;
localparam RSP_ARB_DATAW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + (`NUM_THREADS * `XLEN) + `NR_BITS + 1 + `XLEN + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_TEX_ENABLED + `EXT_RASTER_ENABLED + `EXT_ROP_ENABLED + `EXT_IMADD_ENABLED;
localparam RSP_ARB_IDX_GPU = 0;
localparam RSP_ARB_IDX_RASTER = RSP_ARB_IDX_GPU + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_RASTER = RSP_ARB_IDX_WCTL + 1;
localparam RSP_ARB_IDX_ROP = RSP_ARB_IDX_RASTER + `EXT_RASTER_ENABLED;
localparam RSP_ARB_IDX_TEX = RSP_ARB_IDX_ROP + `EXT_ROP_ENABLED;
localparam RSP_ARB_IDX_IMADD = RSP_ARB_IDX_TEX + `EXT_TEX_ENABLED;
@ -63,89 +61,46 @@ module VX_gpu_unit #(
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
wire [RSP_DATAW-1:0] rsp_data;
wire rsp_is_wctl;
wire gpu_req_valid;
reg gpu_req_ready;
wire csr_ready = ~csr_pending;
assign gpu_req_valid = gpu_exe_if.valid && csr_ready;
// Warp control block
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_barrier_t barrier;
gpu_split_t split;
wire is_wspawn = (gpu_exe_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_exe_if.op_type == `INST_GPU_TMC);
wire is_split = (gpu_exe_if.op_type == `INST_GPU_SPLIT);
wire is_join = (gpu_exe_if.op_type == `INST_GPU_JOIN);
wire is_bar = (gpu_exe_if.op_type == `INST_GPU_BAR);
wire is_pred = (gpu_exe_if.op_type == `INST_GPU_PRED);
wire [`XLEN-1:0] rs1_data = gpu_exe_if.rs1_data[gpu_exe_if.tid];
wire [`XLEN-1:0] rs2_data = gpu_exe_if.rs2_data[gpu_exe_if.tid];
wire [`NUM_THREADS-1:0] taken_tmask;
wire [`NUM_THREADS-1:0] not_taken_tmask;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
wire taken = (gpu_exe_if.rs1_data[i] != 0);
assign taken_tmask[i] = gpu_exe_if.tmask[i] && taken;
assign not_taken_tmask[i] = gpu_exe_if.tmask[i] && ~taken;
end
// tmc
wire [`NUM_THREADS-1:0] pred_mask = (taken_tmask != 0) ? taken_tmask : gpu_exe_if.tmask;
assign tmc.valid = is_tmc || is_pred;
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// wspawn
wire [`XLEN-1:0] wspawn_pc = rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign wspawn_wmask[i] = (i < rs1_data);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = wspawn_pc;
// split
assign split.valid = is_split;
assign split.diverged = (| taken_tmask) && (| not_taken_tmask);
assign split.then_tmask = taken_tmask;
assign split.else_tmask = not_taken_tmask;
assign split.pc = gpu_exe_if.next_PC;
// barrier
assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_BITS-1:0];
assign barrier.is_global = rs1_data[31];
assign barrier.size_m1 = $bits(barrier.size_m1)'(rs2_data - 1);
// Warp control response
wire wctl_req_valid = gpu_req_valid && (is_wspawn || is_tmc || is_split || is_join || is_bar || is_pred);
wire wctl_rsp_valid = wctl_req_valid;
wire [WCTL_DATAW-1:0] wctl_rsp_data = {tmc, wspawn, split, barrier};
wire wctl_rsp_ready;
wire wctl_req_ready = wctl_rsp_ready;
assign rsp_arb_valid_in[RSP_ARB_IDX_GPU] = wctl_rsp_valid;
assign rsp_arb_data_in[RSP_ARB_IDX_GPU] = {gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, `NR_BITS'(0), 1'b0, RSP_DATAW'(wctl_rsp_data), 1'b1, ~is_join};
assign wctl_rsp_ready = rsp_arb_ready_in[RSP_ARB_IDX_GPU];
`UNUSED_VAR (gpu_exe_if.op_mod)
`UNUSED_VAR (gpu_exe_if.rs3_data)
`UNUSED_VAR (gpu_exe_if.wb)
`UNUSED_VAR (gpu_exe_if.rd)
// Warp control block
VX_gpu_exe_if wctl_exe_if();
VX_commit_if wctl_commit_if();
assign wctl_exe_if.valid = gpu_req_valid && `INST_GPU_IS_WCTL(gpu_exe_if.op_type);
assign wctl_exe_if.op_type = gpu_exe_if.op_type;
assign wctl_exe_if.uuid = gpu_exe_if.uuid;
assign wctl_exe_if.wid = gpu_exe_if.wid;
assign wctl_exe_if.tmask = gpu_exe_if.tmask;
assign wctl_exe_if.tid = gpu_exe_if.tid;
assign wctl_exe_if.PC = gpu_exe_if.PC;
assign wctl_exe_if.next_PC = gpu_exe_if.next_PC;
assign wctl_exe_if.rd = gpu_exe_if.rd;
assign wctl_exe_if.wb = gpu_exe_if.wb;
assign wctl_exe_if.rs1_data = gpu_exe_if.rs1_data;
assign wctl_exe_if.rs2_data = gpu_exe_if.rs2_data;
VX_wctl_unit #(
.OUTPUT_REG (RSP_ARB_SIZE > 1)
) wctl_unit (
.clk (clk),
.reset (reset),
.gpu_exe_if (wctl_exe_if),
.warp_ctl_if(warp_ctl_if),
.commit_if (wctl_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = {wctl_commit_if.uuid, wctl_commit_if.wid, wctl_commit_if.tmask, wctl_commit_if.PC, wctl_commit_if.rd, wctl_commit_if.wb, wctl_commit_if.data, 1'b1};
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
`ifdef EXT_TEX_ENABLE
@ -171,16 +126,16 @@ module VX_gpu_unit #(
VX_tex_agent #(
.CORE_ID (CORE_ID)
) tex_agent (
.clk (clk),
.reset (tex_reset),
.tex_csr_if (tex_csr_if),
.tex_exe_if (tex_exe_if),
.tex_commit_if (tex_commit_if),
.tex_bus_if (tex_bus_if)
.clk (clk),
.reset (tex_reset),
.tex_csr_if (tex_csr_if),
.tex_exe_if (tex_exe_if),
.tex_bus_if (tex_bus_if),
.commit_if (tex_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_TEX] = tex_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_TEX] = {tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, tex_commit_if.wb, RSP_DATAW'(tex_commit_if.data), tex_commit_if.eop, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_TEX] = {tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, tex_commit_if.wb, tex_commit_if.data, tex_commit_if.eop};
assign tex_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_TEX];
`endif
@ -202,16 +157,16 @@ module VX_gpu_unit #(
VX_raster_agent #(
.CORE_ID (CORE_ID)
) raster_agent (
.clk (clk),
.reset (raster_reset),
.raster_csr_if (raster_csr_if),
.raster_bus_if (raster_bus_if),
.raster_exe_if (raster_exe_if),
.raster_commit_if (raster_commit_if)
.clk (clk),
.reset (raster_reset),
.raster_csr_if (raster_csr_if),
.raster_bus_if (raster_bus_if),
.raster_exe_if (raster_exe_if),
.commit_if (raster_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_RASTER] = raster_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_RASTER] = {raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, raster_commit_if.wb, RSP_DATAW'(raster_commit_if.data), raster_commit_if.eop, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_RASTER] = {raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, raster_commit_if.wb, raster_commit_if.data, raster_commit_if.eop};
assign raster_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_RASTER];
`endif
@ -240,16 +195,16 @@ module VX_gpu_unit #(
VX_rop_agent #(
.CORE_ID (CORE_ID)
) rop_agent (
.clk (clk),
.reset (rop_reset),
.rop_csr_if (rop_csr_if),
.rop_exe_if (rop_exe_if),
.rop_commit_if (rop_commit_if),
.rop_bus_if (rop_bus_if)
.clk (clk),
.reset (rop_reset),
.rop_csr_if (rop_csr_if),
.rop_exe_if (rop_exe_if),
.rop_bus_if (rop_bus_if),
.commit_if (rop_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_ROP] = rop_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_ROP] = {rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC, rop_commit_if.rd, rop_commit_if.wb, RSP_DATAW'(rop_commit_if.data), rop_commit_if.eop, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_ROP] = {rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC, rop_commit_if.rd, rop_commit_if.wb, rop_commit_if.data, rop_commit_if.eop};
assign rop_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_ROP];
`endif
@ -311,13 +266,11 @@ module VX_gpu_unit #(
end
assign rsp_arb_valid_in[RSP_ARB_IDX_IMADD] = imadd_valid_out;
assign rsp_arb_data_in[RSP_ARB_IDX_IMADD] = {imadd_uuid_out, imadd_wid_out, imadd_tmask_out, imadd_PC_out, imadd_rd_out, 1'b1, RSP_DATAW'(imadd_data_out_x), 1'b1, 1'b0};
assign rsp_arb_data_in[RSP_ARB_IDX_IMADD] = {imadd_uuid_out, imadd_wid_out, imadd_tmask_out, imadd_PC_out, imadd_rd_out, 1'b1, imadd_data_out_x, 1'b1};
assign imadd_ready_out = rsp_arb_ready_in[RSP_ARB_IDX_IMADD];
`endif
// can accept new request?
always @(*) begin
@ -334,7 +287,7 @@ module VX_gpu_unit #(
`ifdef EXT_IMADD_ENABLE
`INST_GPU_IMADD: gpu_req_ready = imadd_ready_in;
`endif
default: gpu_req_ready = wctl_req_ready;
default: gpu_req_ready = wctl_exe_if.ready;
endcase
end
assign gpu_exe_if.ready = gpu_req_ready && csr_ready;
@ -352,21 +305,13 @@ module VX_gpu_unit #(
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),
.data_out ({gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data, gpu_commit_if.eop, rsp_is_wctl}),
.data_out ({gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, gpu_commit_if.data, gpu_commit_if.eop}),
.valid_out (gpu_commit_if.valid),
.ready_out (gpu_commit_if.ready)
);
assign gpu_commit_if.data = rsp_data[(`NUM_THREADS * `XLEN)-1:0];
// warp control reponse
wire gpu_req_fire = gpu_exe_if.valid && gpu_exe_if.ready;
wire gpu_commit_fire = gpu_commit_if.valid && gpu_commit_if.ready;
assign warp_ctl_if.valid = gpu_commit_fire && rsp_is_wctl;
assign warp_ctl_if.wid = gpu_commit_if.wid;
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data[WCTL_DATAW-1:0];
// pending request

View file

@ -1,32 +1,32 @@
`include "VX_platform.vh"
module VX_ipdom #(
module VX_ipdom_stack #(
parameter WIDTH = 1,
parameter DEPTH = 1
parameter DEPTH = 1,
parameter ADDRW = $clog2(DEPTH)
) (
input wire clk,
input wire reset,
input wire pair,
input wire [WIDTH - 1:0] q0,
input wire [WIDTH - 1:0] q1,
input wire [WIDTH - 1:0] q2,
output wire [WIDTH - 1:0] d,
output wire d_idx,
output wire [ADDRW-1:0] q_ptr,
output wire [ADDRW-1:0] d_ptr,
input wire push,
input wire pop,
output wire index,
input wire pop,
output wire empty,
output wire full
);
`STATIC_ASSERT(`ISPOW2(DEPTH), ("depth must be a power of 2!"))
`STATIC_ASSERT(`ISPOW2(DEPTH), ("depth must be a power of 2!"))
localparam ADDRW = $clog2(DEPTH);
reg is_part [DEPTH-1:0];
reg slot_idx [DEPTH-1:0];
reg [ADDRW-1:0] rd_ptr, wr_ptr;
reg empty_r, full_r;
wire [WIDTH-1:0] d1, d2;
wire [WIDTH-1:0] d0, d1;
always @(posedge clk) begin
if (reset) begin
@ -44,9 +44,9 @@ module VX_ipdom #(
empty_r <= 0;
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
end else if (pop) begin
wr_ptr <= wr_ptr - ADDRW'(is_part[rd_ptr]);
rd_ptr <= rd_ptr - ADDRW'(is_part[rd_ptr]);
empty_r <= is_part[rd_ptr] && (0 == rd_ptr);
wr_ptr <= wr_ptr - ADDRW'(d_idx);
rd_ptr <= rd_ptr - ADDRW'(d_idx);
empty_r <= (rd_ptr == 0) && (d_idx == 1);
full_r <= 0;
end
end
@ -61,21 +61,23 @@ module VX_ipdom #(
.write (push),
`UNUSED_PIN (wren),
.waddr (wr_ptr),
.wdata ({q2, q1}),
.wdata ({q1, q0}),
.raddr (rd_ptr),
.rdata ({d2, d1})
.rdata ({d1, d0})
);
always @(posedge clk) begin
if (push) begin
is_part[wr_ptr] <= ~pair;
slot_idx[wr_ptr] <= 0;
end else if (pop) begin
is_part[rd_ptr] <= 1;
slot_idx[rd_ptr] <= 1;
end
end
assign index = is_part[rd_ptr];
assign d = index ? d1 : d2;
assign d = d_idx ? d1 : d0;
assign d_idx = slot_idx[rd_ptr];
assign d_ptr = rd_ptr;
assign q_ptr = wr_ptr;
assign empty = empty_r;
assign full = full_r;

View file

@ -34,10 +34,6 @@ module VX_schedule #(
localparam NC_WIDTH = `UP(`NC_BITS);
localparam NW_WIDTH = `UP(`NW_BITS);
wire join_else;
wire [`XLEN-1:0] join_pc;
wire [`NUM_THREADS-1:0] join_tmask;
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
reg [`NUM_WARPS-1:0] stalled_warps; // set when branch/gpgpu instructions are issued
@ -63,6 +59,14 @@ module VX_schedule #(
wire schedule_valid;
wire schedule_ready;
// split/join
wire split_is_divergent;
wire [`NUM_THREADS-1:0] split_tmask0;
wire join_is_divergent;
wire join_is_else;
wire [`NUM_THREADS-1:0] join_tmask;
wire [`XLEN-1:0] join_pc;
reg [`PERF_CTR_BITS-1:0] cycles;
reg [`NUM_WARPS-1:0][UUID_WIDTH-1:0] issued_instrs;
@ -103,11 +107,14 @@ module VX_schedule #(
thread_masks[0] <= 1;
end else begin
// join handling
if (decode_sched_if.valid && decode_sched_if.is_join) begin
if (join_else) begin
warp_pcs[decode_sched_if.wid] <= `XLEN'(join_pc);
if (warp_ctl_if.valid && warp_ctl_if.sjoin.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (join_is_divergent) begin
if (join_is_else) begin
warp_pcs[warp_ctl_if.wid] <= `XLEN'(join_pc);
end
thread_masks[warp_ctl_if.wid] <= join_tmask;
end
thread_masks[decode_sched_if.wid] <= join_tmask;
end
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
@ -145,8 +152,8 @@ module VX_schedule #(
// split handling
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
stalled_warps[warp_ctl_if.wid] <= 0;
if (warp_ctl_if.split.diverged) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_tmask;
if (split_is_divergent) begin
thread_masks[warp_ctl_if.wid] <= split_tmask0;
end
end
@ -216,47 +223,62 @@ module VX_schedule #(
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_core_id = NC_WIDTH'(CORE_ID % `NUM_CORES);
// split/join stack management
// split/join handling
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [`PD_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
wire ipdom_index [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0] then_tmask;
wire [`NUM_THREADS-1:0] else_tmask;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign then_tmask[i] = warp_ctl_if.split.tmask[i] && warp_ctl_if.split.taken[i];
assign else_tmask[i] = warp_ctl_if.split.tmask[i] && ~warp_ctl_if.split.taken[i];
end
wire [`CLOG2(`NUM_THREADS+1)-1:0] then_tmask_cnt, else_tmask_cnt;
`POP_COUNT(then_tmask_cnt, then_tmask);
`POP_COUNT(else_tmask_cnt, else_tmask);
wire then_first = (then_tmask_cnt >= else_tmask_cnt);
assign split_is_divergent = (then_tmask != 0) && (else_tmask != 0);
assign split_tmask0 = then_first ? then_tmask : else_tmask;
assign warp_ctl_if.split_ret = ipdom_q_ptr[warp_ctl_if.wid];
assign join_is_divergent = (warp_ctl_if.sjoin.stack_ptr != ipdom_q_ptr[warp_ctl_if.wid]);
assign {join_pc, join_tmask} = ipdom_data[warp_ctl_if.wid];
assign join_is_else = (ipdom_index[warp_ctl_if.wid] == 0);
wire [`NUM_THREADS-1:0] split_tmask1 = then_first ? else_tmask : then_tmask;
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q0 = {warp_ctl_if.split.next_pc, split_tmask1};
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q1 = {`XLEN'(0), warp_ctl_if.split.tmask};
wire ipdom_push = warp_ctl_if.valid && warp_ctl_if.split.valid && split_is_divergent;
wire ipdom_pop = warp_ctl_if.valid && warp_ctl_if.sjoin.valid && join_is_divergent;
`RESET_RELAY (ipdom_reset, reset);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
wire push = warp_ctl_if.valid
&& warp_ctl_if.split.valid
&& (i == warp_ctl_if.wid);
wire pop = decode_sched_if.valid && decode_sched_if.is_join && (i == decode_sched_if.wid);
wire [`NUM_THREADS-1:0] else_tmask = warp_ctl_if.split.else_tmask;
wire [`NUM_THREADS-1:0] orig_tmask = thread_masks[warp_ctl_if.wid];
wire [(`XLEN+`NUM_THREADS)-1:0] q_else = {warp_ctl_if.split.pc, else_tmask};
wire [(`XLEN+`NUM_THREADS)-1:0] q_end = {`XLEN'(0), orig_tmask};
VX_ipdom #(
VX_ipdom_stack #(
.WIDTH (`XLEN+`NUM_THREADS),
.DEPTH (`IPDOM_STACK_SIZE)
) ipdom (
.DEPTH (`PD_STACK_SIZE)
) ipdom_stack (
.clk (clk),
.reset (ipdom_reset),
.push (push),
.pop (pop),
.pair (warp_ctl_if.split.diverged),
.q1 (q_end),
.q2 (q_else),
.push (ipdom_push && (i == warp_ctl_if.wid)),
.pop (ipdom_pop && (i == warp_ctl_if.wid)),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),
.index (ipdom_index[i]),
.d_idx (ipdom_index[i]),
.q_ptr (ipdom_q_ptr[i]),
`UNUSED_PIN (d_ptr),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
assign {join_pc, join_tmask} = ipdom_data[decode_sched_if.wid];
assign join_else = ~ipdom_index[decode_sched_if.wid];
// schedule the next ready warp
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);

108
hw/rtl/core/VX_wctl_unit.sv Normal file
View file

@ -0,0 +1,108 @@
`include "VX_define.vh"
// Warp-control unit: decodes the warp-control GPU ops (wspawn, tmc, pred,
// split, join, bar) for one in-flight instruction, drives the decoded
// control fields out on warp_ctl_if, and produces a writeback commit whose
// data is the divergence-stack pointer (used as the value returned by SPLIT
// and consumed by a later JOIN).
module VX_wctl_unit #(
parameter OUTPUT_REG = 0  // nonzero: insert a pipeline (skid) stage on the commit path
) (
input wire clk,
input wire reset,
// Inputs
VX_gpu_exe_if.slave gpu_exe_if,
// Outputs
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master commit_if
);
localparam UUID_WIDTH = `UP(`UUID_BITS);
localparam NW_WIDTH = `UP(`NW_BITS);
// Decoded per-op control payloads; exactly one of these has .valid set
// for a given op_type (see the is_* one-hot decodes below).
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_split_t split;
gpu_join_t sjoin;
gpu_barrier_t barrier;
// Scalar operands are taken from the issuing thread's lane (tid).
wire [`XLEN-1:0] rs1_data = gpu_exe_if.rs1_data[gpu_exe_if.tid];
wire [`XLEN-1:0] rs2_data = gpu_exe_if.rs2_data[gpu_exe_if.tid];
// Per-thread branch predicate for SPLIT/PRED: LSB of each lane's rs1 value.
wire [`NUM_THREADS-1:0] taken;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign taken[i] = gpu_exe_if.rs1_data[i][0];
end
// One-hot decode of the warp-control opcode.
wire is_wspawn = (gpu_exe_if.op_type == `INST_GPU_WSPAWN);
wire is_tmc = (gpu_exe_if.op_type == `INST_GPU_TMC);
wire is_pred = (gpu_exe_if.op_type == `INST_GPU_PRED);
wire is_split = (gpu_exe_if.op_type == `INST_GPU_SPLIT);
wire is_join = (gpu_exe_if.op_type == `INST_GPU_JOIN);
wire is_bar = (gpu_exe_if.op_type == `INST_GPU_BAR);
// warp_ctl_if pulses on the request fire (valid && ready), i.e. the same
// cycle the skid buffer below accepts the instruction.
assign warp_ctl_if.valid = gpu_exe_if.valid && gpu_exe_if.ready;
assign warp_ctl_if.wid = gpu_exe_if.wid;
assign warp_ctl_if.tmc = tmc;
assign warp_ctl_if.wspawn = wspawn;
assign warp_ctl_if.split = split;
assign warp_ctl_if.sjoin = sjoin;
assign warp_ctl_if.barrier = barrier;
// tmc
// PRED keeps only the taken lanes, but falls back to the current tmask
// when no lane takes (avoids disabling the whole warp); TMC takes the
// new mask directly from the low bits of rs1.
wire [`NUM_THREADS-1:0] then_tmask = gpu_exe_if.tmask & taken;
wire [`NUM_THREADS-1:0] pred_mask = (then_tmask != 0) ? then_tmask : gpu_exe_if.tmask;
assign tmc.valid = is_tmc || is_pred;
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// wspawn
// rs1 = number of warps to activate (mask of warps with index < rs1),
// rs2 = start PC for the spawned warps.
wire [`XLEN-1:0] wspawn_pc = rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign wspawn_wmask[i] = (i < rs1_data[31:0]);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = wspawn_pc;
// split
// The scheduler derives then/else masks from taken & tmask and pushes the
// divergence entry; next_pc is the reconvergence continuation address.
assign split.valid = is_split;
assign split.taken = taken;
assign split.tmask = gpu_exe_if.tmask;
assign split.next_pc = gpu_exe_if.next_PC;
// join
// rs1 carries the stack pointer previously returned by the matching SPLIT;
// it is truncated/resized to the divergence-stack pointer width.
assign sjoin.valid = is_join;
assign sjoin.stack_ptr = `PD_STACK_SIZEW'(rs1_data);
// barrier
// rs1[31] selects a global (multi-core) barrier; rs2 holds the warp count
// (stored as count-1).
assign barrier.valid = is_bar;
assign barrier.id = rs1_data[`NB_BITS-1:0];
assign barrier.is_global = rs1_data[31];
assign barrier.size_m1 = $bits(barrier.size_m1)'(rs2_data[31:0] - 1);
// response
// split_ret (driven back by the scheduler on warp_ctl_if) is captured here
// on the request fire and carried to commit as the instruction's result.
wire [`PD_STACK_SIZEW-1:0] rsp_data;
VX_skid_buffer #(
.DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + `PD_STACK_SIZEW),
.PASSTHRU (OUTPUT_REG == 0)
) rsp_sbuf (
.clk (clk),
.reset (reset),
.valid_in (gpu_exe_if.valid),
.ready_in (gpu_exe_if.ready),
.data_in ({gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, gpu_exe_if.rd, gpu_exe_if.wb, warp_ctl_if.split_ret}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_if.wb, rsp_data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
// Broadcast the stack pointer to every thread lane as the writeback value.
// NOTE(review): commit_if.eop is not assigned in this module, unlike the
// other commit producers in this design — confirm it is tied off by the
// parent (VX_gpu_unit packs a constant 1'b1 into its arbiter data) or the
// interface default.
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign commit_if.data[i] = `XLEN'(rsp_data);
end
endmodule

View file

@ -13,12 +13,13 @@ module VX_fpu_agent #(
VX_fpu_exe_if.slave fpu_exe_if,
VX_fpu_to_csr_if.master fpu_to_csr_if,
VX_commit_if.master fpu_commit_if,
VX_fpu_bus_if.master fpu_bus_if,
input wire csr_pending,
output wire req_pending
output wire req_pending,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -115,18 +116,18 @@ module VX_fpu_agent #(
.reset (reset),
.valid_in (fpu_bus_if.rsp_valid),
.ready_in (fpu_bus_if.rsp_ready),
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, fpu_bus_if.rsp_result}),
.data_out ({fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.data}),
.valid_out (fpu_commit_if.valid),
.ready_out (fpu_commit_if.ready)
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, fpu_bus_if.rsp_result}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_if.data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
assign fpu_commit_if.wb = 1'b1;
assign fpu_commit_if.eop = 1'b1;
assign commit_if.wb = 1'b1;
assign commit_if.eop = 1'b1;
// pending request
wire fpu_commit_fire = fpu_commit_if.valid && fpu_commit_if.ready;
wire fpu_commit_fire = commit_if.valid && commit_if.ready;
reg req_pending_r;
always @(posedge clk) begin

View file

@ -4,20 +4,17 @@ interface VX_decode_sched_if ();
wire valid;
wire is_wstall;
wire is_join;
wire [`UP(`NW_BITS)-1:0] wid;
modport master (
output valid,
output is_wstall,
output is_join,
output wid
);
modport slave (
input valid,
input is_wstall,
input is_join,
input wid
);

View file

@ -11,25 +11,31 @@ interface VX_warp_ctl_if ();
wire [`UP(`NW_BITS)-1:0] wid;
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
gpu_barrier_t barrier;
gpu_split_t split;
gpu_join_t sjoin;
gpu_barrier_t barrier;
wire [`PD_STACK_SIZEW-1:0] split_ret;
modport master (
output valid,
output wid,
output tmc,
output wspawn,
output tmc,
output split,
output sjoin,
output barrier,
output split
input split_ret
);
modport slave (
input valid,
input wid,
input tmc,
input wspawn,
input barrier,
input split
input valid,
input wid,
input wspawn,
input tmc,
input split,
input sjoin,
input barrier,
output split_ret
);
endinterface

View file

@ -10,9 +10,9 @@ module VX_raster_agent #(
VX_raster_exe_if.slave raster_exe_if,
VX_raster_bus_if.slave raster_bus_if,
// Outputs
VX_commit_if.master raster_commit_if,
VX_gpu_csr_if.slave raster_csr_if
// Outputs
VX_gpu_csr_if.slave raster_csr_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -41,7 +41,7 @@ module VX_raster_agent #(
);
// it is possible to have ready = f(valid) when using arbiters,
// because of that we need to decouple raster_exe_if and raster_commit_if handshake with a pipe register
// because of that we need to decouple raster_exe_if and commit_if handshake with a pipe register
assign raster_exe_if.ready = raster_bus_if.req_valid && raster_rsp_ready;
@ -62,18 +62,18 @@ module VX_raster_agent #(
.reset (reset),
.valid_in (raster_rsp_valid),
.ready_in (raster_rsp_ready),
.data_in ({raster_exe_if.uuid, raster_exe_if.wid, raster_exe_if.tmask, raster_exe_if.PC, raster_exe_if.rd, response_data}),
.data_out ({raster_commit_if.uuid, raster_commit_if.wid, raster_commit_if.tmask, raster_commit_if.PC, raster_commit_if.rd, commit_data}),
.valid_out (raster_commit_if.valid),
.ready_out (raster_commit_if.ready)
.data_in ({raster_exe_if.uuid, raster_exe_if.wid, raster_exe_if.tmask, raster_exe_if.PC, raster_exe_if.rd, response_data}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign raster_commit_if.data[i] = `XLEN'(commit_data[i]);
assign commit_if.data[i] = `XLEN'(commit_data[i]);
end
assign raster_commit_if.wb = 1'b1;
assign raster_commit_if.eop = 1'b1;
assign commit_if.wb = 1'b1;
assign commit_if.eop = 1'b1;
`ifdef DBG_TRACE_RASTER
always @(posedge clk) begin

View file

@ -11,8 +11,8 @@ module VX_rop_agent #(
VX_gpu_csr_if.slave rop_csr_if,
// Outputs
VX_commit_if.master rop_commit_if,
VX_rop_bus_if.master rop_bus_if
VX_rop_bus_if.master rop_bus_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -42,7 +42,7 @@ module VX_rop_agent #(
wire rop_rsp_valid, rop_rsp_ready;
// it is possible to have ready = f(valid) when using arbiters,
// because of that we need to decouple rop_exe_if and rop_commit_if handshake with a pipe register
// because of that we need to decouple rop_exe_if and commit_if handshake with a pipe register
VX_skid_buffer #(
.DATAW (UUID_WIDTH + `NUM_THREADS * (1 + 2 * `VX_ROP_DIM_BITS + 32 + `VX_ROP_DEPTH_BITS + 1)),
@ -69,16 +69,16 @@ module VX_rop_agent #(
.reset (reset),
.valid_in (rop_rsp_valid),
.ready_in (rop_rsp_ready),
.data_in ({rop_exe_if.uuid, rop_exe_if.wid, rop_exe_if.tmask, rop_exe_if.PC}),
.data_out ({rop_commit_if.uuid, rop_commit_if.wid, rop_commit_if.tmask, rop_commit_if.PC}),
.valid_out (rop_commit_if.valid),
.ready_out (rop_commit_if.ready)
.data_in ({rop_exe_if.uuid, rop_exe_if.wid, rop_exe_if.tmask, rop_exe_if.PC}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
assign rop_commit_if.data = '0;
assign rop_commit_if.rd = '0;
assign rop_commit_if.wb = 0;
assign rop_commit_if.eop = 1;
assign commit_if.data = '0;
assign commit_if.rd = '0;
assign commit_if.wb = 0;
assign commit_if.eop = 1;
`ifdef DBG_TRACE_ROP
always @(posedge clk) begin

View file

@ -12,7 +12,7 @@ module VX_tex_agent #(
// Outputs
VX_tex_bus_if.master tex_bus_if,
VX_commit_if.master tex_commit_if
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
@ -107,18 +107,18 @@ module VX_tex_agent #(
.reset (reset),
.valid_in (tex_bus_if.rsp_valid),
.ready_in (tex_bus_if.rsp_ready),
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, tex_bus_if.rsp_texels}),
.data_out ({tex_commit_if.uuid, tex_commit_if.wid, tex_commit_if.tmask, tex_commit_if.PC, tex_commit_if.rd, commit_data}),
.valid_out (tex_commit_if.valid),
.ready_out (tex_commit_if.ready)
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, tex_bus_if.rsp_texels}),
.data_out ({commit_if.uuid, commit_if.wid, commit_if.tmask, commit_if.PC, commit_if.rd, commit_data}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign tex_commit_if.data[i] = `XLEN'(commit_data[i]);
assign commit_if.data[i] = `XLEN'(commit_data[i]);
end
assign tex_commit_if.wb = 1'b1;
assign tex_commit_if.eop = 1'b1;
assign commit_if.wb = 1'b1;
assign commit_if.eop = 1'b1;
`ifdef DBG_TRACE_TEX
always @(posedge clk) begin
@ -131,10 +131,10 @@ module VX_tex_agent #(
`TRACE_ARRAY1D(1, tex_exe_if.lod, `NUM_THREADS);
`TRACE(1, (", stage=%0d, tag=0x%0h (#%0d)\n", tex_exe_if.stage, req_tag, tex_exe_if.uuid));
end
if (tex_commit_if.valid && tex_commit_if.ready) begin
`TRACE(1, ("%d: core%0d-tex-rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, texels=", $time, CORE_ID, tex_commit_if.wid, tex_commit_if.PC, tex_commit_if.tmask, tex_commit_if.rd));
`TRACE_ARRAY1D(1, tex_commit_if.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", tex_commit_if.uuid));
if (commit_if.valid && commit_if.ready) begin
`TRACE(1, ("%d: core%0d-tex-rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, texels=", $time, CORE_ID, commit_if.wid, commit_if.PC, commit_if.tmask, commit_if.rd));
`TRACE_ARRAY1D(1, commit_if.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", commit_if.uuid));
end
end
`endif

View file

@ -137,13 +137,15 @@ inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
}
// Split on a predicate
inline void vx_split(unsigned predicate) {
asm volatile (".insn r %0, 2, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(predicate));
inline unsigned vx_split(unsigned predicate) {
unsigned ret;
asm volatile (".insn r %1, 2, 0, %0, %2, x0" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
return ret;
}
// Join
inline void vx_join() {
asm volatile (".insn r %0, 3, 0, x0, x0, x0" :: "i"(RISCV_CUSTOM0));
inline void vx_join(unsigned stack_ptr) {
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
}
// Warp Barrier
@ -153,72 +155,72 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
// Return current thread identifier
inline int vx_thread_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_THREAD_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_THREAD_ID));
return ret;
}
// Return current warp identifier
inline int vx_warp_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_WARP_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_WARP_ID));
return ret;
}
// Return current core identifier
inline int vx_core_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_CORE_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CORE_ID));
return ret;
}
// Return current cluster identifier
inline int vx_cluster_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_CLUSTER_ID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_CLUSTER_ID));
return ret;
}
// Return current threadk mask
inline int vx_thread_mask() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_TMASK));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_TMASK));
return ret;
}
// Return the number of threads per warp
inline int vx_num_threads() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_THREADS));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_THREADS));
return ret;
}
// Return the number of warps per core
inline int vx_num_warps() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_WARPS));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
return ret;
}
// Return the number of cores per cluster
inline int vx_num_cores() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_CORES));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CORES));
return ret;
}
// Return the number of clusters
inline int vx_num_clusters() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_NUM_CLUSTERS));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_CLUSTERS));
return ret;
}
// Return the hart identifier (thread id accross the processor)
inline int vx_hart_id() {
int result;
asm volatile ("csrr %0, %1" : "=r"(result) : "i"(VX_CSR_MHARTID));
return result;
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_MHARTID));
return ret;
}
inline void vx_fence() {

View file

@ -31,7 +31,7 @@ public:
, num_regs_(32)
, num_csrs_(4096)
, num_barriers_(NUM_BARRIERS)
, ipdom_size_(IPDOM_STACK_SIZE)
, ipdom_size_(log2ceil(num_threads) * 2)
{}
uint16_t vsize() const {

View file

@ -28,29 +28,6 @@ union reg_data_t {
int64_t i64;
};
static bool HasDivergentThreads(const ThreadMask &thread_mask,
const std::vector<std::vector<Word>> &reg_file,
unsigned reg) {
bool cond;
size_t thread_idx = 0;
size_t num_threads = reg_file.size();
for (; thread_idx < num_threads; ++thread_idx) {
if (thread_mask[thread_idx]) {
cond = bool(reg_file[thread_idx][reg]);
break;
}
}
assert(thread_idx != num_threads);
for (; thread_idx < num_threads; ++thread_idx) {
if (thread_mask[thread_idx]) {
if (cond != (bool(reg_file[thread_idx][reg]))) {
return true;
}
}
}
return false;
}
inline uint32_t get_fpu_rm(uint32_t func3, Core* core, uint32_t tid, uint32_t wid) {
return (func3 == 0x7) ? core->get_csr(VX_CSR_FRM, tid, wid) : func3;
}
@ -80,7 +57,8 @@ inline int64_t check_boxing(int64_t a) {
void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
assert(tmask_.any());
auto nextPC = PC_ + 4;
auto next_pc = PC_ + 4;
auto next_tmask = tmask_;
auto func2 = instr.getFunc2();
auto func3 = instr.getFunc3();
@ -98,6 +76,12 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
auto num_threads = arch_.num_threads();
uint32_t thread_start = 0;
for (; thread_start < num_threads; ++thread_start) {
if (tmask_.test(thread_start))
break;
}
std::vector<reg_data_t[3]> rsdata(num_threads);
std::vector<reg_data_t> rddata(num_threads);
@ -149,7 +133,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// RV32I: LUI
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = immsrc << 12;
@ -161,7 +145,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// RV32I: AUIPC
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = (immsrc << 12) + PC_;
@ -174,7 +158,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
if (func7 & 0x1) {
@ -334,7 +318,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
@ -395,7 +379,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
if (func7 & 0x1) {
@ -521,7 +505,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::ARITH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
@ -565,49 +549,49 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
switch (func3) {
case 0: {
// RV32I: BEQ
if (rsdata[t][0].i == rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 1: {
// RV32I: BNE
if (rsdata[t][0].i != rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 4: {
// RV32I: BLT
if (rsdata[t][0].i < rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 5: {
// RV32I: BGE
if (rsdata[t][0].i >= rsdata[t][1].i) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 6: {
// RV32I: BLTU
if (rsdata[t][0].u < rsdata[t][1].u) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
case 7: {
// RV32I: BGEU
if (rsdata[t][0].u >= rsdata[t][1].u) {
nextPC = PC_ + immsrc;
next_pc = PC_ + immsrc;
}
break;
}
@ -623,11 +607,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
// RV32I: JAL
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::BRANCH;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = nextPC;
nextPC = PC_ + immsrc;
rddata[t].i = next_pc;
next_pc = PC_ + immsrc;
trace->fetch_stall = true;
break; // runonce
}
@ -639,11 +623,11 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->exe_type = ExeType::ALU;
trace->alu_type = AluType::BRANCH;
trace->used_iregs.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = nextPC;
nextPC = rsdata[t][0].i + immsrc;
rddata[t].i = next_pc;
next_pc = rsdata[t][0].i + immsrc;
trace->fetch_stall = true;
break; // runOnce
}
@ -662,7 +646,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|| (opcode == FL && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
@ -726,7 +710,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|| (opcode == FS && func3 == 2)
|| (opcode == FS && func3 == 3)) {
uint32_t data_bytes = 1 << (func3 & 0x3);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i + immsrc;
@ -769,7 +753,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->data = trace_data;
uint32_t data_bytes = 1 << (func3 & 0x3);
uint32_t data_width = 8 * data_bytes;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].u;
@ -834,7 +818,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
break;
}
case SYS_INST: {
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t csr_addr = immsrc;
@ -931,7 +915,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
case FCI: {
trace->exe_type = ExeType::FPU;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
@ -1264,7 +1248,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_fregs.set(rsrc0);
trace->used_fregs.set(rsrc1);
trace->used_fregs.set(rsrc2);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint32_t frm = get_fpu_rm(func3, core_, t, warp_id_);
@ -1312,14 +1296,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
case EXT1: {
switch (func7) {
case 0: {
uint32_t ts = 0;
for (uint32_t t = 0; t < num_threads; ++t) {
if (tmask_.test(t)) {
ts = t;
break;
}
}
case 0: {
switch (func3) {
case 0: {
// TMC
@ -1334,22 +1311,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
}
if (pred.any()) {
tmask_ &= pred;
next_tmask &= pred;
}
} else {
tmask_.reset();
next_tmask.reset();
for (uint32_t t = 0; t < num_threads; ++t) {
tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
next_tmask.set(t, rsdata.at(thread_start)[0].i & (1 << t));
}
}
DPH(3, "*** New TMC: ");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, tmask_.test(i));
DPN(3, std::endl);
if (!tmask_.any()) {
core_->active_warps_.reset(warp_id_);
}
} break;
case 1: {
// WSPAWN
@ -1358,70 +1327,70 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i);
core_->wspawn(rsdata.at(thread_start)[0].i, rsdata.at(thread_start)[1].i);
} break;
case 2: {
// SPLIT
if (ipdom_stack_.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! (size=" << std::dec << ipdom_stack_.size() << ")\n" << std::flush;
std::abort();
}
// SPLIT
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::SPLIT;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {
ThreadMask tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
}
DomStackEntry e(tmask, nextPC);
ipdom_stack_.push(tmask_);
ipdom_stack_.push(e);
for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
}
auto stack_size = ipdom_stack_.size();
DPH(3, "*** Split: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(t));
DPN(3, ", Pushed TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(t));
DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
ThreadMask then_tmask, else_tmask;
for (uint32_t t = 0; t < num_threads; ++t) {
auto cond = ireg_file_.at(t).at(rsrc0);
then_tmask[t] = tmask_.test(t) && cond;
else_tmask[t] = tmask_.test(t) && !cond;
}
if (then_tmask.count() != tmask_.count()
&& else_tmask.count() != tmask_.count()) {
if (ipdom_stack_.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! (size=" << std::dec << ipdom_stack_.size() << ")\n" << std::flush;
std::abort();
}
if (then_tmask.count() >= else_tmask.count()) {
next_tmask = then_tmask;
} else {
next_tmask = else_tmask;
}
// push reconvergence thread mask
ipdom_stack_.emplace(tmask_);
// push flipped thread mask
auto join_tmask = ~next_tmask & tmask_;
ipdom_stack_.emplace(join_tmask, next_pc);
} else {
DP(3, "*** Unanimous pred");
DomStackEntry e(tmask_);
e.unanimous = true;
ipdom_stack_.push(e);
}
// Uniform control-flow
}
for (uint32_t t = thread_start; t < num_threads; ++t) {
rddata[t].i = stack_size;
}
rd_write = true;
} break;
case 3: {
// JOIN
if (ipdom_stack_.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
std::abort();
}
trace->exe_type = ExeType::GPU;
trace->gpu_type = GpuType::JOIN;
trace->fetch_stall = true;
if (!ipdom_stack_.empty() && ipdom_stack_.top().unanimous) {
DP(3, "*** Unanimous branch at join");
tmask_ = ipdom_stack_.top().tmask;
trace->gpu_type = GpuType::JOIN;
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
uint32_t stack_ptr = ireg_file_.at(thread_start).at(rsrc0);
if (stack_ptr != ipdom_stack_.size()) {
if (ipdom_stack_.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
std::abort();
}
next_tmask = ipdom_stack_.top().tmask;
if (!ipdom_stack_.top().fallthrough) {
next_pc = ipdom_stack_.top().PC;
}
ipdom_stack_.pop();
} else {
if (!ipdom_stack_.top().fallThrough) {
nextPC = ipdom_stack_.top().PC;
DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
}
tmask_ = ipdom_stack_.top().tmask;
DPH(3, "*** Join: New TM=");
for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(t));
DPN(3, "\n");
ipdom_stack_.pop();
}
// Uniform control-flow
}
} break;
case 4: {
// BAR
@ -1430,7 +1399,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
trace->data = std::make_shared<GPUTraceData>(rsdata[ts][0].i, rsdata[ts][1].i);
trace->data = std::make_shared<GPUTraceData>(rsdata[thread_start][0].i, rsdata[thread_start][1].i);
} break;
default:
std::abort();
@ -1446,7 +1415,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
for (uint32_t ri = 0, rn = core_->raster_units_.size(); ri < rn; ++ri) {
trace_data->raster_idx = core_->raster_idx();
bool has_stamps = false;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto result = core_->raster_units_.at(trace_data->raster_idx)->fetch(
@ -1478,7 +1447,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
auto trace_data = std::make_shared<TexUnit::TraceData>(num_threads);
trace->data = trace_data;
trace_data->tex_idx = core_->tex_idx();
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto u = rsdata[t][0].i;
@ -1499,7 +1468,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = rsdata[t][0].i ? rsdata[t][1].i : rsdata[t][2].i;
@ -1515,7 +1484,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
auto trace_data = std::make_shared<RopUnit::TraceData>();
trace->data = trace_data;
trace_data->rop_idx = core_->rop_idx();
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
auto pos_face = rsdata[t][0].i;
@ -1539,7 +1508,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->used_iregs.set(rsrc1);
trace->used_iregs.set(rsrc2);
uint32_t shift = func2 * 8;
for (uint32_t t = 0; t < num_threads; ++t) {
for (uint32_t t = thread_start; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
rddata[t].i = (int32_t)(((int64_t)rsdata[t][0].i32 * (int64_t)rsdata[t][1].i32) >> shift) + rsdata[t][2].i32;
@ -2444,8 +2413,18 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
}
PC_ += 4;
if (PC_ != nextPC) {
DP(3, "*** Next PC: " << std::hex << nextPC << std::dec);
PC_ = nextPC;
if (PC_ != next_pc) {
DP(3, "*** Next PC=0x" << std::hex << next_pc << std::dec);
PC_ = next_pc;
}
if (tmask_ != next_tmask) {
DPH(3, "*** New Tmask=");
for (uint32_t i = 0; i < num_threads; ++i)
DPN(3, next_tmask.test(i));
DPN(3, std::endl);
tmask_ = next_tmask;
if (!next_tmask.any()) {
core_->active_warps_.reset(warp_id_);
}
}
}

View file

@ -16,21 +16,17 @@ struct DomStackEntry {
DomStackEntry(const ThreadMask &tmask, Word PC)
: tmask(tmask)
, PC(PC)
, fallThrough(false)
, unanimous(false)
, fallthrough(false)
{}
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, PC(0)
, fallThrough(true)
, unanimous(false)
DomStackEntry(const ThreadMask &tmask)
: tmask(tmask)
, fallthrough(true)
{}
ThreadMask tmask;
Word PC;
bool fallThrough;
bool unanimous;
bool fallthrough;
};
struct vtype {

View file

@ -29,9 +29,9 @@ int main() {
errors += test_tls();
if (0 == errors) {
vx_printf("Passed!\n");
PRINTF("Passed!\n");
} else {
vx_printf("Failed!\n");
PRINTF("Failed!\n");
}
return errors;

View file

@ -5,31 +5,26 @@
#include <vx_print.h>
#include <vx_spawn.h>
#define __if(b) vx_split(b); \
if (b)
#define __else else
#define __endif vx_join();
int __attribute__ ((noinline)) check_error(const int* buffer, int offset, int size) {
int __attribute__((noinline)) check_error(const int* buffer, int offset, int size) {
int errors = 0;
for (int i = offset; i < size; i++) {
int value = buffer[i];
int ref_value = 65 + i;
if (value == ref_value) {
//vx_printf("[%d] %c\n", i, value);
//PRINTF("[%d] %c\n", i, value);
} else {
vx_printf("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
PRINTF("*** error: [%d] 0x%x, expected 0x%x\n", i, value, ref_value);
++errors;
}
}
return errors;
}
int __attribute__ ((noinline)) make_select_tmask(int tid) {
int __attribute__((noinline)) make_select_tmask(int tid) {
return (1 << tid);
}
int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int __attribute__((noinline)) make_full_tmask(int num_threads) {
return (1 << num_threads) - 1;
}
@ -39,7 +34,7 @@ int __attribute__ ((noinline)) make_full_tmask(int num_threads) {
int global_buffer[GLOBAL_MEM_SZ];
int test_global_memory() {
vx_printf("Global Memory Test\n");
PRINTF("Global Memory Test\n");
for (int i = 0; i < GLOBAL_MEM_SZ; i++) {
global_buffer[i] = 65 + i;
@ -51,7 +46,7 @@ int test_global_memory() {
///////////////////////////////////////////////////////////////////////////////
int test_stack_memory() {
vx_printf("Stack Memory Test\n");
PRINTF("Stack Memory Test\n");
static const int STACK_MEM_SZ = 8;
int stack_buffer[STACK_MEM_SZ];
@ -69,7 +64,7 @@ int test_shared_memory() {
static const int SHARED_MEM_SZ = 8;
int* shared_buffer = (int*)(STACK_BASE_ADDR-(128*4)-SHARED_MEM_SZ*4);
vx_printf("Shared Memory Test\n");
PRINTF("Shared Memory Test\n");
for (int i = 0; i < SHARED_MEM_SZ; i++) {
shared_buffer[i] = 65 + i;
@ -82,13 +77,13 @@ int test_shared_memory() {
int tmc_buffer[8];
void __attribute__ ((noinline)) do_tmc() {
void __attribute__((noinline)) do_tmc() {
unsigned tid = vx_thread_id();
tmc_buffer[tid] = 65 + tid;
}
int test_tmc() {
vx_printf("TMC Test\n");
PRINTF("TMC Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
@ -103,13 +98,13 @@ int test_tmc() {
int pred_buffer[8];
void __attribute__ ((noinline)) do_pred() {
void __attribute__((noinline)) do_pred() {
unsigned tid = vx_thread_id();
pred_buffer[tid] = 65 + tid;
}
int test_pred() {
vx_printf("PRED Test\n");
PRINTF("PRED Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
@ -138,7 +133,7 @@ void wspawn_kernel() {
}
int test_wsapwn() {
vx_printf("Wspawn Test\n");
PRINTF("Wspawn Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, wspawn_kernel);
wspawn_kernel();
@ -150,33 +145,46 @@ int test_wsapwn() {
int dvg_buffer[4];
void __attribute__ ((noinline)) do_divergence() {
unsigned tid = vx_thread_id();
__if (tid < 2) {
__if (tid < 1) {
dvg_buffer[tid] = 65;
void __attribute__((noinline)) do_divergence() {
int tid = vx_thread_id();
int cond1 = tid < 2;
int sp1 = vx_split(cond1);
if (cond1) {
{
int cond2 = tid < 1;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 65; // A
} else {
dvg_buffer[tid] = 66; // B
}
vx_join(sp2);
}
__else {
dvg_buffer[tid] = 66;
{
int cond3 = tid < 0;
int sp3 = vx_split(cond3);
if (cond3) {
dvg_buffer[tid] = 67; // C
}
vx_join(sp3);
}
} else {
{
int cond2 = tid < 3;
int sp2 = vx_split(cond2);
if (cond2) {
dvg_buffer[tid] = 67; // C
} else {
dvg_buffer[tid] = 68; // D
}
vx_join(sp2);
}
__endif
}
__else {
__if (tid < 3) {
dvg_buffer[tid] = 67;
}
__else {
dvg_buffer[tid] = 68;
}
__endif
}
__endif
vx_join(sp1);
}
int test_divergence() {
vx_printf("Control Divergence Test\n");
PRINTF("Control Divergence Test\n");
int num_threads = std::min(vx_num_threads(), 4);
int tmask = make_full_tmask(num_threads);
@ -203,7 +211,7 @@ void st_kernel(int task_id, const st_args_t * __UNIFORM__ arg) {
}
int test_spawn_tasks() {
vx_printf("SpawnTasks Test\n");
PRINTF("SpawnTasks Test\n");
st_args_t arg;
arg.src = st_buffer_src;
@ -232,14 +240,14 @@ void sr_kernel(const sr_args_t * arg) {
arg->buf[tid] = 65 + tid;
}
void __attribute__ ((noinline)) do_serial() {
void __attribute__((noinline)) do_serial() {
sr_args_t arg;
arg.buf = sr_buffer;
vx_serial((vx_serial_cb)sr_kernel, &arg);
}
int test_serial() {
vx_printf("Serial Test\n");
PRINTF("Serial Test\n");
int num_threads = std::min(vx_num_threads(), 8);
int tmask = make_full_tmask(num_threads);
vx_tmc(tmask);
@ -253,7 +261,7 @@ int test_serial() {
int tmask_buffer[8];
int __attribute__ ((noinline)) do_tmask() {
int __attribute__((noinline)) do_tmask() {
int tid = vx_thread_id();
int tmask = make_select_tmask(tid);
int cur_tmask = vx_thread_mask();
@ -262,7 +270,7 @@ int __attribute__ ((noinline)) do_tmask() {
}
int test_tmask() {
vx_printf("Thread Mask Test\n");
PRINTF("Thread Mask Test\n");
// activate all thread to populate shared variables
vx_tmc(-1);
@ -298,7 +306,7 @@ void barrier_kernel() {
}
int test_barrier() {
vx_printf("Barrier Test\n");
PRINTF("Barrier Test\n");
int num_warps = std::min(vx_num_warps(), 8);
barrier_ctr = num_warps;
barrier_stall = 0;
@ -312,7 +320,7 @@ int test_barrier() {
int tls_buffer[8];
__thread int tls_var;
__attribute__ ((noinline)) void print_tls_var() {
__attribute__((noinline)) void print_tls_var() {
unsigned wid = vx_warp_id();
tls_buffer[wid] = 65 + tls_var;
}
@ -325,7 +333,7 @@ void tls_kernel() {
}
int test_tls() {
vx_printf("TLS Test\n");
PRINTF("TLS Test\n");
int num_warps = std::min(vx_num_warps(), 8);
vx_wspawn(num_warps, tls_kernel);
tls_kernel();

View file

@ -1,6 +1,8 @@
#ifndef TESTS
#define TESTS
#define PRINTF vx_printf
int test_global_memory();
int test_stack_memory();