fixed vortex custom extension opcode to use official unused values

2025-04-24 05:47:35 -04:00 · 2022-03-06 22:55:52 -05:00 · 2022-03-06 22:55:52 -05:00 · a767efe3c2
commit a767efe3c2
parent d241fc9a4b
32 changed files with 284 additions and 281 deletions
--- a/hw/rtl/VX_decode.sv
+++ b/hw/rtl/VX_decode.sv
@ -350,47 +350,52 @@ module VX_decode  #(
                endcase
            end
        `endif
-            `INST_GPGPU: begin 
-                ex_type = `EX_GPU;
-                case (func3)
-                    3'h0: begin
-                        op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
-                        is_wstall = 1;
-                        `USED_IREG (rs1);
-                    end
-                    3'h1: begin
-                        op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
-                        `USED_IREG (rs1);
-                        `USED_IREG (rs2);
-                    end
-                    3'h2: begin
-                        op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
-                        is_wstall = 1;
-                        `USED_IREG (rs1);
-                    end
-                    3'h3: begin 
-                        op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
-                        is_join = 1;
-                    end
-                    3'h4: begin 
-                        op_type = `INST_OP_BITS'(`INST_GPU_BAR);
-                        is_wstall = 1;
-                        `USED_IREG (rs1);
-                        `USED_IREG (rs2);
-                    end                
-                    3'h5: begin
-                        ex_type = `EX_LSU;
-                        op_type = `INST_OP_BITS'(`INST_LSU_LW);
-                        op_mod  = `INST_MOD_BITS'(2);
-                        `USED_IREG (rs1);
+            `INST_EXT1: begin 
+                case (func7)
+                    7'h00: begin
+                        ex_type = `EX_GPU;
+                        case (func3)
+                            3'h0: begin // TMC, PRED
+                                op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
+                                is_wstall = 1;
+                                `USED_IREG (rs1);
+                            end
+                            3'h1: begin // WSPAWN
+                                op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
+                                `USED_IREG (rs1);
+                                `USED_IREG (rs2);
+                            end
+                            3'h2: begin // SPLIT
+                                op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
+                                is_wstall = 1;
+                                `USED_IREG (rs1);
+                            end
+                            3'h3: begin // JOIN
+                                op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
+                                is_join = 1;
+                            end
+                            3'h4: begin // BAR
+                                op_type = `INST_OP_BITS'(`INST_GPU_BAR);
+                                is_wstall = 1;
+                                `USED_IREG (rs1);
+                                `USED_IREG (rs2);
+                            end                
+                            3'h5: begin // PREFETCH
+                                ex_type = `EX_LSU;
+                                op_type = `INST_OP_BITS'(`INST_LSU_LW);
+                                op_mod  = `INST_MOD_BITS'(2);
+                                `USED_IREG (rs1);
+                            end
+                            default:;
+                        endcase
                    end
                    default:;
                endcase
            end
-            `INST_GPU: begin                
+            `INST_EXT2: begin                
                case (func3)
                `ifdef EXT_TEX_ENABLE
-                    3'h0: begin
+                    3'h0: begin // TEX
                        ex_type = `EX_GPU;
                        op_type = `INST_OP_BITS'(`INST_GPU_TEX);
                        op_mod  = `INST_MOD_BITS'(func2);
@ -401,7 +406,7 @@ module VX_decode  #(
                        `USED_IREG (rs3);
                    end
                `endif
-                    3'h1: begin
+                    3'h1: begin // IMADD
                        ex_type = `EX_GPU;
                        op_type = `INST_OP_BITS'(`INST_GPU_IMADD);
                        use_rd = 1;
--- a/hw/rtl/VX_define.vh
+++ b/hw/rtl/VX_define.vh
@ -69,10 +69,11 @@
 `define INST_FNMADD     7'b1001111 
 `define INST_FCI        7'b1010011 // float common instructions

-`define INST_GPGPU      7'b1101011
-`define INST_GPU        7'b1011011
-
-`define INST_TEX       7'b0101011
+// Custom extension opcodes
+`define INST_EXT1       7'b0001011 // 0x0B
+`define INST_EXT2       7'b0101011 // 0x2B
+`define INST_EXT3       7'b1011011 // 0x5B
+`define INST_EXT4       7'b1111011 // 0x7B

 ///////////////////////////////////////////////////////////////////////////////

--- a/runtime/include/vx_intrinsics.h
+++ b/runtime/include/vx_intrinsics.h
@ -77,76 +77,76 @@ extern "C" {
 // Texture load.
 // Emits an R4-type instruction on custom opcode 0x2b with func3=0. The
 // compile-time constant `stage` is placed in the func2 field ("i" constraint),
 // u/v/lod are the rs1/rs2/rs3 register operands, and the macro evaluates to
 // the value the instruction writes to rd.
 #define vx_tex(stage, u, v, lod) ({              \
 	unsigned __r;                               \
-    __asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(stage), "r"(u), "r"(v), "r"(lod)); \
+    __asm__ __volatile__ (".insn r4 0x2b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(stage), "r"(u), "r"(v), "r"(lod)); \
 	__r;							            \
 })

 // Conditional move.
 // Emits an R4-type instruction on custom opcode 0x2b with func3=1, func2=0,
 // rs1=c, rs2=t, rs3=f, and evaluates to the value written to rd.
 // It shares func3=1 with vx_imadd; the two differ only in func2 (0 vs 2).
 #define vx_cmov(c, t, f) ({                     \
 	unsigned __r;		                        \
-    __asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
+    __asm__ __volatile__ (".insn r4 0x2b, 1, 0, %0, %1, %2, %3" : "=r"(__r) : "r"(c), "r"(t), "r"(f)); \
 	__r;							            \
 })

 // IMADD.
 // Emits an R4-type instruction on custom opcode 0x2b with func3=1, func2=2,
 // rs1=x, rs2=y, rs3=acc.
 // NOTE(review): rd is hard-wired to x0 and there is no output operand, so the
 // macro yields no value -- confirm this is intended.
 #define vx_imadd(x, y, acc) ({                  \
-    __asm__ __volatile__ (".insn r4 0x5b, 1, 2, x0, %0, %1, %2" :: "r"(x), "r"(y), "r"(acc); \
+    __asm__ __volatile__ (".insn r4 0x2b, 1, 2, x0, %0, %1, %2" :: "r"(x), "r"(y), "r"(acc)); \
 })

 // Raster load.
 // Emits an R-type instruction on custom opcode 0x0b with func3=0, func7=1 and
 // rs1=rs2=x0, and evaluates to the value written to rd. func7 moved from 0 to
 // 1 in this change; func7=0 on opcode 0x0b now encodes the warp-control
 // intrinsics (vx_tmc, vx_wspawn, ...).
 #define vx_rast() ({                            \
    unsigned __r;                               \
-    __asm__ __volatile__ (".insn r 0x0b, 0, 0, %0, x0, x0" : "=r"(__r)); \
+    __asm__ __volatile__ (".insn r 0x0b, 0, 1, %0, x0, x0" : "=r"(__r)); \
    __r;                                        \
 })

 // Rop write.
 // Emits an R-type instruction on custom opcode 0x0b with func3=1, func7=1,
 // rd=x0, rs1=color, rs2=depth. No value is produced.
 #define vx_rop(color, depth) ({                 \
-    __asm__ __volatile__ (".insn r 0x0b, 0, 1, x0, %0, %1" :: "r"(color), "r"(depth)); \
+    __asm__ __volatile__ (".insn r 0x0b, 1, 1, x0, %0, %1" :: "r"(color), "r"(depth)); \
 })

 // Interpolate.
 // Emits an R4-type instruction on custom opcode 0x2b with func3=2. The
 // compile-time constant `f` is placed in the func2 field ("i" constraint),
 // a/b/c are the rs1/rs2/rs3 register operands, and the macro evaluates to the
 // value written to rd.
 #define vx_interp(f, a, b, c) ({                \
 	unsigned __r;                               \
-    __asm__ __volatile__ (".insn r4 0x5b, 2, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(f), "r"(a), "r"(b), "r"(c)); \
+    __asm__ __volatile__ (".insn r4 0x2b, 2, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(f), "r"(a), "r"(b), "r"(c)); \
 	__r;							            \
 })

 // Set thread mask.
 // Emits an R-type instruction on custom opcode 0x0b with func3=0, func7=0,
 // rd=x0, rs1=thread_mask and rs2=x0 (previously an S-type on opcode 0x6b).
 // The rs2 register index is what separates this from vx_pred, which uses x1.
 inline void vx_tmc(unsigned thread_mask) {
-    asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
+    asm volatile (".insn r 0x0b, 0, 0, x0, %0, x0" :: "r"(thread_mask));
 }

 // Set thread predicate.
 // Same encoding as vx_tmc (R-type, opcode 0x0b, func3=0, func7=0, rd=x0,
 // rs1=condition) except that rs2 is the register index x1 rather than x0;
 // x1 is used only as a selector here, its contents are not an operand.
 inline void vx_pred(unsigned condition) {
-    asm volatile (".insn s 0x6b, 0, x1, 0(%0)" :: "r"(condition));
+    asm volatile (".insn r 0x0b, 0, 0, x0, %0, x1" :: "r"(condition));
 }

 typedef void (*vx_wspawn_pfn)();

 // Spawn warps.
 // Emits an R-type instruction on custom opcode 0x0b with func3=1, func7=0,
 // rd=x0, rs1=num_warps and rs2=func_ptr.
 inline void vx_wspawn(unsigned num_warps, vx_wspawn_pfn func_ptr) {
-    asm volatile (".insn s 0x6b, 1, %1, 0(%0)" :: "r"(num_warps), "r"(func_ptr));
+    asm volatile (".insn r 0x0b, 1, 0, x0, %0, %1" :: "r"(num_warps), "r"(func_ptr));
 }

 // Split on a predicate.
 // Emits an R-type instruction on custom opcode 0x0b with func3=2, func7=0,
 // rd=x0, rs1=predicate and rs2=x0.
 inline void vx_split(int predicate) {
-    asm volatile (".insn s 0x6b, 2, x0, 0(%0)" :: "r"(predicate));
+    asm volatile (".insn r 0x0b, 2, 0, x0, %0, x0" :: "r"(predicate));
 }

 // Join.
 // Emits an R-type instruction on custom opcode 0x0b with func3=3, func7=0 and
 // all register fields set to x0; it takes no operands.
 inline void vx_join() {
-  asm volatile (".insn s 0x6b, 3, x0, 0(x0)");
+  asm volatile (".insn r 0x0b, 3, 0, x0, x0, x0");
 }

 // Warp Barrier.
 // Emits an R-type instruction on custom opcode 0x0b with func3=4, func7=0,
 // rd=x0, rs1=barried_id (the barrier id) and rs2=num_warps.
 inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
-    asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
+    asm volatile (".insn r 0x0b, 4, 0, x0, %0, %1" :: "r"(barried_id), "r"(num_warps));
 }

 // Prefetch.
 // Emits an R-type instruction on custom opcode 0x0b with func3=5, func7=0,
 // rd=x0, rs1=addr and rs2=x0.
 inline void vx_prefetch(unsigned addr) {
-    asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
+    asm volatile (".insn r 0x0b, 5, 0, x0, %0, x0" :: "r"(addr) );
 }

 // Return active warp's thread id 
--- a/runtime/src/vx_spawn.S
+++ b/runtime/src/vx_spawn.S
@ -18,12 +18,12 @@ vx_serial:
 label_loop:
    sub	 t0, s0, s1
    seqz t1, t0                 # t1 = (index == tid)
-    .insn s 0x6b, 2, x0, 0(t1)  # split t0
+    .insn r 0x0b, 2, 0, x0, t1, x0  # split t1
    bnez t0, label_join
    mv   a0, s3                 # a0 <- arg
    jalr s4                     # callback(arg)
 label_join:
-    .insn s 0x6b, 3, x0, 0(x0)  # join
+    .insn r 0x0b, 3, 0, x0, x0, x0  # join
    addi s0, s0, 1              # index++
    blt	 s0, s2, label_loop     # loop back
    lw   ra, 20(sp)
--- a/runtime/src/vx_start.S
+++ b/runtime/src/vx_start.S
@ -9,12 +9,12 @@ _start:
  # execute stack initialization on all warps
  la a1, vx_set_sp
  csrr a0, CSR_NW  # get num warps
-  .insn s 0x6b, 1, a1, 0(a0)  # wspawn a0, a1
+  .insn r 0x0b, 1, 0, x0, a0, a1  # wspawn a0, a1
  jal vx_set_sp

  # return back to single thread execution
  li a0, 1
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
+  .insn r 0x0b, 0, 0, x0, a0, x0  # tmc a0
  
  # Clear the bss segment
  la      a0, _edata
@ -47,7 +47,7 @@ _exit:
  call vx_perf_dump 
  mv gp, s0
  li a0, 0
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
+  .insn r 0x0b, 0, 0, x0, a0, x0  # tmc a0

 .section .text
 .type vx_set_sp, @function
@ -55,7 +55,7 @@ _exit:
 vx_set_sp:
  # activate all threads
  li a0, -1
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
+  .insn r 0x0b, 0, 0, x0, a0, x0  # tmc a0
  
  # set per-thread stack register
  li sp, STACK_BASE_ADDR # load stack base address
@ -68,7 +68,7 @@ vx_set_sp:
  csrr a3, CSR_LWID     # get local wid
  beqz a3, RETURN
  li a0, 0
-  .insn s 0x6b, 0, x0, 0(a0)  # tmc a0
+  .insn r 0x0b, 0, 0, x0, a0, x0  # tmc a0
 RETURN:
  ret

--- a/sim/simx/decode.cpp
+++ b/sim/simx/decode.cpp
@ -39,8 +39,7 @@ static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable =
  {Opcode::FMSUB,      {false, InstType::R4_TYPE}},
  {Opcode::FMNMADD,    {false, InstType::R4_TYPE}},
  {Opcode::FMNMSUB,    {false, InstType::R4_TYPE}},  
-  {Opcode::VSET,       {false, InstType::V_TYPE}}, 
-  {Opcode::GPGPU,      {false, InstType::R_TYPE}},
+  {Opcode::VSET,       {false, InstType::V_TYPE}},
  {Opcode::EXT1,       {false, InstType::R_TYPE}},
  {Opcode::EXT2,       {false, InstType::R4_TYPE}},
  {Opcode::R_INST_W,   {false, InstType::R_TYPE}},
@ -345,27 +344,26 @@ static const char* op_string(const Instr &instr) {
  case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S";
  case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S";
  case Opcode::VSET:    return "VSET";
-  case Opcode::GPGPU:
-    switch (func3) {            
-    case 0: return "TMC";
-    case 1: return "WSPAWN";
-    case 2: return "SPLIT";
-    case 3: return "JOIN";
-    case 4: return "BAR";
-    case 5: return "PREFETCH";
-    default:
-      std::abort();
-    }
  case Opcode::EXT1:
    switch (func7) {
-    case 0: {
+    case 0:
+      switch (func3) {            
+      case 0: return rs2 ? "PRED" : "TMC";
+      case 1: return "WSPAWN";
+      case 2: return "SPLIT";
+      case 3: return "JOIN";
+      case 4: return "BAR";
+      case 5: return "PREFETCH";
+      default:
+        std::abort();
+      }
+    case 1:
      switch (func3) {
      case 0: return "RASTER";
      case 1: return "ROP";
      default:
        std::abort();
      }
-    }
    default:
      std::abort();
    }
--- a/sim/simx/execute.cpp
+++ b/sim/simx/execute.cpp
@ -1285,141 +1285,141 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
    rd_write = true;
    break;
  }
-  case GPGPU: {    
-    uint32_t ts = 0;
-    for (uint32_t t = 0; t < num_threads; ++t) {
-      if (tmask_.test(t)) {
-        ts = t;
-        break;
-      }
-    }
-    switch (func3) {
-    case 0: {
-      // TMC   
-      trace->exe_type = ExeType::GPU;     
-      trace->gpu_type = GpuType::TMC;
-      trace->used_iregs.set(rsrc0);
-      trace->fetch_stall = true;
-      if (rsrc1) {
-        // predicate mode
-        ThreadMask pred;
-        for (uint32_t t = 0; t < num_threads; ++t) {
-          pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
-        }
-        if (pred.any()) {
-          tmask_ &= pred;
-        }
-      } else {
-        tmask_.reset();
-        for (uint32_t t = 0; t < num_threads; ++t) {
-          tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
-        }
-      }
-      DPH(3, "*** New TMC: ");
-      for (uint32_t i = 0; i < num_threads; ++i)
-        DPN(3, tmask_.test(num_threads-i-1));
-      DPN(3, std::endl);
-
-      active_ = tmask_.any();
-      trace->data = new GPUTraceData(active_ << id_);
-    } break;
-    case 1: {
-      // WSPAWN
-      trace->exe_type = ExeType::GPU;
-      trace->gpu_type = GpuType::WSPAWN;
-      trace->used_iregs.set(rsrc0);
-      trace->used_iregs.set(rsrc1);
-      trace->fetch_stall = true;
-      trace->data = new GPUTraceData(core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i));
-    } break;
-    case 2: {
-      // SPLIT    
-      trace->exe_type = ExeType::GPU;
-      trace->gpu_type = GpuType::SPLIT;
-      trace->used_iregs.set(rsrc0);
-      trace->fetch_stall = true;
-      if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {          
-        ThreadMask tmask;
-        for (uint32_t t = 0; t < num_threads; ++t) {
-          tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
-        }
-
-        DomStackEntry e(tmask, nextPC);
-        dom_stack_.push(tmask_);
-        dom_stack_.push(e);
-        for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
-          tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
-        }
-        active_ = tmask_.any();
-
-        DPH(3, "*** Split: New TM=");
-        for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
-        DPN(3, ", Pushed TM=");
-        for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(num_threads-t-1));
-        DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
-      } else {
-        DP(3, "*** Unanimous pred");
-        DomStackEntry e(tmask_);
-        e.unanimous = true;
-        dom_stack_.push(e);
-      }        
-    } break;
-    case 3: {
-      // JOIN
-      trace->exe_type = ExeType::GPU;
-      trace->gpu_type = GpuType::JOIN;        
-      trace->fetch_stall = true;        
-      if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
-        DP(3, "*** Uninimous branch at join");
-        tmask_ = dom_stack_.top().tmask;
-        active_ = tmask_.any();
-        dom_stack_.pop();
-      } else {
-        if (!dom_stack_.top().fallThrough) {
-          nextPC = dom_stack_.top().PC;
-          DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
-        }
-
-        tmask_ = dom_stack_.top().tmask;
-        active_ = tmask_.any();
-
-        DPH(3, "*** Join: New TM=");
-        for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
-        DPN(3, "\n");
-
-        dom_stack_.pop();
-      }        
-    } break;
-    case 4: {
-      // BAR
-      trace->exe_type = ExeType::GPU; 
-      trace->gpu_type = GpuType::BAR;
-      trace->used_iregs.set(rsrc0);
-      trace->used_iregs.set(rsrc1);
-      trace->fetch_stall = true;
-      trace->data = new GPUTraceData(core_->barrier(rsdata[ts][0].i, rsdata[ts][1].i, id_));
-    } break;
-    case 5: {
-      // PREFETCH
-      trace->exe_type = ExeType::LSU; 
-      trace->lsu_type = LsuType::PREFETCH; 
-      trace->used_iregs.set(rsrc0);
-      auto trace_data = new LsuTraceData(num_threads);
-      trace->data = trace_data;
-      for (uint32_t t = 0; t < num_threads; ++t) {
-        if (!tmask_.test(t))
-          continue;
-        auto mem_addr = rsdata[t][0].i;
-        trace_data->mem_addrs.at(t) = {mem_addr, 4};
-      }
-    } break;
-    default:
-      std::abort();
-    }
-  }  break;
  case EXT1: {   
    switch (func7) {
-    case 0:
+    case 0: {    
+      uint32_t ts = 0;
+      for (uint32_t t = 0; t < num_threads; ++t) {
+        if (tmask_.test(t)) {
+          ts = t;
+          break;
+        }
+      }
+      switch (func3) {
+      case 0: {
+        // TMC   
+        trace->exe_type = ExeType::GPU;     
+        trace->gpu_type = GpuType::TMC;
+        trace->used_iregs.set(rsrc0);
+        trace->fetch_stall = true;
+        if (rsrc1) {
+          // predicate mode
+          ThreadMask pred;
+          for (uint32_t t = 0; t < num_threads; ++t) {
+            pred[t] = tmask_.test(t) ? (ireg_file_.at(t).at(rsrc0) != 0) : 0;
+          }
+          if (pred.any()) {
+            tmask_ &= pred;
+          }
+        } else {
+          tmask_.reset();
+          for (uint32_t t = 0; t < num_threads; ++t) {
+            tmask_.set(t, rsdata.at(ts)[0].i & (1 << t));
+          }
+        }
+        DPH(3, "*** New TMC: ");
+        for (uint32_t i = 0; i < num_threads; ++i)
+          DPN(3, tmask_.test(num_threads-i-1));
+        DPN(3, std::endl);
+
+        active_ = tmask_.any();
+        trace->data = new GPUTraceData(active_ << id_);
+      } break;
+      case 1: {
+        // WSPAWN
+        trace->exe_type = ExeType::GPU;
+        trace->gpu_type = GpuType::WSPAWN;
+        trace->used_iregs.set(rsrc0);
+        trace->used_iregs.set(rsrc1);
+        trace->fetch_stall = true;
+        trace->data = new GPUTraceData(core_->wspawn(rsdata.at(ts)[0].i, rsdata.at(ts)[1].i));
+      } break;
+      case 2: {
+        // SPLIT    
+        trace->exe_type = ExeType::GPU;
+        trace->gpu_type = GpuType::SPLIT;
+        trace->used_iregs.set(rsrc0);
+        trace->fetch_stall = true;
+        if (HasDivergentThreads(tmask_, ireg_file_, rsrc0)) {          
+          ThreadMask tmask;
+          for (uint32_t t = 0; t < num_threads; ++t) {
+            tmask[t] = tmask_.test(t) && !ireg_file_.at(t).at(rsrc0);
+          }
+
+          DomStackEntry e(tmask, nextPC);
+          dom_stack_.push(tmask_);
+          dom_stack_.push(e);
+          for (uint32_t t = 0, n = e.tmask.size(); t < n; ++t) {
+            tmask_.set(t, !e.tmask.test(t) && tmask_.test(t));
+          }
+          active_ = tmask_.any();
+
+          DPH(3, "*** Split: New TM=");
+          for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
+          DPN(3, ", Pushed TM=");
+          for (uint32_t t = 0; t < num_threads; ++t) DPN(3, e.tmask.test(num_threads-t-1));
+          DPN(3, ", PC=0x" << std::hex << e.PC << "\n");
+        } else {
+          DP(3, "*** Unanimous pred");
+          DomStackEntry e(tmask_);
+          e.unanimous = true;
+          dom_stack_.push(e);
+        }        
+      } break;
+      case 3: {
+        // JOIN
+        trace->exe_type = ExeType::GPU;
+        trace->gpu_type = GpuType::JOIN;        
+        trace->fetch_stall = true;        
+        if (!dom_stack_.empty() && dom_stack_.top().unanimous) {
+          DP(3, "*** Uninimous branch at join");
+          tmask_ = dom_stack_.top().tmask;
+          active_ = tmask_.any();
+          dom_stack_.pop();
+        } else {
+          if (!dom_stack_.top().fallThrough) {
+            nextPC = dom_stack_.top().PC;
+            DP(3, "*** Join: next PC: " << std::hex << nextPC << std::dec);
+          }
+
+          tmask_ = dom_stack_.top().tmask;
+          active_ = tmask_.any();
+
+          DPH(3, "*** Join: New TM=");
+          for (uint32_t t = 0; t < num_threads; ++t) DPN(3, tmask_.test(num_threads-t-1));
+          DPN(3, "\n");
+
+          dom_stack_.pop();
+        }        
+      } break;
+      case 4: {
+        // BAR
+        trace->exe_type = ExeType::GPU; 
+        trace->gpu_type = GpuType::BAR;
+        trace->used_iregs.set(rsrc0);
+        trace->used_iregs.set(rsrc1);
+        trace->fetch_stall = true;
+        trace->data = new GPUTraceData(core_->barrier(rsdata[ts][0].i, rsdata[ts][1].i, id_));
+      } break;
+      case 5: {
+        // PREFETCH
+        trace->exe_type = ExeType::LSU; 
+        trace->lsu_type = LsuType::PREFETCH; 
+        trace->used_iregs.set(rsrc0);
+        auto trace_data = new LsuTraceData(num_threads);
+        trace->data = trace_data;
+        for (uint32_t t = 0; t < num_threads; ++t) {
+          if (!tmask_.test(t))
+            continue;
+          auto mem_addr = rsdata[t][0].i;
+          trace_data->mem_addrs.at(t) = {mem_addr, 4};
+        }
+      } break;
+      default:
+        std::abort();
+      }
+    } break;
+    case 1:
      switch (func3) {
      case 0: { // RASTER
        trace->exe_type = ExeType::GPU; 
--- a/sim/simx/instr.h
+++ b/sim/simx/instr.h
@ -29,10 +29,11 @@ enum Opcode {
  FMNMADD   = 0x4f,
  // Vector Extension  
  VSET      = 0x57,
-  // Vortex Extensions
+  // Custom Extensions
  EXT1      = 0x0b,
-  EXT2      = 0x5b,
-  GPGPU     = 0x6b,
+  EXT2      = 0x2b,
+  EXT3      = 0x5b,
+  EXT4      = 0x7b,
  // RV64 Standard Extensions
  R_INST_W  = 0x3b,
  I_INST_W  = 0x1b,
--- a/tests/opencl/guassian/kernel.pocl
+++ b/tests/opencl/guassian/kernel.pocl
--- a/tests/opencl/nearn/kernel.pocl
+++ b/tests/opencl/nearn/kernel.pocl
--- a/tests/opencl/oclprintf/kernel.pocl
+++ b/tests/opencl/oclprintf/kernel.pocl
--- a/tests/opencl/psort/kernel.pocl
+++ b/tests/opencl/psort/kernel.pocl
--- a/tests/opencl/saxpy/kernel.pocl
+++ b/tests/opencl/saxpy/kernel.pocl
--- a/tests/opencl/sfilter/kernel.pocl
+++ b/tests/opencl/sfilter/kernel.pocl
--- a/tests/opencl/sgemm/kernel.pocl
+++ b/tests/opencl/sgemm/kernel.pocl
--- a/tests/opencl/vecadd/kernel.pocl
+++ b/tests/opencl/vecadd/kernel.pocl
--- a/tests/regression/basic/main.cpp
+++ b/tests/regression/basic/main.cpp
@ -249,8 +249,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;

-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));

--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
  
--- a/tests/regression/diverge/main.cpp
+++ b/tests/regression/diverge/main.cpp
@ -201,8 +201,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
                                  std::max<uint32_t>(dst_buf_size, 
                                    sizeof(kernel_arg_t)));
--- a/tests/regression/dogfood/main.cpp
+++ b/tests/regression/dogfood/main.cpp
@ -204,8 +204,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;
  RT_CHECK(vx_buf_alloc(device, sizeof(kernel_arg_t), &arg_buf));
  RT_CHECK(vx_buf_alloc(device, buf_size, &src1_buf));
  RT_CHECK(vx_buf_alloc(device, buf_size, &src2_buf));
--- a/tests/regression/draw3d/main.cpp
+++ b/tests/regression/draw3d/main.cpp
@ -26,7 +26,7 @@ using namespace cocogfx;
 ///////////////////////////////////////////////////////////////////////////////

 const char* kernel_file = "kernel.bin";
-const char* input_file  = "soccer.png";
+const char* input_file  = "fire.png";
 const char* output_file = "output.png";
 const char* reference_file  = nullptr;
 uint32_t clear_color = 0x00000000;
@ -34,8 +34,8 @@ int tex_format = TEX_FORMAT_A8R8G8B8;
 ePixelFormat tex_eformat = FORMAT_A8R8G8B8;
 int tex_wrap = TEX_WRAP_CLAMP;
 int tex_filter  = TEX_FILTER_POINT;
-uint32_t dst_width  = 64;
-uint32_t dst_height = 64;
+uint32_t dst_width  = 256;
+uint32_t dst_height = 256;
 const model_t& model = model_quad;

 vx_device_h device = nullptr;
@ -218,6 +218,7 @@ int main(int argc, char *argv[]) {

  // Perform tile binning
  auto num_tiles = Binning(tilebuf, primbuf, model, dst_width, dst_height, tile_size);
+  std::cout << "Binning allocated " << num_tiles << " tiles." << std::endl;
  
  // upload program
  std::cout << "upload program" << std::endl;  
@ -237,8 +238,8 @@ int main(int argc, char *argv[]) {
  std::cout << "zbuf_addr=0x" << std::hex << zbuf_addr << std::endl;
  std::cout << "cbuf_addr=0x" << std::hex << cbuf_addr << std::endl;

-  // allocate staging shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>({
      sizeof(kernel_arg_t), (uint32_t)tilebuf.size(), (uint32_t)primbuf.size(), zbuf_size, cbuf_size
    });
--- a/tests/regression/draw3d/utils.cpp
+++ b/tests/regression/draw3d/utils.cpp
@ -16,9 +16,7 @@ using fixed16_t  = TFixed<16>;

 using vec2d_f_t  = TVector2<float>;
 using vec2d_fx_t = TVector2<fixed16_t>;
-
 using vec4d_f_t  = TVector4<float>;
-
 using rect_f_t   = TRect<float>;

 static fixed16_t fxZero(0);
@ -30,13 +28,13 @@ static fixed16_t evalEdgeFunction(const rast_edge_t& e, uint32_t x, uint32_t y)
 }

 // Calculate the edge extent for a tile corner.
 // Picks one of four corner offsets from the signs of e.x and e.y so that only
 // the non-negative components contribute, and returns their sum:
 //   e.x >= 0, e.y >= 0 -> e.x + e.y;  e.x < 0, e.y >= 0 -> e.y;
 //   e.x >= 0, e.y <  0 -> e.x;        e.x < 0, e.y <  0 -> 0.
 // After this change the result is no longer shifted by the tile size here.
-static fixed16_t calcEdgeExtents(const rast_edge_t& e, uint32_t logTileSize) {
+static fixed16_t calcEdgeExtents(const rast_edge_t& e) {
   vec2d_fx_t corners[4] = {{fxZero, fxZero},  // 00
-                            {e.x,    fxZero},  // 10
-                            {fxZero, e.y},     // 01
-                            {e.x,    e.y}};    // 11
+                           {e.x,    fxZero},  // 10
+                           {fxZero, e.y},     // 01
+                           {e.x,    e.y}};    // 11
   auto i = (e.y >= fxZero) ? ((e.x >= fxZero) ? 3 : 2) : (e.x >= fxZero) ? 1 : 0;
-  return (corners[i].x + corners[i].y) << logTileSize;
+  return corners[i].x + corners[i].y;
 }

 static float EdgeEquation(rast_edge_t edges[3], 
@ -92,7 +90,7 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
                 uint32_t height,
                 uint32_t tileSize) {

-  uint32_t logTileSize = log2ceil(tileSize);
+  uint32_t tileLogSize = log2ceil(tileSize);

  std::unordered_map<uint32_t, std::vector<uint32_t>> tiles;

@ -151,7 +149,7 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
    uint32_t p;

    {
-      #define INTERPOLATE_DELTA(dx, x0, x1, x2) \
+      #define ATTRIBUTE_DELTA(dx, x0, x1, x2) \
        dx.x = fixed23_t(x0 - x2); \
        dx.y = fixed23_t(x1 - x2); \
        dx.z = fixed23_t(x2)
@ -168,34 +166,34 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
      ColorToFloat(colors[1], v1.c);
      ColorToFloat(colors[2], v2.c);
      
-      INTERPOLATE_DELTA(rast_prim.attribs.z, v0.z, v1.z, v2.z);
-      INTERPOLATE_DELTA(rast_prim.attribs.r, colors[0][0], colors[1][0], colors[2][0]);
-      INTERPOLATE_DELTA(rast_prim.attribs.g, colors[0][1], colors[1][1], colors[2][1]);
-      INTERPOLATE_DELTA(rast_prim.attribs.b, colors[0][2], colors[1][2], colors[2][2]);
-      INTERPOLATE_DELTA(rast_prim.attribs.a, colors[0][3], colors[1][3], colors[2][3]);      
-      INTERPOLATE_DELTA(rast_prim.attribs.u, v0.u, v1.u, v2.u);
-      INTERPOLATE_DELTA(rast_prim.attribs.v, v0.v, v1.v, v2.v);
+      ATTRIBUTE_DELTA(rast_prim.attribs.z, v0.z, v1.z, v2.z);
+      ATTRIBUTE_DELTA(rast_prim.attribs.r, colors[0][0], colors[1][0], colors[2][0]);
+      ATTRIBUTE_DELTA(rast_prim.attribs.g, colors[0][1], colors[1][1], colors[2][1]);
+      ATTRIBUTE_DELTA(rast_prim.attribs.b, colors[0][2], colors[1][2], colors[2][2]);
+      ATTRIBUTE_DELTA(rast_prim.attribs.a, colors[0][3], colors[1][3], colors[2][3]);      
+      ATTRIBUTE_DELTA(rast_prim.attribs.u, v0.u, v1.u, v2.u);
+      ATTRIBUTE_DELTA(rast_prim.attribs.v, v0.v, v1.v, v2.v);

      p = rast_prims.size();
      rast_prims.push_back(rast_prim);      
    }

    // Calculate min/max tile positions
-    auto tileSize = 1 << logTileSize;
-    auto minTileX = bbox.left >> logTileSize;
-    auto minTileY = bbox.top >> logTileSize;
-    auto maxTileX = (bbox.right + tileSize - 1) >> logTileSize;
-    auto maxTileY = (bbox.bottom + tileSize - 1) >> logTileSize;
+    auto tileSize = 1 << tileLogSize;
+    auto minTileX = bbox.left >> tileLogSize;
+    auto minTileY = bbox.top >> tileLogSize;
+    auto maxTileX = (bbox.right + tileSize - 1) >> tileLogSize;
+    auto maxTileY = (bbox.bottom + tileSize - 1) >> tileLogSize;

    // Starting tile coordinates
-    auto X = minTileX << logTileSize;
-    auto Y = minTileY << logTileSize;
+    auto X = minTileX << tileLogSize;
+    auto Y = minTileY << tileLogSize;

    // Add tile corner edge offsets
    fixed16_t extents[3];
-    extents[0] = calcEdgeExtents(edges[0], logTileSize);
-    extents[1] = calcEdgeExtents(edges[1], logTileSize);
-    extents[2] = calcEdgeExtents(edges[2], logTileSize);
+    extents[0] = calcEdgeExtents(edges[0]);
+    extents[1] = calcEdgeExtents(edges[1]);
+    extents[2] = calcEdgeExtents(edges[2]);

    // Evaluate edge equation for the starting tile
    auto e0 = evalEdgeFunction(edges[0], X, Y);
@ -209,34 +207,33 @@ uint32_t Binning(std::vector<uint8_t>& tilebuf,
      auto ee2 = e2;
      for (uint32_t tx = minTileX; tx < maxTileX; ++tx) {
        // check if tile overlap triangle    
-        if (((ee0 + extents[0]).data() 
-           | (ee1 + extents[1]).data()
-           | (ee2 + extents[2]).data()) >= 0) {
+        if (((ee0 + (extents[0] << tileLogSize)).data() 
+           | (ee1 + (extents[1] << tileLogSize)).data()
+           | (ee2 + (extents[2] << tileLogSize)).data()) >= 0) {
          // assign primitive to tile
          uint32_t tile_id = (ty << 16) | tx;
          tiles[tile_id].push_back(p);
          ++num_prims;
        }
-
        // update edge equation x components
-        ee0 += edges[0].x << logTileSize;
-        ee1 += edges[1].x << logTileSize;
-        ee2 += edges[2].x << logTileSize;
+        ee0 += edges[0].x << tileLogSize;
+        ee1 += edges[1].x << tileLogSize;
+        ee2 += edges[2].x << tileLogSize;
      }
      // update edge equation y components
-      e0 += edges[0].y << logTileSize;
-      e1 += edges[1].y << logTileSize;
-      e2 += edges[2].y << logTileSize;
+      e0 += edges[0].y << tileLogSize;
+      e1 += edges[1].y << tileLogSize;
+      e2 += edges[2].y << tileLogSize;
    }
  }

  {
-    primbuf.reserve(rast_prims.size()  * sizeof(rast_prim_t));
+    primbuf.resize(rast_prims.size() * sizeof(rast_prim_t));
    memcpy(primbuf.data(), rast_prims.data(), primbuf.size());
  }
  
  {
-    tilebuf.reserve(tiles.size() * sizeof(rast_tile_header_t) + num_prims * sizeof(uint32_t));
+    tilebuf.resize(tiles.size() * sizeof(rast_tile_header_t) + num_prims * sizeof(uint32_t));
    auto tile_data = tilebuf.data();
    for (auto it : tiles) {
      rast_tile_header_t header{it.first, (uint32_t)it.second.size()};
--- a/tests/regression/fence/main.cpp
+++ b/tests/regression/fence/main.cpp
@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
  
--- a/tests/regression/io_addr/main.cpp
+++ b/tests/regression/io_addr/main.cpp
@ -183,8 +183,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint32_t),
                                std::max<uint32_t>(src_buf_size,
                                  std::max<uint32_t>(dst_buf_size, 
--- a/tests/regression/mstress/main.cpp
+++ b/tests/regression/mstress/main.cpp
@ -236,8 +236,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src="  << std::hex << kernel_arg.src1_addr << std::endl;  
  std::cout << "dev_dst="  << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size, 
                                std::max<uint32_t>(addr_buf_size, 
                                  std::max<uint32_t>(dst_buf_size, 
--- a/tests/regression/no_mf_ext/main.cpp
+++ b/tests/regression/no_mf_ext/main.cpp
@ -138,8 +138,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
  
--- a/tests/regression/no_smem/main.cpp
+++ b/tests/regression/no_smem/main.cpp
@ -138,8 +138,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
  
--- a/tests/regression/prefetch/main.cpp
+++ b/tests/regression/prefetch/main.cpp
@ -148,8 +148,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
  
--- a/tests/regression/printf/main.cpp
+++ b/tests/regression/printf/main.cpp
@ -111,8 +111,8 @@ int main(int argc, char *argv[]) {

  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
  
--- a/tests/regression/sort/main.cpp
+++ b/tests/regression/sort/main.cpp
@ -178,8 +178,8 @@ int main(int argc, char *argv[]) {
  std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
  
-  // allocate shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
                                  std::max<uint32_t>(dst_buf_size, 
                                    sizeof(kernel_arg_t)));
--- a/tests/regression/tex/main.cpp
+++ b/tests/regression/tex/main.cpp
@ -213,8 +213,8 @@ int main(int argc, char *argv[]) {
  std::cout << "src_addr=0x" << std::hex << src_addr << std::endl;
  std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl;

-  // allocate staging shared memory  
-  std::cout << "allocate shared memory" << std::endl;    
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
  uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t), 
                            std::max<uint32_t>(src_bufsize, dst_bufsize));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
--- a/third_party/cocogfx
+++ b/third_party/cocogfx
@ -1 +1 @@
-Subproject commit 8f78db5e1845b2a9cd337ac154ee276250d91ad3
+Subproject commit 6ff9739cee9a0528142123985e4d8e59f7d0a4e8