wspawn fix for small sets

2025-04-24 13:57:17 -04:00 · 2021-01-25 07:04:54 -08:00 · 2021-01-25 07:04:54 -08:00 · 3602d287b4
commit 3602d287b4
parent de2e118cc2
3 changed files with 160 additions and 3 deletions
--- a/driver/tests/basic/basic.cpp
+++ b/driver/tests/basic/basic.cpp
@ -4,6 +4,7 @@
 #include <vortex.h>
 #include <chrono>
 #include "common.h"
+#include "kernel_scheduler.h"

 #define RT_CHECK(_expr)                                         \
   do {                                                         \
@ -212,6 +213,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
 }

 int main(int argc, char *argv[]) {
+
  size_t value; 
  kernel_arg_t kernel_arg;

@ -222,6 +224,8 @@ int main(int argc, char *argv[]) {
    count = 1;
  }

+  //kernel_run(count, 1, 1, test, 4, 4);
+
  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));
--- a/driver/tests/basic/kernel_scheduler.h
+++ b/driver/tests/basic/kernel_scheduler.h
@ -0,0 +1,153 @@
+#include <iostream>
+#include <assert.h>
+
+#define NUM_CORES_MAX 32 // kernel_run_once returns immediately for any core_id at or above this limit
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b)) // smaller of the two operands; the selected operand is evaluated twice
+
+struct context_t { // launch description read by the scheduling routines in this header
+  uint32_t num_groups[3]; // work-group count per dimension; kernel_run_once multiplies the three to get the total
+  uint32_t global_offset[3]; // added to the decoded (i, j, k) group indices before they are printed
+  uint32_t local_size[3]; // not read in this header; presumably the work-group size per dimension -- TODO confirm
+  char * printf_buffer; // not read in this header
+  uint32_t *printf_buffer_position; // not read in this header
+  uint32_t printf_buffer_capacity; // not read in this header
+  uint32_t work_dim; // not read in this header; presumably the number of dimensions in use -- TODO confirm
+};
+
+typedef void (*vx_pocl_workgroup_func) ( // work-group entry point type; stored in wspawn_args_t::pfn but never invoked in this header
+  const void * /* args */,
+	const struct context_t * /* context */,
+	uint32_t /* group_x */,
+	uint32_t /* group_y */,
+	uint32_t /* group_z */
+);
+
+typedef struct { // arguments handed from kernel_run_once to the two spawn callbacks
+  struct context_t * ctx; // launch description; the callbacks read num_groups and global_offset from it
+  vx_pocl_workgroup_func pfn; // set to NULL by kernel_run_once and not used by the callbacks here
+  const void * args; // set to NULL by kernel_run_once and not used by the callbacks here
+  int offset; // base linear work-group id the callbacks add their own per-thread index to
+  int N; // base number of work-groups each thread handles in kernel_spawn_callback
+  int R; // warps with wid < R handle one extra work-group per thread
+} wspawn_args_t;
+
+// Prints, for each thread of the first nW warps of a core, the 3-D group id of
+// every work-group that thread is assigned.
+//
+//   core_id        core index, only used in the printed line
+//   NW             warps available per core (upper bound for nW)
+//   NT             threads per warp
+//   nW             number of warps to walk
+//   p_wspawn_args  base offset, per-thread count (N), extra-warp count (R) and
+//                  the launch context
+void kernel_spawn_callback(int core_id, int NW, int NT, int nW, wspawn_args_t* p_wspawn_args) {
+  assert(nW <= NW);
+  for (int wid = 0; wid < nW; ++wid) {
+    for (int tid = 0; tid < NT; ++tid) {
+      // warps below R carry one extra work-group per thread, so the start of
+      // warp `wid` is shifted by the number of such warps that precede it
+      const int warp_start = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
+      const int per_thread = p_wspawn_args->N + (wid < p_wspawn_args->R);
+      const int first = p_wspawn_args->offset + (warp_start * NT) + (tid * per_thread);
+
+      const int groups_x = p_wspawn_args->ctx->num_groups[0];
+      const int groups_y = p_wspawn_args->ctx->num_groups[1];
+      const int plane = groups_x * groups_y;
+
+      const int last = first + per_thread;
+      int linear = first;
+      while (linear < last) {
+        // split the linear id into (x, y, z) group coordinates
+        const int z = linear / plane;
+        const int in_plane = linear % plane;
+        const int y = in_plane / groups_x;
+        const int x = in_plane % groups_x;
+
+        int gid0 = p_wspawn_args->ctx->global_offset[0] + x;
+        int gid1 = p_wspawn_args->ctx->global_offset[1] + y;
+        int gid2 = p_wspawn_args->ctx->global_offset[2] + z;
+
+        printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
+        ++linear;
+      }
+    }
+  }
+}
+
+// Prints the 3-D group id handled by each of the first nT threads of warp
+// `wid`, one work-group per thread.
+//
+//   core_id        core index
+//   NW, NT         warps per core and threads per warp
+//   wid            warp index, must be below NW
+//   nT             number of threads to walk, at most NT
+//   p_wspawn_args  base offset and launch context
+void kernel_spawn_remaining_callback(int core_id, int NW, int NT, int wid, int nT, wspawn_args_t* p_wspawn_args) {
+  assert(wid < NW);
+  assert(nT <= NT);
+
+  // thread index of lane 0 of this warp, counted across all cores
+  const int warp_first_tid = core_id * NW * NT + wid * NT;
+
+  int lane = 0;
+  while (lane < nT) {
+    const int tid = warp_first_tid + lane;
+    const int linear = p_wspawn_args->offset + tid;
+
+    const int groups_x = p_wspawn_args->ctx->num_groups[0];
+    const int groups_y = p_wspawn_args->ctx->num_groups[1];
+    const int plane = groups_x * groups_y;
+
+    // split the linear id into (x, y, z) group coordinates
+    const int z = linear / plane;
+    const int in_plane = linear % plane;
+    const int y = in_plane / groups_x;
+    const int x = in_plane % groups_x;
+
+    int gid0 = p_wspawn_args->ctx->global_offset[0] + x;
+    int gid1 = p_wspawn_args->ctx->global_offset[1] + y;
+    int gid2 = p_wspawn_args->ctx->global_offset[2] + z;
+
+    printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
+    ++lane;
+  }
+}
+
+// Works out what a single core does for a launch of Q = X * Y * Z work-groups
+// on a machine with NC cores, NW warps per core and NT threads per warp, and
+// prints the resulting assignment through the two spawn callbacks.
+//
+//   ctx      launch description; num_groups is read directly here
+//   NC       number of cores
+//   NW       warps per core
+//   NT       threads per warp
+//   core_id  index of the core being evaluated
+//
+// Returns without printing when core_id is negative, at or above
+// NUM_CORES_MAX, or beyond the number of cores the launch needs, and when
+// NW * NT is not positive.
+void kernel_run_once(context_t* ctx, int NC, int NW, int NT, int core_id) {
+  // total number of WGs
+  const int X = ctx->num_groups[0];
+  const int Y = ctx->num_groups[1];
+  const int Z = ctx->num_groups[2];
+  const int Q = X * Y * Z;
+
+  // current core id
+  if (core_id < 0 || core_id >= NUM_CORES_MAX)
+    return;
+
+  // nothing can run without warps or threads; this also keeps the divisions
+  // by WT and NT below well defined
+  const int WT = NW * NT;
+  if (WT <= 0)
+    return;
+
+  // calculate necessary active cores
+  const int nC = (Q > WT) ? (Q / WT) : 1;
+  const int nc = MIN(nC, NC);
+  if (core_id >= nc)
+    return; // terminate extra cores
+
+  // number of workgroups per core
+  const int wgs_per_core = Q / nc;
+  int wgs_per_core0 = wgs_per_core;
+  if (core_id == (nc - 1)) {
+    // The last *active* core executes the remaining WGs. Testing against
+    // NC - 1 would lose them whenever fewer than NC cores are active, because
+    // every core at or past nc has already returned above.
+    wgs_per_core0 += Q - (nc * wgs_per_core);
+  }
+
+  // number of workgroups per warp
+  const int nW = wgs_per_core0 / NT;              // total full warps for this core
+  const int rT = wgs_per_core0 - (nW * NT);       // remaining threads
+  int fW = (nW >= NW) ? (nW / NW) : 0;            // full warps iterations
+  const int rW = (fW != 0) ? (nW - fW * NW) : 0;  // remaining full warps
+  if (0 == fW)
+    fW = 1;
+
+  // first linear work-group id owned by this core
+  const int core_base = core_id * wgs_per_core;
+  wspawn_args_t wspawn_args = { ctx, nullptr, nullptr, core_base, fW, rW };
+
+  // full warps
+  if (nW >= 1) {
+    const int nw = MIN(nW, NW);
+    kernel_spawn_callback(core_id, NW, NT, nw, &wspawn_args);
+  }
+
+  // leftover work-groups that do not fill a whole warp
+  if (rT != 0) {
+    // The leftovers start at core_base + (wgs_per_core0 - rT).
+    // kernel_spawn_remaining_callback adds core_id * NW * NT + t to the
+    // offset (it is called for warp 0), so that per-core term is subtracted
+    // here; otherwise every core other than 0 prints ids outside its range.
+    wspawn_args.offset = core_base + (wgs_per_core0 - rT) - (core_id * WT);
+    kernel_spawn_remaining_callback(core_id, NW, NT, 0, rT, &wspawn_args);
+  }
+}
+
+  // Builds a launch context for X * Y * Z work-groups with zero global offset,
+  // evaluates every core from 0 to NC - 1 in turn, then terminates the process
+  // with status 0.
+  void kernel_run(int X, int Y, int Z, int NC, int NW, int NT) {
+    context_t launch;
+
+    const int group_counts[3] = { X, Y, Z };
+    for (int dim = 0; dim < 3; ++dim) {
+      launch.num_groups[dim] = group_counts[dim];
+      launch.global_offset[dim] = 0;
+    }
+
+    int core = 0;
+    while (core < NC) {
+      kernel_run_once(&launch, NC, NW, NT, core);
+      ++core;
+    }
+
+    exit (0);
+  }
--- a/runtime/src/vx_spawn.c
+++ b/runtime/src/vx_spawn.c
@ -6,7 +6,7 @@
 extern "C" {
 #endif

-#define NUM_CORES_MAX 16
+#define NUM_CORES_MAX 32

 #define MIN(a, b) ((a) < (b) ? (a) : (b))

@ -71,7 +71,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
  int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
  int nc = MIN(nC, NC);
  if (core_id >= nc)
-    return; // terminate unused cores
+    return; // terminate extra cores

  // number of tasks per core
  int tasks_per_core = num_tasks / nc;
@ -94,7 +94,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
  g_wspawn_args[core_id] = &wspawn_args;

  //--
-	if (nW > 1)	{ 
+	if (nW >= 1)	{ 
    int nw = MIN(nW, NW);    
 	  vx_wspawn(nw, (unsigned)&spawn_tasks_callback);
    spawn_tasks_callback();