mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 13:57:17 -04:00
wspawn fix for small sets
This commit is contained in:
parent
de2e118cc2
commit
3602d287b4
3 changed files with 160 additions and 3 deletions
|
@ -4,6 +4,7 @@
|
|||
#include <vortex.h>
|
||||
#include <chrono>
|
||||
#include "common.h"
|
||||
#include "kernel_scheduler.h"
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
|
@ -212,6 +213,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
|
|||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
|
||||
size_t value;
|
||||
kernel_arg_t kernel_arg;
|
||||
|
||||
|
@ -222,6 +224,8 @@ int main(int argc, char *argv[]) {
|
|||
count = 1;
|
||||
}
|
||||
|
||||
//kernel_run(count, 1, 1, test, 4, 4);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
|
153
driver/tests/basic/kernel_scheduler.h
Normal file
153
driver/tests/basic/kernel_scheduler.h
Normal file
|
@ -0,0 +1,153 @@
|
|||
#include <iostream>
|
||||
#include <assert.h>
|
||||
|
||||
#define NUM_CORES_MAX 32
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
struct context_t {
|
||||
uint32_t num_groups[3];
|
||||
uint32_t global_offset[3];
|
||||
uint32_t local_size[3];
|
||||
char * printf_buffer;
|
||||
uint32_t *printf_buffer_position;
|
||||
uint32_t printf_buffer_capacity;
|
||||
uint32_t work_dim;
|
||||
};
|
||||
|
||||
typedef void (*vx_pocl_workgroup_func) (
|
||||
const void * /* args */,
|
||||
const struct context_t * /* context */,
|
||||
uint32_t /* group_x */,
|
||||
uint32_t /* group_y */,
|
||||
uint32_t /* group_z */
|
||||
);
|
||||
|
||||
typedef struct {
|
||||
struct context_t * ctx;
|
||||
vx_pocl_workgroup_func pfn;
|
||||
const void * args;
|
||||
int offset;
|
||||
int N;
|
||||
int R;
|
||||
} wspawn_args_t;
|
||||
|
||||
void kernel_spawn_callback(int core_id, int NW, int NT, int nW, wspawn_args_t* p_wspawn_args) {
|
||||
assert(nW <= NW);
|
||||
for (int wid = 0; wid < nW; ++wid) {
|
||||
for (int tid = 0; tid < NT; ++tid) {
|
||||
int wK = (p_wspawn_args->N * wid) + MIN(p_wspawn_args->R, wid);
|
||||
int tK = p_wspawn_args->N + (wid < p_wspawn_args->R);
|
||||
int offset = p_wspawn_args->offset + (wK * NT) + (tid * tK);
|
||||
|
||||
int X = p_wspawn_args->ctx->num_groups[0];
|
||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
||||
int XY = X * Y;
|
||||
|
||||
for (int wg_id = offset, N = wg_id + tK; wg_id < N; ++wg_id) {
|
||||
int k = wg_id / XY;
|
||||
int wg_2d = wg_id - k * XY;
|
||||
int j = wg_2d / X;
|
||||
int i = wg_2d - j * X;
|
||||
|
||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
||||
|
||||
printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_spawn_remaining_callback(int core_id, int NW, int NT, int wid, int nT, wspawn_args_t* p_wspawn_args) {
|
||||
assert(wid < NW);
|
||||
assert(nT <= NT);
|
||||
for (int t = 0; t < nT; ++t) {
|
||||
int tid = core_id * NW * NT + wid * NT + t;
|
||||
|
||||
int wg_id = p_wspawn_args->offset + tid;
|
||||
|
||||
int X = p_wspawn_args->ctx->num_groups[0];
|
||||
int Y = p_wspawn_args->ctx->num_groups[1];
|
||||
int XY = X * Y;
|
||||
|
||||
int k = wg_id / XY;
|
||||
int wg_2d = wg_id - k * XY;
|
||||
int j = wg_2d / X;
|
||||
int i = wg_2d - j * X;
|
||||
|
||||
int gid0 = p_wspawn_args->ctx->global_offset[0] + i;
|
||||
int gid1 = p_wspawn_args->ctx->global_offset[1] + j;
|
||||
int gid2 = p_wspawn_args->ctx->global_offset[2] + k;
|
||||
|
||||
printf("c%d w%d t%d: g={%d, %d, %d}\n", core_id, wid, tid, gid0, gid1, gid2);
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_run_once(context_t* ctx, int NC, int NW, int NT, int core_id) {
|
||||
// total number of WGs
|
||||
int X = ctx->num_groups[0];
|
||||
int Y = ctx->num_groups[1];
|
||||
int Z = ctx->num_groups[2];
|
||||
int Q = X * Y * Z;
|
||||
|
||||
// current core id
|
||||
if (core_id >= NUM_CORES_MAX)
|
||||
return;
|
||||
|
||||
// calculate necessary active cores
|
||||
int WT = NW * NT;
|
||||
int nC = (Q > WT) ? (Q / WT) : 1;
|
||||
int nc = MIN(nC, NC);
|
||||
if (core_id >= nc)
|
||||
return; // terminate extra cores
|
||||
|
||||
// number of workgroups per core
|
||||
int wgs_per_core = Q / nc;
|
||||
int wgs_per_core0 = wgs_per_core;
|
||||
if (core_id == (NC-1)) {
|
||||
int QC_r = Q - (nc * wgs_per_core0);
|
||||
wgs_per_core0 += QC_r; // last core executes remaining WGs
|
||||
}
|
||||
|
||||
// number of workgroups per warp
|
||||
int nW = wgs_per_core0 / NT; // total warps per core
|
||||
int rT = wgs_per_core0 - (nW * NT); // remaining threads
|
||||
int fW = (nW >= NW) ? (nW / NW) : 0; // full warps iterations
|
||||
int rW = (fW != 0) ? (nW - fW * NW) : 0; // reamining full warps
|
||||
if (0 == fW)
|
||||
fW = 1;
|
||||
|
||||
//--
|
||||
wspawn_args_t wspawn_args = { ctx, NULL, NULL, core_id * wgs_per_core, fW, rW };
|
||||
|
||||
//--
|
||||
if (nW >= 1) {
|
||||
int nw = MIN(nW, NW);
|
||||
kernel_spawn_callback(core_id, NW, NT, nw, &wspawn_args);
|
||||
}
|
||||
|
||||
//--
|
||||
if (rT != 0) {
|
||||
wspawn_args.offset = wgs_per_core0 - rT;
|
||||
kernel_spawn_remaining_callback(core_id, NW, NT, 0, rT, &wspawn_args);
|
||||
}
|
||||
}
|
||||
|
||||
void kernel_run(int X, int Y, int Z, int NC, int NW, int NT) {
|
||||
context_t ctx;
|
||||
|
||||
ctx.num_groups[0] = X;
|
||||
ctx.num_groups[1] = Y;
|
||||
ctx.num_groups[2] = Z;
|
||||
ctx.global_offset[0] = 0;
|
||||
ctx.global_offset[1] = 0;
|
||||
ctx.global_offset[2] = 0;
|
||||
|
||||
for (int cid = 0; cid < NC; ++cid) {
|
||||
kernel_run_once(&ctx, NC, NW, NT, cid);
|
||||
}
|
||||
|
||||
exit (0);
|
||||
}
|
|
@ -6,7 +6,7 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define NUM_CORES_MAX 16
|
||||
#define NUM_CORES_MAX 32
|
||||
|
||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||
|
||||
|
@ -71,7 +71,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
|
|||
int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
|
||||
int nc = MIN(nC, NC);
|
||||
if (core_id >= nc)
|
||||
return; // terminate unused cores
|
||||
return; // terminate extra cores
|
||||
|
||||
// number of tasks per core
|
||||
int tasks_per_core = num_tasks / nc;
|
||||
|
@ -94,7 +94,7 @@ void vx_spawn_tasks(int num_tasks, pfn_callback callback , void * args) {
|
|||
g_wspawn_args[core_id] = &wspawn_args;
|
||||
|
||||
//--
|
||||
if (nW > 1) {
|
||||
if (nW >= 1) {
|
||||
int nw = MIN(nW, NW);
|
||||
vx_wspawn(nw, (unsigned)&spawn_tasks_callback);
|
||||
spawn_tasks_callback();
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue