minor update

This commit is contained in:
Blaise Tine 2024-06-06 00:32:58 -07:00
parent 6c56edf65d
commit 9e20e6edb6
10 changed files with 19 additions and 29 deletions

View file

@ -14,24 +14,28 @@
#ifndef __VX_SPAWN_H__
#define __VX_SPAWN_H__
#include <VX_types.h>
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
typedef void (*vx_spawn_tasks_cb)(int task_id, const void *arg);
typedef void (*vx_spawn_task_groups_cb)(int local_task_id, int group_id, int local_group_id, int warps_per_group, void *arg);
typedef void (*vx_spawn_task_groups_cb)(int local_task_id, int group_id, int local_group_id, int warps_per_group, const void *arg);
typedef void (*vx_serial_cb)(void *arg);
typedef void (*vx_serial_cb)(const void *arg);
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, const void * arg);
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, void * arg);
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, const void * arg);
void vx_serial(vx_serial_cb callback, void * arg);
/**
 * Compute the start address of one group's slice of local memory.
 *
 * The base address is read from the VX_CSR_LOCAL_MEM_BASE CSR and advanced
 * by local_group_id * size. The offset is applied to an int8_t pointer, so
 * `size` is a BYTE count: a caller holding an element count must scale it
 * itself (e.g. count * sizeof(T)), otherwise the slices of neighbouring
 * groups overlap.
 *
 * Nothing is reserved, tracked or bounds-checked here; despite the name this
 * is pure address arithmetic.
 *
 * Declared `static inline` rather than plain `inline`: this header is shared
 * between C and C++ (see the extern "C" guard), and in C99/C11 a non-static
 * inline function in a header provides no external definition, so any call
 * the compiler declines to inline would fail to link.
 *
 * @param local_group_id slice index (presumably the group's index within
 *                       its core -- confirm against the spawn code)
 * @param size           size of one slice, in bytes
 * @return pointer to the first byte of the selected slice
 */
static inline void* vx_local_malloc(int local_group_id, int size) {
  int8_t* const local_base = (int8_t*)csr_read(VX_CSR_LOCAL_MEM_BASE);
  return local_base + local_group_id * size;
}
void vx_serial(vx_serial_cb callback, const void * arg);
#ifdef __cplusplus
}

View file

@ -98,15 +98,14 @@ SECTIONS
PROVIDE_HIDDEN (__tdata_end = .);
}
PROVIDE (__tdata_size = SIZEOF (.tdata));
.tbss :
.tbss :
{
PROVIDE_HIDDEN (__tbss_start = .);
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
PROVIDE_HIDDEN (__tbss_end = .);
}
PROVIDE (__tbss_size = SIZEOF (.tbss));
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
.preinit_array :
{
PROVIDE_HIDDEN (__preinit_array_start = .);

View file

@ -98,15 +98,14 @@ SECTIONS
PROVIDE_HIDDEN (__tdata_end = .);
}
PROVIDE (__tdata_size = SIZEOF (.tdata));
.tbss :
.tbss :
{
PROVIDE_HIDDEN (__tbss_start = .);
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
PROVIDE_HIDDEN (__tbss_end = .);
}
PROVIDE (__tbss_size = SIZEOF (.tbss));
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
.preinit_array :
{
PROVIDE_HIDDEN (__preinit_array_start = .);

View file

@ -34,7 +34,7 @@ void vx_perf_dump() {
int core_id = vx_core_id();
uint32_t * const csr_mem = (uint32_t*)(IO_MPM_ADDR + 64 * sizeof(uint32_t) * core_id);
DUMP_CSRS(0);
//DUMP_CSRS(1);
//DUMP_CSRS(1); reserved for exitcode
DUMP_CSRS(2);
DUMP_CSRS(3);
DUMP_CSRS(4);

View file

@ -209,12 +209,6 @@ static void __attribute__ ((noinline)) process_all_task_groups_stub() {
vx_tmc(0 == vx_warp_id());
}
/**
 * Barrier across the warps of the calling task group.
 *
 * The value held in the VX_CSR_MSCRATCH CSR is interpreted as a pointer to a
 * wspawn_task_groups_args_t (presumably stored there by the task-group spawn
 * path -- confirm), and its warps_per_group field is forwarded to
 * vx_barrier() as the participant count.
 *
 * @param barrier_id identifier passed through unchanged to vx_barrier()
 */
void vx_syncthreads(int barrier_id) {
  // Keep the int intermediate so the value handed to vx_barrier() goes
  // through the same conversion as before.
  const int group_warps =
      ((wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH))->warps_per_group;
  vx_barrier(barrier_id, group_warps);
}
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, void * arg) {
// device specifications
int num_cores = vx_num_cores();

View file

@ -93,8 +93,7 @@ init_regs:
# set thread pointer register
# use address space after BSS region
# ensure cache line alignment
la t1, __tcb_aligned_size
la t1, __tbss_size
mul t0, t0, t1
la tp, _end
addi tp, tp, 63

View file

@ -188,8 +188,6 @@ int main(int argc, char *argv[]) {
cleanup();
exit(1);
}
RT_CHECK(vx_dev_caps(device, VX_CAPS_LOCAL_MEM_ADDR, &kernel_arg.lmem_addr));
std::cout << "using local memory: base_addr=" << std::hex << kernel_arg.lmem_addr << std::dec << std::endl;
} else {
kernel_arg.lmem_addr = 0;
}

View file

@ -10,7 +10,6 @@ typedef struct {
uint32_t group_size;
uint32_t size;
uint32_t tile_size;
uint64_t local_addr;
uint64_t A_addr;
uint64_t B_addr;
uint64_t C_addr;

View file

@ -5,7 +5,6 @@
#include "common.h"
void kernel_body(int local_task_id, int group_id, int local_group_id, int warps_per_group, kernel_arg_t *arg) {
auto local_ptr = reinterpret_cast<TYPE*>(arg->local_addr);
auto A_ptr = reinterpret_cast<TYPE*>(arg->A_addr);
auto B_ptr = reinterpret_cast<TYPE*>(arg->B_addr);
auto C_ptr = reinterpret_cast<TYPE*>(arg->C_addr);
@ -14,6 +13,7 @@ void kernel_body(int local_task_id, int group_id, int local_group_id, int warps_
auto num_groups = arg->num_groups;
auto group_size = arg->group_size;
auto num_tiles = size / tile_size;
auto local_mem = vx_local_malloc(local_group_id, group_size * 2);
// Determine row and column indices of the current subtask
auto l_row = local_task_id / tile_size;
@ -24,7 +24,7 @@ void kernel_body(int local_task_id, int group_id, int local_group_id, int warps_
auto g_col = (group_id % num_tiles) * tile_size + l_col;
// Allocate local memory for the tile of matrix A & B
auto local_A = local_ptr + local_group_id * group_size * 2;
auto local_A = (TYPE*)local_mem;
auto local_B = local_A + group_size;
TYPE sum(0);

View file

@ -183,7 +183,6 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_dev_caps(device, VX_CAPS_LOCAL_MEM_ADDR, &kernel_arg.local_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
@ -191,7 +190,6 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr));
std::cout << "local_addr=0x" << std::hex << kernel_arg.local_addr << std::endl;
std::cout << "A_addr=0x" << std::hex << kernel_arg.A_addr << std::endl;
std::cout << "B_addr=0x" << std::hex << kernel_arg.B_addr << std::endl;
std::cout << "C_addr=0x" << std::hex << kernel_arg.C_addr << std::endl;