mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
minor update
This commit is contained in:
parent
6c56edf65d
commit
9e20e6edb6
10 changed files with 19 additions and 29 deletions
|
@ -14,24 +14,28 @@
|
|||
#ifndef __VX_SPAWN_H__
|
||||
#define __VX_SPAWN_H__
|
||||
|
||||
#include <VX_types.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef void (*vx_spawn_tasks_cb)(int task_id, void *arg);
|
||||
typedef void (*vx_spawn_tasks_cb)(int task_id, const void *arg);
|
||||
|
||||
typedef void (*vx_spawn_task_groups_cb)(int local_task_id, int group_id, int local_group_id, int warps_per_group, void *arg);
|
||||
typedef void (*vx_spawn_task_groups_cb)(int local_task_id, int group_id, int local_group_id, int warps_per_group, const void *arg);
|
||||
|
||||
typedef void (*vx_serial_cb)(void *arg);
|
||||
typedef void (*vx_serial_cb)(const void *arg);
|
||||
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg);
|
||||
void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, const void * arg);
|
||||
|
||||
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, void * arg);
|
||||
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, const void * arg);
|
||||
|
||||
void vx_serial(vx_serial_cb callback, void * arg);
|
||||
// Computes the start address of one group's slice, relative to the base
// address read from the VX_CSR_LOCAL_MEM_BASE CSR.
//   local_group_id: index of the slice; slice i starts at base + i * size.
//   size: slice stride in bytes -- the offset is added to an int8_t* pointer,
//         so it is not scaled by any element type.
// Returns base + local_group_id * size as an untyped pointer.
// No allocation state is kept and no bounds are checked here; the result is
// pure address arithmetic.
// NOTE(review): a caller that casts the result to a wider element type has to
// pass a byte count (element_count * sizeof(element)) -- confirm call sites.
inline void* vx_local_malloc(int local_group_id, int size) {
|
||||
return (int8_t*)csr_read(VX_CSR_LOCAL_MEM_BASE) + local_group_id * size;
|
||||
}
|
||||
|
||||
void vx_serial(vx_serial_cb callback, const void * arg);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -98,15 +98,14 @@ SECTIONS
|
|||
PROVIDE_HIDDEN (__tdata_end = .);
|
||||
}
|
||||
PROVIDE (__tdata_size = SIZEOF (.tdata));
|
||||
.tbss :
|
||||
.tbss :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tbss_start = .);
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
|
||||
PROVIDE_HIDDEN (__tbss_end = .);
|
||||
}
|
||||
PROVIDE (__tbss_size = SIZEOF (.tbss));
|
||||
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
|
||||
.preinit_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__preinit_array_start = .);
|
||||
|
|
|
@ -98,15 +98,14 @@ SECTIONS
|
|||
PROVIDE_HIDDEN (__tdata_end = .);
|
||||
}
|
||||
PROVIDE (__tdata_size = SIZEOF (.tdata));
|
||||
.tbss :
|
||||
.tbss :
|
||||
{
|
||||
PROVIDE_HIDDEN (__tbss_start = .);
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
PROVIDE_HIDDEN (__tbss_offset = ABSOLUTE (__tbss_start - __tdata_start));
|
||||
*(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon)
|
||||
PROVIDE_HIDDEN (__tbss_end = .);
|
||||
}
|
||||
PROVIDE (__tbss_size = SIZEOF (.tbss));
|
||||
PROVIDE (__tcb_aligned_size = ALIGN(__tbss_end - __tdata_start, 64));
|
||||
.preinit_array :
|
||||
{
|
||||
PROVIDE_HIDDEN (__preinit_array_start = .);
|
||||
|
|
|
@ -34,7 +34,7 @@ void vx_perf_dump() {
|
|||
int core_id = vx_core_id();
|
||||
uint32_t * const csr_mem = (uint32_t*)(IO_MPM_ADDR + 64 * sizeof(uint32_t) * core_id);
|
||||
DUMP_CSRS(0);
|
||||
//DUMP_CSRS(1);
|
||||
//DUMP_CSRS(1); reserved for exitcode
|
||||
DUMP_CSRS(2);
|
||||
DUMP_CSRS(3);
|
||||
DUMP_CSRS(4);
|
||||
|
|
|
@ -209,12 +209,6 @@ static void __attribute__ ((noinline)) process_all_task_groups_stub() {
|
|||
vx_tmc(0 == vx_warp_id());
|
||||
}
|
||||
|
||||
// Calls vx_barrier(barrier_id, n), where n is the warps_per_group field of the
// wspawn_task_groups_args_t record whose address is read from the
// VX_CSR_MSCRATCH CSR.
//   barrier_id: forwarded unchanged as the first argument of vx_barrier.
// The pointer obtained from the CSR is dereferenced without a null check.
// NOTE(review): presumably the task-group spawner stores the record's address
// in MSCRATCH before this is called -- confirm against the spawn code.
void vx_syncthreads(int barrier_id) {
|
||||
wspawn_task_groups_args_t* targs = (wspawn_task_groups_args_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
int warps_per_group = targs->warps_per_group;
|
||||
|
||||
vx_barrier(barrier_id, warps_per_group);
|
||||
}
|
||||
|
||||
void vx_spawn_task_groups(int num_groups, int group_size, vx_spawn_task_groups_cb callback, void * arg) {
|
||||
// device specifications
|
||||
int num_cores = vx_num_cores();
|
||||
|
|
|
@ -93,8 +93,7 @@ init_regs:
|
|||
|
||||
# set thread pointer register
|
||||
# use address space after BSS region
|
||||
# ensure cache line alignment
|
||||
la t1, __tcb_aligned_size
|
||||
la t1, __tbss_size
|
||||
mul t0, t0, t1
|
||||
la tp, _end
|
||||
addi tp, tp, 63
|
||||
|
|
|
@ -188,8 +188,6 @@ int main(int argc, char *argv[]) {
|
|||
cleanup();
|
||||
exit(1);
|
||||
}
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_LOCAL_MEM_ADDR, &kernel_arg.lmem_addr));
|
||||
std::cout << "using local memory: base_addr=" << std::hex << kernel_arg.lmem_addr << std::dec << std::endl;
|
||||
} else {
|
||||
kernel_arg.lmem_addr = 0;
|
||||
}
|
||||
|
|
|
@ -10,7 +10,6 @@ typedef struct {
|
|||
uint32_t group_size;
|
||||
uint32_t size;
|
||||
uint32_t tile_size;
|
||||
uint64_t local_addr;
|
||||
uint64_t A_addr;
|
||||
uint64_t B_addr;
|
||||
uint64_t C_addr;
|
||||
|
|
|
@ -5,7 +5,6 @@
|
|||
#include "common.h"
|
||||
|
||||
void kernel_body(int local_task_id, int group_id, int local_group_id, int warps_per_group, kernel_arg_t *arg) {
|
||||
auto local_ptr = reinterpret_cast<TYPE*>(arg->local_addr);
|
||||
auto A_ptr = reinterpret_cast<TYPE*>(arg->A_addr);
|
||||
auto B_ptr = reinterpret_cast<TYPE*>(arg->B_addr);
|
||||
auto C_ptr = reinterpret_cast<TYPE*>(arg->C_addr);
|
||||
|
@ -14,6 +13,7 @@ void kernel_body(int local_task_id, int group_id, int local_group_id, int warps_
|
|||
auto num_groups = arg->num_groups;
|
||||
auto group_size = arg->group_size;
|
||||
auto num_tiles = size / tile_size;
|
||||
auto local_mem = vx_local_malloc(local_group_id, group_size * 2);
|
||||
|
||||
// Determine row and column indices of the current subtask
|
||||
auto l_row = local_task_id / tile_size;
|
||||
|
@ -24,7 +24,7 @@ void kernel_body(int local_task_id, int group_id, int local_group_id, int warps_
|
|||
auto g_col = (group_id % num_tiles) * tile_size + l_col;
|
||||
|
||||
// Allocate local memory for the tile of matrix A & B
|
||||
auto local_A = local_ptr + local_group_id * group_size * 2;
|
||||
auto local_A = (TYPE*)local_mem;
|
||||
auto local_B = local_A + group_size;
|
||||
|
||||
TYPE sum(0);
|
||||
|
|
|
@ -183,7 +183,6 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_LOCAL_MEM_ADDR, &kernel_arg.local_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
|
||||
RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
|
||||
|
@ -191,7 +190,6 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
|
||||
RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr));
|
||||
|
||||
std::cout << "local_addr=0x" << std::hex << kernel_arg.local_addr << std::endl;
|
||||
std::cout << "A_addr=0x" << std::hex << kernel_arg.A_addr << std::endl;
|
||||
std::cout << "B_addr=0x" << std::hex << kernel_arg.B_addr << std::endl;
|
||||
std::cout << "C_addr=0x" << std::hex << kernel_arg.C_addr << std::endl;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue