fixed loader script stack setup

This commit is contained in:
Blaise Tine 2020-12-31 22:37:20 -05:00
parent e757a0e333
commit e4a00dd0d9
43 changed files with 157370 additions and 118082 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -17,11 +17,15 @@ typedef void* vx_buffer_h;
#define VX_CAPS_MAX_CORES 0x1 #define VX_CAPS_MAX_CORES 0x1
#define VX_CAPS_MAX_WARPS 0x2 #define VX_CAPS_MAX_WARPS 0x2
#define VX_CAPS_MAX_THREADS 0x3 #define VX_CAPS_MAX_THREADS 0x3
#define VX_CAPS_CACHE_LINESIZE 0x4 #define VX_CAPS_CACHE_LINE_SIZE 0x4
#define VX_CAPS_LOCAL_MEM_SIZE 0x5 #define VX_CAPS_LOCAL_MEM_SIZE 0x5
#define VX_CAPS_ALLOC_BASE_ADDR 0x6 #define VX_CAPS_ALLOC_BASE_ADDR 0x6
#define VX_CAPS_KERNEL_BASE_ADDR 0x7 #define VX_CAPS_KERNEL_BASE_ADDR 0x7
#define CACHE_BLOCK_SIZE 64
#define ALLOC_BASE_ADDR 0x00000000
#define LOCAL_MEM_SIZE 0xffffffff
// open the device and connect to it // open the device and connect to it
int vx_dev_open(vx_device_h* hdevice); int vx_dev_open(vx_device_h* hdevice);

View file

@ -3,7 +3,7 @@
#include <fstream> #include <fstream>
#include <iomanip> #include <iomanip>
#define CCI_LATENCY 8 #define CCI_LATENCY 8
#define CCI_RAND_MOD 8 #define CCI_RAND_MOD 8
#define CCI_RQ_SIZE 16 #define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16 #define CCI_WQ_SIZE 16
@ -204,11 +204,11 @@ void opae_sim::sRxPort_bus() {
if (!mmio_req_enabled if (!mmio_req_enabled
&& (cci_rd_it != cci_reads_.end())) { && (cci_rd_it != cci_reads_.end())) {
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1; vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1;
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->block.data(), CACHE_BLOCK_SIZE); memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE);
vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata; vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;
/*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata); /*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
for (int i = 0; i < CACHE_BLOCK_SIZE; ++i) for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
printf("%02x", cci_rd_it->block[CACHE_BLOCK_SIZE-1-i]); printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]);
printf("\n");*/ printf("\n");*/
cci_reads_.erase(cci_rd_it); cci_reads_.erase(cci_rd_it);
} }
@ -223,7 +223,7 @@ void opae_sim::sTxPort_bus() {
cci_req.addr = vortex_afu_->af2cp_sTxPort_c0_hdr_address; cci_req.addr = vortex_afu_->af2cp_sTxPort_c0_hdr_address;
cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata; cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata;
auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE); auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
memcpy(cci_req.block.data(), host_ptr, CACHE_BLOCK_SIZE); memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE);
//printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata); //printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
cci_reads_.emplace_back(cci_req); cci_reads_.emplace_back(cci_req);
} }
@ -262,7 +262,7 @@ void opae_sim::avs_bus() {
vortex_afu_->avs_readdatavalid = 0; vortex_afu_->avs_readdatavalid = 0;
if (dram_rd_it != dram_reads_.end()) { if (dram_rd_it != dram_reads_.end()) {
vortex_afu_->avs_readdatavalid = 1; vortex_afu_->avs_readdatavalid = 1;
memcpy(vortex_afu_->avs_readdata, dram_rd_it->block.data(), CACHE_BLOCK_SIZE); memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), CACHE_BLOCK_SIZE);
uint32_t addr = dram_rd_it->addr; uint32_t addr = dram_rd_it->addr;
dram_reads_.erase(dram_rd_it); dram_reads_.erase(dram_rd_it);
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE); /*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE);
@ -304,7 +304,7 @@ void opae_sim::avs_bus() {
assert(0 == vortex_afu_->mem_bank_select); assert(0 == vortex_afu_->mem_bank_select);
dram_rd_req_t dram_req; dram_rd_req_t dram_req;
dram_req.addr = vortex_afu_->avs_address; dram_req.addr = vortex_afu_->avs_address;
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.block.data()); ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.data.data());
dram_req.cycles_left = DRAM_LATENCY; dram_req.cycles_left = DRAM_LATENCY;
for (auto& rsp : dram_reads_) { for (auto& rsp : dram_reads_) {
if (dram_req.addr == rsp.addr) { if (dram_req.addr == rsp.addr) {

View file

@ -40,13 +40,13 @@ private:
typedef struct { typedef struct {
int cycles_left; int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> block; std::array<uint8_t, CACHE_BLOCK_SIZE> data;
uint32_t addr; uint32_t addr;
} dram_rd_req_t; } dram_rd_req_t;
typedef struct { typedef struct {
int cycles_left; int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> block; std::array<uint8_t, CACHE_BLOCK_SIZE> data;
uint64_t addr; uint64_t addr;
uint32_t mdata; uint32_t mdata;
} cci_rd_req_t; } cci_rd_req_t;

View file

@ -22,10 +22,6 @@
#include "vx_scope.h" #include "vx_scope.h"
#endif #endif
#define CACHE_BLOCK_SIZE 64
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
#define CHECK_RES(_expr) \ #define CHECK_RES(_expr) \
do { \ do { \
fpga_result res = _expr; \ fpga_result res = _expr; \
@ -104,7 +100,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
case VX_CAPS_MAX_THREADS: case VX_CAPS_MAX_THREADS:
*value = device->num_threads; *value = device->num_threads;
break; break;
case VX_CAPS_CACHE_LINESIZE: case VX_CAPS_CACHE_LINE_SIZE:
*value = CACHE_BLOCK_SIZE; *value = CACHE_BLOCK_SIZE;
break; break;
case VX_CAPS_LOCAL_MEM_SIZE: case VX_CAPS_LOCAL_MEM_SIZE:

View file

@ -11,10 +11,6 @@
#include <ram.h> #include <ram.h>
#include <simulator.h> #include <simulator.h>
#define CACHE_LINESIZE 64
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
inline size_t align_size(size_t size, size_t alignment) { inline size_t align_size(size_t size, size_t alignment) {
@ -31,7 +27,7 @@ public:
vx_buffer(size_t size, vx_device* device) vx_buffer(size_t size, vx_device* device)
: size_(size) : size_(size)
, device_(device) { , device_(device) {
auto aligned_asize = align_size(size, CACHE_LINESIZE); auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
data_ = malloc(aligned_asize); data_ = malloc(aligned_asize);
} }
@ -75,7 +71,7 @@ public:
int alloc_local_mem(size_t size, size_t* dev_maddr) { int alloc_local_mem(size_t size, size_t* dev_maddr) {
auto dev_mem_size = LOCAL_MEM_SIZE; auto dev_mem_size = LOCAL_MEM_SIZE;
size_t asize = align_size(size, CACHE_LINESIZE); size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (mem_allocation_ + asize > dev_mem_size) if (mem_allocation_ + asize > dev_mem_size)
return -1; return -1;
*dev_maddr = mem_allocation_; *dev_maddr = mem_allocation_;
@ -84,7 +80,7 @@ public:
} }
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) { int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
size_t asize = align_size(size, CACHE_LINESIZE); size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (dest_addr + asize > ram_.size()) if (dest_addr + asize > ram_.size())
return -1; return -1;
@ -98,7 +94,7 @@ public:
} }
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) { int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
size_t asize = align_size(size, CACHE_LINESIZE); size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (src_addr + asize > ram_.size()) if (src_addr + asize > ram_.size())
return -1; return -1;
@ -189,8 +185,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
case VX_CAPS_MAX_THREADS: case VX_CAPS_MAX_THREADS:
*value = NUM_THREADS; *value = NUM_THREADS;
break; break;
case VX_CAPS_CACHE_LINESIZE: case VX_CAPS_CACHE_LINE_SIZE:
*value = CACHE_LINESIZE; *value = CACHE_BLOCK_SIZE;
break; break;
case VX_CAPS_LOCAL_MEM_SIZE: case VX_CAPS_LOCAL_MEM_SIZE:
*value = 0xffffffff; *value = 0xffffffff;

View file

@ -11,10 +11,7 @@
#include <core.h> #include <core.h>
#include <VX_config.h> #include <VX_config.h>
#define CACHE_LINESIZE 64
#define PAGE_SIZE 4096 #define PAGE_SIZE 4096
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -32,7 +29,7 @@ public:
vx_buffer(size_t size, vx_device* device) vx_buffer(size_t size, vx_device* device)
: size_(size) : size_(size)
, device_(device) { , device_(device) {
auto aligned_asize = align_size(size, CACHE_LINESIZE); auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
data_ = malloc(aligned_asize); data_ = malloc(aligned_asize);
} }
@ -81,7 +78,7 @@ public:
int alloc_local_mem(size_t size, size_t* dev_maddr) { int alloc_local_mem(size_t size, size_t* dev_maddr) {
auto dev_mem_size = LOCAL_MEM_SIZE; auto dev_mem_size = LOCAL_MEM_SIZE;
auto asize = align_size(size, CACHE_LINESIZE); auto asize = align_size(size, CACHE_BLOCK_SIZE);
if (mem_allocation_ + asize > dev_mem_size) if (mem_allocation_ + asize > dev_mem_size)
return -1; return -1;
*dev_maddr = mem_allocation_; *dev_maddr = mem_allocation_;
@ -90,7 +87,7 @@ public:
} }
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) { int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
auto asize = align_size(size, CACHE_LINESIZE); auto asize = align_size(size, CACHE_BLOCK_SIZE);
if (dest_addr + asize > ram_.size()) if (dest_addr + asize > ram_.size())
return -1; return -1;
@ -104,7 +101,7 @@ public:
} }
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) { int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
size_t asize = align_size(size, CACHE_LINESIZE); size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (src_addr + asize > ram_.size()) if (src_addr + asize > ram_.size())
return -1; return -1;
@ -236,8 +233,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
case VX_CAPS_MAX_THREADS: case VX_CAPS_MAX_THREADS:
*value = NUM_THREADS; *value = NUM_THREADS;
break; break;
case VX_CAPS_CACHE_LINESIZE: case VX_CAPS_CACHE_LINE_SIZE:
*value = CACHE_LINESIZE; *value = CACHE_BLOCK_SIZE;
break; break;
case VX_CAPS_LOCAL_MEM_SIZE: case VX_CAPS_LOCAL_MEM_SIZE:
*value = LOCAL_MEM_SIZE; *value = LOCAL_MEM_SIZE;

View file

@ -32,7 +32,7 @@
`endif `endif
`ifndef SM_ENABLE `ifndef SM_ENABLE
`define SM_ENABLE 0 `define SM_ENABLE 1
`endif `endif
`ifndef GLOBAL_BLOCK_SIZE `ifndef GLOBAL_BLOCK_SIZE
@ -47,14 +47,14 @@
`define STARTUP_ADDR 32'h80000000 `define STARTUP_ADDR 32'h80000000
`endif `endif
`ifndef SHARED_MEM_BASE_ADDR
`define SHARED_MEM_BASE_ADDR 32'h6FFFF000
`endif
`ifndef IO_BUS_BASE_ADDR `ifndef IO_BUS_BASE_ADDR
`define IO_BUS_BASE_ADDR 32'hFF000000 `define IO_BUS_BASE_ADDR 32'hFF000000
`endif `endif
`ifndef SHARED_MEM_BASE_ADDR
`define SHARED_MEM_BASE_ADDR `IO_BUS_BASE_ADDR
`endif
`ifndef IO_BUS_ADDR_COUT `ifndef IO_BUS_ADDR_COUT
`define IO_BUS_ADDR_COUT 32'hFFFFFFFC `define IO_BUS_ADDR_COUT 32'hFFFFFFFC
`endif `endif
@ -139,16 +139,6 @@
`define CSR_FRM 12'h002 `define CSR_FRM 12'h002
`define CSR_FCSR 12'h003 `define CSR_FCSR 12'h003
// SIMT CSRs
`define CSR_LTID 12'h020
`define CSR_LWID 12'h021
`define CSR_GTID 12'h022
`define CSR_GWID 12'h023
`define CSR_GCID 12'h024
`define CSR_NT 12'h025
`define CSR_NW 12'h026
`define CSR_NC 12'h027
`define CSR_SATP 12'h180 `define CSR_SATP 12'h180
`define CSR_PMPCFG0 12'h3A0 `define CSR_PMPCFG0 12'h3A0
@ -236,6 +226,19 @@
`define CSR_MIMPID 12'hF13 `define CSR_MIMPID 12'hF13
`define CSR_MHARTID 12'hF14 `define CSR_MHARTID 12'hF14
// User SIMT CSRs
`define CSR_WTID 12'hCC0
`define CSR_LTID 12'hCC1
`define CSR_GTID 12'hCC2
`define CSR_LWID 12'hCC3
`define CSR_GWID `CSR_MHARTID
`define CSR_GCID 12'hCC5
// Machine SIMT CSRs
`define CSR_NT 12'hFC0
`define CSR_NW 12'hFC1
`define CSR_NC 12'hFC2
// Pipeline Queues //////////////////////////////////////////////////////////// // Pipeline Queues ////////////////////////////////////////////////////////////
// Size of instruction queue // Size of instruction queue
@ -324,9 +327,14 @@
// SM Configurable Knobs ////////////////////////////////////////////////////// // SM Configurable Knobs //////////////////////////////////////////////////////
// Size of cache block in bytes
`ifndef SM_BLOCK_SIZE
`define SM_BLOCK_SIZE 1024
`endif
// Size of cache in bytes // Size of cache in bytes
`ifndef SMEM_SIZE `ifndef SMEM_SIZE
`define SMEM_SIZE 8192 `define SMEM_SIZE (`NUM_WARPS * `NUM_THREADS * `SM_BLOCK_SIZE)
`endif `endif
// Number of banks // Number of banks

View file

@ -110,9 +110,10 @@ module VX_csr_data #(
`CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]); `CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]);
`CSR_LWID : read_data_r = 32'(read_wid); `CSR_LWID : read_data_r = 32'(read_wid);
`CSR_WTID ,
`CSR_LTID , `CSR_LTID ,
`CSR_GTID , `CSR_GTID ,
`CSR_MHARTID , /*`CSR_MHARTID ,*/
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid); `CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
`CSR_GCID : read_data_r = CORE_ID; `CSR_GCID : read_data_r = CORE_ID;
`CSR_NT : read_data_r = `NUM_THREADS; `CSR_NT : read_data_r = `NUM_THREADS;

View file

@ -120,8 +120,9 @@ module VX_csr_unit #(
); );
for (genvar i = 0; i < `NUM_THREADS; i++) begin for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_LTID) ? i : assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
(csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) : (csr_addr_s1 == `CSR_LTID
|| csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
csr_read_data_s1; csr_read_data_s1;
end end

View file

@ -319,7 +319,7 @@
`define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2) `define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
// Block size in bytes // Block size in bytes
`define SBANK_LINE_SIZE 4 `define SBANK_LINE_SIZE `SM_BLOCK_SIZE
// Word size in bytes // Word size in bytes
`define SWORD_SIZE 4 `define SWORD_SIZE 4

View file

@ -20,21 +20,24 @@ void vx_split(int predicate);
// Join // Join
void vx_join(); void vx_join();
// Return the warp's unique thread id // Return active warp's thread id
int vx_thread_id(); int vx_thread_id();
// Return the core's unique warp id // Return active core's local thread id
int vx_warp_id(); int vx_thread_lid();
// Return processor unique core id
int vx_core_id();
// Return processor global thread id // Return processor global thread id
int vx_thread_gid(); int vx_thread_gid();
// Return processsor global warp id // Return active core's local warp id
int vx_warp_id();
// Return processor's global warp id
int vx_warp_gid(); int vx_warp_gid();
// Return processor core id
int vx_core_id();
// Return the number of threads in a warp // Return the number of threads in a warp
int vx_num_threads(); int vx_num_threads();

View file

@ -9,7 +9,6 @@ OUTPUT_ARCH(riscv)
ENTRY(_start) ENTRY(_start)
SECTIONS SECTIONS
{ {
PROVIDE(__stack_top = 0x6ffff000);
. = 0x80000000; . = 0x80000000;
.interp : { *(.interp) } .interp : { *(.interp) }
.note.gnu.build-id : { *(.note.gnu.build-id) } .note.gnu.build-id : { *(.note.gnu.build-id) }
@ -191,10 +190,24 @@ SECTIONS
. = SEGMENT_START("ldata-segment", .); . = SEGMENT_START("ldata-segment", .);
. = ALIGN(32 / 8); . = ALIGN(32 / 8);
__BSS_END__ = .; __BSS_END__ = .;
__global_pointer$ = MIN(__SDATA_BEGIN__ + 0x800, __global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800)); MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
_end = .; PROVIDE (end = .); _end = .; PROVIDE (end = .);
. = DATA_SEGMENT_END (.); . = DATA_SEGMENT_END (.);
/* .stack_dummy section doesn't contain any symbols. It is only
* used by the linker to calculate the size of the stack sections, and to assign
* values to stack symbols later */
.stack_dummy (COPY):
{
KEEP(*(.stack*))
}
__stack_usage = SIZEOF(.stack_dummy);
PROVIDE(__stack_top = 0xFF000000);
PROVIDE(__stack_size = 0x400);
PROVIDE(__stack = __stack_top);
ASSERT(__stack_usage <= __stack_size, "stack overflow")
/* Stabs debugging sections. */ /* Stabs debugging sections. */
.stab 0 : { *(.stab) } .stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) } .stabstr 0 : { *(.stabstr) }

View file

@ -47,6 +47,12 @@ vx_warp_gid:
.type vx_thread_id, @function .type vx_thread_id, @function
.global vx_thread_id .global vx_thread_id
vx_thread_id: vx_thread_id:
csrr a0, CSR_WTID
ret
.type vx_thread_lid, @function
.global vx_thread_lid
vx_thread_lid:
csrr a0, CSR_LTID csrr a0, CSR_LTID
ret ret

View file

@ -6,7 +6,7 @@
extern "C" { extern "C" {
#endif #endif
#define NUM_CORES_MAX 8 #define NUM_CORES_MAX 16
typedef struct { typedef struct {
func_t function; func_t function;

View file

@ -57,18 +57,20 @@ vx_set_sp:
# set global pointer register # set global pointer register
.option push .option push
.option norelax .option norelax
la gp, __global_pointer$ la gp, __global_pointer
.option pop .option pop
# allocate stack region for a threads on the processor # allocate stack region for a threads on the processor
# set stack pointer # set stack pointer
csrr a1, CSR_GTID # get global thread id la sp, __stack_top # load stack base address
slli a1, a1, 10 # multiply by 1024 la a1, __stack_size # stack size
csrr a2, CSR_LTID # get local thread id #if SM_ENABLE
slli a2, a2, 2 # multiply by 4 csrr a2, CSR_LTID # get lobal thread id
la sp, __stack_top$ # load stack base address #else
csrr a2, CSR_GTID # get global thread id
#endif
mul a1, a1, a2
sub sp, sp, a1 # sub thread block sub sp, sp, a1 # sub thread block
add sp, sp, a2 # reduce addr collision for perf
# disable active warps except warp0 # disable active warps except warp0
csrr a3, CSR_LWID # get local wid csrr a3, CSR_LWID # get local wid

View file

@ -1,14 +1,13 @@
#include <vx_intrinsics.h>
// #include <utlist.h>
#include <string.h> #include <string.h>
#include <unistd.h> #include <unistd.h>
#include <stdint.h> #include <stdint.h>
#include <stdbool.h> #include <stdbool.h>
#include <stdlib.h> #include <stdlib.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <vx_spawn.h>
typedef struct typedef struct
{ {
unsigned * x; unsigned * x;

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -1,16 +1,5 @@
#include <vx_intrinsics.h> #include <vx_intrinsics.h>
#include <vx_print.h>
// #include <utlist.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
// Newlib
#include <stdio.h>
int main() int main()
{ {

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff