mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 13:27:29 -04:00
fixed loader script stack setup
This commit is contained in:
parent
e757a0e333
commit
e4a00dd0d9
43 changed files with 157370 additions and 118082 deletions
25475
benchmarks/opencl/convolution/convolution.dump
Normal file
25475
benchmarks/opencl/convolution/convolution.dump
Normal file
File diff suppressed because it is too large
Load diff
1786
benchmarks/opencl/guassian/Fan1.dump
Normal file
1786
benchmarks/opencl/guassian/Fan1.dump
Normal file
File diff suppressed because it is too large
Load diff
2000
benchmarks/opencl/guassian/Fan2.dump
Normal file
2000
benchmarks/opencl/guassian/Fan2.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/guassian/kernel.pocl
Normal file
BIN
benchmarks/opencl/guassian/kernel.pocl
Normal file
Binary file not shown.
2102
benchmarks/opencl/nearn/NearestNeighbor.dump
Normal file
2102
benchmarks/opencl/nearn/NearestNeighbor.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/nearn/kernel.pocl
Normal file
BIN
benchmarks/opencl/nearn/kernel.pocl
Normal file
Binary file not shown.
BIN
benchmarks/opencl/saxpy/kernel.pocl
Normal file
BIN
benchmarks/opencl/saxpy/kernel.pocl
Normal file
Binary file not shown.
1736
benchmarks/opencl/saxpy/saxpy.dump
Normal file
1736
benchmarks/opencl/saxpy/saxpy.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/sfilter/kernel.pocl
Normal file
BIN
benchmarks/opencl/sfilter/kernel.pocl
Normal file
Binary file not shown.
2092
benchmarks/opencl/sfilter/sfilter.dump
Normal file
2092
benchmarks/opencl/sfilter/sfilter.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/sgemm/kernel.pocl
Normal file
BIN
benchmarks/opencl/sgemm/kernel.pocl
Normal file
Binary file not shown.
1902
benchmarks/opencl/sgemm/sgemm.dump
Normal file
1902
benchmarks/opencl/sgemm/sgemm.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/vecadd/kernel.pocl
Normal file
BIN
benchmarks/opencl/vecadd/kernel.pocl
Normal file
Binary file not shown.
1747
benchmarks/opencl/vecadd/vecadd.dump
Normal file
1747
benchmarks/opencl/vecadd/vecadd.dump
Normal file
File diff suppressed because it is too large
Load diff
|
@ -17,11 +17,15 @@ typedef void* vx_buffer_h;
|
|||
#define VX_CAPS_MAX_CORES 0x1
|
||||
#define VX_CAPS_MAX_WARPS 0x2
|
||||
#define VX_CAPS_MAX_THREADS 0x3
|
||||
#define VX_CAPS_CACHE_LINESIZE 0x4
|
||||
#define VX_CAPS_CACHE_LINE_SIZE 0x4
|
||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x5
|
||||
#define VX_CAPS_ALLOC_BASE_ADDR 0x6
|
||||
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR 0x00000000
|
||||
#define LOCAL_MEM_SIZE 0xffffffff
|
||||
|
||||
// open the device and connect to it
|
||||
int vx_dev_open(vx_device_h* hdevice);
|
||||
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#include <fstream>
|
||||
#include <iomanip>
|
||||
|
||||
#define CCI_LATENCY 8
|
||||
#define CCI_LATENCY 8
|
||||
#define CCI_RAND_MOD 8
|
||||
#define CCI_RQ_SIZE 16
|
||||
#define CCI_WQ_SIZE 16
|
||||
|
@ -204,11 +204,11 @@ void opae_sim::sRxPort_bus() {
|
|||
if (!mmio_req_enabled
|
||||
&& (cci_rd_it != cci_reads_.end())) {
|
||||
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1;
|
||||
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->block.data(), CACHE_BLOCK_SIZE);
|
||||
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE);
|
||||
vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;
|
||||
/*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
|
||||
for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
|
||||
printf("%02x", cci_rd_it->block[CACHE_BLOCK_SIZE-1-i]);
|
||||
printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]);
|
||||
printf("\n");*/
|
||||
cci_reads_.erase(cci_rd_it);
|
||||
}
|
||||
|
@ -223,7 +223,7 @@ void opae_sim::sTxPort_bus() {
|
|||
cci_req.addr = vortex_afu_->af2cp_sTxPort_c0_hdr_address;
|
||||
cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata;
|
||||
auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
|
||||
memcpy(cci_req.block.data(), host_ptr, CACHE_BLOCK_SIZE);
|
||||
memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE);
|
||||
//printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
|
||||
cci_reads_.emplace_back(cci_req);
|
||||
}
|
||||
|
@ -262,7 +262,7 @@ void opae_sim::avs_bus() {
|
|||
vortex_afu_->avs_readdatavalid = 0;
|
||||
if (dram_rd_it != dram_reads_.end()) {
|
||||
vortex_afu_->avs_readdatavalid = 1;
|
||||
memcpy(vortex_afu_->avs_readdata, dram_rd_it->block.data(), CACHE_BLOCK_SIZE);
|
||||
memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), CACHE_BLOCK_SIZE);
|
||||
uint32_t addr = dram_rd_it->addr;
|
||||
dram_reads_.erase(dram_rd_it);
|
||||
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE);
|
||||
|
@ -304,7 +304,7 @@ void opae_sim::avs_bus() {
|
|||
assert(0 == vortex_afu_->mem_bank_select);
|
||||
dram_rd_req_t dram_req;
|
||||
dram_req.addr = vortex_afu_->avs_address;
|
||||
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.block.data());
|
||||
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.data.data());
|
||||
dram_req.cycles_left = DRAM_LATENCY;
|
||||
for (auto& rsp : dram_reads_) {
|
||||
if (dram_req.addr == rsp.addr) {
|
||||
|
|
|
@ -40,13 +40,13 @@ private:
|
|||
|
||||
typedef struct {
|
||||
int cycles_left;
|
||||
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
|
||||
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
|
||||
uint32_t addr;
|
||||
} dram_rd_req_t;
|
||||
|
||||
typedef struct {
|
||||
int cycles_left;
|
||||
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
|
||||
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
|
||||
uint64_t addr;
|
||||
uint32_t mdata;
|
||||
} cci_rd_req_t;
|
||||
|
|
|
@ -22,10 +22,6 @@
|
|||
#include "vx_scope.h"
|
||||
#endif
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR 0x10000000
|
||||
#define LOCAL_MEM_SIZE 0xffffffff
|
||||
|
||||
#define CHECK_RES(_expr) \
|
||||
do { \
|
||||
fpga_result res = _expr; \
|
||||
|
@ -104,7 +100,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
|||
case VX_CAPS_MAX_THREADS:
|
||||
*value = device->num_threads;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINESIZE:
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
|
|
|
@ -11,10 +11,6 @@
|
|||
#include <ram.h>
|
||||
#include <simulator.h>
|
||||
|
||||
#define CACHE_LINESIZE 64
|
||||
#define ALLOC_BASE_ADDR 0x10000000
|
||||
#define LOCAL_MEM_SIZE 0xffffffff
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
inline size_t align_size(size_t size, size_t alignment) {
|
||||
|
@ -31,7 +27,7 @@ public:
|
|||
vx_buffer(size_t size, vx_device* device)
|
||||
: size_(size)
|
||||
, device_(device) {
|
||||
auto aligned_asize = align_size(size, CACHE_LINESIZE);
|
||||
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
data_ = malloc(aligned_asize);
|
||||
}
|
||||
|
||||
|
@ -75,7 +71,7 @@ public:
|
|||
|
||||
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
||||
auto dev_mem_size = LOCAL_MEM_SIZE;
|
||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (mem_allocation_ + asize > dev_mem_size)
|
||||
return -1;
|
||||
*dev_maddr = mem_allocation_;
|
||||
|
@ -84,7 +80,7 @@ public:
|
|||
}
|
||||
|
||||
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > ram_.size())
|
||||
return -1;
|
||||
|
||||
|
@ -98,7 +94,7 @@ public:
|
|||
}
|
||||
|
||||
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > ram_.size())
|
||||
return -1;
|
||||
|
||||
|
@ -189,8 +185,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
|||
case VX_CAPS_MAX_THREADS:
|
||||
*value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINESIZE:
|
||||
*value = CACHE_LINESIZE;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
*value = 0xffffffff;
|
||||
|
|
|
@ -11,10 +11,7 @@
|
|||
#include <core.h>
|
||||
#include <VX_config.h>
|
||||
|
||||
#define CACHE_LINESIZE 64
|
||||
#define PAGE_SIZE 4096
|
||||
#define ALLOC_BASE_ADDR 0x10000000
|
||||
#define LOCAL_MEM_SIZE 0xffffffff
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -32,7 +29,7 @@ public:
|
|||
vx_buffer(size_t size, vx_device* device)
|
||||
: size_(size)
|
||||
, device_(device) {
|
||||
auto aligned_asize = align_size(size, CACHE_LINESIZE);
|
||||
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
data_ = malloc(aligned_asize);
|
||||
}
|
||||
|
||||
|
@ -81,7 +78,7 @@ public:
|
|||
|
||||
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
||||
auto dev_mem_size = LOCAL_MEM_SIZE;
|
||||
auto asize = align_size(size, CACHE_LINESIZE);
|
||||
auto asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (mem_allocation_ + asize > dev_mem_size)
|
||||
return -1;
|
||||
*dev_maddr = mem_allocation_;
|
||||
|
@ -90,7 +87,7 @@ public:
|
|||
}
|
||||
|
||||
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
||||
auto asize = align_size(size, CACHE_LINESIZE);
|
||||
auto asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > ram_.size())
|
||||
return -1;
|
||||
|
||||
|
@ -104,7 +101,7 @@ public:
|
|||
}
|
||||
|
||||
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > ram_.size())
|
||||
return -1;
|
||||
|
||||
|
@ -236,8 +233,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
|||
case VX_CAPS_MAX_THREADS:
|
||||
*value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINESIZE:
|
||||
*value = CACHE_LINESIZE;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
*value = LOCAL_MEM_SIZE;
|
||||
|
|
|
@ -32,7 +32,7 @@
|
|||
`endif
|
||||
|
||||
`ifndef SM_ENABLE
|
||||
`define SM_ENABLE 0
|
||||
`define SM_ENABLE 1
|
||||
`endif
|
||||
|
||||
`ifndef GLOBAL_BLOCK_SIZE
|
||||
|
@ -47,14 +47,14 @@
|
|||
`define STARTUP_ADDR 32'h80000000
|
||||
`endif
|
||||
|
||||
`ifndef SHARED_MEM_BASE_ADDR
|
||||
`define SHARED_MEM_BASE_ADDR 32'h6FFFF000
|
||||
`endif
|
||||
|
||||
`ifndef IO_BUS_BASE_ADDR
|
||||
`define IO_BUS_BASE_ADDR 32'hFF000000
|
||||
`endif
|
||||
|
||||
`ifndef SHARED_MEM_BASE_ADDR
|
||||
`define SHARED_MEM_BASE_ADDR `IO_BUS_BASE_ADDR
|
||||
`endif
|
||||
|
||||
`ifndef IO_BUS_ADDR_COUT
|
||||
`define IO_BUS_ADDR_COUT 32'hFFFFFFFC
|
||||
`endif
|
||||
|
@ -139,16 +139,6 @@
|
|||
`define CSR_FRM 12'h002
|
||||
`define CSR_FCSR 12'h003
|
||||
|
||||
// SIMT CSRs
|
||||
`define CSR_LTID 12'h020
|
||||
`define CSR_LWID 12'h021
|
||||
`define CSR_GTID 12'h022
|
||||
`define CSR_GWID 12'h023
|
||||
`define CSR_GCID 12'h024
|
||||
`define CSR_NT 12'h025
|
||||
`define CSR_NW 12'h026
|
||||
`define CSR_NC 12'h027
|
||||
|
||||
`define CSR_SATP 12'h180
|
||||
|
||||
`define CSR_PMPCFG0 12'h3A0
|
||||
|
@ -236,6 +226,19 @@
|
|||
`define CSR_MIMPID 12'hF13
|
||||
`define CSR_MHARTID 12'hF14
|
||||
|
||||
// User SIMT CSRs
|
||||
`define CSR_WTID 12'hCC0
|
||||
`define CSR_LTID 12'hCC1
|
||||
`define CSR_GTID 12'hCC2
|
||||
`define CSR_LWID 12'hCC3
|
||||
`define CSR_GWID `CSR_MHARTID
|
||||
`define CSR_GCID 12'hCC5
|
||||
|
||||
// Machine SIMT CSRs
|
||||
`define CSR_NT 12'hFC0
|
||||
`define CSR_NW 12'hFC1
|
||||
`define CSR_NC 12'hFC2
|
||||
|
||||
// Pipeline Queues ////////////////////////////////////////////////////////////
|
||||
|
||||
// Size of instruction queue
|
||||
|
@ -324,9 +327,14 @@
|
|||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
|
||||
// Size of cache block in bytes
|
||||
`ifndef SM_BLOCK_SIZE
|
||||
`define SM_BLOCK_SIZE 1024
|
||||
`endif
|
||||
|
||||
// Size of cache in bytes
|
||||
`ifndef SMEM_SIZE
|
||||
`define SMEM_SIZE 8192
|
||||
`define SMEM_SIZE (`NUM_WARPS * `NUM_THREADS * `SM_BLOCK_SIZE)
|
||||
`endif
|
||||
|
||||
// Number of banks
|
||||
|
|
|
@ -110,9 +110,10 @@ module VX_csr_data #(
|
|||
`CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]);
|
||||
|
||||
`CSR_LWID : read_data_r = 32'(read_wid);
|
||||
`CSR_WTID ,
|
||||
`CSR_LTID ,
|
||||
`CSR_GTID ,
|
||||
`CSR_MHARTID ,
|
||||
/*`CSR_MHARTID ,*/
|
||||
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
|
||||
`CSR_GCID : read_data_r = CORE_ID;
|
||||
`CSR_NT : read_data_r = `NUM_THREADS;
|
||||
|
|
|
@ -120,8 +120,9 @@ module VX_csr_unit #(
|
|||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_LTID) ? i :
|
||||
(csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
|
||||
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
|
||||
(csr_addr_s1 == `CSR_LTID
|
||||
|| csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
|
||||
csr_read_data_s1;
|
||||
end
|
||||
|
||||
|
|
|
@ -319,7 +319,7 @@
|
|||
`define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
|
||||
|
||||
// Block size in bytes
|
||||
`define SBANK_LINE_SIZE 4
|
||||
`define SBANK_LINE_SIZE `SM_BLOCK_SIZE
|
||||
|
||||
// Word size in bytes
|
||||
`define SWORD_SIZE 4
|
||||
|
|
|
@ -20,21 +20,24 @@ void vx_split(int predicate);
|
|||
// Join
|
||||
void vx_join();
|
||||
|
||||
// Return the warp's unique thread id
|
||||
// Return active warp's thread id
|
||||
int vx_thread_id();
|
||||
|
||||
// Return the core's unique warp id
|
||||
int vx_warp_id();
|
||||
|
||||
// Return processor unique core id
|
||||
int vx_core_id();
|
||||
// Return active core's local thread id
|
||||
int vx_thread_lid();
|
||||
|
||||
// Return processor global thread id
|
||||
int vx_thread_gid();
|
||||
|
||||
// Return processor global warp id
|
||||
// Return active core's local warp id
|
||||
int vx_warp_id();
|
||||
|
||||
// Return processor's global warp id
|
||||
int vx_warp_gid();
|
||||
|
||||
// Return processor core id
|
||||
int vx_core_id();
|
||||
|
||||
// Return the number of threads in a warp
|
||||
int vx_num_threads();
|
||||
|
||||
|
|
|
@ -9,7 +9,6 @@ OUTPUT_ARCH(riscv)
|
|||
ENTRY(_start)
|
||||
SECTIONS
|
||||
{
|
||||
PROVIDE(__stack_top = 0x6ffff000);
|
||||
. = 0x80000000;
|
||||
.interp : { *(.interp) }
|
||||
.note.gnu.build-id : { *(.note.gnu.build-id) }
|
||||
|
@ -191,10 +190,24 @@ SECTIONS
|
|||
. = SEGMENT_START("ldata-segment", .);
|
||||
. = ALIGN(32 / 8);
|
||||
__BSS_END__ = .;
|
||||
__global_pointer$ = MIN(__SDATA_BEGIN__ + 0x800,
|
||||
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
|
||||
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
|
||||
_end = .; PROVIDE (end = .);
|
||||
. = DATA_SEGMENT_END (.);
|
||||
. = DATA_SEGMENT_END (.);
|
||||
|
||||
/* .stack_dummy section doesn't contain any symbols. It is only
|
||||
* used for linker to calculate size of stack sections, and assign
|
||||
* values to stack symbols later */
|
||||
.stack_dummy (COPY):
|
||||
{
|
||||
KEEP(*(.stack*))
|
||||
}
|
||||
__stack_usage = SIZEOF(.stack_dummy);
|
||||
PROVIDE(__stack_top = 0xFF000000);
|
||||
PROVIDE(__stack_size = 0x400);
|
||||
PROVIDE(__stack = __stack_top);
|
||||
ASSERT(__stack_usage <= __stack_size, "stack overflow")
|
||||
|
||||
/* Stabs debugging sections. */
|
||||
.stab 0 : { *(.stab) }
|
||||
.stabstr 0 : { *(.stabstr) }
|
||||
|
|
|
@ -47,6 +47,12 @@ vx_warp_gid:
|
|||
.type vx_thread_id, @function
|
||||
.global vx_thread_id
|
||||
vx_thread_id:
|
||||
csrr a0, CSR_WTID
|
||||
ret
|
||||
|
||||
.type vx_thread_lid, @function
|
||||
.global vx_thread_lid
|
||||
vx_thread_lid:
|
||||
csrr a0, CSR_LTID
|
||||
ret
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define NUM_CORES_MAX 8
|
||||
#define NUM_CORES_MAX 16
|
||||
|
||||
typedef struct {
|
||||
func_t function;
|
||||
|
|
|
@ -57,18 +57,20 @@ vx_set_sp:
|
|||
# set global pointer register
|
||||
.option push
|
||||
.option norelax
|
||||
la gp, __global_pointer$
|
||||
la gp, __global_pointer
|
||||
.option pop
|
||||
|
||||
# allocate stack region for a threads on the processor
|
||||
# set stack pointer
|
||||
csrr a1, CSR_GTID # get global thread id
|
||||
slli a1, a1, 10 # multiply by 1024
|
||||
csrr a2, CSR_LTID # get local thread id
|
||||
slli a2, a2, 2 # multiply by 4
|
||||
la sp, __stack_top$ # load stack base address
|
||||
la sp, __stack_top # load stack base address
|
||||
la a1, __stack_size # stack size
|
||||
#if SM_ENABLE
|
||||
csrr a2, CSR_LTID # get local thread id
|
||||
#else
|
||||
csrr a2, CSR_GTID # get global thread id
|
||||
#endif
|
||||
mul a1, a1, a2
|
||||
sub sp, sp, a1 # sub thread block
|
||||
add sp, sp, a2 # reduce addr collision for perf
|
||||
|
||||
# disable active warps except warp0
|
||||
csrr a3, CSR_LWID # get local wid
|
||||
|
|
|
@ -1,14 +1,13 @@
|
|||
|
||||
#include <vx_intrinsics.h>
|
||||
|
||||
|
||||
// #include <utlist.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_print.h>
|
||||
#include <vx_spawn.h>
|
||||
|
||||
typedef struct
|
||||
{
|
||||
unsigned * x;
|
||||
|
|
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
@ -1,16 +1,5 @@
|
|||
|
||||
#include <vx_intrinsics.h>
|
||||
|
||||
// #include <utlist.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
|
||||
// Newlib
|
||||
#include <stdio.h>
|
||||
#include <vx_print.h>
|
||||
|
||||
int main()
|
||||
{
|
||||
|
|
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue