fixed loader script stack setup

This commit is contained in:
Blaise Tine 2020-12-31 22:37:20 -05:00
parent e757a0e333
commit e4a00dd0d9
43 changed files with 157370 additions and 118082 deletions

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -17,11 +17,15 @@ typedef void* vx_buffer_h;
#define VX_CAPS_MAX_CORES 0x1
#define VX_CAPS_MAX_WARPS 0x2
#define VX_CAPS_MAX_THREADS 0x3
#define VX_CAPS_CACHE_LINESIZE 0x4
#define VX_CAPS_CACHE_LINE_SIZE 0x4
#define VX_CAPS_LOCAL_MEM_SIZE 0x5
#define VX_CAPS_ALLOC_BASE_ADDR 0x6
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
#define CACHE_BLOCK_SIZE 64
#define ALLOC_BASE_ADDR 0x00000000
#define LOCAL_MEM_SIZE 0xffffffff
// open the device and connect to it
int vx_dev_open(vx_device_h* hdevice);

View file

@ -3,7 +3,7 @@
#include <fstream>
#include <iomanip>
#define CCI_LATENCY 8
#define CCI_LATENCY 8
#define CCI_RAND_MOD 8
#define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16
@ -204,11 +204,11 @@ void opae_sim::sRxPort_bus() {
if (!mmio_req_enabled
&& (cci_rd_it != cci_reads_.end())) {
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1;
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->block.data(), CACHE_BLOCK_SIZE);
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE);
vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;
/*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
printf("%02x", cci_rd_it->block[CACHE_BLOCK_SIZE-1-i]);
printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]);
printf("\n");*/
cci_reads_.erase(cci_rd_it);
}
@ -223,7 +223,7 @@ void opae_sim::sTxPort_bus() {
cci_req.addr = vortex_afu_->af2cp_sTxPort_c0_hdr_address;
cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata;
auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
memcpy(cci_req.block.data(), host_ptr, CACHE_BLOCK_SIZE);
memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE);
//printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
cci_reads_.emplace_back(cci_req);
}
@ -262,7 +262,7 @@ void opae_sim::avs_bus() {
vortex_afu_->avs_readdatavalid = 0;
if (dram_rd_it != dram_reads_.end()) {
vortex_afu_->avs_readdatavalid = 1;
memcpy(vortex_afu_->avs_readdata, dram_rd_it->block.data(), CACHE_BLOCK_SIZE);
memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), CACHE_BLOCK_SIZE);
uint32_t addr = dram_rd_it->addr;
dram_reads_.erase(dram_rd_it);
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE);
@ -304,7 +304,7 @@ void opae_sim::avs_bus() {
assert(0 == vortex_afu_->mem_bank_select);
dram_rd_req_t dram_req;
dram_req.addr = vortex_afu_->avs_address;
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.block.data());
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.data.data());
dram_req.cycles_left = DRAM_LATENCY;
for (auto& rsp : dram_reads_) {
if (dram_req.addr == rsp.addr) {

View file

@ -40,13 +40,13 @@ private:
typedef struct {
int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
uint32_t addr;
} dram_rd_req_t;
typedef struct {
int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
uint64_t addr;
uint32_t mdata;
} cci_rd_req_t;

View file

@ -22,10 +22,6 @@
#include "vx_scope.h"
#endif
#define CACHE_BLOCK_SIZE 64
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
#define CHECK_RES(_expr) \
do { \
fpga_result res = _expr; \
@ -104,7 +100,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
case VX_CAPS_MAX_THREADS:
*value = device->num_threads;
break;
case VX_CAPS_CACHE_LINESIZE:
case VX_CAPS_CACHE_LINE_SIZE:
*value = CACHE_BLOCK_SIZE;
break;
case VX_CAPS_LOCAL_MEM_SIZE:

View file

@ -11,10 +11,6 @@
#include <ram.h>
#include <simulator.h>
#define CACHE_LINESIZE 64
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
///////////////////////////////////////////////////////////////////////////////
inline size_t align_size(size_t size, size_t alignment) {
@ -31,7 +27,7 @@ public:
vx_buffer(size_t size, vx_device* device)
: size_(size)
, device_(device) {
auto aligned_asize = align_size(size, CACHE_LINESIZE);
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
data_ = malloc(aligned_asize);
}
@ -75,7 +71,7 @@ public:
int alloc_local_mem(size_t size, size_t* dev_maddr) {
auto dev_mem_size = LOCAL_MEM_SIZE;
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (mem_allocation_ + asize > dev_mem_size)
return -1;
*dev_maddr = mem_allocation_;
@ -84,7 +80,7 @@ public:
}
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (dest_addr + asize > ram_.size())
return -1;
@ -98,7 +94,7 @@ public:
}
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (src_addr + asize > ram_.size())
return -1;
@ -189,8 +185,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
case VX_CAPS_MAX_THREADS:
*value = NUM_THREADS;
break;
case VX_CAPS_CACHE_LINESIZE:
*value = CACHE_LINESIZE;
case VX_CAPS_CACHE_LINE_SIZE:
*value = CACHE_BLOCK_SIZE;
break;
case VX_CAPS_LOCAL_MEM_SIZE:
*value = 0xffffffff;

View file

@ -11,10 +11,7 @@
#include <core.h>
#include <VX_config.h>
#define CACHE_LINESIZE 64
#define PAGE_SIZE 4096
#define ALLOC_BASE_ADDR 0x10000000
#define LOCAL_MEM_SIZE 0xffffffff
///////////////////////////////////////////////////////////////////////////////
@ -32,7 +29,7 @@ public:
vx_buffer(size_t size, vx_device* device)
: size_(size)
, device_(device) {
auto aligned_asize = align_size(size, CACHE_LINESIZE);
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
data_ = malloc(aligned_asize);
}
@ -81,7 +78,7 @@ public:
int alloc_local_mem(size_t size, size_t* dev_maddr) {
auto dev_mem_size = LOCAL_MEM_SIZE;
auto asize = align_size(size, CACHE_LINESIZE);
auto asize = align_size(size, CACHE_BLOCK_SIZE);
if (mem_allocation_ + asize > dev_mem_size)
return -1;
*dev_maddr = mem_allocation_;
@ -90,7 +87,7 @@ public:
}
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
auto asize = align_size(size, CACHE_LINESIZE);
auto asize = align_size(size, CACHE_BLOCK_SIZE);
if (dest_addr + asize > ram_.size())
return -1;
@ -104,7 +101,7 @@ public:
}
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
size_t asize = align_size(size, CACHE_LINESIZE);
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
if (src_addr + asize > ram_.size())
return -1;
@ -236,8 +233,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
case VX_CAPS_MAX_THREADS:
*value = NUM_THREADS;
break;
case VX_CAPS_CACHE_LINESIZE:
*value = CACHE_LINESIZE;
case VX_CAPS_CACHE_LINE_SIZE:
*value = CACHE_BLOCK_SIZE;
break;
case VX_CAPS_LOCAL_MEM_SIZE:
*value = LOCAL_MEM_SIZE;

View file

@ -32,7 +32,7 @@
`endif
`ifndef SM_ENABLE
`define SM_ENABLE 0
`define SM_ENABLE 1
`endif
`ifndef GLOBAL_BLOCK_SIZE
@ -47,14 +47,14 @@
`define STARTUP_ADDR 32'h80000000
`endif
`ifndef SHARED_MEM_BASE_ADDR
`define SHARED_MEM_BASE_ADDR 32'h6FFFF000
`endif
`ifndef IO_BUS_BASE_ADDR
`define IO_BUS_BASE_ADDR 32'hFF000000
`endif
`ifndef SHARED_MEM_BASE_ADDR
`define SHARED_MEM_BASE_ADDR `IO_BUS_BASE_ADDR
`endif
`ifndef IO_BUS_ADDR_COUT
`define IO_BUS_ADDR_COUT 32'hFFFFFFFC
`endif
@ -139,16 +139,6 @@
`define CSR_FRM 12'h002
`define CSR_FCSR 12'h003
// SIMT CSRs
`define CSR_LTID 12'h020
`define CSR_LWID 12'h021
`define CSR_GTID 12'h022
`define CSR_GWID 12'h023
`define CSR_GCID 12'h024
`define CSR_NT 12'h025
`define CSR_NW 12'h026
`define CSR_NC 12'h027
`define CSR_SATP 12'h180
`define CSR_PMPCFG0 12'h3A0
@ -236,6 +226,19 @@
`define CSR_MIMPID 12'hF13
`define CSR_MHARTID 12'hF14
// User SIMT CSRs
`define CSR_WTID 12'hCC0
`define CSR_LTID 12'hCC1
`define CSR_GTID 12'hCC2
`define CSR_LWID 12'hCC3
`define CSR_GWID `CSR_MHARTID
`define CSR_GCID 12'hCC5
// Machine SIMT CSRs
`define CSR_NT 12'hFC0
`define CSR_NW 12'hFC1
`define CSR_NC 12'hFC2
// Pipeline Queues ////////////////////////////////////////////////////////////
// Size of instruction queue
@ -324,9 +327,14 @@
// SM Configurable Knobs //////////////////////////////////////////////////////
// Size of cache block in bytes
`ifndef SM_BLOCK_SIZE
`define SM_BLOCK_SIZE 1024
`endif
// Size of cache in bytes
`ifndef SMEM_SIZE
`define SMEM_SIZE 8192
`define SMEM_SIZE (`NUM_WARPS * `NUM_THREADS * `SM_BLOCK_SIZE)
`endif
// Number of banks

View file

@ -110,9 +110,10 @@ module VX_csr_data #(
`CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]);
`CSR_LWID : read_data_r = 32'(read_wid);
`CSR_WTID ,
`CSR_LTID ,
`CSR_GTID ,
`CSR_MHARTID ,
/*`CSR_MHARTID ,*/
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
`CSR_GCID : read_data_r = CORE_ID;
`CSR_NT : read_data_r = `NUM_THREADS;

View file

@ -120,8 +120,9 @@ module VX_csr_unit #(
);
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_LTID) ? i :
(csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
(csr_addr_s1 == `CSR_LTID
|| csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
csr_read_data_s1;
end

View file

@ -319,7 +319,7 @@
`define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
// Block size in bytes
`define SBANK_LINE_SIZE 4
`define SBANK_LINE_SIZE `SM_BLOCK_SIZE
// Word size in bytes
`define SWORD_SIZE 4

View file

@ -20,21 +20,24 @@ void vx_split(int predicate);
// Join
void vx_join();
// Return the warp's unique thread id
// Return active warp's thread id
int vx_thread_id();
// Return the core's unique warp id
int vx_warp_id();
// Return processsor unique core id
int vx_core_id();
// Return active core's local thread id
int vx_thread_lid();
// Return processsor global thread id
int vx_thread_gid();
// Return processsor global warp id
// Return active core's local warp id
int vx_warp_id();
// Return processsor's global warp id
int vx_warp_gid();
// Return processsor core id
int vx_core_id();
// Return the number of threads in a warp
int vx_num_threads();

View file

@ -9,7 +9,6 @@ OUTPUT_ARCH(riscv)
ENTRY(_start)
SECTIONS
{
PROVIDE(__stack_top = 0x6ffff000);
. = 0x80000000;
.interp : { *(.interp) }
.note.gnu.build-id : { *(.note.gnu.build-id) }
@ -191,10 +190,24 @@ SECTIONS
. = SEGMENT_START("ldata-segment", .);
. = ALIGN(32 / 8);
__BSS_END__ = .;
__global_pointer$ = MIN(__SDATA_BEGIN__ + 0x800,
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
_end = .; PROVIDE (end = .);
. = DATA_SEGMENT_END (.);
. = DATA_SEGMENT_END (.);
/* .stack_dummy section doesn't contains any symbols. It is only
* used for linker to calculate size of stack sections, and assign
* values to stack symbols later */
.stack_dummy (COPY):
{
KEEP(*(.stack*))
}
__stack_usage = SIZEOF(.stack_dummy);
PROVIDE(__stack_top = 0xFF000000);
PROVIDE(__stack_size = 0x400);
PROVIDE(__stack = __stack_top);
ASSERT(__stack_usage <= __stack_size, "stack overflow")
/* Stabs debugging sections. */
.stab 0 : { *(.stab) }
.stabstr 0 : { *(.stabstr) }

View file

@ -47,6 +47,12 @@ vx_warp_gid:
.type vx_thread_id, @function
.global vx_thread_id
vx_thread_id:
csrr a0, CSR_WTID
ret
.type vx_thread_lid, @function
.global vx_thread_lid
vx_thread_lid:
csrr a0, CSR_LTID
ret

View file

@ -6,7 +6,7 @@
extern "C" {
#endif
#define NUM_CORES_MAX 8
#define NUM_CORES_MAX 16
typedef struct {
func_t function;

View file

@ -57,18 +57,20 @@ vx_set_sp:
# set global pointer register
.option push
.option norelax
la gp, __global_pointer$
la gp, __global_pointer
.option pop
# allocate stack region for a threads on the processor
# set stack pointer
csrr a1, CSR_GTID # get global thread id
slli a1, a1, 10 # multiply by 1024
csrr a2, CSR_LTID # get local thread id
slli a2, a2, 2 # multiply by 4
la sp, __stack_top$ # load stack base address
la sp, __stack_top # load stack base address
la a1, __stack_size # stack size
#if SM_ENABLE
csrr a2, CSR_LTID # get lobal thread id
#else
csrr a2, CSR_GTID # get global thread id
#endif
mul a1, a1, a2
sub sp, sp, a1 # sub thread block
add sp, sp, a2 # reduce addr collision for perf
# disable active warps except warp0
csrr a3, CSR_LWID # get local wid

View file

@ -1,14 +1,13 @@
#include <vx_intrinsics.h>
// #include <utlist.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <vx_spawn.h>
typedef struct
{
unsigned * x;

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -1,16 +1,5 @@
#include <vx_intrinsics.h>
// #include <utlist.h>
#include <string.h>
#include <unistd.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
// Newlib
#include <stdio.h>
#include <vx_print.h>
int main()
{

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

Binary file not shown.

File diff suppressed because it is too large Load diff