mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-25 06:17:38 -04:00
fixed loader script stack setup
This commit is contained in:
parent
e757a0e333
commit
e4a00dd0d9
43 changed files with 157370 additions and 118082 deletions
25475
benchmarks/opencl/convolution/convolution.dump
Normal file
25475
benchmarks/opencl/convolution/convolution.dump
Normal file
File diff suppressed because it is too large
Load diff
1786
benchmarks/opencl/guassian/Fan1.dump
Normal file
1786
benchmarks/opencl/guassian/Fan1.dump
Normal file
File diff suppressed because it is too large
Load diff
2000
benchmarks/opencl/guassian/Fan2.dump
Normal file
2000
benchmarks/opencl/guassian/Fan2.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/guassian/kernel.pocl
Normal file
BIN
benchmarks/opencl/guassian/kernel.pocl
Normal file
Binary file not shown.
2102
benchmarks/opencl/nearn/NearestNeighbor.dump
Normal file
2102
benchmarks/opencl/nearn/NearestNeighbor.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/nearn/kernel.pocl
Normal file
BIN
benchmarks/opencl/nearn/kernel.pocl
Normal file
Binary file not shown.
BIN
benchmarks/opencl/saxpy/kernel.pocl
Normal file
BIN
benchmarks/opencl/saxpy/kernel.pocl
Normal file
Binary file not shown.
1736
benchmarks/opencl/saxpy/saxpy.dump
Normal file
1736
benchmarks/opencl/saxpy/saxpy.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/sfilter/kernel.pocl
Normal file
BIN
benchmarks/opencl/sfilter/kernel.pocl
Normal file
Binary file not shown.
2092
benchmarks/opencl/sfilter/sfilter.dump
Normal file
2092
benchmarks/opencl/sfilter/sfilter.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/sgemm/kernel.pocl
Normal file
BIN
benchmarks/opencl/sgemm/kernel.pocl
Normal file
Binary file not shown.
1902
benchmarks/opencl/sgemm/sgemm.dump
Normal file
1902
benchmarks/opencl/sgemm/sgemm.dump
Normal file
File diff suppressed because it is too large
Load diff
BIN
benchmarks/opencl/vecadd/kernel.pocl
Normal file
BIN
benchmarks/opencl/vecadd/kernel.pocl
Normal file
Binary file not shown.
1747
benchmarks/opencl/vecadd/vecadd.dump
Normal file
1747
benchmarks/opencl/vecadd/vecadd.dump
Normal file
File diff suppressed because it is too large
Load diff
|
@ -17,11 +17,15 @@ typedef void* vx_buffer_h;
|
||||||
#define VX_CAPS_MAX_CORES 0x1
|
#define VX_CAPS_MAX_CORES 0x1
|
||||||
#define VX_CAPS_MAX_WARPS 0x2
|
#define VX_CAPS_MAX_WARPS 0x2
|
||||||
#define VX_CAPS_MAX_THREADS 0x3
|
#define VX_CAPS_MAX_THREADS 0x3
|
||||||
#define VX_CAPS_CACHE_LINESIZE 0x4
|
#define VX_CAPS_CACHE_LINE_SIZE 0x4
|
||||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x5
|
#define VX_CAPS_LOCAL_MEM_SIZE 0x5
|
||||||
#define VX_CAPS_ALLOC_BASE_ADDR 0x6
|
#define VX_CAPS_ALLOC_BASE_ADDR 0x6
|
||||||
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
|
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
|
||||||
|
|
||||||
|
#define CACHE_BLOCK_SIZE 64
|
||||||
|
#define ALLOC_BASE_ADDR 0x00000000
|
||||||
|
#define LOCAL_MEM_SIZE 0xffffffff
|
||||||
|
|
||||||
// open the device and connect to it
|
// open the device and connect to it
|
||||||
int vx_dev_open(vx_device_h* hdevice);
|
int vx_dev_open(vx_device_h* hdevice);
|
||||||
|
|
||||||
|
|
|
@ -204,11 +204,11 @@ void opae_sim::sRxPort_bus() {
|
||||||
if (!mmio_req_enabled
|
if (!mmio_req_enabled
|
||||||
&& (cci_rd_it != cci_reads_.end())) {
|
&& (cci_rd_it != cci_reads_.end())) {
|
||||||
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1;
|
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 1;
|
||||||
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->block.data(), CACHE_BLOCK_SIZE);
|
memcpy(vortex_afu_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE);
|
||||||
vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;
|
vortex_afu_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;
|
||||||
/*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
|
/*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
|
||||||
for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
|
for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
|
||||||
printf("%02x", cci_rd_it->block[CACHE_BLOCK_SIZE-1-i]);
|
printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]);
|
||||||
printf("\n");*/
|
printf("\n");*/
|
||||||
cci_reads_.erase(cci_rd_it);
|
cci_reads_.erase(cci_rd_it);
|
||||||
}
|
}
|
||||||
|
@ -223,7 +223,7 @@ void opae_sim::sTxPort_bus() {
|
||||||
cci_req.addr = vortex_afu_->af2cp_sTxPort_c0_hdr_address;
|
cci_req.addr = vortex_afu_->af2cp_sTxPort_c0_hdr_address;
|
||||||
cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata;
|
cci_req.mdata = vortex_afu_->af2cp_sTxPort_c0_hdr_mdata;
|
||||||
auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
|
auto host_ptr = (uint64_t*)(vortex_afu_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
|
||||||
memcpy(cci_req.block.data(), host_ptr, CACHE_BLOCK_SIZE);
|
memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE);
|
||||||
//printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
|
//printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, vortex_afu_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
|
||||||
cci_reads_.emplace_back(cci_req);
|
cci_reads_.emplace_back(cci_req);
|
||||||
}
|
}
|
||||||
|
@ -262,7 +262,7 @@ void opae_sim::avs_bus() {
|
||||||
vortex_afu_->avs_readdatavalid = 0;
|
vortex_afu_->avs_readdatavalid = 0;
|
||||||
if (dram_rd_it != dram_reads_.end()) {
|
if (dram_rd_it != dram_reads_.end()) {
|
||||||
vortex_afu_->avs_readdatavalid = 1;
|
vortex_afu_->avs_readdatavalid = 1;
|
||||||
memcpy(vortex_afu_->avs_readdata, dram_rd_it->block.data(), CACHE_BLOCK_SIZE);
|
memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), CACHE_BLOCK_SIZE);
|
||||||
uint32_t addr = dram_rd_it->addr;
|
uint32_t addr = dram_rd_it->addr;
|
||||||
dram_reads_.erase(dram_rd_it);
|
dram_reads_.erase(dram_rd_it);
|
||||||
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE);
|
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE);
|
||||||
|
@ -304,7 +304,7 @@ void opae_sim::avs_bus() {
|
||||||
assert(0 == vortex_afu_->mem_bank_select);
|
assert(0 == vortex_afu_->mem_bank_select);
|
||||||
dram_rd_req_t dram_req;
|
dram_rd_req_t dram_req;
|
||||||
dram_req.addr = vortex_afu_->avs_address;
|
dram_req.addr = vortex_afu_->avs_address;
|
||||||
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.block.data());
|
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.data.data());
|
||||||
dram_req.cycles_left = DRAM_LATENCY;
|
dram_req.cycles_left = DRAM_LATENCY;
|
||||||
for (auto& rsp : dram_reads_) {
|
for (auto& rsp : dram_reads_) {
|
||||||
if (dram_req.addr == rsp.addr) {
|
if (dram_req.addr == rsp.addr) {
|
||||||
|
|
|
@ -40,13 +40,13 @@ private:
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int cycles_left;
|
int cycles_left;
|
||||||
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
|
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
|
||||||
uint32_t addr;
|
uint32_t addr;
|
||||||
} dram_rd_req_t;
|
} dram_rd_req_t;
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
int cycles_left;
|
int cycles_left;
|
||||||
std::array<uint8_t, CACHE_BLOCK_SIZE> block;
|
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
|
||||||
uint64_t addr;
|
uint64_t addr;
|
||||||
uint32_t mdata;
|
uint32_t mdata;
|
||||||
} cci_rd_req_t;
|
} cci_rd_req_t;
|
||||||
|
|
|
@ -22,10 +22,6 @@
|
||||||
#include "vx_scope.h"
|
#include "vx_scope.h"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define CACHE_BLOCK_SIZE 64
|
|
||||||
#define ALLOC_BASE_ADDR 0x10000000
|
|
||||||
#define LOCAL_MEM_SIZE 0xffffffff
|
|
||||||
|
|
||||||
#define CHECK_RES(_expr) \
|
#define CHECK_RES(_expr) \
|
||||||
do { \
|
do { \
|
||||||
fpga_result res = _expr; \
|
fpga_result res = _expr; \
|
||||||
|
@ -104,7 +100,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
||||||
case VX_CAPS_MAX_THREADS:
|
case VX_CAPS_MAX_THREADS:
|
||||||
*value = device->num_threads;
|
*value = device->num_threads;
|
||||||
break;
|
break;
|
||||||
case VX_CAPS_CACHE_LINESIZE:
|
case VX_CAPS_CACHE_LINE_SIZE:
|
||||||
*value = CACHE_BLOCK_SIZE;
|
*value = CACHE_BLOCK_SIZE;
|
||||||
break;
|
break;
|
||||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||||
|
|
|
@ -11,10 +11,6 @@
|
||||||
#include <ram.h>
|
#include <ram.h>
|
||||||
#include <simulator.h>
|
#include <simulator.h>
|
||||||
|
|
||||||
#define CACHE_LINESIZE 64
|
|
||||||
#define ALLOC_BASE_ADDR 0x10000000
|
|
||||||
#define LOCAL_MEM_SIZE 0xffffffff
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
inline size_t align_size(size_t size, size_t alignment) {
|
inline size_t align_size(size_t size, size_t alignment) {
|
||||||
|
@ -31,7 +27,7 @@ public:
|
||||||
vx_buffer(size_t size, vx_device* device)
|
vx_buffer(size_t size, vx_device* device)
|
||||||
: size_(size)
|
: size_(size)
|
||||||
, device_(device) {
|
, device_(device) {
|
||||||
auto aligned_asize = align_size(size, CACHE_LINESIZE);
|
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
data_ = malloc(aligned_asize);
|
data_ = malloc(aligned_asize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -75,7 +71,7 @@ public:
|
||||||
|
|
||||||
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
||||||
auto dev_mem_size = LOCAL_MEM_SIZE;
|
auto dev_mem_size = LOCAL_MEM_SIZE;
|
||||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
if (mem_allocation_ + asize > dev_mem_size)
|
if (mem_allocation_ + asize > dev_mem_size)
|
||||||
return -1;
|
return -1;
|
||||||
*dev_maddr = mem_allocation_;
|
*dev_maddr = mem_allocation_;
|
||||||
|
@ -84,7 +80,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
||||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
if (dest_addr + asize > ram_.size())
|
if (dest_addr + asize > ram_.size())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
@ -98,7 +94,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
||||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
if (src_addr + asize > ram_.size())
|
if (src_addr + asize > ram_.size())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
@ -189,8 +185,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
||||||
case VX_CAPS_MAX_THREADS:
|
case VX_CAPS_MAX_THREADS:
|
||||||
*value = NUM_THREADS;
|
*value = NUM_THREADS;
|
||||||
break;
|
break;
|
||||||
case VX_CAPS_CACHE_LINESIZE:
|
case VX_CAPS_CACHE_LINE_SIZE:
|
||||||
*value = CACHE_LINESIZE;
|
*value = CACHE_BLOCK_SIZE;
|
||||||
break;
|
break;
|
||||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||||
*value = 0xffffffff;
|
*value = 0xffffffff;
|
||||||
|
|
|
@ -11,10 +11,7 @@
|
||||||
#include <core.h>
|
#include <core.h>
|
||||||
#include <VX_config.h>
|
#include <VX_config.h>
|
||||||
|
|
||||||
#define CACHE_LINESIZE 64
|
|
||||||
#define PAGE_SIZE 4096
|
#define PAGE_SIZE 4096
|
||||||
#define ALLOC_BASE_ADDR 0x10000000
|
|
||||||
#define LOCAL_MEM_SIZE 0xffffffff
|
|
||||||
|
|
||||||
///////////////////////////////////////////////////////////////////////////////
|
///////////////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
@ -32,7 +29,7 @@ public:
|
||||||
vx_buffer(size_t size, vx_device* device)
|
vx_buffer(size_t size, vx_device* device)
|
||||||
: size_(size)
|
: size_(size)
|
||||||
, device_(device) {
|
, device_(device) {
|
||||||
auto aligned_asize = align_size(size, CACHE_LINESIZE);
|
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
data_ = malloc(aligned_asize);
|
data_ = malloc(aligned_asize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,7 +78,7 @@ public:
|
||||||
|
|
||||||
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
||||||
auto dev_mem_size = LOCAL_MEM_SIZE;
|
auto dev_mem_size = LOCAL_MEM_SIZE;
|
||||||
auto asize = align_size(size, CACHE_LINESIZE);
|
auto asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
if (mem_allocation_ + asize > dev_mem_size)
|
if (mem_allocation_ + asize > dev_mem_size)
|
||||||
return -1;
|
return -1;
|
||||||
*dev_maddr = mem_allocation_;
|
*dev_maddr = mem_allocation_;
|
||||||
|
@ -90,7 +87,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
int upload(void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
||||||
auto asize = align_size(size, CACHE_LINESIZE);
|
auto asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
if (dest_addr + asize > ram_.size())
|
if (dest_addr + asize > ram_.size())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
@ -104,7 +101,7 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
int download(const void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
||||||
size_t asize = align_size(size, CACHE_LINESIZE);
|
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||||
if (src_addr + asize > ram_.size())
|
if (src_addr + asize > ram_.size())
|
||||||
return -1;
|
return -1;
|
||||||
|
|
||||||
|
@ -236,8 +233,8 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
||||||
case VX_CAPS_MAX_THREADS:
|
case VX_CAPS_MAX_THREADS:
|
||||||
*value = NUM_THREADS;
|
*value = NUM_THREADS;
|
||||||
break;
|
break;
|
||||||
case VX_CAPS_CACHE_LINESIZE:
|
case VX_CAPS_CACHE_LINE_SIZE:
|
||||||
*value = CACHE_LINESIZE;
|
*value = CACHE_BLOCK_SIZE;
|
||||||
break;
|
break;
|
||||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||||
*value = LOCAL_MEM_SIZE;
|
*value = LOCAL_MEM_SIZE;
|
||||||
|
|
|
@ -32,7 +32,7 @@
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifndef SM_ENABLE
|
`ifndef SM_ENABLE
|
||||||
`define SM_ENABLE 0
|
`define SM_ENABLE 1
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifndef GLOBAL_BLOCK_SIZE
|
`ifndef GLOBAL_BLOCK_SIZE
|
||||||
|
@ -47,14 +47,14 @@
|
||||||
`define STARTUP_ADDR 32'h80000000
|
`define STARTUP_ADDR 32'h80000000
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
`ifndef SHARED_MEM_BASE_ADDR
|
|
||||||
`define SHARED_MEM_BASE_ADDR 32'h6FFFF000
|
|
||||||
`endif
|
|
||||||
|
|
||||||
`ifndef IO_BUS_BASE_ADDR
|
`ifndef IO_BUS_BASE_ADDR
|
||||||
`define IO_BUS_BASE_ADDR 32'hFF000000
|
`define IO_BUS_BASE_ADDR 32'hFF000000
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
|
`ifndef SHARED_MEM_BASE_ADDR
|
||||||
|
`define SHARED_MEM_BASE_ADDR `IO_BUS_BASE_ADDR
|
||||||
|
`endif
|
||||||
|
|
||||||
`ifndef IO_BUS_ADDR_COUT
|
`ifndef IO_BUS_ADDR_COUT
|
||||||
`define IO_BUS_ADDR_COUT 32'hFFFFFFFC
|
`define IO_BUS_ADDR_COUT 32'hFFFFFFFC
|
||||||
`endif
|
`endif
|
||||||
|
@ -139,16 +139,6 @@
|
||||||
`define CSR_FRM 12'h002
|
`define CSR_FRM 12'h002
|
||||||
`define CSR_FCSR 12'h003
|
`define CSR_FCSR 12'h003
|
||||||
|
|
||||||
// SIMT CSRs
|
|
||||||
`define CSR_LTID 12'h020
|
|
||||||
`define CSR_LWID 12'h021
|
|
||||||
`define CSR_GTID 12'h022
|
|
||||||
`define CSR_GWID 12'h023
|
|
||||||
`define CSR_GCID 12'h024
|
|
||||||
`define CSR_NT 12'h025
|
|
||||||
`define CSR_NW 12'h026
|
|
||||||
`define CSR_NC 12'h027
|
|
||||||
|
|
||||||
`define CSR_SATP 12'h180
|
`define CSR_SATP 12'h180
|
||||||
|
|
||||||
`define CSR_PMPCFG0 12'h3A0
|
`define CSR_PMPCFG0 12'h3A0
|
||||||
|
@ -236,6 +226,19 @@
|
||||||
`define CSR_MIMPID 12'hF13
|
`define CSR_MIMPID 12'hF13
|
||||||
`define CSR_MHARTID 12'hF14
|
`define CSR_MHARTID 12'hF14
|
||||||
|
|
||||||
|
// User SIMT CSRs
|
||||||
|
`define CSR_WTID 12'hCC0
|
||||||
|
`define CSR_LTID 12'hCC1
|
||||||
|
`define CSR_GTID 12'hCC2
|
||||||
|
`define CSR_LWID 12'hCC3
|
||||||
|
`define CSR_GWID `CSR_MHARTID
|
||||||
|
`define CSR_GCID 12'hCC5
|
||||||
|
|
||||||
|
// Machine SIMT CSRs
|
||||||
|
`define CSR_NT 12'hFC0
|
||||||
|
`define CSR_NW 12'hFC1
|
||||||
|
`define CSR_NC 12'hFC2
|
||||||
|
|
||||||
// Pipeline Queues ////////////////////////////////////////////////////////////
|
// Pipeline Queues ////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
// Size of instruction queue
|
// Size of instruction queue
|
||||||
|
@ -324,9 +327,14 @@
|
||||||
|
|
||||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||||
|
|
||||||
|
// Size of cache block in bytes
|
||||||
|
`ifndef SM_BLOCK_SIZE
|
||||||
|
`define SM_BLOCK_SIZE 1024
|
||||||
|
`endif
|
||||||
|
|
||||||
// Size of cache in bytes
|
// Size of cache in bytes
|
||||||
`ifndef SMEM_SIZE
|
`ifndef SMEM_SIZE
|
||||||
`define SMEM_SIZE 8192
|
`define SMEM_SIZE (`NUM_WARPS * `NUM_THREADS * `SM_BLOCK_SIZE)
|
||||||
`endif
|
`endif
|
||||||
|
|
||||||
// Number of banks
|
// Number of banks
|
||||||
|
|
|
@ -110,9 +110,10 @@ module VX_csr_data #(
|
||||||
`CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]);
|
`CSR_FCSR : read_data_r = 32'(csr_fcsr[read_wid]);
|
||||||
|
|
||||||
`CSR_LWID : read_data_r = 32'(read_wid);
|
`CSR_LWID : read_data_r = 32'(read_wid);
|
||||||
|
`CSR_WTID ,
|
||||||
`CSR_LTID ,
|
`CSR_LTID ,
|
||||||
`CSR_GTID ,
|
`CSR_GTID ,
|
||||||
`CSR_MHARTID ,
|
/*`CSR_MHARTID ,*/
|
||||||
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
|
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
|
||||||
`CSR_GCID : read_data_r = CORE_ID;
|
`CSR_GCID : read_data_r = CORE_ID;
|
||||||
`CSR_NT : read_data_r = `NUM_THREADS;
|
`CSR_NT : read_data_r = `NUM_THREADS;
|
||||||
|
|
|
@ -120,8 +120,9 @@ module VX_csr_unit #(
|
||||||
);
|
);
|
||||||
|
|
||||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||||
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_LTID) ? i :
|
assign csr_pipe_rsp_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
|
||||||
(csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
|
(csr_addr_s1 == `CSR_LTID
|
||||||
|
|| csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
|
||||||
csr_read_data_s1;
|
csr_read_data_s1;
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -319,7 +319,7 @@
|
||||||
`define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
|
`define SCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
|
||||||
|
|
||||||
// Block size in bytes
|
// Block size in bytes
|
||||||
`define SBANK_LINE_SIZE 4
|
`define SBANK_LINE_SIZE `SM_BLOCK_SIZE
|
||||||
|
|
||||||
// Word size in bytes
|
// Word size in bytes
|
||||||
`define SWORD_SIZE 4
|
`define SWORD_SIZE 4
|
||||||
|
|
|
@ -20,21 +20,24 @@ void vx_split(int predicate);
|
||||||
// Join
|
// Join
|
||||||
void vx_join();
|
void vx_join();
|
||||||
|
|
||||||
// Return the warp's unique thread id
|
// Return active warp's thread id
|
||||||
int vx_thread_id();
|
int vx_thread_id();
|
||||||
|
|
||||||
// Return the core's unique warp id
|
// Return active core's local thread id
|
||||||
int vx_warp_id();
|
int vx_thread_lid();
|
||||||
|
|
||||||
// Return processsor unique core id
|
|
||||||
int vx_core_id();
|
|
||||||
|
|
||||||
// Return processsor global thread id
|
// Return processsor global thread id
|
||||||
int vx_thread_gid();
|
int vx_thread_gid();
|
||||||
|
|
||||||
// Return processsor global warp id
|
// Return active core's local warp id
|
||||||
|
int vx_warp_id();
|
||||||
|
|
||||||
|
// Return processsor's global warp id
|
||||||
int vx_warp_gid();
|
int vx_warp_gid();
|
||||||
|
|
||||||
|
// Return processsor core id
|
||||||
|
int vx_core_id();
|
||||||
|
|
||||||
// Return the number of threads in a warp
|
// Return the number of threads in a warp
|
||||||
int vx_num_threads();
|
int vx_num_threads();
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,6 @@ OUTPUT_ARCH(riscv)
|
||||||
ENTRY(_start)
|
ENTRY(_start)
|
||||||
SECTIONS
|
SECTIONS
|
||||||
{
|
{
|
||||||
PROVIDE(__stack_top = 0x6ffff000);
|
|
||||||
. = 0x80000000;
|
. = 0x80000000;
|
||||||
.interp : { *(.interp) }
|
.interp : { *(.interp) }
|
||||||
.note.gnu.build-id : { *(.note.gnu.build-id) }
|
.note.gnu.build-id : { *(.note.gnu.build-id) }
|
||||||
|
@ -191,10 +190,24 @@ SECTIONS
|
||||||
. = SEGMENT_START("ldata-segment", .);
|
. = SEGMENT_START("ldata-segment", .);
|
||||||
. = ALIGN(32 / 8);
|
. = ALIGN(32 / 8);
|
||||||
__BSS_END__ = .;
|
__BSS_END__ = .;
|
||||||
__global_pointer$ = MIN(__SDATA_BEGIN__ + 0x800,
|
__global_pointer = MIN(__SDATA_BEGIN__ + 0x800,
|
||||||
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
|
MAX(__DATA_BEGIN__ + 0x800, __BSS_END__ - 0x800));
|
||||||
_end = .; PROVIDE (end = .);
|
_end = .; PROVIDE (end = .);
|
||||||
. = DATA_SEGMENT_END (.);
|
. = DATA_SEGMENT_END (.);
|
||||||
|
|
||||||
|
/* .stack_dummy section doesn't contains any symbols. It is only
|
||||||
|
* used for linker to calculate size of stack sections, and assign
|
||||||
|
* values to stack symbols later */
|
||||||
|
.stack_dummy (COPY):
|
||||||
|
{
|
||||||
|
KEEP(*(.stack*))
|
||||||
|
}
|
||||||
|
__stack_usage = SIZEOF(.stack_dummy);
|
||||||
|
PROVIDE(__stack_top = 0xFF000000);
|
||||||
|
PROVIDE(__stack_size = 0x400);
|
||||||
|
PROVIDE(__stack = __stack_top);
|
||||||
|
ASSERT(__stack_usage <= __stack_size, "stack overflow")
|
||||||
|
|
||||||
/* Stabs debugging sections. */
|
/* Stabs debugging sections. */
|
||||||
.stab 0 : { *(.stab) }
|
.stab 0 : { *(.stab) }
|
||||||
.stabstr 0 : { *(.stabstr) }
|
.stabstr 0 : { *(.stabstr) }
|
||||||
|
|
|
@ -47,6 +47,12 @@ vx_warp_gid:
|
||||||
.type vx_thread_id, @function
|
.type vx_thread_id, @function
|
||||||
.global vx_thread_id
|
.global vx_thread_id
|
||||||
vx_thread_id:
|
vx_thread_id:
|
||||||
|
csrr a0, CSR_WTID
|
||||||
|
ret
|
||||||
|
|
||||||
|
.type vx_thread_lid, @function
|
||||||
|
.global vx_thread_lid
|
||||||
|
vx_thread_lid:
|
||||||
csrr a0, CSR_LTID
|
csrr a0, CSR_LTID
|
||||||
ret
|
ret
|
||||||
|
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define NUM_CORES_MAX 8
|
#define NUM_CORES_MAX 16
|
||||||
|
|
||||||
typedef struct {
|
typedef struct {
|
||||||
func_t function;
|
func_t function;
|
||||||
|
|
|
@ -57,18 +57,20 @@ vx_set_sp:
|
||||||
# set global pointer register
|
# set global pointer register
|
||||||
.option push
|
.option push
|
||||||
.option norelax
|
.option norelax
|
||||||
la gp, __global_pointer$
|
la gp, __global_pointer
|
||||||
.option pop
|
.option pop
|
||||||
|
|
||||||
# allocate stack region for a threads on the processor
|
# allocate stack region for a threads on the processor
|
||||||
# set stack pointer
|
# set stack pointer
|
||||||
csrr a1, CSR_GTID # get global thread id
|
la sp, __stack_top # load stack base address
|
||||||
slli a1, a1, 10 # multiply by 1024
|
la a1, __stack_size # stack size
|
||||||
csrr a2, CSR_LTID # get local thread id
|
#if SM_ENABLE
|
||||||
slli a2, a2, 2 # multiply by 4
|
csrr a2, CSR_LTID # get lobal thread id
|
||||||
la sp, __stack_top$ # load stack base address
|
#else
|
||||||
|
csrr a2, CSR_GTID # get global thread id
|
||||||
|
#endif
|
||||||
|
mul a1, a1, a2
|
||||||
sub sp, sp, a1 # sub thread block
|
sub sp, sp, a1 # sub thread block
|
||||||
add sp, sp, a2 # reduce addr collision for perf
|
|
||||||
|
|
||||||
# disable active warps except warp0
|
# disable active warps except warp0
|
||||||
csrr a3, CSR_LWID # get local wid
|
csrr a3, CSR_LWID # get local wid
|
||||||
|
|
|
@ -1,14 +1,13 @@
|
||||||
|
|
||||||
#include <vx_intrinsics.h>
|
|
||||||
|
|
||||||
|
|
||||||
// #include <utlist.h>
|
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include <unistd.h>
|
#include <unistd.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
#include <stdbool.h>
|
#include <stdbool.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
|
#include <vx_intrinsics.h>
|
||||||
|
#include <vx_print.h>
|
||||||
|
#include <vx_spawn.h>
|
||||||
|
|
||||||
typedef struct
|
typedef struct
|
||||||
{
|
{
|
||||||
unsigned * x;
|
unsigned * x;
|
||||||
|
|
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
|
@ -1,16 +1,5 @@
|
||||||
|
|
||||||
#include <vx_intrinsics.h>
|
#include <vx_intrinsics.h>
|
||||||
|
#include <vx_print.h>
|
||||||
// #include <utlist.h>
|
|
||||||
#include <string.h>
|
|
||||||
#include <unistd.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
|
|
||||||
// Newlib
|
|
||||||
#include <stdio.h>
|
|
||||||
|
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
|
|
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
Binary file not shown.
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue