runtime API refactoring to support memory reservation and protection

This commit is contained in:
Blaise Tine 2024-04-28 04:23:00 -07:00
parent c554f53e44
commit db0f0fd353
35 changed files with 3190 additions and 2081 deletions

View file

@ -1,13 +1,13 @@
#!/usr/bin/env python
# Copyright 2019-2023
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -33,7 +33,7 @@ def get_vma_size(elf_file):
max_vma = 0
regex = re.compile(r'\s*LOAD\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)')
for line in output.splitlines():
for line in output.splitlines():
match = regex.match(line)
if match:
vma = int(match.group(2), 16)
@ -44,15 +44,14 @@ def get_vma_size(elf_file):
vma_size = max_vma - min_vma
#print("vma={0:x}, size={1}, min_vma={2:x}, max_vma={3:x}, vma_size={4}".format(vma, size, min_vma, max_vma, vma_size))
total_vma_span = max_vma - min_vma
return total_vma_span # Return the calculated size
return min_vma, max_vma
except Exception as e:
print("Failed to calculate vma size due to an error: {}".format(str(e)))
sys.exit(-1)
def create_vxbin_binary(input_elf, output_bin, objcopy_path):
vma_size = get_vma_size(input_elf)
min_vma, max_vma = get_vma_size(input_elf)
# Create a binary data from the ELF file using objcopy
temp_bin_path = '/tmp/temp_kernel.bin'
@ -62,17 +61,19 @@ def create_vxbin_binary(input_elf, output_bin, objcopy_path):
with open(temp_bin_path, 'rb') as temp_file:
binary_data = temp_file.read()
# Pack size into 64-bit unsigned integer
total_size_bytes = struct.pack('<Q', vma_size)
# Pack addresses into 64-bit unsigned integer
min_vma_bytes = struct.pack('<Q', min_vma)
max_vma_bytes = struct.pack('<Q', max_vma)
# Write the total size and binary data to the final output file
with open(output_bin, 'wb') as bin_file:
bin_file.write(total_size_bytes)
bin_file.write(min_vma_bytes)
bin_file.write(max_vma_bytes)
bin_file.write(binary_data)
# Remove the temporary binary file
os.remove(temp_bin_path)
print("Binary created successfully: {}, vma_size={}".format(output_bin, vma_size))
print("Binary created successfully: {}, min_vma={:x}, max_vma={:x}".format(output_bin, min_vma, max_vma))
if __name__ == '__main__':
if len(sys.argv) != 3:

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,14 +24,13 @@ public:
MemoryAllocator(
uint64_t baseAddress,
uint64_t capacity,
uint32_t pageAlign,
uint32_t blockAlign)
uint32_t pageAlign,
uint32_t blockAlign)
: baseAddress_(baseAddress)
, capacity_(capacity)
, pageAlign_(pageAlign)
, blockAlign_(blockAlign)
, pages_(nullptr)
, nextAddress_(0)
, allocated_(0)
{}
@ -40,11 +39,11 @@ public:
page_t* currPage = pages_;
while (currPage) {
auto nextPage = currPage->next;
this->DeletePage(currPage);
delete currPage;
currPage = nextPage;
}
}
// Returns the base address of the managed memory range.
// NOTE(review): baseAddress_ is stored as uint64_t but this accessor returns
// uint32_t, so the upper 32 bits are dropped. The cast below only makes that
// existing narrowing explicit; confirm whether callers rely on the 32-bit type.
uint32_t baseAddress() const {
  return static_cast<uint32_t>(baseAddress_);
}
@ -61,73 +60,80 @@ public:
return allocated_;
}
int allocate(uint64_t size, uint64_t* addr) {
if (size == 0 || addr == nullptr) {
printf("error: invalid argurments\n");
int reserve(uint64_t addr, uint64_t size) {
if (size == 0) {
printf("error: invalid arguments\n");
return -1;
}
// Align allocation size
size = AlignSize(size, blockAlign_);
size = alignSize(size, pageAlign_);
// Check if the reservation is within memory capacity bounds
if (addr < baseAddress_ || addr + size > baseAddress_ + capacity_) {
printf("error: address range out of bounds\n");
return -1;
}
// Ensure the reservation does not overlap with existing pages
if (hasPageOverlap(addr, size)) {
printf("error: address range overlaps with existing allocation\n");
return -1;
}
// allocate a new page for segment
auto newPage = this->createPage(addr, size);
// allocate space on free block
auto freeBlock = newPage->findFreeBlock(size);
newPage->allocate(size, freeBlock);
// Update allocated size
allocated_ += size;
return 0;
}
int allocate(uint64_t size, uint64_t* addr) {
if (size == 0 || addr == nullptr) {
printf("error: invalid arguments\n");
return -1;
}
// Align allocation size
size = alignSize(size, blockAlign_);
// Walk thru all pages to find a free block
block_t* freeBlock = nullptr;
auto currPage = pages_;
while (currPage) {
auto currBlock = currPage->freeSList;
if (currBlock) {
// The free S-list is already sorted with the largest block first
// Quick check if the head block has enough space.
if (currBlock->size >= size) {
// Find the smallest matching block in the S-list
while (currBlock->nextFreeS
&& (currBlock->nextFreeS->size >= size)) {
currBlock = currBlock->nextFreeS;
}
// Return the free block
freeBlock = currBlock;
break;
}
}
freeBlock = currPage->findFreeBlock(size);
if (freeBlock != nullptr)
break;
currPage = currPage->next;
}
if (nullptr == freeBlock) {
// Allocate a new page for this request
currPage = this->NewPage(size);
// Allocate a new page if no free block is found
if (freeBlock == nullptr) {
auto pageSize = alignSize(size, pageAlign_);
uint64_t pageAddr;
if (!this->findNextAddress(pageSize, &pageAddr)) {
printf("error: out of memory\n");
return -1;
}
currPage = this->createPage(pageAddr, pageSize);
if (nullptr == currPage) {
printf("error: out of memory\n");
return -1;
}
freeBlock = currPage->freeSList;
}
// Remove the block from the free lists
assert(freeBlock->size >= size);
currPage->RemoveFreeMList(freeBlock);
currPage->RemoveFreeSList(freeBlock);
// If the free block we have found is larger than what we are looking for,
// we may be able to split our free block in two.
uint64_t extraBytes = freeBlock->size - size;
if (extraBytes >= blockAlign_) {
// Reduce the free block size to the requested value
freeBlock->size = size;
// Allocate a new block to contain the extra buffer
auto nextAddr = freeBlock->addr + size;
auto newBlock = new block_t(nextAddr, extraBytes);
// Add the new block to the free lists
currPage->InsertFreeMList(newBlock);
currPage->InsertFreeSList(newBlock);
freeBlock = currPage->findFreeBlock(size);
}
// Insert the free block into the used list
currPage->InsertUsedList(freeBlock);
// allocate space on free block
currPage->allocate(size, freeBlock);
// Return the free block address
*addr = baseAddress_ + freeBlock->addr;
*addr = freeBlock->addr;
// Update allocated size
allocated_ += size;
@ -137,22 +143,12 @@ public:
int release(uint64_t addr) {
// Walk all pages to find the pointer
uint64_t local_addr = addr - baseAddress_;
block_t* usedBlock = nullptr;
auto currPage = pages_;
while (currPage) {
if (local_addr >= currPage->addr
&& local_addr < (currPage->addr + currPage->size)) {
auto currBlock = currPage->usedList;
while (currBlock) {
if (currBlock->addr == local_addr) {
usedBlock = currBlock;
break;
}
currBlock = currBlock->nextUsed;
}
usedBlock = currPage->findUsedBlock(addr);
if (usedBlock != nullptr)
break;
}
currPage = currPage->next;
}
@ -164,65 +160,12 @@ public:
auto size = usedBlock->size;
// Remove the block from the used list
currPage->RemoveUsedList(usedBlock);
// Insert the block into the free M-list.
currPage->InsertFreeMList(usedBlock);
// Check if we can merge adjacent free blocks from the left.
if (usedBlock->prevFreeM) {
// Calculate the previous address
auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
if (usedBlock->addr == prevAddr) {
auto prevBlock = usedBlock->prevFreeM;
// Merge the blocks to the left
prevBlock->size += usedBlock->size;
prevBlock->nextFreeM = usedBlock->nextFreeM;
if (prevBlock->nextFreeM) {
prevBlock->nextFreeM->prevFreeM = prevBlock;
}
// Detach previous block from the free S-list since size increased
currPage->RemoveFreeSList(prevBlock);
// reset usedBlock
delete usedBlock;
usedBlock = prevBlock;
}
}
// Check if we can merge adjacent free blocks from the right.
if (usedBlock->nextFreeM) {
// Calculate the next allocation start address
auto nextAddr = usedBlock->addr + usedBlock->size;
if (usedBlock->nextFreeM->addr == nextAddr) {
auto nextBlock = usedBlock->nextFreeM;
// Merge the blocks to the right
usedBlock->size += nextBlock->size;
usedBlock->nextFreeM = nextBlock->nextFreeM;
if (usedBlock->nextFreeM) {
usedBlock->nextFreeM->prevFreeM = usedBlock;
}
// Delete next block
currPage->RemoveFreeSList(nextBlock);
delete nextBlock;
}
}
// Insert the block into the free S-list.
currPage->InsertFreeSList(usedBlock);
// Check if we can free empty pages
if (nullptr == currPage->usedList) {
// Try to delete the page
while (currPage && this->DeletePage(currPage)) {
currPage = this->FindNextEmptyPage();
}
// release the used block
currPage->release(usedBlock);
// Free the page if empty
if (currPage->empty()) {
this->deletePage(currPage);
}
// update allocated size
@ -236,17 +179,17 @@ private:
struct block_t {
block_t* nextFreeS;
block_t* prevFreeS;
block_t* nextFreeM;
block_t* prevFreeM;
block_t* nextUsed;
block_t* prevUsed;
uint64_t addr;
uint64_t size;
block_t(uint64_t addr, uint64_t size)
block_t(uint64_t addr, uint64_t size)
: nextFreeS(nullptr)
, prevFreeS(nullptr)
, nextFreeM(nullptr)
@ -259,43 +202,156 @@ private:
};
struct page_t {
page_t* next;
// List of used blocks
block_t* usedList;
// List with blocks sorted by descreasing sizes
// Used for block lookup during memory allocation.
block_t* freeSList;
// List with blocks sorted by increasing memory addresses
// Used for block merging during memory release.
block_t* freeMList;
page_t* next;
uint64_t addr;
uint64_t size;
page_t(uint64_t addr, uint64_t size) :
next(nullptr),
usedList(nullptr),
page_t(uint64_t addr, uint64_t size, uint32_t blockAlign) :
next(nullptr),
addr(addr),
size(size) {
freeSList = freeMList = new block_t(addr, size);
size(size),
blockAlign_(blockAlign),
usedList_(nullptr) {
freeSList_ = freeMList_ = new block_t(addr, size);
}
void InsertUsedList(block_t* block) {
block->nextUsed = usedList;
if (usedList) {
usedList->prevUsed = block;
// Destroys the page and frees its one remaining free block.
// Precondition (checked with assert): no block is still in use and the
// address-ordered free list holds exactly one block.
~page_t() {
  // No outstanding allocations may remain on the page.
  assert(usedList_ == nullptr);
  // The free M-list must consist of a single, unlinked block.
  assert(freeMList_ != nullptr);
  assert(freeMList_->nextFreeM == nullptr);
  assert(freeMList_->prevFreeM == nullptr);
  delete freeMList_;
}
// Returns true when the page has no block in its used list.
bool empty() const {
  return nullptr == usedList_;
}
void allocate(uint64_t size, block_t* freeBlock) {
// Remove the block from the free lists
this->removeFreeMList(freeBlock);
this->removeFreeSList(freeBlock);
// If the free block we have found is larger than what we are looking for,
// we may be able to split our free block in two.
uint64_t extraBytes = freeBlock->size - size;
if (extraBytes >= blockAlign_) {
// Reduce the free block size to the requested value
freeBlock->size = size;
// Allocate a new block to contain the extra buffer
auto nextAddr = freeBlock->addr + size;
auto newBlock = new block_t(nextAddr, extraBytes);
// Add the new block to the free lists
this->insertFreeMList(newBlock);
this->insertFreeSList(newBlock);
}
usedList = block;
// Insert the free block into the used list
this->insertUsedList(freeBlock);
}
void RemoveUsedList(block_t* block) {
// Returns a used block to this page's free lists, coalescing it with
// address-adjacent free neighbours.
//
// usedBlock: a block currently linked in this page's used list.
//
// After this call the caller must not dereference usedBlock: when the block
// is merged into its left neighbour the block object itself is deleted.
void release(block_t* usedBlock) {
// Remove the block from the used list
this->removeUsedList(usedBlock);
// Insert the block into the free M-list.
// The M-list is ordered by increasing address, so after insertion the
// prevFreeM/nextFreeM links point at the closest free blocks on each side.
this->insertFreeMList(usedBlock);
// Check if we can merge adjacent free blocks from the left.
if (usedBlock->prevFreeM) {
// Calculate the previous address
// (the first address past the end of the left free neighbour)
auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
if (usedBlock->addr == prevAddr) {
auto prevBlock = usedBlock->prevFreeM;
// Merge the blocks to the left
// The left neighbour absorbs the released block and takes over its
// forward link in the M-list.
prevBlock->size += usedBlock->size;
prevBlock->nextFreeM = usedBlock->nextFreeM;
if (prevBlock->nextFreeM) {
prevBlock->nextFreeM->prevFreeM = prevBlock;
}
// Detach previous block from the free S-list since size increased
// (it is re-inserted at its new size-ordered position at the end)
this->removeFreeSList(prevBlock);
// reset usedBlock
// The released block object is no longer needed; continue with the
// merged block so the right-side check below sees the combined range.
delete usedBlock;
usedBlock = prevBlock;
}
}
// Check if we can merge adjacent free blocks from the right.
if (usedBlock->nextFreeM) {
// Calculate the next allocation start address
auto nextAddr = usedBlock->addr + usedBlock->size;
if (usedBlock->nextFreeM->addr == nextAddr) {
auto nextBlock = usedBlock->nextFreeM;
// Merge the blocks to the right
// The current block absorbs the right neighbour and unlinks it from
// the M-list.
usedBlock->size += nextBlock->size;
usedBlock->nextFreeM = nextBlock->nextFreeM;
if (usedBlock->nextFreeM) {
usedBlock->nextFreeM->prevFreeM = usedBlock;
}
// Delete next block
// It must leave the S-list before being freed.
this->removeFreeSList(nextBlock);
delete nextBlock;
}
}
// Insert the block into the free S-list.
// At this point the (possibly merged) block is not in the S-list.
this->insertFreeSList(usedBlock);
}
// Looks up a free block of at least `size` bytes on this page.
//
// The free S-list is ordered with the largest block first, so the head
// decides whether any block can satisfy the request. When it can, the list
// is walked forward to the last block that is still large enough, which is
// the smallest matching block.
//
// Returns the selected block, or nullptr when no free block is big enough.
block_t* findFreeBlock(uint64_t size) {
  block_t* candidate = freeSList_;
  // Empty list, or even the largest block is too small.
  if (candidate == nullptr || candidate->size < size)
    return nullptr;

  // Advance while the following block still fits the request.
  for (block_t* following = candidate->nextFreeS;
       following != nullptr && following->size >= size;
       following = following->nextFreeS) {
    candidate = following;
  }
  return candidate;
}
// Finds the used block that starts exactly at `addr`.
//
// Returns nullptr when `addr` lies outside this page's address range or when
// no used block begins at that address.
block_t* findUsedBlock(uint64_t addr) {
  const bool insidePage = (addr >= this->addr)
                       && (addr < (this->addr + this->size));
  if (!insidePage)
    return nullptr;

  // Linear scan of the used list for a matching start address.
  for (block_t* blk = usedList_; blk != nullptr; blk = blk->nextUsed) {
    if (blk->addr == addr)
      return blk;
  }
  return nullptr;
}
private:
// Pushes `block` onto the front of the used list.
// The block's own prevUsed link is left as the caller provided it.
void insertUsedList(block_t* block) {
  block_t* const previousHead = usedList_;
  block->nextUsed = previousHead;
  if (previousHead != nullptr) {
    previousHead->prevUsed = block;
  }
  usedList_ = block;
}
void removeUsedList(block_t* block) {
if (block->prevUsed) {
block->prevUsed->nextUsed = block->nextUsed;
} else {
usedList = block->nextUsed;
usedList_ = block->nextUsed;
}
if (block->nextUsed) {
block->nextUsed->prevUsed = block->prevUsed;
@ -304,8 +360,8 @@ private:
block->prevUsed = nullptr;
}
void InsertFreeMList(block_t* block) {
block_t* currBlock = freeMList;
void insertFreeMList(block_t* block) {
block_t* currBlock = freeMList_;
block_t* prevBlock = nullptr;
while (currBlock && (currBlock->addr < block->addr)) {
prevBlock = currBlock;
@ -316,18 +372,18 @@ private:
if (prevBlock) {
prevBlock->nextFreeM = block;
} else {
freeMList = block;
freeMList_ = block;
}
if (currBlock) {
currBlock->prevFreeM = block;
}
}
}
void RemoveFreeMList(block_t* block) {
void removeFreeMList(block_t* block) {
if (block->prevFreeM) {
block->prevFreeM->nextFreeM = block->nextFreeM;
} else {
freeMList = block->nextFreeM;
freeMList_ = block->nextFreeM;
}
if (block->nextFreeM) {
block->nextFreeM->prevFreeM = block->prevFreeM;
@ -336,8 +392,8 @@ private:
block->prevFreeM = nullptr;
}
void InsertFreeSList(block_t* block) {
block_t* currBlock = this->freeSList;
void insertFreeSList(block_t* block) {
block_t* currBlock = freeSList_;
block_t* prevBlock = nullptr;
while (currBlock && (currBlock->size > block->size)) {
prevBlock = currBlock;
@ -348,60 +404,62 @@ private:
if (prevBlock) {
prevBlock->nextFreeS = block;
} else {
this->freeSList = block;
freeSList_ = block;
}
if (currBlock) {
currBlock->prevFreeS = block;
}
}
void RemoveFreeSList(block_t* block) {
void removeFreeSList(block_t* block) {
if (block->prevFreeS) {
block->prevFreeS->nextFreeS = block->nextFreeS;
} else {
freeSList = block->nextFreeS;
freeSList_ = block->nextFreeS;
}
if (block->nextFreeS) {
block->nextFreeS->prevFreeS = block->prevFreeS;
}
block->nextFreeS = nullptr;
block->prevFreeS = nullptr;
block->prevFreeS = nullptr;
}
// block alignment
uint32_t blockAlign_;
// List of used blocks
block_t* usedList_;
// List with blocks sorted by decreasing sizes
// Used for block lookup during memory allocation.
block_t* freeSList_;
// List with blocks sorted by increasing memory addresses
// Used for block merging during memory release.
block_t* freeMList_;
};
page_t* NewPage(uint64_t size) {
// Increase buffer size to include the page and first block size
// also add padding to ensure page alignment
size = AlignSize(size, pageAlign_);
// Allocate page memory
auto addr = nextAddress_;
nextAddress_ += size;
// Overflow check
if (nextAddress_ > capacity_)
return nullptr;
page_t* createPage(uint64_t addr, uint64_t size) {
// Allocate object
auto newPage = new page_t(addr, size);
auto newPage = new page_t(addr, size, blockAlign_);
// Insert the new page into the list
newPage->next = pages_;
pages_ = newPage;
// Insert the new page into the list in address sorted order
if (pages_ == nullptr || pages_->addr > newPage->addr) {
newPage->next = pages_;
pages_ = newPage;
} else {
page_t* current = pages_;
while (current->next != nullptr && current->next->addr < newPage->addr) {
current = current->next;
}
newPage->next = current->next;
current->next = newPage;
}
return newPage;
}
bool DeletePage(page_t* page) {
// The page should be empty
assert(nullptr == page->usedList);
assert(page->freeMList && (nullptr == page->freeMList->nextFreeM));
// Only delete top-level pages
auto nextAddr = page->addr + page->size;
if (nextAddr != nextAddress_)
return false;
void deletePage(page_t* page) {
// Remove the page from the list
page_t* prevPage = nullptr;
auto currPage = pages_;
@ -417,36 +475,66 @@ private:
prevPage = currPage;
currPage = currPage->next;
}
// Update next allocation address
nextAddress_ = page->addr;
// free object
delete page->freeMList;
// Delete the page
delete page;
return true;
}
page_t* FindNextEmptyPage() {
auto currPage = pages_;
while (currPage) {
if (nullptr == currPage->usedList)
return currPage;
currPage = currPage->next;
}
return nullptr;
bool findNextAddress(uint64_t size, uint64_t* addr) {
if (pages_ == nullptr) {
*addr = baseAddress_;
return true;
}
page_t* current = pages_;
uint64_t endOfLastPage = baseAddress_;
while (current != nullptr) {
uint64_t startOfCurrentPage = current->addr;
if ((endOfLastPage + size) <= startOfCurrentPage) {
*addr = endOfLastPage;
return true;
}
// Update the end of the last page to the end of the current page
// Move to the next page in the sorted list
endOfLastPage = current->addr + current->size;
current = current->next;
}
// If no suitable gap is found, place the new page at the end of the last page
// Check if the allocator has enough capacity
if ((endOfLastPage + size) <= capacity_) {
*addr = endOfLastPage;
return true;
}
return false;
}
static uint64_t AlignSize(uint64_t size, uint64_t alignment) {
bool hasPageOverlap(uint64_t start, uint64_t size) {
page_t* current = pages_;
while (current != nullptr) {
uint64_t pageStart = current->addr;
uint64_t pageEnd = pageStart + current->size;
uint64_t requestEnd = start + size;
if ((start >= pageStart && start < pageEnd) || // Start of request is inside the page
(requestEnd > pageStart && requestEnd <= pageEnd) || // End of request is inside the page
(start <= pageStart && requestEnd >= pageEnd)) { // Request envelops the page
return true;
}
current = current->next;
}
return false;
}
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` is asserted to have at most one bit set (power of two).
static uint64_t alignSize(uint64_t size, uint64_t alignment) {
  const uint64_t mask = alignment - 1;
  assert((alignment & mask) == 0);
  return (size + mask) & ~mask;
}
uint64_t baseAddress_;
uint64_t capacity_;
uint32_t pageAlign_;
uint32_t blockAlign_;
uint32_t pageAlign_;
uint32_t blockAlign_;
page_t* pages_;
uint64_t nextAddress_;
uint64_t allocated_;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -29,7 +29,7 @@
_cleanup \
} while (false)
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` is asserted to have at most one bit set (power of two).
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  const uint64_t low_bits = alignment - 1;
  assert((alignment & low_bits) == 0);
  const uint64_t bumped = size + low_bits;
  return bumped & ~low_bits;
}
@ -68,7 +68,7 @@ public:
// Returns the currently stored perf_class_ value.
int get_perf_class() const {
  const int current_class = perf_class_;
  return current_class;
}
private:
std::list<vx_device_h> hdevices_;
int perf_class_;
@ -97,14 +97,15 @@ void perf_remove_device(vx_device_h hdevice) {
///////////////////////////////////////////////////////////////////////////////
void DeviceConfig::write(uint32_t addr, uint32_t value) {
data_[addr] = value;
store_[addr] = value;
}
uint32_t DeviceConfig::read(uint32_t addr) const {
if (0 == data_.count(addr)) {
printf("Error: DeviceConfig::read(%d) failed\n", addr);
}
return data_.at(addr);
int DeviceConfig::read(uint32_t addr, uint32_t* value) const {
auto it = store_.find(addr);
if (it == store_.end())
return -1;
*value = it->second;
return 0;
}
///////////////////////////////////////////////////////////////////////////////
@ -131,47 +132,58 @@ int dcr_initialize(vx_device_h hdevice) {
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
return _ret;
});
return 0;
}
///////////////////////////////////////////////////////////////////////////////
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr) {
if (NULL == content || size <= 8 || NULL == addr)
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer) {
if (nullptr == hdevice || nullptr == content || size <= 8 || nullptr == hbuffer)
return -1;
auto bytes = reinterpret_cast<const uint8_t*>(content);
uint64_t _addr;
auto bytes = reinterpret_cast<const uint64_t*>(content);
#ifdef NDEBUG
auto runtime_size = *reinterpret_cast<const uint64_t*>(bytes);
RT_CHECK(vx_mem_alloc(hdevice, runtime_size, &_addr), {
auto min_vma = *bytes++;
auto max_vma = *bytes++;
auto bin_size = size - 16;
auto runtime_size = (max_vma - min_vma);
vx_buffer_h _hbuffer;
#ifndef NDEBUG
RT_CHECK(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
return _ret;
});
#else
uint32_t startup_addr0, startup_addr1;
RT_CHECK(vx_dcr_read(hdevice, VX_DCR_BASE_STARTUP_ADDR0, &startup_addr0), {
return _ret;
});
RT_CHECK(vx_dcr_read(hdevice, VX_DCR_BASE_STARTUP_ADDR1, &startup_addr1), {
return _ret;
});
_addr = (uint64_t(startup_addr1) << 32) | startup_addr0;
RT_CHECK(vx_mem_alloc(hdevice, runtime_size, 0, &_hbuffer), {
return _ret;
});
#endif
RT_CHECK(vx_copy_to_dev(hdevice, _addr, bytes + 8, size - 8), {
vx_mem_free(hdevice, _addr);
RT_CHECK(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
vx_mem_free(_hbuffer);
return _ret;
});
*addr = _addr;
RT_CHECK(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
vx_mem_free(_hbuffer);
return _ret;
});
return 0;
RT_CHECK(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
vx_mem_free(_hbuffer);
return _ret;
});
*hbuffer = _hbuffer;
return 0;
}
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr) {
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer) {
if (nullptr == hdevice || nullptr == filename || nullptr == hbuffer)
return -1;
std::ifstream ifs(filename);
if (!ifs) {
std::cout << "error: " << filename << " not found" << std::endl;
@ -181,39 +193,42 @@ extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint
// read file content
ifs.seekg(0, ifs.end);
auto size = ifs.tellg();
std::vector<char> content(size);
std::vector<char> content(size);
ifs.seekg(0, ifs.beg);
ifs.read(content.data(), size);
// upload buffer
RT_CHECK(vx_upload_kernel_bytes(hdevice, content.data(), size, addr), {
RT_CHECK(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
return _ret;
});
return 0;
}
extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr) {
if (NULL == content || 0 == size || NULL == addr)
extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer) {
if (nullptr == hdevice || nullptr == content || 0 == size || nullptr == hbuffer)
return -1;
uint64_t _addr;
vx_buffer_h _hbuffer;
RT_CHECK(vx_mem_alloc(hdevice, size, &_addr), {
RT_CHECK(vx_mem_alloc(hdevice, size, VX_MEM_READ, &_hbuffer), {
return _ret;
});
RT_CHECK(vx_copy_to_dev(hdevice, _addr, content, size), {
vx_mem_free(hdevice, _addr);
RT_CHECK(vx_copy_to_dev(_hbuffer, content, 0, size), {
vx_mem_free(_hbuffer);
return _ret;
});
*addr = _addr;
*hbuffer = _hbuffer;
return 0;
}
extern int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr) {
extern int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer) {
if (nullptr == hdevice || nullptr == filename || nullptr == hbuffer)
return -1;
std::ifstream ifs(filename);
if (!ifs) {
std::cout << "error: " << filename << " not found" << std::endl;
@ -223,29 +238,12 @@ extern int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* a
// read file content
ifs.seekg(0, ifs.end);
auto size = ifs.tellg();
std::vector<char> content(size);
std::vector<char> content(size);
ifs.seekg(0, ifs.beg);
ifs.read(content.data(), size);
// upload buffer
RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, addr), {
return _ret;
});
return 0;
}
extern int vx_set_kernel_args(vx_device_h hdevice, const void* content, uint64_t size) {
if (NULL == content || 0 == size)
return -1;
uint64_t startup_arg;
RT_CHECK(vx_mem_alloc(hdevice, size, &startup_arg), {
return _ret;
});
RT_CHECK(vx_copy_to_dev(hdevice, startup_arg, content, size), {
vx_mem_free(hdevice, startup_arg);
RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
return _ret;
});
@ -294,20 +292,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t loads = 0;
uint64_t stores = 0;
uint64_t ifetch_lat = 0;
uint64_t load_lat = 0;
// PERF: l2cache
uint64_t load_lat = 0;
// PERF: l2cache
uint64_t l2cache_reads = 0;
uint64_t l2cache_writes = 0;
uint64_t l2cache_read_misses = 0;
uint64_t l2cache_write_misses = 0;
uint64_t l2cache_bank_stalls = 0;
uint64_t l2cache_bank_stalls = 0;
uint64_t l2cache_mshr_stalls = 0;
// PERF: l3cache
// PERF: l3cache
uint64_t l3cache_reads = 0;
uint64_t l3cache_writes = 0;
uint64_t l3cache_read_misses = 0;
uint64_t l3cache_write_misses = 0;
uint64_t l3cache_bank_stalls = 0;
uint64_t l3cache_bank_stalls = 0;
uint64_t l3cache_mshr_stalls = 0;
// PERF: memory
uint64_t mem_reads = 0;
@ -332,28 +330,27 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
bool lmem_enable = isa_flags & VX_ISA_EXT_LMEM;
#endif
std::vector<uint64_t> staging_buf(32);
auto get_mpm_csr = [&staging_buf](int csr_addr) {
return staging_buf.at(csr_addr - VX_CSR_MPM_BASE);
};
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size() * sizeof(uint64_t);
RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size() * sizeof(uint64_t)), {
uint64_t cycles_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
return _ret;
});
uint64_t cycles_per_core = get_mpm_csr(VX_CSR_MCYCLE);
uint64_t instrs_per_core = get_mpm_csr(VX_CSR_MINSTRET);
uint64_t instrs_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
return _ret;
});
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
// PERF: pipeline
// PERF: pipeline
// scheduler idles
{
uint64_t sched_idles_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ID);
uint64_t sched_idles_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
return _ret;
});
if (num_cores > 1) {
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
@ -362,7 +359,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// scheduler stalls
{
uint64_t sched_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ST);
uint64_t sched_stalls_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
return _ret;
});
if (num_cores > 1) {
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
@ -371,7 +371,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// ibuffer_stalls
{
uint64_t ibuffer_stalls_per_core = get_mpm_csr(VX_CSR_MPM_IBUF_ST);
uint64_t ibuffer_stalls_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
return _ret;
});
if (num_cores > 1) {
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
@ -380,19 +383,34 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// issue_stalls
{
uint64_t scrb_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ST);
uint64_t scrb_alu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ALU);
uint64_t scrb_fpu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_FPU);
uint64_t scrb_lsu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_LSU);
uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_stalls_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
return _ret;
});
uint64_t scrb_alu_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
return _ret;
});
uint64_t scrb_fpu_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
return _ret;
});
uint64_t scrb_lsu_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
return _ret;
});
uint64_t scrb_sfu_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
return _ret;
});
scrb_alu += scrb_alu_per_core;
scrb_fpu += scrb_fpu_per_core;
scrb_lsu += scrb_lsu_per_core;
scrb_sfu += scrb_sfu_per_core;
scrb_sfu += scrb_sfu_per_core;
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
calcAvgPercent(scrb_alu_per_core, scrb_total),
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
calcAvgPercent(scrb_alu_per_core, scrb_total),
calcAvgPercent(scrb_fpu_per_core, scrb_total),
calcAvgPercent(scrb_lsu_per_core, scrb_total),
calcAvgPercent(scrb_sfu_per_core, scrb_total));
@ -401,14 +419,23 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// sfu_stalls
{
uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU);
uint64_t scrb_wctl_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_WCTL);
uint64_t scrb_csrs_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_CSRS);
uint64_t scrb_sfu_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
return _ret;
});
uint64_t scrb_wctl_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
return _ret;
});
uint64_t scrb_csrs_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
return _ret;
});
if (num_cores > 1) {
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
, core_id
, scrb_sfu_per_core
, scrb_sfu_per_core
, calcAvgPercent(scrb_csrs_per_core, sfu_total)
, calcAvgPercent(scrb_wctl_per_core, sfu_total)
);
@ -419,11 +446,17 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
// PERF: memory
// ifetches
{
uint64_t ifetches_per_core = get_mpm_csr(VX_CSR_MPM_IFETCHES);
uint64_t ifetches_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
return _ret;
});
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
ifetches += ifetches_per_core;
uint64_t ifetch_lat_per_core = get_mpm_csr(VX_CSR_MPM_IFETCH_LT);
uint64_t ifetch_lat_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
return _ret;
});
if (num_cores > 1) {
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
@ -432,11 +465,17 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// loads
{
uint64_t loads_per_core = get_mpm_csr(VX_CSR_MPM_LOADS);
uint64_t loads_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
return _ret;
});
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
loads += loads_per_core;
uint64_t load_lat_per_core = get_mpm_csr(VX_CSR_MPM_LOAD_LT);
uint64_t load_lat_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
return _ret;
});
if (num_cores > 1) {
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
@ -445,42 +484,78 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
// stores
{
uint64_t stores_per_core = get_mpm_csr(VX_CSR_MPM_STORES);
uint64_t stores_per_core;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
return _ret;
});
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
stores += stores_per_core;
}
} break;
case VX_DCR_MPM_CLASS_MEM: {
case VX_DCR_MPM_CLASS_MEM: {
if (lmem_enable) {
// PERF: lmem
uint64_t lmem_reads = get_mpm_csr(VX_CSR_MPM_LMEM_READS);
uint64_t lmem_writes = get_mpm_csr(VX_CSR_MPM_LMEM_WRITES);
uint64_t lmem_bank_stalls = get_mpm_csr(VX_CSR_MPM_LMEM_BANK_ST);
uint64_t lmem_reads;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
return _ret;
});
uint64_t lmem_writes;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
return _ret;
});
uint64_t lmem_bank_stalls;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
return _ret;
});
int lmem_bank_utilization = calcAvgPercent(lmem_reads + lmem_writes, lmem_reads + lmem_writes + lmem_bank_stalls);
fprintf(stream, "PERF: core%d: lmem reads=%ld\n", core_id, lmem_reads);
fprintf(stream, "PERF: core%d: lmem writes=%ld\n", core_id, lmem_writes);
fprintf(stream, "PERF: core%d: lmem writes=%ld\n", core_id, lmem_writes);
fprintf(stream, "PERF: core%d: lmem bank stalls=%ld (utilization=%d%%)\n", core_id, lmem_bank_stalls, lmem_bank_utilization);
}
if (icache_enable) {
// PERF: Icache
uint64_t icache_reads = get_mpm_csr(VX_CSR_MPM_ICACHE_READS);
uint64_t icache_read_misses = get_mpm_csr(VX_CSR_MPM_ICACHE_MISS_R);
uint64_t icache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_ICACHE_MSHR_ST);
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
uint64_t icache_reads;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
return _ret;
});
uint64_t icache_read_misses;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
return _ret;
});
uint64_t icache_mshr_stalls;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
return _ret;
});
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads);
fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio);
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
}
if (dcache_enable) {
// PERF: Dcache
uint64_t dcache_reads = get_mpm_csr(VX_CSR_MPM_DCACHE_READS);
uint64_t dcache_writes = get_mpm_csr(VX_CSR_MPM_DCACHE_WRITES);
uint64_t dcache_read_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_R);
uint64_t dcache_write_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_W);
uint64_t dcache_bank_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_BANK_ST);
uint64_t dcache_reads;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
return _ret;
});
uint64_t dcache_writes;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
return _ret;
});
uint64_t dcache_read_misses;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
return _ret;
});
uint64_t dcache_write_misses;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
return _ret;
});
uint64_t dcache_bank_stalls;
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
return _ret;
});
uint64_t dcache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_MSHR_ST);
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
@ -489,7 +564,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads);
fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes);
fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio);
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization);
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
}
@ -504,7 +579,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
l2cache_mshr_stalls += get_mpm_csr(VX_CSR_MPM_L2CACHE_MSHR_ST);
}
if (0 == core_id) {
if (0 == core_id) {
if (l3cache_enable) {
// PERF: L3cache
l3cache_reads = get_mpm_csr(VX_CSR_MPM_L3CACHE_READS);
@ -514,7 +589,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
l3cache_bank_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_BANK_ST);
l3cache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_MSHR_ST);
}
// PERF: memory
mem_reads = get_mpm_csr(VX_CSR_MPM_MEM_READS);
mem_writes = get_mpm_csr(VX_CSR_MPM_MEM_WRITES);
@ -524,18 +599,18 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
default:
break;
}
#endif
#endif
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
total_instrs += instrs_per_core;
total_cycles += cycles_per_core;
max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
}
#ifdef PERF_ENABLE
switch (perf_class) {
case VX_DCR_MPM_CLASS_CORE: {
case VX_DCR_MPM_CLASS_CORE: {
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles);
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles);
@ -547,22 +622,22 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
calcAvgPercent(scrb_alu, scrb_total),
calcAvgPercent(scrb_alu, scrb_total),
calcAvgPercent(scrb_fpu, scrb_total),
calcAvgPercent(scrb_lsu, scrb_total),
calcAvgPercent(scrb_sfu, scrb_total));
calcAvgPercent(scrb_sfu, scrb_total));
fprintf(stream, "PERF: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
, scrb_sfu
, scrb_sfu
, calcAvgPercent(scrb_csrs, sfu_total)
, calcAvgPercent(scrb_wctl, sfu_total)
);
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
fprintf(stream, "PERF: loads=%ld\n", loads);
fprintf(stream, "PERF: stores=%ld\n", stores);
fprintf(stream, "PERF: stores=%ld\n", stores);
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
} break;
case VX_DCR_MPM_CLASS_MEM: {
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
} break;
case VX_DCR_MPM_CLASS_MEM: {
if (l2cache_enable) {
l2cache_reads /= num_cores;
l2cache_writes /= num_cores;
@ -577,12 +652,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, read_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, bank_utilization);
fprintf(stream, "PERF: l2cache mshr stalls=%ld (utilization=%d%%)\n", l2cache_mshr_stalls, mshr_utilization);
}
if (l3cache_enable) {
if (l3cache_enable) {
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
@ -590,66 +665,24 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio);
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization);
fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization);
}
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
} break;
default:
break;
}
#endif
#endif
float IPC = (float)(double(total_instrs) / double(max_cycles));
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
fflush(stream);
return 0;
}
// Read one performance counter from the per-core MPM staging area.
//
// hdevice : device handle, forwarded to vx_dev_caps / vx_copy_from_dev.
// counter : CSR id; the slot read is (counter - VX_CSR_MPM_BASE) within a
//           32-entry, 64-bit-wide window per core.
// core_id : core to read, or -1 to aggregate over every core.
// value   : receives the result. VX_CSR_MCYCLE is aggregated with max(),
//           every other counter is summed.
//
// Returns 0 on success, -1 on an invalid argument, or the error code of the
// failing runtime call.
extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
  if (nullptr == value)
    return -1;

  uint64_t num_cores;
  RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
    return _ret;
  });

  // -1 is the only negative id with a meaning ("all cores")
  if (core_id < -1 || core_id >= (int)num_cores) {
    std::cout << "error: core_id out of range" << std::endl;
    return -1;
  }

  std::vector<uint64_t> staging_buf(32);

  // Validate the slot up front: letting at() throw would propagate a C++
  // exception out of this C-callable entry point.
  if (counter < VX_CSR_MPM_BASE
   || size_t(counter - VX_CSR_MPM_BASE) >= staging_buf.size()) {
    std::cout << "error: counter out of range" << std::endl;
    return -1;
  }
  const size_t counter_idx = size_t(counter - VX_CSR_MPM_BASE);

  uint64_t _value = 0;

  // Restrict the scan to [core_id, core_id + 1) when a single core is requested.
  unsigned first_core = 0;
  if (core_id != -1) {
    first_core = core_id;
    num_cores = core_id + 1;
  }

  // Start at first_core: restarting from 0 would fold cores 0..core_id-1
  // into a single-core query.
  for (unsigned i = first_core; i < num_cores; ++i) {
    uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size() * sizeof(uint64_t);
    RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size() * sizeof(uint64_t)), {
      return _ret;
    });

    auto per_core_value = staging_buf[counter_idx];
    if (counter == VX_CSR_MCYCLE) {
      _value = std::max<uint64_t>(per_core_value, _value);
    } else {
      _value += per_core_value;
    }
  }

  // output
  *value = _value;
  return 0;
}
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -20,11 +20,11 @@
#include <VX_types.h>
class DeviceConfig {
public:
public:
void write(uint32_t addr, uint32_t value);
uint32_t read(uint32_t addr) const;
int read(uint32_t addr, uint32_t* value) const;
private:
std::unordered_map<uint32_t, uint32_t> data_;
std::unordered_map<uint32_t, uint32_t> store_;
};
int dcr_initialize(vx_device_h device);
@ -39,7 +39,6 @@ void perf_remove_device(vx_device_h device);
#define CACHE_BLOCK_SIZE 64
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
#define ALLOC_MAX_ADDR STARTUP_ADDR
#if (XLEN == 64)
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
#else

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -23,9 +23,10 @@ extern "C" {
#endif
typedef void* vx_device_h;
typedef void* vx_buffer_h;
// device caps ids
#define VX_CAPS_VERSION 0x0
#define VX_CAPS_VERSION 0x0
#define VX_CAPS_NUM_THREADS 0x1
#define VX_CAPS_NUM_WARPS 0x2
#define VX_CAPS_NUM_CORES 0x3
@ -57,6 +58,11 @@ typedef void* vx_device_h;
// ready wait timeout
#define VX_MAX_TIMEOUT (24*60*60*1000) // 24 Hr
// device memory access
#define VX_MEM_READ 0x1
#define VX_MEM_WRITE 0x2
#define VX_MEM_READ_WRITE 0x3
// open the device and connect to it
int vx_dev_open(vx_device_h* hdevice);
@ -67,22 +73,31 @@ int vx_dev_close(vx_device_h hdevice);
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
// allocate device memory and return address
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_addr);
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer);
// reserve memory address range
int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer);
// release device memory
int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
int vx_mem_free(vx_buffer_h hbuffer);
// set device memory access rights
int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags);
// return device memory address
int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address);
// get device memory info
int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used);
// Copy bytes from host to device memory
int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size);
// Copy bytes from device memory to host
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size);
// Start device execution
int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr);
int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments);
// Wait for device ready with milliseconds timeout
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
@ -93,23 +108,25 @@ int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value);
// write device configuration registers
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value);
// query device performance counter
int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value);
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
// upload kernel bytes to device
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr);
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer);
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr);
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer);
// upload bytes to device
int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr);
int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer);
// upload file to device
int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr);
int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer);
// performance counters
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -30,6 +30,8 @@
#include <util.h>
#include <processor.h>
using namespace vortex;
#define RAM_PAGE_SIZE 4096
#ifndef NDEBUG
@ -38,37 +40,109 @@
#define DBGPRINT(format, ...) ((void)0)
#endif
using namespace vortex;
// Evaluate _expr once and store the result in a local named `err`.
// A zero result leaves the do/while immediately. Any other result prints the
// stringized expression with its value, then runs _cleanup, which is pasted
// verbatim and may therefore refer to `err` (e.g. `return err;`).
// The message embeds the literal text of _expr, so changing the expression's
// spelling changes the output.
#define CHECK_ERR(_expr, _cleanup) \
do { \
auto err = _expr; \
if (err == 0) \
break; \
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
_cleanup \
} while (false)
///////////////////////////////////////////////////////////////////////////////
class vx_device {
class vx_device {
public:
vx_device()
vx_device()
: ram_(0, RAM_PAGE_SIZE)
, global_mem_(
ALLOC_BASE_ADDR,
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
RAM_PAGE_SIZE,
CACHE_BLOCK_SIZE)
, global_mem_(ALLOC_BASE_ADDR, GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE)
{
processor_.attach_ram(&ram_);
}
~vx_device() {
~vx_device() {
if (future_.valid()) {
future_.wait();
}
}
int mem_alloc(uint64_t size, uint64_t* dev_addr) {
return global_mem_.allocate(size, dev_addr);
// Look up a device capability by id and store it in *value.
// Every supported id resolves to a build-time configuration constant.
// An unknown id is reported on stdout and terminates the process via
// std::abort(); *value is written only for a recognized id.
// Returns 0 on success.
int get_caps(uint32_t caps_id, uint64_t *value) {
  uint64_t caps;
  switch (caps_id) {
  case VX_CAPS_VERSION:         caps = IMPLEMENTATION_ID; break;
  case VX_CAPS_NUM_THREADS:     caps = NUM_THREADS; break;
  case VX_CAPS_NUM_WARPS:       caps = NUM_WARPS; break;
  case VX_CAPS_NUM_CORES:       caps = NUM_CORES * NUM_CLUSTERS; break;
  case VX_CAPS_CACHE_LINE_SIZE: caps = CACHE_BLOCK_SIZE; break;
  case VX_CAPS_GLOBAL_MEM_SIZE: caps = GLOBAL_MEM_SIZE; break;
  case VX_CAPS_LOCAL_MEM_SIZE:  caps = (1 << LMEM_LOG_SIZE); break;
  case VX_CAPS_LOCAL_MEM_ADDR:  caps = LMEM_BASE_ADDR; break;
  case VX_CAPS_ISA_FLAGS:
    // extension bits in the upper word, XLEN encoding at bit 30, standard bits below
    caps = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1; // not reached; abort() does not return
  }
  *value = caps;
  return 0;
}
// Allocate `size` bytes from global_mem_ and apply `flags` to the new range.
// On success stores the allocated device address in *dev_addr and returns 0.
// On failure returns the non-zero code of the failing step and leaves
// *dev_addr untouched.
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
uint64_t addr;
CHECK_ERR(global_mem_.allocate(size, &addr), {
return err;
});
// Apply the access flags; if that fails, hand the range back so the
// allocation is not left behind.
CHECK_ERR(this->mem_access(addr, size, flags), {
global_mem_.release(addr);
return err;
});
// publish the address only after both steps succeeded
*dev_addr = addr;
return 0;
}
// Reserve the caller-chosen range [dev_addr, dev_addr + size) in global_mem_
// and apply `flags` to it.
// Returns 0 on success, otherwise the non-zero code of the failing step.
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
return err;
});
// Apply the access flags; if that fails, undo the reservation.
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
global_mem_.release(dev_addr);
return err;
});
return 0;
}
// Release the range starting at dev_addr back to global_mem_.
// Returns whatever global_mem_.release() returns; access flags on the range
// are not touched here.
int mem_free(uint64_t dev_addr) {
return global_mem_.release(dev_addr);
}
// Set the access flags on the device range [dev_addr, dev_addr + size).
//
// The range, with its size rounded up to CACHE_BLOCK_SIZE, must lie inside
// GLOBAL_MEM_SIZE. `flags` is forwarded unchanged to ram_.set_acl().
// Returns 0 on success, -1 if the range does not fit.
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
  // Reject oversized requests before rounding up, so the alignment step
  // itself cannot wrap around.
  if (size > GLOBAL_MEM_SIZE)
    return -1;
  uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
  // Overflow-safe form of `dev_addr + asize > GLOBAL_MEM_SIZE`: a large
  // caller-supplied address must not wrap past zero and pass the check.
  if (asize > GLOBAL_MEM_SIZE || dev_addr > GLOBAL_MEM_SIZE - asize)
    return -1;
  ram_.set_acl(dev_addr, size, flags);
  return 0;
}
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
if (mem_free)
*mem_free = global_mem_.free();
@ -82,6 +156,10 @@ public:
if (dest_addr + asize > GLOBAL_MEM_SIZE)
return -1;
ram_.enable_acl(false);
ram_.write((const uint8_t*)src, dest_addr, size);
ram_.enable_acl(true);
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src));
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
@ -90,8 +168,7 @@ public:
}
}
printf("\n");*/
ram_.write((const uint8_t*)src, dest_addr, size);
return 0;
}
@ -100,8 +177,10 @@ public:
if (src_addr + asize > GLOBAL_MEM_SIZE)
return -1;
ram_.enable_acl(false);
ram_.read((uint8_t*)dest, src_addr, size);
ram_.enable_acl(true);
/*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest));
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
@ -110,21 +189,21 @@ public:
}
}
printf("\n");*/
return 0;
}
int start(uint64_t krnl_addr, uint64_t args_addr) {
int start(uint64_t krnl_addr, uint64_t args_addr) {
// ensure prior run completed
if (future_.valid()) {
future_.wait();
}
// set kernel info
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
// start new run
future_ = std::async(std::launch::async, [&]{
@ -133,7 +212,7 @@ public:
return 0;
}
int wait(uint64_t timeout) {
int ready_wait(uint64_t timeout) {
if (!future_.valid())
return 0;
uint64_t timeout_sec = timeout / 1000;
@ -141,24 +220,24 @@ public:
for (;;) {
// wait for 1 sec and check status
auto status = future_.wait_for(wait_time);
if (status == std::future_status::ready
if (status == std::future_status::ready
|| 0 == timeout_sec--)
break;
}
return 0;
}
int write_dcr(uint32_t addr, uint32_t value) {
int dcr_write(uint32_t addr, uint32_t value) {
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
processor_.write_dcr(addr, value);
}
processor_.dcr_write(addr, value);
dcrs_.write(addr, value);
return 0;
}
uint64_t read_dcr(uint32_t addr) const {
return dcrs_.read(addr);
int dcr_read(uint32_t addr, uint32_t* value) const {
return dcrs_.read(addr, value);
}
private:
@ -170,51 +249,14 @@ private:
std::future<void> future_;
};
// State behind an opaque vx_buffer_h handle.
// Brace-initialized in declaration order as {device, addr, size}, so the
// field order must not change.
struct vx_buffer {
// device the buffer belongs to; buffer-handle calls are routed through it
vx_device* device;
// device address of the start of the buffer
uint64_t addr;
// buffer size in bytes, used to bounds-check offset/size arguments
uint64_t size;
};
///////////////////////////////////////////////////////////////////////////////
// Query a device capability by id and store it in *value.
// Every supported id resolves to a build-time configuration constant; the
// device handle is only checked for null.
// Returns 0 on success and -1 when hdevice or value is null. An unknown
// caps_id is reported on stdout and terminates the process via std::abort().
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  // value is written unconditionally below, so it must be valid as well
  if (nullptr == hdevice || nullptr == value)
    return -1;

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = IMPLEMENTATION_ID;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = NUM_THREADS;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = NUM_WARPS;
    break;
  case VX_CAPS_NUM_CORES:
    *value = NUM_CORES * NUM_CLUSTERS;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = GLOBAL_MEM_SIZE;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    *value = (1 << LMEM_LOG_SIZE);
    break;
  case VX_CAPS_LOCAL_MEM_ADDR:
    *value = LMEM_BASE_ADDR;
    break;
  case VX_CAPS_ISA_FLAGS:
    // extension bits in the upper word, XLEN encoding at bit 30, standard bits below
    *value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1;
  }
  return 0;
}
extern int vx_dev_open(vx_device_h* hdevice) {
if (nullptr == hdevice)
return -1;
@ -233,6 +275,8 @@ extern int vx_dev_open(vx_device_h* hdevice) {
perf_add_device(device);
#endif
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
*hdevice = device;
return 0;
@ -242,107 +286,228 @@ extern int vx_dev_close(vx_device_h hdevice) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
DBGPRINT("DEV_CLOSE: hdevice=%p\n", hdevice);
#ifdef DUMP_PERF_STATS
perf_remove_device(hdevice);
#endif
vx_device *device = ((vx_device*)hdevice);
delete device;
return 0;
}
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_addr) {
if (nullptr == hdevice
|| nullptr == dev_addr
// Query a device capability by id via vx_device::get_caps().
// Returns 0 and stores the capability in *value on success, -1 when hdevice
// or value is null, otherwise the error code from get_caps().
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  // value is dereferenced on the success path, so reject null up front
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  // Read into a local first so *value is only written on success.
  uint64_t _value;
  CHECK_ERR(device->get_caps(caps_id, &_value), {
    return err;
  });

  DBGPRINT("DEV_CAPS: hdevice=%p, caps_id=%d, value=%ld\n", hdevice, caps_id, _value);

  *value = _value;
  return 0;
}
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
if (nullptr == hdevice
|| nullptr == hbuffer
|| 0 == size)
return -1;
DBGPRINT("MEM_ALLOC: size=%ld\n", size);
auto device = ((vx_device*)hdevice);
vx_device *device = ((vx_device*)hdevice);
return device->mem_alloc(size, dev_addr);
uint64_t dev_addr;
CHECK_ERR(device->mem_alloc(size, flags, &dev_addr), {
return err;
});
auto buffer = new vx_buffer{device, dev_addr, size};
if (nullptr == buffer) {
device->mem_free(dev_addr);
return -1;
}
DBGPRINT("MEM_ALLOC: hdevice=%p, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, size, flags, (void*)buffer);
*hbuffer = buffer;
return 0;
}
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
if (nullptr == hdevice)
extern int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
if (nullptr == hdevice
|| nullptr == hbuffer
|| 0 == size)
return -1;
if (0 == dev_addr)
auto device = ((vx_device*)hdevice);
CHECK_ERR(device->mem_reserve(address, size, flags), {
return err;
});
auto buffer = new vx_buffer{device, address, size};
if (nullptr == buffer) {
device->mem_free(address);
return -1;
}
DBGPRINT("MEM_RESERVE: hdevice=%p, address=0x%lx, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, address, size, flags, (void*)buffer);
*hbuffer = buffer;
return 0;
}
extern int vx_mem_free(vx_buffer_h hbuffer) {
if (nullptr == hbuffer)
return 0;
DBGPRINT("MEM_FREE: dev_addr=0x%lx\n", dev_addr);
DBGPRINT("MEM_FREE: hbuffer=%p\n", hbuffer);
vx_device *device = ((vx_device*)hdevice);
return device->mem_free(dev_addr);
auto buffer = ((vx_buffer*)hbuffer);
auto device = ((vx_device*)buffer->device);
vx_mem_access(hbuffer, 0, buffer->size, 0);
int err = device->mem_free(buffer->addr);
delete buffer;
return err;
}
// Set the access flags on the sub-range [offset, offset + size) of a buffer.
// `flags` is forwarded unchanged to vx_device::mem_access().
// Returns -1 when hbuffer is null or the sub-range does not fit inside the
// buffer, otherwise the result of vx_device::mem_access().
extern int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
  if (nullptr == hbuffer)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);

  // Overflow-safe form of `(offset + size) > buffer->size`: the sum may wrap
  // for large arguments and slip past the bounds check.
  if (offset > buffer->size || size > buffer->size - offset)
    return -1;

  DBGPRINT("MEM_ACCESS: hbuffer=%p, offset=%ld, size=%ld, flags=%d\n", hbuffer, offset, size, flags);

  return device->mem_access(buffer->addr + offset, size, flags);
}
// Return the device address backing a buffer handle.
// Stores the address in *address and returns 0, or returns -1 when hbuffer
// or address is null.
extern int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
  // address is written unconditionally below, so it must be valid as well
  if (nullptr == hbuffer || nullptr == address)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);

  DBGPRINT("MEM_ADDRESS: hbuffer=%p, address=0x%lx\n", hbuffer, buffer->addr);

  *address = buffer->addr;
  return 0;
}
extern int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
if (nullptr == hdevice)
return -1;
DBGPRINT("%s\n", "MEM_INFO");
auto device = ((vx_device*)hdevice);
uint64_t _mem_free, _mem_used;
CHECK_ERR(device->mem_info(&_mem_free, &_mem_used), {
return err;
});
DBGPRINT("MEM_INFO: hdevice=%p, mem_free=%ld, mem_used=%ld\n", hdevice, _mem_free, _mem_used);
if (mem_free) {
*mem_free = _mem_free;
}
if (mem_used) {
*mem_used = _mem_used;
}
return 0;
}
// Copy `size` bytes from host memory into a device buffer at `dst_offset`.
// Returns -1 when hbuffer or host_ptr is null or the destination range does
// not fit inside the buffer, otherwise the result of vx_device::upload().
extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
  if (nullptr == hbuffer || nullptr == host_ptr)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);

  // Overflow-safe form of `(dst_offset + size) > buffer->size`: the sum may
  // wrap for large arguments and allow a write outside the buffer.
  if (dst_offset > buffer->size || size > buffer->size - dst_offset)
    return -1;

  DBGPRINT("COPY_TO_DEV: hbuffer=%p, host_addr=%p, dst_offset=%ld, size=%ld\n", hbuffer, host_ptr, dst_offset, size);

  return device->upload(buffer->addr + dst_offset, host_ptr, size);
}
// Copy `size` bytes from a device buffer at `src_offset` into host memory.
// Returns -1 when hbuffer or host_ptr is null or the source range does not
// fit inside the buffer, otherwise the result of vx_device::download().
extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
  if (nullptr == hbuffer || nullptr == host_ptr)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);

  // Overflow-safe form of `(src_offset + size) > buffer->size`: the sum may
  // wrap for large arguments and allow a read outside the buffer.
  if (src_offset > buffer->size || size > buffer->size - src_offset)
    return -1;

  DBGPRINT("COPY_FROM_DEV: hbuffer=%p, host_addr=%p, src_offset=%ld, size=%ld\n", hbuffer, host_ptr, src_offset, size);

  return device->download(host_ptr, buffer->addr + src_offset, size);
}
extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
if (nullptr == hdevice || nullptr == hkernel || nullptr == harguments)
return -1;
DBGPRINT("START: hdevice=%p, hkernel=%p, harguments=%p\n", hdevice, hkernel, harguments);
auto device = ((vx_device*)hdevice);
return device->mem_info(mem_free, mem_used);
}
auto kernel = ((vx_buffer*)hkernel);
auto arguments = ((vx_buffer*)harguments);
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
if (nullptr == hdevice)
return -1;
auto device = (vx_device*)hdevice;
DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
return device->upload(dev_addr, host_ptr, size);
}
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
if (nullptr == hdevice)
return -1;
auto device = (vx_device*)hdevice;
DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
return device->download(host_ptr, dev_addr, size);
}
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
if (nullptr == hdevice)
return -1;
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
vx_device *device = ((vx_device*)hdevice);
return device->start(krnl_addr, args_addr);
return device->start(kernel->addr, arguments->addr);
}
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
if (nullptr == hdevice)
return -1;
DBGPRINT("%s\n", "WAIT");
return -1;
vx_device *device = ((vx_device*)hdevice);
return device->wait(timeout);
DBGPRINT("READY_WAIT: hdevice=%p, timeout=%ld\n", hdevice, timeout);
auto device = ((vx_device*)hdevice);
return device->ready_wait(timeout);
}
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
if (nullptr == hdevice || NULL == value)
return -1;
vx_device *device = ((vx_device*)hdevice);
auto device = ((vx_device*)hdevice);
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
uint32_t _value;
*value = device->read_dcr(addr);
CHECK_ERR(device->dcr_read(addr, &_value), {
return err;
});
DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);
DBGPRINT("DCR_READ: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, _value);
*value = _value;
return 0;
}
@ -351,13 +516,34 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
DBGPRINT("DCR_WRITE: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, value);
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
auto device = ((vx_device*)hdevice);
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
return device->dcr_write(addr, value);
}
return device->write_dcr(addr, value);
// Reads one 64-bit performance counter for a given core.
//   hdevice : device handle; must not be null.
//   addr    : counter CSR address; must lie in [VX_CSR_MPM_BASE, VX_CSR_MPM_BASE + 31].
//   core_id : index of the core whose counter block is read.
//   value   : receives the counter; must not be null.
// Returns 0 on success, -1 on invalid arguments, or the download error code.
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
  // reject a null output pointer up front instead of faulting after the read
  if (nullptr == hdevice || nullptr == value)
    return -1;
  // unsigned subtraction: an addr below the base wraps to a large offset and is rejected too
  uint32_t offset = addr - VX_CSR_MPM_BASE;
  if (offset > 31)
    return -1;
  auto device = ((vx_device*)hdevice);
  // each core owns 32 consecutive 64-bit slots starting at IO_MPM_ADDR
  uint64_t mpm_mem_addr = IO_MPM_ADDR + (core_id * 32 + offset) * sizeof(uint64_t);
  uint64_t _value;
  CHECK_ERR(device->download(&_value, mpm_mem_addr, sizeof(uint64_t)), {
    return err;
  });
  DBGPRINT("MPM_QUERY: hdevice=%p, addr=0x%x, core_id=%d, value=0x%lx\n", hdevice, addr, core_id, _value);
  *value = _value;
  return 0;
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -33,27 +33,32 @@
#include <mem.h>
#include <constants.h>
using namespace vortex;
// Debug trace helper. When NDEBUG is not defined it prints the formatted
// message through printf with a "[VXDRV] " prefix; when NDEBUG is defined it
// expands to a no-op and its arguments are not evaluated.
#ifndef NDEBUG
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
#else
#define DBGPRINT(format, ...) ((void)0)
#endif
using namespace vortex;
// Evaluates _expr once and stores the result in a local named `err`.
// A zero result leaves the macro silently. Any other result is reported via
// printf (the expression text is stringified into the message) and then the
// _cleanup statement block runs with `err` still in scope, so the block can
// release resources and `return err;`.
#define CHECK_ERR(_expr, _cleanup) \
do { \
auto err = _expr; \
if (err == 0) \
break; \
printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
_cleanup \
} while (false)
///////////////////////////////////////////////////////////////////////////////
class vx_device {
class vx_device {
public:
vx_device()
vx_device()
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
, ram_(0, RAM_PAGE_SIZE)
, processor_(arch_)
, global_mem_(
ALLOC_BASE_ADDR,
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
RAM_PAGE_SIZE,
CACHE_BLOCK_SIZE)
, global_mem_(ALLOC_BASE_ADDR, GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE)
{
// attach memory module
processor_.attach_ram(&ram_);
@ -63,16 +68,84 @@ public:
if (future_.valid()) {
future_.wait();
}
}
}
int mem_alloc(uint64_t size, uint64_t* dev_addr) {
return global_mem_.allocate(size, dev_addr);
// Looks up the device capability selected by caps_id and stores it in *value.
// Returns 0 on success. An unrecognized caps_id is reported on stdout and the
// process is aborted, so the -1 after std::abort() is never actually returned.
int get_caps(uint32_t caps_id, uint64_t *value) {
  uint64_t result;
  switch (caps_id) {
  case VX_CAPS_VERSION:         result = IMPLEMENTATION_ID; break;
  case VX_CAPS_NUM_THREADS:     result = NUM_THREADS; break;
  case VX_CAPS_NUM_WARPS:       result = NUM_WARPS; break;
  // total core count across every cluster
  case VX_CAPS_NUM_CORES:       result = NUM_CORES * NUM_CLUSTERS; break;
  case VX_CAPS_CACHE_LINE_SIZE: result = CACHE_BLOCK_SIZE; break;
  case VX_CAPS_GLOBAL_MEM_SIZE: result = GLOBAL_MEM_SIZE; break;
  case VX_CAPS_LOCAL_MEM_SIZE:  result = (1 << LMEM_LOG_SIZE); break;
  case VX_CAPS_LOCAL_MEM_ADDR:  result = LMEM_BASE_ADDR; break;
  // extension bits in the upper word, XLEN encoding at bit 30, standard bits below
  case VX_CAPS_ISA_FLAGS:
    result = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1;
  }
  // the caller's storage is only written once a valid id has been resolved
  *value = result;
  return 0;
}
// Allocates `size` bytes from the global memory allocator, applies the access
// `flags` to the new range and returns its device address through *dev_addr.
// Returns 0 on success, otherwise the error code of the call that failed.
// *dev_addr is only written on success.
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
uint64_t addr;
CHECK_ERR(global_mem_.allocate(size, &addr), {
return err;
});
CHECK_ERR(this->mem_access(addr, size, flags), {
// roll back the allocation so a failed access setup does not leak the range
global_mem_.release(addr);
return err;
});
*dev_addr = addr;
return 0;
}
// Asks the global memory allocator to reserve `size` bytes at the fixed
// address dev_addr, then applies the access `flags` to that range.
// Returns 0 on success, otherwise the error code of the call that failed.
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
return err;
});
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
// roll back the reservation when the access flags could not be applied
global_mem_.release(dev_addr);
return err;
});
return 0;
}
// Hands the range starting at dev_addr back to the global memory allocator
// and forwards the allocator's status code unchanged.
int mem_free(uint64_t dev_addr) {
  const auto status = global_mem_.release(dev_addr);
  return status;
}
// Applies the access `flags` to the device range [dev_addr, dev_addr + size).
// The range, rounded up to a whole number of cache blocks, must fit inside
// global memory. Returns 0 on success and -1 when the range is out of bounds.
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
  const uint64_t limit = GLOBAL_MEM_SIZE;
  uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
  // a rounded-up size smaller than the request means the rounding wrapped around
  if (asize < size)
    return -1;
  // compare by subtraction: `dev_addr + asize` could wrap past zero and
  // wrongly pass a `> limit` test for addresses near the top of the range
  if (asize > limit || dev_addr > limit - asize)
    return -1;
  // the ACL covers the requested size; the aligned size is only used for the bounds check
  ram_.set_acl(dev_addr, size, flags);
  return 0;
}
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
if (mem_free)
*mem_free = global_mem_.free();
@ -86,13 +159,15 @@ public:
if (dest_addr + asize > GLOBAL_MEM_SIZE)
return -1;
ram_.enable_acl(false);
ram_.write((const uint8_t*)src, dest_addr, size);
ram_.enable_acl(true);
/*DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr);
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
DBGPRINT(" 0x%lx <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + i));
}*/
return 0;
}
@ -101,37 +176,39 @@ public:
if (src_addr + asize > GLOBAL_MEM_SIZE)
return -1;
ram_.enable_acl(false);
ram_.read((uint8_t*)dest, src_addr, size);
ram_.enable_acl(true);
/*DBGPRINT("download %ld bytes from 0x%lx\n", size, src_addr);
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
DBGPRINT(" 0x%lx -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + i));
}*/
return 0;
}
int start(uint64_t krnl_addr, uint64_t args_addr) {
int start(uint64_t krnl_addr, uint64_t args_addr) {
// ensure prior run completed
if (future_.valid()) {
future_.wait();
}
// set kernel info
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
// start new run
future_ = std::async(std::launch::async, [&]{
processor_.run();
});
return 0;
}
int wait(uint64_t timeout) {
int ready_wait(uint64_t timeout) {
if (!future_.valid())
return 0;
uint64_t timeout_sec = timeout / 1000;
@ -139,24 +216,24 @@ public:
for (;;) {
// wait for 1 sec and check status
auto status = future_.wait_for(wait_time);
if (status == std::future_status::ready
if (status == std::future_status::ready
|| 0 == timeout_sec--)
break;
}
return 0;
}
int write_dcr(uint32_t addr, uint32_t value) {
int dcr_write(uint32_t addr, uint32_t value) {
if (future_.valid()) {
future_.wait(); // ensure prior run completed
}
processor_.write_dcr(addr, value);
}
processor_.dcr_write(addr, value);
dcrs_.write(addr, value);
return 0;
}
uint64_t read_dcr(uint32_t addr) const {
return dcrs_.read(addr);
int dcr_read(uint32_t addr, uint32_t* value) const {
return dcrs_.read(addr, value);
}
private:
@ -168,6 +245,12 @@ private:
std::future<void> future_;
};
// Record behind a vx_buffer_h handle: ties a device memory range to the
// device it was allocated or reserved on. Created with positional brace
// initialisation, so the member order below must not change.
struct vx_buffer {
// device that owns the range
vx_device* device;
// device address of the first byte of the range
uint64_t addr;
// length of the range in bytes, as requested by the caller
uint64_t size;
};
///////////////////////////////////////////////////////////////////////////////
extern int vx_dev_open(vx_device_h* hdevice) {
@ -186,12 +269,12 @@ extern int vx_dev_open(vx_device_h* hdevice) {
#ifdef DUMP_PERF_STATS
perf_add_device(device);
#endif
#endif
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
*hdevice = device;
DBGPRINT("device creation complete!\n");
return 0;
}
@ -199,7 +282,9 @@ extern int vx_dev_close(vx_device_h hdevice) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
DBGPRINT("DEV_CLOSE: hdevice=%p\n", hdevice);
auto device = ((vx_device*)hdevice);
#ifdef DUMP_PERF_STATS
perf_remove_device(hdevice);
@ -207,144 +292,218 @@ extern int vx_dev_close(vx_device_h hdevice) {
delete device;
DBGPRINT("device destroyed!\n");
return 0;
}
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
if (nullptr == hdevice)
return -1;
//vx_device *device = ((vx_device*)hdevice);
switch (caps_id) {
case VX_CAPS_VERSION:
*value = IMPLEMENTATION_ID;
break;
case VX_CAPS_NUM_THREADS:
*value = NUM_THREADS;
break;
case VX_CAPS_NUM_WARPS:
*value = NUM_WARPS;
break;
case VX_CAPS_NUM_CORES:
*value = NUM_CORES * NUM_CLUSTERS;
break;
case VX_CAPS_CACHE_LINE_SIZE:
*value = CACHE_BLOCK_SIZE;
break;
case VX_CAPS_GLOBAL_MEM_SIZE:
*value = GLOBAL_MEM_SIZE;
break;
case VX_CAPS_LOCAL_MEM_SIZE:
*value = (1 << LMEM_LOG_SIZE);
break;
case VX_CAPS_LOCAL_MEM_ADDR:
*value = LMEM_BASE_ADDR;
break;
case VX_CAPS_ISA_FLAGS:
*value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break;
default:
std::cout << "invalid caps id: " << caps_id << std::endl;
std::abort();
return -1;
}
vx_device *device = ((vx_device*)hdevice);
uint64_t _value;
CHECK_ERR(device->get_caps(caps_id, &_value), {
return err;
});
DBGPRINT("DEV_CAPS: hdevice=%p, caps_id=%d, value=%ld\n", hdevice, caps_id, _value);
*value = _value;
return 0;
}
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_addr) {
if (nullptr == hdevice
|| nullptr == dev_addr
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
if (nullptr == hdevice
|| nullptr == hbuffer
|| 0 == size)
return -1;
DBGPRINT("MEM_ALLOC: size=%ld\n", size);
auto device = ((vx_device*)hdevice);
vx_device *device = ((vx_device*)hdevice);
return device->mem_alloc(size, dev_addr);
uint64_t dev_addr;
CHECK_ERR(device->mem_alloc(size, flags, &dev_addr), {
return err;
});
auto buffer = new vx_buffer{device, dev_addr, size};
if (nullptr == buffer) {
device->mem_free(dev_addr);
return -1;
}
DBGPRINT("MEM_ALLOC: hdevice=%p, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, size, flags, (void*)buffer);
*hbuffer = buffer;
return 0;
}
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
if (nullptr == hdevice)
extern int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
if (nullptr == hdevice
|| nullptr == hbuffer
|| 0 == size)
return -1;
if (0 == dev_addr)
auto device = ((vx_device*)hdevice);
CHECK_ERR(device->mem_reserve(address, size, flags), {
return err;
});
auto buffer = new vx_buffer{device, address, size};
if (nullptr == buffer) {
device->mem_free(address);
return -1;
}
DBGPRINT("MEM_RESERVE: hdevice=%p, address=0x%lx, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, address, size, flags, (void*)buffer);
*hbuffer = buffer;
return 0;
}
extern int vx_mem_free(vx_buffer_h hbuffer) {
if (nullptr == hbuffer)
return 0;
DBGPRINT("MEM_FREE: dev_addr=0x%lx\n", dev_addr);
DBGPRINT("MEM_FREE: hbuffer=%p\n", hbuffer);
vx_device *device = ((vx_device*)hdevice);
return device->mem_free(dev_addr);
auto buffer = ((vx_buffer*)hbuffer);
auto device = ((vx_device*)buffer->device);
vx_mem_access(hbuffer, 0, buffer->size, 0);
int err = device->mem_free(buffer->addr);
delete buffer;
return err;
}
// Changes the access flags of a sub-range of a buffer.
//   hbuffer : buffer handle; must not be null.
//   offset  : start of the sub-range, in bytes from the beginning of the buffer.
//   size    : length of the sub-range in bytes.
//   flags   : access flags forwarded to the device.
// Returns -1 for a null handle or a sub-range that does not fit in the buffer,
// otherwise the device's status code.
extern int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
  if (nullptr == hbuffer)
    return -1;
  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);
  // compare by subtraction: `offset + size` could wrap around and let an
  // out-of-range request through
  if (offset > buffer->size || size > buffer->size - offset)
    return -1;
  DBGPRINT("MEM_ACCESS: hbuffer=%p, offset=%ld, size=%ld, flags=%d\n", hbuffer, offset, size, flags);
  return device->mem_access(buffer->addr + offset, size, flags);
}
// Retrieves the device address backing a buffer handle.
//   hbuffer : buffer handle; must not be null.
//   address : receives the device address; must not be null.
// Returns 0 on success and -1 when either pointer is null.
extern int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
  // validate the output pointer as well, rather than crashing on the store below
  if (nullptr == hbuffer || nullptr == address)
    return -1;
  auto buffer = ((vx_buffer*)hbuffer);
  DBGPRINT("MEM_ADDRESS: hbuffer=%p, address=0x%lx\n", hbuffer, buffer->addr);
  *address = buffer->addr;
  return 0;
}
extern int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
if (nullptr == hdevice)
return -1;
DBGPRINT("%s\n", "MEM_INFO");
auto device = ((vx_device*)hdevice);
return device->mem_info(mem_free, mem_used);
uint64_t _mem_free, _mem_used;
CHECK_ERR(device->mem_info(&_mem_free, &_mem_used), {
return err;
});
DBGPRINT("MEM_INFO: hdevice=%p, mem_free=%ld, mem_used=%ld\n", hdevice, _mem_free, _mem_used);
if (mem_free) {
*mem_free = _mem_free;
}
if (mem_used) {
*mem_used = _mem_used;
}
return 0;
}
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
if (nullptr == hdevice)
extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
if (nullptr == hbuffer || nullptr == host_ptr)
return -1;
auto device = ((vx_device*)hdevice);
auto buffer = ((vx_buffer*)hbuffer);
auto device = ((vx_device*)buffer->device);
DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
return device->upload(dev_addr, host_ptr, size);
}
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
if (nullptr == hdevice)
if ((dst_offset + size) > buffer->size)
return -1;
auto device = ((vx_device*)hdevice);
DBGPRINT("COPY_TO_DEV: hbuffer=%p, host_addr=%p, dst_offset=%ld, size=%ld\n", hbuffer, host_ptr, dst_offset, size);
DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
return device->download(host_ptr, dev_addr, size);
return device->upload(buffer->addr + dst_offset, host_ptr, size);
}
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
if (nullptr == hdevice)
return -1;
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
if (nullptr == hbuffer || nullptr == host_ptr)
return -1;
vx_device *device = ((vx_device*)hdevice);
return device->start(krnl_addr, args_addr);
auto buffer = ((vx_buffer*)hbuffer);
auto device = ((vx_device*)buffer->device);
if ((src_offset + size) > buffer->size)
return -1;
DBGPRINT("COPY_FROM_DEV: hbuffer=%p, host_addr=%p, src_offset=%ld, size=%ld\n", hbuffer, host_ptr, src_offset, size);
return device->download(host_ptr, buffer->addr + src_offset, size);
}
// Launches a kernel on the device.
//   hdevice    : device handle.
//   hkernel    : buffer whose device address is used as the kernel address.
//   harguments : buffer whose device address is used as the arguments address.
// Returns -1 when any handle is null, otherwise the device's start() status.
extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
  const bool missing_handle = (nullptr == hdevice)
                           || (nullptr == hkernel)
                           || (nullptr == harguments);
  if (missing_handle)
    return -1;

  DBGPRINT("START: hdevice=%p, hkernel=%p, harguments=%p\n", hdevice, hkernel, harguments);

  // only the base addresses of the two buffers are handed to the device
  const uint64_t krnl_addr = ((vx_buffer*)hkernel)->addr;
  const uint64_t args_addr = ((vx_buffer*)harguments)->addr;
  return ((vx_device*)hdevice)->start(krnl_addr, args_addr);
}
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
if (nullptr == hdevice)
return -1;
DBGPRINT("%s\n", "WAIT");
DBGPRINT("READY_WAIT: hdevice=%p, timeout=%ld\n", hdevice, timeout);
vx_device *device = ((vx_device*)hdevice);
return device->wait(timeout);
auto device = ((vx_device*)hdevice);
return device->ready_wait(timeout);
}
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
if (nullptr == hdevice || NULL == value)
return -1;
vx_device *device = ((vx_device*)hdevice);
auto device = ((vx_device*)hdevice);
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
return -1;
uint32_t _value;
*value = device->read_dcr(addr);
CHECK_ERR(device->dcr_read(addr, &_value), {
return err;
});
DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);
DBGPRINT("DCR_READ: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, _value);
*value = _value;
return 0;
}
@ -353,13 +512,34 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
if (nullptr == hdevice)
return -1;
vx_device *device = ((vx_device*)hdevice);
DBGPRINT("DCR_WRITE: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, value);
// Ensure ready for new command
if (vx_ready_wait(hdevice, -1) != 0)
auto device = ((vx_device*)hdevice);
return device->dcr_write(addr, value);
}
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
if (nullptr == hdevice)
return -1;
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
return device->write_dcr(addr, value);
}
uint32_t offset = addr - VX_CSR_MPM_BASE;
if (offset > 31)
return -1;
auto device = ((vx_device*)hdevice);
uint64_t mpm_mem_addr = IO_MPM_ADDR + (core_id * 32 + offset) * sizeof(uint64_t);
uint64_t _value;
CHECK_ERR(device->download(&_value, mpm_mem_addr, sizeof(uint64_t)), {
return err;
});
DBGPRINT("MPM_QUERY: hdevice=%p, addr=0x%x, core_id=%d, value=0x%lx\n", hdevice, addr, core_id, _value);
*value = _value;
return 0;
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -25,11 +25,23 @@ extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t*
return -1;
}
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, uint64_t* /*dev_addr*/) {
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, int /*flags*/, vx_buffer_h* /*hbuffer*/) {
return -1;
}
extern int vx_mem_free(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/) {
// Stub: ignores every argument and always reports failure (-1).
extern int vx_mem_reserve(vx_device_h /*hdevice*/, uint64_t /*address*/, uint64_t /*size*/, int /*flags*/, vx_buffer_h* /*hbuffer*/) {
return -1;
}
// Stub: ignores its argument and always reports failure (-1).
extern int vx_mem_free(vx_buffer_h /*hbuffer*/) {
return -1;
}
// Stub: ignores every argument and always reports failure (-1).
extern int vx_mem_access(vx_buffer_h /*hbuffer*/, uint64_t /*offset*/, uint64_t /*size*/, int /*flags*/) {
return -1;
}
// Stub: ignores every argument and always reports failure (-1).
extern int vx_mem_address(vx_buffer_h /*hbuffer*/, uint64_t* /*address*/) {
return -1;
}
@ -37,15 +49,15 @@ extern int vx_mem_info(vx_device_h /*hdevice*/, uint64_t* /*mem_free*/, uint64_t
return 0;
}
extern int vx_copy_to_dev(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/, const void* /*host_ptr*/, uint64_t /*size*/) {
extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, const void* /*host_ptr*/, uint64_t /*dst_offset*/, uint64_t /*size*/) {
return -1;
}
extern int vx_copy_from_dev(vx_device_h /*hdevice*/, void* /*host_ptr*/, uint64_t /*dev_addr*/, uint64_t /*size*/) {
extern int vx_copy_from_dev(void* /*host_ptr*/, vx_buffer_h /*hbuffer*/, uint64_t /*src_offset*/, uint64_t /*size*/) {
return -1;
}
extern int vx_start(vx_device_h /*hdevice*/, uint64_t /*krnl_addr*/, uint64_t /*args_add*/) {
extern int vx_start(vx_device_h /*hdevice*/, vx_buffer_h /*hkernel*/, vx_buffer_h /*harguments*/) {
return -1;
}
@ -61,3 +73,7 @@ extern int vx_dcr_read(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t* /*v
extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*value*/) {
return -1;
}
// Stub: ignores every argument and always reports failure (-1).
extern int vx_mpm_query(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*core_id*/, uint64_t* /*value*/) {
return -1;
}

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -20,7 +20,7 @@
using namespace vortex;
RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
: wordSize_(wordSize) {
std::ifstream input(filename);
@ -39,19 +39,19 @@ RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
}
RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
: contents_(size)
: contents_(size)
, wordSize_(wordSize)
{}
void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
auto addr_end = addr + size;
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
|| (addr_end & (wordSize_-1))
|| (addr_end <= contents_.size())) {
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n";
throw BadAddress();
}
}
const uint8_t *s = contents_.data() + addr;
for (uint8_t *d = (uint8_t*)data, *de = d + size; d != de;) {
*d++ = *s++;
@ -61,7 +61,7 @@ void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
void RamMemDevice::write(const void* data, uint64_t addr, uint64_t size) {
auto addr_end = addr + size;
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
|| (addr_end & (wordSize_-1))
|| (addr_end <= contents_.size())) {
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n";
throw BadAddress();
@ -106,7 +106,7 @@ void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) {
if (!this->lookup(addr, size, &ma)) {
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
throw BadAddress();
}
}
ma.md->read(data, ma.addr, size);
}
@ -153,7 +153,7 @@ uint64_t MemoryUnit::toPhyAddr(uint64_t addr, uint32_t flagMask) {
TLBEntry t = this->tlbLookup(addr, flagMask);
pAddr = t.pfn * pageSize_ + addr % pageSize_;
} else {
pAddr = addr;
pAddr = addr;
}
return pAddr;
}
@ -190,14 +190,90 @@ void MemoryUnit::tlbRm(uint64_t va) {
///////////////////////////////////////////////////////////////////////////////
RAM::RAM(uint64_t capacity, uint32_t page_size)
// Assigns `flags` to the address range [addr, addr + size).
// Existing entries that overlap the range are trimmed, split or removed so the
// map stays non-overlapping. A zero `flags` only clears the range; a non-zero
// `flags` inserts a new entry and merges it with directly adjacent entries
// that carry the same flags. A zero `size` is a no-op.
void ACLManager::set(uint64_t addr, uint64_t size, int flags) {
  if (size == 0)
    return;

  const uint64_t end = addr + size;

  // Start from the entry just before addr when it reaches addr, otherwise from
  // the first entry that begins at or after addr.
  auto cursor = acl_map_.lower_bound(addr);
  if (cursor != acl_map_.begin()) {
    auto before = std::prev(cursor);
    if (before->second.end >= addr) {
      cursor = before;
    }
  }

  // Carve the new range out of every entry that begins before `end`.
  while (cursor != acl_map_.end() && cursor->first < end) {
    const auto victim = cursor;
    ++cursor; // step first: victim may be erased below
    const uint64_t victim_end = victim->second.end;
    if (victim_end <= addr)
      continue; // ends at or before the new range: untouched

    // The part of the entry sticking out past `end` survives as its own entry.
    if (victim_end > end) {
      acl_map_[end] = {victim_end, victim->second.flags};
    }

    if (victim->first < addr) {
      // keep the leading part that lies before the new range
      victim->second.end = addr;
    } else {
      // entry starts inside the new range: nothing of its head remains
      acl_map_.erase(victim);
    }
  }

  // Zero flags means "clear only".
  if (flags == 0)
    return;

  auto inserted = acl_map_.emplace(addr, acl_entry_t{end, flags}).first;

  // Fold into the left neighbour when it ends exactly at addr with equal flags.
  if (inserted != acl_map_.begin()) {
    auto left = std::prev(inserted);
    if (left->second.end == addr && left->second.flags == flags) {
      left->second.end = inserted->second.end;
      acl_map_.erase(inserted);
      inserted = left;
    }
  }

  // Absorb the right neighbour when it starts exactly where this entry ends.
  auto right = std::next(inserted);
  if (right != acl_map_.end()
   && inserted->second.end == right->first
   && inserted->second.flags == right->second.flags) {
    inserted->second.end = right->second.end;
    acl_map_.erase(right);
  }
}
// Verifies that every ACL entry overlapping [addr, addr + size) grants all the
// bits in `flags`. Returns false (after logging the violation to stdout) as
// soon as an overlapping entry lacks a requested bit, true otherwise.
// Note: only existing entries are inspected, so parts of the range that have
// no entry at all do not cause a failure.
bool ACLManager::check(uint64_t addr, uint64_t size, int flags) const {
  uint64_t end = addr + size;

  // Start from the entry just before addr when it reaches addr, otherwise from
  // the first entry that begins at or after addr.
  auto it = acl_map_.lower_bound(addr);
  if (it != acl_map_.begin() && (--it)->second.end < addr) {
    ++it;
  }

  while (it != acl_map_.end() && it->first < end) {
    if (it->second.end > addr) {
      if ((it->second.flags & flags) != flags) {
        // std::hex is sticky: save and restore the stream's format flags so
        // later output on std::cout is not silently switched to hexadecimal
        const auto saved_fmt = std::cout.flags();
        std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", flags=" << (it->second.flags ^ flags) << std::endl;
        std::cout.flags(saved_fmt);
        return false; // Overlapping entry is missing at least one required flag bit
      }
      addr = it->second.end; // Move to the end of the current matching range
    }
    ++it;
  }

  return true;
}
///////////////////////////////////////////////////////////////////////////////
RAM::RAM(uint64_t capacity, uint32_t page_size)
: capacity_(capacity)
, page_bits_(log2ceil(page_size))
, last_page_(nullptr)
, last_page_index_(0) {
, last_page_index_(0)
, check_acl_(false) {
assert(ispow2(page_size));
if (capacity != 0) {
assert(ispow2(capacity));
assert(ispow2(capacity));
assert(page_size <= capacity);
assert(0 == (capacity % page_size));
}
@ -221,7 +297,7 @@ uint8_t *RAM::get(uint64_t address) const {
if (capacity_ != 0 && address >= capacity_) {
throw OutOfRange();
}
uint32_t page_size = 1 << page_bits_;
uint32_t page_size = 1 << page_bits_;
uint32_t page_offset = address & (page_size - 1);
uint64_t page_index = address >> page_bits_;
@ -249,6 +325,9 @@ uint8_t *RAM::get(uint64_t address) const {
}
void RAM::read(void* data, uint64_t addr, uint64_t size) {
if (check_acl_ && acl_mngr_.check(addr, size, 0x1) == false) {
throw BadAddress();
}
uint8_t* d = (uint8_t*)data;
for (uint64_t i = 0; i < size; i++) {
d[i] = *this->get(addr + i);
@ -256,12 +335,22 @@ void RAM::read(void* data, uint64_t addr, uint64_t size) {
}
// Copies `size` bytes from `data` into memory starting at `addr`.
// When ACL checking is enabled the range must be granted the 0x2 flag;
// otherwise BadAddress is thrown before any byte is written.
void RAM::write(const void* data, uint64_t addr, uint64_t size) {
  // the ACL is consulted only while checking is switched on
  const bool denied = check_acl_ && !acl_mngr_.check(addr, size, 0x2);
  if (denied) {
    throw BadAddress();
  }

  const uint8_t* src = (const uint8_t*)data;
  for (uint64_t offset = 0; offset < size; ++offset) {
    // get() resolves the byte's location for each address
    uint8_t* cell = this->get(addr + offset);
    *cell = src[offset];
  }
}
// Records access `flags` for [addr, addr + size) in the ACL manager.
// When the RAM has a non-zero capacity, a range ending beyond it raises
// OutOfRange; a capacity of zero disables that bound check.
void RAM::set_acl(uint64_t addr, uint64_t size, int flags) {
  if (capacity_ != 0) {
    const uint64_t range_end = addr + size;
    if (range_end > capacity_) {
      throw OutOfRange();
    }
  }
  acl_mngr_.set(addr, size, flags);
}
void RAM::loadBinImage(const char* filename, uint64_t destination) {
std::ifstream ifs(filename);
if (!ifs) {

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,6 +15,7 @@
#include <cstdint>
#include <vector>
#include <map>
#include <unordered_map>
#include <cstdint>
@ -38,7 +39,7 @@ public:
RamMemDevice(const char* filename, uint32_t wordSize);
~RamMemDevice() {}
void read(void* data, uint64_t addr, uint64_t size) override;
void read(void* data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
virtual uint64_t size() const {
@ -55,13 +56,13 @@ protected:
class RomMemDevice : public RamMemDevice {
public:
RomMemDevice(const char *filename, uint32_t wordSize)
: RamMemDevice(filename, wordSize)
: RamMemDevice(filename, wordSize)
{}
RomMemDevice(uint64_t size, uint32_t wordSize)
: RamMemDevice(size, wordSize)
: RamMemDevice(size, wordSize)
{}
~RomMemDevice();
void write(const void* data, uint64_t addr, uint64_t size) override;
@ -71,11 +72,11 @@ public:
class MemoryUnit {
public:
struct PageFault {
PageFault(uint64_t a, bool nf)
: faultAddr(a)
, notFound(nf)
, notFound(nf)
{}
uint64_t faultAddr;
bool notFound;
@ -107,10 +108,10 @@ private:
class ADecoder {
public:
ADecoder() {}
void read(void* data, uint64_t addr, uint64_t size);
void write(const void* data, uint64_t addr, uint64_t size);
void map(uint64_t start, uint64_t end, MemDevice &md);
private:
@ -119,11 +120,11 @@ private:
MemDevice* md;
uint64_t addr;
};
struct entry_t {
MemDevice* md;
uint64_t start;
uint64_t end;
uint64_t end;
};
bool lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t*);
@ -135,7 +136,7 @@ private:
TLBEntry() {}
TLBEntry(uint32_t pfn, uint32_t flags)
: pfn(pfn)
, flags(flags)
, flags(flags)
{}
uint32_t pfn;
uint32_t flags;
@ -147,7 +148,7 @@ private:
std::unordered_map<uint64_t, TLBEntry> tlb_;
uint64_t pageSize_;
ADecoder decoder_;
ADecoder decoder_;
bool enableVM_;
amo_reservation_t amo_reservation_;
@ -155,9 +156,28 @@ private:
///////////////////////////////////////////////////////////////////////////////
// Tracks access-control flags for address ranges, stored as an ordered map of
// intervals.
class ACLManager {
public:
// Assigns `flags` to [addr, addr + size); implemented out of line.
void set(uint64_t addr, uint64_t size, int flags);
// Returns whether an access of `size` bytes at `addr` is permitted for the
// requested `flags`; implemented out of line.
bool check(uint64_t addr, uint64_t size, int flags) const;
private:
// Value stored per interval; the interval's start address is the map key.
struct acl_entry_t {
// address at which the interval ends
uint64_t end;
// access flags attached to the interval
int32_t flags;
};
// intervals ordered by start address
std::map<uint64_t, acl_entry_t> acl_map_;
};
///////////////////////////////////////////////////////////////////////////////
class RAM : public MemDevice {
public:
RAM(uint64_t capacity, uint32_t page_size);
RAM(uint64_t capacity) : RAM(capacity, capacity) {}
~RAM();
@ -166,7 +186,7 @@ public:
uint64_t size() const override;
void read(void* data, uint64_t addr, uint64_t size) override;
void read(void* data, uint64_t addr, uint64_t size) override;
void write(const void* data, uint64_t addr, uint64_t size) override;
void loadBinImage(const char* filename, uint64_t destination);
@ -180,15 +200,23 @@ public:
return *this->get(address);
}
void set_acl(uint64_t addr, uint64_t size, int flags);
void enable_acl(bool enable) {
check_acl_ = enable;
}
private:
uint8_t *get(uint64_t address) const;
uint64_t capacity_;
uint32_t page_bits_;
uint32_t page_bits_;
mutable std::unordered_map<uint64_t, uint8_t*> pages_;
mutable uint8_t* last_page_;
mutable uint64_t last_page_index_;
ACLManager acl_mngr_;
bool check_acl_;
};
} // namespace vortex

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -61,8 +61,8 @@ static void parse_args(int argc, char **argv) {
int main(int argc, char **argv) {
int exitcode = 0;
parse_args(argc, argv);
parse_args(argc, argv);
// create memory module
vortex::RAM ram(0, RAM_PAGE_SIZE);
@ -75,14 +75,14 @@ int main(int argc, char **argv) {
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
processor.dcr_write(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(program, startup_addr);
@ -96,7 +96,7 @@ int main(int argc, char **argv) {
// run simulation
exitcode = processor.run();
if (riscv_test) {
if (1 == exitcode) {
std::cout << "Passed" << std::endl;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -37,7 +37,7 @@
#include <list>
#include <queue>
#include <vector>
#include <sstream>
#include <sstream>
#include <unordered_map>
#define RAMULATOR
@ -84,7 +84,7 @@ using namespace vortex;
static uint64_t timestamp = 0;
double sc_time_stamp() {
double sc_time_stamp() {
return timestamp;
}
@ -95,7 +95,7 @@ static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
bool sim_trace_enabled() {
if (timestamp >= trace_start_time
if (timestamp >= trace_start_time
&& timestamp < trace_stop_time)
return true;
return trace_enabled;
@ -110,7 +110,7 @@ void sim_trace_enable(bool enable) {
class Processor::Impl {
public:
Impl() {
// force random values for unitialized signals
// force random values for unitialized signals
Verilated::randReset(VERILATOR_RESET_VALUE);
Verilated::randSeed(50);
@ -132,7 +132,7 @@ public:
#endif
ram_ = nullptr;
// initialize dram simulator
ramulator::Config ram_config;
ram_config.add("standard", "DDR4");
@ -147,7 +147,7 @@ public:
// reset the device
this->reset();
// Turn on assertion after reset
Verilated::assertOn(true);
}
@ -159,9 +159,9 @@ public:
trace_->close();
delete trace_;
#endif
delete device_;
if (dram_) {
dram_->finish();
Stats::statlist.printall();
@ -202,11 +202,11 @@ public:
while (device_->busy) {
if (get_ebreak()) {
exitcode = (int)get_last_wb_value(3);
break;
break;
}
this->tick();
}
// reset device
this->reset();
@ -215,7 +215,7 @@ public:
return exitcode;
}
void write_dcr(uint32_t addr, uint32_t value) {
void dcr_write(uint32_t addr, uint32_t value) {
device_->dcr_wr_valid = 1;
device_->dcr_wr_addr = addr;
device_->dcr_wr_data = value;
@ -232,7 +232,7 @@ private:
print_bufs_.clear();
pending_mem_reqs_.clear();
mem_rd_rsp_active_ = false;
mem_wr_rsp_active_ = false;
@ -268,7 +268,7 @@ private:
device_->clk = 1;
this->eval();
#ifdef AXI_BUS
this->eval_axi_bus(1);
#else
@ -276,13 +276,13 @@ private:
#endif
this->eval_dcr_bus(1);
if (MEM_CYCLE_RATIO > 0) {
if (MEM_CYCLE_RATIO > 0) {
auto cycle = timestamp / 2;
if ((cycle % MEM_CYCLE_RATIO) == 0)
dram_->tick();
} else {
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
dram_->tick();
dram_->tick();
}
if (!dram_queue_.empty()) {
@ -309,14 +309,14 @@ private:
#ifdef AXI_BUS
// Drives the AXI handshake signals that this code writes into the device model
// (wready, awready, arready, rvalid, bvalid — element 0 of each) to 0.
// NOTE(review): the signature and the arready assignment each appear twice in
// this listing; they look like the before/after copies of whitespace-only
// changes — confirm before treating the repetition as real code.
void reset_axi_bus() {
void reset_axi_bus() {
device_->m_axi_wready[0] = 0;
device_->m_axi_awready[0] = 0;
device_->m_axi_arready[0] = 0;
device_->m_axi_arready[0] = 0;
device_->m_axi_rvalid[0] = 0;
device_->m_axi_bvalid[0] = 0;
}
void eval_axi_bus(bool clk) {
if (!clk) {
mem_rd_rsp_ready_ = device_->m_axi_rready[0];
@ -327,7 +327,7 @@ private:
if (ram_ == nullptr) {
device_->m_axi_wready[0] = 0;
device_->m_axi_awready[0] = 0;
device_->m_axi_arready[0] = 0;
device_->m_axi_arready[0] = 0;
return;
}
@ -335,11 +335,11 @@ private:
if (mem_rd_rsp_active_
&& device_->m_axi_rvalid[0] && mem_rd_rsp_ready_) {
mem_rd_rsp_active_ = false;
}
if (!mem_rd_rsp_active_) {
}
if (!mem_rd_rsp_active_) {
if (!pending_mem_reqs_.empty()
&& (*pending_mem_reqs_.begin())->ready
&& !(*pending_mem_reqs_.begin())->write) {
&& (*pending_mem_reqs_.begin())->ready
&& !(*pending_mem_reqs_.begin())->write) {
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_rsp = *mem_rsp_it;
/*
@ -348,9 +348,9 @@ private:
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
*/
*/
device_->m_axi_rvalid[0] = 1;
device_->m_axi_rid[0] = mem_rsp->tag;
device_->m_axi_rid[0] = mem_rsp->tag;
device_->m_axi_rresp[0] = 0;
device_->m_axi_rlast[0] = 1;
memcpy(device_->m_axi_rdata[0].data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
@ -362,46 +362,46 @@ private:
}
}
// send memory write response
// send memory write response
if (mem_wr_rsp_active_
&& device_->m_axi_bvalid[0] && mem_wr_rsp_ready_) {
mem_wr_rsp_active_ = false;
}
if (!mem_wr_rsp_active_) {
if (!pending_mem_reqs_.empty()
&& (*pending_mem_reqs_.begin())->ready
&& (*pending_mem_reqs_.begin())->ready
&& (*pending_mem_reqs_.begin())->write) {
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
*/
device_->m_axi_bvalid[0] = 1;
device_->m_axi_bvalid[0] = 1;
device_->m_axi_bid[0] = mem_rsp->tag;
device_->m_axi_bresp[0] = 0;
pending_mem_reqs_.erase(mem_rsp_it);
pending_mem_reqs_.erase(mem_rsp_it);
mem_wr_rsp_active_ = true;
delete mem_rsp;
} else {
device_->m_axi_bvalid[0] = 0;
}
}
}
// select the memory bank
uint32_t req_addr = device_->m_axi_wvalid[0] ? device_->m_axi_awaddr[0] : device_->m_axi_araddr[0];
// process memory requests
if ((device_->m_axi_wvalid[0] || device_->m_axi_arvalid[0]) && running_) {
if (device_->m_axi_wvalid[0]) {
if (device_->m_axi_wvalid[0]) {
uint64_t byteen = device_->m_axi_wstrb[0];
uint64_t base_addr = device_->m_axi_awaddr[0];
uint8_t* data = (uint8_t*)device_->m_axi_wdata[0].data();
// check console output
if (base_addr >= uint64_t(IO_COUT_ADDR)
&& base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
&& base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
char c = data[i];
ss_buf << c;
@ -410,7 +410,7 @@ private:
ss_buf.str("");
}
}
}
}
} else {
/*
printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen);
@ -420,26 +420,26 @@ private:
printf("\n");
*/
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
if ((byteen >> i) & 0x1) {
(*ram_)[base_addr + i] = data[i];
}
}
}
auto mem_req = new mem_req_t();
mem_req->tag = device_->m_axi_awid[0];
mem_req->addr = device_->m_axi_awaddr[0];
mem_req->addr = device_->m_axi_awaddr[0];
mem_req->write = true;
mem_req->ready = true;
pending_mem_reqs_.emplace_back(mem_req);
// send dram request
ramulator::Request dram_req(
ramulator::Request dram_req(
device_->m_axi_awaddr[0],
ramulator::Request::Type::WRITE,
0
);
dram_queue_.push(dram_req);
}
}
} else {
// process reads
auto mem_req = new mem_req_t();
@ -451,7 +451,7 @@ private:
pending_mem_reqs_.emplace_back(mem_req);
// send dram request
ramulator::Request dram_req(
ramulator::Request dram_req(
device_->m_axi_araddr[0],
ramulator::Request::Type::READ,
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
@ -460,12 +460,12 @@ private:
0
);
dram_queue_.push(dram_req);
}
}
}
}
device_->m_axi_wready[0] = running_;
device_->m_axi_awready[0] = running_;
device_->m_axi_arready[0] = running_;
device_->m_axi_arready[0] = running_;
}
#else
@ -486,7 +486,7 @@ private:
return;
}
// process memory responses
// process memory responses
if (mem_rd_rsp_active_
&& device_->mem_rsp_valid && mem_rd_rsp_ready_) {
mem_rd_rsp_active_ = false;
@ -494,7 +494,7 @@ private:
if (!mem_rd_rsp_active_) {
if (!pending_mem_reqs_.empty()
&& (*pending_mem_reqs_.begin())->ready) {
device_->mem_rsp_valid = 1;
device_->mem_rsp_valid = 1;
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_rsp = *mem_rsp_it;
/*
@ -505,7 +505,7 @@ private:
printf("\n");
*/
memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag = mem_rsp->tag;
device_->mem_rsp_tag = mem_rsp->tag;
pending_mem_reqs_.erase(mem_rsp_it);
mem_rd_rsp_active_ = true;
delete mem_rsp;
@ -514,19 +514,19 @@ private:
}
}
// process memory requests
// process memory requests
if (device_->mem_req_valid && running_) {
uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
if (device_->mem_req_rw) {
if (device_->mem_req_rw) {
// process writes
uint64_t byteen = device_->mem_req_byteen;
uint64_t byteen = device_->mem_req_byteen;
uint8_t* data = (uint8_t*)(device_->mem_req_data.data());
// check console output
if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
for (int i = 0; i < IO_COUT_SIZE; i++) {
if ((byteen >> i) & 0x1) {
if ((byteen >> i) & 0x1) {
auto& ss_buf = print_bufs_[i];
char c = data[i];
ss_buf << c;
@ -535,7 +535,7 @@ private:
ss_buf.str("");
}
}
}
}
} else {
/*
printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen);
@ -545,23 +545,23 @@ private:
printf("\n");
*/
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
if ((byteen >> i) & 0x1) {
(*ram_)[byte_addr + i] = data[i];
}
}
// send dram request
ramulator::Request dram_req(
ramulator::Request dram_req(
byte_addr,
ramulator::Request::Type::WRITE,
0
);
dram_queue_.push(dram_req);
}
}
} else {
// process reads
auto mem_req = new mem_req_t();
mem_req->tag = device_->mem_req_tag;
mem_req->tag = device_->mem_req_tag;
mem_req->addr = byte_addr;
mem_req->write = false;
mem_req->ready = false;
@ -571,7 +571,7 @@ private:
//printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
// send dram request
ramulator::Request dram_req(
ramulator::Request dram_req(
byte_addr,
ramulator::Request::Type::READ,
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
@ -581,7 +581,7 @@ private:
);
dram_queue_.push(dram_req);
}
}
}
device_->mem_req_ready = running_;
}
@ -625,8 +625,8 @@ private:
private:
typedef struct {
bool ready;
typedef struct {
bool ready;
std::array<uint8_t, MEM_BLOCK_SIZE> block;
uint64_t addr;
uint64_t tag;
@ -663,7 +663,7 @@ private:
///////////////////////////////////////////////////////////////////////////////
// Default constructor: allocates a new Impl with `new` and stores the raw
// pointer in impl_.
// NOTE(review): the matching release of impl_ is not visible in this span —
// presumably the destructor deletes it; confirm.
// NOTE(review): the first line is repeated in this listing (looks like the
// before/after copies of a whitespace-only change).
Processor::Processor()
Processor::Processor()
: impl_(new Impl())
{}
@ -679,6 +679,6 @@ int Processor::run() {
return impl_->run();
}
// Forwards a DCR write to the implementation object.
//   addr  - DCR address passed through unchanged
//   value - 32-bit value passed through unchanged
// This listing carries both spellings of the method: write_dcr and dcr_write.
// NOTE(review): write_dcr appears to be the pre-rename name and dcr_write its
// replacement — confirm; only one of the two header/body pairs can be live code.
void Processor::write_dcr(uint32_t addr, uint32_t value) {
return impl_->write_dcr(addr, value);
void Processor::dcr_write(uint32_t addr, uint32_t value) {
return impl_->dcr_write(addr, value);
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ class RAM;
class Processor {
public:
Processor();
~Processor();
@ -29,7 +29,7 @@ public:
int run();
void write_dcr(uint32_t addr, uint32_t value);
void dcr_write(uint32_t addr, uint32_t value);
private:

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -92,20 +92,20 @@ int main(int argc, char **argv) {
// create processor
Processor processor(arch);
// attach memory module
processor.attach_ram(&ram);
processor.attach_ram(&ram);
// setup base DCRs
const uint64_t startup_addr(STARTUP_ADDR);
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
#if (XLEN == 64)
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
#endif
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
processor.dcr_write(VX_DCR_BASE_MPM_CLASS, 0);
// load program
{
{
std::string program_ext(fileExtension(program));
if (program_ext == "bin") {
ram.loadBinImage(program, startup_addr);
@ -122,11 +122,11 @@ int main(int argc, char **argv) {
if (riscv_test) {
exitcode = (1 - exitcode);
}
}
}
if (exitcode != 0) {
std::cout << "*** error: exitcode=" << exitcode << std::endl;
}
}
return exitcode;
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,7 +16,7 @@
using namespace vortex;
ProcessorImpl::ProcessorImpl(const Arch& arch)
ProcessorImpl::ProcessorImpl(const Arch& arch)
: arch_(arch)
, clusters_(arch.num_clusters())
{
@ -36,16 +36,16 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
log2ceil(L2_LINE_SIZE), // W
log2ceil(L3_NUM_WAYS), // A
log2ceil(L3_NUM_BANKS), // B
XLEN, // address bits
XLEN, // address bits
1, // number of ports
uint8_t(arch.num_clusters()), // request size
uint8_t(arch.num_clusters()), // request size
true, // write-through
false, // write response
L3_MSHR_SIZE, // mshr size
2, // pipeline latency
}
);
);
// connect L3 memory ports
l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
memsim_->MemRspPort.bind(&l3cache_->MemRspPort);
@ -86,7 +86,7 @@ void ProcessorImpl::attach_ram(RAM* ram) {
int ProcessorImpl::run() {
SimPlatform::instance().reset();
this->reset();
bool done;
int exitcode = 0;
do {
@ -104,16 +104,16 @@ int ProcessorImpl::run() {
return exitcode;
}
// Sets the four perf_mem_* members of this object back to zero.
void ProcessorImpl::reset() {
  // The assignments are independent of one another; order is irrelevant.
  perf_mem_pending_reads_ = 0;
  perf_mem_latency_ = 0;
  perf_mem_writes_ = 0;
  perf_mem_reads_ = 0;
}
// Writes `value` at `addr` into the dcrs_ member through its write() method.
// This listing shows two signature lines (write_dcr and dcr_write) sharing one
// body.
// NOTE(review): write_dcr looks like the pre-rename signature and dcr_write its
// replacement — confirm which one is current.
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
void ProcessorImpl::dcr_write(uint32_t addr, uint32_t value) {
dcrs_.write(addr, value);
}
@ -128,7 +128,7 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
///////////////////////////////////////////////////////////////////////////////
// Constructs the processor facade for the given architecture description:
// allocates a ProcessorImpl(arch) with `new` and stores the raw pointer in impl_.
// NOTE(review): the matching release of impl_ is not visible in this span —
// presumably the destructor deletes it; confirm.
// NOTE(review): the first line is repeated in this listing (looks like the
// before/after copies of a whitespace-only change).
Processor::Processor(const Arch& arch)
Processor::Processor(const Arch& arch)
: impl_(new ProcessorImpl(arch))
{}
@ -144,6 +144,6 @@ int Processor::run() {
return impl_->run();
}
// Forwards a DCR write to the ProcessorImpl held in impl_.
//   addr  - DCR address passed through unchanged
//   value - 32-bit value passed through unchanged
// This listing carries both spellings of the method: write_dcr and dcr_write.
// NOTE(review): write_dcr appears to be the pre-rename name and dcr_write its
// replacement — confirm; only one of the two header/body pairs can be live code.
void Processor::write_dcr(uint32_t addr, uint32_t value) {
return impl_->write_dcr(addr, value);
void Processor::dcr_write(uint32_t addr, uint32_t value) {
return impl_->dcr_write(addr, value);
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -30,7 +30,7 @@ public:
int run();
void write_dcr(uint32_t addr, uint32_t value);
void dcr_write(uint32_t addr, uint32_t value);
private:
ProcessorImpl* impl_;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -37,12 +37,12 @@ public:
int run();
void write_dcr(uint32_t addr, uint32_t value);
void dcr_write(uint32_t addr, uint32_t value);
PerfStats perf_stats() const;
private:
void reset();
const Arch& arch_;

View file

@ -25,8 +25,10 @@ int test = -1;
uint32_t count = 0;
vx_device_h device = nullptr;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
vx_buffer_h src_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -47,7 +49,7 @@ static void parse_args(int argc, char **argv) {
case 'k':
kernel_file = optarg;
break;
case 'h':
case 'h':
case '?': {
show_usage();
exit(0);
@ -61,10 +63,10 @@ static void parse_args(int argc, char **argv) {
// Releases the test's device resources; does nothing when `device` is null.
// Frees the source, destination, kernel and argument allocations, then closes
// the device with vx_dev_close().
// NOTE(review): two generations of vx_mem_free calls are listed here — the
// (device, address) form and the buffer-handle form. They look like the
// before/after sides of the API change; confirm that only the handle-based
// calls are meant to remain, since running both would release each
// allocation twice.
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_mem_free(src_buffer);
vx_mem_free(dst_buffer);
vx_mem_free(krnl_buffer);
vx_mem_free(args_buffer);
vx_dev_close(device);
}
}
@ -80,23 +82,23 @@ int run_memcopy_test(const kernel_arg_t& kernel_arg) {
std::vector<uint32_t> h_src(num_points);
std::vector<uint32_t> h_dst(num_points);
// update source buffer
// update source buffer
for (uint32_t i = 0; i < num_points; ++i) {
h_src[i] = shuffle(i, NONCE);
}
auto time_start = std::chrono::high_resolution_clock::now();
// upload source buffer
std::cout << "write source buffer to local memory" << std::endl;
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, h_src.data(), buf_size));
RT_CHECK(vx_copy_to_dev(dst_buffer, h_src.data(), 0, buf_size));
auto t1 = std::chrono::high_resolution_clock::now();
// download destination buffer
std::cout << "read destination buffer from local memory" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
auto t3 = std::chrono::high_resolution_clock::now();
// verify result
@ -114,11 +116,11 @@ int run_memcopy_test(const kernel_arg_t& kernel_arg) {
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed;
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
printf("upload time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
printf("download time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Total elapsed time: %lg ms\n", elapsed);
return errors;
@ -130,42 +132,42 @@ int run_kernel_test(const kernel_arg_t& kernel_arg) {
std::vector<uint32_t> h_src(num_points);
std::vector<uint32_t> h_dst(num_points);
// update source buffer
// update source buffer
for (uint32_t i = 0; i < num_points; ++i) {
h_src[i] = shuffle(i, NONCE);
}
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
auto time_start = std::chrono::high_resolution_clock::now();
// upload source buffer
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
auto t1 = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start execution" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto t3 = std::chrono::high_resolution_clock::now();
// download destination buffer
std::cout << "read destination buffer from local memory" << std::endl;
auto t4 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
auto t5 = std::chrono::high_resolution_clock::now();
// verify result
int errors = 0;
// verify result
int errors = 0;
std::cout << "verify result" << std::endl;
for (uint32_t i = 0; i < num_points; ++i) {
auto cur = h_dst[i];
@ -179,13 +181,13 @@ int run_kernel_test(const kernel_arg_t& kernel_arg) {
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed;
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
printf("upload time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
printf("execute time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t5 - t4).count();
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t5 - t4).count();
printf("download time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Total elapsed time: %lg ms\n", elapsed);
return errors;
@ -214,8 +216,10 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
kernel_arg.count = num_points;
@ -224,7 +228,7 @@ int main(int argc, char *argv[]) {
int errors = 0;
// run tests
// run tests
if (0 == test || -1 == test) {
std::cout << "run memcopy test" << std::endl;
errors = run_memcopy_test(kernel_arg);
@ -236,16 +240,16 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
}
}
std::cout << "Test PASSED" << std::endl;
std::cout << "Test PASSED" << std::endl;
return 0;
}

View file

@ -30,10 +30,10 @@ public:
// Human-readable name of the element type this comparator handles.
static const char* type_str() {
  const char* const name = "integer";
  return name;
}
// Returns the next value from rand() as an int.
// NOTE(review): the header and the return statement each appear twice in this
// listing; they look like the before/after copies of whitespace-only changes —
// confirm before treating the repetition as real code.
static int generate() {
return rand();
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
@ -41,7 +41,7 @@ public:
return false;
}
return true;
}
}
};
template <>
@ -50,10 +50,10 @@ public:
// Human-readable name of the element type this comparator handles.
static const char* type_str() {
  const char* const name = "float";
  return name;
}
// Produces one pseudo-random float in the closed range [0, 1] by scaling
// rand() with RAND_MAX.
//
// The return type is float on purpose: with an int return type the quotient
// was truncated on return, so every draw came back as 0 (or 1 only when
// rand() == RAND_MAX) and the floating-point test inputs were degenerate.
// Callers that store the result in a float now receive the fractional value.
static float generate() {
  return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
@ -66,7 +66,7 @@ public:
return false;
}
return true;
}
}
};
static void convolution_cpu(TYPE *O, TYPE *I, TYPE *W, int32_t width, int32_t height) {
@ -95,8 +95,11 @@ int size = 32;
bool use_lmem = false;
vx_device_h device = nullptr;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
vx_buffer_h I_buffer = nullptr;
vx_buffer_h W_buffer = nullptr;
vx_buffer_h O_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -111,7 +114,7 @@ static void parse_args(int argc, char **argv) {
case 'n':
size = atoi(optarg);
break;
case 'l':
case 'l':
use_lmem = true;
break;
case 'k':
@ -130,26 +133,26 @@ static void parse_args(int argc, char **argv) {
}
// Releases the test's device resources; does nothing when `device` is null.
// Frees the input, weight, output, kernel and argument allocations, then
// closes the device with vx_dev_close(). The weight allocation is released
// only when use_lmem is false.
// NOTE(review): two generations of vx_mem_free calls (and two copies of the
// `if (device)` line) are listed here — the (device, address) form and the
// buffer-handle form. They look like the before/after sides of the API change;
// confirm that only the handle-based calls are meant to remain.
// NOTE(review): the visible part of main() allocates W_buffer before its
// `if (use_lmem)` check, yet W_buffer is skipped here when use_lmem is true —
// verify this does not leave the weight buffer unreleased.
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.I_addr);
if (device) {
vx_mem_free(I_buffer);
if (!use_lmem) {
vx_mem_free(device, kernel_arg.W_addr);
vx_mem_free(W_buffer);
}
vx_mem_free(device, kernel_arg.O_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_mem_free(O_buffer);
vx_mem_free(krnl_buffer);
vx_mem_free(args_buffer);
vx_dev_close(device);
}
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = size * size;
@ -166,13 +169,16 @@ int main(int argc, char *argv[]) {
uint32_t w_points = 3 * 3;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
std::cout << "allocate device memory" << std::endl;
size_t i_nbytes = i_points * sizeof(TYPE);
size_t w_nbytes = w_points * sizeof(TYPE);
size_t o_nbytes = o_points * sizeof(TYPE);
RT_CHECK(vx_mem_alloc(device, i_nbytes, &kernel_arg.I_addr));
RT_CHECK(vx_mem_alloc(device, o_nbytes, &kernel_arg.O_addr));
RT_CHECK(vx_mem_alloc(device, w_nbytes, &kernel_arg.W_addr));
RT_CHECK(vx_mem_alloc(device, i_nbytes, VX_MEM_READ, &I_buffer));
RT_CHECK(vx_mem_address(I_buffer, &kernel_arg.I_addr));
RT_CHECK(vx_mem_alloc(device, w_nbytes, VX_MEM_READ, &W_buffer));
RT_CHECK(vx_mem_address(W_buffer, &kernel_arg.W_addr));
RT_CHECK(vx_mem_alloc(device, o_nbytes, VX_MEM_WRITE, &O_buffer));
RT_CHECK(vx_mem_address(O_buffer, &kernel_arg.O_addr));
if (use_lmem) {
uint64_t dev_local_mem_size;
@ -212,32 +218,32 @@ int main(int argc, char *argv[]) {
// upload input buffer
{
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.I_addr, h_I.data(), i_nbytes));
RT_CHECK(vx_copy_to_dev(I_buffer, h_I.data(), 0, i_nbytes));
}
// upload weight buffer
{
std::cout << "upload weight buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.W_addr, h_W.data(), w_nbytes));
RT_CHECK(vx_copy_to_dev(W_buffer, h_W.data(), 0, w_nbytes));
}
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
auto time_start = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
@ -245,7 +251,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_O.data(), kernel_arg.O_addr, o_nbytes));
RT_CHECK(vx_copy_from_dev(h_O.data(), O_buffer, 0, o_nbytes));
// verify result
std::cout << "verify result" << std::endl;
@ -253,7 +259,7 @@ int main(int argc, char *argv[]) {
{
std::vector<TYPE> h_ref(o_points);
convolution_cpu(h_ref.data(), h_I.data(), h_W.data(), size, size);
for (uint32_t i = 0; i < h_ref.size(); ++i) {
auto ref = h_ref[i];
auto cur = h_O[i];
@ -264,13 +270,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -28,10 +28,10 @@ public:
// Human-readable name of the element type this comparator handles.
static const char* type_str() {
  const char* const name = "integer";
  return name;
}
// Returns the next value from rand() as an int.
// NOTE(review): the header and the return statement each appear twice in this
// listing; they look like the before/after copies of whitespace-only changes —
// confirm before treating the repetition as real code.
static int generate() {
return rand();
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
@ -39,7 +39,7 @@ public:
return false;
}
return true;
}
}
};
template <>
@ -50,10 +50,10 @@ public:
// Human-readable name of the element type this comparator handles.
static const char* type_str() {
  const char* const name = "float";
  return name;
}
// Produces one pseudo-random float in the closed range [0, 1] by scaling
// rand() with RAND_MAX.
//
// The return type is float on purpose: with an int return type the quotient
// was truncated on return, so every draw came back as 0 (or 1 only when
// rand() == RAND_MAX) and the floating-point test inputs were degenerate.
// Callers that store the result in a float now receive the fractional value.
static float generate() {
  return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
@ -66,15 +66,18 @@ public:
return false;
}
return true;
}
}
};
const char* kernel_file = "kernel.vxbin";
uint32_t count = 16;
vx_device_h device = nullptr;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -105,24 +108,24 @@ static void parse_args(int argc, char **argv) {
}
// Releases the test's device resources; does nothing when `device` is null.
// Frees both source allocations, the destination, the kernel and the argument
// allocations, then closes the device with vx_dev_close().
// NOTE(review): two generations of vx_mem_free calls (and two copies of the
// `if (device)` line) are listed here — the (device, address) form and the
// buffer-handle form. They look like the before/after sides of the API change;
// confirm that only the handle-based calls are meant to remain, since running
// both would release each allocation twice.
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
if (device) {
vx_mem_free(src0_buffer);
vx_mem_free(src1_buffer);
vx_mem_free(dst_buffer);
vx_mem_free(krnl_buffer);
vx_mem_free(args_buffer);
vx_dev_close(device);
}
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -130,28 +133,31 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(TYPE);
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
kernel_arg.num_tasks = num_tasks;
kernel_arg.num_tasks = total_threads;
kernel_arg.task_size = count;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<TYPE> h_src0(num_points);
std::vector<TYPE> h_src1(num_points);
@ -165,23 +171,23 @@ int main(int argc, char *argv[]) {
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -189,10 +195,10 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
std::cout << "verify result" << std::endl;
int errors = 0;
for (uint32_t i = 0; i < num_points; ++i) {
auto ref = h_src0[i] + h_src1[i];
@ -203,13 +209,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -22,8 +22,10 @@ const char* kernel_file = "kernel.vxbin";
uint32_t count = 0;
vx_device_h device = nullptr;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
vx_buffer_h src_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -55,10 +57,10 @@ static void parse_args(int argc, char **argv) {
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_mem_free(src_buffer);
vx_mem_free(dst_buffer);
vx_mem_free(krnl_buffer);
vx_mem_free(args_buffer);
vx_dev_close(device);
}
}
@ -72,7 +74,7 @@ void gen_src_data(std::vector<int>& src_data, uint32_t size) {
}
}
void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data, uint32_t size) {
void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data, uint32_t size) {
ref_data.resize(size);
for (int i = 0; i < (int)size; ++i) {
int value = src_data.at(i);
@ -83,7 +85,7 @@ void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data,
} else {
value += 2;
}
// diverge
if (i > 1) {
if (i > 2) {
@ -109,8 +111,8 @@ void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data,
// loop
for (int j = 0, n = i; j < n; ++j) {
value += src_data.at(j);
}
}
// switch
switch (i) {
case 0:
@ -141,7 +143,7 @@ void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data,
}
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -152,7 +154,7 @@ int main(int argc, char *argv[]) {
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -160,8 +162,8 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
@ -170,13 +172,15 @@ int main(int argc, char *argv[]) {
kernel_arg.num_points = num_points;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<int32_t> h_src;
@ -184,20 +188,20 @@ int main(int argc, char *argv[]) {
gen_src_data(h_src, num_points);
// upload source buffer
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -205,7 +209,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
@ -226,13 +230,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -2,6 +2,7 @@
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include <vx_print.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
@ -15,7 +16,7 @@ void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -30,7 +31,7 @@ void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -45,7 +46,7 @@ void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -60,7 +61,7 @@ void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -77,7 +78,7 @@ void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -92,7 +93,7 @@ void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -107,7 +108,7 @@ void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -122,7 +123,7 @@ void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -137,7 +138,7 @@ void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -152,7 +153,7 @@ void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -167,7 +168,7 @@ void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -182,7 +183,7 @@ void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -199,7 +200,7 @@ void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -214,7 +215,7 @@ void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -231,7 +232,7 @@ void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -246,7 +247,7 @@ void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -262,7 +263,7 @@ void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -278,7 +279,7 @@ void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -294,7 +295,7 @@ void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -314,7 +315,7 @@ void kernel_fclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
@ -328,12 +329,12 @@ void kernel_trigo(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
dst_ptr[offset+i] = sin(a) + cos(b);
auto a = sinf(src0_ptr[offset+i]);
auto b = cosf(src1_ptr[offset+i]);
dst_ptr[offset+i] = a + b;
}
}
@ -356,14 +357,14 @@ void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
for (int i = 0; i <= block_size; ++i) {
dst_ptr[i + offset] = src0_ptr[i + offset];
}
}
}
// memory fence
vx_fence();
// local barrier
vx_barrier(0, num_warps);
// update destination
dst_ptr[task_id] += 1;
}
@ -372,7 +373,7 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
@ -385,47 +386,45 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
for (int i = 0, n = arg->num_tasks; i <= n; ++i) {
dst_ptr[i] = src0_ptr[i];
}
}
}
// memory fence
vx_fence();
// global barrier
vx_barrier(0x80000000, num_cores);
// update destination
dst_ptr[task_id] += 1;
}
static PFN_Kernel sc_tests[23];
void register_tests() {
sc_tests[0] = kernel_iadd;
sc_tests[1] = kernel_imul;
sc_tests[2] = kernel_idiv;
sc_tests[3] = kernel_idiv_mul;
sc_tests[4] = kernel_fadd;
sc_tests[5] = kernel_fsub;
sc_tests[6] = kernel_fmul;
sc_tests[7] = kernel_fmadd;
sc_tests[8] = kernel_fmsub;
sc_tests[9] = kernel_fnmadd;
sc_tests[10] = kernel_fnmsub;
sc_tests[11] = kernel_fnmadd_madd;
sc_tests[12] = kernel_fdiv;
sc_tests[13] = kernel_fdiv2;
sc_tests[14] = kernel_fsqrt;
sc_tests[15] = kernel_ftoi;
sc_tests[16] = kernel_ftou;
sc_tests[17] = kernel_itof;
sc_tests[18] = kernel_utof;
sc_tests[19] = kernel_fclamp;
sc_tests[20] = kernel_trigo;
sc_tests[21] = kernel_bar;
sc_tests[22] = kernel_gbar;
}
static const PFN_Kernel sc_tests[] = {
/*kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
kernel_fclamp,*/
kernel_trigo,
/*kernel_bar,
kernel_gbar*/
};
int main() {
register_tests();
auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;

View file

@ -20,8 +20,11 @@ int testid_e = 0;
bool stop_on_error = true;
vx_device_h device = nullptr;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -67,21 +70,21 @@ static void parse_args(int argc, char **argv) {
}
}
void cleanup() {
void cleanup() {
if (testSuite) {
delete testSuite;
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_mem_free(src0_buffer);
vx_mem_free(src1_buffer);
vx_mem_free(dst_buffer);
vx_mem_free(krnl_buffer);
vx_mem_free(args_buffer);
vx_dev_close(device);
}
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -96,7 +99,7 @@ int main(int argc, char *argv[]) {
std::cout << "using kernel: " << kernel_file << std::endl;
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -107,7 +110,7 @@ int main(int argc, char *argv[]) {
int num_tasks = num_cores * num_warps * num_threads;
int num_points = count * num_tasks;
size_t buf_size = num_points * sizeof(uint32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
@ -116,17 +119,20 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, sizeof(kernel_arg_t), VX_MEM_READ, &args_buffer));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<uint8_t> src1_buf(buf_size);
std::vector<uint8_t> src2_buf(buf_size);
std::vector<uint8_t> dst_buf(buf_size);
@ -138,15 +144,15 @@ int main(int argc, char *argv[]) {
}
// upload program
std::cout << "upload kernel" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload kernel" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// execute tests
int errors = 0;
for (int t = testid_s; t <= testid_e; ++t) {
for (int t = testid_s; t <= testid_e; ++t) {
auto test = testSuite->get_test(t);
auto name = test->name();
if (!selected.empty()) {
if (selected.count(name) == 0)
continue;
@ -162,30 +168,30 @@ int main(int argc, char *argv[]) {
// get test arguments
std::cout << "get test arguments" << std::endl;
RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, src1_buf.data(), buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, src2_buf.data(), buf_size));
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(src0_buffer, src1_buf.data(), 0, buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(src1_buffer, src2_buf.data(), 0, buf_size));
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
for (int i = 0; i < num_points; ++i) {
((uint32_t*)dst_buf.data())[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));
}
RT_CHECK(vx_copy_to_dev(dst_buffer, dst_buf.data(), 0, buf_size));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
kernel_arg.testid = t;
RT_CHECK(vx_copy_to_dev(device, kernel_args_addr, &kernel_arg, sizeof(kernel_arg_t)));
RT_CHECK(vx_copy_to_dev(args_buffer, &kernel_arg, 0, sizeof(kernel_arg_t)));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -193,7 +199,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, dst_buf.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(dst_buf.data(), dst_buffer, 0, buf_size));
// verify destination
std::cout << "verify test result" << std::endl;
@ -201,16 +207,16 @@ int main(int argc, char *argv[]) {
if (err != 0) {
std::cout << "found " << std::dec << err << " errors!" << std::endl;
std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush;
errors += err;
errors += err;
if (stop_on_error)
break;
} else {
std::cout << "Test" << t << "-" << name << " PASSED!" << std::endl << std::flush;
}
}
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
return errors;

View file

@ -17,7 +17,7 @@ void cleanup();
exit(-1); \
} while (false)
union Float_t {
union Float_t {
float f;
int i;
struct {
@ -95,8 +95,8 @@ private:
class ITestCase {
public:
ITestCase(TestSuite* suite, const char* name)
: suite_(suite)
ITestCase(TestSuite* suite, const char* name)
: suite_(suite)
, name_(name)
{}
@ -116,7 +116,7 @@ public:
protected:
TestSuite* suite_;
const char* const name_;
const char* const name_;
};
class Test_IADD : public ITestCase {
@ -132,14 +132,14 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
auto ref = a[i] + b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -162,14 +162,14 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
auto ref = a[i] * b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -192,14 +192,14 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
auto ref = a[i] / b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -222,16 +222,16 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = a[i] * b[i];
auto ref = x + y;
auto x = a[i] / b[i];
auto y = a[i] * b[i];
auto ref = x + y;
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -254,14 +254,14 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
auto ref = a[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -284,14 +284,14 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] - b[i];
auto ref = a[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -314,14 +314,14 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
auto ref = a[i] * b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -344,7 +344,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -374,7 +374,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -404,7 +404,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -434,7 +434,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -464,7 +464,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -496,7 +496,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -526,7 +526,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -559,7 +559,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -586,11 +586,11 @@ public:
for (uint32_t i = 0; i < n; ++i) {
float q = fround(float(n/2) - i + (float(i) / n));
a[i] = q;
b[i] = q;
b[i] = q;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -622,7 +622,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -653,7 +653,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
@ -684,7 +684,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (uint32_t*)src1;
@ -715,7 +715,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
@ -740,19 +740,19 @@ public:
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((2*i-n) * (1.0f/n) * 3.1416);
b[i] = fround((2*i-n) * (1.0f/n) * 3.1416);
a[i] = fround(int(2*i-n) * (1.0f/n) * 3.1416);
b[i] = fround(int(2*i-n) * (1.0f/n) * 3.1416);
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = sin(a[i]) + cos(b[i]);
auto ref = sinf(a[i]) + cosf(b[i]);
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
@ -766,20 +766,20 @@ class Test_BAR : public ITestCase {
public:
Test_BAR(TestSuite* suite) : ITestCase(suite, "bar") {}
int setup(uint32_t n, void* src1, void* /*src2*/) override {
int setup(uint32_t n, void* src1, void* /*src2*/) override {
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
if (num_warps_ == 1) {
std::cout << "Error: multiple warps configuration required!" << std::endl;
return -1;
}
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
auto a = (uint32_t*)src1;
auto a = (uint32_t*)src1;
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
int errors = 0;
auto a = (uint32_t*)src1;
@ -816,7 +816,7 @@ public:
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
int errors = 0;
auto a = (uint32_t*)src1;
@ -832,15 +832,15 @@ public:
}
uint64_t num_cores_;
uint64_t num_warps_;
uint64_t num_warps_;
uint64_t num_threads_;
};
///////////////////////////////////////////////////////////////////////////////
TestSuite::TestSuite(vx_device_h device)
TestSuite::TestSuite(vx_device_h device)
: device_(device) {
this->add_test(new Test_IADD(this));
/*this->add_test(new Test_IADD(this));
this->add_test(new Test_IMUL(this));
this->add_test(new Test_IDIV(this));
this->add_test(new Test_IDIV_MUL(this));
@ -859,10 +859,10 @@ TestSuite::TestSuite(vx_device_h device)
this->add_test(new Test_FTOU(this));
this->add_test(new Test_ITOF(this));
this->add_test(new Test_UTOF(this));
this->add_test(new Test_FCLAMP(this));
this->add_test(new Test_FCLAMP(this));*/
this->add_test(new Test_TRIGO(this));
this->add_test(new Test_BAR(this));
this->add_test(new Test_GBAR(this));
/*this->add_test(new Test_BAR(this));
this->add_test(new Test_GBAR(this));*/
}
TestSuite::~TestSuite() {

View file

@ -21,8 +21,11 @@ const char* kernel_file = "kernel.vxbin";
uint32_t count = 0;
vx_device_h device = nullptr;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -54,16 +57,16 @@ static void parse_args(int argc, char **argv) {
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, kernel_prog_addr);
vx_mem_free(device, kernel_args_addr);
vx_mem_free(src0_buffer);
vx_mem_free(src1_buffer);
vx_mem_free(dst_buffer);
vx_mem_free(krnl_buffer);
vx_mem_free(args_buffer);
vx_dev_close(device);
}
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -72,7 +75,7 @@ int main(int argc, char *argv[]) {
}
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -80,27 +83,30 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
kernel_arg.num_tasks = num_tasks;
kernel_arg.num_tasks = total_threads;
kernel_arg.task_size = count;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<int32_t> h_src0(num_points);
std::vector<int32_t> h_src1(num_points);
@ -110,27 +116,27 @@ int main(int argc, char *argv[]) {
for (uint32_t i = 0; i < num_points; ++i) {
h_src0[i] = i-1;
h_src1[i] = i+1;
}
}
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -138,13 +144,13 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
int errors = 0;
std::cout << "verify result" << std::endl;
int errors = 0;
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int ref = i + i;
int cur = h_dst[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
@ -154,13 +160,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -23,12 +23,16 @@
// Kernel image passed to vx_upload_kernel_file in main().
const char* kernel_file = "kernel.vxbin";
// Per-thread multiplier: main() computes num_points = count * total_threads.
uint32_t count = 0;
// Start of the fixed I/O window that main() reserves with vx_mem_reserve and
// that gen_src_addrs() mixes into the generated address list. Defined once;
// a second definition of the same name does not compile.
static uint64_t io_base_addr = IO_MPM_ADDR + IO_CSR_SIZE;
// Device address of usr_test_buffer, queried with vx_mem_address.
uint64_t usr_test_addr;
// Device handle; opened in main() with vx_dev_open, closed in cleanup().
vx_device_h device = nullptr;
// Raw device addresses used by the address-based calls in main() and
// gen_src_addrs().
uint64_t usr_test_mem;
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
// Buffer handles; all are released in cleanup().
vx_buffer_h usr_test_buffer = nullptr;
vx_buffer_h io_test_buffer = nullptr;
vx_buffer_h src_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
// Host copy of the kernel arguments; uploaded with vx_upload_bytes.
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -60,11 +64,12 @@ static void parse_args(int argc, char **argv) {
// Releases every device buffer owned by this test and closes the device.
// Does nothing when the device was never opened.
void cleanup() {
  if (device) {
    // Each allocation (including the reserved I/O window) is released exactly
    // once, through its buffer handle.
    // NOTE(review): confirm vx_mem_free accepts a null handle, since buffers
    // that were never allocated are still nullptr here.
    vx_mem_free(usr_test_buffer);
    vx_mem_free(io_test_buffer);
    vx_mem_free(src_buffer);
    vx_mem_free(dst_buffer);
    vx_mem_free(krnl_buffer);
    vx_mem_free(args_buffer);
    vx_dev_close(device);
  }
}
@ -73,12 +78,12 @@ void gen_src_addrs(std::vector<uint64_t>& src_addrs, uint32_t size) {
src_addrs.resize(size);
uint32_t u = 0, k = 0;
for (uint32_t i = 0; i < size; ++i) {
if (0 ==(i % 4)) {
if (0 ==(i % 4)) {
k = (i + u) % NUM_ADDRS;
++u;
}
uint32_t j = i % NUM_ADDRS;
uint64_t a = ((j == k) ? usr_test_mem : io_base_addr) + j * sizeof(uint32_t);
uint32_t j = i % NUM_ADDRS;
uint64_t a = ((j == k) ? usr_test_addr : io_base_addr) + j * sizeof(uint32_t);
std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << a << std::endl;
src_addrs[i] = a;
}
@ -103,7 +108,7 @@ int main(int argc, char *argv[]) {
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -111,61 +116,65 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t src_buf_size = NUM_ADDRS * sizeof(int32_t);
uint32_t addr_buf_size = num_points * sizeof(uint64_t);
uint32_t addr_buf_size = NUM_ADDRS * sizeof(int32_t);
uint32_t src_buf_size = num_points * sizeof(uint64_t);
uint32_t dst_buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << std::dec << num_points << std::endl;
std::cout << "usr buffer size: " << src_buf_size << " bytes" << std::endl;
std::cout << "addr buffer size: " << addr_buf_size << " bytes" << std::endl;
std::cout << "src buffer size: " << src_buf_size << " bytes" << std::endl;
std::cout << "dst buffer size: " << dst_buf_size << " bytes" << std::endl;
kernel_arg.num_points = num_points;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &usr_test_mem));
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_READ, &usr_test_buffer));
RT_CHECK(vx_mem_address(usr_test_buffer, &usr_test_addr));
RT_CHECK(vx_mem_reserve(device, io_base_addr, addr_buf_size, VX_MEM_READ, &io_test_buffer));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_READ, &src_buffer));
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<uint64_t> h_addr;
std::vector<uint32_t> h_src(NUM_ADDRS);
std::vector<uint64_t> h_src;
std::vector<uint32_t> h_addr(NUM_ADDRS);
std::vector<int32_t> h_dst(num_points);
// generate source data
gen_src_addrs(h_addr, num_points);
gen_src_addrs(h_src, num_points);
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
h_src[i] = i * i;
h_addr[i] = i * i;
}
// upload user address data
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, h_src.data(), src_buf_size));
RT_CHECK(vx_copy_to_dev(device, io_base_addr, h_src.data(), src_buf_size));
RT_CHECK(vx_copy_to_dev(usr_test_buffer, h_addr.data(), 0, addr_buf_size));
RT_CHECK(vx_copy_to_dev(io_test_buffer, h_addr.data(), 0, addr_buf_size));
// upload source buffer
std::cout << "upload address buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_addr.data(), addr_buf_size));
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, src_buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -173,7 +182,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, dst_buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
@ -194,13 +203,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -28,10 +28,10 @@ public:
// Label for this element type.
static const char* type_str() { return "integer"; }
// Returns one pseudo-random integer operand taken straight from rand().
static int generate() {
  return rand();
}
static bool compare(int a, int b, int index, int errors) {
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
@ -39,7 +39,7 @@ public:
return false;
}
return true;
}
}
};
template <>
@ -50,10 +50,10 @@ public:
// Label for this element type.
static const char* type_str() { return "float"; }
// Returns one pseudo-random operand in [0, 1].
// The return type is float: returning int would truncate the quotient to
// 0 (or 1 when rand() == RAND_MAX) and make every generated input degenerate.
static float generate() {
  return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
@ -66,15 +66,18 @@ public:
return false;
}
return true;
}
}
};
// Kernel image passed to vx_upload_kernel_file in main().
const char* kernel_file = "kernel.vxbin";
// Per-thread multiplier: main() computes num_points = count * total_threads
// and stores count in kernel_arg.task_size.
uint32_t count = 16;
// Device handle; opened in main() with vx_dev_open, closed in cleanup().
vx_device_h device = nullptr;
// Raw device addresses filled in by the address-based
// vx_upload_kernel_file / vx_upload_bytes calls in main().
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
// Buffer handles for the two inputs, the output, the kernel image and the
// argument block; all are released in cleanup().
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
// Host copy of the kernel arguments; uploaded with vx_upload_bytes.
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -105,24 +108,24 @@ static void parse_args(int argc, char **argv) {
}
// Releases every device buffer owned by this test and closes the device.
// Does nothing when the device was never opened.
void cleanup() {
  if (device) {
    // Each allocation is released exactly once, through its buffer handle.
    // Handles that were never allocated are still nullptr here.
    // NOTE(review): confirm vx_mem_free accepts a null handle.
    vx_mem_free(src0_buffer);
    vx_mem_free(src1_buffer);
    vx_mem_free(dst_buffer);
    vx_mem_free(krnl_buffer);
    vx_mem_free(args_buffer);
    vx_dev_close(device);
  }
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -130,27 +133,30 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
kernel_arg.num_tasks = num_tasks;
kernel_arg.num_tasks = total_threads;
kernel_arg.task_size = count;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<TYPE> h_src0(num_points);
@ -165,23 +171,23 @@ int main(int argc, char *argv[]) {
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -189,7 +195,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
@ -203,13 +209,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -20,7 +20,7 @@
///////////////////////////////////////////////////////////////////////////////
union Float_t {
union Float_t {
float f;
int i;
struct {
@ -69,8 +69,11 @@ const char* kernel_file = "kernel.vxbin";
// Per-thread multiplier: main() computes num_points = count * total_threads
// and stores count in kernel_arg.stride.
uint32_t count = 0;
// Device handle; opened in main() with vx_dev_open, closed in cleanup().
vx_device_h device = nullptr;
// Raw device addresses filled in by the address-based
// vx_upload_kernel_file / vx_upload_bytes calls in main().
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
// Buffer handles: src0 holds the address table, src1 the source values,
// dst the results; krnl/args hold the kernel image and argument block.
// All are released in cleanup().
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
// Host copy of the kernel arguments; uploaded with vx_upload_bytes.
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -102,22 +105,22 @@ static void parse_args(int argc, char **argv) {
// Releases every device buffer owned by this test and closes the device.
// Does nothing when the device was never opened.
void cleanup() {
  if (device) {
    // Each allocation is released exactly once, through its buffer handle.
    // Handles that were never allocated are still nullptr here.
    // NOTE(review): confirm vx_mem_free accepts a null handle.
    vx_mem_free(src0_buffer);
    vx_mem_free(src1_buffer);
    vx_mem_free(dst_buffer);
    vx_mem_free(krnl_buffer);
    vx_mem_free(args_buffer);
    vx_dev_close(device);
  }
}
void gen_src_data(std::vector<float>& test_data,
void gen_src_data(std::vector<float>& test_data,
std::vector<uint32_t>& addr_table,
uint32_t num_points,
uint32_t num_addrs) {
test_data.resize(num_points);
addr_table.resize(num_addrs);
for (uint32_t i = 0; i < num_points; ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
test_data[i] = r;
@ -131,7 +134,7 @@ void gen_src_data(std::vector<float>& test_data,
}
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -142,7 +145,7 @@ int main(int argc, char *argv[]) {
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -150,12 +153,12 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t num_addrs = num_points + NUM_LOADS - 1;
uint32_t addr_buf_size = num_addrs * sizeof(int32_t);
uint32_t src_buf_size = num_points * sizeof(int32_t);
uint32_t src_buf_size = num_points * sizeof(int32_t);
uint32_t dst_buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
@ -163,45 +166,48 @@ int main(int argc, char *argv[]) {
std::cout << "src buffer size: " << src_buf_size << " bytes" << std::endl;
std::cout << "dst buffer size: " << dst_buf_size << " bytes" << std::endl;
kernel_arg.num_tasks = num_tasks;
kernel_arg.num_tasks = total_threads;
kernel_arg.stride = count;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_READ, &src0_buffer));
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_READ, &src1_buffer));
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<uint32_t> h_addr;
std::vector<float> h_src;
std::vector<float> h_dst(num_points);
gen_src_data(h_src, h_addr, num_points, num_addrs);
// upload source buffer0
std::cout << "upload address buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_addr.data(), addr_buf_size));
RT_CHECK(vx_copy_to_dev(src0_buffer, h_addr.data(), 0, addr_buf_size));
// upload source buffer1
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src.data(), src_buf_size));
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src.data(), 0, src_buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -209,7 +215,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, dst_buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
@ -223,7 +229,7 @@ int main(int argc, char *argv[]) {
//printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value);
ref *= value;
}
float cur = h_dst[i];
if (!almost_equal(cur, ref)) {
std::cout << "error at result #" << std::dec << i
@ -233,13 +239,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
return 1;
}
std::cout << "PASSED!" << std::endl;

View file

@ -21,8 +21,9 @@ const char* kernel_file = "kernel.vxbin";
// Per-thread multiplier: main() computes num_points = count * total_threads.
uint32_t count = 4;
// Device handle; opened in main() with vx_dev_open, closed in cleanup().
vx_device_h device = nullptr;
// Raw device addresses filled in by the address-based
// vx_upload_kernel_file / vx_upload_bytes calls in main().
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
// Buffer handles for the input, the kernel image and the argument block;
// all are released in cleanup().
vx_buffer_h src_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
// Host copy of the kernel arguments; uploaded with vx_upload_bytes.
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -54,14 +55,14 @@ static void parse_args(int argc, char **argv) {
// Releases every device buffer owned by this test and closes the device.
// Does nothing when the device was never opened.
void cleanup() {
  if (device) {
    // Each allocation is released exactly once, through its buffer handle.
    // Handles that were never allocated are still nullptr here.
    // NOTE(review): confirm vx_mem_free accepts a null handle.
    vx_mem_free(src_buffer);
    vx_mem_free(krnl_buffer);
    vx_mem_free(args_buffer);
    vx_dev_close(device);
  }
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -70,7 +71,7 @@ int main(int argc, char *argv[]) {
}
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -78,8 +79,8 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t buf_size = num_points * sizeof(char);
std::cout << "number of points: " << num_points << std::endl;
@ -89,41 +90,42 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<char> h_src(num_points);
// generate input data
for (uint32_t i = 0; i < num_points; ++i) {
h_src[i] = (char)i;
}
}
// upload source buffer0
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;

View file

@ -26,7 +26,7 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
for (int e = 0; e < size; ++e) {
sum += A[row * size + e] * B[e * size + col];
}
C[row * size + col] = sum;
}

View file

@ -30,10 +30,10 @@ public:
// Label for this element type.
static const char* type_str() { return "integer"; }
// Returns one pseudo-random integer operand taken straight from rand().
static int generate() {
  return rand();
}
static bool compare(int a, int b, int index, int errors) {
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
@ -41,7 +41,7 @@ public:
return false;
}
return true;
}
}
};
template <>
@ -50,10 +50,10 @@ public:
// Label for this element type.
static const char* type_str() { return "float"; }
// Returns one pseudo-random operand in [0, 1].
// The return type is float: returning int would truncate the quotient to
// 0 (or 1 when rand() == RAND_MAX) and make every generated input degenerate.
static float generate() {
  return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
@ -66,7 +66,7 @@ public:
return false;
}
return true;
}
}
};
static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
@ -85,8 +85,11 @@ const char* kernel_file = "kernel.vxbin";
// Matrix dimension: main() computes num_points = size * size and passes
// size as both width and height to matmul_cpu.
uint32_t size = 32;
// Device handle; opened in main() with vx_dev_open, closed in cleanup().
vx_device_h device = nullptr;
// Raw device addresses filled in by the address-based
// vx_upload_kernel_file / vx_upload_bytes calls in main().
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
// Buffer handles for matrices A and B (inputs), C (output), the kernel image
// and the argument block; all are released in cleanup().
vx_buffer_h A_buffer = nullptr;
vx_buffer_h B_buffer = nullptr;
vx_buffer_h C_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
// Host copy of the kernel arguments; uploaded with vx_upload_bytes.
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -117,24 +120,24 @@ static void parse_args(int argc, char **argv) {
}
// Releases every device buffer owned by this test and closes the device.
// Does nothing when the device was never opened.
void cleanup() {
  if (device) {
    // Each allocation is released exactly once, through its buffer handle.
    // Handles that were never allocated are still nullptr here.
    // NOTE(review): confirm vx_mem_free accepts a null handle.
    vx_mem_free(A_buffer);
    vx_mem_free(B_buffer);
    vx_mem_free(C_buffer);
    vx_mem_free(krnl_buffer);
    vx_mem_free(args_buffer);
    vx_dev_close(device);
  }
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = size * size;
@ -149,9 +152,12 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.A_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.B_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.C_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr));
std::cout << "dev_argA=0x" << std::hex << kernel_arg.A_addr << std::endl;
std::cout << "dev_argB=0x" << std::hex << kernel_arg.B_addr << std::endl;
@ -171,32 +177,32 @@ int main(int argc, char *argv[]) {
// upload matrix A buffer
{
std::cout << "upload matrix A buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, h_A.data(), buf_size));
RT_CHECK(vx_copy_to_dev(A_buffer, h_A.data(), 0, buf_size));
}
// upload matrix B buffer
{
std::cout << "upload matrix B buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, h_B.data(), buf_size));
RT_CHECK(vx_copy_to_dev(B_buffer, h_B.data(), 0, buf_size));
}
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
auto time_start = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
@ -204,7 +210,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_C.data(), kernel_arg.C_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_C.data(), C_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
@ -212,7 +218,7 @@ int main(int argc, char *argv[]) {
{
std::vector<TYPE> h_ref(num_points);
matmul_cpu(h_ref.data(), h_A.data(), h_B.data(), size, size);
for (uint32_t i = 0; i < h_ref.size(); ++i) {
if (!Comparator<TYPE>::compare(h_C[i], h_ref[i], i, errors)) {
++errors;
@ -221,13 +227,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -21,8 +21,10 @@ const char* kernel_file = "kernel.vxbin";
// Per-thread multiplier: main() computes num_points = count * total_threads
// and stores the product in kernel_arg.num_points.
uint32_t count = 0;
// Device handle; opened in main() with vx_dev_open, closed in cleanup().
vx_device_h device = nullptr;
// Raw device addresses released by the address-based vx_mem_free calls.
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
// Buffer handles for the input, the output, the kernel image and the
// argument block; all are released in cleanup().
vx_buffer_h src_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
// Host copy of the kernel arguments.
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -54,10 +56,10 @@ static void parse_args(int argc, char **argv) {
// Releases the device buffers created by main() and closes the device.
// Does nothing when the device was never opened. Only the handle-based
// vx_mem_free(buffer) form is used, so each allocation is released exactly once.
// NOTE(review): handles that were never allocated are still nullptr here;
// presumably vx_mem_free accepts a null handle - confirm against the runtime.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(src_buffer);
  vx_mem_free(dst_buffer);
  vx_mem_free(krnl_buffer);
  vx_mem_free(args_buffer);
  vx_dev_close(device);
}
@ -69,7 +71,7 @@ void gen_src_data(std::vector<TYPE>& src_data, uint32_t size) {
auto value = static_cast<TYPE>(r * size);
src_data[i] = value;
std::cout << std::dec << i << ": value=" << value << std::endl;
}
}
}
void gen_ref_data(std::vector<TYPE>& ref_data, const std::vector<TYPE>& src_data, uint32_t size) {
@ -85,7 +87,7 @@ void gen_ref_data(std::vector<TYPE>& ref_data, const std::vector<TYPE>& src_data
}
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@ -96,7 +98,7 @@ int main(int argc, char *argv[]) {
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
@ -104,44 +106,46 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t total_threads = num_cores * num_warps * num_threads;
uint32_t num_points = count * total_threads;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
kernel_arg.num_points = num_points;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<TYPE> h_src;
std::vector<TYPE> h_dst(num_points);
gen_src_data(h_src, num_points);
// upload source buffer
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
// upload kernel argument
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -149,10 +153,10 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
std::cout << "verify result" << std::endl;
int errors = 0;
{
std::vector<TYPE> h_ref;
@ -170,13 +174,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return errors;
return errors;
}
std::cout << "PASSED!" << std::endl;

View file

@ -28,10 +28,10 @@ public:
// Name of the element type; main() prints it on the "data type" line.
static const char* type_str() {
  static const char kTypeName[] = "integer";
  return kTypeName;
}
// Returns the next value of the rand() sequence (main() seeds it with std::srand(50)).
static int generate() {
  return rand();
}
static bool compare(int a, int b, int index, int errors) {
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
@ -39,7 +39,7 @@ public:
return false;
}
return true;
}
}
};
template <>
@ -50,10 +50,10 @@ public:
// Name of the element type; main() prints it on the "data type" line.
static const char* type_str() {
  static const char kTypeName[] = "float";
  return kTypeName;
}
// Returns a pseudo-random value in [0, 1] taken from the rand() sequence.
// The return type is float: returning int would truncate the quotient to 0
// for every draw except rand() == RAND_MAX, producing all-zero test inputs.
static float generate() {
  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
}
static bool compare(float a, float b, int index, int errors) {
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
@ -66,15 +66,18 @@ public:
return false;
}
return true;
}
}
};
// Kernel image path handed to vx_upload_kernel_file() in main().
const char* kernel_file = "kernel.vxbin";
// Number of elements per buffer; main() uses it directly as num_points.
uint32_t size = 16;
// Device handle filled in by vx_dev_open() in main() and closed by cleanup().
vx_device_h device = nullptr;
// Device addresses written by the address-based vx_upload_kernel_file / vx_upload_bytes calls.
uint64_t kernel_prog_addr;
uint64_t kernel_args_addr;
// Buffer handles: src0/src1/dst come from vx_mem_alloc (VX_MEM_READ / VX_MEM_WRITE),
// krnl/args from vx_upload_kernel_file / vx_upload_bytes. All are released in cleanup().
vx_buffer_h src0_buffer = nullptr;
vx_buffer_h src1_buffer = nullptr;
vx_buffer_h dst_buffer = nullptr;
vx_buffer_h krnl_buffer = nullptr;
vx_buffer_h args_buffer = nullptr;
// Argument block uploaded to the device; main() fills src0_addr, src1_addr and dst_addr.
kernel_arg_t kernel_arg = {};
static void show_usage() {
@ -105,30 +108,30 @@ static void parse_args(int argc, char **argv) {
}
// Releases the device buffers created by main() and closes the device.
// Does nothing when the device was never opened. Only the handle-based
// vx_mem_free(buffer) form is used, so each allocation is released exactly once.
// NOTE(review): handles that were never allocated are still nullptr here;
// presumably vx_mem_free accepts a null handle - confirm against the runtime.
void cleanup() {
  if (!device)
    return;
  vx_mem_free(src0_buffer);
  vx_mem_free(src1_buffer);
  vx_mem_free(dst_buffer);
  vx_mem_free(krnl_buffer);
  vx_mem_free(args_buffer);
  vx_dev_close(device);
}
int main(int argc, char *argv[]) {
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = size;
uint32_t num_points = size;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
@ -136,14 +139,17 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate host buffers
std::cout << "allocate host buffers" << std::endl;
std::vector<TYPE> h_src0(num_points);
@ -157,23 +163,23 @@ int main(int argc, char *argv[]) {
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
// wait for completion
std::cout << "wait for completion" << std::endl;
@ -181,7 +187,7 @@ int main(int argc, char *argv[]) {
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
// verify result
std::cout << "verify result" << std::endl;
@ -195,13 +201,13 @@ int main(int argc, char *argv[]) {
}
// cleanup
std::cout << "cleanup" << std::endl;
std::cout << "cleanup" << std::endl;
cleanup();
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
return 1;
}
std::cout << "PASSED!" << std::endl;