mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
runtime API refactoring to support memory reservation and protection
This commit is contained in:
parent
c554f53e44
commit
db0f0fd353
35 changed files with 3190 additions and 2081 deletions
|
@ -1,13 +1,13 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2019-2023
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -33,7 +33,7 @@ def get_vma_size(elf_file):
|
|||
max_vma = 0
|
||||
regex = re.compile(r'\s*LOAD\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)\s+(\w+)')
|
||||
|
||||
for line in output.splitlines():
|
||||
for line in output.splitlines():
|
||||
match = regex.match(line)
|
||||
if match:
|
||||
vma = int(match.group(2), 16)
|
||||
|
@ -44,15 +44,14 @@ def get_vma_size(elf_file):
|
|||
vma_size = max_vma - min_vma
|
||||
#print("vma={0:x}, size={1}, min_vma={2:x}, max_vma={3:x}, vma_size={4}".format(vma, size, min_vma, max_vma, vma_size))
|
||||
|
||||
total_vma_span = max_vma - min_vma
|
||||
return total_vma_span # Return the calculated size
|
||||
return min_vma, max_vma
|
||||
|
||||
except Exception as e:
|
||||
print("Failed to calculate vma size due to an error: {}".format(str(e)))
|
||||
sys.exit(-1)
|
||||
|
||||
def create_vxbin_binary(input_elf, output_bin, objcopy_path):
|
||||
vma_size = get_vma_size(input_elf)
|
||||
min_vma, max_vma = get_vma_size(input_elf)
|
||||
|
||||
# Create a binary data from the ELF file using objcopy
|
||||
temp_bin_path = '/tmp/temp_kernel.bin'
|
||||
|
@ -62,17 +61,19 @@ def create_vxbin_binary(input_elf, output_bin, objcopy_path):
|
|||
with open(temp_bin_path, 'rb') as temp_file:
|
||||
binary_data = temp_file.read()
|
||||
|
||||
# Pack size into 64-bit unsigned integer
|
||||
total_size_bytes = struct.pack('<Q', vma_size)
|
||||
# Pack addresses into 64-bit unsigned integer
|
||||
min_vma_bytes = struct.pack('<Q', min_vma)
|
||||
max_vma_bytes = struct.pack('<Q', max_vma)
|
||||
|
||||
# Write the total size and binary data to the final output file
|
||||
with open(output_bin, 'wb') as bin_file:
|
||||
bin_file.write(total_size_bytes)
|
||||
bin_file.write(min_vma_bytes)
|
||||
bin_file.write(max_vma_bytes)
|
||||
bin_file.write(binary_data)
|
||||
|
||||
# Remove the temporary binary file
|
||||
os.remove(temp_bin_path)
|
||||
print("Binary created successfully: {}, vma_size={}".format(output_bin, vma_size))
|
||||
print("Binary created successfully: {}, min_vma={:x}, max_vma={:x}".format(output_bin, min_vma, max_vma))
|
||||
|
||||
if __name__ == '__main__':
|
||||
if len(sys.argv) != 3:
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -24,14 +24,13 @@ public:
|
|||
MemoryAllocator(
|
||||
uint64_t baseAddress,
|
||||
uint64_t capacity,
|
||||
uint32_t pageAlign,
|
||||
uint32_t blockAlign)
|
||||
uint32_t pageAlign,
|
||||
uint32_t blockAlign)
|
||||
: baseAddress_(baseAddress)
|
||||
, capacity_(capacity)
|
||||
, pageAlign_(pageAlign)
|
||||
, blockAlign_(blockAlign)
|
||||
, pages_(nullptr)
|
||||
, nextAddress_(0)
|
||||
, allocated_(0)
|
||||
{}
|
||||
|
||||
|
@ -40,11 +39,11 @@ public:
|
|||
page_t* currPage = pages_;
|
||||
while (currPage) {
|
||||
auto nextPage = currPage->next;
|
||||
this->DeletePage(currPage);
|
||||
delete currPage;
|
||||
currPage = nextPage;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
uint32_t baseAddress() const {
|
||||
return baseAddress_;
|
||||
}
|
||||
|
@ -61,73 +60,80 @@ public:
|
|||
return allocated_;
|
||||
}
|
||||
|
||||
int allocate(uint64_t size, uint64_t* addr) {
|
||||
if (size == 0 || addr == nullptr) {
|
||||
printf("error: invalid argurments\n");
|
||||
int reserve(uint64_t addr, uint64_t size) {
|
||||
if (size == 0) {
|
||||
printf("error: invalid arguments\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Align allocation size
|
||||
size = AlignSize(size, blockAlign_);
|
||||
size = alignSize(size, pageAlign_);
|
||||
|
||||
// Check if the reservation is within memory capacity bounds
|
||||
if (addr < baseAddress_ || addr + size > baseAddress_ + capacity_) {
|
||||
printf("error: address range out of bounds\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Ensure the reservation does not overlap with existing pages
|
||||
if (hasPageOverlap(addr, size)) {
|
||||
printf("error: address range overlaps with existing allocation\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// allocate a new page for segment
|
||||
auto newPage = this->createPage(addr, size);
|
||||
|
||||
// allocate space on free block
|
||||
auto freeBlock = newPage->findFreeBlock(size);
|
||||
newPage->allocate(size, freeBlock);
|
||||
|
||||
// Update allocated size
|
||||
allocated_ += size;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int allocate(uint64_t size, uint64_t* addr) {
|
||||
if (size == 0 || addr == nullptr) {
|
||||
printf("error: invalid arguments\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Align allocation size
|
||||
size = alignSize(size, blockAlign_);
|
||||
|
||||
// Walk thru all pages to find a free block
|
||||
block_t* freeBlock = nullptr;
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
auto currBlock = currPage->freeSList;
|
||||
if (currBlock) {
|
||||
// The free S-list is already sorted with the largest block first
|
||||
// Quick check if the head block has enough space.
|
||||
if (currBlock->size >= size) {
|
||||
// Find the smallest matching block in the S-list
|
||||
while (currBlock->nextFreeS
|
||||
&& (currBlock->nextFreeS->size >= size)) {
|
||||
currBlock = currBlock->nextFreeS;
|
||||
}
|
||||
// Return the free block
|
||||
freeBlock = currBlock;
|
||||
break;
|
||||
}
|
||||
}
|
||||
freeBlock = currPage->findFreeBlock(size);
|
||||
if (freeBlock != nullptr)
|
||||
break;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
|
||||
if (nullptr == freeBlock) {
|
||||
// Allocate a new page for this request
|
||||
currPage = this->NewPage(size);
|
||||
// Allocate a new page if no free block is found
|
||||
if (freeBlock == nullptr) {
|
||||
auto pageSize = alignSize(size, pageAlign_);
|
||||
uint64_t pageAddr;
|
||||
if (!this->findNextAddress(pageSize, &pageAddr)) {
|
||||
printf("error: out of memory\n");
|
||||
return -1;
|
||||
}
|
||||
currPage = this->createPage(pageAddr, pageSize);
|
||||
if (nullptr == currPage) {
|
||||
printf("error: out of memory\n");
|
||||
return -1;
|
||||
}
|
||||
freeBlock = currPage->freeSList;
|
||||
}
|
||||
|
||||
// Remove the block from the free lists
|
||||
assert(freeBlock->size >= size);
|
||||
currPage->RemoveFreeMList(freeBlock);
|
||||
currPage->RemoveFreeSList(freeBlock);
|
||||
|
||||
// If the free block we have found is larger than what we are looking for,
|
||||
// we may be able to split our free block in two.
|
||||
uint64_t extraBytes = freeBlock->size - size;
|
||||
if (extraBytes >= blockAlign_) {
|
||||
// Reduce the free block size to the requested value
|
||||
freeBlock->size = size;
|
||||
|
||||
// Allocate a new block to contain the extra buffer
|
||||
auto nextAddr = freeBlock->addr + size;
|
||||
auto newBlock = new block_t(nextAddr, extraBytes);
|
||||
|
||||
// Add the new block to the free lists
|
||||
currPage->InsertFreeMList(newBlock);
|
||||
currPage->InsertFreeSList(newBlock);
|
||||
freeBlock = currPage->findFreeBlock(size);
|
||||
}
|
||||
|
||||
// Insert the free block into the used list
|
||||
currPage->InsertUsedList(freeBlock);
|
||||
// allocate space on free block
|
||||
currPage->allocate(size, freeBlock);
|
||||
|
||||
// Return the free block address
|
||||
*addr = baseAddress_ + freeBlock->addr;
|
||||
*addr = freeBlock->addr;
|
||||
|
||||
// Update allocated size
|
||||
allocated_ += size;
|
||||
|
@ -137,22 +143,12 @@ public:
|
|||
|
||||
int release(uint64_t addr) {
|
||||
// Walk all pages to find the pointer
|
||||
uint64_t local_addr = addr - baseAddress_;
|
||||
block_t* usedBlock = nullptr;
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
if (local_addr >= currPage->addr
|
||||
&& local_addr < (currPage->addr + currPage->size)) {
|
||||
auto currBlock = currPage->usedList;
|
||||
while (currBlock) {
|
||||
if (currBlock->addr == local_addr) {
|
||||
usedBlock = currBlock;
|
||||
break;
|
||||
}
|
||||
currBlock = currBlock->nextUsed;
|
||||
}
|
||||
usedBlock = currPage->findUsedBlock(addr);
|
||||
if (usedBlock != nullptr)
|
||||
break;
|
||||
}
|
||||
currPage = currPage->next;
|
||||
}
|
||||
|
||||
|
@ -164,65 +160,12 @@ public:
|
|||
|
||||
auto size = usedBlock->size;
|
||||
|
||||
// Remove the block from the used list
|
||||
currPage->RemoveUsedList(usedBlock);
|
||||
|
||||
// Insert the block into the free M-list.
|
||||
currPage->InsertFreeMList(usedBlock);
|
||||
|
||||
// Check if we can merge adjacent free blocks from the left.
|
||||
if (usedBlock->prevFreeM) {
|
||||
// Calculate the previous address
|
||||
auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
|
||||
if (usedBlock->addr == prevAddr) {
|
||||
auto prevBlock = usedBlock->prevFreeM;
|
||||
|
||||
// Merge the blocks to the left
|
||||
prevBlock->size += usedBlock->size;
|
||||
prevBlock->nextFreeM = usedBlock->nextFreeM;
|
||||
if (prevBlock->nextFreeM) {
|
||||
prevBlock->nextFreeM->prevFreeM = prevBlock;
|
||||
}
|
||||
|
||||
// Detach previous block from the free S-list since size increased
|
||||
currPage->RemoveFreeSList(prevBlock);
|
||||
|
||||
// reset usedBlock
|
||||
delete usedBlock;
|
||||
usedBlock = prevBlock;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we can merge adjacent free blocks from the right.
|
||||
if (usedBlock->nextFreeM) {
|
||||
// Calculate the next allocation start address
|
||||
auto nextAddr = usedBlock->addr + usedBlock->size;
|
||||
if (usedBlock->nextFreeM->addr == nextAddr) {
|
||||
auto nextBlock = usedBlock->nextFreeM;
|
||||
|
||||
// Merge the blocks to the right
|
||||
usedBlock->size += nextBlock->size;
|
||||
usedBlock->nextFreeM = nextBlock->nextFreeM;
|
||||
if (usedBlock->nextFreeM) {
|
||||
usedBlock->nextFreeM->prevFreeM = usedBlock;
|
||||
}
|
||||
|
||||
// Delete next block
|
||||
currPage->RemoveFreeSList(nextBlock);
|
||||
delete nextBlock;
|
||||
}
|
||||
}
|
||||
|
||||
// Insert the block into the free S-list.
|
||||
currPage->InsertFreeSList(usedBlock);
|
||||
|
||||
// Check if we can free empty pages
|
||||
if (nullptr == currPage->usedList) {
|
||||
// Try to delete the page
|
||||
while (currPage && this->DeletePage(currPage)) {
|
||||
currPage = this->FindNextEmptyPage();
|
||||
}
|
||||
// release the used block
|
||||
currPage->release(usedBlock);
|
||||
|
||||
// Free the page if empty
|
||||
if (currPage->empty()) {
|
||||
this->deletePage(currPage);
|
||||
}
|
||||
|
||||
// update allocated size
|
||||
|
@ -236,17 +179,17 @@ private:
|
|||
struct block_t {
|
||||
block_t* nextFreeS;
|
||||
block_t* prevFreeS;
|
||||
|
||||
|
||||
block_t* nextFreeM;
|
||||
block_t* prevFreeM;
|
||||
|
||||
|
||||
block_t* nextUsed;
|
||||
block_t* prevUsed;
|
||||
|
||||
uint64_t addr;
|
||||
uint64_t size;
|
||||
|
||||
block_t(uint64_t addr, uint64_t size)
|
||||
block_t(uint64_t addr, uint64_t size)
|
||||
: nextFreeS(nullptr)
|
||||
, prevFreeS(nullptr)
|
||||
, nextFreeM(nullptr)
|
||||
|
@ -259,43 +202,156 @@ private:
|
|||
};
|
||||
|
||||
struct page_t {
|
||||
page_t* next;
|
||||
|
||||
// List of used blocks
|
||||
block_t* usedList;
|
||||
|
||||
// List with blocks sorted by descreasing sizes
|
||||
// Used for block lookup during memory allocation.
|
||||
block_t* freeSList;
|
||||
|
||||
// List with blocks sorted by increasing memory addresses
|
||||
// Used for block merging during memory release.
|
||||
block_t* freeMList;
|
||||
|
||||
page_t* next;
|
||||
uint64_t addr;
|
||||
uint64_t size;
|
||||
|
||||
page_t(uint64_t addr, uint64_t size) :
|
||||
next(nullptr),
|
||||
usedList(nullptr),
|
||||
page_t(uint64_t addr, uint64_t size, uint32_t blockAlign) :
|
||||
next(nullptr),
|
||||
addr(addr),
|
||||
size(size) {
|
||||
freeSList = freeMList = new block_t(addr, size);
|
||||
size(size),
|
||||
blockAlign_(blockAlign),
|
||||
usedList_(nullptr) {
|
||||
freeSList_ = freeMList_ = new block_t(addr, size);
|
||||
}
|
||||
|
||||
void InsertUsedList(block_t* block) {
|
||||
block->nextUsed = usedList;
|
||||
if (usedList) {
|
||||
usedList->prevUsed = block;
|
||||
~page_t() {
|
||||
// The page should be empty
|
||||
assert(nullptr == usedList_);
|
||||
assert(freeMList_
|
||||
&& (nullptr == freeMList_->nextFreeM)
|
||||
&& (nullptr == freeMList_->prevFreeM));
|
||||
delete freeMList_;
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return (usedList_ == nullptr);
|
||||
}
|
||||
|
||||
void allocate(uint64_t size, block_t* freeBlock) {
|
||||
// Remove the block from the free lists
|
||||
this->removeFreeMList(freeBlock);
|
||||
this->removeFreeSList(freeBlock);
|
||||
|
||||
// If the free block we have found is larger than what we are looking for,
|
||||
// we may be able to split our free block in two.
|
||||
uint64_t extraBytes = freeBlock->size - size;
|
||||
if (extraBytes >= blockAlign_) {
|
||||
// Reduce the free block size to the requested value
|
||||
freeBlock->size = size;
|
||||
|
||||
// Allocate a new block to contain the extra buffer
|
||||
auto nextAddr = freeBlock->addr + size;
|
||||
auto newBlock = new block_t(nextAddr, extraBytes);
|
||||
|
||||
// Add the new block to the free lists
|
||||
this->insertFreeMList(newBlock);
|
||||
this->insertFreeSList(newBlock);
|
||||
}
|
||||
usedList = block;
|
||||
|
||||
// Insert the free block into the used list
|
||||
this->insertUsedList(freeBlock);
|
||||
}
|
||||
|
||||
void RemoveUsedList(block_t* block) {
|
||||
void release(block_t* usedBlock) {
|
||||
// Remove the block from the used list
|
||||
this->removeUsedList(usedBlock);
|
||||
|
||||
// Insert the block into the free M-list.
|
||||
this->insertFreeMList(usedBlock);
|
||||
|
||||
// Check if we can merge adjacent free blocks from the left.
|
||||
if (usedBlock->prevFreeM) {
|
||||
// Calculate the previous address
|
||||
auto prevAddr = usedBlock->prevFreeM->addr + usedBlock->prevFreeM->size;
|
||||
if (usedBlock->addr == prevAddr) {
|
||||
auto prevBlock = usedBlock->prevFreeM;
|
||||
|
||||
// Merge the blocks to the left
|
||||
prevBlock->size += usedBlock->size;
|
||||
prevBlock->nextFreeM = usedBlock->nextFreeM;
|
||||
if (prevBlock->nextFreeM) {
|
||||
prevBlock->nextFreeM->prevFreeM = prevBlock;
|
||||
}
|
||||
|
||||
// Detach previous block from the free S-list since size increased
|
||||
this->removeFreeSList(prevBlock);
|
||||
|
||||
// reset usedBlock
|
||||
delete usedBlock;
|
||||
usedBlock = prevBlock;
|
||||
}
|
||||
}
|
||||
|
||||
// Check if we can merge adjacent free blocks from the right.
|
||||
if (usedBlock->nextFreeM) {
|
||||
// Calculate the next allocation start address
|
||||
auto nextAddr = usedBlock->addr + usedBlock->size;
|
||||
if (usedBlock->nextFreeM->addr == nextAddr) {
|
||||
auto nextBlock = usedBlock->nextFreeM;
|
||||
|
||||
// Merge the blocks to the right
|
||||
usedBlock->size += nextBlock->size;
|
||||
usedBlock->nextFreeM = nextBlock->nextFreeM;
|
||||
if (usedBlock->nextFreeM) {
|
||||
usedBlock->nextFreeM->prevFreeM = usedBlock;
|
||||
}
|
||||
|
||||
// Delete next block
|
||||
this->removeFreeSList(nextBlock);
|
||||
delete nextBlock;
|
||||
}
|
||||
}
|
||||
|
||||
// Insert the block into the free S-list.
|
||||
this->insertFreeSList(usedBlock);
|
||||
}
|
||||
|
||||
// Looks up a free block of at least `size` bytes in this page.
// The size-ordered free list keeps the largest block at its head, so a head
// that is too small means nothing in the page fits. Otherwise the list is
// followed for as long as the next entry still fits, which yields the last
// (smallest) block that can hold the request.
// Returns nullptr when the page has no suitable free block.
block_t* findFreeBlock(uint64_t size) {
  block_t* candidate = freeSList_;
  if (candidate == nullptr || candidate->size < size)
    return nullptr;

  for (block_t* following = candidate->nextFreeS;
       following != nullptr && following->size >= size;
       following = candidate->nextFreeS) {
    candidate = following;
  }
  return candidate;
}
|
||||
|
||||
// Returns the used block that starts exactly at `addr`, or nullptr when the
// address lies outside this page or no used block begins there.
block_t* findUsedBlock(uint64_t addr) {
  // Skip the list walk entirely for addresses outside [addr, addr + size)
  const bool insidePage = (addr >= this->addr)
                       && (addr < (this->addr + this->size));
  if (!insidePage)
    return nullptr;

  for (block_t* blk = usedList_; blk != nullptr; blk = blk->nextUsed) {
    if (blk->addr == addr)
      return blk;
  }
  return nullptr;
}
|
||||
private:
|
||||
|
||||
void insertUsedList(block_t* block) {
|
||||
block->nextUsed = usedList_;
|
||||
if (usedList_) {
|
||||
usedList_->prevUsed = block;
|
||||
}
|
||||
usedList_ = block;
|
||||
}
|
||||
|
||||
void removeUsedList(block_t* block) {
|
||||
if (block->prevUsed) {
|
||||
block->prevUsed->nextUsed = block->nextUsed;
|
||||
} else {
|
||||
usedList = block->nextUsed;
|
||||
usedList_ = block->nextUsed;
|
||||
}
|
||||
if (block->nextUsed) {
|
||||
block->nextUsed->prevUsed = block->prevUsed;
|
||||
|
@ -304,8 +360,8 @@ private:
|
|||
block->prevUsed = nullptr;
|
||||
}
|
||||
|
||||
void InsertFreeMList(block_t* block) {
|
||||
block_t* currBlock = freeMList;
|
||||
void insertFreeMList(block_t* block) {
|
||||
block_t* currBlock = freeMList_;
|
||||
block_t* prevBlock = nullptr;
|
||||
while (currBlock && (currBlock->addr < block->addr)) {
|
||||
prevBlock = currBlock;
|
||||
|
@ -316,18 +372,18 @@ private:
|
|||
if (prevBlock) {
|
||||
prevBlock->nextFreeM = block;
|
||||
} else {
|
||||
freeMList = block;
|
||||
freeMList_ = block;
|
||||
}
|
||||
if (currBlock) {
|
||||
currBlock->prevFreeM = block;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void RemoveFreeMList(block_t* block) {
|
||||
void removeFreeMList(block_t* block) {
|
||||
if (block->prevFreeM) {
|
||||
block->prevFreeM->nextFreeM = block->nextFreeM;
|
||||
} else {
|
||||
freeMList = block->nextFreeM;
|
||||
freeMList_ = block->nextFreeM;
|
||||
}
|
||||
if (block->nextFreeM) {
|
||||
block->nextFreeM->prevFreeM = block->prevFreeM;
|
||||
|
@ -336,8 +392,8 @@ private:
|
|||
block->prevFreeM = nullptr;
|
||||
}
|
||||
|
||||
void InsertFreeSList(block_t* block) {
|
||||
block_t* currBlock = this->freeSList;
|
||||
void insertFreeSList(block_t* block) {
|
||||
block_t* currBlock = freeSList_;
|
||||
block_t* prevBlock = nullptr;
|
||||
while (currBlock && (currBlock->size > block->size)) {
|
||||
prevBlock = currBlock;
|
||||
|
@ -348,60 +404,62 @@ private:
|
|||
if (prevBlock) {
|
||||
prevBlock->nextFreeS = block;
|
||||
} else {
|
||||
this->freeSList = block;
|
||||
freeSList_ = block;
|
||||
}
|
||||
if (currBlock) {
|
||||
currBlock->prevFreeS = block;
|
||||
}
|
||||
}
|
||||
|
||||
void RemoveFreeSList(block_t* block) {
|
||||
void removeFreeSList(block_t* block) {
|
||||
if (block->prevFreeS) {
|
||||
block->prevFreeS->nextFreeS = block->nextFreeS;
|
||||
} else {
|
||||
freeSList = block->nextFreeS;
|
||||
freeSList_ = block->nextFreeS;
|
||||
}
|
||||
if (block->nextFreeS) {
|
||||
block->nextFreeS->prevFreeS = block->prevFreeS;
|
||||
}
|
||||
block->nextFreeS = nullptr;
|
||||
block->prevFreeS = nullptr;
|
||||
block->prevFreeS = nullptr;
|
||||
}
|
||||
|
||||
// block alignment
|
||||
uint32_t blockAlign_;
|
||||
|
||||
// List of used blocks
|
||||
block_t* usedList_;
|
||||
|
||||
// List with blocks sorted by decreasing sizes
|
||||
// Used for block lookup during memory allocation.
|
||||
block_t* freeSList_;
|
||||
|
||||
// List with blocks sorted by increasing memory addresses
|
||||
// Used for block merging during memory release.
|
||||
block_t* freeMList_;
|
||||
};
|
||||
|
||||
page_t* NewPage(uint64_t size) {
|
||||
// Increase buffer size to include the page and first block size
|
||||
// also add padding to ensure page alignment
|
||||
size = AlignSize(size, pageAlign_);
|
||||
|
||||
// Allocate page memory
|
||||
auto addr = nextAddress_;
|
||||
nextAddress_ += size;
|
||||
|
||||
// Overflow check
|
||||
if (nextAddress_ > capacity_)
|
||||
return nullptr;
|
||||
|
||||
// Creates a page for [addr, addr + size) and links it into pages_, which is
// kept ordered by increasing page address. Returns the new page.
page_t* createPage(uint64_t addr, uint64_t size) {
  // Allocate object
  page_t* const page = new page_t(addr, size, blockAlign_);

  // An empty list, or a head that starts above the new page, means the new
  // page becomes the head.
  const bool becomesHead = (pages_ == nullptr) || (pages_->addr > page->addr);
  if (becomesHead) {
    page->next = pages_;
    pages_ = page;
    return page;
  }

  // Otherwise advance to the last page whose successor still starts below
  // the new page, and splice the new page in right after it.
  page_t* pred = pages_;
  for (page_t* succ = pred->next;
       succ != nullptr && succ->addr < page->addr;
       succ = pred->next) {
    pred = succ;
  }
  page->next = pred->next;
  pred->next = page;

  return page;
}
|
||||
|
||||
bool DeletePage(page_t* page) {
|
||||
// The page should be empty
|
||||
assert(nullptr == page->usedList);
|
||||
assert(page->freeMList && (nullptr == page->freeMList->nextFreeM));
|
||||
|
||||
// Only delete top-level pages
|
||||
auto nextAddr = page->addr + page->size;
|
||||
if (nextAddr != nextAddress_)
|
||||
return false;
|
||||
|
||||
void deletePage(page_t* page) {
|
||||
// Remove the page from the list
|
||||
page_t* prevPage = nullptr;
|
||||
auto currPage = pages_;
|
||||
|
@ -417,36 +475,66 @@ private:
|
|||
prevPage = currPage;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
|
||||
// Update next allocation address
|
||||
nextAddress_ = page->addr;
|
||||
|
||||
// free object
|
||||
delete page->freeMList;
|
||||
// Delete the page
|
||||
delete page;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
page_t* FindNextEmptyPage() {
|
||||
auto currPage = pages_;
|
||||
while (currPage) {
|
||||
if (nullptr == currPage->usedList)
|
||||
return currPage;
|
||||
currPage = currPage->next;
|
||||
}
|
||||
return nullptr;
|
||||
bool findNextAddress(uint64_t size, uint64_t* addr) {
|
||||
if (pages_ == nullptr) {
|
||||
*addr = baseAddress_;
|
||||
return true;
|
||||
}
|
||||
|
||||
page_t* current = pages_;
|
||||
uint64_t endOfLastPage = baseAddress_;
|
||||
|
||||
while (current != nullptr) {
|
||||
uint64_t startOfCurrentPage = current->addr;
|
||||
if ((endOfLastPage + size) <= startOfCurrentPage) {
|
||||
*addr = endOfLastPage;
|
||||
return true;
|
||||
}
|
||||
// Update the end of the last page to the end of the current page
|
||||
// Move to the next page in the sorted list
|
||||
endOfLastPage = current->addr + current->size;
|
||||
current = current->next;
|
||||
}
|
||||
|
||||
// If no suitable gap is found, place the new page at the end of the last page
|
||||
// Check if the allocator has enough capacity
|
||||
if ((endOfLastPage + size) <= capacity_) {
|
||||
*addr = endOfLastPage;
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
static uint64_t AlignSize(uint64_t size, uint64_t alignment) {
|
||||
bool hasPageOverlap(uint64_t start, uint64_t size) {
|
||||
page_t* current = pages_;
|
||||
while (current != nullptr) {
|
||||
uint64_t pageStart = current->addr;
|
||||
uint64_t pageEnd = pageStart + current->size;
|
||||
uint64_t requestEnd = start + size;
|
||||
if ((start >= pageStart && start < pageEnd) || // Start of request is inside the page
|
||||
(requestEnd > pageStart && requestEnd <= pageEnd) || // End of request is inside the page
|
||||
(start <= pageStart && requestEnd >= pageEnd)) { // Request envelops the page
|
||||
return true;
|
||||
}
|
||||
current = current->next;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` must be a non-zero power of two; the mask arithmetic below is
// only meaningful in that case.
static uint64_t alignSize(uint64_t size, uint64_t alignment) {
  // Zero passes the bare "x & (x - 1)" power-of-two test, so reject it
  // explicitly; with alignment 0 the expression below would always yield 0.
  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
  return (size + alignment - 1) & ~(alignment - 1);
}
|
||||
|
||||
uint64_t baseAddress_;
|
||||
uint64_t capacity_;
|
||||
uint32_t pageAlign_;
|
||||
uint32_t blockAlign_;
|
||||
uint32_t pageAlign_;
|
||||
uint32_t blockAlign_;
|
||||
page_t* pages_;
|
||||
uint64_t nextAddress_;
|
||||
uint64_t allocated_;
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -29,7 +29,7 @@
|
|||
_cleanup \
|
||||
} while (false)
|
||||
|
||||
// Rounds `size` up to the next multiple of `alignment`.
// `alignment` must be a non-zero power of two; the mask arithmetic below is
// only meaningful in that case.
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
  // Zero passes the bare "x & (x - 1)" power-of-two test, so reject it
  // explicitly; with alignment 0 the expression below would always yield 0.
  assert(alignment != 0 && 0 == (alignment & (alignment - 1)));
  return (size + alignment - 1) & ~(alignment - 1);
}
|
||||
|
@ -68,7 +68,7 @@ public:
|
|||
int get_perf_class() const {
|
||||
return perf_class_;
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
std::list<vx_device_h> hdevices_;
|
||||
int perf_class_;
|
||||
|
@ -97,14 +97,15 @@ void perf_remove_device(vx_device_h hdevice) {
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void DeviceConfig::write(uint32_t addr, uint32_t value) {
|
||||
data_[addr] = value;
|
||||
store_[addr] = value;
|
||||
}
|
||||
|
||||
uint32_t DeviceConfig::read(uint32_t addr) const {
|
||||
if (0 == data_.count(addr)) {
|
||||
printf("Error: DeviceConfig::read(%d) failed\n", addr);
|
||||
}
|
||||
return data_.at(addr);
|
||||
int DeviceConfig::read(uint32_t addr, uint32_t* value) const {
|
||||
auto it = store_.find(addr);
|
||||
if (it == store_.end())
|
||||
return -1;
|
||||
*value = it->second;
|
||||
return 0;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -131,47 +132,58 @@ int dcr_initialize(vx_device_h hdevice) {
|
|||
RT_CHECK(vx_dcr_write(hdevice, VX_DCR_BASE_MPM_CLASS, 0), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr) {
|
||||
if (NULL == content || size <= 8 || NULL == addr)
|
||||
// Uploads a kernel image held in memory to the device.
// `content` starts with two 64-bit words, min_vma then max_vma, followed by the
// binary payload. A device buffer of (max_vma - min_vma) bytes is obtained
// (reserved at min_vma in debug builds, allocated otherwise), the first
// bin_size bytes are given VX_MEM_READ access and the remainder
// VX_MEM_READ_WRITE, and the payload is copied to offset 0.
// On success stores the buffer handle in *hbuffer and returns 0. Returns -1 on
// invalid arguments or a malformed header, or the failing call's status; the
// buffer is freed when a step after its creation fails.
extern int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer) {
  // The header is two 64-bit words; require at least one payload byte after
  // it so the header reads stay in bounds and size - header_size cannot wrap.
  const uint64_t header_size = 2 * sizeof(uint64_t);
  if (nullptr == hdevice || nullptr == content || size <= header_size || nullptr == hbuffer)
    return -1;

  auto bytes = reinterpret_cast<const uint64_t*>(content);

  auto min_vma = *bytes++;
  auto max_vma = *bytes++;
  auto bin_size = size - header_size;

  // Reject an inverted address range before the unsigned subtraction
  if (max_vma < min_vma)
    return -1;
  auto runtime_size = (max_vma - min_vma);

  // The payload must fit in the runtime image; otherwise
  // runtime_size - bin_size below would wrap.
  if (bin_size > runtime_size)
    return -1;

  vx_buffer_h _hbuffer;
#ifndef NDEBUG
  RT_CHECK(vx_mem_reserve(hdevice, min_vma, runtime_size, 0, &_hbuffer), {
    return _ret;
  });
#else
  RT_CHECK(vx_mem_alloc(hdevice, runtime_size, 0, &_hbuffer), {
    return _ret;
  });
#endif

  // Payload region: read access
  RT_CHECK(vx_mem_access(_hbuffer, 0, bin_size, VX_MEM_READ), {
    vx_mem_free(_hbuffer);
    return _ret;
  });

  // Rest of the runtime image: read/write access
  RT_CHECK(vx_mem_access(_hbuffer, bin_size, runtime_size - bin_size, VX_MEM_READ_WRITE), {
    vx_mem_free(_hbuffer);
    return _ret;
  });

  // `bytes` now points just past the header, i.e. at the payload
  RT_CHECK(vx_copy_to_dev(_hbuffer, bytes, 0, bin_size), {
    vx_mem_free(_hbuffer);
    return _ret;
  });

  *hbuffer = _hbuffer;

  return 0;
}
|
||||
|
||||
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr) {
|
||||
extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice || nullptr == filename || nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
std::ifstream ifs(filename);
|
||||
if (!ifs) {
|
||||
std::cout << "error: " << filename << " not found" << std::endl;
|
||||
|
@ -181,39 +193,42 @@ extern int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint
|
|||
// read file content
|
||||
ifs.seekg(0, ifs.end);
|
||||
auto size = ifs.tellg();
|
||||
std::vector<char> content(size);
|
||||
std::vector<char> content(size);
|
||||
ifs.seekg(0, ifs.beg);
|
||||
ifs.read(content.data(), size);
|
||||
|
||||
// upload buffer
|
||||
RT_CHECK(vx_upload_kernel_bytes(hdevice, content.data(), size, addr), {
|
||||
RT_CHECK(vx_upload_kernel_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr) {
|
||||
if (NULL == content || 0 == size || NULL == addr)
|
||||
// Allocates a device buffer of `size` bytes with VX_MEM_READ access and copies
// `content` into it at offset 0.
// On success stores the buffer handle in *hbuffer and returns 0. Returns -1 on
// invalid arguments, or the failing call's status; the buffer is freed when
// the copy fails.
extern int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer) {
  // Argument validation: handles and pointers first, then the payload size
  if (nullptr == hdevice || nullptr == content)
    return -1;
  if (0 == size || nullptr == hbuffer)
    return -1;

  vx_buffer_h staged;

  RT_CHECK(vx_mem_alloc(hdevice, size, VX_MEM_READ, &staged), {
    return _ret;
  });

  RT_CHECK(vx_copy_to_dev(staged, content, 0, size), {
    vx_mem_free(staged);
    return _ret;
  });

  // Hand the buffer to the caller only after the copy has succeeded
  *hbuffer = staged;

  return 0;
}
|
||||
|
||||
extern int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr) {
|
||||
extern int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice || nullptr == filename || nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
std::ifstream ifs(filename);
|
||||
if (!ifs) {
|
||||
std::cout << "error: " << filename << " not found" << std::endl;
|
||||
|
@ -223,29 +238,12 @@ extern int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* a
|
|||
// read file content
|
||||
ifs.seekg(0, ifs.end);
|
||||
auto size = ifs.tellg();
|
||||
std::vector<char> content(size);
|
||||
std::vector<char> content(size);
|
||||
ifs.seekg(0, ifs.beg);
|
||||
ifs.read(content.data(), size);
|
||||
|
||||
// upload buffer
|
||||
RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, addr), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_set_kernel_args(vx_device_h hdevice, const void* content, uint64_t size) {
|
||||
if (NULL == content || 0 == size)
|
||||
return -1;
|
||||
|
||||
uint64_t startup_arg;
|
||||
RT_CHECK(vx_mem_alloc(hdevice, size, &startup_arg), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(hdevice, startup_arg, content, size), {
|
||||
vx_mem_free(hdevice, startup_arg);
|
||||
RT_CHECK(vx_upload_bytes(hdevice, content.data(), size, hbuffer), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
|
@ -294,20 +292,20 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
uint64_t ifetch_lat = 0;
|
||||
uint64_t load_lat = 0;
|
||||
// PERF: l2cache
|
||||
uint64_t load_lat = 0;
|
||||
// PERF: l2cache
|
||||
uint64_t l2cache_reads = 0;
|
||||
uint64_t l2cache_writes = 0;
|
||||
uint64_t l2cache_read_misses = 0;
|
||||
uint64_t l2cache_write_misses = 0;
|
||||
uint64_t l2cache_bank_stalls = 0;
|
||||
uint64_t l2cache_bank_stalls = 0;
|
||||
uint64_t l2cache_mshr_stalls = 0;
|
||||
// PERF: l3cache
|
||||
// PERF: l3cache
|
||||
uint64_t l3cache_reads = 0;
|
||||
uint64_t l3cache_writes = 0;
|
||||
uint64_t l3cache_read_misses = 0;
|
||||
uint64_t l3cache_write_misses = 0;
|
||||
uint64_t l3cache_bank_stalls = 0;
|
||||
uint64_t l3cache_bank_stalls = 0;
|
||||
uint64_t l3cache_mshr_stalls = 0;
|
||||
// PERF: memory
|
||||
uint64_t mem_reads = 0;
|
||||
|
@ -332,28 +330,27 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
bool lmem_enable = isa_flags & VX_ISA_EXT_LMEM;
|
||||
#endif
|
||||
|
||||
std::vector<uint64_t> staging_buf(32);
|
||||
|
||||
auto get_mpm_csr = [&staging_buf](int csr_addr) {
|
||||
return staging_buf.at(csr_addr - VX_CSR_MPM_BASE);
|
||||
};
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
uint64_t mpm_mem_addr = IO_CSR_ADDR + core_id * staging_buf.size() * sizeof(uint64_t);
|
||||
RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size() * sizeof(uint64_t)), {
|
||||
uint64_t cycles_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MCYCLE, core_id, &cycles_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
uint64_t cycles_per_core = get_mpm_csr(VX_CSR_MCYCLE);
|
||||
uint64_t instrs_per_core = get_mpm_csr(VX_CSR_MINSTRET);
|
||||
uint64_t instrs_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MINSTRET, core_id, &instrs_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
// PERF: pipeline
|
||||
// PERF: pipeline
|
||||
// scheduler idles
|
||||
{
|
||||
uint64_t sched_idles_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ID);
|
||||
uint64_t sched_idles_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ID, core_id, &sched_idles_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int idles_percent_per_core = calcAvgPercent(sched_idles_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scheduler idle=%ld (%d%%)\n", core_id, sched_idles_per_core, idles_percent_per_core);
|
||||
|
@ -362,7 +359,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
// scheduler stalls
|
||||
{
|
||||
uint64_t sched_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCHED_ST);
|
||||
uint64_t sched_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCHED_ST, core_id, &sched_stalls_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int stalls_percent_per_core = calcAvgPercent(sched_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: scheduler stalls=%ld (%d%%)\n", core_id, sched_stalls_per_core, stalls_percent_per_core);
|
||||
|
@ -371,7 +371,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
// ibuffer_stalls
|
||||
{
|
||||
uint64_t ibuffer_stalls_per_core = get_mpm_csr(VX_CSR_MPM_IBUF_ST);
|
||||
uint64_t ibuffer_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IBUF_ST, core_id, &ibuffer_stalls_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int ibuffer_percent_per_core = calcAvgPercent(ibuffer_stalls_per_core, cycles_per_core);
|
||||
fprintf(stream, "PERF: core%d: ibuffer stalls=%ld (%d%%)\n", core_id, ibuffer_stalls_per_core, ibuffer_percent_per_core);
|
||||
|
@ -380,19 +383,34 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
// issue_stalls
|
||||
{
|
||||
uint64_t scrb_stalls_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ST);
|
||||
uint64_t scrb_alu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_ALU);
|
||||
uint64_t scrb_fpu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_FPU);
|
||||
uint64_t scrb_lsu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_LSU);
|
||||
uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU);
|
||||
uint64_t scrb_stalls_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ST, core_id, &scrb_stalls_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t scrb_alu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_ALU, core_id, &scrb_alu_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t scrb_fpu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_FPU, core_id, &scrb_fpu_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t scrb_lsu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_LSU, core_id, &scrb_lsu_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t scrb_sfu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
scrb_alu += scrb_alu_per_core;
|
||||
scrb_fpu += scrb_fpu_per_core;
|
||||
scrb_lsu += scrb_lsu_per_core;
|
||||
scrb_sfu += scrb_sfu_per_core;
|
||||
scrb_sfu += scrb_sfu_per_core;
|
||||
if (num_cores > 1) {
|
||||
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_sfu_per_core;
|
||||
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||
fprintf(stream, "PERF: core%d: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", core_id, scrb_stalls_per_core,
|
||||
calcAvgPercent(scrb_alu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_fpu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_lsu_per_core, scrb_total),
|
||||
calcAvgPercent(scrb_sfu_per_core, scrb_total));
|
||||
|
@ -401,14 +419,23 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
// sfu_stalls
|
||||
{
|
||||
uint64_t scrb_sfu_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_SFU);
|
||||
uint64_t scrb_wctl_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_WCTL);
|
||||
uint64_t scrb_csrs_per_core = get_mpm_csr(VX_CSR_MPM_SCRB_CSRS);
|
||||
uint64_t scrb_sfu_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_SFU, core_id, &scrb_sfu_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t scrb_wctl_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_WCTL, core_id, &scrb_wctl_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t scrb_csrs_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_SCRB_CSRS, core_id, &scrb_csrs_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
uint64_t sfu_total = scrb_wctl_per_core + scrb_csrs_per_core;
|
||||
fprintf(stream, "PERF: core%d: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||
, core_id
|
||||
, scrb_sfu_per_core
|
||||
, scrb_sfu_per_core
|
||||
, calcAvgPercent(scrb_csrs_per_core, sfu_total)
|
||||
, calcAvgPercent(scrb_wctl_per_core, sfu_total)
|
||||
);
|
||||
|
@ -419,11 +446,17 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
// PERF: memory
|
||||
// ifetches
|
||||
{
|
||||
uint64_t ifetches_per_core = get_mpm_csr(VX_CSR_MPM_IFETCHES);
|
||||
uint64_t ifetches_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCHES, core_id, &ifetches_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ifetches=%ld\n", core_id, ifetches_per_core);
|
||||
ifetches += ifetches_per_core;
|
||||
|
||||
uint64_t ifetch_lat_per_core = get_mpm_csr(VX_CSR_MPM_IFETCH_LT);
|
||||
uint64_t ifetch_lat_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_IFETCH_LT, core_id, &ifetch_lat_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(ifetch_lat_per_core, ifetches_per_core);
|
||||
fprintf(stream, "PERF: core%d: ifetch latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
|
@ -432,11 +465,17 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
// loads
|
||||
{
|
||||
uint64_t loads_per_core = get_mpm_csr(VX_CSR_MPM_LOADS);
|
||||
uint64_t loads_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOADS, core_id, &loads_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
|
||||
uint64_t load_lat_per_core = get_mpm_csr(VX_CSR_MPM_LOAD_LT);
|
||||
uint64_t load_lat_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LOAD_LT, core_id, &load_lat_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) {
|
||||
int mem_avg_lat = caclAverage(load_lat_per_core, loads_per_core);
|
||||
fprintf(stream, "PERF: core%d: load latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
|
@ -445,42 +484,78 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
}
|
||||
// stores
|
||||
{
|
||||
uint64_t stores_per_core = get_mpm_csr(VX_CSR_MPM_STORES);
|
||||
uint64_t stores_per_core;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_STORES, core_id, &stores_per_core), {
|
||||
return _ret;
|
||||
});
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
}
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
if (lmem_enable) {
|
||||
// PERF: lmem
|
||||
uint64_t lmem_reads = get_mpm_csr(VX_CSR_MPM_LMEM_READS);
|
||||
uint64_t lmem_writes = get_mpm_csr(VX_CSR_MPM_LMEM_WRITES);
|
||||
uint64_t lmem_bank_stalls = get_mpm_csr(VX_CSR_MPM_LMEM_BANK_ST);
|
||||
uint64_t lmem_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_READS, core_id, &lmem_reads), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t lmem_writes;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_WRITES, core_id, &lmem_writes), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t lmem_bank_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_LMEM_BANK_ST, core_id, &lmem_bank_stalls), {
|
||||
return _ret;
|
||||
});
|
||||
int lmem_bank_utilization = calcAvgPercent(lmem_reads + lmem_writes, lmem_reads + lmem_writes + lmem_bank_stalls);
|
||||
fprintf(stream, "PERF: core%d: lmem reads=%ld\n", core_id, lmem_reads);
|
||||
fprintf(stream, "PERF: core%d: lmem writes=%ld\n", core_id, lmem_writes);
|
||||
fprintf(stream, "PERF: core%d: lmem writes=%ld\n", core_id, lmem_writes);
|
||||
fprintf(stream, "PERF: core%d: lmem bank stalls=%ld (utilization=%d%%)\n", core_id, lmem_bank_stalls, lmem_bank_utilization);
|
||||
}
|
||||
|
||||
if (icache_enable) {
|
||||
// PERF: Icache
|
||||
uint64_t icache_reads = get_mpm_csr(VX_CSR_MPM_ICACHE_READS);
|
||||
uint64_t icache_read_misses = get_mpm_csr(VX_CSR_MPM_ICACHE_MISS_R);
|
||||
uint64_t icache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_ICACHE_MSHR_ST);
|
||||
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
|
||||
uint64_t icache_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_READS, core_id, &icache_reads), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t icache_read_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MISS_R, core_id, &icache_read_misses), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t icache_mshr_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_ICACHE_MSHR_ST, core_id, &icache_mshr_stalls), {
|
||||
return _ret;
|
||||
});
|
||||
int icache_read_hit_ratio = calcRatio(icache_read_misses, icache_reads);
|
||||
int mshr_utilization = calcAvgPercent(icache_read_misses, icache_read_misses + icache_mshr_stalls);
|
||||
fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads);
|
||||
fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_read_misses, icache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
|
||||
if (dcache_enable) {
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads = get_mpm_csr(VX_CSR_MPM_DCACHE_READS);
|
||||
uint64_t dcache_writes = get_mpm_csr(VX_CSR_MPM_DCACHE_WRITES);
|
||||
uint64_t dcache_read_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_R);
|
||||
uint64_t dcache_write_misses = get_mpm_csr(VX_CSR_MPM_DCACHE_MISS_W);
|
||||
uint64_t dcache_bank_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_BANK_ST);
|
||||
uint64_t dcache_reads;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_READS, core_id, &dcache_reads), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t dcache_writes;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t dcache_read_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t dcache_write_misses;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_W, core_id, &dcache_write_misses), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t dcache_bank_stalls;
|
||||
RT_CHECK(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_BANK_ST, core_id, &dcache_bank_stalls), {
|
||||
return _ret;
|
||||
});
|
||||
uint64_t dcache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_DCACHE_MSHR_ST);
|
||||
int dcache_read_hit_ratio = calcRatio(dcache_read_misses, dcache_reads);
|
||||
int dcache_write_hit_ratio = calcRatio(dcache_write_misses, dcache_writes);
|
||||
|
@ -489,7 +564,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads);
|
||||
fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes);
|
||||
fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_read_misses, dcache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_write_misses, dcache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_stalls, dcache_bank_utilization);
|
||||
fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
@ -504,7 +579,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
l2cache_mshr_stalls += get_mpm_csr(VX_CSR_MPM_L2CACHE_MSHR_ST);
|
||||
}
|
||||
|
||||
if (0 == core_id) {
|
||||
if (0 == core_id) {
|
||||
if (l3cache_enable) {
|
||||
// PERF: L3cache
|
||||
l3cache_reads = get_mpm_csr(VX_CSR_MPM_L3CACHE_READS);
|
||||
|
@ -514,7 +589,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
l3cache_bank_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_BANK_ST);
|
||||
l3cache_mshr_stalls = get_mpm_csr(VX_CSR_MPM_L3CACHE_MSHR_ST);
|
||||
}
|
||||
|
||||
|
||||
// PERF: memory
|
||||
mem_reads = get_mpm_csr(VX_CSR_MPM_MEM_READS);
|
||||
mem_writes = get_mpm_csr(VX_CSR_MPM_MEM_WRITES);
|
||||
|
@ -524,18 +599,18 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||
total_instrs += instrs_per_core;
|
||||
total_cycles += cycles_per_core;
|
||||
max_cycles = std::max<uint64_t>(cycles_per_core, max_cycles);
|
||||
}
|
||||
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
switch (perf_class) {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
case VX_DCR_MPM_CLASS_CORE: {
|
||||
int sched_idles_percent = calcAvgPercent(sched_idles, total_cycles);
|
||||
int sched_stalls_percent = calcAvgPercent(sched_stalls, total_cycles);
|
||||
int ibuffer_percent = calcAvgPercent(ibuffer_stalls, total_cycles);
|
||||
|
@ -547,22 +622,22 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
|
||||
fprintf(stream, "PERF: issue stalls=%ld (alu=%d%%, fpu=%d%%, lsu=%d%%, sfu=%d%%)\n", scrb_stalls,
|
||||
calcAvgPercent(scrb_alu, scrb_total),
|
||||
calcAvgPercent(scrb_alu, scrb_total),
|
||||
calcAvgPercent(scrb_fpu, scrb_total),
|
||||
calcAvgPercent(scrb_lsu, scrb_total),
|
||||
calcAvgPercent(scrb_sfu, scrb_total));
|
||||
calcAvgPercent(scrb_sfu, scrb_total));
|
||||
fprintf(stream, "PERF: sfu stalls=%ld (scrs=%d%%, wctl=%d%%)\n"
|
||||
, scrb_sfu
|
||||
, scrb_sfu
|
||||
, calcAvgPercent(scrb_csrs, sfu_total)
|
||||
, calcAvgPercent(scrb_wctl, sfu_total)
|
||||
);
|
||||
fprintf(stream, "PERF: ifetches=%ld\n", ifetches);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
fprintf(stream, "PERF: ifetch latency=%d cycles\n", ifetch_avg_lat);
|
||||
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
fprintf(stream, "PERF: load latency=%d cycles\n", load_avg_lat);
|
||||
} break;
|
||||
case VX_DCR_MPM_CLASS_MEM: {
|
||||
if (l2cache_enable) {
|
||||
l2cache_reads /= num_cores;
|
||||
l2cache_writes /= num_cores;
|
||||
|
@ -577,12 +652,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: l2cache reads=%ld\n", l2cache_reads);
|
||||
fprintf(stream, "PERF: l2cache writes=%ld\n", l2cache_writes);
|
||||
fprintf(stream, "PERF: l2cache read misses=%ld (hit ratio=%d%%)\n", l2cache_read_misses, read_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache write misses=%ld (hit ratio=%d%%)\n", l2cache_write_misses, write_hit_ratio);
|
||||
fprintf(stream, "PERF: l2cache bank stalls=%ld (utilization=%d%%)\n", l2cache_bank_stalls, bank_utilization);
|
||||
fprintf(stream, "PERF: l2cache mshr stalls=%ld (utilization=%d%%)\n", l2cache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
if (l3cache_enable) {
|
||||
if (l3cache_enable) {
|
||||
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
|
||||
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
|
||||
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
|
||||
|
@ -590,66 +665,24 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
|
|||
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
|
||||
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
|
||||
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache write misses=%ld (hit ratio=%d%%)\n", l3cache_write_misses, write_hit_ratio);
|
||||
fprintf(stream, "PERF: l3cache bank stalls=%ld (utilization=%d%%)\n", l3cache_bank_stalls, bank_utilization);
|
||||
fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization);
|
||||
}
|
||||
|
||||
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
|
||||
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
float IPC = (float)(double(total_instrs) / double(max_cycles));
|
||||
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
|
||||
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", total_instrs, max_cycles, IPC);
|
||||
|
||||
fflush(stream);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Reads performance counter `counter` from the per-core staging area that
// starts at IO_CSR_ADDR in device memory (32 64-bit entries per core).
//   core_id : core to read, or -1 to combine the counter across all cores.
//   value   : receives the result.
// When combining cores, VX_CSR_MCYCLE yields the maximum over the cores and
// every other counter yields the sum.
// Returns 0 on success, -1 when `value` is null or `core_id` is out of range,
// and otherwise the `_ret` value supplied by RT_CHECK for the failing call.
extern int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value) {
  if (nullptr == value)
    return -1;

  uint64_t num_cores;

  RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
    return _ret;
  });

  // -1 is the only valid negative id; anything lower would turn into a huge
  // unsigned loop bound below
  if (core_id < -1 || core_id >= (int)num_cores) {
    std::cout << "error: core_id out of range" << std::endl;
    return -1;
  }

  std::vector<uint64_t> staging_buf(32);

  uint64_t _value = 0;

  // half-open range of cores to visit: all of them, or just the selected one
  uint64_t first_core = 0;
  uint64_t last_core = num_cores;
  if (core_id != -1) {
    first_core = core_id;
    last_core = first_core + 1;
  }

  for (uint64_t i = first_core; i < last_core; ++i) {
    uint64_t mpm_mem_addr = IO_CSR_ADDR + i * staging_buf.size() * sizeof(uint64_t);
    RT_CHECK(vx_copy_from_dev(hdevice, staging_buf.data(), mpm_mem_addr, staging_buf.size() * sizeof(uint64_t)), {
      return _ret;
    });

    // at() throws std::out_of_range if the counter lies outside the staging window
    auto per_core_value = staging_buf.at(counter-VX_CSR_MPM_BASE);
    if (counter == VX_CSR_MCYCLE) {
      _value = std::max<uint64_t>(per_core_value, _value);
    } else {
      _value += per_core_value;
    }
  }

  // output
  *value = _value;

  return 0;
}
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -20,11 +20,11 @@
|
|||
#include <VX_types.h>
|
||||
|
||||
// Holds 32-bit configuration values keyed by 32-bit register address.
// NOTE(review): the member definitions are not visible from this header;
// the notes below describe the declarations only.
class DeviceConfig {
public:
  // Stores `value` under `addr` (presumably overwriting an earlier value - confirm).
  void write(uint32_t addr, uint32_t value);

  // Looks up `addr` and passes the result back through `value`; the int
  // return is a status code (presumably 0 on success - confirm).
  int read(uint32_t addr, uint32_t* value) const;

private:
  std::unordered_map<uint32_t, uint32_t> store_;
};
|
||||
|
||||
int dcr_initialize(vx_device_h device);
|
||||
|
@ -39,7 +39,6 @@ void perf_remove_device(vx_device_h device);
|
|||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR CACHE_BLOCK_SIZE
|
||||
#define ALLOC_MAX_ADDR STARTUP_ADDR
|
||||
#if (XLEN == 64)
|
||||
#define GLOBAL_MEM_SIZE 0x200000000 // 8 GB
|
||||
#else
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -23,9 +23,10 @@ extern "C" {
|
|||
#endif
|
||||
|
||||
typedef void* vx_device_h;
|
||||
typedef void* vx_buffer_h;
|
||||
|
||||
// device caps ids
|
||||
#define VX_CAPS_VERSION 0x0
|
||||
#define VX_CAPS_VERSION 0x0
|
||||
#define VX_CAPS_NUM_THREADS 0x1
|
||||
#define VX_CAPS_NUM_WARPS 0x2
|
||||
#define VX_CAPS_NUM_CORES 0x3
|
||||
|
@ -57,6 +58,11 @@ typedef void* vx_device_h;
|
|||
// ready wait timeout
|
||||
#define VX_MAX_TIMEOUT (24*60*60*1000) // 24 Hr
|
||||
|
||||
// device memory access
|
||||
#define VX_MEM_READ 0x1
|
||||
#define VX_MEM_WRITE 0x2
|
||||
#define VX_MEM_READ_WRITE 0x3
|
||||
|
||||
// open the device and connect to it
|
||||
int vx_dev_open(vx_device_h* hdevice);
|
||||
|
||||
|
@ -67,22 +73,31 @@ int vx_dev_close(vx_device_h hdevice);
|
|||
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
|
||||
|
||||
// allocate device memory and return a buffer handle
|
||||
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_addr);
|
||||
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer);
|
||||
|
||||
// reserve memory address range
|
||||
int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer);
|
||||
|
||||
// release device memory
|
||||
int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr);
|
||||
int vx_mem_free(vx_buffer_h hbuffer);
|
||||
|
||||
// set device memory access rights
|
||||
int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags);
|
||||
|
||||
// return device memory address
|
||||
int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address);
|
||||
|
||||
// get device memory info
|
||||
int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used);
|
||||
|
||||
// Copy bytes from host to device memory
|
||||
int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size);
|
||||
int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size);
|
||||
|
||||
// Copy bytes from device memory to host
|
||||
int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size);
|
||||
int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size);
|
||||
|
||||
// Start device execution
|
||||
int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr);
|
||||
int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments);
|
||||
|
||||
// Wait for device ready with milliseconds timeout
|
||||
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
|
||||
|
@ -93,23 +108,25 @@ int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value);
|
|||
// write device configuration registers
|
||||
int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value);
|
||||
|
||||
// query device performance counter
|
||||
int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value);
|
||||
|
||||
////////////////////////////// UTILITY FUNCTIONS //////////////////////////////
|
||||
|
||||
// upload kernel bytes to device
|
||||
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr);
|
||||
int vx_upload_kernel_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer);
|
||||
|
||||
// upload kernel file to device
|
||||
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, uint64_t* addr);
|
||||
int vx_upload_kernel_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer);
|
||||
|
||||
// upload bytes to device
|
||||
int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, uint64_t* addr);
|
||||
int vx_upload_bytes(vx_device_h hdevice, const void* content, uint64_t size, vx_buffer_h* hbuffer);
|
||||
|
||||
// upload file to device
|
||||
int vx_upload_file(vx_device_h hdevice, const char* filename, uint64_t* addr);
|
||||
int vx_upload_file(vx_device_h hdevice, const char* filename, vx_buffer_h* hbuffer);
|
||||
|
||||
// performance counters
|
||||
int vx_dump_perf(vx_device_h hdevice, FILE* stream);
|
||||
int vx_perf_counter(vx_device_h hdevice, int counter, int core_id, uint64_t* value);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -30,6 +30,8 @@
|
|||
#include <util.h>
|
||||
#include <processor.h>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
@ -38,37 +40,109 @@
|
|||
#define DBGPRINT(format, ...) ((void)0)
|
||||
#endif
|
||||
|
||||
using namespace vortex;
|
||||
// Evaluates `_expr` once. When the result is non-zero, prints a
// "[VXDRV] Error" line naming the expression and its result, then runs
// `_cleanup`. The result is visible to `_cleanup` as the local `err`.
// The do/while(false) wrapper lets the macro be used as a single statement.
#define CHECK_ERR(_expr, _cleanup) \
  do { \
    auto err = (_expr); \
    if (err == 0) \
      break; \
    printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err); \
    _cleanup \
  } while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
vx_device()
|
||||
: ram_(0, RAM_PAGE_SIZE)
|
||||
, global_mem_(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
, global_mem_(ALLOC_BASE_ADDR, GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE)
|
||||
{
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
// Waits for the task tracked by future_ to finish, if future_ is valid,
// before the device is destroyed.
~vx_device() {
  if (future_.valid()) {
    future_.wait();
  }
}
|
||||
|
||||
int mem_alloc(uint64_t size, uint64_t* dev_addr) {
|
||||
return global_mem_.allocate(size, dev_addr);
|
||||
int get_caps(uint32_t caps_id, uint64_t *value) {
|
||||
uint64_t _value;
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
_value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_NUM_THREADS:
|
||||
_value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_NUM_WARPS:
|
||||
_value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_GLOBAL_MEM_SIZE:
|
||||
_value = GLOBAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
_value = (1 << LMEM_LOG_SIZE);
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_ADDR:
|
||||
_value = LMEM_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_ISA_FLAGS:
|
||||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
*value = _value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
|
||||
uint64_t addr;
|
||||
CHECK_ERR(global_mem_.allocate(size, &addr), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(addr, size, flags), {
|
||||
global_mem_.release(addr);
|
||||
return err;
|
||||
});
|
||||
*dev_addr = addr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
|
||||
global_mem_.release(dev_addr);
|
||||
return err;
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
|
||||
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dev_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.set_acl(dev_addr, size, flags);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
|
@ -82,6 +156,10 @@ public:
|
|||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
|
||||
|
@ -90,8 +168,7 @@ public:
|
|||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -100,8 +177,10 @@ public:
|
|||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
|
||||
|
@ -110,21 +189,21 @@ public:
|
|||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
// set kernel info
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
|
@ -133,7 +212,7 @@ public:
|
|||
return 0;
|
||||
}
|
||||
|
||||
int wait(uint64_t timeout) {
|
||||
int ready_wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
|
@ -141,24 +220,24 @@ public:
|
|||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_dcr(uint32_t addr, uint32_t value) {
|
||||
int dcr_write(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.write_dcr(addr, value);
|
||||
}
|
||||
processor_.dcr_write(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t read_dcr(uint32_t addr) const {
|
||||
return dcrs_.read(addr);
|
||||
int dcr_read(uint32_t addr, uint32_t* value) const {
|
||||
return dcrs_.read(addr, value);
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -170,51 +249,14 @@ private:
|
|||
std::future<void> future_;
|
||||
};
|
||||
|
||||
// Host-side record behind a vx_buffer_h handle: it ties a range of device
// memory to the device it was created on. Instances are brace-initialized in
// field order {device, addr, size}, so the member order must not change.
struct vx_buffer {
|
||||
// device the range belongs to; the buffer APIs reach the device through it
vx_device* device;
|
||||
// start address of the range in device memory
uint64_t addr;
|
||||
// extent of the range as given at creation; used to bounds-check offset/size arguments
uint64_t size;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Reports one static device capability selected by `caps_id` (a VX_CAPS_* id).
//
// Returns 0 and stores the capability in *value on success, -1 when either
// pointer argument is null. An unknown id is reported on stdout and the
// process is aborted.
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  // reject a null output pointer as well: every case below writes through it
  if (nullptr == hdevice || nullptr == value)
    return -1;

  //vx_device *device = ((vx_device*)hdevice);

  switch (caps_id) {
  case VX_CAPS_VERSION:
    *value = IMPLEMENTATION_ID;
    break;
  case VX_CAPS_NUM_THREADS:
    *value = NUM_THREADS;
    break;
  case VX_CAPS_NUM_WARPS:
    *value = NUM_WARPS;
    break;
  case VX_CAPS_NUM_CORES:
    *value = NUM_CORES * NUM_CLUSTERS;
    break;
  case VX_CAPS_CACHE_LINE_SIZE:
    *value = CACHE_BLOCK_SIZE;
    break;
  case VX_CAPS_GLOBAL_MEM_SIZE:
    *value = GLOBAL_MEM_SIZE;
    break;
  case VX_CAPS_LOCAL_MEM_SIZE:
    // shift a 64-bit one so the result cannot overflow a 32-bit int
    *value = (uint64_t(1) << LMEM_LOG_SIZE);
    break;
  case VX_CAPS_LOCAL_MEM_ADDR:
    *value = LMEM_BASE_ADDR;
    break;
  case VX_CAPS_ISA_FLAGS:
    // widen the XLEN field before shifting it to bit 30 so it is never
    // evaluated (or sign-extended) as a signed 32-bit value
    *value = ((uint64_t(MISA_EXT))<<32) | (uint64_t(log2floor(XLEN)-4) << 30) | MISA_STD;
    break;
  default:
    std::cout << "invalid caps id: " << caps_id << std::endl;
    std::abort();
    return -1; // not reached
  }

  return 0;
}
|
||||
|
||||
extern int vx_dev_open(vx_device_h* hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
@ -233,6 +275,8 @@ extern int vx_dev_open(vx_device_h* hdevice) {
|
|||
perf_add_device(device);
|
||||
#endif
|
||||
|
||||
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
|
||||
|
||||
*hdevice = device;
|
||||
|
||||
return 0;
|
||||
|
@ -242,107 +286,228 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
DBGPRINT("DEV_CLOSE: hdevice=%p\n", hdevice);
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
perf_remove_device(hdevice);
|
||||
#endif
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
delete device;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_addr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_addr
|
||||
// Queries one device capability (`caps_id`, a VX_CAPS_* id) from the device
// behind `hdevice`.
//
// Returns 0 and stores the capability in *value on success, -1 when either
// pointer argument is null, otherwise the error reported by get_caps().
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
  // reject a null output pointer too: it is written unconditionally below
  if (nullptr == hdevice || nullptr == value)
    return -1;

  vx_device *device = ((vx_device*)hdevice);

  // query into a local so *value is only touched on success
  uint64_t _value;

  CHECK_ERR(device->get_caps(caps_id, &_value), {
    return err;
  });

  DBGPRINT("DEV_CAPS: hdevice=%p, caps_id=%d, value=%ld\n", hdevice, caps_id, _value);

  *value = _value;

  return 0;
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == hbuffer
|
||||
|| 0 == size)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("MEM_ALLOC: size=%ld\n", size);
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->mem_alloc(size, dev_addr);
|
||||
uint64_t dev_addr;
|
||||
CHECK_ERR(device->mem_alloc(size, flags, &dev_addr), {
|
||||
return err;
|
||||
});
|
||||
|
||||
auto buffer = new vx_buffer{device, dev_addr, size};
|
||||
if (nullptr == buffer) {
|
||||
device->mem_free(dev_addr);
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBGPRINT("MEM_ALLOC: hdevice=%p, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, size, flags, (void*)buffer);
|
||||
|
||||
*hbuffer = buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
|
||||
if (nullptr == hdevice)
|
||||
extern int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == hbuffer
|
||||
|| 0 == size)
|
||||
return -1;
|
||||
|
||||
if (0 == dev_addr)
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
CHECK_ERR(device->mem_reserve(address, size, flags), {
|
||||
return err;
|
||||
});
|
||||
|
||||
auto buffer = new vx_buffer{device, address, size};
|
||||
if (nullptr == buffer) {
|
||||
device->mem_free(address);
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBGPRINT("MEM_RESERVE: hdevice=%p, address=0x%lx, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, address, size, flags, (void*)buffer);
|
||||
|
||||
*hbuffer = buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return 0;
|
||||
|
||||
DBGPRINT("MEM_FREE: dev_addr=0x%lx\n", dev_addr);
|
||||
DBGPRINT("MEM_FREE: hbuffer=%p\n", hbuffer);
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->mem_free(dev_addr);
|
||||
auto buffer = ((vx_buffer*)hbuffer);
|
||||
auto device = ((vx_device*)buffer->device);
|
||||
|
||||
vx_mem_access(hbuffer, 0, buffer->size, 0);
|
||||
|
||||
int err = device->mem_free(buffer->addr);
|
||||
|
||||
delete buffer;
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
// Changes the access flags of the sub-range [offset, offset + size) of a
// device buffer.
//
// `flags` is forwarded unchanged to vx_device::mem_access(). Returns -1 for a
// null handle or a range that does not lie inside the buffer, otherwise the
// device's result.
extern int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
  if (nullptr == hbuffer)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);

  // Overflow-safe bounds check: `offset + size` can wrap around in uint64_t
  // and slip past a plain `> buffer->size` comparison.
  if (size > buffer->size || offset > (buffer->size - size))
    return -1;

  DBGPRINT("MEM_ACCESS: hbuffer=%p, offset=%ld, size=%ld, flags=%d\n", hbuffer, offset, size, flags);

  return device->mem_access(buffer->addr + offset, size, flags);
}
|
||||
|
||||
// Retrieves the device address that backs a buffer handle.
//
// Returns 0 and stores the address in *address on success, -1 when either
// pointer argument is null.
extern int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
  // reject a null output pointer too: it is written unconditionally below
  if (nullptr == hbuffer || nullptr == address)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);

  DBGPRINT("MEM_ADDRESS: hbuffer=%p, address=0x%lx\n", hbuffer, buffer->addr);

  *address = buffer->addr;

  return 0;
}
|
||||
|
||||
extern int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("%s\n", "MEM_INFO");
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
uint64_t _mem_free, _mem_used;
|
||||
|
||||
CHECK_ERR(device->mem_info(&_mem_free, &_mem_used), {
|
||||
return err;
|
||||
});
|
||||
|
||||
DBGPRINT("MEM_INFO: hdevice=%p, mem_free=%ld, mem_used=%ld\n", hdevice, _mem_free, _mem_used);
|
||||
|
||||
if (mem_free) {
|
||||
*mem_free = _mem_free;
|
||||
}
|
||||
|
||||
if (mem_used) {
|
||||
*mem_used = _mem_used;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Copies `size` units from host memory at `host_ptr` into a device buffer,
// starting `dst_offset` into the buffer.
//
// Returns -1 for a null argument or when the destination range does not lie
// inside the buffer, otherwise the result of vx_device::upload().
extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
  if (nullptr == hbuffer || nullptr == host_ptr)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);

  // Overflow-safe bounds check: `dst_offset + size` can wrap around in
  // uint64_t and slip past a plain `> buffer->size` comparison.
  if (size > buffer->size || dst_offset > (buffer->size - size))
    return -1;

  DBGPRINT("COPY_TO_DEV: hbuffer=%p, host_addr=%p, dst_offset=%ld, size=%ld\n", hbuffer, host_ptr, dst_offset, size);

  return device->upload(buffer->addr + dst_offset, host_ptr, size);
}
|
||||
|
||||
extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
|
||||
if (nullptr == hbuffer || nullptr == host_ptr)
|
||||
return -1;
|
||||
|
||||
auto buffer = ((vx_buffer*)hbuffer);
|
||||
auto device = ((vx_device*)buffer->device);
|
||||
|
||||
if ((src_offset + size) > buffer->size)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("COPY_FROM_DEV: hbuffer=%p, host_addr=%p, src_offset=%ld, size=%ld\n", hbuffer, host_ptr, src_offset, size);
|
||||
|
||||
return device->download(host_ptr, buffer->addr + src_offset, size);
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
|
||||
if (nullptr == hdevice || nullptr == hkernel || nullptr == harguments)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("START: hdevice=%p, hkernel=%p, harguments=%p\n", hdevice, hkernel, harguments);
|
||||
|
||||
auto device = ((vx_device*)hdevice);
|
||||
return device->mem_info(mem_free, mem_used);
|
||||
}
|
||||
auto kernel = ((vx_buffer*)hkernel);
|
||||
auto arguments = ((vx_buffer*)harguments);
|
||||
|
||||
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
auto device = (vx_device*)hdevice;
|
||||
|
||||
DBGPRINT("COPY_TO_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
|
||||
|
||||
return device->upload(dev_addr, host_ptr, size);
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_device_h hdevice, void* host_ptr, uint64_t dev_addr, uint64_t size) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
auto device = (vx_device*)hdevice;
|
||||
|
||||
DBGPRINT("COPY_FROM_DEV: dev_addr=0x%lx, host_addr=0x%p, size=%ld\n", dev_addr, host_ptr, size);
|
||||
|
||||
return device->download(host_ptr, dev_addr, size);
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->start(krnl_addr, args_addr);
|
||||
return device->start(kernel->addr, arguments->addr);
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("%s\n", "WAIT");
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->wait(timeout);
|
||||
DBGPRINT("READY_WAIT: hdevice=%p, timeout=%ld\n", hdevice, timeout);
|
||||
|
||||
auto device = ((vx_device*)hdevice);
|
||||
return device->ready_wait(timeout);
|
||||
}
|
||||
|
||||
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
|
||||
if (nullptr == hdevice || NULL == value)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(hdevice, -1) != 0)
|
||||
return -1;
|
||||
uint32_t _value;
|
||||
|
||||
*value = device->read_dcr(addr);
|
||||
CHECK_ERR(device->dcr_read(addr, &_value), {
|
||||
return err;
|
||||
});
|
||||
|
||||
DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);
|
||||
DBGPRINT("DCR_READ: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, _value);
|
||||
|
||||
*value = _value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -351,13 +516,34 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
|
|||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
DBGPRINT("DCR_WRITE: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, value);
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(hdevice, -1) != 0)
|
||||
return -1;
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
|
||||
return device->dcr_write(addr, value);
|
||||
}
|
||||
|
||||
return device->write_dcr(addr, value);
|
||||
// Reads one 64-bit performance counter of a core from device memory.
//
// `addr` is a counter address relative to VX_CSR_MPM_BASE; only the first 32
// slots are accepted. The counter is fetched from
// IO_MPM_ADDR + (core_id * 32 + slot) * sizeof(uint64_t).
// Returns 0 and stores the counter in *value on success, -1 for a null
// argument or an out-of-range `addr`, otherwise the download() error.
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
  // reject a null output pointer too: it is written unconditionally below
  if (nullptr == hdevice || nullptr == value)
    return -1;

  // an `addr` below the base wraps to a large unsigned offset and is rejected here as well
  uint32_t offset = addr - VX_CSR_MPM_BASE;
  if (offset > 31)
    return -1;

  auto device = ((vx_device*)hdevice);

  uint64_t mpm_mem_addr = IO_MPM_ADDR + (core_id * 32 + offset) * sizeof(uint64_t);

  // read into a local so *value is only touched on success
  uint64_t _value;

  CHECK_ERR(device->download(&_value, mpm_mem_addr, sizeof(uint64_t)), {
    return err;
  });

  DBGPRINT("MPM_QUERY: hdevice=%p, addr=0x%x, core_id=%d, value=0x%lx\n", hdevice, addr, core_id, _value);

  *value = _value;

  return 0;
}
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -33,27 +33,32 @@
|
|||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
// Debug trace helper: printf-style output prefixed with "[VXDRV] ".
// Release builds (NDEBUG defined) compile every use down to a no-op expression.
#ifdef NDEBUG
#define DBGPRINT(format, ...) ((void)0)
#else
#define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0)
#endif
|
||||
|
||||
using namespace vortex;
|
||||
// Evaluates `_expr` once; a zero result means success and nothing else happens.
// On a non-zero result the failing expression text and its value are printed,
// then the `_cleanup` block runs with the result available to it as `err`.
#define CHECK_ERR(_expr, _cleanup)                                      \
  do {                                                                  \
    auto err = _expr;                                                   \
    if (err != 0) {                                                     \
      printf("[VXDRV] Error: '%s' returned %d!\n", #_expr, (int)err);   \
      _cleanup                                                          \
    }                                                                   \
  } while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
vx_device()
|
||||
: arch_(NUM_THREADS, NUM_WARPS, NUM_CORES)
|
||||
, ram_(0, RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, global_mem_(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_MAX_ADDR - ALLOC_BASE_ADDR,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
, global_mem_(ALLOC_BASE_ADDR, GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, RAM_PAGE_SIZE, CACHE_BLOCK_SIZE)
|
||||
{
|
||||
// attach memory module
|
||||
processor_.attach_ram(&ram_);
|
||||
|
@ -63,16 +68,84 @@ public:
|
|||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, uint64_t* dev_addr) {
|
||||
return global_mem_.allocate(size, dev_addr);
|
||||
int get_caps(uint32_t caps_id, uint64_t *value) {
|
||||
uint64_t _value;
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
_value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_NUM_THREADS:
|
||||
_value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_NUM_WARPS:
|
||||
_value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_NUM_CORES:
|
||||
_value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
_value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_GLOBAL_MEM_SIZE:
|
||||
_value = GLOBAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
_value = (1 << LMEM_LOG_SIZE);
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_ADDR:
|
||||
_value = LMEM_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_ISA_FLAGS:
|
||||
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
*value = _value;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) {
|
||||
uint64_t addr;
|
||||
CHECK_ERR(global_mem_.allocate(size, &addr), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(addr, size, flags), {
|
||||
global_mem_.release(addr);
|
||||
return err;
|
||||
});
|
||||
*dev_addr = addr;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
CHECK_ERR(global_mem_.reserve(dev_addr, size), {
|
||||
return err;
|
||||
});
|
||||
CHECK_ERR(this->mem_access(dev_addr, size, flags), {
|
||||
global_mem_.release(dev_addr);
|
||||
return err;
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_free(uint64_t dev_addr) {
|
||||
return global_mem_.release(dev_addr);
|
||||
}
|
||||
|
||||
int mem_access(uint64_t dev_addr, uint64_t size, int flags) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dev_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.set_acl(dev_addr, size, flags);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int mem_info(uint64_t* mem_free, uint64_t* mem_used) const {
|
||||
if (mem_free)
|
||||
*mem_free = global_mem_.free();
|
||||
|
@ -86,13 +159,15 @@ public:
|
|||
if (dest_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.write((const uint8_t*)src, dest_addr, size);
|
||||
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + i));
|
||||
}*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -101,37 +176,39 @@ public:
|
|||
if (src_addr + asize > GLOBAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.enable_acl(false);
|
||||
ram_.read((uint8_t*)dest, src_addr, size);
|
||||
|
||||
ram_.enable_acl(true);
|
||||
|
||||
/*DBGPRINT("download %ld bytes from 0x%lx\n", size, src_addr);
|
||||
for (uint64_t i = 0; i < size && i < 1024; i += 4) {
|
||||
DBGPRINT(" 0x%lx -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + i));
|
||||
}*/
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
int start(uint64_t krnl_addr, uint64_t args_addr) {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
// set kernel info
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->write_dcr(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR0, krnl_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ADDR1, krnl_addr >> 32);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG0, args_addr & 0xffffffff);
|
||||
this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32);
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wait(uint64_t timeout) {
|
||||
int ready_wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
|
@ -139,24 +216,24 @@ public:
|
|||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int write_dcr(uint32_t addr, uint32_t value) {
|
||||
int dcr_write(uint32_t addr, uint32_t value) {
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
}
|
||||
processor_.write_dcr(addr, value);
|
||||
}
|
||||
processor_.dcr_write(addr, value);
|
||||
dcrs_.write(addr, value);
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint64_t read_dcr(uint32_t addr) const {
|
||||
return dcrs_.read(addr);
|
||||
int dcr_read(uint32_t addr, uint32_t* value) const {
|
||||
return dcrs_.read(addr, value);
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -168,6 +245,12 @@ private:
|
|||
std::future<void> future_;
|
||||
};
|
||||
|
||||
// Host-side record behind a vx_buffer_h handle: it ties a range of device
// memory to the device it was created on. Instances are brace-initialized in
// field order {device, addr, size}, so the member order must not change.
struct vx_buffer {
|
||||
// device the range belongs to; the buffer APIs reach the device through it
vx_device* device;
|
||||
// start address of the range in device memory
uint64_t addr;
|
||||
// extent of the range as given at creation; used to bounds-check offset/size arguments
uint64_t size;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_dev_open(vx_device_h* hdevice) {
|
||||
|
@ -186,12 +269,12 @@ extern int vx_dev_open(vx_device_h* hdevice) {
|
|||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
perf_add_device(device);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
DBGPRINT("DEV_OPEN: hdevice=%p\n", (void*)device);
|
||||
|
||||
*hdevice = device;
|
||||
|
||||
DBGPRINT("device creation complete!\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -199,7 +282,9 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
DBGPRINT("DEV_CLOSE: hdevice=%p\n", hdevice);
|
||||
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
perf_remove_device(hdevice);
|
||||
|
@ -207,144 +292,218 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
|
||||
delete device;
|
||||
|
||||
DBGPRINT("device destroyed!\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
//vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
*value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_NUM_THREADS:
|
||||
*value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_NUM_WARPS:
|
||||
*value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_NUM_CORES:
|
||||
*value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_GLOBAL_MEM_SIZE:
|
||||
*value = GLOBAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
*value = (1 << LMEM_LOG_SIZE);
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_ADDR:
|
||||
*value = LMEM_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_ISA_FLAGS:
|
||||
*value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
uint64_t _value;
|
||||
|
||||
CHECK_ERR(device->get_caps(caps_id, &_value), {
|
||||
return err;
|
||||
});
|
||||
|
||||
DBGPRINT("DEV_CAPS: hdevice=%p, caps_id=%d, value=%ld\n", hdevice, caps_id, _value);
|
||||
|
||||
*value = _value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_addr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_addr
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == hbuffer
|
||||
|| 0 == size)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("MEM_ALLOC: size=%ld\n", size);
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->mem_alloc(size, dev_addr);
|
||||
uint64_t dev_addr;
|
||||
CHECK_ERR(device->mem_alloc(size, flags, &dev_addr), {
|
||||
return err;
|
||||
});
|
||||
|
||||
auto buffer = new vx_buffer{device, dev_addr, size};
|
||||
if (nullptr == buffer) {
|
||||
device->mem_free(dev_addr);
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBGPRINT("MEM_ALLOC: hdevice=%p, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, size, flags, (void*)buffer);
|
||||
|
||||
*hbuffer = buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_addr) {
|
||||
if (nullptr == hdevice)
|
||||
extern int vx_mem_reserve(vx_device_h hdevice, uint64_t address, uint64_t size, int flags, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == hbuffer
|
||||
|| 0 == size)
|
||||
return -1;
|
||||
|
||||
if (0 == dev_addr)
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
CHECK_ERR(device->mem_reserve(address, size, flags), {
|
||||
return err;
|
||||
});
|
||||
|
||||
auto buffer = new vx_buffer{device, address, size};
|
||||
if (nullptr == buffer) {
|
||||
device->mem_free(address);
|
||||
return -1;
|
||||
}
|
||||
|
||||
DBGPRINT("MEM_RESERVE: hdevice=%p, address=0x%lx, size=%ld, flags=0x%d, hbuffer=%p\n", hdevice, address, size, flags, (void*)buffer);
|
||||
|
||||
*hbuffer = buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return 0;
|
||||
|
||||
DBGPRINT("MEM_FREE: dev_addr=0x%lx\n", dev_addr);
|
||||
DBGPRINT("MEM_FREE: hbuffer=%p\n", hbuffer);
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->mem_free(dev_addr);
|
||||
auto buffer = ((vx_buffer*)hbuffer);
|
||||
auto device = ((vx_device*)buffer->device);
|
||||
|
||||
vx_mem_access(hbuffer, 0, buffer->size, 0);
|
||||
|
||||
int err = device->mem_free(buffer->addr);
|
||||
|
||||
delete buffer;
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
// Changes the access flags of the sub-range [offset, offset + size) of a
// device buffer.
//
// `flags` is forwarded unchanged to vx_device::mem_access(). Returns -1 for a
// null handle or a range that does not lie inside the buffer, otherwise the
// device's result.
extern int vx_mem_access(vx_buffer_h hbuffer, uint64_t offset, uint64_t size, int flags) {
  if (nullptr == hbuffer)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);

  // Overflow-safe bounds check: `offset + size` can wrap around in uint64_t
  // and slip past a plain `> buffer->size` comparison.
  if (size > buffer->size || offset > (buffer->size - size))
    return -1;

  DBGPRINT("MEM_ACCESS: hbuffer=%p, offset=%ld, size=%ld, flags=%d\n", hbuffer, offset, size, flags);

  return device->mem_access(buffer->addr + offset, size, flags);
}
|
||||
|
||||
// Retrieves the device address that backs a buffer handle.
//
// Returns 0 and stores the address in *address on success, -1 when either
// pointer argument is null.
extern int vx_mem_address(vx_buffer_h hbuffer, uint64_t* address) {
  // reject a null output pointer too: it is written unconditionally below
  if (nullptr == hbuffer || nullptr == address)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);

  DBGPRINT("MEM_ADDRESS: hbuffer=%p, address=0x%lx\n", hbuffer, buffer->addr);

  *address = buffer->addr;

  return 0;
}
|
||||
|
||||
extern int vx_mem_info(vx_device_h hdevice, uint64_t* mem_free, uint64_t* mem_used) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("%s\n", "MEM_INFO");
|
||||
|
||||
auto device = ((vx_device*)hdevice);
|
||||
return device->mem_info(mem_free, mem_used);
|
||||
|
||||
uint64_t _mem_free, _mem_used;
|
||||
|
||||
CHECK_ERR(device->mem_info(&_mem_free, &_mem_used), {
|
||||
return err;
|
||||
});
|
||||
|
||||
DBGPRINT("MEM_INFO: hdevice=%p, mem_free=%ld, mem_used=%ld\n", hdevice, _mem_free, _mem_used);
|
||||
|
||||
if (mem_free) {
|
||||
*mem_free = _mem_free;
|
||||
}
|
||||
|
||||
if (mem_used) {
|
||||
*mem_used = _mem_used;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_device_h hdevice, uint64_t dev_addr, const void* host_ptr, uint64_t size) {
|
||||
if (nullptr == hdevice)
|
||||
// Copies `size` bytes from host memory into a buffer, starting `dst_offset`
// bytes past the buffer's address.
//   hbuffer:    destination buffer handle; must not be null.
//   host_ptr:   source host memory; must not be null.
//   dst_offset: byte offset inside the buffer.
//   size:       number of bytes to copy.
// Returns -1 for null arguments or when [dst_offset, dst_offset + size) does
// not fit inside the buffer; otherwise the result of device->upload().
extern int vx_copy_to_dev(vx_buffer_h hbuffer, const void* host_ptr, uint64_t dst_offset, uint64_t size) {
  if (nullptr == hbuffer || nullptr == host_ptr)
    return -1;

  auto buffer = ((vx_buffer*)hbuffer);
  auto device = ((vx_device*)buffer->device);

  // Bounds check written so it cannot wrap: `dst_offset + size` may overflow
  // 64 bits and slip past a naive `> buffer->size` comparison.
  if (size > buffer->size || dst_offset > (buffer->size - size))
    return -1;

  DBGPRINT("COPY_TO_DEV: hbuffer=%p, host_addr=%p, dst_offset=%ld, size=%ld\n", hbuffer, host_ptr, dst_offset, size);

  return device->upload(buffer->addr + dst_offset, host_ptr, size);
}
|
||||
|
||||
extern int vx_start(vx_device_h hdevice, uint64_t krnl_addr, uint64_t args_addr) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("START: krnl_addr=0x%lx, args_addr=0x%lx\n", krnl_addr, args_addr);
|
||||
extern int vx_copy_from_dev(void* host_ptr, vx_buffer_h hbuffer, uint64_t src_offset, uint64_t size) {
|
||||
if (nullptr == hbuffer || nullptr == host_ptr)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->start(krnl_addr, args_addr);
|
||||
auto buffer = ((vx_buffer*)hbuffer);
|
||||
auto device = ((vx_device*)buffer->device);
|
||||
|
||||
if ((src_offset + size) > buffer->size)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("COPY_FROM_DEV: hbuffer=%p, host_addr=%p, src_offset=%ld, size=%ld\n", hbuffer, host_ptr, src_offset, size);
|
||||
|
||||
return device->download(host_ptr, buffer->addr + src_offset, size);
|
||||
}
|
||||
|
||||
// Starts execution on the device using the addresses stored in the kernel
// and argument buffers. Returns -1 if any handle is null; otherwise the
// result of device->start().
extern int vx_start(vx_device_h hdevice, vx_buffer_h hkernel, vx_buffer_h harguments) {
  if (nullptr == hdevice || nullptr == hkernel || nullptr == harguments) {
    return -1;
  }

  DBGPRINT("START: hdevice=%p, hkernel=%p, harguments=%p\n", hdevice, hkernel, harguments);

  // Resolve both buffer handles first, then hand their addresses to the device.
  auto krnl_buf = ((vx_buffer*)hkernel);
  auto args_buf = ((vx_buffer*)harguments);
  auto dev      = ((vx_device*)hdevice);

  return dev->start(krnl_buf->addr, args_buf->addr);
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("%s\n", "WAIT");
|
||||
DBGPRINT("READY_WAIT: hdevice=%p, timeout=%ld\n", hdevice, timeout);
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->wait(timeout);
|
||||
auto device = ((vx_device*)hdevice);
|
||||
return device->ready_wait(timeout);
|
||||
}
|
||||
|
||||
extern int vx_dcr_read(vx_device_h hdevice, uint32_t addr, uint32_t* value) {
|
||||
if (nullptr == hdevice || NULL == value)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(hdevice, -1) != 0)
|
||||
return -1;
|
||||
uint32_t _value;
|
||||
|
||||
*value = device->read_dcr(addr);
|
||||
CHECK_ERR(device->dcr_read(addr, &_value), {
|
||||
return err;
|
||||
});
|
||||
|
||||
DBGPRINT("DCR_READ: addr=0x%x, value=0x%x\n", addr, *value);
|
||||
DBGPRINT("DCR_READ: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, _value);
|
||||
|
||||
*value = _value;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -353,13 +512,34 @@ extern int vx_dcr_write(vx_device_h hdevice, uint32_t addr, uint32_t value) {
|
|||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
DBGPRINT("DCR_WRITE: hdevice=%p, addr=0x%x, value=0x%x\n", hdevice, addr, value);
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(hdevice, -1) != 0)
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
return device->dcr_write(addr, value);
|
||||
}
|
||||
|
||||
extern int vx_mpm_query(vx_device_h hdevice, uint32_t addr, uint32_t core_id, uint64_t* value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
DBGPRINT("DCR_WRITE: addr=0x%x, value=0x%x\n", addr, value);
|
||||
|
||||
return device->write_dcr(addr, value);
|
||||
}
|
||||
uint32_t offset = addr - VX_CSR_MPM_BASE;
|
||||
if (offset > 31)
|
||||
return -1;
|
||||
|
||||
auto device = ((vx_device*)hdevice);
|
||||
|
||||
uint64_t mpm_mem_addr = IO_MPM_ADDR + (core_id * 32 + offset) * sizeof(uint64_t);
|
||||
|
||||
uint64_t _value;
|
||||
|
||||
CHECK_ERR(device->download(&_value, mpm_mem_addr, sizeof(uint64_t)), {
|
||||
return err;
|
||||
});
|
||||
|
||||
DBGPRINT("MPM_QUERY: hdevice=%p, addr=0x%x, core_id=%d, value=0x%lx\n", hdevice, addr, core_id, _value);
|
||||
|
||||
*value = _value;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -25,11 +25,23 @@ extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t*
|
|||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, uint64_t* /*dev_addr*/) {
|
||||
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, int /*flags*/, vx_buffer_h* /*hbuffer*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/) {
|
||||
extern int vx_mem_reserve(vx_device_h /*hdevice*/, uint64_t /*address*/, uint64_t /*size*/, int /*flags*/, vx_buffer_h* /*hbuffer*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Stub: ignores its (unnamed) argument and always reports failure (-1).
extern int vx_mem_free(vx_buffer_h /*hbuffer*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Stub: ignores its (unnamed) arguments and always reports failure (-1).
extern int vx_mem_access(vx_buffer_h /*hbuffer*/, uint64_t /*offset*/, uint64_t /*size*/, int /*flags*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Stub: ignores its (unnamed) arguments, leaves the output untouched and
// always reports failure (-1).
extern int vx_mem_address(vx_buffer_h /*hbuffer*/, uint64_t* /*address*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -37,15 +49,15 @@ extern int vx_mem_info(vx_device_h /*hdevice*/, uint64_t* /*mem_free*/, uint64_t
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_device_h /*hdevice*/, uint64_t /*dev_addr*/, const void* /*host_ptr*/, uint64_t /*size*/) {
|
||||
extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, const void* /*host_ptr*/, uint64_t /*dst_offset*/, uint64_t /*size*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_device_h /*hdevice*/, void* /*host_ptr*/, uint64_t /*dev_addr*/, uint64_t /*size*/) {
|
||||
extern int vx_copy_from_dev(void* /*host_ptr*/, vx_buffer_h /*hbuffer*/, uint64_t /*src_offset*/, uint64_t /*size*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h /*hdevice*/, uint64_t /*krnl_addr*/, uint64_t /*args_add*/) {
|
||||
extern int vx_start(vx_device_h /*hdevice*/, vx_buffer_h /*hkernel*/, vx_buffer_h /*harguments*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -61,3 +73,7 @@ extern int vx_dcr_read(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t* /*v
|
|||
extern int vx_dcr_write(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*value*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Stub: ignores its (unnamed) arguments, leaves the output untouched and
// always reports failure (-1).
extern int vx_mpm_query(vx_device_h /*hdevice*/, uint32_t /*addr*/, uint32_t /*core_id*/, uint64_t* /*value*/) {
|
||||
return -1;
|
||||
}
|
File diff suppressed because it is too large
Load diff
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -20,7 +20,7 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
|
||||
RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
|
||||
: wordSize_(wordSize) {
|
||||
std::ifstream input(filename);
|
||||
|
||||
|
@ -39,19 +39,19 @@ RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize)
|
|||
}
|
||||
|
||||
RamMemDevice::RamMemDevice(uint64_t size, uint32_t wordSize)
|
||||
: contents_(size)
|
||||
: contents_(size)
|
||||
, wordSize_(wordSize)
|
||||
{}
|
||||
|
||||
void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
|
||||
auto addr_end = addr + size;
|
||||
if ((addr & (wordSize_-1))
|
||||
|| (addr_end & (wordSize_-1))
|
||||
|| (addr_end & (wordSize_-1))
|
||||
|| (addr_end <= contents_.size())) {
|
||||
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n";
|
||||
throw BadAddress();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
const uint8_t *s = contents_.data() + addr;
|
||||
for (uint8_t *d = (uint8_t*)data, *de = d + size; d != de;) {
|
||||
*d++ = *s++;
|
||||
|
@ -61,7 +61,7 @@ void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
|
|||
void RamMemDevice::write(const void* data, uint64_t addr, uint64_t size) {
|
||||
auto addr_end = addr + size;
|
||||
if ((addr & (wordSize_-1))
|
||||
|| (addr_end & (wordSize_-1))
|
||||
|| (addr_end & (wordSize_-1))
|
||||
|| (addr_end <= contents_.size())) {
|
||||
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n";
|
||||
throw BadAddress();
|
||||
|
@ -106,7 +106,7 @@ void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) {
|
|||
if (!this->lookup(addr, size, &ma)) {
|
||||
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
|
||||
throw BadAddress();
|
||||
}
|
||||
}
|
||||
ma.md->read(data, ma.addr, size);
|
||||
}
|
||||
|
||||
|
@ -153,7 +153,7 @@ uint64_t MemoryUnit::toPhyAddr(uint64_t addr, uint32_t flagMask) {
|
|||
TLBEntry t = this->tlbLookup(addr, flagMask);
|
||||
pAddr = t.pfn * pageSize_ + addr % pageSize_;
|
||||
} else {
|
||||
pAddr = addr;
|
||||
pAddr = addr;
|
||||
}
|
||||
return pAddr;
|
||||
}
|
||||
|
@ -190,14 +190,90 @@ void MemoryUnit::tlbRm(uint64_t va) {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
RAM::RAM(uint64_t capacity, uint32_t page_size)
|
||||
void ACLManager::set(uint64_t addr, uint64_t size, int flags) {
|
||||
if (size == 0)
|
||||
return;
|
||||
|
||||
uint64_t end = addr + size;
|
||||
|
||||
// get starting interval
|
||||
auto it = acl_map_.lower_bound(addr);
|
||||
if (it != acl_map_.begin() && (--it)->second.end < addr) {
|
||||
++it;
|
||||
}
|
||||
|
||||
// Remove existing entries that overlap or are within the new range
|
||||
while (it != acl_map_.end() && it->first < end) {
|
||||
auto current = it++;
|
||||
uint64_t current_end = current->second.end;
|
||||
if (current_end <= addr)
|
||||
continue; // No overlap, no need to adjust
|
||||
|
||||
// Adjust the current interval or erase it depending on overlap and flags
|
||||
if (current->first < addr) {
|
||||
if (current_end > end) {
|
||||
acl_map_[end] = {current_end, current->second.flags};
|
||||
}
|
||||
current->second.end = addr;
|
||||
} else {
|
||||
if (current_end > end) {
|
||||
acl_map_[end] = {current_end, current->second.flags};
|
||||
}
|
||||
acl_map_.erase(current);
|
||||
}
|
||||
}
|
||||
|
||||
// Insert new range if flags are not zero
|
||||
if (flags != 0) {
|
||||
it = acl_map_.emplace(addr, acl_entry_t{end, flags}).first;
|
||||
// Merge adjacent ranges with the same flags
|
||||
auto prev = it;
|
||||
if (it != acl_map_.begin() && (--prev)->second.end == addr && prev->second.flags == flags) {
|
||||
prev->second.end = it->second.end;
|
||||
acl_map_.erase(it);
|
||||
it = prev;
|
||||
}
|
||||
auto next = std::next(it);
|
||||
if (next != acl_map_.end() && it->second.end == next->first && it->second.flags == next->second.flags) {
|
||||
it->second.end = next->second.end;
|
||||
acl_map_.erase(next);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool ACLManager::check(uint64_t addr, uint64_t size, int flags) const {
|
||||
uint64_t end = addr + size;
|
||||
|
||||
auto it = acl_map_.lower_bound(addr);
|
||||
if (it != acl_map_.begin() && (--it)->second.end < addr) {
|
||||
++it;
|
||||
}
|
||||
|
||||
while (it != acl_map_.end() && it->first < end) {
|
||||
if (it->second.end > addr) {
|
||||
if ((it->second.flags & flags) != flags) {
|
||||
std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", flags=" << (it->second.flags ^ flags) << std::endl;
|
||||
return false; // Overlapping entry is missing at least one required flag bit
|
||||
}
|
||||
addr = it->second.end; // Move to the end of the current matching range
|
||||
}
|
||||
++it;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
RAM::RAM(uint64_t capacity, uint32_t page_size)
|
||||
: capacity_(capacity)
|
||||
, page_bits_(log2ceil(page_size))
|
||||
, last_page_(nullptr)
|
||||
, last_page_index_(0) {
|
||||
, last_page_index_(0)
|
||||
, check_acl_(false) {
|
||||
assert(ispow2(page_size));
|
||||
if (capacity != 0) {
|
||||
assert(ispow2(capacity));
|
||||
assert(ispow2(capacity));
|
||||
assert(page_size <= capacity);
|
||||
assert(0 == (capacity % page_size));
|
||||
}
|
||||
|
@ -221,7 +297,7 @@ uint8_t *RAM::get(uint64_t address) const {
|
|||
if (capacity_ != 0 && address >= capacity_) {
|
||||
throw OutOfRange();
|
||||
}
|
||||
uint32_t page_size = 1 << page_bits_;
|
||||
uint32_t page_size = 1 << page_bits_;
|
||||
uint32_t page_offset = address & (page_size - 1);
|
||||
uint64_t page_index = address >> page_bits_;
|
||||
|
||||
|
@ -249,6 +325,9 @@ uint8_t *RAM::get(uint64_t address) const {
|
|||
}
|
||||
|
||||
void RAM::read(void* data, uint64_t addr, uint64_t size) {
|
||||
if (check_acl_ && acl_mngr_.check(addr, size, 0x1) == false) {
|
||||
throw BadAddress();
|
||||
}
|
||||
uint8_t* d = (uint8_t*)data;
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
d[i] = *this->get(addr + i);
|
||||
|
@ -256,12 +335,22 @@ void RAM::read(void* data, uint64_t addr, uint64_t size) {
|
|||
}
|
||||
|
||||
void RAM::write(const void* data, uint64_t addr, uint64_t size) {
|
||||
if (check_acl_ && acl_mngr_.check(addr, size, 0x2) == false) {
|
||||
throw BadAddress();
|
||||
}
|
||||
const uint8_t* d = (const uint8_t*)data;
|
||||
for (uint64_t i = 0; i < size; i++) {
|
||||
*this->get(addr + i) = d[i];
|
||||
}
|
||||
}
|
||||
|
||||
void RAM::set_acl(uint64_t addr, uint64_t size, int flags) {
|
||||
if (capacity_ != 0 && (addr + size)> capacity_) {
|
||||
throw OutOfRange();
|
||||
}
|
||||
acl_mngr_.set(addr, size, flags);
|
||||
}
|
||||
|
||||
void RAM::loadBinImage(const char* filename, uint64_t destination) {
|
||||
std::ifstream ifs(filename);
|
||||
if (!ifs) {
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -15,6 +15,7 @@
|
|||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
#include <unordered_map>
|
||||
#include <cstdint>
|
||||
|
||||
|
@ -38,7 +39,7 @@ public:
|
|||
RamMemDevice(const char* filename, uint32_t wordSize);
|
||||
~RamMemDevice() {}
|
||||
|
||||
void read(void* data, uint64_t addr, uint64_t size) override;
|
||||
void read(void* data, uint64_t addr, uint64_t size) override;
|
||||
void write(const void* data, uint64_t addr, uint64_t size) override;
|
||||
|
||||
virtual uint64_t size() const {
|
||||
|
@ -55,13 +56,13 @@ protected:
|
|||
class RomMemDevice : public RamMemDevice {
|
||||
public:
|
||||
RomMemDevice(const char *filename, uint32_t wordSize)
|
||||
: RamMemDevice(filename, wordSize)
|
||||
: RamMemDevice(filename, wordSize)
|
||||
{}
|
||||
|
||||
RomMemDevice(uint64_t size, uint32_t wordSize)
|
||||
: RamMemDevice(size, wordSize)
|
||||
: RamMemDevice(size, wordSize)
|
||||
{}
|
||||
|
||||
|
||||
~RomMemDevice();
|
||||
|
||||
void write(const void* data, uint64_t addr, uint64_t size) override;
|
||||
|
@ -71,11 +72,11 @@ public:
|
|||
|
||||
class MemoryUnit {
|
||||
public:
|
||||
|
||||
|
||||
struct PageFault {
|
||||
PageFault(uint64_t a, bool nf)
|
||||
: faultAddr(a)
|
||||
, notFound(nf)
|
||||
, notFound(nf)
|
||||
{}
|
||||
uint64_t faultAddr;
|
||||
bool notFound;
|
||||
|
@ -107,10 +108,10 @@ private:
|
|||
class ADecoder {
|
||||
public:
|
||||
ADecoder() {}
|
||||
|
||||
|
||||
void read(void* data, uint64_t addr, uint64_t size);
|
||||
void write(const void* data, uint64_t addr, uint64_t size);
|
||||
|
||||
|
||||
void map(uint64_t start, uint64_t end, MemDevice &md);
|
||||
|
||||
private:
|
||||
|
@ -119,11 +120,11 @@ private:
|
|||
MemDevice* md;
|
||||
uint64_t addr;
|
||||
};
|
||||
|
||||
|
||||
struct entry_t {
|
||||
MemDevice* md;
|
||||
uint64_t start;
|
||||
uint64_t end;
|
||||
uint64_t end;
|
||||
};
|
||||
|
||||
bool lookup(uint64_t addr, uint32_t wordSize, mem_accessor_t*);
|
||||
|
@ -135,7 +136,7 @@ private:
|
|||
TLBEntry() {}
|
||||
TLBEntry(uint32_t pfn, uint32_t flags)
|
||||
: pfn(pfn)
|
||||
, flags(flags)
|
||||
, flags(flags)
|
||||
{}
|
||||
uint32_t pfn;
|
||||
uint32_t flags;
|
||||
|
@ -147,7 +148,7 @@ private:
|
|||
|
||||
std::unordered_map<uint64_t, TLBEntry> tlb_;
|
||||
uint64_t pageSize_;
|
||||
ADecoder decoder_;
|
||||
ADecoder decoder_;
|
||||
bool enableVM_;
|
||||
|
||||
amo_reservation_t amo_reservation_;
|
||||
|
@ -155,9 +156,28 @@ private:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Tracks access-control flags for address ranges.
// Ranges are kept in an ordered map keyed by start address; each entry stores
// the exclusive end address and the flag bits for that range.
class ACLManager {
public:

  // Assigns `flags` to the range [addr, addr + size).
  void set(uint64_t addr, uint64_t size, int flags);

  // Tests the range [addr, addr + size) against the requested `flags`.
  bool check(uint64_t addr, uint64_t size, int flags) const;

private:

  struct acl_entry_t {
    uint64_t end;   // exclusive end address of the range
    int32_t  flags; // flag bits attached to the range
  };

  // start address -> {end address, flags}
  std::map<uint64_t, acl_entry_t> acl_map_;
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class RAM : public MemDevice {
|
||||
public:
|
||||
|
||||
|
||||
RAM(uint64_t capacity, uint32_t page_size);
|
||||
RAM(uint64_t capacity) : RAM(capacity, capacity) {}
|
||||
~RAM();
|
||||
|
@ -166,7 +186,7 @@ public:
|
|||
|
||||
uint64_t size() const override;
|
||||
|
||||
void read(void* data, uint64_t addr, uint64_t size) override;
|
||||
void read(void* data, uint64_t addr, uint64_t size) override;
|
||||
void write(const void* data, uint64_t addr, uint64_t size) override;
|
||||
|
||||
void loadBinImage(const char* filename, uint64_t destination);
|
||||
|
@ -180,15 +200,23 @@ public:
|
|||
return *this->get(address);
|
||||
}
|
||||
|
||||
void set_acl(uint64_t addr, uint64_t size, int flags);
|
||||
|
||||
void enable_acl(bool enable) {
|
||||
check_acl_ = enable;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
uint8_t *get(uint64_t address) const;
|
||||
|
||||
uint64_t capacity_;
|
||||
uint32_t page_bits_;
|
||||
uint32_t page_bits_;
|
||||
mutable std::unordered_map<uint64_t, uint8_t*> pages_;
|
||||
mutable uint8_t* last_page_;
|
||||
mutable uint64_t last_page_index_;
|
||||
ACLManager acl_mngr_;
|
||||
bool check_acl_;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -61,8 +61,8 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
int main(int argc, char **argv) {
|
||||
int exitcode = 0;
|
||||
|
||||
parse_args(argc, argv);
|
||||
|
||||
parse_args(argc, argv);
|
||||
|
||||
// create memory module
|
||||
vortex::RAM ram(0, RAM_PAGE_SIZE);
|
||||
|
@ -75,14 +75,14 @@ int main(int argc, char **argv) {
|
|||
|
||||
// setup base DCRs
|
||||
const uint64_t startup_addr(STARTUP_ADDR);
|
||||
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
|
||||
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
|
||||
#if (XLEN == 64)
|
||||
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
|
||||
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
|
||||
#endif
|
||||
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
|
||||
processor.dcr_write(VX_DCR_BASE_MPM_CLASS, 0);
|
||||
|
||||
// load program
|
||||
{
|
||||
{
|
||||
std::string program_ext(fileExtension(program));
|
||||
if (program_ext == "bin") {
|
||||
ram.loadBinImage(program, startup_addr);
|
||||
|
@ -96,7 +96,7 @@ int main(int argc, char **argv) {
|
|||
|
||||
// run simulation
|
||||
exitcode = processor.run();
|
||||
|
||||
|
||||
if (riscv_test) {
|
||||
if (1 == exitcode) {
|
||||
std::cout << "Passed" << std::endl;
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -37,7 +37,7 @@
|
|||
#include <list>
|
||||
#include <queue>
|
||||
#include <vector>
|
||||
#include <sstream>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
|
||||
#define RAMULATOR
|
||||
|
@ -84,7 +84,7 @@ using namespace vortex;
|
|||
|
||||
static uint64_t timestamp = 0;
|
||||
|
||||
double sc_time_stamp() {
|
||||
double sc_time_stamp() {
|
||||
return timestamp;
|
||||
}
|
||||
|
||||
|
@ -95,7 +95,7 @@ static uint64_t trace_start_time = TRACE_START_TIME;
|
|||
static uint64_t trace_stop_time = TRACE_STOP_TIME;
|
||||
|
||||
bool sim_trace_enabled() {
|
||||
if (timestamp >= trace_start_time
|
||||
if (timestamp >= trace_start_time
|
||||
&& timestamp < trace_stop_time)
|
||||
return true;
|
||||
return trace_enabled;
|
||||
|
@ -110,7 +110,7 @@ void sim_trace_enable(bool enable) {
|
|||
class Processor::Impl {
|
||||
public:
|
||||
Impl() {
|
||||
// force random values for unitialized signals
|
||||
// force random values for unitialized signals
|
||||
Verilated::randReset(VERILATOR_RESET_VALUE);
|
||||
Verilated::randSeed(50);
|
||||
|
||||
|
@ -132,7 +132,7 @@ public:
|
|||
#endif
|
||||
|
||||
ram_ = nullptr;
|
||||
|
||||
|
||||
// initialize dram simulator
|
||||
ramulator::Config ram_config;
|
||||
ram_config.add("standard", "DDR4");
|
||||
|
@ -147,7 +147,7 @@ public:
|
|||
|
||||
// reset the device
|
||||
this->reset();
|
||||
|
||||
|
||||
// Turn on assertion after reset
|
||||
Verilated::assertOn(true);
|
||||
}
|
||||
|
@ -159,9 +159,9 @@ public:
|
|||
trace_->close();
|
||||
delete trace_;
|
||||
#endif
|
||||
|
||||
|
||||
delete device_;
|
||||
|
||||
|
||||
if (dram_) {
|
||||
dram_->finish();
|
||||
Stats::statlist.printall();
|
||||
|
@ -202,11 +202,11 @@ public:
|
|||
while (device_->busy) {
|
||||
if (get_ebreak()) {
|
||||
exitcode = (int)get_last_wb_value(3);
|
||||
break;
|
||||
break;
|
||||
}
|
||||
this->tick();
|
||||
}
|
||||
|
||||
|
||||
// reset device
|
||||
this->reset();
|
||||
|
||||
|
@ -215,7 +215,7 @@ public:
|
|||
return exitcode;
|
||||
}
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value) {
|
||||
void dcr_write(uint32_t addr, uint32_t value) {
|
||||
device_->dcr_wr_valid = 1;
|
||||
device_->dcr_wr_addr = addr;
|
||||
device_->dcr_wr_data = value;
|
||||
|
@ -232,7 +232,7 @@ private:
|
|||
print_bufs_.clear();
|
||||
|
||||
pending_mem_reqs_.clear();
|
||||
|
||||
|
||||
mem_rd_rsp_active_ = false;
|
||||
mem_wr_rsp_active_ = false;
|
||||
|
||||
|
@ -268,7 +268,7 @@ private:
|
|||
|
||||
device_->clk = 1;
|
||||
this->eval();
|
||||
|
||||
|
||||
#ifdef AXI_BUS
|
||||
this->eval_axi_bus(1);
|
||||
#else
|
||||
|
@ -276,13 +276,13 @@ private:
|
|||
#endif
|
||||
this->eval_dcr_bus(1);
|
||||
|
||||
if (MEM_CYCLE_RATIO > 0) {
|
||||
if (MEM_CYCLE_RATIO > 0) {
|
||||
auto cycle = timestamp / 2;
|
||||
if ((cycle % MEM_CYCLE_RATIO) == 0)
|
||||
dram_->tick();
|
||||
} else {
|
||||
for (int i = MEM_CYCLE_RATIO; i <= 0; ++i)
|
||||
dram_->tick();
|
||||
dram_->tick();
|
||||
}
|
||||
|
||||
if (!dram_queue_.empty()) {
|
||||
|
@ -309,14 +309,14 @@ private:
|
|||
|
||||
#ifdef AXI_BUS
|
||||
|
||||
void reset_axi_bus() {
|
||||
void reset_axi_bus() {
|
||||
device_->m_axi_wready[0] = 0;
|
||||
device_->m_axi_awready[0] = 0;
|
||||
device_->m_axi_arready[0] = 0;
|
||||
device_->m_axi_arready[0] = 0;
|
||||
device_->m_axi_rvalid[0] = 0;
|
||||
device_->m_axi_bvalid[0] = 0;
|
||||
}
|
||||
|
||||
|
||||
void eval_axi_bus(bool clk) {
|
||||
if (!clk) {
|
||||
mem_rd_rsp_ready_ = device_->m_axi_rready[0];
|
||||
|
@ -327,7 +327,7 @@ private:
|
|||
if (ram_ == nullptr) {
|
||||
device_->m_axi_wready[0] = 0;
|
||||
device_->m_axi_awready[0] = 0;
|
||||
device_->m_axi_arready[0] = 0;
|
||||
device_->m_axi_arready[0] = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -335,11 +335,11 @@ private:
|
|||
if (mem_rd_rsp_active_
|
||||
&& device_->m_axi_rvalid[0] && mem_rd_rsp_ready_) {
|
||||
mem_rd_rsp_active_ = false;
|
||||
}
|
||||
if (!mem_rd_rsp_active_) {
|
||||
}
|
||||
if (!mem_rd_rsp_active_) {
|
||||
if (!pending_mem_reqs_.empty()
|
||||
&& (*pending_mem_reqs_.begin())->ready
|
||||
&& !(*pending_mem_reqs_.begin())->write) {
|
||||
&& (*pending_mem_reqs_.begin())->ready
|
||||
&& !(*pending_mem_reqs_.begin())->write) {
|
||||
auto mem_rsp_it = pending_mem_reqs_.begin();
|
||||
auto mem_rsp = *mem_rsp_it;
|
||||
/*
|
||||
|
@ -348,9 +348,9 @@ private:
|
|||
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
|
||||
}
|
||||
printf("\n");
|
||||
*/
|
||||
*/
|
||||
device_->m_axi_rvalid[0] = 1;
|
||||
device_->m_axi_rid[0] = mem_rsp->tag;
|
||||
device_->m_axi_rid[0] = mem_rsp->tag;
|
||||
device_->m_axi_rresp[0] = 0;
|
||||
device_->m_axi_rlast[0] = 1;
|
||||
memcpy(device_->m_axi_rdata[0].data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
|
||||
|
@ -362,46 +362,46 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
// send memory write response
|
||||
// send memory write response
|
||||
if (mem_wr_rsp_active_
|
||||
&& device_->m_axi_bvalid[0] && mem_wr_rsp_ready_) {
|
||||
mem_wr_rsp_active_ = false;
|
||||
}
|
||||
if (!mem_wr_rsp_active_) {
|
||||
if (!pending_mem_reqs_.empty()
|
||||
&& (*pending_mem_reqs_.begin())->ready
|
||||
&& (*pending_mem_reqs_.begin())->ready
|
||||
&& (*pending_mem_reqs_.begin())->write) {
|
||||
auto mem_rsp_it = pending_mem_reqs_.begin();
|
||||
auto mem_rsp = *mem_rsp_it;
|
||||
/*
|
||||
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
|
||||
printf("%0ld: [sim] MEM Wr Rsp: bank=%d, addr=%0lx\n", timestamp, last_mem_rsp_bank_, mem_rsp->addr);
|
||||
*/
|
||||
device_->m_axi_bvalid[0] = 1;
|
||||
device_->m_axi_bvalid[0] = 1;
|
||||
device_->m_axi_bid[0] = mem_rsp->tag;
|
||||
device_->m_axi_bresp[0] = 0;
|
||||
pending_mem_reqs_.erase(mem_rsp_it);
|
||||
pending_mem_reqs_.erase(mem_rsp_it);
|
||||
mem_wr_rsp_active_ = true;
|
||||
delete mem_rsp;
|
||||
} else {
|
||||
device_->m_axi_bvalid[0] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// select the memory bank
|
||||
uint32_t req_addr = device_->m_axi_wvalid[0] ? device_->m_axi_awaddr[0] : device_->m_axi_araddr[0];
|
||||
|
||||
|
||||
// process memory requests
|
||||
if ((device_->m_axi_wvalid[0] || device_->m_axi_arvalid[0]) && running_) {
|
||||
if (device_->m_axi_wvalid[0]) {
|
||||
if (device_->m_axi_wvalid[0]) {
|
||||
uint64_t byteen = device_->m_axi_wstrb[0];
|
||||
uint64_t base_addr = device_->m_axi_awaddr[0];
|
||||
uint8_t* data = (uint8_t*)device_->m_axi_wdata[0].data();
|
||||
|
||||
// check console output
|
||||
if (base_addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
&& base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
auto& ss_buf = print_bufs_[i];
|
||||
char c = data[i];
|
||||
ss_buf << c;
|
||||
|
@ -410,7 +410,7 @@ private:
|
|||
ss_buf.str("");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen);
|
||||
|
@ -420,26 +420,26 @@ private:
|
|||
printf("\n");
|
||||
*/
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
(*ram_)[base_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
auto mem_req = new mem_req_t();
|
||||
mem_req->tag = device_->m_axi_awid[0];
|
||||
mem_req->addr = device_->m_axi_awaddr[0];
|
||||
mem_req->addr = device_->m_axi_awaddr[0];
|
||||
mem_req->write = true;
|
||||
mem_req->ready = true;
|
||||
pending_mem_reqs_.emplace_back(mem_req);
|
||||
|
||||
// send dram request
|
||||
ramulator::Request dram_req(
|
||||
ramulator::Request dram_req(
|
||||
device_->m_axi_awaddr[0],
|
||||
ramulator::Request::Type::WRITE,
|
||||
0
|
||||
);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// process reads
|
||||
auto mem_req = new mem_req_t();
|
||||
|
@ -451,7 +451,7 @@ private:
|
|||
pending_mem_reqs_.emplace_back(mem_req);
|
||||
|
||||
// send dram request
|
||||
ramulator::Request dram_req(
|
||||
ramulator::Request dram_req(
|
||||
device_->m_axi_araddr[0],
|
||||
ramulator::Request::Type::READ,
|
||||
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
|
||||
|
@ -460,12 +460,12 @@ private:
|
|||
0
|
||||
);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
device_->m_axi_wready[0] = running_;
|
||||
device_->m_axi_awready[0] = running_;
|
||||
device_->m_axi_arready[0] = running_;
|
||||
device_->m_axi_arready[0] = running_;
|
||||
}
|
||||
|
||||
#else
|
||||
|
@ -486,7 +486,7 @@ private:
|
|||
return;
|
||||
}
|
||||
|
||||
// process memory responses
|
||||
// process memory responses
|
||||
if (mem_rd_rsp_active_
|
||||
&& device_->mem_rsp_valid && mem_rd_rsp_ready_) {
|
||||
mem_rd_rsp_active_ = false;
|
||||
|
@ -494,7 +494,7 @@ private:
|
|||
if (!mem_rd_rsp_active_) {
|
||||
if (!pending_mem_reqs_.empty()
|
||||
&& (*pending_mem_reqs_.begin())->ready) {
|
||||
device_->mem_rsp_valid = 1;
|
||||
device_->mem_rsp_valid = 1;
|
||||
auto mem_rsp_it = pending_mem_reqs_.begin();
|
||||
auto mem_rsp = *mem_rsp_it;
|
||||
/*
|
||||
|
@ -505,7 +505,7 @@ private:
|
|||
printf("\n");
|
||||
*/
|
||||
memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
|
||||
device_->mem_rsp_tag = mem_rsp->tag;
|
||||
device_->mem_rsp_tag = mem_rsp->tag;
|
||||
pending_mem_reqs_.erase(mem_rsp_it);
|
||||
mem_rd_rsp_active_ = true;
|
||||
delete mem_rsp;
|
||||
|
@ -514,19 +514,19 @@ private:
|
|||
}
|
||||
}
|
||||
|
||||
// process memory requests
|
||||
// process memory requests
|
||||
if (device_->mem_req_valid && running_) {
|
||||
uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
|
||||
if (device_->mem_req_rw) {
|
||||
if (device_->mem_req_rw) {
|
||||
// process writes
|
||||
uint64_t byteen = device_->mem_req_byteen;
|
||||
uint64_t byteen = device_->mem_req_byteen;
|
||||
uint8_t* data = (uint8_t*)(device_->mem_req_data.data());
|
||||
|
||||
// check console output
|
||||
if (byte_addr >= uint64_t(IO_COUT_ADDR)
|
||||
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
|
||||
for (int i = 0; i < IO_COUT_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
auto& ss_buf = print_bufs_[i];
|
||||
char c = data[i];
|
||||
ss_buf << c;
|
||||
|
@ -535,7 +535,7 @@ private:
|
|||
ss_buf.str("");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen);
|
||||
|
@ -545,23 +545,23 @@ private:
|
|||
printf("\n");
|
||||
*/
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
(*ram_)[byte_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
|
||||
// send dram request
|
||||
ramulator::Request dram_req(
|
||||
ramulator::Request dram_req(
|
||||
byte_addr,
|
||||
ramulator::Request::Type::WRITE,
|
||||
0
|
||||
);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// process reads
|
||||
auto mem_req = new mem_req_t();
|
||||
mem_req->tag = device_->mem_req_tag;
|
||||
mem_req->tag = device_->mem_req_tag;
|
||||
mem_req->addr = byte_addr;
|
||||
mem_req->write = false;
|
||||
mem_req->ready = false;
|
||||
|
@ -571,7 +571,7 @@ private:
|
|||
//printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
|
||||
|
||||
// send dram request
|
||||
ramulator::Request dram_req(
|
||||
ramulator::Request dram_req(
|
||||
byte_addr,
|
||||
ramulator::Request::Type::READ,
|
||||
std::bind([&](ramulator::Request& dram_req, mem_req_t* mem_req) {
|
||||
|
@ -581,7 +581,7 @@ private:
|
|||
);
|
||||
dram_queue_.push(dram_req);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
device_->mem_req_ready = running_;
|
||||
}
|
||||
|
@ -625,8 +625,8 @@ private:
|
|||
|
||||
private:
|
||||
|
||||
typedef struct {
|
||||
bool ready;
|
||||
typedef struct {
|
||||
bool ready;
|
||||
std::array<uint8_t, MEM_BLOCK_SIZE> block;
|
||||
uint64_t addr;
|
||||
uint64_t tag;
|
||||
|
@ -663,7 +663,7 @@ private:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Processor::Processor()
|
||||
Processor::Processor()
|
||||
: impl_(new Impl())
|
||||
{}
|
||||
|
||||
|
@ -679,6 +679,6 @@ int Processor::run() {
|
|||
return impl_->run();
|
||||
}
|
||||
|
||||
void Processor::write_dcr(uint32_t addr, uint32_t value) {
|
||||
return impl_->write_dcr(addr, value);
|
||||
void Processor::dcr_write(uint32_t addr, uint32_t value) {
|
||||
return impl_->dcr_write(addr, value);
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -21,7 +21,7 @@ class RAM;
|
|||
|
||||
class Processor {
|
||||
public:
|
||||
|
||||
|
||||
Processor();
|
||||
~Processor();
|
||||
|
||||
|
@ -29,7 +29,7 @@ public:
|
|||
|
||||
int run();
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value);
|
||||
void dcr_write(uint32_t addr, uint32_t value);
|
||||
|
||||
private:
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -92,20 +92,20 @@ int main(int argc, char **argv) {
|
|||
|
||||
// create processor
|
||||
Processor processor(arch);
|
||||
|
||||
|
||||
// attach memory module
|
||||
processor.attach_ram(&ram);
|
||||
processor.attach_ram(&ram);
|
||||
|
||||
// setup base DCRs
|
||||
const uint64_t startup_addr(STARTUP_ADDR);
|
||||
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
|
||||
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR0, startup_addr & 0xffffffff);
|
||||
#if (XLEN == 64)
|
||||
processor.write_dcr(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
|
||||
processor.dcr_write(VX_DCR_BASE_STARTUP_ADDR1, startup_addr >> 32);
|
||||
#endif
|
||||
processor.write_dcr(VX_DCR_BASE_MPM_CLASS, 0);
|
||||
processor.dcr_write(VX_DCR_BASE_MPM_CLASS, 0);
|
||||
|
||||
// load program
|
||||
{
|
||||
{
|
||||
std::string program_ext(fileExtension(program));
|
||||
if (program_ext == "bin") {
|
||||
ram.loadBinImage(program, startup_addr);
|
||||
|
@ -122,11 +122,11 @@ int main(int argc, char **argv) {
|
|||
if (riscv_test) {
|
||||
exitcode = (1 - exitcode);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (exitcode != 0) {
|
||||
std::cout << "*** error: exitcode=" << exitcode << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
return exitcode;
|
||||
}
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -16,7 +16,7 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
ProcessorImpl::ProcessorImpl(const Arch& arch)
|
||||
ProcessorImpl::ProcessorImpl(const Arch& arch)
|
||||
: arch_(arch)
|
||||
, clusters_(arch.num_clusters())
|
||||
{
|
||||
|
@ -36,16 +36,16 @@ ProcessorImpl::ProcessorImpl(const Arch& arch)
|
|||
log2ceil(L2_LINE_SIZE), // W
|
||||
log2ceil(L3_NUM_WAYS), // A
|
||||
log2ceil(L3_NUM_BANKS), // B
|
||||
XLEN, // address bits
|
||||
XLEN, // address bits
|
||||
1, // number of ports
|
||||
uint8_t(arch.num_clusters()), // request size
|
||||
uint8_t(arch.num_clusters()), // request size
|
||||
true, // write-through
|
||||
false, // write response
|
||||
L3_MSHR_SIZE, // mshr size
|
||||
2, // pipeline latency
|
||||
}
|
||||
);
|
||||
|
||||
);
|
||||
|
||||
// connect L3 memory ports
|
||||
l3cache_->MemReqPort.bind(&memsim_->MemReqPort);
|
||||
memsim_->MemRspPort.bind(&l3cache_->MemRspPort);
|
||||
|
@ -86,7 +86,7 @@ void ProcessorImpl::attach_ram(RAM* ram) {
|
|||
int ProcessorImpl::run() {
|
||||
SimPlatform::instance().reset();
|
||||
this->reset();
|
||||
|
||||
|
||||
bool done;
|
||||
int exitcode = 0;
|
||||
do {
|
||||
|
@ -104,16 +104,16 @@ int ProcessorImpl::run() {
|
|||
|
||||
return exitcode;
|
||||
}
|
||||
|
||||
|
||||
void ProcessorImpl::reset() {
|
||||
perf_mem_reads_ = 0;
|
||||
perf_mem_writes_ = 0;
|
||||
perf_mem_latency_ = 0;
|
||||
perf_mem_pending_reads_ = 0;
|
||||
|
||||
|
||||
}
|
||||
|
||||
void ProcessorImpl::write_dcr(uint32_t addr, uint32_t value) {
|
||||
void ProcessorImpl::dcr_write(uint32_t addr, uint32_t value) {
|
||||
dcrs_.write(addr, value);
|
||||
}
|
||||
|
||||
|
@ -128,7 +128,7 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Processor::Processor(const Arch& arch)
|
||||
Processor::Processor(const Arch& arch)
|
||||
: impl_(new ProcessorImpl(arch))
|
||||
{}
|
||||
|
||||
|
@ -144,6 +144,6 @@ int Processor::run() {
|
|||
return impl_->run();
|
||||
}
|
||||
|
||||
void Processor::write_dcr(uint32_t addr, uint32_t value) {
|
||||
return impl_->write_dcr(addr, value);
|
||||
void Processor::dcr_write(uint32_t addr, uint32_t value) {
|
||||
return impl_->dcr_write(addr, value);
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -30,7 +30,7 @@ public:
|
|||
|
||||
int run();
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value);
|
||||
void dcr_write(uint32_t addr, uint32_t value);
|
||||
|
||||
private:
|
||||
ProcessorImpl* impl_;
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -37,12 +37,12 @@ public:
|
|||
|
||||
int run();
|
||||
|
||||
void write_dcr(uint32_t addr, uint32_t value);
|
||||
void dcr_write(uint32_t addr, uint32_t value);
|
||||
|
||||
PerfStats perf_stats() const;
|
||||
|
||||
private:
|
||||
|
||||
|
||||
void reset();
|
||||
|
||||
const Arch& arch_;
|
||||
|
|
|
@ -25,8 +25,10 @@ int test = -1;
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -47,7 +49,7 @@ static void parse_args(int argc, char **argv) {
|
|||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'h':
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
|
@ -61,10 +63,10 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(src_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
@ -80,23 +82,23 @@ int run_memcopy_test(const kernel_arg_t& kernel_arg) {
|
|||
std::vector<uint32_t> h_src(num_points);
|
||||
std::vector<uint32_t> h_dst(num_points);
|
||||
|
||||
// update source buffer
|
||||
// update source buffer
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src[i] = shuffle(i, NONCE);
|
||||
}
|
||||
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
// upload source buffer
|
||||
std::cout << "write source buffer to local memory" << std::endl;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, h_src.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(dst_buffer, h_src.data(), 0, buf_size));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// verify result
|
||||
|
@ -114,11 +116,11 @@ int run_memcopy_test(const kernel_arg_t& kernel_arg) {
|
|||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double elapsed;
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
||||
printf("upload time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
|
||||
printf("download time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Total elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
return errors;
|
||||
|
@ -130,42 +132,42 @@ int run_kernel_test(const kernel_arg_t& kernel_arg) {
|
|||
|
||||
std::vector<uint32_t> h_src(num_points);
|
||||
std::vector<uint32_t> h_dst(num_points);
|
||||
|
||||
// update source buffer
|
||||
|
||||
// update source buffer
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src[i] = shuffle(i, NONCE);
|
||||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// upload source buffer
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// start device
|
||||
std::cout << "start execution" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t4 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
auto t5 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// verify result
|
||||
int errors = 0;
|
||||
|
||||
// verify result
|
||||
int errors = 0;
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto cur = h_dst[i];
|
||||
|
@ -179,13 +181,13 @@ int run_kernel_test(const kernel_arg_t& kernel_arg) {
|
|||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
double elapsed;
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
|
||||
printf("upload time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
|
||||
printf("execute time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t5 - t4).count();
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t5 - t4).count();
|
||||
printf("download time: %lg ms\n", elapsed);
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Total elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
return errors;
|
||||
|
@ -214,8 +216,10 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
|
||||
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.count = num_points;
|
||||
|
||||
|
@ -224,7 +228,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
int errors = 0;
|
||||
|
||||
// run tests
|
||||
// run tests
|
||||
if (0 == test || -1 == test) {
|
||||
std::cout << "run memcopy test" << std::endl;
|
||||
errors = run_memcopy_test(kernel_arg);
|
||||
|
@ -236,16 +240,16 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Test PASSED" << std::endl;
|
||||
|
||||
std::cout << "Test PASSED" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -30,10 +30,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
|
||||
|
@ -41,7 +41,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
|
@ -50,10 +50,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
|
@ -66,7 +66,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static void convolution_cpu(TYPE *O, TYPE *I, TYPE *W, int32_t width, int32_t height) {
|
||||
|
@ -95,8 +95,11 @@ int size = 32;
|
|||
bool use_lmem = false;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h I_buffer = nullptr;
|
||||
vx_buffer_h W_buffer = nullptr;
|
||||
vx_buffer_h O_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -111,7 +114,7 @@ static void parse_args(int argc, char **argv) {
|
|||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'l':
|
||||
case 'l':
|
||||
use_lmem = true;
|
||||
break;
|
||||
case 'k':
|
||||
|
@ -130,26 +133,26 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.I_addr);
|
||||
if (device) {
|
||||
vx_mem_free(I_buffer);
|
||||
if (!use_lmem) {
|
||||
vx_mem_free(device, kernel_arg.W_addr);
|
||||
vx_mem_free(W_buffer);
|
||||
}
|
||||
vx_mem_free(device, kernel_arg.O_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(O_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
|
@ -166,13 +169,16 @@ int main(int argc, char *argv[]) {
|
|||
uint32_t w_points = 3 * 3;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
size_t i_nbytes = i_points * sizeof(TYPE);
|
||||
size_t w_nbytes = w_points * sizeof(TYPE);
|
||||
size_t o_nbytes = o_points * sizeof(TYPE);
|
||||
RT_CHECK(vx_mem_alloc(device, i_nbytes, &kernel_arg.I_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, o_nbytes, &kernel_arg.O_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, w_nbytes, &kernel_arg.W_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, i_nbytes, VX_MEM_READ, &I_buffer));
|
||||
RT_CHECK(vx_mem_address(I_buffer, &kernel_arg.I_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, w_nbytes, VX_MEM_READ, &W_buffer));
|
||||
RT_CHECK(vx_mem_address(W_buffer, &kernel_arg.W_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, o_nbytes, VX_MEM_WRITE, &O_buffer));
|
||||
RT_CHECK(vx_mem_address(O_buffer, &kernel_arg.O_addr));
|
||||
|
||||
if (use_lmem) {
|
||||
uint64_t dev_local_mem_size;
|
||||
|
@ -212,32 +218,32 @@ int main(int argc, char *argv[]) {
|
|||
// upload input buffer
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.I_addr, h_I.data(), i_nbytes));
|
||||
RT_CHECK(vx_copy_to_dev(I_buffer, h_I.data(), 0, i_nbytes));
|
||||
}
|
||||
|
||||
// upload weight buffer
|
||||
{
|
||||
std::cout << "upload weight buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.W_addr, h_W.data(), w_nbytes));
|
||||
RT_CHECK(vx_copy_to_dev(W_buffer, h_W.data(), 0, w_nbytes));
|
||||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
|
@ -245,7 +251,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_O.data(), kernel_arg.O_addr, o_nbytes));
|
||||
RT_CHECK(vx_copy_from_dev(h_O.data(), O_buffer, 0, o_nbytes));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
|
@ -253,7 +259,7 @@ int main(int argc, char *argv[]) {
|
|||
{
|
||||
std::vector<TYPE> h_ref(o_points);
|
||||
convolution_cpu(h_ref.data(), h_I.data(), h_W.data(), size, size);
|
||||
|
||||
|
||||
for (uint32_t i = 0; i < h_ref.size(); ++i) {
|
||||
auto ref = h_ref[i];
|
||||
auto cur = h_O[i];
|
||||
|
@ -264,13 +270,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -28,10 +28,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
|
||||
|
@ -39,7 +39,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
|
@ -50,10 +50,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
|
@ -66,15 +66,18 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const char* kernel_file = "kernel.vxbin";
|
||||
uint32_t count = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src0_buffer = nullptr;
|
||||
vx_buffer_h src1_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -105,24 +108,24 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src0_addr);
|
||||
vx_mem_free(device, kernel_arg.src1_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
if (device) {
|
||||
vx_mem_free(src0_buffer);
|
||||
vx_mem_free(src1_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -130,28 +133,31 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.num_tasks = total_threads;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
|
||||
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
|
||||
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate host buffers
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src0(num_points);
|
||||
std::vector<TYPE> h_src1(num_points);
|
||||
|
@ -165,23 +171,23 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -189,10 +195,10 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto ref = h_src0[i] + h_src1[i];
|
||||
|
@ -203,13 +209,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -22,8 +22,10 @@ const char* kernel_file = "kernel.vxbin";
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -55,10 +57,10 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(src_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
@ -72,7 +74,7 @@ void gen_src_data(std::vector<int>& src_data, uint32_t size) {
|
|||
}
|
||||
}
|
||||
|
||||
void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data, uint32_t size) {
|
||||
void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data, uint32_t size) {
|
||||
ref_data.resize(size);
|
||||
for (int i = 0; i < (int)size; ++i) {
|
||||
int value = src_data.at(i);
|
||||
|
@ -83,7 +85,7 @@ void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data,
|
|||
} else {
|
||||
value += 2;
|
||||
}
|
||||
|
||||
|
||||
// diverge
|
||||
if (i > 1) {
|
||||
if (i > 2) {
|
||||
|
@ -109,8 +111,8 @@ void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data,
|
|||
// loop
|
||||
for (int j = 0, n = i; j < n; ++j) {
|
||||
value += src_data.at(j);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// switch
|
||||
switch (i) {
|
||||
case 0:
|
||||
|
@ -141,7 +143,7 @@ void gen_ref_data(std::vector<int>& ref_data, const std::vector<int>& src_data,
|
|||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
|
@ -152,7 +154,7 @@ int main(int argc, char *argv[]) {
|
|||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -160,8 +162,8 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
|
@ -170,13 +172,15 @@ int main(int argc, char *argv[]) {
|
|||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
|
||||
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<int32_t> h_src;
|
||||
|
@ -184,20 +188,20 @@ int main(int argc, char *argv[]) {
|
|||
gen_src_data(h_src, num_points);
|
||||
|
||||
// upload source buffer
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -205,7 +209,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
|
@ -226,13 +230,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
#include <math.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include <vx_print.h>
|
||||
#include "common.h"
|
||||
|
||||
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
|
||||
|
@ -15,7 +16,7 @@ void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -30,7 +31,7 @@ void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -45,7 +46,7 @@ void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -60,7 +61,7 @@ void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -77,7 +78,7 @@ void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -92,7 +93,7 @@ void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -107,7 +108,7 @@ void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -122,7 +123,7 @@ void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -137,7 +138,7 @@ void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -152,7 +153,7 @@ void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -167,7 +168,7 @@ void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -182,7 +183,7 @@ void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -199,7 +200,7 @@ void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -214,7 +215,7 @@ void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -231,7 +232,7 @@ void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -246,7 +247,7 @@ void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -262,7 +263,7 @@ void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (uint32_t*)arg->dst_addr;
|
||||
auto dst_ptr = (uint32_t*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -278,7 +279,7 @@ void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -294,7 +295,7 @@ void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (int32_t*)arg->src0_addr;
|
||||
auto src1_ptr = (int32_t*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -314,7 +315,7 @@ void kernel_fclamp(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
@ -328,12 +329,12 @@ void kernel_trigo(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto count = arg->task_size;
|
||||
auto src0_ptr = (float*)arg->src0_addr;
|
||||
auto src1_ptr = (float*)arg->src1_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto dst_ptr = (float*)arg->dst_addr;
|
||||
auto offset = task_id * count;
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
auto a = src0_ptr[offset+i];
|
||||
auto b = src1_ptr[offset+i];
|
||||
dst_ptr[offset+i] = sin(a) + cos(b);
|
||||
auto a = sinf(src0_ptr[offset+i]);
|
||||
auto b = cosf(src1_ptr[offset+i]);
|
||||
dst_ptr[offset+i] = a + b;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -356,14 +357,14 @@ void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
for (int i = 0; i <= block_size; ++i) {
|
||||
dst_ptr[i + offset] = src0_ptr[i + offset];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// memory fence
|
||||
vx_fence();
|
||||
|
||||
// local barrier
|
||||
vx_barrier(0, num_warps);
|
||||
|
||||
|
||||
// update destination
|
||||
dst_ptr[task_id] += 1;
|
||||
}
|
||||
|
@ -372,7 +373,7 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
auto num_cores = vx_num_cores();
|
||||
auto num_warps = vx_num_warps();
|
||||
auto num_threads = vx_num_threads();
|
||||
|
||||
|
||||
auto cid = vx_core_id();
|
||||
auto wid = vx_warp_id();
|
||||
auto tid = vx_thread_id();
|
||||
|
@ -385,47 +386,45 @@ void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
for (int i = 0, n = arg->num_tasks; i <= n; ++i) {
|
||||
dst_ptr[i] = src0_ptr[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// memory fence
|
||||
vx_fence();
|
||||
|
||||
// global barrier
|
||||
vx_barrier(0x80000000, num_cores);
|
||||
|
||||
|
||||
// update destination
|
||||
dst_ptr[task_id] += 1;
|
||||
}
|
||||
|
||||
static PFN_Kernel sc_tests[23];
|
||||
void register_tests() {
|
||||
sc_tests[0] = kernel_iadd;
|
||||
sc_tests[1] = kernel_imul;
|
||||
sc_tests[2] = kernel_idiv;
|
||||
sc_tests[3] = kernel_idiv_mul;
|
||||
sc_tests[4] = kernel_fadd;
|
||||
sc_tests[5] = kernel_fsub;
|
||||
sc_tests[6] = kernel_fmul;
|
||||
sc_tests[7] = kernel_fmadd;
|
||||
sc_tests[8] = kernel_fmsub;
|
||||
sc_tests[9] = kernel_fnmadd;
|
||||
sc_tests[10] = kernel_fnmsub;
|
||||
sc_tests[11] = kernel_fnmadd_madd;
|
||||
sc_tests[12] = kernel_fdiv;
|
||||
sc_tests[13] = kernel_fdiv2;
|
||||
sc_tests[14] = kernel_fsqrt;
|
||||
sc_tests[15] = kernel_ftoi;
|
||||
sc_tests[16] = kernel_ftou;
|
||||
sc_tests[17] = kernel_itof;
|
||||
sc_tests[18] = kernel_utof;
|
||||
sc_tests[19] = kernel_fclamp;
|
||||
sc_tests[20] = kernel_trigo;
|
||||
sc_tests[21] = kernel_bar;
|
||||
sc_tests[22] = kernel_gbar;
|
||||
}
|
||||
static const PFN_Kernel sc_tests[] = {
|
||||
/*kernel_iadd,
|
||||
kernel_imul,
|
||||
kernel_idiv,
|
||||
kernel_idiv_mul,
|
||||
kernel_fadd,
|
||||
kernel_fsub,
|
||||
kernel_fmul,
|
||||
kernel_fmadd,
|
||||
kernel_fmsub,
|
||||
kernel_fnmadd,
|
||||
kernel_fnmsub,
|
||||
kernel_fnmadd_madd,
|
||||
kernel_fdiv,
|
||||
kernel_fdiv2,
|
||||
kernel_fsqrt,
|
||||
kernel_ftoi,
|
||||
kernel_ftou,
|
||||
kernel_itof,
|
||||
kernel_utof,
|
||||
kernel_fclamp,*/
|
||||
kernel_trigo,
|
||||
/*kernel_bar,
|
||||
kernel_gbar*/
|
||||
};
|
||||
|
||||
int main() {
|
||||
register_tests();
|
||||
auto arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH);
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
|
||||
return 0;
|
||||
|
|
|
@ -20,8 +20,11 @@ int testid_e = 0;
|
|||
bool stop_on_error = true;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src0_buffer = nullptr;
|
||||
vx_buffer_h src1_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -67,21 +70,21 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
void cleanup() {
|
||||
if (testSuite) {
|
||||
delete testSuite;
|
||||
}
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src0_addr);
|
||||
vx_mem_free(device, kernel_arg.src1_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(src0_buffer);
|
||||
vx_mem_free(src1_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
|
@ -96,7 +99,7 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "using kernel: " << kernel_file << std::endl;
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -107,7 +110,7 @@ int main(int argc, char *argv[]) {
|
|||
int num_tasks = num_cores * num_warps * num_threads;
|
||||
int num_points = count * num_tasks;
|
||||
size_t buf_size = num_points * sizeof(uint32_t);
|
||||
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
|
@ -116,17 +119,20 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
|
||||
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
|
||||
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, sizeof(kernel_arg_t), VX_MEM_READ, &args_buffer));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<uint8_t> src1_buf(buf_size);
|
||||
std::vector<uint8_t> src2_buf(buf_size);
|
||||
std::vector<uint8_t> dst_buf(buf_size);
|
||||
|
@ -138,15 +144,15 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload kernel" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
std::cout << "upload kernel" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// execute tests
|
||||
int errors = 0;
|
||||
for (int t = testid_s; t <= testid_e; ++t) {
|
||||
for (int t = testid_s; t <= testid_e; ++t) {
|
||||
auto test = testSuite->get_test(t);
|
||||
auto name = test->name();
|
||||
|
||||
|
||||
if (!selected.empty()) {
|
||||
if (selected.count(name) == 0)
|
||||
continue;
|
||||
|
@ -162,30 +168,30 @@ int main(int argc, char *argv[]) {
|
|||
// get test arguments
|
||||
std::cout << "get test arguments" << std::endl;
|
||||
RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, src1_buf.data(), buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, src2_buf.data(), buf_size));
|
||||
|
||||
// clear destination buffer
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(src0_buffer, src1_buf.data(), 0, buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(src1_buffer, src2_buf.data(), 0, buf_size));
|
||||
|
||||
// clear destination buffer
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
for (int i = 0; i < num_points; ++i) {
|
||||
((uint32_t*)dst_buf.data())[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(dst_buffer, dst_buf.data(), 0, buf_size));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
kernel_arg.testid = t;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_args_addr, &kernel_arg, sizeof(kernel_arg_t)));
|
||||
RT_CHECK(vx_copy_to_dev(args_buffer, &kernel_arg, 0, sizeof(kernel_arg_t)));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -193,7 +199,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, dst_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(dst_buf.data(), dst_buffer, 0, buf_size));
|
||||
|
||||
// verify destination
|
||||
std::cout << "verify test result" << std::endl;
|
||||
|
@ -201,16 +207,16 @@ int main(int argc, char *argv[]) {
|
|||
if (err != 0) {
|
||||
std::cout << "found " << std::dec << err << " errors!" << std::endl;
|
||||
std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush;
|
||||
errors += err;
|
||||
errors += err;
|
||||
if (stop_on_error)
|
||||
break;
|
||||
} else {
|
||||
std::cout << "Test" << t << "-" << name << " PASSED!" << std::endl << std::flush;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
return errors;
|
||||
|
|
|
@ -17,7 +17,7 @@ void cleanup();
|
|||
exit(-1); \
|
||||
} while (false)
|
||||
|
||||
union Float_t {
|
||||
union Float_t {
|
||||
float f;
|
||||
int i;
|
||||
struct {
|
||||
|
@ -95,8 +95,8 @@ private:
|
|||
|
||||
class ITestCase {
|
||||
public:
|
||||
ITestCase(TestSuite* suite, const char* name)
|
||||
: suite_(suite)
|
||||
ITestCase(TestSuite* suite, const char* name)
|
||||
: suite_(suite)
|
||||
, name_(name)
|
||||
{}
|
||||
|
||||
|
@ -116,7 +116,7 @@ public:
|
|||
|
||||
protected:
|
||||
TestSuite* suite_;
|
||||
const char* const name_;
|
||||
const char* const name_;
|
||||
};
|
||||
|
||||
class Test_IADD : public ITestCase {
|
||||
|
@ -132,14 +132,14 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (int32_t*)src1;
|
||||
auto b = (int32_t*)src2;
|
||||
auto c = (int32_t*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto ref = a[i] + b[i];
|
||||
auto ref = a[i] + b[i];
|
||||
if (c[i] != ref) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -162,14 +162,14 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (int32_t*)src1;
|
||||
auto b = (int32_t*)src2;
|
||||
auto c = (int32_t*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto ref = a[i] * b[i];
|
||||
auto ref = a[i] * b[i];
|
||||
if (c[i] != ref) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -192,14 +192,14 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (int32_t*)src1;
|
||||
auto b = (int32_t*)src2;
|
||||
auto c = (int32_t*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto ref = a[i] / b[i];
|
||||
auto ref = a[i] / b[i];
|
||||
if (c[i] != ref) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -222,16 +222,16 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (int32_t*)src1;
|
||||
auto b = (int32_t*)src2;
|
||||
auto c = (int32_t*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto x = a[i] / b[i];
|
||||
auto y = a[i] * b[i];
|
||||
auto ref = x + y;
|
||||
auto x = a[i] / b[i];
|
||||
auto y = a[i] * b[i];
|
||||
auto ref = x + y;
|
||||
if (c[i] != ref) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -254,14 +254,14 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto ref = a[i] + b[i];
|
||||
auto ref = a[i] + b[i];
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -284,14 +284,14 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto ref = a[i] - b[i];
|
||||
auto ref = a[i] - b[i];
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -314,14 +314,14 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto ref = a[i] * b[i];
|
||||
auto ref = a[i] * b[i];
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -344,7 +344,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -374,7 +374,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -404,7 +404,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -434,7 +434,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -464,7 +464,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -496,7 +496,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -526,7 +526,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -559,7 +559,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -586,11 +586,11 @@ public:
|
|||
for (uint32_t i = 0; i < n; ++i) {
|
||||
float q = fround(float(n/2) - i + (float(i) / n));
|
||||
a[i] = q;
|
||||
b[i] = q;
|
||||
b[i] = q;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -622,7 +622,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -653,7 +653,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (int32_t*)src1;
|
||||
|
@ -684,7 +684,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (uint32_t*)src1;
|
||||
|
@ -715,7 +715,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
|
@ -740,19 +740,19 @@ public:
|
|||
auto a = (float*)src1;
|
||||
auto b = (float*)src2;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
a[i] = fround((2*i-n) * (1.0f/n) * 3.1416);
|
||||
b[i] = fround((2*i-n) * (1.0f/n) * 3.1416);
|
||||
a[i] = fround(int(2*i-n) * (1.0f/n) * 3.1416);
|
||||
b[i] = fround(int(2*i-n) * (1.0f/n) * 3.1416);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
|
||||
int errors = 0;
|
||||
auto a = (float*)src1;
|
||||
auto b = (float*)src2;
|
||||
auto c = (float*)dst;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
auto ref = sin(a[i]) + cos(b[i]);
|
||||
auto ref = sinf(a[i]) + cosf(b[i]);
|
||||
if (!almost_equal(c[i], ref)) {
|
||||
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
|
||||
++errors;
|
||||
|
@ -766,20 +766,20 @@ class Test_BAR : public ITestCase {
|
|||
public:
|
||||
Test_BAR(TestSuite* suite) : ITestCase(suite, "bar") {}
|
||||
|
||||
int setup(uint32_t n, void* src1, void* /*src2*/) override {
|
||||
int setup(uint32_t n, void* src1, void* /*src2*/) override {
|
||||
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
|
||||
if (num_warps_ == 1) {
|
||||
std::cout << "Error: multiple warps configuration required!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
|
||||
auto a = (uint32_t*)src1;
|
||||
auto a = (uint32_t*)src1;
|
||||
for (uint32_t i = 0; i < n; ++i) {
|
||||
a[i] = i;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
|
||||
int errors = 0;
|
||||
auto a = (uint32_t*)src1;
|
||||
|
@ -816,7 +816,7 @@ public:
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
|
||||
int errors = 0;
|
||||
auto a = (uint32_t*)src1;
|
||||
|
@ -832,15 +832,15 @@ public:
|
|||
}
|
||||
|
||||
uint64_t num_cores_;
|
||||
uint64_t num_warps_;
|
||||
uint64_t num_warps_;
|
||||
uint64_t num_threads_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
TestSuite::TestSuite(vx_device_h device)
|
||||
TestSuite::TestSuite(vx_device_h device)
|
||||
: device_(device) {
|
||||
this->add_test(new Test_IADD(this));
|
||||
/*this->add_test(new Test_IADD(this));
|
||||
this->add_test(new Test_IMUL(this));
|
||||
this->add_test(new Test_IDIV(this));
|
||||
this->add_test(new Test_IDIV_MUL(this));
|
||||
|
@ -859,10 +859,10 @@ TestSuite::TestSuite(vx_device_h device)
|
|||
this->add_test(new Test_FTOU(this));
|
||||
this->add_test(new Test_ITOF(this));
|
||||
this->add_test(new Test_UTOF(this));
|
||||
this->add_test(new Test_FCLAMP(this));
|
||||
this->add_test(new Test_FCLAMP(this));*/
|
||||
this->add_test(new Test_TRIGO(this));
|
||||
this->add_test(new Test_BAR(this));
|
||||
this->add_test(new Test_GBAR(this));
|
||||
/*this->add_test(new Test_BAR(this));
|
||||
this->add_test(new Test_GBAR(this));*/
|
||||
}
|
||||
|
||||
TestSuite::~TestSuite() {
|
||||
|
|
|
@ -21,8 +21,11 @@ const char* kernel_file = "kernel.vxbin";
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src0_buffer = nullptr;
|
||||
vx_buffer_h src1_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -54,16 +57,16 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src0_addr);
|
||||
vx_mem_free(device, kernel_arg.src1_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(src0_buffer);
|
||||
vx_mem_free(src1_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
|
@ -72,7 +75,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -80,27 +83,30 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.num_tasks = total_threads;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
|
||||
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
|
||||
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate host buffers
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<int32_t> h_src0(num_points);
|
||||
std::vector<int32_t> h_src1(num_points);
|
||||
|
@ -110,27 +116,27 @@ int main(int argc, char *argv[]) {
|
|||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src0[i] = i-1;
|
||||
h_src1[i] = i+1;
|
||||
}
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -138,13 +144,13 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i + i;
|
||||
int ref = i + i;
|
||||
int cur = h_dst[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
|
@ -154,13 +160,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -23,12 +23,16 @@
|
|||
const char* kernel_file = "kernel.vxbin";
|
||||
uint32_t count = 0;
|
||||
|
||||
static uint64_t io_base_addr = IO_CSR_ADDR + IO_CSR_SIZE;
|
||||
static uint64_t io_base_addr = IO_MPM_ADDR + IO_CSR_SIZE;
|
||||
uint64_t usr_test_addr;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t usr_test_mem;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h usr_test_buffer = nullptr;
|
||||
vx_buffer_h io_test_buffer = nullptr;
|
||||
vx_buffer_h src_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -60,11 +64,12 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(device, usr_test_mem);
|
||||
vx_mem_free(usr_test_buffer);
|
||||
vx_mem_free(io_test_buffer);
|
||||
vx_mem_free(src_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
@ -73,12 +78,12 @@ void gen_src_addrs(std::vector<uint64_t>& src_addrs, uint32_t size) {
|
|||
src_addrs.resize(size);
|
||||
uint32_t u = 0, k = 0;
|
||||
for (uint32_t i = 0; i < size; ++i) {
|
||||
if (0 ==(i % 4)) {
|
||||
if (0 ==(i % 4)) {
|
||||
k = (i + u) % NUM_ADDRS;
|
||||
++u;
|
||||
}
|
||||
uint32_t j = i % NUM_ADDRS;
|
||||
uint64_t a = ((j == k) ? usr_test_mem : io_base_addr) + j * sizeof(uint32_t);
|
||||
uint32_t j = i % NUM_ADDRS;
|
||||
uint64_t a = ((j == k) ? usr_test_addr : io_base_addr) + j * sizeof(uint32_t);
|
||||
std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << a << std::endl;
|
||||
src_addrs[i] = a;
|
||||
}
|
||||
|
@ -103,7 +108,7 @@ int main(int argc, char *argv[]) {
|
|||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -111,61 +116,65 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
|
||||
uint32_t src_buf_size = NUM_ADDRS * sizeof(int32_t);
|
||||
uint32_t addr_buf_size = num_points * sizeof(uint64_t);
|
||||
uint32_t addr_buf_size = NUM_ADDRS * sizeof(int32_t);
|
||||
uint32_t src_buf_size = num_points * sizeof(uint64_t);
|
||||
uint32_t dst_buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << std::dec << num_points << std::endl;
|
||||
std::cout << "usr buffer size: " << src_buf_size << " bytes" << std::endl;
|
||||
std::cout << "addr buffer size: " << addr_buf_size << " bytes" << std::endl;
|
||||
std::cout << "src buffer size: " << src_buf_size << " bytes" << std::endl;
|
||||
std::cout << "dst buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &usr_test_mem));
|
||||
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_READ, &usr_test_buffer));
|
||||
RT_CHECK(vx_mem_address(usr_test_buffer, &usr_test_addr));
|
||||
RT_CHECK(vx_mem_reserve(device, io_base_addr, addr_buf_size, VX_MEM_READ, &io_test_buffer));
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_READ, &src_buffer));
|
||||
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<uint64_t> h_addr;
|
||||
std::vector<uint32_t> h_src(NUM_ADDRS);
|
||||
std::vector<uint64_t> h_src;
|
||||
std::vector<uint32_t> h_addr(NUM_ADDRS);
|
||||
std::vector<int32_t> h_dst(num_points);
|
||||
|
||||
// generate source data
|
||||
gen_src_addrs(h_addr, num_points);
|
||||
gen_src_addrs(h_src, num_points);
|
||||
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
|
||||
h_src[i] = i * i;
|
||||
h_addr[i] = i * i;
|
||||
}
|
||||
|
||||
|
||||
// upload user address data
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, h_src.data(), src_buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(device, io_base_addr, h_src.data(), src_buf_size));
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(usr_test_buffer, h_addr.data(), 0, addr_buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(io_test_buffer, h_addr.data(), 0, addr_buf_size));
|
||||
|
||||
// upload source buffer
|
||||
std::cout << "upload address buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_addr.data(), addr_buf_size));
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, src_buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -173,7 +182,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
|
@ -194,13 +203,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -28,10 +28,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
|
||||
|
@ -39,7 +39,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
|
@ -50,10 +50,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
|
@ -66,15 +66,18 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const char* kernel_file = "kernel.vxbin";
|
||||
uint32_t count = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src0_buffer = nullptr;
|
||||
vx_buffer_h src1_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -105,24 +108,24 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src0_addr);
|
||||
vx_mem_free(device, kernel_arg.src1_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
if (device) {
|
||||
vx_mem_free(src0_buffer);
|
||||
vx_mem_free(src1_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -130,27 +133,30 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.num_tasks = total_threads;
|
||||
kernel_arg.task_size = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
|
||||
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
|
||||
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src0(num_points);
|
||||
|
@ -165,23 +171,23 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -189,7 +195,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
|
@ -203,13 +209,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
union Float_t {
|
||||
union Float_t {
|
||||
float f;
|
||||
int i;
|
||||
struct {
|
||||
|
@ -69,8 +69,11 @@ const char* kernel_file = "kernel.vxbin";
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src0_buffer = nullptr;
|
||||
vx_buffer_h src1_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -102,22 +105,22 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src0_addr);
|
||||
vx_mem_free(device, kernel_arg.src1_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(src0_buffer);
|
||||
vx_mem_free(src1_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
void gen_src_data(std::vector<float>& test_data,
|
||||
void gen_src_data(std::vector<float>& test_data,
|
||||
std::vector<uint32_t>& addr_table,
|
||||
uint32_t num_points,
|
||||
uint32_t num_addrs) {
|
||||
test_data.resize(num_points);
|
||||
addr_table.resize(num_addrs);
|
||||
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
float r = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
test_data[i] = r;
|
||||
|
@ -131,7 +134,7 @@ void gen_src_data(std::vector<float>& test_data,
|
|||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
|
@ -142,7 +145,7 @@ int main(int argc, char *argv[]) {
|
|||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -150,12 +153,12 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
uint32_t num_addrs = num_points + NUM_LOADS - 1;
|
||||
|
||||
uint32_t addr_buf_size = num_addrs * sizeof(int32_t);
|
||||
uint32_t src_buf_size = num_points * sizeof(int32_t);
|
||||
uint32_t src_buf_size = num_points * sizeof(int32_t);
|
||||
uint32_t dst_buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
|
@ -163,45 +166,48 @@ int main(int argc, char *argv[]) {
|
|||
std::cout << "src buffer size: " << src_buf_size << " bytes" << std::endl;
|
||||
std::cout << "dst buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.num_tasks = total_threads;
|
||||
kernel_arg.stride = count;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_READ, &src0_buffer));
|
||||
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_READ, &src1_buffer));
|
||||
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<uint32_t> h_addr;
|
||||
std::vector<float> h_src;
|
||||
std::vector<float> h_dst(num_points);
|
||||
gen_src_data(h_src, h_addr, num_points, num_addrs);
|
||||
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload address buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_addr.data(), addr_buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(src0_buffer, h_addr.data(), 0, addr_buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src.data(), src_buf_size));
|
||||
|
||||
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src.data(), 0, src_buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -209,7 +215,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, dst_buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, dst_buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
|
@ -223,7 +229,7 @@ int main(int argc, char *argv[]) {
|
|||
//printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value);
|
||||
ref *= value;
|
||||
}
|
||||
|
||||
|
||||
float cur = h_dst[i];
|
||||
if (!almost_equal(cur, ref)) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
|
@ -233,13 +239,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -21,8 +21,9 @@ const char* kernel_file = "kernel.vxbin";
|
|||
uint32_t count = 4;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -54,14 +55,14 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(src_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
|
@ -70,7 +71,7 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -78,8 +79,8 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
uint32_t buf_size = num_points * sizeof(char);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
|
@ -89,41 +90,42 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
|
||||
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<char> h_src(num_points);
|
||||
|
||||
|
||||
// generate input data
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_src[i] = (char)i;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -26,7 +26,7 @@ void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|||
for (int e = 0; e < size; ++e) {
|
||||
sum += A[row * size + e] * B[e * size + col];
|
||||
}
|
||||
|
||||
|
||||
C[row * size + col] = sum;
|
||||
}
|
||||
|
||||
|
|
|
@ -30,10 +30,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
|
||||
|
@ -41,7 +41,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
|
@ -50,10 +50,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
|
@ -66,7 +66,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
|
||||
|
@ -85,8 +85,11 @@ const char* kernel_file = "kernel.vxbin";
|
|||
uint32_t size = 32;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h A_buffer = nullptr;
|
||||
vx_buffer_h B_buffer = nullptr;
|
||||
vx_buffer_h C_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -117,24 +120,24 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.A_addr);
|
||||
vx_mem_free(device, kernel_arg.B_addr);
|
||||
vx_mem_free(device, kernel_arg.C_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
if (device) {
|
||||
vx_mem_free(A_buffer);
|
||||
vx_mem_free(B_buffer);
|
||||
vx_mem_free(C_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
|
@ -149,9 +152,12 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.A_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.B_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.C_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &A_buffer));
|
||||
RT_CHECK(vx_mem_address(A_buffer, &kernel_arg.A_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &B_buffer));
|
||||
RT_CHECK(vx_mem_address(B_buffer, &kernel_arg.B_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &C_buffer));
|
||||
RT_CHECK(vx_mem_address(C_buffer, &kernel_arg.C_addr));
|
||||
|
||||
std::cout << "dev_argA=0x" << std::hex << kernel_arg.A_addr << std::endl;
|
||||
std::cout << "dev_argB=0x" << std::hex << kernel_arg.B_addr << std::endl;
|
||||
|
@ -171,32 +177,32 @@ int main(int argc, char *argv[]) {
|
|||
// upload matrix A buffer
|
||||
{
|
||||
std::cout << "upload matrix A buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, h_A.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(A_buffer, h_A.data(), 0, buf_size));
|
||||
}
|
||||
|
||||
// upload matrix B buffer
|
||||
{
|
||||
std::cout << "upload matrix B buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, h_B.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(B_buffer, h_B.data(), 0, buf_size));
|
||||
}
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
|
@ -204,7 +210,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_C.data(), kernel_arg.C_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_C.data(), C_buffer, 0, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
|
@ -212,7 +218,7 @@ int main(int argc, char *argv[]) {
|
|||
{
|
||||
std::vector<TYPE> h_ref(num_points);
|
||||
matmul_cpu(h_ref.data(), h_A.data(), h_B.data(), size, size);
|
||||
|
||||
|
||||
for (uint32_t i = 0; i < h_ref.size(); ++i) {
|
||||
if (!Comparator<TYPE>::compare(h_C[i], h_ref[i], i, errors)) {
|
||||
++errors;
|
||||
|
@ -221,13 +227,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -21,8 +21,10 @@ const char* kernel_file = "kernel.vxbin";
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -54,10 +56,10 @@ static void parse_args(int argc, char **argv) {
|
|||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
vx_mem_free(src_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
@ -69,7 +71,7 @@ void gen_src_data(std::vector<TYPE>& src_data, uint32_t size) {
|
|||
auto value = static_cast<TYPE>(r * size);
|
||||
src_data[i] = value;
|
||||
std::cout << std::dec << i << ": value=" << value << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void gen_ref_data(std::vector<TYPE>& ref_data, const std::vector<TYPE>& src_data, uint32_t size) {
|
||||
|
@ -85,7 +87,7 @@ void gen_ref_data(std::vector<TYPE>& ref_data, const std::vector<TYPE>& src_data
|
|||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
|
@ -96,7 +98,7 @@ int main(int argc, char *argv[]) {
|
|||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint64_t num_cores, num_warps, num_threads;
|
||||
|
@ -104,44 +106,46 @@ int main(int argc, char *argv[]) {
|
|||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
|
||||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t total_threads = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * total_threads;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src_buffer));
|
||||
RT_CHECK(vx_mem_address(src_buffer, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate host buffers
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src;
|
||||
std::vector<TYPE> h_dst(num_points);
|
||||
gen_src_data(h_src, num_points);
|
||||
|
||||
|
||||
// upload source buffer
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, h_src.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(src_buffer, h_src.data(), 0, buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -149,10 +153,10 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
std::cout << "verify result" << std::endl;
|
||||
int errors = 0;
|
||||
{
|
||||
std::vector<TYPE> h_ref;
|
||||
|
@ -170,13 +174,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return errors;
|
||||
return errors;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
|
@ -28,10 +28,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, b, a);
|
||||
|
@ -39,7 +39,7 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
|
@ -50,10 +50,10 @@ public:
|
|||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
|
@ -66,15 +66,18 @@ public:
|
|||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
const char* kernel_file = "kernel.vxbin";
|
||||
uint32_t size = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
uint64_t kernel_prog_addr;
|
||||
uint64_t kernel_args_addr;
|
||||
vx_buffer_h src0_buffer = nullptr;
|
||||
vx_buffer_h src1_buffer = nullptr;
|
||||
vx_buffer_h dst_buffer = nullptr;
|
||||
vx_buffer_h krnl_buffer = nullptr;
|
||||
vx_buffer_h args_buffer = nullptr;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
|
@ -105,30 +108,30 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src0_addr);
|
||||
vx_mem_free(device, kernel_arg.src1_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
vx_mem_free(device, kernel_prog_addr);
|
||||
vx_mem_free(device, kernel_args_addr);
|
||||
if (device) {
|
||||
vx_mem_free(src0_buffer);
|
||||
vx_mem_free(src1_buffer);
|
||||
vx_mem_free(dst_buffer);
|
||||
vx_mem_free(krnl_buffer);
|
||||
vx_mem_free(args_buffer);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = size;
|
||||
uint32_t num_points = size;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
|
@ -136,14 +139,17 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, &kernel_arg.dst_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src0_buffer));
|
||||
RT_CHECK(vx_mem_address(src0_buffer, &kernel_arg.src0_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_READ, &src1_buffer));
|
||||
RT_CHECK(vx_mem_address(src1_buffer, &kernel_arg.src1_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_WRITE, &dst_buffer));
|
||||
RT_CHECK(vx_mem_address(dst_buffer, &kernel_arg.dst_addr));
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
|
||||
// allocate host buffers
|
||||
std::cout << "allocate host buffers" << std::endl;
|
||||
std::vector<TYPE> h_src0(num_points);
|
||||
|
@ -157,23 +163,23 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload source buffer0
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, h_src0.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(src0_buffer, h_src0.data(), 0, buf_size));
|
||||
|
||||
// upload source buffer1
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, h_src1.data(), buf_size));
|
||||
RT_CHECK(vx_copy_to_dev(src1_buffer, h_src1.data(), 0, buf_size));
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &kernel_prog_addr));
|
||||
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file, &krnl_buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &kernel_args_addr));
|
||||
|
||||
RT_CHECK(vx_upload_bytes(device, &kernel_arg, sizeof(kernel_arg_t), &args_buffer));
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device, kernel_prog_addr, kernel_args_addr));
|
||||
RT_CHECK(vx_start(device, krnl_buffer, args_buffer));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
|
@ -181,7 +187,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, h_dst.data(), kernel_arg.dst_addr, buf_size));
|
||||
RT_CHECK(vx_copy_from_dev(h_dst.data(), dst_buffer, 0, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
|
@ -195,13 +201,13 @@ int main(int argc, char *argv[]) {
|
|||
}
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue