mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Merge branch 'master' into graphics
This commit is contained in:
commit
d42171d2ed
120 changed files with 4269 additions and 2329 deletions
|
@ -18,10 +18,10 @@ install:
|
|||
- export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
|
||||
- export VERILATOR_ROOT=/opt/verilator
|
||||
- export PATH=$VERILATOR_ROOT/bin:$PATH
|
||||
- make -s
|
||||
|
||||
|
||||
script:
|
||||
- ./ci/regression.sh
|
||||
- ./ci/test_compiler.sh
|
||||
|
||||
after_success:
|
||||
# Gather code coverage
|
||||
|
|
|
@ -233,7 +233,7 @@ free(allPlatforms);*/
|
|||
//--cambine-4: Create an OpenCL command queue
|
||||
oclHandles.queue = clCreateCommandQueue(
|
||||
oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL);
|
||||
printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
|
||||
//printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
|
||||
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
|
||||
|
@ -383,8 +383,8 @@ void _clRelease() {
|
|||
errorFlag = true;
|
||||
}
|
||||
oclHandles.kernel[nKernel] = NULL;
|
||||
printf("clReleaseKernel()\n");
|
||||
}
|
||||
oclHandles.kernel.clear();
|
||||
}
|
||||
|
||||
if (oclHandles.program != NULL) {
|
||||
|
@ -394,6 +394,7 @@ void _clRelease() {
|
|||
errorFlag = true;
|
||||
}
|
||||
oclHandles.program = NULL;
|
||||
printf("clReleaseProgram()\n");
|
||||
}
|
||||
|
||||
if (oclHandles.queue != NULL) {
|
||||
|
@ -403,10 +404,9 @@ void _clRelease() {
|
|||
errorFlag = true;
|
||||
}
|
||||
oclHandles.queue = NULL;
|
||||
printf("clReleaseCommandQueue()\n");
|
||||
}
|
||||
|
||||
free(oclHandles.devices);
|
||||
|
||||
if (oclHandles.context != NULL) {
|
||||
cl_int resultCL = clReleaseContext(oclHandles.context);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
|
@ -414,6 +414,17 @@ void _clRelease() {
|
|||
errorFlag = true;
|
||||
}
|
||||
oclHandles.context = NULL;
|
||||
printf("clReleaseContext()\n");
|
||||
}
|
||||
|
||||
if (oclHandles.devices != NULL) {
|
||||
cl_int resultCL = clReleaseDevice(oclHandles.devices[0]);
|
||||
if (resultCL != CL_SUCCESS) {
|
||||
cerr << "ReleaseCL()::Error: In clReleaseDevice" << endl;
|
||||
errorFlag = true;
|
||||
}
|
||||
free(oclHandles.devices);
|
||||
printf("clReleaseDevice()\n");
|
||||
}
|
||||
|
||||
if (errorFlag)
|
||||
|
@ -675,7 +686,7 @@ void _clFinish() throw(string) {
|
|||
void _clInvokeKernel(int kernel_id, int work_items,
|
||||
int work_group_size) throw(string) {
|
||||
cl_uint work_dim = WORK_DIM;
|
||||
cl_event e[1];
|
||||
//cl_event e[1];
|
||||
if (work_items % work_group_size != 0) // process situations that work_items
|
||||
// cannot be divided by work_group_size
|
||||
work_items =
|
||||
|
@ -684,7 +695,7 @@ void _clInvokeKernel(int kernel_id, int work_items,
|
|||
size_t global_work_size[] = {work_items, 1};
|
||||
oclHandles.cl_status = clEnqueueNDRangeKernel(
|
||||
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
|
||||
global_work_size, local_work_size, 0, 0, &(e[0]));
|
||||
global_work_size, local_work_size, 0, 0, NULL);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||
switch (oclHandles.cl_status) {
|
||||
|
@ -749,13 +760,13 @@ void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x,
|
|||
cl_uint work_dim = WORK_DIM;
|
||||
size_t local_work_size[] = {group_x, group_y};
|
||||
size_t global_work_size[] = {range_x, range_y};
|
||||
cl_event e[1];
|
||||
//cl_event e[1];
|
||||
/*if(work_items%work_group_size != 0) //process situations that work_items
|
||||
cannot be divided by work_group_size
|
||||
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
|
||||
oclHandles.cl_status = clEnqueueNDRangeKernel(
|
||||
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
|
||||
global_work_size, local_work_size, 0, 0, &(e[0]));
|
||||
global_work_size, local_work_size, 0, 0, NULL);
|
||||
#ifdef ERRMSG
|
||||
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
|
||||
switch (oclHandles.cl_status) {
|
||||
|
|
|
@ -78,14 +78,15 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
|||
char h_over;
|
||||
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
|
||||
d_graph_visited, d_cost, d_over;
|
||||
|
||||
try {
|
||||
//--1 transfer data from host to device
|
||||
_clInit();
|
||||
|
||||
d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
|
||||
d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
|
||||
d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
|
||||
d_updating_graph_mask =
|
||||
_clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
|
||||
d_updating_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
|
||||
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
|
||||
|
||||
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
|
||||
|
@ -94,8 +95,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
|||
_clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
|
||||
_clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
|
||||
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
|
||||
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
|
||||
h_updating_graph_mask);
|
||||
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char), h_updating_graph_mask);
|
||||
_clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
|
||||
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
|
||||
|
||||
|
@ -106,6 +106,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
|||
kernel_timer.reset();
|
||||
kernel_timer.start();
|
||||
#endif
|
||||
|
||||
do {
|
||||
h_over = false;
|
||||
_clMemcpyH2D(d_over, sizeof(char), &h_over);
|
||||
|
@ -136,9 +137,8 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
|
|||
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
|
||||
|
||||
_clMemcpyD2H(d_over, sizeof(char), &h_over);
|
||||
} while (h_over);
|
||||
} while (h_over);
|
||||
|
||||
_clFinish();
|
||||
#ifdef PROFILING
|
||||
kernel_timer.stop();
|
||||
kernel_time = kernel_timer.getTimeInSeconds();
|
||||
|
|
|
@ -60,10 +60,10 @@ void compare_results(const datatype *cpu_results, const datatype *gpu_results, c
|
|||
}
|
||||
}
|
||||
if (passed){
|
||||
std::cout << "--cambine:passed:-)" << endl;
|
||||
std::cout << "--cambine: passed: -)" << endl;
|
||||
}
|
||||
else{
|
||||
std::cout << "--cambine: failed:-(" << endl;
|
||||
std::cout << "--cambine: failed :-(" << endl;
|
||||
}
|
||||
return ;
|
||||
}
|
||||
|
|
|
@ -69,7 +69,7 @@ static cl_uint numPlatforms;
|
|||
|
||||
//! All discoverable OpenCL devices (one pointer per platform)
|
||||
static cl_device_id* devices = NULL;
|
||||
static cl_uint* numDevices;
|
||||
static cl_uint* numDevices = NULL;
|
||||
|
||||
//! The chosen OpenCL platform
|
||||
static cl_platform_id platform = NULL;
|
||||
|
@ -88,7 +88,6 @@ static cl_command_queue commandQueueNoProf = NULL;
|
|||
//! Global status of events
|
||||
static bool eventsEnabled = false;
|
||||
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Initialization and Cleanup
|
||||
//-------------------------------------------------------
|
||||
|
@ -239,14 +238,34 @@ static bool eventsEnabled = false;
|
|||
return context;
|
||||
}*/
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return -1;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return -1;
|
||||
}
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
cl_context cl_init_context(int platform, int dev,int quiet) {
|
||||
int printInfo=1;
|
||||
if (platform >= 0 && dev >= 0) printInfo = 0;
|
||||
cl_int status;
|
||||
// Used to iterate through the platforms and devices, respectively
|
||||
cl_uint numPlatforms;
|
||||
cl_uint numDevices;
|
||||
|
||||
|
||||
// These will hold the platform and device we select (can potentially be
|
||||
// multiple, but we're just doing one for now)
|
||||
// cl_platform_id platform = NULL;
|
||||
|
@ -376,23 +395,24 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
// Getting platform and device information
|
||||
|
||||
numPlatforms = 1;
|
||||
numDevices = 1;
|
||||
int platform_touse = 0;
|
||||
int device_touse = 0;
|
||||
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
|
||||
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices);
|
||||
|
||||
status = clGetPlatformIDs(1, platforms, NULL);
|
||||
numDevices = (cl_uint*)malloc(sizeof(cl_uint)*numPlatforms);
|
||||
numDevices[0] = 1;
|
||||
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices[0]);
|
||||
|
||||
int platform_touse = 0;
|
||||
int device_touse = 0;
|
||||
|
||||
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1, devices, NULL);
|
||||
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, numDevices[0], devices, NULL);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
|
||||
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
|
||||
device=devices[device_touse];
|
||||
|
||||
#define PROFILING
|
||||
|
||||
#ifdef PROFILING
|
||||
|
||||
commandQueue = clCreateCommandQueue(context,
|
||||
|
@ -400,7 +420,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
|
||||
#else
|
||||
|
||||
clCommandQueue = clCreateCommandQueue(clGPUContext,
|
||||
commandQueue = clCreateCommandQueue(context,
|
||||
devices[device_touse], NULL, &status);
|
||||
|
||||
#endif // PROFILING
|
||||
|
@ -413,22 +433,34 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
/*!
|
||||
Release all resources that the user doesn't have access to.
|
||||
*/
|
||||
void cl_cleanup()
|
||||
void cl_cleanup()
|
||||
{
|
||||
cl_int status;
|
||||
|
||||
// Free the command queue
|
||||
if(commandQueue) {
|
||||
clReleaseCommandQueue(commandQueue);
|
||||
if (commandQueue) {
|
||||
status = clReleaseCommandQueue(commandQueue);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseCommandQueue()\n");
|
||||
}
|
||||
|
||||
// Free the context
|
||||
if(context) {
|
||||
clReleaseContext(context);
|
||||
if (context) {
|
||||
status = clReleaseContext(context);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseContext()\n");
|
||||
}
|
||||
|
||||
for (int p = 0; p < numPlatforms; ++p) {
|
||||
for (int d = 0; d < numDevices[p]; ++d) {
|
||||
status = clReleaseDevice(devices[d]);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseDevice()\n");
|
||||
}
|
||||
}
|
||||
|
||||
free(devices);
|
||||
free(numDevices);
|
||||
|
||||
// Free the platforms
|
||||
free(platforms);
|
||||
}
|
||||
|
||||
|
@ -443,6 +475,7 @@ void cl_freeKernel(cl_kernel kernel)
|
|||
if(kernel != NULL) {
|
||||
status = clReleaseKernel(kernel);
|
||||
cl_errChk(status, "Releasing kernel object", true);
|
||||
printf("clReleaseKernel()\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -457,6 +490,7 @@ void cl_freeMem(cl_mem mem)
|
|||
if(mem != NULL) {
|
||||
status = clReleaseMemObject(mem);
|
||||
cl_errChk(status, "Releasing mem object", true);
|
||||
printf("clReleaseMemObject()\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -471,6 +505,7 @@ void cl_freeProgram(cl_program program)
|
|||
if(program != NULL) {
|
||||
status = clReleaseProgram(program);
|
||||
cl_errChk(status, "Releasing program object", true);
|
||||
printf("clReleaseProgram()\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -782,27 +817,6 @@ void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size)
|
|||
cl_unmapBuffer(mem, ptr);
|
||||
}
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return -1;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return -1;
|
||||
}
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Program and kernels
|
||||
//-------------------------------------------------------
|
||||
|
@ -858,17 +872,17 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
|
|||
fread(source, 1, size, fp);
|
||||
source[size] = '\0';*/
|
||||
|
||||
// Create the program object
|
||||
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
|
||||
//cl_program clProgramReturn = clCreateProgramWithBuiltInKernels(context, 1, &device, "Fan1;Fan2", &status);
|
||||
// read kernel binary from file
|
||||
// read kernel binary from file
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
status = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(status, "read_kernel_file", true);
|
||||
cl_int binary_status = 0;
|
||||
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(err, "read_kernel_file", true);
|
||||
|
||||
// Create the program object
|
||||
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
|
||||
cl_program clProgramReturn = clCreateProgramWithBinary(
|
||||
context, 1, &device, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
|
||||
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
|
||||
free(kernel_bin);
|
||||
cl_errChk(status, "Creating program", true);
|
||||
|
||||
|
@ -1440,4 +1454,4 @@ char* itoa_portable(int value, char* result, int base) {
|
|||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
}
|
|
@ -76,6 +76,9 @@ int main(int argc, char *argv[]) {
|
|||
free(b);
|
||||
free(finalVec);
|
||||
// OpenClGaussianElimination(context,timing);
|
||||
|
||||
cl_cleanup();
|
||||
|
||||
printf("Passed!\n");
|
||||
return 0;
|
||||
}
|
||||
|
@ -142,7 +145,8 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
|
|||
writeTime += eventTime(writeEvent, command_queue);
|
||||
clReleaseEvent(writeEvent);
|
||||
|
||||
error = clEnqueueWriteBuffer(command_queue, m_dev,
|
||||
error = clEnqueueWriteBuffer(command_queue,
|
||||
m_dev,
|
||||
1, // change to 0 for nonblocking write
|
||||
0, // offset
|
||||
sizeof(float) * size * size, m, 0, NULL,
|
||||
|
@ -258,6 +262,13 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
|
|||
|
||||
printf("%f\n\n", writeTime + kernelTime + readTime);
|
||||
}
|
||||
|
||||
cl_freeMem(a_dev);
|
||||
cl_freeMem(b_dev);
|
||||
cl_freeMem(m_dev);
|
||||
cl_freeKernel(fan1_kernel);
|
||||
cl_freeKernel(fan2_kernel);
|
||||
cl_freeProgram(gaussianElim_program);
|
||||
}
|
||||
|
||||
float eventTime(cl_event event, cl_command_queue command_queue) {
|
||||
|
|
|
@ -69,7 +69,7 @@ static cl_uint numPlatforms;
|
|||
|
||||
//! All discoverable OpenCL devices (one pointer per platform)
|
||||
static cl_device_id* devices = NULL;
|
||||
static cl_uint* numDevices;
|
||||
static cl_uint* numDevices = NULL;
|
||||
|
||||
//! The chosen OpenCL platform
|
||||
static cl_platform_id platform = NULL;
|
||||
|
@ -265,9 +265,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
if (platform >= 0 && dev >= 0) printInfo = 0;
|
||||
cl_int status;
|
||||
// Used to iterate through the platforms and devices, respectively
|
||||
cl_uint numPlatforms;
|
||||
cl_uint numDevices;
|
||||
|
||||
|
||||
// These will hold the platform and device we select (can potentially be
|
||||
// multiple, but we're just doing one for now)
|
||||
// cl_platform_id platform = NULL;
|
||||
|
@ -397,23 +395,24 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
// Getting platform and device information
|
||||
|
||||
numPlatforms = 1;
|
||||
numDevices = 1;
|
||||
int platform_touse = 0;
|
||||
int device_touse = 0;
|
||||
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
|
||||
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices);
|
||||
|
||||
status = clGetPlatformIDs(1, platforms, NULL);
|
||||
numDevices = (cl_uint*)malloc(sizeof(cl_uint)*numPlatforms);
|
||||
numDevices[0] = 1;
|
||||
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices[0]);
|
||||
|
||||
int platform_touse = 0;
|
||||
int device_touse = 0;
|
||||
|
||||
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1, devices, NULL);
|
||||
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, numDevices[0], devices, NULL);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
|
||||
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
|
||||
device=devices[device_touse];
|
||||
|
||||
#define PROFILING
|
||||
|
||||
#ifdef PROFILING
|
||||
|
||||
commandQueue = clCreateCommandQueue(context,
|
||||
|
@ -421,7 +420,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
|
||||
#else
|
||||
|
||||
clCommandQueue = clCreateCommandQueue(clGPUContext,
|
||||
commandQueue = clCreateCommandQueue(context,
|
||||
devices[device_touse], NULL, &status);
|
||||
|
||||
#endif // PROFILING
|
||||
|
@ -434,22 +433,34 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
/*!
|
||||
Release all resources that the user doesn't have access to.
|
||||
*/
|
||||
void cl_cleanup()
|
||||
void cl_cleanup()
|
||||
{
|
||||
cl_int status;
|
||||
|
||||
// Free the command queue
|
||||
if(commandQueue) {
|
||||
clReleaseCommandQueue(commandQueue);
|
||||
if (commandQueue) {
|
||||
status = clReleaseCommandQueue(commandQueue);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseCommandQueue()\n");
|
||||
}
|
||||
|
||||
// Free the context
|
||||
if(context) {
|
||||
clReleaseContext(context);
|
||||
if (context) {
|
||||
status = clReleaseContext(context);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseContext()\n");
|
||||
}
|
||||
|
||||
for (int p = 0; p < numPlatforms; ++p) {
|
||||
for (int d = 0; d < numDevices[p]; ++d) {
|
||||
status = clReleaseDevice(devices[d]);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseDevice()\n");
|
||||
}
|
||||
}
|
||||
|
||||
free(devices);
|
||||
free(numDevices);
|
||||
|
||||
// Free the platforms
|
||||
free(platforms);
|
||||
}
|
||||
|
||||
|
@ -464,6 +475,7 @@ void cl_freeKernel(cl_kernel kernel)
|
|||
if(kernel != NULL) {
|
||||
status = clReleaseKernel(kernel);
|
||||
cl_errChk(status, "Releasing kernel object", true);
|
||||
printf("clReleaseKernel()\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -478,6 +490,7 @@ void cl_freeMem(cl_mem mem)
|
|||
if(mem != NULL) {
|
||||
status = clReleaseMemObject(mem);
|
||||
cl_errChk(status, "Releasing mem object", true);
|
||||
printf("clReleaseMemObject()\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -492,6 +505,7 @@ void cl_freeProgram(cl_program program)
|
|||
if(program != NULL) {
|
||||
status = clReleaseProgram(program);
|
||||
cl_errChk(status, "Releasing program object", true);
|
||||
printf("clReleaseProgram()\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -49,25 +49,27 @@ int main(int argc, char *argv[]) {
|
|||
printf("%s --> Distance=%f\n", records[i].recString, records[i].distance);
|
||||
}
|
||||
free(recordDistances);
|
||||
|
||||
cl_cleanup();
|
||||
|
||||
printf("Passed!\n");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
|
||||
std::vector<LatLong> &locations, float lat,
|
||||
float lng, int timing) {
|
||||
|
||||
// 1. set up kernel
|
||||
cl_kernel NN_kernel;
|
||||
cl_int status;
|
||||
|
||||
// 1. set up kernel
|
||||
cl_kernel NN_kernel;
|
||||
cl_program cl_NN_program;
|
||||
cl_NN_program = cl_compileProgram((char *)"nearestNeighbor_kernel.cl", NULL);
|
||||
|
||||
NN_kernel = clCreateKernel(cl_NN_program, "NearestNeighbor", &status);
|
||||
status =
|
||||
cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel", true);
|
||||
if (status)
|
||||
exit(1);
|
||||
cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel", true);
|
||||
|
||||
// 2. set up memory on device and send ipts data to device
|
||||
// copy ipts(1,2) to device
|
||||
// also need to alloate memory for the distancePoints
|
||||
|
@ -78,9 +80,11 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
|
|||
|
||||
d_locations = clCreateBuffer(context, CL_MEM_READ_ONLY,
|
||||
sizeof(LatLong) * numRecords, NULL, &error);
|
||||
cl_errChk(error, "ERROR: clCreateBuffer() failed", true);
|
||||
|
||||
d_distances = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
sizeof(float) * numRecords, NULL, &error);
|
||||
cl_errChk(error, "ERROR: clCreateBuffer() failed", true);
|
||||
|
||||
cl_command_queue command_queue = cl_getCommandQueue();
|
||||
cl_event writeEvent, kernelEvent, readEvent;
|
||||
|
@ -89,6 +93,7 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
|
|||
0, // offset
|
||||
sizeof(LatLong) * numRecords, &locations[0], 0,
|
||||
NULL, &writeEvent);
|
||||
cl_errChk(error, "ERROR: clEnqueueWriteBuffer() failed", true);
|
||||
|
||||
// 3. send arguments to device
|
||||
cl_int argchk;
|
||||
|
@ -124,8 +129,10 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
|
|||
&readEvent);
|
||||
|
||||
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
|
||||
if (timing) {
|
||||
clFinish(command_queue);
|
||||
|
||||
clFinish(command_queue);
|
||||
|
||||
if (timing) {
|
||||
cl_ulong eventStart, eventEnd, totalTime = 0;
|
||||
printf("# Records\tWrite(s) [size]\t\tKernel(s)\tRead(s) "
|
||||
"[size]\t\tTotal(s)\n");
|
||||
|
@ -166,8 +173,14 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
|
|||
printf("%f\n\n", (float)(totalTime / 1e9));
|
||||
}
|
||||
// 6. return finalized data and release buffers
|
||||
clReleaseMemObject(d_locations);
|
||||
clReleaseMemObject(d_distances);
|
||||
clReleaseEvent(writeEvent);
|
||||
clReleaseEvent(kernelEvent);
|
||||
clReleaseEvent(readEvent);
|
||||
cl_freeMem(d_locations);
|
||||
cl_freeMem(d_distances);
|
||||
cl_freeKernel(NN_kernel);
|
||||
cl_freeProgram(cl_NN_program);
|
||||
|
||||
return distances;
|
||||
}
|
||||
|
||||
|
|
|
@ -7,6 +7,8 @@ POCL_RT_PATH ?= /opt/pocl/runtime
|
|||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
OPTS ?= -n1024
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
@ -33,19 +35,19 @@ $(PROJECT): $(SRCS)
|
|||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
|
|
@ -29,11 +29,9 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
|
||||
//#define NUM_DATA 65536
|
||||
#define NUM_DATA 1024
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
cl_int _err = _expr; \
|
||||
|
@ -85,14 +83,18 @@ uint8_t *kernel_bin = NULL;
|
|||
///
|
||||
// Cleanup any created OpenCL resources
|
||||
//
|
||||
void Cleanup(cl_context context, cl_command_queue commandQueue,
|
||||
cl_program program, cl_kernel kernel, cl_mem memObjects[3]) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue,
|
||||
cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
|
||||
if (kernel_bin)
|
||||
free(kernel_bin);
|
||||
|
||||
if (commandQueue != 0)
|
||||
clReleaseCommandQueue(commandQueue);
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (memObjects[i] != 0)
|
||||
clReleaseMemObject(memObjects[i]);
|
||||
}
|
||||
if (commandQueue != 0)
|
||||
clReleaseCommandQueue(commandQueue);
|
||||
|
||||
if (kernel != 0)
|
||||
clReleaseKernel(kernel);
|
||||
|
@ -103,11 +105,40 @@ void Cleanup(cl_context context, cl_command_queue commandQueue,
|
|||
if (context != 0)
|
||||
clReleaseContext(context);
|
||||
|
||||
if (kernel_bin) free(kernel_bin);
|
||||
if (device_id != 0)
|
||||
clReleaseDevice(device_id);
|
||||
}
|
||||
|
||||
int size = 1024;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Workload size=%d\n", size);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
printf("enter demo main\n");
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
|
@ -126,7 +157,7 @@ int main(int argc, char **argv) {
|
|||
context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err));
|
||||
|
||||
cl_command_queue queue;
|
||||
queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &_err));
|
||||
queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, NULL, &_err));
|
||||
|
||||
cl_kernel kernel = 0;
|
||||
cl_mem memObjects[2] = {0, 0};
|
||||
|
@ -139,7 +170,7 @@ int main(int argc, char **argv) {
|
|||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(context, queue, program, kernel, memObjects);
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Read program from binary." << std::endl;
|
||||
|
@ -148,7 +179,7 @@ int main(int argc, char **argv) {
|
|||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
||||
size_t nbytes = sizeof(float) * NUM_DATA;
|
||||
size_t nbytes = sizeof(float) * size;
|
||||
|
||||
printf("attempting to create input buffer\n");
|
||||
cl_mem input_buffer;
|
||||
|
@ -175,13 +206,13 @@ int main(int argc, char **argv) {
|
|||
|
||||
printf("attempting to enqueue write buffer\n");
|
||||
float* h_src = (float*)malloc(nbytes);
|
||||
for (int i = 0; i < NUM_DATA; i++) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
|
||||
}
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
|
||||
free(h_src);
|
||||
|
||||
size_t global_work_size[] = {NUM_DATA/2, NUM_DATA/2};
|
||||
size_t global_work_size[] = {size/2, size/2};
|
||||
printf("attempting to enqueue kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
|
||||
|
@ -196,18 +227,13 @@ int main(int argc, char **argv) {
|
|||
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
|
||||
|
||||
/*printf("Result:");
|
||||
for (int i = 0; i < NUM_DATA; i++) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
float data = h_dst[i];
|
||||
printf(" %f", data);
|
||||
}*/
|
||||
free(h_dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(memObjects[0]));
|
||||
CL_CHECK(clReleaseMemObject(memObjects[1]));
|
||||
|
||||
CL_CHECK(clReleaseKernel(kernel));
|
||||
CL_CHECK(clReleaseProgram(program));
|
||||
CL_CHECK(clReleaseContext(context));
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -7,6 +7,8 @@ POCL_RT_PATH ?= /opt/pocl/runtime
|
|||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
@ -33,19 +35,19 @@ $(PROJECT): $(SRCS)
|
|||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
|
|
@ -35,8 +35,6 @@
|
|||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
|
||||
#define NUM_DATA (16+2)
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
cl_int _err = _expr; \
|
||||
|
@ -159,14 +157,18 @@ float poclu_cl_half_to_float(cl_half value) {
|
|||
///
|
||||
// Cleanup any created OpenCL resources
|
||||
//
|
||||
void Cleanup(cl_context context, cl_command_queue commandQueue,
|
||||
cl_program program, cl_kernel kernel, cl_mem memObjects[3]) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue,
|
||||
cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
|
||||
if (kernel_bin)
|
||||
free(kernel_bin);
|
||||
|
||||
if (commandQueue != 0)
|
||||
clReleaseCommandQueue(commandQueue);
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
if (memObjects[i] != 0)
|
||||
clReleaseMemObject(memObjects[i]);
|
||||
}
|
||||
if (commandQueue != 0)
|
||||
clReleaseCommandQueue(commandQueue);
|
||||
|
||||
if (kernel != 0)
|
||||
clReleaseKernel(kernel);
|
||||
|
@ -177,11 +179,40 @@ void Cleanup(cl_context context, cl_command_queue commandQueue,
|
|||
if (context != 0)
|
||||
clReleaseContext(context);
|
||||
|
||||
if (kernel_bin) free(kernel_bin);
|
||||
if (device_id != 0)
|
||||
clReleaseDevice(device_id);
|
||||
}
|
||||
|
||||
int size = 16+2;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg)+2;
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Workload size=%d\n", size);
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
printf("enter demo main\n");
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
|
@ -213,7 +244,7 @@ int main(int argc, char **argv) {
|
|||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(context, queue, program, kernel, memObjects);
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Read program from binary." << std::endl;
|
||||
|
@ -222,7 +253,7 @@ int main(int argc, char **argv) {
|
|||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
||||
size_t nbytes = sizeof(float) * NUM_DATA * NUM_DATA;
|
||||
size_t nbytes = sizeof(float) * size * size;
|
||||
|
||||
printf("attempting to create input buffer\n");
|
||||
cl_mem input_buffer;
|
||||
|
@ -235,7 +266,7 @@ int main(int argc, char **argv) {
|
|||
memObjects[0] = input_buffer;
|
||||
memObjects[1] = output_buffer;
|
||||
|
||||
long long ldc = NUM_DATA;
|
||||
long long ldc = size;
|
||||
|
||||
float m0 = 1.0;
|
||||
float m1 = 1.0;
|
||||
|
@ -265,15 +296,15 @@ int main(int argc, char **argv) {
|
|||
|
||||
printf("attempting to enqueue write buffer\n");
|
||||
float* h_src = (float*)malloc(nbytes);
|
||||
for (int i = 0; i < NUM_DATA * NUM_DATA; i++) {
|
||||
for (int i = 0; i < size * size; i++) {
|
||||
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
|
||||
}
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
|
||||
free(h_src);
|
||||
|
||||
size_t global_offset[2] = {1, 1};
|
||||
size_t global_work_size[2] = {NUM_DATA - 2, NUM_DATA - 2}; // avoid the edges
|
||||
const size_t local_work_size[2] = {NUM_DATA - 2, 1};
|
||||
size_t global_work_size[2] = {size - 2, size - 2}; // avoid the edges
|
||||
const size_t local_work_size[2] = {size - 2, 1};
|
||||
printf("attempting to enqueue kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, global_offset,
|
||||
|
@ -286,20 +317,15 @@ int main(int argc, char **argv) {
|
|||
printf("Download destination buffer\n");
|
||||
float* h_dst = (float*)malloc(nbytes);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
|
||||
|
||||
|
||||
/*printf("Result:");
|
||||
for (int i = 0; i < NUM_DATA * NUM_DATA; i++) {
|
||||
for (int i = 0; i < size; i++) {
|
||||
float data = h_dst[i];
|
||||
printf(" %f", data);
|
||||
}*/
|
||||
free(h_dst);
|
||||
|
||||
CL_CHECK(clReleaseMemObject(memObjects[0]));
|
||||
CL_CHECK(clReleaseMemObject(memObjects[1]));
|
||||
|
||||
CL_CHECK(clReleaseKernel(kernel));
|
||||
CL_CHECK(clReleaseProgram(program));
|
||||
CL_CHECK(clReleaseContext(context));
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -129,6 +129,8 @@ static void parse_args(int argc, char **argv) {
|
|||
printf("Error: invalid size!\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
printf("Workload size=%d\n", size);
|
||||
}
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
|
@ -218,7 +220,8 @@ int main (int argc, char **argv) {
|
|||
matmul(h_ref, h_a, h_b, size, size, size);
|
||||
for (int i = 0; i < (size * size); i++) {
|
||||
if (!almost_equal(h_c[i], h_ref[i])) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, h_ref[i], h_c[i]);
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, h_ref[i], h_c[i]);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -112,6 +112,8 @@ static void parse_args(int argc, char **argv) {
|
|||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
printf("Workload size=%d\n", size);
|
||||
}
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
|
@ -196,7 +198,8 @@ int main (int argc, char **argv) {
|
|||
for (int i = 0; i < size; ++i) {
|
||||
float ref = h_a[i] + h_b[i];
|
||||
if (!almost_equal(h_c[i], ref)) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -123,11 +123,15 @@ esac
|
|||
if [ -d "$VORTEX_HOME/driver/tests/$APP" ];
|
||||
then
|
||||
APP_PATH=$VORTEX_HOME/driver/tests/$APP
|
||||
else
|
||||
elif [ -d "$VORTEX_HOME/benchmarks/opencl/$APP" ];
|
||||
then
|
||||
APP_PATH=$VORTEX_HOME/benchmarks/opencl/$APP
|
||||
else
|
||||
echo "Application folder found: $APP"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF_FLAG"
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF_FLAG $CONFIGS"
|
||||
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
|
||||
|
|
|
@ -3,25 +3,62 @@
|
|||
# exit when any command fails
|
||||
set -e
|
||||
|
||||
make -s
|
||||
|
||||
# Dogfood tests
|
||||
./ci/test_runtime.sh
|
||||
./ci/test_riscv_isa.sh
|
||||
./ci/test_opencl.sh
|
||||
./ci/test_driver.sh
|
||||
./ci/test_simx.sh
|
||||
./ci/test_compiler.sh
|
||||
|
||||
# Build tests disabling extensions
|
||||
# warp/threads configurations
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=demo
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo
|
||||
|
||||
# cores clustering
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
|
||||
|
||||
# L2/L3
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=demo --args="-n1"
|
||||
|
||||
# build flags
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
|
||||
# disabling M extension
|
||||
CONFIGS=-DEXT_M_DISABLE make -C hw/simulate
|
||||
|
||||
# disabling F extension
|
||||
CONFIGS=-DEXT_F_DISABLE make -C hw/simulate
|
||||
|
||||
# disable shared memory
|
||||
CONFIGS=-DSM_ENABLE=0 make -C hw/simulate
|
||||
|
||||
# Blackbox tests
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
|
||||
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=demo --args="-n1"
|
||||
# using FPNEW core
|
||||
FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
|
||||
|
||||
# test 128-bit MEM block
|
||||
CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test 128-bit MEM and DRAM block
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test 27-bit DRAM address
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test 128-bit DRAM block
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test verilator reset values
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm
|
||||
|
||||
# test vlsim memory stress
|
||||
CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm
|
||||
|
|
70
doc/Cache_Subsystem.md
Normal file
70
doc/Cache_Subsystem.md
Normal file
|
@ -0,0 +1,70 @@
|
|||
# Vortex Cache Subsystem
|
||||
|
||||
The Vortex Cache Sub-system has the following main properties:
|
||||
|
||||
- High-bandwidth with bank parallelism
|
||||
- Snoop protocol to flush data for CPU access
|
||||
- Generic design: Dcache, Icache, Shared Memory, L2 cache, L3 cache
|
||||
|
||||
### Cache Hierarchy
|
||||
|
||||

|
||||
|
||||
- Cache can be configured to be any level in the hierarchy
|
||||
- Caches communicate via snooping
|
||||
- Cache flush from AFU is passed down the hierarchy
|
||||
|
||||
### VX_cache.v (Top Module)
|
||||
|
||||
VX_cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory.
|
||||
|
||||

|
||||
|
||||
- Configurable (Cache size, number of banks, bank line size, etc.)
|
||||
- I/O signals
|
||||
- Core Request
|
||||
- Core Rsp
|
||||
- DRAM Req
|
||||
- DRAM Rsp
|
||||
- Snoop Req
|
||||
- Snoop Rsp
|
||||
- Snoop Forwarding Out
|
||||
- Snoop Forwarding In
|
||||
- Bank Select
|
||||
- Assigns valid and ready signals for each bank
|
||||
- Snoop Forwarder
|
||||
- DRAM Request Arbiter
|
||||
- Prepares cache response for communication with DRAM
|
||||
- Snoop Response Arbiter
|
||||
- Sends snoop response
|
||||
- Core Response Merge
|
||||
- Cache accesses one line at a time. As a result, each request may not come back in the same response. This module tries to recombine the responses by thread ID.
|
||||
|
||||
### VX_bank.v
|
||||
|
||||
VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
|
||||
|
||||

|
||||
|
||||
- Allows for high throughput
|
||||
- Each bank contains queues to hold requests to the cache
|
||||
- I/O signals
|
||||
- Core request
|
||||
- Core Response
|
||||
- DRAM Fill Requests
|
||||
- DRAM Fill Response
|
||||
- DRAM WB Requests
|
||||
- Snp Request
|
||||
- Snp Response
|
||||
- Request Priority: DRAM fill, miss reserve, core request, snoop request
|
||||
- Snoop Request Queue
|
||||
- DRAM Fill Queue
|
||||
- Core Req Arbiter
|
||||
- Requests to be processed by the bank
|
||||
- Tag Data Store
|
||||
- Registers for valid, dirty, dirtyb, tag, and data
|
||||
- Length of registers determined by lines in the bank
|
||||
- Tag Data Access:
|
||||
- I/O: stall, snoop info, force request miss
|
||||
- Writes to cache or sends read response; hit or miss determined here
|
||||
- A missed request goes to the miss reserve if it is not a snoop request or DRAM fill
|
35
doc/Codebase.md
Normal file
35
doc/Codebase.md
Normal file
|
@ -0,0 +1,35 @@
|
|||
# Vortex Codebase
|
||||
|
||||
The directory/file layout of the Vortex codebase is as follows:
|
||||
|
||||
- `benchmarks`: contains opencl, risc-v, and vector tests
|
||||
- `opencl`: contains basic kernel operation tests (i.e. vector add, transpose, dot product)
|
||||
- `riscv`: contains official riscv tests which are pre-compiled into binaries
|
||||
- `vector`: tests for vector instructions (not yet implemented)
|
||||
- `ci`: contains tests to be run during continuous integration (Travis CI)
|
||||
- driver, opencl, riscv_isa, and runtime tests
|
||||
- `driver`: contains driver software implementation (software that is run on the host to communicate with the vortex processor)
|
||||
- `opae`: contains code for driver that runs on FPGA
|
||||
- `rtlsim`: contains code for driver that runs on local machine (driver built using verilator which converts rtl to c++ binary)
|
||||
- `simx`: contains code for driver that runs on local machine (vortex)
|
||||
- `include`: contains vortex.h which has the vortex API that is used by the drivers
|
||||
- `runtime`: contains software used inside kernel programs to expose GPGPU capabilities
|
||||
- `include`: contains vortex API needed for runtime
|
||||
- `linker`: contains linker file for compiling kernels
|
||||
- `src`: contains implementation of vortex API (from include folder)
|
||||
- `tests`: contains runtime tests
|
||||
- `simple`: contains test for GPGPU functionality allowed in vortex
|
||||
- `simx`: contains simX, the cycle approximate simulator for vortex
|
||||
- `miscs`: contains old code that is no longer used
|
||||
- `hw`:
|
||||
- `unit_tests`: contains unit test for RTL of cache and queue
|
||||
- `syn`: contains all synthesis scripts (quartus and yosys)
|
||||
- `quartus`: contains code to synthesize cache, core, pipeline, top, and vortex stand-alone
|
||||
- `simulate`: contains RTL simulator (verilator)
|
||||
- `testbench.cpp`: runs either the riscv, runtime, or opencl tests
|
||||
- `opae`: contains source code for the accelerator functional unit (AFU) and code which programs the fpga
|
||||
- `rtl`: contains rtl source code
|
||||
- `cache`: contains cache subsystem code
|
||||
- `fp_cores`: contains floating point unit code
|
||||
- `interfaces`: contains code that handles communication for each of the units of the microarchitecture
|
||||
- `libs`: contains general-purpose modules (i.e., buffers, encoders, arbiters, pipe registers)
|
BIN
doc/Images/cache_hierarchy.png
Normal file
BIN
doc/Images/cache_hierarchy.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 60 KiB |
BIN
doc/Images/vortex_bank.png
Normal file
BIN
doc/Images/vortex_bank.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 77 KiB |
BIN
doc/Images/vortex_cache_top_module.png
Normal file
BIN
doc/Images/vortex_cache_top_module.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 67 KiB |
BIN
doc/Images/vortex_microarchitecture_v2.png
Normal file
BIN
doc/Images/vortex_microarchitecture_v2.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 517 KiB |
94
doc/Microarchitecture.md
Normal file
94
doc/Microarchitecture.md
Normal file
|
@ -0,0 +1,94 @@
|
|||
# Vortex Microarchitecture
|
||||
|
||||
### Vortex GPGPU Execution Model
|
||||
|
||||
Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with a single warp issued per cycle.
|
||||
|
||||
- **Threads**
|
||||
- Smallest unit of computation
|
||||
- Each thread has its own register file (32 int + 32 fp registers)
|
||||
- Threads execute in parallel
|
||||
- **Warps**
|
||||
- A logical cluster of threads
|
||||
- Each thread in a warp executes the same instruction
|
||||
- The PC is shared; maintain thread mask for Writeback
|
||||
- Warp's execution is time-multiplexed at log steps
|
||||
- Ex. warp 0 executes at cycle 0, warp 1 executes at cycle 1
|
||||
|
||||
### Vortex RISC-V ISA Extension
|
||||
|
||||
- **Thread Mask Control**
|
||||
- Control the number of threads to activate during execution
|
||||
- `TMC` *count*: activate count threads
|
||||
- **Warp Scheduling**
|
||||
- Control the number of warps to activate during execution
|
||||
- `WSPAWN` *count, addr*: activate count warps and jump to addr location
|
||||
- **Control-Flow Divergence**
|
||||
- Control threads to activate when a branch diverges
|
||||
- `SPLIT` *predicate*: apply 'taken' predicate thread mask and save 'not-taken' into IPDOM stack
|
||||
- `JOIN`: restore 'not-taken' thread mask
|
||||
- **Warp Synchronization**
|
||||
- `BAR` *id, count*: stall warps entering barrier *id* until count is reached
|
||||
|
||||
### Vortex Pipeline/Datapath
|
||||
|
||||

|
||||
|
||||
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB.
|
||||
|
||||
- **Fetch**
|
||||
- Warp Scheduler
|
||||
- Track stalled & active warps, resolve branches and barriers, maintain split/join IPDOM stack
|
||||
- Instruction Cache
|
||||
- Retrieve instruction from cache, issue I-cache requests/responses
|
||||
- **Decode**
|
||||
- Decode fetched instructions, notify warp scheduler when the following instructions are decoded:
|
||||
- Branch, tmc, split/join, wspawn
|
||||
- Precompute used_regs mask (needed for Issue stage)
|
||||
- **Issue**
|
||||
- Scheduling
|
||||
- In-order issue (operands/execute unit ready), out-of-order commit
|
||||
- IBuffer
|
||||
- Store fetched instructions, separate queues per-warp, selects next warp through round-robin scheduling
|
||||
- Scoreboard
|
||||
- Track in-use registers
|
||||
- GPRs (General-Purpose Registers) stage
|
||||
- Fetch issued instruction operands and send operands to execute unit
|
||||
- **Execute**
|
||||
- ALU Unit
|
||||
- Single-cycle operations (+,-,>>,<<,&,|,^), Branch instructions (Share ALU resources)
|
||||
- MULDIV Unit
|
||||
- Multiplier - done in 2 cycles
|
||||
- Divider - division and remainder, done in 32 cycles
|
||||
- Implements serial algorithm (Stalls the pipeline)
|
||||
- FPU Unit
|
||||
- Multi-cycle operations, uses `FPnew` Library on ASIC, uses hard DSPs on FPGA
|
||||
- CSR Unit
|
||||
- Store constant status registers - device caps, FPU status flags, performance counters
|
||||
- Handle external CSR requests (requests from host CPU)
|
||||
- LSU Unit
|
||||
- Handle load/store operations, issue D-cache requests, handle D-cache responses
|
||||
- Commit load responses - saves storage, Scoreboard tracks completion
|
||||
- GPGPU Unit
|
||||
- Handle GPGPU instructions
|
||||
- TMC, WSPAWN, SPLIT, BAR
|
||||
- JOIN is handled by Warp Scheduler (upon SPLIT response)
|
||||
- **Commit**
|
||||
- Commit
|
||||
- Update CSR flags, update performance counters
|
||||
- Writeback
|
||||
- Write result back to GPRs, notify Scoreboard (release in-use register), select candidate instruction (ALU unit has highest priority)
|
||||
- **Clustering**
|
||||
- Group multiple cores into clusters (optionally share L2 cache)
|
||||
- Group multiple clusters (optionally share L3 cache)
|
||||
- Configurable at build time
|
||||
- Default configuration:
|
||||
- #Clusters = 1
|
||||
- #Cores = 4
|
||||
- #Warps = 4
|
||||
- #Threads = 4
|
||||
- **FPGA AFU Interface**
|
||||
- Manage CPU-GPU communication
|
||||
- Query devices caps, load kernel instructions and resource buffers, start kernel execution, read destination buffers
|
||||
- Local Memory - GPU access to local DRAM
|
||||
- Reserved I/O addresses - redirect to host CPU, console output
|
|
@ -24,10 +24,9 @@ Running tests under specific drivers (rtlsim,simx,fpga) is done using the script
|
|||
- *L3cache* - used to enable the shared l3cache among the Vortex clusters.
|
||||
- *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, vlsim, fpga, or simx).
|
||||
- *Debug* - used to enable debug mode for the Vortex simulation.
|
||||
- *Scope* -
|
||||
- *Perf* - is used to enable the detailed performance counters within the Vortex simulation.
|
||||
- *App* - is used to specify which test/benchmark to run in the Vortex simulation. The main choices are vecadd, sgemm, basic, demo, and dogfood. Other tests/benchmarks are located in the `/benchmarks/opencl` folder though not all of them work wit the current version of Vortex.
|
||||
- *Args* -
|
||||
- *Perf* - used to enable the detailed performance counters within the Vortex simulation.
|
||||
- *App* - used to specify which test/benchmark to run in the Vortex simulation. The main choices are vecadd, sgemm, basic, demo, and dogfood. Other tests/benchmarks are located in the `/benchmarks/opencl` folder though not all of them work with the current version of Vortex.
|
||||
- *Args* - used to pass additional arguments to the application.
|
||||
|
||||
Example use of command line arguments: Run the sgemm benchmark using the vlsim driver with a Vortex configuration of 1 cluster, 4 cores, 4 warps, and 4 threads.
|
||||
|
||||
|
|
|
@ -2,10 +2,12 @@
|
|||
|
||||
### Table of Contents
|
||||
|
||||
- Vortex Architecture
|
||||
- [Vortex Codebase Layout](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Codebase.md)
|
||||
- [Vortex Microarchitecture and Extended RISC-V ISA](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Microarchitecture.md)
|
||||
- [Vortex Cache Subsystem](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Cache_Subsystem.md)
|
||||
- Vortex Software
|
||||
- [Vortex Simulation](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Simulation.md)
|
||||
- [FPGA](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Flubber_FPGA_Startup_Guide.md)
|
||||
- [FPGA Configuration, Program and Test](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Flubber_FPGA_Startup_Guide.md)
|
||||
- Debugging
|
||||
- Useful Links
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
all: stub rtlsim simx opae
|
||||
all: stub rtlsim simx opae tests
|
||||
|
||||
stub:
|
||||
$(MAKE) -C stub
|
||||
|
@ -12,10 +12,14 @@ rtlsim:
|
|||
simx:
|
||||
$(MAKE) -C simx
|
||||
|
||||
tests:
|
||||
$(MAKE) -C tests
|
||||
|
||||
clean:
|
||||
$(MAKE) clean -C stub
|
||||
$(MAKE) clean -C opae
|
||||
$(MAKE) clean -C rtlsim
|
||||
$(MAKE) clean -C simx
|
||||
$(MAKE) clean -C tests
|
||||
|
||||
.PHONY: all stub opae rtlsim simx clean
|
||||
.PHONY: all stub opae rtlsim simx tests clean
|
|
@ -33,6 +33,13 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_
|
|||
while (offset < size) {
|
||||
auto chunk_size = std::min<size_t>(buffer_transfer_size, size - offset);
|
||||
std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size);
|
||||
|
||||
/*printf("** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset);
|
||||
for (int i = 0, n = ((chunk_size+7)/8); i < n; ++i) {
|
||||
printf("%08x", ((uint64_t*)((uint8_t*)content + offset))[n-1-i]);
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
err = vx_copy_to_dev(buffer, kernel_base_addr + offset, chunk_size, 0);
|
||||
if (err != 0) {
|
||||
vx_buf_release(buffer);
|
||||
|
@ -115,10 +122,10 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t smem_writes = 0;
|
||||
uint64_t smem_bank_stalls = 0;
|
||||
// PERF: memory
|
||||
uint64_t dram_reads = 0;
|
||||
uint64_t dram_writes = 0;
|
||||
uint64_t dram_stalls = 0;
|
||||
uint64_t dram_lat = 0;
|
||||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_stalls = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
#endif
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
|
@ -255,21 +262,21 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization);
|
||||
smem_bank_stalls += smem_bank_st_per_core;
|
||||
|
||||
// PERF: DRAM
|
||||
uint64_t dram_reads_per_core, dram_writes_per_core, dram_stalls_per_core, dram_lat_per_core;
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_READS, CSR_MPM_DRAM_READS_H, &dram_reads_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core);
|
||||
int dram_utilization = (int)((double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core)) * 100);
|
||||
int dram_avg_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, dram_writes_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%ld (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram average latency=%d cycles\n", core_id, dram_avg_lat);
|
||||
dram_reads += dram_reads_per_core;
|
||||
dram_writes += dram_writes_per_core;
|
||||
dram_stalls += dram_stalls_per_core;
|
||||
dram_lat += dram_lat_per_core;
|
||||
// PERF: memory
|
||||
uint64_t mem_reads_per_core, mem_writes_per_core, mem_stalls_per_core, mem_lat_per_core;
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_READS, CSR_MPM_MEM_READS_H, &mem_reads_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_WRITES, CSR_MPM_MEM_WRITES_H, &mem_writes_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_ST, CSR_MPM_MEM_ST_H, &mem_stalls_per_core);
|
||||
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_LAT, CSR_MPM_MEM_LAT_H, &mem_lat_per_core);
|
||||
int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
mem_reads += mem_reads_per_core;
|
||||
mem_writes += mem_writes_per_core;
|
||||
mem_stalls += mem_stalls_per_core;
|
||||
mem_lat += mem_lat_per_core;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -282,8 +289,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
||||
int dram_utilization = (int)((double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls)) * 100);
|
||||
int dram_avg_lat = (int)(double(dram_lat) / double(dram_reads));
|
||||
int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
|
||||
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
|
||||
|
@ -306,9 +313,9 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
|
||||
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
|
||||
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
|
||||
fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes);
|
||||
fprintf(stream, "PERF: dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization);
|
||||
fprintf(stream, "PERF: dram average latency=%d cycles\n", dram_avg_lat);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization);
|
||||
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
|
||||
#endif
|
||||
|
||||
return ret;
|
||||
|
|
|
@ -1,8 +1,7 @@
|
|||
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
|
||||
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CFLAGS += -Wno-aligned-new -Wno-maybe-uninitialized
|
||||
|
||||
CFLAGS += -DUSE_VLSIM -fPIC -Wno-maybe-uninitialized
|
||||
CFLAGS += -I../../../../hw
|
||||
|
||||
# control RTL debug print states
|
||||
|
@ -13,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
|||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
|
@ -22,15 +21,9 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
|
|||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
|
||||
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
||||
|
||||
CFLAGS += -fPIC
|
||||
|
||||
CFLAGS += -DUSE_VLSIM $(CONFIGS)
|
||||
|
||||
CFLAGS += $(CONFIGS)
|
||||
CFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
|
@ -49,10 +42,11 @@ TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
|||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
|
||||
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
|
||||
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
|
||||
VL_FLAGS += -Wno-DECLFILENAME
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += verilator.vlt
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
|
||||
# Enable Verilator multithreaded simulation
|
||||
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
||||
|
@ -83,16 +77,20 @@ endif
|
|||
VL_FLAGS += -DNOPAE
|
||||
CFLAGS += -DNOPAE
|
||||
|
||||
# use DPI FPU
|
||||
VL_FLAGS += -DFPU_DPI
|
||||
# FPU backend
|
||||
FPU_CORE ?= FPU_DPI
|
||||
VL_FLAGS += -D$(FPU_CORE)
|
||||
|
||||
PROJECT = libopae-c-vlsim.so
|
||||
|
||||
all: $(PROJECT)
|
||||
|
||||
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
|
||||
../../../hw/scripts/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(PROJECT): $(SRCS) vortex_afu.h
|
||||
verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
|
||||
make -j -C obj_dir -f V$(TOP).mk
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh
|
||||
rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh vortex_afu.h
|
||||
|
|
|
@ -10,10 +10,23 @@
|
|||
|
||||
#define RESET_DELAY 4
|
||||
|
||||
#define ENABLE_DRAM_STALLS
|
||||
#define DRAM_LATENCY 24
|
||||
#define DRAM_RQ_SIZE 16
|
||||
#define DRAM_STALLS_MODULO 16
|
||||
#define ENABLE_MEM_STALLS
|
||||
|
||||
#ifndef MEM_LATENCY
|
||||
#define MEM_LATENCY 24
|
||||
#endif
|
||||
|
||||
#ifndef MEM_RQ_SIZE
|
||||
#define MEM_RQ_SIZE 16
|
||||
#endif
|
||||
|
||||
#ifndef MEM_STALLS_MODULO
|
||||
#define MEM_STALLS_MODULO 16
|
||||
#endif
|
||||
|
||||
#ifndef VERILATOR_RESET_VALUE
|
||||
#define VERILATOR_RESET_VALUE 2
|
||||
#endif
|
||||
|
||||
uint64_t timestamp = 0;
|
||||
|
||||
|
@ -23,7 +36,7 @@ double sc_time_stamp() {
|
|||
|
||||
opae_sim::opae_sim() {
|
||||
// force random values for uninitialized signals
|
||||
Verilated::randReset(2);
|
||||
Verilated::randReset(VERILATOR_RESET_VALUE);
|
||||
Verilated::randSeed(50);
|
||||
|
||||
// Turn off assertion before reset
|
||||
|
@ -137,16 +150,19 @@ void opae_sim::flush() {
|
|||
|
||||
void opae_sim::reset() {
|
||||
|
||||
host_buffers_.clear();
|
||||
dram_reads_.clear();
|
||||
host_buffers_.clear();
|
||||
cci_reads_.clear();
|
||||
cci_writes_.clear();
|
||||
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0;
|
||||
vortex_afu_->vcp2af_sRxPort_c1_rspValid = 0;
|
||||
vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull = 0;
|
||||
vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull = 0;
|
||||
vortex_afu_->avs_readdatavalid = 0;
|
||||
vortex_afu_->avs_waitrequest = 0;
|
||||
|
||||
for (int b = 0; b < PLATFORM_PARAM_LOCAL_MEMORY_BANKS; ++b) {
|
||||
mem_reads_[b].clear();
|
||||
vortex_afu_->avs_readdatavalid[b] = 0;
|
||||
vortex_afu_->avs_waitrequest[b] = 0;
|
||||
}
|
||||
|
||||
vortex_afu_->reset = 1;
|
||||
|
||||
|
@ -268,84 +284,89 @@ void opae_sim::sTxPort_bus() {
|
|||
}
|
||||
|
||||
void opae_sim::avs_bus() {
|
||||
// update DRAM responses schedule
|
||||
for (auto& rsp : dram_reads_) {
|
||||
if (rsp.cycles_left > 0)
|
||||
rsp.cycles_left -= 1;
|
||||
}
|
||||
|
||||
// schedule DRAM responses in FIFO order
|
||||
std::list<dram_rd_req_t>::iterator dram_rd_it(dram_reads_.end());
|
||||
if (!dram_reads_.empty()
|
||||
&& (0 == dram_reads_.begin()->cycles_left)) {
|
||||
dram_rd_it = dram_reads_.begin();
|
||||
}
|
||||
|
||||
// send DRAM response
|
||||
vortex_afu_->avs_readdatavalid = 0;
|
||||
if (dram_rd_it != dram_reads_.end()) {
|
||||
vortex_afu_->avs_readdatavalid = 1;
|
||||
memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), CACHE_BLOCK_SIZE);
|
||||
uint32_t addr = dram_rd_it->addr;
|
||||
dram_reads_.erase(dram_rd_it);
|
||||
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE);
|
||||
for (auto& req : dram_reads_) {
|
||||
if (req.cycles_left != 0)
|
||||
printf(" !%0x", req.addr * CACHE_BLOCK_SIZE);
|
||||
else
|
||||
printf(" %0x", req.addr * CACHE_BLOCK_SIZE);
|
||||
for (int b = 0; b < PLATFORM_PARAM_LOCAL_MEMORY_BANKS; ++b) {
|
||||
// update memory responses schedule
|
||||
for (auto& rsp : mem_reads_[b]) {
|
||||
if (rsp.cycles_left > 0)
|
||||
rsp.cycles_left -= 1;
|
||||
}
|
||||
printf("}\n");*/
|
||||
}
|
||||
|
||||
// handle DRAM stalls
|
||||
bool dram_stalled = false;
|
||||
#ifdef ENABLE_DRAM_STALLS
|
||||
if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) {
|
||||
dram_stalled = true;
|
||||
} else
|
||||
if (dram_reads_.size() >= DRAM_RQ_SIZE) {
|
||||
dram_stalled = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
// process DRAM requests
|
||||
if (!dram_stalled) {
|
||||
assert(!vortex_afu_->avs_read || !vortex_afu_->avs_write);
|
||||
if (vortex_afu_->avs_write) {
|
||||
assert(0 == vortex_afu_->mem_bank_select);
|
||||
uint64_t byteen = vortex_afu_->avs_byteenable;
|
||||
unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE);
|
||||
uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata);
|
||||
for (int i = 0; i < CACHE_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
ram_[base_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
// schedule memory responses in FIFO order
|
||||
std::list<mem_rd_req_t>::iterator mem_rd_it(mem_reads_[b].end());
|
||||
if (!mem_reads_[b].empty()
|
||||
&& (0 == mem_reads_[b].begin()->cycles_left)) {
|
||||
mem_rd_it = mem_reads_[b].begin();
|
||||
}
|
||||
if (vortex_afu_->avs_read) {
|
||||
assert(0 == vortex_afu_->mem_bank_select);
|
||||
dram_rd_req_t dram_req;
|
||||
dram_req.addr = vortex_afu_->avs_address;
|
||||
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.data.data());
|
||||
dram_req.cycles_left = DRAM_LATENCY;
|
||||
for (auto& rsp : dram_reads_) {
|
||||
if (dram_req.addr == rsp.addr) {
|
||||
dram_req.cycles_left = rsp.cycles_left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
dram_reads_.emplace_back(dram_req);
|
||||
/*printf("%0ld: [sim] DRAM Rd Req: addr=%x, pending={", timestamp, dram_req.addr * CACHE_BLOCK_SIZE);
|
||||
for (auto& req : dram_reads_) {
|
||||
|
||||
// send memory response
|
||||
vortex_afu_->avs_readdatavalid[b] = 0;
|
||||
if (mem_rd_it != mem_reads_[b].end()) {
|
||||
vortex_afu_->avs_readdatavalid[b] = 1;
|
||||
memcpy(vortex_afu_->avs_readdata[b], mem_rd_it->data.data(), MEM_BLOCK_SIZE);
|
||||
uint32_t addr = mem_rd_it->addr;
|
||||
mem_reads_[b].erase(mem_rd_it);
|
||||
/*printf("%0ld: [sim] MEM Rd Rsp: addr=%x, pending={", timestamp, addr * MEM_BLOCK_SIZE);
|
||||
for (auto& req : mem_reads_[b]) {
|
||||
if (req.cycles_left != 0)
|
||||
printf(" !%0x", req.addr * CACHE_BLOCK_SIZE);
|
||||
printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
|
||||
else
|
||||
printf(" %0x", req.addr * CACHE_BLOCK_SIZE);
|
||||
printf(" %0x", req.addr * MEM_BLOCK_SIZE);
|
||||
}
|
||||
printf("}\n");*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vortex_afu_->avs_waitrequest = dram_stalled;
|
||||
// handle memory stalls
|
||||
bool mem_stalled = false;
|
||||
#ifdef ENABLE_MEM_STALLS
|
||||
if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) {
|
||||
mem_stalled = true;
|
||||
} else
|
||||
if (mem_reads_[b].size() >= MEM_RQ_SIZE) {
|
||||
mem_stalled = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
// process memory requests
|
||||
if (!mem_stalled) {
|
||||
assert(!vortex_afu_->avs_read[b] || !vortex_afu_->avs_write[b]);
|
||||
if (vortex_afu_->avs_write[b]) {
|
||||
uint64_t byteen = vortex_afu_->avs_byteenable[b];
|
||||
unsigned base_addr = vortex_afu_->avs_address[b] * MEM_BLOCK_SIZE;
|
||||
uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata[b]);
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
ram_[base_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
/*printf("%0ld: [sim] MEM Wr Req: addr=%x, data=", timestamp, base_addr);
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
printf("%0x", data[(MEM_BLOCK_SIZE-1)-i]);
|
||||
}
|
||||
printf("\n");*/
|
||||
}
|
||||
if (vortex_afu_->avs_read[b]) {
|
||||
mem_rd_req_t mem_req;
|
||||
mem_req.addr = vortex_afu_->avs_address[b];
|
||||
ram_.read(vortex_afu_->avs_address[b] * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.data.data());
|
||||
mem_req.cycles_left = MEM_LATENCY;
|
||||
for (auto& rsp : mem_reads_[b]) {
|
||||
if (mem_req.addr == rsp.addr) {
|
||||
mem_req.cycles_left = rsp.cycles_left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
mem_reads_[b].emplace_back(mem_req);
|
||||
/*printf("%0ld: [sim] MEM Rd Req: addr=%x, pending={", timestamp, mem_req.addr * MEM_BLOCK_SIZE);
|
||||
for (auto& req : mem_reads_[b]) {
|
||||
if (req.cycles_left != 0)
|
||||
printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
|
||||
else
|
||||
printf(" %0x", req.addr * MEM_BLOCK_SIZE);
|
||||
}
|
||||
printf("}\n");*/
|
||||
}
|
||||
}
|
||||
|
||||
vortex_afu_->avs_waitrequest[b] = mem_stalled;
|
||||
}
|
||||
}
|
|
@ -1,14 +1,16 @@
|
|||
#pragma once
|
||||
|
||||
#include "verilated.h"
|
||||
//#include "verilated_stub.h"
|
||||
#include "Vvortex_afu_shim.h"
|
||||
#include "Vvortex_afu_shim__Syms.h"
|
||||
#include "verilated.h"
|
||||
|
||||
#ifdef VCD_OUTPUT
|
||||
#include <verilated_vcd_c.h>
|
||||
#endif
|
||||
|
||||
#include <VX_config.h>
|
||||
#include "vortex_afu.h"
|
||||
#include "ram.h"
|
||||
|
||||
#include <ostream>
|
||||
|
@ -16,7 +18,10 @@
|
|||
#include <list>
|
||||
#include <unordered_map>
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#undef MEM_BLOCK_SIZE
|
||||
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
|
||||
class opae_sim {
|
||||
public:
|
||||
|
@ -40,9 +45,9 @@ private:
|
|||
|
||||
typedef struct {
|
||||
int cycles_left;
|
||||
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
|
||||
std::array<uint8_t, MEM_BLOCK_SIZE> data;
|
||||
uint32_t addr;
|
||||
} dram_rd_req_t;
|
||||
} mem_rd_req_t;
|
||||
|
||||
typedef struct {
|
||||
int cycles_left;
|
||||
|
@ -77,7 +82,7 @@ private:
|
|||
|
||||
std::unordered_map<int64_t, host_buffer_t> host_buffers_;
|
||||
|
||||
std::list<dram_rd_req_t> dram_reads_;
|
||||
std::list<mem_rd_req_t> mem_reads_ [PLATFORM_PARAM_LOCAL_MEMORY_BANKS];
|
||||
|
||||
std::list<cci_rd_req_t> cci_reads_;
|
||||
|
||||
|
|
|
@ -1,13 +1,16 @@
|
|||
`include "VX_define.vh"
|
||||
`include "VX_platform.vh"
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
`include "vortex_afu.vh"
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
/* verilator lint_off IMPORTSTAR */
|
||||
import ccip_if_pkg::*;
|
||||
import local_mem_cfg_pkg::*;
|
||||
/* verilator lint_on IMPORTSTAR */
|
||||
/* verilator lint_on IMPORTSTAR */
|
||||
|
||||
module vortex_afu_shim #(
|
||||
parameter NUM_LOCAL_MEM_BANKS = 2
|
||||
) (
|
||||
`include "VX_define.vh"
|
||||
|
||||
module vortex_afu_shim (
|
||||
// global signals
|
||||
input clk,
|
||||
input reset,
|
||||
|
@ -69,24 +72,22 @@ module vortex_afu_shim #(
|
|||
output t_ccip_mmioData af2cp_sTxPort_c2_data,
|
||||
|
||||
// Avalon signals for local memory access
|
||||
output t_local_mem_data avs_writedata,
|
||||
input t_local_mem_data avs_readdata,
|
||||
output t_local_mem_addr avs_address,
|
||||
input logic avs_waitrequest,
|
||||
output logic avs_write,
|
||||
output logic avs_read,
|
||||
output t_local_mem_byte_mask avs_byteenable,
|
||||
output t_local_mem_burst_cnt avs_burstcount,
|
||||
input avs_readdatavalid,
|
||||
|
||||
output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
|
||||
output t_local_mem_data avs_writedata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
input t_local_mem_data avs_readdata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
output t_local_mem_addr avs_address [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
input logic avs_waitrequest [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
output logic avs_write [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
output logic avs_read [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
|
||||
input avs_readdatavalid [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS]
|
||||
);
|
||||
|
||||
t_if_ccip_Rx cp2af_sRxPort;
|
||||
t_if_ccip_Tx af2cp_sTxPort;
|
||||
|
||||
vortex_afu #(
|
||||
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
|
||||
.NUM_LOCAL_MEM_BANKS(`PLATFORM_PARAM_LOCAL_MEMORY_BANKS)
|
||||
) afu (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
@ -100,8 +101,7 @@ vortex_afu #(
|
|||
.avs_read(avs_read),
|
||||
.avs_byteenable(avs_byteenable),
|
||||
.avs_burstcount(avs_burstcount),
|
||||
.avs_readdatavalid(avs_readdatavalid),
|
||||
.mem_bank_select(mem_bank_select)
|
||||
.avs_readdatavalid(avs_readdatavalid)
|
||||
);
|
||||
|
||||
t_if_ccip_c0_RxHdr c0_RxHdr;
|
||||
|
|
|
@ -1,33 +1,29 @@
|
|||
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
|
||||
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CFLAGS += -fPIC -Wno-aligned-new -Wno-maybe-uninitialized
|
||||
CFLAGS += -DUSE_RTLSIM -fPIC -Wno-maybe-uninitialized
|
||||
CFLAGS += -I../../include -I../../../hw/simulate -I../../../hw
|
||||
|
||||
# control RTL debug print states
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
#DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
|
||||
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
||||
|
||||
CFLAGS += $(CONFIGS)
|
||||
|
||||
CFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
|
@ -45,10 +41,11 @@ FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src
|
|||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
|
||||
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
|
||||
VL_FLAGS += -Wno-DECLFILENAME
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += verilator.vlt
|
||||
VL_FLAGS += $(CONFIGS)
|
||||
|
||||
# Enable Verilator multithreaded simulation
|
||||
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
|
||||
|
@ -69,8 +66,9 @@ ifdef PERF
|
|||
CFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
# use DPI FPU
|
||||
VL_FLAGS += -DFPU_DPI
|
||||
# FPU backend
|
||||
FPU_CORE ?= FPU_DPI
|
||||
VL_FLAGS += -D$(FPU_CORE)
|
||||
|
||||
PROJECT = libvortex.so
|
||||
# PROJECT = libvortex.dylib
|
||||
|
|
|
@ -6,16 +6,13 @@ SIMX_DIR = ../../simX
|
|||
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
|
||||
#CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -fPIC -Wno-aligned-new -Wno-maybe-uninitialized
|
||||
CXXFLAGS += -DUSE_SIMX -fPIC -Wno-maybe-uninitialized
|
||||
CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR)
|
||||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
|
||||
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
|
||||
|
||||
CXXFLAGS += $(CONFIGS)
|
||||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
#LDFLAGS += -dynamiclib -pthread
|
||||
|
|
|
@ -2,19 +2,23 @@ all:
|
|||
$(MAKE) -C basic
|
||||
$(MAKE) -C demo
|
||||
$(MAKE) -C dogfood
|
||||
$(MAKE) -C stress
|
||||
|
||||
run:
|
||||
$(MAKE) -C basic run-vlsim
|
||||
$(MAKE) -C demo run-vlsim
|
||||
$(MAKE) -C dogfood run-vlsim
|
||||
$(MAKE) -C stress run-vlsim
|
||||
|
||||
clean:
|
||||
$(MAKE) -C basic clean
|
||||
$(MAKE) -C demo clean
|
||||
$(MAKE) -C dogfood clean
|
||||
$(MAKE) -C stress clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C basic clean-all
|
||||
$(MAKE) -C demo clean-all
|
||||
$(MAKE) -C dogfood clean-all
|
||||
$(MAKE) -C stress clean-all
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ int test = -1;
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h buffer = nullptr;
|
||||
vx_buffer_h staging_buf = nullptr;
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Driver Test." << std::endl;
|
||||
|
@ -56,8 +56,8 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
|
||||
void cleanup() {
|
||||
if (buffer) {
|
||||
vx_buf_release(buffer);
|
||||
if (staging_buf) {
|
||||
vx_buf_release(staging_buf);
|
||||
}
|
||||
if (device) {
|
||||
vx_dev_close(device);
|
||||
|
@ -77,38 +77,38 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
|
|||
|
||||
// update source buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)vx_host_ptr(buffer))[i] = shuffle(i, value);
|
||||
((uint64_t*)vx_host_ptr(staging_buf))[i] = shuffle(i, value);
|
||||
}
|
||||
|
||||
/*for (int i = 0; i < num_blocks; ++i) {
|
||||
std::cout << "data[" << i << "]=0x";
|
||||
for (int j = 7; j >= 0; --j) {
|
||||
std::cout << std::hex << ((uint64_t*)vx_host_ptr(buffer))[i * 8 +j];
|
||||
std::cout << std::hex << ((uint64_t*)vx_host_ptr(staging_buf))[i * 8 +j];
|
||||
}
|
||||
std::cout << std::endl;
|
||||
}*/
|
||||
|
||||
// write buffer to local memory
|
||||
std::cout << "write buffer to local memory" << std::endl;
|
||||
// write source buffer to local memory
|
||||
std::cout << "write source buffer to local memory" << std::endl;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(buffer, dev_addr, 64 * num_blocks, 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
((uint64_t*)vx_host_ptr(buffer))[i] = 0;
|
||||
((uint64_t*)vx_host_ptr(staging_buf))[i] = 0;
|
||||
}
|
||||
|
||||
// read buffer from local memory
|
||||
std::cout << "read buffer from local memory" << std::endl;
|
||||
// read destination buffer from local memory
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t2 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(buffer, dev_addr, 64 * num_blocks, 0));
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (int i = 0; i < num_blocks_8; ++i) {
|
||||
auto curr = ((uint64_t*)vx_host_ptr(buffer))[i];
|
||||
auto curr = ((uint64_t*)vx_host_ptr(staging_buf))[i];
|
||||
auto ref = shuffle(i, value);
|
||||
if (curr != ref) {
|
||||
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
|
||||
|
@ -145,25 +145,25 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
|
|||
|
||||
// update source buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i;
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto t0 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, buf_size, 0));
|
||||
auto t1 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
}
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
|
||||
// start device
|
||||
std::cout << "start execution" << std::endl;
|
||||
|
@ -172,17 +172,17 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
|
|||
RT_CHECK(vx_ready_wait(device, -1));
|
||||
auto t3 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// read buffer from local memory
|
||||
std::cout << "read buffer from local memory" << std::endl;
|
||||
// read destination buffer from local memory
|
||||
std::cout << "read destination buffer from local memory" << std::endl;
|
||||
auto t4 = std::chrono::high_resolution_clock::now();
|
||||
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
auto t5 = std::chrono::high_resolution_clock::now();
|
||||
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int32_t curr = ((int32_t*)vx_host_ptr(buffer))[i];
|
||||
int32_t curr = ((int32_t*)vx_host_ptr(staging_buf))[i];
|
||||
int32_t ref = i;
|
||||
if (curr != ref) {
|
||||
std::cout << "error at result #" << i
|
||||
|
@ -233,8 +233,8 @@ int main(int argc, char *argv[]) {
|
|||
unsigned max_cores;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
||||
uint32_t num_points = 1 * count;
|
||||
uint32_t num_blocks = (num_points * sizeof(uint32_t) + 63) / 64;
|
||||
uint32_t buf_size = num_blocks * 64;
|
||||
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
|
||||
uint32_t buf_size = num_blocks * 64;
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
@ -253,7 +253,7 @@ int main(int argc, char *argv[]) {
|
|||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
|
||||
|
||||
// run tests
|
||||
if (0 == test || -1 == test) {
|
||||
|
@ -269,9 +269,9 @@ int main(int argc, char *argv[]) {
|
|||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (void*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
}
|
||||
|
||||
std::cout << "run kernel test" << std::endl;
|
||||
|
|
|
@ -20,7 +20,7 @@ const char* kernel_file = "kernel.bin";
|
|||
uint32_t count = 0;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h buffer = nullptr;
|
||||
vx_buffer_h staging_buf = nullptr;
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Driver Test." << std::endl;
|
||||
|
@ -50,8 +50,8 @@ static void parse_args(int argc, char **argv) {
|
|||
}
|
||||
|
||||
void cleanup() {
|
||||
if (buffer) {
|
||||
vx_buf_release(buffer);
|
||||
if (staging_buf) {
|
||||
vx_buf_release(staging_buf);
|
||||
}
|
||||
if (device) {
|
||||
vx_dev_close(device);
|
||||
|
@ -71,13 +71,13 @@ int run_test(const kernel_arg_t& kernel_arg,
|
|||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i + i;
|
||||
int cur = buf_ptr[i];
|
||||
|
@ -119,7 +119,7 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
uint32_t num_tasks = max_cores * max_warps * max_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t buf_size = num_points * sizeof(uint32_t);
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
@ -148,45 +148,45 @@ int main(int argc, char *argv[]) {
|
|||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i-1;
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src0_ptr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_ptr, buf_size, 0));
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i+1;
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src1_ptr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_ptr, buf_size, 0));
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
}
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
|
|
67
driver/tests/stress/Makefile
Normal file
67
driver/tests/stress/Makefile
Normal file
|
@ -0,0 +1,67 @@
|
|||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
|
||||
OPTS ?= -n64
|
||||
|
||||
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
||||
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
|
||||
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
|
||||
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
|
||||
|
||||
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
|
||||
|
||||
VX_SRCS = kernel.c
|
||||
|
||||
#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I../../include
|
||||
|
||||
PROJECT = stress
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
all: $(PROJECT) kernel.bin kernel.dump
|
||||
|
||||
kernel.dump: kernel.elf
|
||||
$(VX_DP) -D kernel.elf > kernel.dump
|
||||
|
||||
kernel.bin: kernel.elf
|
||||
$(VX_CP) -O binary kernel.elf kernel.bin
|
||||
|
||||
kernel.elf: $(VX_SRCS)
|
||||
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@
|
||||
|
||||
run-fpga: $(PROJECT)
|
||||
LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT)
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT)
|
||||
LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT)
|
||||
LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT)
|
||||
LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.elf *.bin *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
17
driver/tests/stress/common.h
Normal file
17
driver/tests/stress/common.h
Normal file
|
@ -0,0 +1,17 @@
|
|||
#ifndef _COMMON_H_
#define _COMMON_H_

/*
 * Definitions shared by the host test program (main.cpp) and the device
 * kernel (kernel.c) of the stress test.
 */

/* uint32_t is used below; include it here so this header compiles no matter
 * what the including file pulled in before it. */
#include <stdint.h>

/* Device address where the host writes the kernel_arg_t block and from which
 * the kernel's main() reads it. */
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

/* Number of indirect loads folded into each output element. */
#define NUM_LOADS 8

/*
 * Kernel argument block. Pointer fields are 32-bit device addresses that the
 * kernel casts back to pointers.
 */
struct kernel_arg_t {
  uint32_t num_tasks; /* task count handed to vx_spawn_tasks() */
  uint32_t size;      /* not read by the kernel in kernel.c */
  uint32_t stride;    /* output elements produced by each task */
  uint32_t addr_ptr;  /* device address of the uint32_t index table */
  uint32_t src_ptr;   /* device address of the float source data */
  uint32_t dst_ptr;   /* device address of the float destination buffer */
};

#endif
|
BIN
driver/tests/stress/kernel.bin
Executable file
BIN
driver/tests/stress/kernel.bin
Executable file
Binary file not shown.
29
driver/tests/stress/kernel.c
Normal file
29
driver/tests/stress/kernel.c
Normal file
|
@ -0,0 +1,29 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, void* arg) {
|
||||
struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg);
|
||||
uint32_t stride = _arg->stride;
|
||||
uint32_t* addr_ptr = (uint32_t*)_arg->addr_ptr;
|
||||
float* src_ptr = (float*)_arg->src_ptr;
|
||||
float* dst_ptr = (float*)_arg->dst_ptr;
|
||||
|
||||
uint32_t offset = task_id * stride;
|
||||
|
||||
for (uint32_t i = 0; i < stride; ++i) {
|
||||
float value = 0.0f;
|
||||
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
|
||||
uint32_t addr = offset + i + j;
|
||||
uint32_t index = addr_ptr[addr];
|
||||
value *= src_ptr[index];
|
||||
}
|
||||
dst_ptr[offset+i] = value;
|
||||
}
|
||||
}
|
||||
|
||||
void main() {
|
||||
struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
vx_spawn_tasks(arg->num_tasks, kernel_body, arg);
|
||||
}
|
596
driver/tests/stress/kernel.dump
Normal file
596
driver/tests/stress/kernel.dump
Normal file
|
@ -0,0 +1,596 @@
|
|||
|
||||
kernel.elf: file format elf32-littleriscv
|
||||
|
||||
|
||||
Disassembly of section .init:
|
||||
|
||||
80000000 <_start>:
|
||||
80000000: 00000597 auipc a1,0x0
|
||||
80000004: 17058593 addi a1,a1,368 # 80000170 <vx_set_sp>
|
||||
80000008: fc102573 csrr a0,0xfc1
|
||||
8000000c: 00b5106b 0xb5106b
|
||||
80000010: 160000ef jal ra,80000170 <vx_set_sp>
|
||||
80000014: 00100513 li a0,1
|
||||
80000018: 0005006b 0x5006b
|
||||
8000001c: 00002517 auipc a0,0x2
|
||||
80000020: ba850513 addi a0,a0,-1112 # 80001bc4 <g_wspawn_args>
|
||||
80000024: 00002617 auipc a2,0x2
|
||||
80000028: c2060613 addi a2,a2,-992 # 80001c44 <__BSS_END__>
|
||||
8000002c: 40a60633 sub a2,a2,a0
|
||||
80000030: 00000593 li a1,0
|
||||
80000034: 4c0000ef jal ra,800004f4 <memset>
|
||||
80000038: 00000517 auipc a0,0x0
|
||||
8000003c: 3c450513 addi a0,a0,964 # 800003fc <__libc_fini_array>
|
||||
80000040: 374000ef jal ra,800003b4 <atexit>
|
||||
80000044: 414000ef jal ra,80000458 <__libc_init_array>
|
||||
80000048: 008000ef jal ra,80000050 <main>
|
||||
8000004c: 37c0006f j 800003c8 <exit>
|
||||
|
||||
Disassembly of section .text:
|
||||
|
||||
80000050 <main>:
|
||||
80000050: 7ffff7b7 lui a5,0x7ffff
|
||||
80000054: 0007a503 lw a0,0(a5) # 7ffff000 <__stack_size+0x7fffec00>
|
||||
80000058: 800005b7 lui a1,0x80000
|
||||
8000005c: 7ffff637 lui a2,0x7ffff
|
||||
80000060: 08058593 addi a1,a1,128 # 80000080 <__stack_top+0x81000080>
|
||||
80000064: 2080006f j 8000026c <vx_spawn_tasks>
|
||||
|
||||
80000068 <register_fini>:
|
||||
80000068: 00000793 li a5,0
|
||||
8000006c: 00078863 beqz a5,8000007c <register_fini+0x14>
|
||||
80000070: 80000537 lui a0,0x80000
|
||||
80000074: 3fc50513 addi a0,a0,1020 # 800003fc <__stack_top+0x810003fc>
|
||||
80000078: 33c0006f j 800003b4 <atexit>
|
||||
8000007c: 00008067 ret
|
||||
|
||||
80000080 <kernel_body>:
|
||||
80000080: 0085a783 lw a5,8(a1)
|
||||
80000084: 00c5a603 lw a2,12(a1)
|
||||
80000088: 0105a703 lw a4,16(a1)
|
||||
8000008c: 02f506b3 mul a3,a0,a5
|
||||
80000090: 0145a883 lw a7,20(a1)
|
||||
80000094: 0c078863 beqz a5,80000164 <kernel_body+0xe4>
|
||||
80000098: 00d78833 add a6,a5,a3
|
||||
8000009c: f0000653 fmv.w.x fa2,zero
|
||||
800000a0: 00269693 slli a3,a3,0x2
|
||||
800000a4: 00281813 slli a6,a6,0x2
|
||||
800000a8: 00c686b3 add a3,a3,a2
|
||||
800000ac: 00c80833 add a6,a6,a2
|
||||
800000b0: 40c888b3 sub a7,a7,a2
|
||||
800000b4: 0006a583 lw a1,0(a3)
|
||||
800000b8: 0086a603 lw a2,8(a3)
|
||||
800000bc: 00c6a503 lw a0,12(a3)
|
||||
800000c0: 00259593 slli a1,a1,0x2
|
||||
800000c4: 00b705b3 add a1,a4,a1
|
||||
800000c8: 0005a787 flw fa5,0(a1)
|
||||
800000cc: 0046a583 lw a1,4(a3)
|
||||
800000d0: 00261613 slli a2,a2,0x2
|
||||
800000d4: 10f677d3 fmul.s fa5,fa2,fa5
|
||||
800000d8: 00259593 slli a1,a1,0x2
|
||||
800000dc: 00b705b3 add a1,a4,a1
|
||||
800000e0: 0005a687 flw fa3,0(a1)
|
||||
800000e4: 00c70633 add a2,a4,a2
|
||||
800000e8: 00062707 flw fa4,0(a2) # 7ffff000 <__stack_size+0x7fffec00>
|
||||
800000ec: 10d7f7d3 fmul.s fa5,fa5,fa3
|
||||
800000f0: 00251513 slli a0,a0,0x2
|
||||
800000f4: 00a70533 add a0,a4,a0
|
||||
800000f8: 0106a583 lw a1,16(a3)
|
||||
800000fc: 0146a603 lw a2,20(a3)
|
||||
80000100: 10e7f7d3 fmul.s fa5,fa5,fa4
|
||||
80000104: 00052707 flw fa4,0(a0)
|
||||
80000108: 00259593 slli a1,a1,0x2
|
||||
8000010c: 00b705b3 add a1,a4,a1
|
||||
80000110: 0005a687 flw fa3,0(a1)
|
||||
80000114: 10e7f7d3 fmul.s fa5,fa5,fa4
|
||||
80000118: 00261613 slli a2,a2,0x2
|
||||
8000011c: 00c70633 add a2,a4,a2
|
||||
80000120: 00062707 flw fa4,0(a2)
|
||||
80000124: 0186a583 lw a1,24(a3)
|
||||
80000128: 10d7f7d3 fmul.s fa5,fa5,fa3
|
||||
8000012c: 01c6a603 lw a2,28(a3)
|
||||
80000130: 00259593 slli a1,a1,0x2
|
||||
80000134: 00b705b3 add a1,a4,a1
|
||||
80000138: 00261613 slli a2,a2,0x2
|
||||
8000013c: 10e7f7d3 fmul.s fa5,fa5,fa4
|
||||
80000140: 0005a707 flw fa4,0(a1)
|
||||
80000144: 00c70633 add a2,a4,a2
|
||||
80000148: 00d887b3 add a5,a7,a3
|
||||
8000014c: 00468693 addi a3,a3,4
|
||||
80000150: 10e7f7d3 fmul.s fa5,fa5,fa4
|
||||
80000154: 00062707 flw fa4,0(a2)
|
||||
80000158: 10f777d3 fmul.s fa5,fa4,fa5
|
||||
8000015c: 00f7a027 fsw fa5,0(a5)
|
||||
80000160: f4d81ae3 bne a6,a3,800000b4 <kernel_body+0x34>
|
||||
80000164: 00008067 ret
|
||||
|
||||
80000168 <_exit>:
|
||||
80000168: 00000513 li a0,0
|
||||
8000016c: 0005006b 0x5006b
|
||||
|
||||
80000170 <vx_set_sp>:
|
||||
80000170: fc002573 csrr a0,0xfc0
|
||||
80000174: 0005006b 0x5006b
|
||||
80000178: 00002197 auipc gp,0x2
|
||||
8000017c: e2018193 addi gp,gp,-480 # 80001f98 <__global_pointer>
|
||||
80000180: 7f000117 auipc sp,0x7f000
|
||||
80000184: e8010113 addi sp,sp,-384 # ff000000 <__stack_top>
|
||||
80000188: 40000593 li a1,1024
|
||||
8000018c: cc102673 csrr a2,0xcc1
|
||||
80000190: 02c585b3 mul a1,a1,a2
|
||||
80000194: 40b10133 sub sp,sp,a1
|
||||
80000198: cc3026f3 csrr a3,0xcc3
|
||||
8000019c: 00068663 beqz a3,800001a8 <RETURN>
|
||||
800001a0: 00000513 li a0,0
|
||||
800001a4: 0005006b 0x5006b
|
||||
|
||||
800001a8 <RETURN>:
|
||||
800001a8: 00008067 ret
|
||||
|
||||
800001ac <spawn_tasks_callback>:
|
||||
800001ac: fe010113 addi sp,sp,-32
|
||||
800001b0: 00112e23 sw ra,28(sp)
|
||||
800001b4: 00812c23 sw s0,24(sp)
|
||||
800001b8: 00912a23 sw s1,20(sp)
|
||||
800001bc: 01212823 sw s2,16(sp)
|
||||
800001c0: 01312623 sw s3,12(sp)
|
||||
800001c4: fc0027f3 csrr a5,0xfc0
|
||||
800001c8: 0007806b 0x7806b
|
||||
800001cc: cc5027f3 csrr a5,0xcc5
|
||||
800001d0: cc3029f3 csrr s3,0xcc3
|
||||
800001d4: cc002773 csrr a4,0xcc0
|
||||
800001d8: fc002673 csrr a2,0xfc0
|
||||
800001dc: 00279693 slli a3,a5,0x2
|
||||
800001e0: 800027b7 lui a5,0x80002
|
||||
800001e4: bc478793 addi a5,a5,-1084 # 80001bc4 <__stack_top+0x81001bc4>
|
||||
800001e8: 00d787b3 add a5,a5,a3
|
||||
800001ec: 0007a483 lw s1,0(a5)
|
||||
800001f0: 0104a403 lw s0,16(s1)
|
||||
800001f4: 00c4a683 lw a3,12(s1)
|
||||
800001f8: 0089a933 slt s2,s3,s0
|
||||
800001fc: 00040793 mv a5,s0
|
||||
80000200: 00d90933 add s2,s2,a3
|
||||
80000204: 03368433 mul s0,a3,s3
|
||||
80000208: 00f9d463 bge s3,a5,80000210 <spawn_tasks_callback+0x64>
|
||||
8000020c: 00098793 mv a5,s3
|
||||
80000210: 00f40433 add s0,s0,a5
|
||||
80000214: 0084a683 lw a3,8(s1)
|
||||
80000218: 02c40433 mul s0,s0,a2
|
||||
8000021c: 02e907b3 mul a5,s2,a4
|
||||
80000220: 00d40433 add s0,s0,a3
|
||||
80000224: 00f40433 add s0,s0,a5
|
||||
80000228: 00890933 add s2,s2,s0
|
||||
8000022c: 01245e63 bge s0,s2,80000248 <spawn_tasks_callback+0x9c>
|
||||
80000230: 0004a783 lw a5,0(s1)
|
||||
80000234: 0044a583 lw a1,4(s1)
|
||||
80000238: 00040513 mv a0,s0
|
||||
8000023c: 00140413 addi s0,s0,1
|
||||
80000240: 000780e7 jalr a5
|
||||
80000244: fe8916e3 bne s2,s0,80000230 <spawn_tasks_callback+0x84>
|
||||
80000248: 0019b993 seqz s3,s3
|
||||
8000024c: 0009806b 0x9806b
|
||||
80000250: 01c12083 lw ra,28(sp)
|
||||
80000254: 01812403 lw s0,24(sp)
|
||||
80000258: 01412483 lw s1,20(sp)
|
||||
8000025c: 01012903 lw s2,16(sp)
|
||||
80000260: 00c12983 lw s3,12(sp)
|
||||
80000264: 02010113 addi sp,sp,32
|
||||
80000268: 00008067 ret
|
||||
|
||||
8000026c <vx_spawn_tasks>:
|
||||
8000026c: fc010113 addi sp,sp,-64
|
||||
80000270: 02112e23 sw ra,60(sp)
|
||||
80000274: 02812c23 sw s0,56(sp)
|
||||
80000278: 02912a23 sw s1,52(sp)
|
||||
8000027c: 03212823 sw s2,48(sp)
|
||||
80000280: 03312623 sw s3,44(sp)
|
||||
80000284: fc2026f3 csrr a3,0xfc2
|
||||
80000288: fc102873 csrr a6,0xfc1
|
||||
8000028c: fc002473 csrr s0,0xfc0
|
||||
80000290: cc5027f3 csrr a5,0xcc5
|
||||
80000294: 01f00713 li a4,31
|
||||
80000298: 0cf74463 blt a4,a5,80000360 <vx_spawn_tasks+0xf4>
|
||||
8000029c: 030408b3 mul a7,s0,a6
|
||||
800002a0: 00100713 li a4,1
|
||||
800002a4: 00a8d463 bge a7,a0,800002ac <vx_spawn_tasks+0x40>
|
||||
800002a8: 03154733 div a4,a0,a7
|
||||
800002ac: 0ce6c863 blt a3,a4,8000037c <vx_spawn_tasks+0x110>
|
||||
800002b0: 0ae7d863 bge a5,a4,80000360 <vx_spawn_tasks+0xf4>
|
||||
800002b4: fff68693 addi a3,a3,-1
|
||||
800002b8: 02e54333 div t1,a0,a4
|
||||
800002bc: 00030893 mv a7,t1
|
||||
800002c0: 00f69663 bne a3,a5,800002cc <vx_spawn_tasks+0x60>
|
||||
800002c4: 02e56533 rem a0,a0,a4
|
||||
800002c8: 006508b3 add a7,a0,t1
|
||||
800002cc: 0288c4b3 div s1,a7,s0
|
||||
800002d0: 0288e933 rem s2,a7,s0
|
||||
800002d4: 0b04ca63 blt s1,a6,80000388 <vx_spawn_tasks+0x11c>
|
||||
800002d8: 00100693 li a3,1
|
||||
800002dc: 0304c733 div a4,s1,a6
|
||||
800002e0: 00070663 beqz a4,800002ec <vx_spawn_tasks+0x80>
|
||||
800002e4: 00070693 mv a3,a4
|
||||
800002e8: 0304e733 rem a4,s1,a6
|
||||
800002ec: 800029b7 lui s3,0x80002
|
||||
800002f0: bc498993 addi s3,s3,-1084 # 80001bc4 <__stack_top+0x81001bc4>
|
||||
800002f4: 00e12e23 sw a4,28(sp)
|
||||
800002f8: 00c10713 addi a4,sp,12
|
||||
800002fc: 00b12623 sw a1,12(sp)
|
||||
80000300: 00c12823 sw a2,16(sp)
|
||||
80000304: 00d12c23 sw a3,24(sp)
|
||||
80000308: 02f30333 mul t1,t1,a5
|
||||
8000030c: 00279793 slli a5,a5,0x2
|
||||
80000310: 00f987b3 add a5,s3,a5
|
||||
80000314: 00e7a023 sw a4,0(a5)
|
||||
80000318: 00612a23 sw t1,20(sp)
|
||||
8000031c: 06904c63 bgtz s1,80000394 <vx_spawn_tasks+0x128>
|
||||
80000320: 04090063 beqz s2,80000360 <vx_spawn_tasks+0xf4>
|
||||
80000324: 02848433 mul s0,s1,s0
|
||||
80000328: 00812a23 sw s0,20(sp)
|
||||
8000032c: 0009006b 0x9006b
|
||||
80000330: cc5027f3 csrr a5,0xcc5
|
||||
80000334: cc202573 csrr a0,0xcc2
|
||||
80000338: 00279793 slli a5,a5,0x2
|
||||
8000033c: 00f989b3 add s3,s3,a5
|
||||
80000340: 0009a783 lw a5,0(s3)
|
||||
80000344: 0087a683 lw a3,8(a5)
|
||||
80000348: 0007a703 lw a4,0(a5)
|
||||
8000034c: 0047a583 lw a1,4(a5)
|
||||
80000350: 00d50533 add a0,a0,a3
|
||||
80000354: 000700e7 jalr a4
|
||||
80000358: 00100793 li a5,1
|
||||
8000035c: 0007806b 0x7806b
|
||||
80000360: 03c12083 lw ra,60(sp)
|
||||
80000364: 03812403 lw s0,56(sp)
|
||||
80000368: 03412483 lw s1,52(sp)
|
||||
8000036c: 03012903 lw s2,48(sp)
|
||||
80000370: 02c12983 lw s3,44(sp)
|
||||
80000374: 04010113 addi sp,sp,64
|
||||
80000378: 00008067 ret
|
||||
8000037c: 00068713 mv a4,a3
|
||||
80000380: f2e7cae3 blt a5,a4,800002b4 <vx_spawn_tasks+0x48>
|
||||
80000384: fddff06f j 80000360 <vx_spawn_tasks+0xf4>
|
||||
80000388: 00000713 li a4,0
|
||||
8000038c: 00100693 li a3,1
|
||||
80000390: f5dff06f j 800002ec <vx_spawn_tasks+0x80>
|
||||
80000394: 00048713 mv a4,s1
|
||||
80000398: 00985463 bge a6,s1,800003a0 <vx_spawn_tasks+0x134>
|
||||
8000039c: 00080713 mv a4,a6
|
||||
800003a0: 800007b7 lui a5,0x80000
|
||||
800003a4: 1ac78793 addi a5,a5,428 # 800001ac <__stack_top+0x810001ac>
|
||||
800003a8: 00f7106b 0xf7106b
|
||||
800003ac: e01ff0ef jal ra,800001ac <spawn_tasks_callback>
|
||||
800003b0: f71ff06f j 80000320 <vx_spawn_tasks+0xb4>
|
||||
|
||||
800003b4 <atexit>:
|
||||
800003b4: 00050593 mv a1,a0
|
||||
800003b8: 00000693 li a3,0
|
||||
800003bc: 00000613 li a2,0
|
||||
800003c0: 00000513 li a0,0
|
||||
800003c4: 20c0006f j 800005d0 <__register_exitproc>
|
||||
|
||||
800003c8 <exit>:
|
||||
800003c8: ff010113 addi sp,sp,-16
|
||||
800003cc: 00000593 li a1,0
|
||||
800003d0: 00812423 sw s0,8(sp)
|
||||
800003d4: 00112623 sw ra,12(sp)
|
||||
800003d8: 00050413 mv s0,a0
|
||||
800003dc: 290000ef jal ra,8000066c <__call_exitprocs>
|
||||
800003e0: 800027b7 lui a5,0x80002
|
||||
800003e4: bc07a503 lw a0,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0>
|
||||
800003e8: 03c52783 lw a5,60(a0)
|
||||
800003ec: 00078463 beqz a5,800003f4 <exit+0x2c>
|
||||
800003f0: 000780e7 jalr a5
|
||||
800003f4: 00040513 mv a0,s0
|
||||
800003f8: d71ff0ef jal ra,80000168 <_exit>
|
||||
|
||||
800003fc <__libc_fini_array>:
|
||||
800003fc: ff010113 addi sp,sp,-16
|
||||
80000400: 00812423 sw s0,8(sp)
|
||||
80000404: 800017b7 lui a5,0x80001
|
||||
80000408: 80001437 lui s0,0x80001
|
||||
8000040c: 79440413 addi s0,s0,1940 # 80001794 <__stack_top+0x81001794>
|
||||
80000410: 79478793 addi a5,a5,1940 # 80001794 <__stack_top+0x81001794>
|
||||
80000414: 408787b3 sub a5,a5,s0
|
||||
80000418: 00912223 sw s1,4(sp)
|
||||
8000041c: 00112623 sw ra,12(sp)
|
||||
80000420: 4027d493 srai s1,a5,0x2
|
||||
80000424: 02048063 beqz s1,80000444 <__libc_fini_array+0x48>
|
||||
80000428: ffc78793 addi a5,a5,-4
|
||||
8000042c: 00878433 add s0,a5,s0
|
||||
80000430: 00042783 lw a5,0(s0)
|
||||
80000434: fff48493 addi s1,s1,-1
|
||||
80000438: ffc40413 addi s0,s0,-4
|
||||
8000043c: 000780e7 jalr a5
|
||||
80000440: fe0498e3 bnez s1,80000430 <__libc_fini_array+0x34>
|
||||
80000444: 00c12083 lw ra,12(sp)
|
||||
80000448: 00812403 lw s0,8(sp)
|
||||
8000044c: 00412483 lw s1,4(sp)
|
||||
80000450: 01010113 addi sp,sp,16
|
||||
80000454: 00008067 ret
|
||||
|
||||
80000458 <__libc_init_array>:
|
||||
80000458: ff010113 addi sp,sp,-16
|
||||
8000045c: 00812423 sw s0,8(sp)
|
||||
80000460: 01212023 sw s2,0(sp)
|
||||
80000464: 80001437 lui s0,0x80001
|
||||
80000468: 80001937 lui s2,0x80001
|
||||
8000046c: 79040793 addi a5,s0,1936 # 80001790 <__stack_top+0x81001790>
|
||||
80000470: 79090913 addi s2,s2,1936 # 80001790 <__stack_top+0x81001790>
|
||||
80000474: 40f90933 sub s2,s2,a5
|
||||
80000478: 00112623 sw ra,12(sp)
|
||||
8000047c: 00912223 sw s1,4(sp)
|
||||
80000480: 40295913 srai s2,s2,0x2
|
||||
80000484: 02090063 beqz s2,800004a4 <__libc_init_array+0x4c>
|
||||
80000488: 79040413 addi s0,s0,1936
|
||||
8000048c: 00000493 li s1,0
|
||||
80000490: 00042783 lw a5,0(s0)
|
||||
80000494: 00148493 addi s1,s1,1
|
||||
80000498: 00440413 addi s0,s0,4
|
||||
8000049c: 000780e7 jalr a5
|
||||
800004a0: fe9918e3 bne s2,s1,80000490 <__libc_init_array+0x38>
|
||||
800004a4: 80001437 lui s0,0x80001
|
||||
800004a8: 80001937 lui s2,0x80001
|
||||
800004ac: 79040793 addi a5,s0,1936 # 80001790 <__stack_top+0x81001790>
|
||||
800004b0: 79490913 addi s2,s2,1940 # 80001794 <__stack_top+0x81001794>
|
||||
800004b4: 40f90933 sub s2,s2,a5
|
||||
800004b8: 40295913 srai s2,s2,0x2
|
||||
800004bc: 02090063 beqz s2,800004dc <__libc_init_array+0x84>
|
||||
800004c0: 79040413 addi s0,s0,1936
|
||||
800004c4: 00000493 li s1,0
|
||||
800004c8: 00042783 lw a5,0(s0)
|
||||
800004cc: 00148493 addi s1,s1,1
|
||||
800004d0: 00440413 addi s0,s0,4
|
||||
800004d4: 000780e7 jalr a5
|
||||
800004d8: fe9918e3 bne s2,s1,800004c8 <__libc_init_array+0x70>
|
||||
800004dc: 00c12083 lw ra,12(sp)
|
||||
800004e0: 00812403 lw s0,8(sp)
|
||||
800004e4: 00412483 lw s1,4(sp)
|
||||
800004e8: 00012903 lw s2,0(sp)
|
||||
800004ec: 01010113 addi sp,sp,16
|
||||
800004f0: 00008067 ret
|
||||
|
||||
800004f4 <memset>:
|
||||
800004f4: 00f00313 li t1,15
|
||||
800004f8: 00050713 mv a4,a0
|
||||
800004fc: 02c37e63 bgeu t1,a2,80000538 <memset+0x44>
|
||||
80000500: 00f77793 andi a5,a4,15
|
||||
80000504: 0a079063 bnez a5,800005a4 <memset+0xb0>
|
||||
80000508: 08059263 bnez a1,8000058c <memset+0x98>
|
||||
8000050c: ff067693 andi a3,a2,-16
|
||||
80000510: 00f67613 andi a2,a2,15
|
||||
80000514: 00e686b3 add a3,a3,a4
|
||||
80000518: 00b72023 sw a1,0(a4)
|
||||
8000051c: 00b72223 sw a1,4(a4)
|
||||
80000520: 00b72423 sw a1,8(a4)
|
||||
80000524: 00b72623 sw a1,12(a4)
|
||||
80000528: 01070713 addi a4,a4,16
|
||||
8000052c: fed766e3 bltu a4,a3,80000518 <memset+0x24>
|
||||
80000530: 00061463 bnez a2,80000538 <memset+0x44>
|
||||
80000534: 00008067 ret
|
||||
80000538: 40c306b3 sub a3,t1,a2
|
||||
8000053c: 00269693 slli a3,a3,0x2
|
||||
80000540: 00000297 auipc t0,0x0
|
||||
80000544: 005686b3 add a3,a3,t0
|
||||
80000548: 00c68067 jr 12(a3)
|
||||
8000054c: 00b70723 sb a1,14(a4)
|
||||
80000550: 00b706a3 sb a1,13(a4)
|
||||
80000554: 00b70623 sb a1,12(a4)
|
||||
80000558: 00b705a3 sb a1,11(a4)
|
||||
8000055c: 00b70523 sb a1,10(a4)
|
||||
80000560: 00b704a3 sb a1,9(a4)
|
||||
80000564: 00b70423 sb a1,8(a4)
|
||||
80000568: 00b703a3 sb a1,7(a4)
|
||||
8000056c: 00b70323 sb a1,6(a4)
|
||||
80000570: 00b702a3 sb a1,5(a4)
|
||||
80000574: 00b70223 sb a1,4(a4)
|
||||
80000578: 00b701a3 sb a1,3(a4)
|
||||
8000057c: 00b70123 sb a1,2(a4)
|
||||
80000580: 00b700a3 sb a1,1(a4)
|
||||
80000584: 00b70023 sb a1,0(a4)
|
||||
80000588: 00008067 ret
|
||||
8000058c: 0ff5f593 andi a1,a1,255
|
||||
80000590: 00859693 slli a3,a1,0x8
|
||||
80000594: 00d5e5b3 or a1,a1,a3
|
||||
80000598: 01059693 slli a3,a1,0x10
|
||||
8000059c: 00d5e5b3 or a1,a1,a3
|
||||
800005a0: f6dff06f j 8000050c <memset+0x18>
|
||||
800005a4: 00279693 slli a3,a5,0x2
|
||||
800005a8: 00000297 auipc t0,0x0
|
||||
800005ac: 005686b3 add a3,a3,t0
|
||||
800005b0: 00008293 mv t0,ra
|
||||
800005b4: fa0680e7 jalr -96(a3)
|
||||
800005b8: 00028093 mv ra,t0
|
||||
800005bc: ff078793 addi a5,a5,-16
|
||||
800005c0: 40f70733 sub a4,a4,a5
|
||||
800005c4: 00f60633 add a2,a2,a5
|
||||
800005c8: f6c378e3 bgeu t1,a2,80000538 <memset+0x44>
|
||||
800005cc: f3dff06f j 80000508 <memset+0x14>
|
||||
|
||||
800005d0 <__register_exitproc>:
|
||||
800005d0: 800027b7 lui a5,0x80002
|
||||
800005d4: bc07a703 lw a4,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0>
|
||||
800005d8: 14872783 lw a5,328(a4)
|
||||
800005dc: 04078c63 beqz a5,80000634 <__register_exitproc+0x64>
|
||||
800005e0: 0047a703 lw a4,4(a5)
|
||||
800005e4: 01f00813 li a6,31
|
||||
800005e8: 06e84e63 blt a6,a4,80000664 <__register_exitproc+0x94>
|
||||
800005ec: 00271813 slli a6,a4,0x2
|
||||
800005f0: 02050663 beqz a0,8000061c <__register_exitproc+0x4c>
|
||||
800005f4: 01078333 add t1,a5,a6
|
||||
800005f8: 08c32423 sw a2,136(t1)
|
||||
800005fc: 1887a883 lw a7,392(a5)
|
||||
80000600: 00100613 li a2,1
|
||||
80000604: 00e61633 sll a2,a2,a4
|
||||
80000608: 00c8e8b3 or a7,a7,a2
|
||||
8000060c: 1917a423 sw a7,392(a5)
|
||||
80000610: 10d32423 sw a3,264(t1)
|
||||
80000614: 00200693 li a3,2
|
||||
80000618: 02d50463 beq a0,a3,80000640 <__register_exitproc+0x70>
|
||||
8000061c: 00170713 addi a4,a4,1
|
||||
80000620: 00e7a223 sw a4,4(a5)
|
||||
80000624: 010787b3 add a5,a5,a6
|
||||
80000628: 00b7a423 sw a1,8(a5)
|
||||
8000062c: 00000513 li a0,0
|
||||
80000630: 00008067 ret
|
||||
80000634: 14c70793 addi a5,a4,332
|
||||
80000638: 14f72423 sw a5,328(a4)
|
||||
8000063c: fa5ff06f j 800005e0 <__register_exitproc+0x10>
|
||||
80000640: 18c7a683 lw a3,396(a5)
|
||||
80000644: 00170713 addi a4,a4,1
|
||||
80000648: 00e7a223 sw a4,4(a5)
|
||||
8000064c: 00c6e633 or a2,a3,a2
|
||||
80000650: 18c7a623 sw a2,396(a5)
|
||||
80000654: 010787b3 add a5,a5,a6
|
||||
80000658: 00b7a423 sw a1,8(a5)
|
||||
8000065c: 00000513 li a0,0
|
||||
80000660: 00008067 ret
|
||||
80000664: fff00513 li a0,-1
|
||||
80000668: 00008067 ret
|
||||
|
||||
8000066c <__call_exitprocs>:
|
||||
8000066c: fd010113 addi sp,sp,-48
|
||||
80000670: 800027b7 lui a5,0x80002
|
||||
80000674: 01412c23 sw s4,24(sp)
|
||||
80000678: bc07aa03 lw s4,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0>
|
||||
8000067c: 03212023 sw s2,32(sp)
|
||||
80000680: 02112623 sw ra,44(sp)
|
||||
80000684: 148a2903 lw s2,328(s4)
|
||||
80000688: 02812423 sw s0,40(sp)
|
||||
8000068c: 02912223 sw s1,36(sp)
|
||||
80000690: 01312e23 sw s3,28(sp)
|
||||
80000694: 01512a23 sw s5,20(sp)
|
||||
80000698: 01612823 sw s6,16(sp)
|
||||
8000069c: 01712623 sw s7,12(sp)
|
||||
800006a0: 01812423 sw s8,8(sp)
|
||||
800006a4: 04090063 beqz s2,800006e4 <__call_exitprocs+0x78>
|
||||
800006a8: 00050b13 mv s6,a0
|
||||
800006ac: 00058b93 mv s7,a1
|
||||
800006b0: 00100a93 li s5,1
|
||||
800006b4: fff00993 li s3,-1
|
||||
800006b8: 00492483 lw s1,4(s2)
|
||||
800006bc: fff48413 addi s0,s1,-1
|
||||
800006c0: 02044263 bltz s0,800006e4 <__call_exitprocs+0x78>
|
||||
800006c4: 00249493 slli s1,s1,0x2
|
||||
800006c8: 009904b3 add s1,s2,s1
|
||||
800006cc: 040b8463 beqz s7,80000714 <__call_exitprocs+0xa8>
|
||||
800006d0: 1044a783 lw a5,260(s1)
|
||||
800006d4: 05778063 beq a5,s7,80000714 <__call_exitprocs+0xa8>
|
||||
800006d8: fff40413 addi s0,s0,-1
|
||||
800006dc: ffc48493 addi s1,s1,-4
|
||||
800006e0: ff3416e3 bne s0,s3,800006cc <__call_exitprocs+0x60>
|
||||
800006e4: 02c12083 lw ra,44(sp)
|
||||
800006e8: 02812403 lw s0,40(sp)
|
||||
800006ec: 02412483 lw s1,36(sp)
|
||||
800006f0: 02012903 lw s2,32(sp)
|
||||
800006f4: 01c12983 lw s3,28(sp)
|
||||
800006f8: 01812a03 lw s4,24(sp)
|
||||
800006fc: 01412a83 lw s5,20(sp)
|
||||
80000700: 01012b03 lw s6,16(sp)
|
||||
80000704: 00c12b83 lw s7,12(sp)
|
||||
80000708: 00812c03 lw s8,8(sp)
|
||||
8000070c: 03010113 addi sp,sp,48
|
||||
80000710: 00008067 ret
|
||||
80000714: 00492783 lw a5,4(s2)
|
||||
80000718: 0044a683 lw a3,4(s1)
|
||||
8000071c: fff78793 addi a5,a5,-1
|
||||
80000720: 04878e63 beq a5,s0,8000077c <__call_exitprocs+0x110>
|
||||
80000724: 0004a223 sw zero,4(s1)
|
||||
80000728: fa0688e3 beqz a3,800006d8 <__call_exitprocs+0x6c>
|
||||
8000072c: 18892783 lw a5,392(s2)
|
||||
80000730: 008a9733 sll a4,s5,s0
|
||||
80000734: 00492c03 lw s8,4(s2)
|
||||
80000738: 00f777b3 and a5,a4,a5
|
||||
8000073c: 02079263 bnez a5,80000760 <__call_exitprocs+0xf4>
|
||||
80000740: 000680e7 jalr a3
|
||||
80000744: 00492703 lw a4,4(s2)
|
||||
80000748: 148a2783 lw a5,328(s4)
|
||||
8000074c: 01871463 bne a4,s8,80000754 <__call_exitprocs+0xe8>
|
||||
80000750: f92784e3 beq a5,s2,800006d8 <__call_exitprocs+0x6c>
|
||||
80000754: f80788e3 beqz a5,800006e4 <__call_exitprocs+0x78>
|
||||
80000758: 00078913 mv s2,a5
|
||||
8000075c: f5dff06f j 800006b8 <__call_exitprocs+0x4c>
|
||||
80000760: 18c92783 lw a5,396(s2)
|
||||
80000764: 0844a583 lw a1,132(s1)
|
||||
80000768: 00f77733 and a4,a4,a5
|
||||
8000076c: 00071c63 bnez a4,80000784 <__call_exitprocs+0x118>
|
||||
80000770: 000b0513 mv a0,s6
|
||||
80000774: 000680e7 jalr a3
|
||||
80000778: fcdff06f j 80000744 <__call_exitprocs+0xd8>
|
||||
8000077c: 00892223 sw s0,4(s2)
|
||||
80000780: fa9ff06f j 80000728 <__call_exitprocs+0xbc>
|
||||
80000784: 00058513 mv a0,a1
|
||||
80000788: 000680e7 jalr a3
|
||||
8000078c: fb9ff06f j 80000744 <__call_exitprocs+0xd8>
|
||||
|
||||
Disassembly of section .init_array:
|
||||
|
||||
80001790 <__init_array_start>:
|
||||
80001790: 0068 addi a0,sp,12
|
||||
80001792: 8000 0x8000
|
||||
|
||||
Disassembly of section .data:
|
||||
|
||||
80001798 <impure_data>:
|
||||
80001798: 0000 unimp
|
||||
8000179a: 0000 unimp
|
||||
8000179c: 1a84 addi s1,sp,368
|
||||
8000179e: 8000 0x8000
|
||||
800017a0: 1aec addi a1,sp,380
|
||||
800017a2: 8000 0x8000
|
||||
800017a4: 1b54 addi a3,sp,436
|
||||
800017a6: 8000 0x8000
|
||||
...
|
||||
80001840: 0001 nop
|
||||
80001842: 0000 unimp
|
||||
80001844: 0000 unimp
|
||||
80001846: 0000 unimp
|
||||
80001848: 330e fld ft6,224(sp)
|
||||
8000184a: abcd j 80001e3c <__BSS_END__+0x1f8>
|
||||
8000184c: 1234 addi a3,sp,296
|
||||
8000184e: e66d bnez a2,80001938 <impure_data+0x1a0>
|
||||
80001850: deec sw a1,124(a3)
|
||||
80001852: 0005 c.nop 1
|
||||
80001854: 0000000b 0xb
|
||||
...
|
||||
|
||||
Disassembly of section .sdata:
|
||||
|
||||
80001bc0 <_global_impure_ptr>:
|
||||
80001bc0: 1798 addi a4,sp,992
|
||||
80001bc2: 8000 0x8000
|
||||
|
||||
Disassembly of section .bss:
|
||||
|
||||
80001bc4 <g_wspawn_args>:
|
||||
...
|
||||
|
||||
Disassembly of section .comment:
|
||||
|
||||
00000000 <.comment>:
|
||||
0: 3a434347 fmsub.d ft6,ft6,ft4,ft7,rmm
|
||||
4: 2820 fld fs0,80(s0)
|
||||
6: 29554e47 fmsub.s ft8,fa0,fs5,ft5,rmm
|
||||
a: 3120 fld fs0,96(a0)
|
||||
c: 2e30 fld fa2,88(a2)
|
||||
e: 2e32 fld ft8,264(sp)
|
||||
10: 0030 addi a2,sp,8
|
||||
|
||||
Disassembly of section .riscv.attributes:
|
||||
|
||||
00000000 <.riscv.attributes>:
|
||||
0: 2941 jal 490 <__stack_size+0x90>
|
||||
2: 0000 unimp
|
||||
4: 7200 flw fs0,32(a2)
|
||||
6: 7369 lui t1,0xffffa
|
||||
8: 01007663 bgeu zero,a6,14 <__stack_usage+0x14>
|
||||
c: 001f 0000 1004 0x10040000001f
|
||||
12: 7205 lui tp,0xfffe1
|
||||
14: 3376 fld ft6,376(sp)
|
||||
16: 6932 flw fs2,12(sp)
|
||||
18: 7032 flw ft0,44(sp)
|
||||
1a: 5f30 lw a2,120(a4)
|
||||
1c: 326d jal fffff9c6 <__stack_top+0xfff9c6>
|
||||
1e: 3070 fld fa2,224(s0)
|
||||
20: 665f 7032 0030 0x307032665f
|
||||
26: 0108 addi a0,sp,128
|
||||
28: 0b0a slli s6,s6,0x2
|
BIN
driver/tests/stress/kernel.elf
Executable file
BIN
driver/tests/stress/kernel.elf
Executable file
Binary file not shown.
293
driver/tests/stress/main.cpp
Normal file
293
driver/tests/stress/main.cpp
Normal file
|
@ -0,0 +1,293 @@
|
|||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
#include <assert.h>
|
||||
#include <limits>
|
||||
#include <math.h>
|
||||
#include <vector>
|
||||
|
||||
// Evaluates _expr once as an int status. A zero status continues normally;
// any other value is reported on stdout together with the expression text,
// after which cleanup() is called and the process exits with status -1.
#define RT_CHECK(_expr)                                         \
  do {                                                          \
    int _ret = _expr;                                           \
    if (0 != _ret) {                                            \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);  \
      cleanup();                                                \
      exit(-1);                                                 \
    }                                                           \
  } while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Overlays a float with an int and with three bit-fields (23 + 8 + 1 bits) so
// the same storage can be inspected through any of the three views.
// NOTE(review): the man/exp/sign names suggest the IEEE-754 single-precision
// fields, but bit-field layout is implementation-defined - confirm for the
// compilers this test is built with.
union Float_t {
|
||||
float f; // floating-point view
|
||||
int i; // integer view of the same storage
|
||||
struct {
|
||||
uint32_t man : 23;
|
||||
uint32_t exp : 8;
|
||||
uint32_t sign : 1;
|
||||
} parts; // bit-field view of the same storage
|
||||
};
|
||||
|
||||
// Rounds x to `precision` decimal places (default 8): scales by 10^precision,
// rounds to the nearest integer with std::round, and scales back. The
// arithmetic is carried out in double and narrowed to float on return.
inline float fround(float x, int32_t precision = 8) {
  const auto scale = std::pow(10, precision);
  const auto scaled = x * scale;
  return std::round(scaled) / scale;
}
|
||||
|
||||
// Relative-epsilon comparison: a and b are accepted when |a - b| does not
// exceed float epsilon * max(|a|, |b|) * ulp. A rejected pair is reported on
// std::cout. The test is phrased as "not greater than" so that a NaN
// difference is accepted, exactly as a plain `d > eps` check would do.
inline bool almost_equal_eps(float a, float b, int ulp = 128) {
  const auto magnitude = std::max(fabs(a), fabs(b));
  const auto tolerance = std::numeric_limits<float>::epsilon() * (magnitude * ulp);
  const auto delta = fabs(a - b);
  if (!(delta > tolerance)) {
    return true;
  }
  std::cout << "*** almost_equal_eps: d=" << delta << ", eps=" << tolerance << std::endl;
  return false;
}
|
||||
|
||||
// Compares two floats by the distance between their raw 32-bit patterns,
// interpreted as signed integers. Returns true when that distance is at most
// `ulp` (default 6); otherwise reports the pair on std::cout and returns
// false.
//
// The bit patterns are extracted with memcpy rather than through a union, and
// the subtraction is done in 64 bits: operands of opposite sign (e.g. 1.0f
// and -1.0f) differ by more than INT_MAX, which overflowed the previous
// 32-bit subtraction. Such pairs now simply compare as "far apart".
inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) {
  static_assert(sizeof(float) == sizeof(int32_t), "float must be 32 bits wide");

  int32_t ia, ib;
  memcpy(&ia, &a, sizeof(ia));
  memcpy(&ib, &b, sizeof(ib));

  int64_t d = static_cast<int64_t>(ia) - static_cast<int64_t>(ib);
  if (d < 0) {
    d = -d;
  }

  if (d > ulp) {
    // std::hex is sticky on the stream: print the distance in decimal
    // explicitly and restore decimal afterwards so later output (including the
    // next report from this function) is not silently switched to hex.
    std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << std::dec << d << ", ia=" << std::hex << ia << ", ib=" << ib << std::dec << std::endl;
    return false;
  }
  return true;
}
|
||||
|
||||
inline bool almost_equal(float a, float b) {
|
||||
if (a == b)
|
||||
return true;
|
||||
/*if (almost_equal_eps(a, b))
|
||||
return true;*/
|
||||
return almost_equal_ulp(a, b);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Path of the kernel image passed to vx_upload_kernel_file(); overridden by -k.
const char* kernel_file = "kernel.bin";
|
||||
// Value of -n; main() treats 0 as 1 and uses it as the per-task stride.
uint32_t count = 0;
|
||||
|
||||
// Source values, filled by gen_input_data() and read by the host-side check.
std::vector<float> test_data;
|
||||
// Indices into test_data, filled by gen_input_data().
std::vector<uint32_t> addr_table;
|
||||
|
||||
// Device handle; closed by cleanup() when non-null.
vx_device_h device = nullptr;
|
||||
// Staging buffer handle; released by cleanup() when non-null.
vx_buffer_h staging_buf = nullptr;
|
||||
|
||||
// Prints the program banner and the supported command-line options to stdout.
static void show_usage() {
  std::cout << "Vortex Driver Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
count = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (staging_buf) {
|
||||
vx_buf_release(staging_buf);
|
||||
}
|
||||
if (device) {
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
// Fills the global input tables for a run over num_points output elements:
//   test_data  - num_points floats drawn from std::rand() scaled into [0, 1]
//   addr_table - num_points + NUM_LOADS - 1 indices into test_data (each
//                output element consumes NUM_LOADS consecutive entries)
//
// The scaled random value can be exactly 1.0f (std::rand() may return
// RAND_MAX, and values close to it round up when converted to float), which
// would produce index == num_points. Such an index is clamped to the last
// element instead of tripping the assert or, with NDEBUG, leaving an
// out-of-range entry in the table.
void gen_input_data(uint32_t num_points) {
  test_data.resize(num_points);
  addr_table.resize(num_points + NUM_LOADS - 1);

  for (uint32_t i = 0; i < test_data.size(); ++i) {
    float r = static_cast<float>(std::rand()) / RAND_MAX;
    test_data[i] = r;
  }

  for (uint32_t i = 0; i < addr_table.size(); ++i) {
    float r = static_cast<float>(std::rand()) / RAND_MAX;
    uint32_t index = static_cast<uint32_t>(r * num_points);
    if (num_points != 0 && index >= num_points) {
      index = num_points - 1; // r reached 1.0f: fold onto the last element
    }
    assert(index < num_points);
    addr_table[i] = index;
  }
}
|
||||
|
||||
int run_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t dst_buf_size,
|
||||
uint32_t num_points) {
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, -1));
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, dst_buf_size, 0));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (float*)vx_host_ptr(staging_buf);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
|
||||
float ref = 0.0f;
|
||||
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
|
||||
uint32_t addr = i + j;
|
||||
uint32_t index = addr_table.at(addr);
|
||||
float value = test_data.at(index);
|
||||
//printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value);
|
||||
ref *= value;
|
||||
}
|
||||
|
||||
float cur = buf_ptr[i];
|
||||
if (!almost_equal(cur, ref)) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< ": actual " << cur << ", expected " << ref << std::endl;
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
size_t value;
|
||||
kernel_arg_t kernel_arg;
|
||||
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
if (count == 0) {
|
||||
count = 1;
|
||||
}
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
unsigned max_cores, max_warps, max_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
|
||||
|
||||
uint32_t num_tasks = max_cores * max_warps * max_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
|
||||
// generate input data
|
||||
gen_input_data(num_points);
|
||||
|
||||
uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t);
|
||||
uint32_t src_buf_size = test_data.size() * sizeof(int32_t);
|
||||
uint32_t dst_buf_size = test_data.size() * sizeof(int32_t);
|
||||
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
||||
RT_CHECK(vx_alloc_dev_mem(device, addr_buf_size, &value));
|
||||
kernel_arg.addr_ptr = value;
|
||||
RT_CHECK(vx_alloc_dev_mem(device, src_buf_size, &value));
|
||||
kernel_arg.src_ptr = value;
|
||||
RT_CHECK(vx_alloc_dev_mem(device, dst_buf_size, &value));
|
||||
kernel_arg.dst_ptr = value;
|
||||
|
||||
kernel_arg.num_tasks = num_tasks;
|
||||
kernel_arg.stride = count;
|
||||
|
||||
std::cout << "dev_addr=" << std::hex << kernel_arg.addr_ptr << std::endl;
|
||||
std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(addr_buf_size,
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
sizeof(kernel_arg_t))));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, staging_buf_size, &staging_buf));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < addr_table.size(); ++i) {
|
||||
buf_ptr[i] = addr_table.at(i);
|
||||
}
|
||||
}
|
||||
std::cout << "upload address buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.addr_ptr, addr_buf_size, 0));
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < test_data.size(); ++i) {
|
||||
buf_ptr[i] = test_data.at(i);
|
||||
}
|
||||
}
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, src_buf_size, 0));
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < test_data.size(); ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
}
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, dst_buf_size, 0));
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -5,19 +5,16 @@ Description: Makes the build in the opae directory with the specified core
|
|||
exists, a make clean command is run before the build. Script waits
|
||||
until the inteldev script or quartus program is finished running.
|
||||
|
||||
Usage: ./build.sh -c [1|2|4|8|16] [-p perf] [-w wait]
|
||||
Usage: ./build.sh -c [1|2|4|8|16] [-p [y|n]]
|
||||
|
||||
Options:
|
||||
-c
|
||||
Core count (1, 2, 4, 8, or 16).
|
||||
|
||||
-p
|
||||
Performance profiling enable. Changes the source file in the
|
||||
Performance profiling enable (y or n). Changes the source file in the
|
||||
opae directory to include/exclude "+define+PERF_ENABLE".
|
||||
|
||||
-w
|
||||
Wait for the build to complete
|
||||
|
||||
_______________________________________________________________________________
|
||||
|
||||
|
||||
|
@ -27,6 +24,7 @@ Description: Runs build.sh with performance profiling enabled for all valid
|
|||
core configurations.
|
||||
|
||||
_______________________________________________________________________________
|
||||
_______________________________________________________________________________
|
||||
|
||||
|
||||
-program_fpga.sh-
|
||||
|
@ -41,6 +39,7 @@ Options:
|
|||
Core count (1, 2, 4, 8, or 16).
|
||||
|
||||
_______________________________________________________________________________
|
||||
_______________________________________________________________________________
|
||||
|
||||
|
||||
-gather_perf_results.sh-
|
||||
|
@ -65,3 +64,53 @@ _______________________________________________________________________________
|
|||
Description: Programs fpga and runs gather_perf_results.sh for all valid core
|
||||
configurations. All builds should already be made before running
|
||||
this.
|
||||
|
||||
_______________________________________________________________________________
|
||||
_______________________________________________________________________________
|
||||
|
||||
|
||||
-export_csv.sh-
|
||||
|
||||
Description: Creates specified .csv output file from an input directory, file,
|
||||
and parameter. The .csv file contains two columns: cores, and the input
|
||||
parameter. The output file is located within the directory specified with -d.
|
||||
|
||||
Usage: ./export_csv.sh -c [cores] -d [directory] -i [input filename] -o
|
||||
[output filename] -p '[parameter]'
|
||||
|
||||
Example: ./export_csv.sh -c 16 -d perf_2021_03_07 -i sgemm.result -o output.csv
|
||||
-p 'PERF: scoreboard stalls'
|
||||
|
||||
Options:
|
||||
-c
|
||||
Upper limit of cores to be read in. Core directories should exist in
|
||||
the directory specified by -d e.g. 1c, 2c, 4c for -c 4.
|
||||
|
||||
-d
|
||||
The directory of the form perf_{date} located in the evaluation
|
||||
directory.
|
||||
|
||||
-i
|
||||
The input filename located in each core directory within the
|
||||
directory specified by -d.
|
||||
|
||||
-o
|
||||
The output filename to be created within the directory specified
|
||||
by -d.
|
||||
|
||||
-p
|
||||
The parameter corresponding to the core count in the .csv file. The
|
||||
full name of the parameter from the start of the line should be
|
||||
inputted to avoid the parameter name being matched multiple times.
|
||||
|
||||
_______________________________________________________________________________
|
||||
|
||||
|
||||
-export_ipc_csv.sh-
|
||||
|
||||
Description: Runs export_csv.sh for the parameter IPC.
|
||||
|
||||
Usage: ./export_ipc_csv.sh -c [cores] -d [directory] -i [input filename] -o
|
||||
[output filename]
|
||||
|
||||
Example: ./export_ipc_csv.sh -c 16 -d perf_2021_03_07 -i sgemm.result -o output.csv
|
||||
|
|
|
@ -28,26 +28,15 @@ fi
|
|||
|
||||
cd ${BUILD_DIR}
|
||||
|
||||
sources_file="./sources_${cores}c.txt"
|
||||
|
||||
if [ ${perf} = 1 ]; then
|
||||
if grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then
|
||||
sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file}
|
||||
elif ! grep -Fxq '+define+PERF_ENABLE' ${sources_file}; then
|
||||
sed -i '1s/^/+define+PERF_ENABLE\n/' ${sources_file}
|
||||
fi
|
||||
else
|
||||
if grep -v '^ *#' ${sources_file} | grep -Fxq '+define+SYNTHESIS'; then
|
||||
sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file}
|
||||
elif ! grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then
|
||||
sed -i '1s/^/#+define+PERF_ENABLE\n/' ${sources_file}
|
||||
fi
|
||||
fi
|
||||
|
||||
# Remove a previous build for this core count before rebuilding.
# The variable must be written ${cores}; the former "{$cores}" expanded to a
# literal "{N}", so the directory test never matched and the clean never ran.
if [ -d "./build_fpga_${cores}c" ]; then
    make "clean-fpga-${cores}c"
fi
|
||||
make "fpga-${cores}c"
|
||||
|
||||
if [ ${perf} = 1 ]; then
|
||||
PERF=1 make "fpga-${cores}c"
|
||||
else
|
||||
make "fpga-${cores}c"
|
||||
fi
|
||||
|
||||
if [ ${wait} = 1 ]; then
|
||||
sleep 30
|
||||
|
|
33
evaluation/scripts/export_csv.sh
Executable file
33
evaluation/scripts/export_csv.sh
Executable file
|
@ -0,0 +1,33 @@
|
|||
#!/bin/bash

# Builds a two-column .csv file ("cores,<parameter>") from per-core result
# files. For every core count 1, 2, 4, ... up to the value given with -c, the
# lines of ../<dir>/<N>c/<input file> that contain "<parameter>=" contribute
# the text following the "=" sign.
#
# Usage: ./export_csv.sh -c [cores] -d [directory] -i [input filename]
#                        -o [output filename] -p '[parameter]'
#
# Paths are relative to the current directory ("../<dir>/...").
# The parameter is used verbatim inside a sed expression, so it must not
# contain "/" or other characters that are special to sed.

usage='Usage: ./export_csv.sh -c [cores] -d [directory] -i [input filename] -o [output filename] -p [parameter]'

while getopts c:d:i:o:p: flag
do
    case "${flag}" in
        c) cores=${OPTARG};; #1, 2, 4, 8, 16
        d) dir=${OPTARG};; #directory name (e.g. perf_2021_03_07)
        i) ifile=${OPTARG};; #input filename
        o) ofile=${OPTARG};; #output filename
        p) param=${OPTARG};; #parameter to be made into csv
        *) echo "${usage}"
           exit 1;;
    esac
done

if [[ ! "$cores" =~ ^(1|2|4|8|16)$ ]]; then
    echo 'Invalid parameter for argument -c (1, 2, 4, 8, or 16 expected)'
    exit 1
fi

if [ -z "$ifile" ]; then
    echo 'No input filename given for argument -i'
    exit 1
fi

if [ -z "$dir" ]; then
    echo 'No directory given for argument -d'
    exit 1
fi

if [ -z "$ofile" ]; then
    echo 'No output filename given for argument -o'
    exit 1
fi

if [ -z "$param" ]; then
    echo 'No parameter given for argument -p'
    exit 1
fi

out="../${dir}/${ofile}"

# The parameter is passed as printf data, never as the format string, so
# "%" or "\" in its name cannot corrupt the header.
printf 'cores,%s\n' "${param}" > "${out}"
for ((i=1; i<=$cores; i=i*2)); do
    printf '%s,' "${i}" >> "${out}"
    sed -n "s/${param}=\(.*\)/\1/p" < "../${dir}/${i}c/${ifile}" >> "${out}"
done
|
32
evaluation/scripts/export_ipc_csv.sh
Executable file
32
evaluation/scripts/export_ipc_csv.sh
Executable file
|
@ -0,0 +1,32 @@
|
|||
#!/bin/bash

# Builds a two-column .csv file ("cores,IPC") from per-core result files.
# For every core count 1, 2, 4, ... up to the value given with -c, the text
# following "IPC=" is extracted from ../<dir>/<N>c/<input file>; the last
# whitespace-separated field of the last matching line becomes the row value.
#
# Usage: ./export_ipc_csv.sh -c [cores] -d [directory] -i [input filename]
#                            -o [output filename]
#
# Paths are relative to the current directory ("../<dir>/...").

usage='Usage: ./export_ipc_csv.sh -c [cores] -d [directory] -i [input filename] -o [output filename]'

# The option string must declare "i:" to match the case arm below; it used to
# declare "f:", which left ifile unset and made every invocation fail.
while getopts c:d:i:o: flag
do
    case "${flag}" in
        c) cores=${OPTARG};; #1, 2, 4, 8, 16
        d) dir=${OPTARG};; #directory name (e.g. perf_2021_03_07)
        i) ifile=${OPTARG};; #input filename
        o) ofile=${OPTARG};; #output filename
        *) echo "${usage}"
           exit 1;;
    esac
done

if [[ ! "$cores" =~ ^(1|2|4|8|16)$ ]]; then
    echo 'Invalid parameter for argument -c (1, 2, 4, 8, or 16 expected)'
    exit 1
fi

if [ -z "$ifile" ]; then
    echo 'No input filename given for argument -i'
    exit 1
fi

if [ -z "$dir" ]; then
    echo 'No directory given for argument -d'
    exit 1
fi

if [ -z "$ofile" ]; then
    echo 'No output filename given for argument -o'
    exit 1
fi

out="../${dir}/${ofile}"

# The header needs its own line terminator; without it the first data row
# was appended to the header line.
printf 'cores,IPC\n' > "${out}"
for ((i=1; i<=$cores; i=i*2)); do
    printf '%s,' "${i}" >> "${out}"
    (sed -n "s/IPC=\(.*\)/\1/p" < "../${dir}/${i}c/${ifile}" | awk 'END {print $NF}') >> "${out}"
done
|
|
@ -1,9 +1,9 @@
|
|||
.PHONY: build_config
|
||||
|
||||
build_config:
|
||||
./scripts/gen_config.py --outv ./rtl/VX_user_config.vh --outc ./VX_config.h
|
||||
build_config: ./rtl/VX_config.vh
|
||||
./scripts/gen_config.py -i ./rtl/VX_config.vh -o ./VX_config.h
|
||||
$(MAKE) -C simulate
|
||||
|
||||
clean:
|
||||
rm -f ./rtl/VX_user_config.vh ./VX_config.h
|
||||
rm -f ./VX_config.h
|
||||
$(MAKE) -C simulate clean
|
|
@ -9,20 +9,20 @@ module VX_cluster #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// DRAM request
|
||||
output wire dram_req_valid,
|
||||
output wire dram_req_rw,
|
||||
output wire [`L2DRAM_BYTEEN_WIDTH-1:0] dram_req_byteen,
|
||||
output wire [`L2DRAM_ADDR_WIDTH-1:0] dram_req_addr,
|
||||
output wire [`L2DRAM_LINE_WIDTH-1:0] dram_req_data,
|
||||
output wire [`L2DRAM_TAG_WIDTH-1:0] dram_req_tag,
|
||||
input wire dram_req_ready,
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`L2MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`L2MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`L2MEM_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [`L2MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// DRAM response
|
||||
input wire dram_rsp_valid,
|
||||
input wire [`L2DRAM_LINE_WIDTH-1:0] dram_rsp_data,
|
||||
input wire [`L2DRAM_TAG_WIDTH-1:0] dram_rsp_tag,
|
||||
output wire dram_rsp_ready,
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`L2MEM_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`L2MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// CSR Request
|
||||
input wire csr_req_valid,
|
||||
|
@ -42,31 +42,31 @@ module VX_cluster #(
|
|||
output wire ebreak
|
||||
);
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_dram_req_valid;
|
||||
wire [`NUM_CORES-1:0] per_core_dram_req_rw;
|
||||
wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen;
|
||||
wire [`NUM_CORES-1:0][`DDRAM_ADDR_WIDTH-1:0] per_core_dram_req_addr;
|
||||
wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_req_data;
|
||||
wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_req_tag;
|
||||
wire [`NUM_CORES-1:0] per_core_dram_req_ready;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_req_valid;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_req_rw;
|
||||
wire [`NUM_CORES-1:0][`DMEM_BYTEEN_WIDTH-1:0] per_core_mem_req_byteen;
|
||||
wire [`NUM_CORES-1:0][`DMEM_ADDR_WIDTH-1:0] per_core_mem_req_addr;
|
||||
wire [`NUM_CORES-1:0][`DMEM_LINE_WIDTH-1:0] per_core_mem_req_data;
|
||||
wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_req_tag;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_req_ready;
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_dram_rsp_valid;
|
||||
wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_rsp_data;
|
||||
wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_rsp_tag;
|
||||
wire [`NUM_CORES-1:0] per_core_dram_rsp_ready;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_rsp_valid;
|
||||
wire [`NUM_CORES-1:0][`DMEM_LINE_WIDTH-1:0] per_core_mem_rsp_data;
|
||||
wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_rsp_ready;
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_csr_req_valid;
|
||||
wire [`NUM_CORES-1:0][11:0] per_core_csr_req_addr;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_req_rw;
|
||||
wire [`NUM_CORES-1:0][31:0] per_core_csr_req_data;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_req_ready;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_req_valid;
|
||||
wire [`NUM_CORES-1:0][11:0] per_core_csr_req_addr;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_req_rw;
|
||||
wire [`NUM_CORES-1:0][31:0] per_core_csr_req_data;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_req_ready;
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_csr_rsp_valid;
|
||||
wire [`NUM_CORES-1:0][31:0] per_core_csr_rsp_data;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_rsp_ready;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_rsp_valid;
|
||||
wire [`NUM_CORES-1:0][31:0] per_core_csr_rsp_data;
|
||||
wire [`NUM_CORES-1:0] per_core_csr_rsp_ready;
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_busy;
|
||||
wire [`NUM_CORES-1:0] per_core_ebreak;
|
||||
wire [`NUM_CORES-1:0] per_core_busy;
|
||||
wire [`NUM_CORES-1:0] per_core_ebreak;
|
||||
|
||||
for (genvar i = 0; i < `NUM_CORES; i++) begin
|
||||
|
||||
|
@ -87,18 +87,18 @@ module VX_cluster #(
|
|||
.clk (clk),
|
||||
.reset (core_reset),
|
||||
|
||||
.dram_req_valid (per_core_dram_req_valid[i]),
|
||||
.dram_req_rw (per_core_dram_req_rw [i]),
|
||||
.dram_req_byteen(per_core_dram_req_byteen[i]),
|
||||
.dram_req_addr (per_core_dram_req_addr [i]),
|
||||
.dram_req_data (per_core_dram_req_data [i]),
|
||||
.dram_req_tag (per_core_dram_req_tag [i]),
|
||||
.dram_req_ready (per_core_dram_req_ready[i]),
|
||||
.mem_req_valid (per_core_mem_req_valid[i]),
|
||||
.mem_req_rw (per_core_mem_req_rw [i]),
|
||||
.mem_req_byteen (per_core_mem_req_byteen[i]),
|
||||
.mem_req_addr (per_core_mem_req_addr [i]),
|
||||
.mem_req_data (per_core_mem_req_data [i]),
|
||||
.mem_req_tag (per_core_mem_req_tag [i]),
|
||||
.mem_req_ready (per_core_mem_req_ready[i]),
|
||||
|
||||
.dram_rsp_valid (per_core_dram_rsp_valid[i]),
|
||||
.dram_rsp_data (per_core_dram_rsp_data [i]),
|
||||
.dram_rsp_tag (per_core_dram_rsp_tag [i]),
|
||||
.dram_rsp_ready (per_core_dram_rsp_ready[i]),
|
||||
.mem_rsp_valid (per_core_mem_rsp_valid[i]),
|
||||
.mem_rsp_data (per_core_mem_rsp_data [i]),
|
||||
.mem_rsp_tag (per_core_mem_rsp_tag [i]),
|
||||
.mem_rsp_ready (per_core_mem_rsp_ready[i]),
|
||||
|
||||
.csr_req_valid (per_core_csr_req_valid [i]),
|
||||
.csr_req_rw (per_core_csr_req_rw [i]),
|
||||
|
@ -169,12 +169,12 @@ module VX_cluster #(
|
|||
.NUM_REQS (`NUM_CORES),
|
||||
.CREQ_SIZE (`L2CREQ_SIZE),
|
||||
.MSHR_SIZE (`L2MSHR_SIZE),
|
||||
.DRSQ_SIZE (`L2DRSQ_SIZE),
|
||||
.DREQ_SIZE (`L2DREQ_SIZE),
|
||||
.MRSQ_SIZE (`L2MRSQ_SIZE),
|
||||
.MREQ_SIZE (`L2MREQ_SIZE),
|
||||
.WRITE_ENABLE (1),
|
||||
.CORE_TAG_WIDTH (`XDRAM_TAG_WIDTH),
|
||||
.CORE_TAG_WIDTH (`XMEM_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (0),
|
||||
.DRAM_TAG_WIDTH (`L2DRAM_TAG_WIDTH)
|
||||
.MEM_TAG_WIDTH (`L2MEM_TAG_WIDTH)
|
||||
) l2cache (
|
||||
`SCOPE_BIND_VX_cluster_l2cache
|
||||
|
||||
|
@ -188,78 +188,78 @@ module VX_cluster #(
|
|||
`endif
|
||||
|
||||
// Core request
|
||||
.core_req_valid (per_core_dram_req_valid),
|
||||
.core_req_rw (per_core_dram_req_rw),
|
||||
.core_req_byteen (per_core_dram_req_byteen),
|
||||
.core_req_addr (per_core_dram_req_addr),
|
||||
.core_req_data (per_core_dram_req_data),
|
||||
.core_req_tag (per_core_dram_req_tag),
|
||||
.core_req_ready (per_core_dram_req_ready),
|
||||
.core_req_valid (per_core_mem_req_valid),
|
||||
.core_req_rw (per_core_mem_req_rw),
|
||||
.core_req_byteen (per_core_mem_req_byteen),
|
||||
.core_req_addr (per_core_mem_req_addr),
|
||||
.core_req_data (per_core_mem_req_data),
|
||||
.core_req_tag (per_core_mem_req_tag),
|
||||
.core_req_ready (per_core_mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.core_rsp_valid (per_core_dram_rsp_valid),
|
||||
.core_rsp_data (per_core_dram_rsp_data),
|
||||
.core_rsp_tag (per_core_dram_rsp_tag),
|
||||
.core_rsp_ready (per_core_dram_rsp_ready),
|
||||
.core_rsp_valid (per_core_mem_rsp_valid),
|
||||
.core_rsp_data (per_core_mem_rsp_data),
|
||||
.core_rsp_tag (per_core_mem_rsp_tag),
|
||||
.core_rsp_ready (per_core_mem_rsp_ready),
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (dram_req_valid),
|
||||
.dram_req_rw (dram_req_rw),
|
||||
.dram_req_byteen (dram_req_byteen),
|
||||
.dram_req_addr (dram_req_addr),
|
||||
.dram_req_data (dram_req_data),
|
||||
.dram_req_tag (dram_req_tag),
|
||||
.dram_req_ready (dram_req_ready),
|
||||
// Memory request
|
||||
.mem_req_valid (mem_req_valid),
|
||||
.mem_req_rw (mem_req_rw),
|
||||
.mem_req_byteen (mem_req_byteen),
|
||||
.mem_req_addr (mem_req_addr),
|
||||
.mem_req_data (mem_req_data),
|
||||
.mem_req_tag (mem_req_tag),
|
||||
.mem_req_ready (mem_req_ready),
|
||||
|
||||
// DRAM response
|
||||
.dram_rsp_valid (dram_rsp_valid),
|
||||
.dram_rsp_tag (dram_rsp_tag),
|
||||
.dram_rsp_data (dram_rsp_data),
|
||||
.dram_rsp_ready (dram_rsp_ready)
|
||||
// Memory response
|
||||
.mem_rsp_valid (mem_rsp_valid),
|
||||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_data (mem_rsp_data),
|
||||
.mem_rsp_ready (mem_rsp_ready)
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_REQS (`NUM_CORES),
|
||||
.DATA_WIDTH (`L2DRAM_LINE_WIDTH),
|
||||
.TAG_IN_WIDTH (`XDRAM_TAG_WIDTH),
|
||||
.TAG_OUT_WIDTH (`L2DRAM_TAG_WIDTH),
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (1)
|
||||
) dram_arb (
|
||||
.NUM_REQS (`NUM_CORES),
|
||||
.DATA_WIDTH (`L2MEM_LINE_WIDTH),
|
||||
.TAG_IN_WIDTH (`XMEM_TAG_WIDTH),
|
||||
.TAG_OUT_WIDTH (`L2MEM_TAG_WIDTH),
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (1)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Core request
|
||||
.req_valid_in (per_core_dram_req_valid),
|
||||
.req_rw_in (per_core_dram_req_rw),
|
||||
.req_byteen_in (per_core_dram_req_byteen),
|
||||
.req_addr_in (per_core_dram_req_addr),
|
||||
.req_data_in (per_core_dram_req_data),
|
||||
.req_tag_in (per_core_dram_req_tag),
|
||||
.req_ready_in (per_core_dram_req_ready),
|
||||
.req_valid_in (per_core_mem_req_valid),
|
||||
.req_rw_in (per_core_mem_req_rw),
|
||||
.req_byteen_in (per_core_mem_req_byteen),
|
||||
.req_addr_in (per_core_mem_req_addr),
|
||||
.req_data_in (per_core_mem_req_data),
|
||||
.req_tag_in (per_core_mem_req_tag),
|
||||
.req_ready_in (per_core_mem_req_ready),
|
||||
|
||||
// DRAM request
|
||||
.req_valid_out (dram_req_valid),
|
||||
.req_rw_out (dram_req_rw),
|
||||
.req_byteen_out (dram_req_byteen),
|
||||
.req_addr_out (dram_req_addr),
|
||||
.req_data_out (dram_req_data),
|
||||
.req_tag_out (dram_req_tag),
|
||||
.req_ready_out (dram_req_ready),
|
||||
// Memory request
|
||||
.req_valid_out (mem_req_valid),
|
||||
.req_rw_out (mem_req_rw),
|
||||
.req_byteen_out (mem_req_byteen),
|
||||
.req_addr_out (mem_req_addr),
|
||||
.req_data_out (mem_req_data),
|
||||
.req_tag_out (mem_req_tag),
|
||||
.req_ready_out (mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.rsp_valid_out (per_core_dram_rsp_valid),
|
||||
.rsp_data_out (per_core_dram_rsp_data),
|
||||
.rsp_tag_out (per_core_dram_rsp_tag),
|
||||
.rsp_ready_out (per_core_dram_rsp_ready),
|
||||
.rsp_valid_out (per_core_mem_rsp_valid),
|
||||
.rsp_data_out (per_core_mem_rsp_data),
|
||||
.rsp_tag_out (per_core_mem_rsp_tag),
|
||||
.rsp_ready_out (per_core_mem_rsp_ready),
|
||||
|
||||
// DRAM response
|
||||
.rsp_valid_in (dram_rsp_valid),
|
||||
.rsp_tag_in (dram_rsp_tag),
|
||||
.rsp_data_in (dram_rsp_data),
|
||||
.rsp_ready_in (dram_rsp_ready)
|
||||
// Memory response
|
||||
.rsp_valid_in (mem_rsp_valid),
|
||||
.rsp_tag_in (mem_rsp_tag),
|
||||
.rsp_data_in (mem_rsp_data),
|
||||
.rsp_ready_in (mem_rsp_ready)
|
||||
);
|
||||
|
||||
end
|
||||
|
|
|
@ -1,8 +1,6 @@
|
|||
`ifndef VX_CONFIG
|
||||
`define VX_CONFIG
|
||||
|
||||
`include "VX_user_config.vh"
|
||||
|
||||
`ifndef NUM_CLUSTERS
|
||||
`define NUM_CLUSTERS 1
|
||||
`endif
|
||||
|
@ -35,8 +33,8 @@
|
|||
`define SM_ENABLE 1
|
||||
`endif
|
||||
|
||||
`ifndef GLOBAL_BLOCK_SIZE
|
||||
`define GLOBAL_BLOCK_SIZE 64
|
||||
`ifndef MEM_BLOCK_SIZE
|
||||
`define MEM_BLOCK_SIZE 64
|
||||
`endif
|
||||
|
||||
`ifndef L1_BLOCK_SIZE
|
||||
|
@ -209,14 +207,14 @@
|
|||
`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB98
|
||||
// PERF: memory
|
||||
`define CSR_MPM_DRAM_READS 12'hB19 // dram reads
|
||||
`define CSR_MPM_DRAM_READS_H 12'hB99
|
||||
`define CSR_MPM_DRAM_WRITES 12'hB1A // dram writes
|
||||
`define CSR_MPM_DRAM_WRITES_H 12'hB9A
|
||||
`define CSR_MPM_DRAM_ST 12'hB1B // dram request stalls
|
||||
`define CSR_MPM_DRAM_ST_H 12'hB9B
|
||||
`define CSR_MPM_DRAM_LAT 12'hB1C // dram latency (total)
|
||||
`define CSR_MPM_DRAM_LAT_H 12'hB9C
|
||||
`define CSR_MPM_MEM_READS 12'hB19 // memory reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB99
|
||||
`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB9A
|
||||
`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls
|
||||
`define CSR_MPM_MEM_ST_H 12'hB9B
|
||||
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total)
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9C
|
||||
|
||||
// Machine Information Registers
|
||||
`define CSR_MVENDORID 12'hF11
|
||||
|
@ -281,14 +279,14 @@
|
|||
`define IMSHR_SIZE `NUM_WARPS
|
||||
`endif
|
||||
|
||||
// DRAM Request Queue Size
|
||||
`ifndef IDREQ_SIZE
|
||||
`define IDREQ_SIZE 4
|
||||
// Memory Request Queue Size
|
||||
`ifndef IMREQ_SIZE
|
||||
`define IMREQ_SIZE 4
|
||||
`endif
|
||||
|
||||
// DRAM Response Queue Size
|
||||
`ifndef IDRSQ_SIZE
|
||||
`define IDRSQ_SIZE 4
|
||||
// Memory Response Queue Size
|
||||
`ifndef IMRSQ_SIZE
|
||||
`define IMRSQ_SIZE 4
|
||||
`endif
|
||||
|
||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
@ -318,14 +316,14 @@
|
|||
`define DMSHR_SIZE `LSUQ_SIZE
|
||||
`endif
|
||||
|
||||
// DRAM Request Queue Size
|
||||
`ifndef DDREQ_SIZE
|
||||
`define DDREQ_SIZE 4
|
||||
// Memory Request Queue Size
|
||||
`ifndef DMREQ_SIZE
|
||||
`define DMREQ_SIZE 4
|
||||
`endif
|
||||
|
||||
// DRAM Response Queue Size
|
||||
`ifndef DDRSQ_SIZE
|
||||
`define DDRSQ_SIZE `MAX(4, (`DNUM_BANKS * 2))
|
||||
// Memory Response Queue Size
|
||||
`ifndef DMRSQ_SIZE
|
||||
`define DMRSQ_SIZE `MAX(4, (`DNUM_BANKS * 2))
|
||||
`endif
|
||||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
|
@ -372,14 +370,14 @@
|
|||
`define L2MSHR_SIZE 16
|
||||
`endif
|
||||
|
||||
// DRAM Request Queue Size
|
||||
`ifndef L2DREQ_SIZE
|
||||
`define L2DREQ_SIZE 4
|
||||
// L2 Request Queue Size
|
||||
`ifndef L2MREQ_SIZE
|
||||
`define L2MREQ_SIZE 4
|
||||
`endif
|
||||
|
||||
// DRAM Response Queue Size
|
||||
`ifndef L2DRSQ_SIZE
|
||||
`define L2DRSQ_SIZE `MAX(4, (`L2NUM_BANKS * 2))
|
||||
// L2 Response Queue Size
|
||||
`ifndef L2MRSQ_SIZE
|
||||
`define L2MRSQ_SIZE `MAX(4, (`L2NUM_BANKS * 2))
|
||||
`endif
|
||||
|
||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
@ -404,14 +402,14 @@
|
|||
`define L3MSHR_SIZE 16
|
||||
`endif
|
||||
|
||||
// DRAM Request Queue Size
|
||||
`ifndef L3DREQ_SIZE
|
||||
`define L3DREQ_SIZE 4
|
||||
// L3 Request Queue Size
|
||||
`ifndef L3MREQ_SIZE
|
||||
`define L3MREQ_SIZE 4
|
||||
`endif
|
||||
|
||||
// DRAM Response Queue Size
|
||||
`ifndef L3DRSQ_SIZE
|
||||
`define L3DRSQ_SIZE `MAX(4, (`L3NUM_BANKS * 2))
|
||||
// L3 Response Queue Size
|
||||
`ifndef L3MRSQ_SIZE
|
||||
`define L3MRSQ_SIZE `MAX(4, (`L3NUM_BANKS * 2))
|
||||
`endif
|
||||
|
||||
`endif
|
||||
|
|
|
@ -9,20 +9,20 @@ module VX_core #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// DRAM request
|
||||
output wire dram_req_valid,
|
||||
output wire dram_req_rw,
|
||||
output wire [`DDRAM_BYTEEN_WIDTH-1:0] dram_req_byteen,
|
||||
output wire [`DDRAM_ADDR_WIDTH-1:0] dram_req_addr,
|
||||
output wire [`DDRAM_LINE_WIDTH-1:0] dram_req_data,
|
||||
output wire [`XDRAM_TAG_WIDTH-1:0] dram_req_tag,
|
||||
input wire dram_req_ready,
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`DMEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`DMEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`DMEM_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [`XMEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// DRAM reponse
|
||||
input wire dram_rsp_valid,
|
||||
input wire [`DDRAM_LINE_WIDTH-1:0] dram_rsp_data,
|
||||
input wire [`XDRAM_TAG_WIDTH-1:0] dram_rsp_tag,
|
||||
output wire dram_rsp_ready,
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`DMEM_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`XMEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// CSR request
|
||||
input wire csr_req_valid,
|
||||
|
@ -44,29 +44,29 @@ module VX_core #(
|
|||
VX_perf_memsys_if perf_memsys_if();
|
||||
`endif
|
||||
|
||||
VX_cache_dram_req_if #(
|
||||
.DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH),
|
||||
.DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH),
|
||||
.DRAM_TAG_WIDTH(`XDRAM_TAG_WIDTH)
|
||||
) dram_req_if();
|
||||
VX_cache_mem_req_if #(
|
||||
.MEM_LINE_WIDTH(`DMEM_LINE_WIDTH),
|
||||
.MEM_ADDR_WIDTH(`DMEM_ADDR_WIDTH),
|
||||
.MEM_TAG_WIDTH(`XMEM_TAG_WIDTH)
|
||||
) mem_req_if();
|
||||
|
||||
VX_cache_dram_rsp_if #(
|
||||
.DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH),
|
||||
.DRAM_TAG_WIDTH(`XDRAM_TAG_WIDTH)
|
||||
) dram_rsp_if();
|
||||
VX_cache_mem_rsp_if #(
|
||||
.MEM_LINE_WIDTH(`DMEM_LINE_WIDTH),
|
||||
.MEM_TAG_WIDTH(`XMEM_TAG_WIDTH)
|
||||
) mem_rsp_if();
|
||||
|
||||
assign dram_req_valid = dram_req_if.valid;
|
||||
assign dram_req_rw = dram_req_if.rw;
|
||||
assign dram_req_byteen= dram_req_if.byteen;
|
||||
assign dram_req_addr = dram_req_if.addr;
|
||||
assign dram_req_data = dram_req_if.data;
|
||||
assign dram_req_tag = dram_req_if.tag;
|
||||
assign dram_req_if.ready = dram_req_ready;
|
||||
assign mem_req_valid = mem_req_if.valid;
|
||||
assign mem_req_rw = mem_req_if.rw;
|
||||
assign mem_req_byteen= mem_req_if.byteen;
|
||||
assign mem_req_addr = mem_req_if.addr;
|
||||
assign mem_req_data = mem_req_if.data;
|
||||
assign mem_req_tag = mem_req_if.tag;
|
||||
assign mem_req_if.ready = mem_req_ready;
|
||||
|
||||
assign dram_rsp_if.valid = dram_rsp_valid;
|
||||
assign dram_rsp_if.data = dram_rsp_data;
|
||||
assign dram_rsp_if.tag = dram_rsp_tag;
|
||||
assign dram_rsp_ready = dram_rsp_if.ready;
|
||||
assign mem_rsp_if.valid = mem_rsp_valid;
|
||||
assign mem_rsp_if.data = mem_rsp_data;
|
||||
assign mem_rsp_if.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_rsp_if.ready;
|
||||
|
||||
//--
|
||||
|
||||
|
@ -168,9 +168,9 @@ module VX_core #(
|
|||
.icache_core_req_if (icache_core_req_if),
|
||||
.icache_core_rsp_if (icache_core_rsp_if),
|
||||
|
||||
// DRAM
|
||||
.dram_req_if (dram_req_if),
|
||||
.dram_rsp_if (dram_rsp_if)
|
||||
// Memory
|
||||
.mem_req_if (mem_req_if),
|
||||
.mem_rsp_if (mem_rsp_if)
|
||||
);
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -123,61 +123,61 @@ module VX_csr_data #(
|
|||
`ifdef PERF_ENABLE
|
||||
// PERF: pipeline
|
||||
`CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibf_stalls[31:0];
|
||||
`CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[43:32]);
|
||||
`CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scb_stalls[31:0];
|
||||
`CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[43:32]);
|
||||
`CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0];
|
||||
`CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[43:32]);
|
||||
`CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
|
||||
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[43:32]);
|
||||
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
|
||||
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[43:32]);
|
||||
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[43:32]);
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
|
||||
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[43:32]);
|
||||
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: icache
|
||||
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0];
|
||||
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[43:32]);
|
||||
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
|
||||
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[43:32]);
|
||||
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0];
|
||||
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[43:32]);
|
||||
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0];
|
||||
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[43:32]);
|
||||
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0];
|
||||
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[43:32]);
|
||||
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0];
|
||||
`CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[43:32]);
|
||||
`CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0];
|
||||
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[43:32]);
|
||||
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0];
|
||||
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[43:32]);
|
||||
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_BANK_ST : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[43:32]);
|
||||
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[43:32]);
|
||||
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[43:32]);
|
||||
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[43:32]);
|
||||
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
|
||||
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[43:32]);
|
||||
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
|
||||
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[43:32]);
|
||||
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
|
||||
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[43:32]);
|
||||
// PERF: DRAM
|
||||
`CSR_MPM_DRAM_READS : read_data_r = perf_memsys_if.dram_reads[31:0];
|
||||
`CSR_MPM_DRAM_READS_H : read_data_r = 32'(perf_memsys_if.dram_reads[43:32]);
|
||||
`CSR_MPM_DRAM_WRITES : read_data_r = perf_memsys_if.dram_writes[31:0];
|
||||
`CSR_MPM_DRAM_WRITES_H : read_data_r = 32'(perf_memsys_if.dram_writes[43:32]);
|
||||
`CSR_MPM_DRAM_ST : read_data_r = perf_memsys_if.dram_stalls[31:0];
|
||||
`CSR_MPM_DRAM_ST_H : read_data_r = 32'(perf_memsys_if.dram_stalls[43:32]);
|
||||
`CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0];
|
||||
`CSR_MPM_DRAM_LAT_H : read_data_r = 32'(perf_memsys_if.dram_latency[43:32]);
|
||||
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: MEM
|
||||
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0];
|
||||
`CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0];
|
||||
`CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_ST : read_data_r = perf_memsys_if.mem_stalls[31:0];
|
||||
`CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0];
|
||||
`CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
`endif
|
||||
|
||||
`CSR_SATP : read_data_r = 32'(csr_satp);
|
||||
|
@ -195,9 +195,9 @@ module VX_csr_data #(
|
|||
`CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]);
|
||||
|
||||
`CSR_CYCLE : read_data_r = csr_cycle[31:0];
|
||||
`CSR_CYCLE_H : read_data_r = 32'(csr_cycle[43:32]);
|
||||
`CSR_CYCLE_H : read_data_r = 32'(csr_cycle[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_INSTRET : read_data_r = csr_instret[31:0];
|
||||
`CSR_INSTRET_H : read_data_r = 32'(csr_instret[43:32]);
|
||||
`CSR_INSTRET_H : read_data_r = 32'(csr_instret[`PERF_CTR_BITS-1:32]);
|
||||
|
||||
`CSR_MVENDORID : read_data_r = `VENDOR_ID;
|
||||
`CSR_MARCHID : read_data_r = `ARCHITECTURE_ID;
|
||||
|
|
|
@ -30,6 +30,8 @@
|
|||
|
||||
`define CSR_WIDTH 12
|
||||
|
||||
`define PERF_CTR_BITS 44
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_LUI 7'b0110111
|
||||
|
@ -244,7 +246,7 @@
|
|||
`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0)
|
||||
|
||||
// Block size in bytes
|
||||
`define ICACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE)
|
||||
`define ICACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE)
|
||||
|
||||
// Word size in bytes
|
||||
`define IWORD_SIZE 4
|
||||
|
@ -264,11 +266,11 @@
|
|||
// Core request tag bits
|
||||
`define ICORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICORE_TAG_ID_BITS)
|
||||
|
||||
// DRAM request data bits
|
||||
`define IDRAM_LINE_WIDTH (`ICACHE_LINE_SIZE * 8)
|
||||
// Memory request data bits
|
||||
`define IMEM_LINE_WIDTH (`ICACHE_LINE_SIZE * 8)
|
||||
|
||||
// DRAM byte enable bits
|
||||
`define IDRAM_BYTEEN_WIDTH `ICACHE_LINE_SIZE
|
||||
// Memory byte enable bits
|
||||
`define IMEM_BYTEEN_WIDTH `ICACHE_LINE_SIZE
|
||||
|
||||
////////////////////////// Dcache Configurable Knobs //////////////////////////
|
||||
|
||||
|
@ -276,7 +278,7 @@
|
|||
`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1)
|
||||
|
||||
// Block size in bytes
|
||||
`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE)
|
||||
`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE)
|
||||
|
||||
// Word size in bytes
|
||||
`define DWORD_SIZE 4
|
||||
|
@ -299,14 +301,14 @@
|
|||
// DRAM request data bits
|
||||
`define DDRAM_LINE_WIDTH (`DCACHE_LINE_SIZE * 8)
// Memory request data bits: same width under the MEM_* naming that the
// VX_cache_mem_req_if / VX_cache_mem_rsp_if / VX_mem_arb instantiations use.
// The DRAM-named macro above is kept so existing references keep working.
`define DMEM_LINE_WIDTH `DDRAM_LINE_WIDTH
|
||||
|
||||
// DRAM request address bits
|
||||
`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
|
||||
// Memory request address bits
|
||||
`define DMEM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
|
||||
|
||||
// DRAM byte enable bits
|
||||
`define DDRAM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
|
||||
// Memory byte enable bits
|
||||
`define DMEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
|
||||
|
||||
// DRAM request tag bits
|
||||
`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH
|
||||
// Memory request tag bits
|
||||
`define DMEM_TAG_WIDTH `DMEM_ADDR_WIDTH
|
||||
|
||||
// Core request size
|
||||
`define DNUM_REQUESTS `NUM_THREADS
|
||||
|
@ -334,7 +336,7 @@
|
|||
`define L2CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID)
|
||||
|
||||
// Block size in bytes
|
||||
`define L2CACHE_LINE_SIZE `GLOBAL_BLOCK_SIZE
|
||||
`define L2CACHE_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
|
||||
// Word size in bytes
|
||||
`define L2WORD_SIZE `DCACHE_LINE_SIZE
|
||||
|
@ -342,17 +344,17 @@
|
|||
// Core request tag bits
|
||||
`define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES))
|
||||
|
||||
// DRAM request data bits
|
||||
`define L2DRAM_LINE_WIDTH (`L2CACHE_LINE_SIZE * 8)
|
||||
// Memory request data bits
|
||||
`define L2MEM_LINE_WIDTH (`L2CACHE_LINE_SIZE * 8)
|
||||
|
||||
// DRAM request address bits
|
||||
`define L2DRAM_ADDR_WIDTH (32 - `CLOG2(`L2CACHE_LINE_SIZE))
|
||||
// Memory request address bits
|
||||
`define L2MEM_ADDR_WIDTH (32 - `CLOG2(`L2CACHE_LINE_SIZE))
|
||||
|
||||
// DRAM byte enable bits
|
||||
`define L2DRAM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE
|
||||
// Memory byte enable bits
|
||||
`define L2MEM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE
|
||||
|
||||
// DRAM request tag bits
|
||||
`define L2DRAM_TAG_WIDTH (`L2_ENABLE ? `L2DRAM_ADDR_WIDTH : (`XDRAM_TAG_WIDTH+`CLOG2(`NUM_CORES)))
|
||||
// Memory request tag bits
|
||||
`define L2MEM_TAG_WIDTH (`L2_ENABLE ? `L2MEM_ADDR_WIDTH : (`XMEM_TAG_WIDTH+`CLOG2(`NUM_CORES)))
|
||||
|
||||
////////////////////////// L3cache Configurable Knobs /////////////////////////
|
||||
|
||||
|
@ -360,7 +362,7 @@
|
|||
`define L3CACHE_ID 0
|
||||
|
||||
// Block size in bytes
|
||||
`define L3CACHE_LINE_SIZE `GLOBAL_BLOCK_SIZE
|
||||
`define L3CACHE_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
|
||||
// Word size in bytes
|
||||
`define L3WORD_SIZE `L2CACHE_LINE_SIZE
|
||||
|
@ -368,30 +370,30 @@
|
|||
// Core request tag bits
|
||||
`define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS))
|
||||
|
||||
// DRAM request data bits
|
||||
`define L3DRAM_LINE_WIDTH (`L3CACHE_LINE_SIZE * 8)
|
||||
// Memory request data bits
|
||||
`define L3MEM_LINE_WIDTH (`L3CACHE_LINE_SIZE * 8)
|
||||
|
||||
// DRAM request address bits
|
||||
`define L3DRAM_ADDR_WIDTH (32 - `CLOG2(`L3CACHE_LINE_SIZE))
|
||||
// Memory request address bits
|
||||
`define L3MEM_ADDR_WIDTH (32 - `CLOG2(`L3CACHE_LINE_SIZE))
|
||||
|
||||
// DRAM byte enable bits
|
||||
`define L3DRAM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE
|
||||
// Memory byte enable bits
|
||||
`define L3MEM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE
|
||||
|
||||
// DRAM request tag bits
|
||||
`define L3DRAM_TAG_WIDTH (`L3_ENABLE ? `L3DRAM_ADDR_WIDTH : (`L2DRAM_TAG_WIDTH+`CLOG2(`NUM_CLUSTERS)))
|
||||
// Memory request tag bits
|
||||
`define L3MEM_TAG_WIDTH (`L3_ENABLE ? `L3MEM_ADDR_WIDTH : (`L2MEM_TAG_WIDTH+`CLOG2(`NUM_CLUSTERS)))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define VX_DRAM_BYTEEN_WIDTH `L3DRAM_BYTEEN_WIDTH
|
||||
`define VX_DRAM_ADDR_WIDTH `L3DRAM_ADDR_WIDTH
|
||||
`define VX_DRAM_LINE_WIDTH `L3DRAM_LINE_WIDTH
|
||||
`define VX_DRAM_TAG_WIDTH `L3DRAM_TAG_WIDTH
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3MEM_BYTEEN_WIDTH
|
||||
`define VX_MEM_ADDR_WIDTH `L3MEM_ADDR_WIDTH
|
||||
`define VX_MEM_LINE_WIDTH `L3MEM_LINE_WIDTH
|
||||
`define VX_MEM_TAG_WIDTH `L3MEM_TAG_WIDTH
|
||||
`define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH
|
||||
`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)
|
||||
|
||||
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}
|
||||
|
||||
`define XDRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH+`CLOG2(2))
|
||||
`define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH+`CLOG2(2))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
|
@ -7,7 +7,6 @@ module VX_ibuffer #(
|
|||
input wire reset,
|
||||
|
||||
// inputs
|
||||
input wire freeze, // keep current warp
|
||||
VX_decode_if ibuf_enq_if,
|
||||
|
||||
// outputs
|
||||
|
@ -117,18 +116,9 @@ module VX_ibuffer #(
|
|||
deq_valid_n = 0;
|
||||
deq_wid_n = 'x;
|
||||
deq_instr_n = 'x;
|
||||
schedule_table_n = 'x;
|
||||
|
||||
if ((0 == num_warps)
|
||||
|| (1 == num_warps && deq_fire && q_alm_empty[deq_wid])) begin
|
||||
deq_valid_n = enq_fire;
|
||||
deq_wid_n = ibuf_enq_if.wid;
|
||||
deq_instr_n = q_data_in;
|
||||
end else if ((1 == num_warps) || freeze) begin
|
||||
deq_valid_n = 1;
|
||||
deq_wid_n = deq_wid;
|
||||
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
|
||||
end else begin
|
||||
schedule_table_n = 'x;
|
||||
|
||||
if (num_warps > 1) begin
|
||||
deq_valid_n = (| schedule_table);
|
||||
schedule_table_n = schedule_table;
|
||||
for (integer i = 0; i < `NUM_WARPS; i++) begin
|
||||
|
@ -139,6 +129,14 @@ module VX_ibuffer #(
|
|||
break;
|
||||
end
|
||||
end
|
||||
end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin
|
||||
deq_valid_n = 1;
|
||||
deq_wid_n = deq_wid;
|
||||
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
|
||||
end else begin
|
||||
deq_valid_n = enq_fire;
|
||||
deq_wid_n = ibuf_enq_if.wid;
|
||||
deq_instr_n = q_data_in;
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -33,7 +33,6 @@ module VX_issue #(
|
|||
) ibuffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.freeze (1'b0),
|
||||
.ibuf_enq_if (decode_if),
|
||||
.ibuf_deq_if (ibuf_deq_if)
|
||||
);
|
||||
|
@ -121,14 +120,14 @@ module VX_issue #(
|
|||
`SCOPE_ASSIGN (writeback_eop, writeback_if.eop);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [43:0] perf_ibf_stalls;
|
||||
reg [43:0] perf_scb_stalls;
|
||||
reg [43:0] perf_alu_stalls;
|
||||
reg [43:0] perf_lsu_stalls;
|
||||
reg [43:0] perf_csr_stalls;
|
||||
reg [43:0] perf_gpu_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_alu_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_lsu_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_csr_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_gpu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
reg [43:0] perf_fpu_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_fpu_stalls;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -144,26 +143,26 @@ module VX_issue #(
|
|||
`endif
|
||||
end else begin
|
||||
if (decode_if.valid & !decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + 44'd1;
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (ibuf_deq_if.valid & scoreboard_delay) begin
|
||||
perf_scb_stalls <= perf_scb_stalls + 44'd1;
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (alu_req_if.valid & !alu_req_if.ready) begin
|
||||
perf_alu_stalls <= perf_alu_stalls + 44'd1;
|
||||
perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (lsu_req_if.valid & !lsu_req_if.ready) begin
|
||||
perf_lsu_stalls <= perf_lsu_stalls + 44'd1;
|
||||
perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (csr_req_if.valid & !csr_req_if.ready) begin
|
||||
perf_csr_stalls <= perf_csr_stalls + 44'd1;
|
||||
perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (gpu_req_if.valid & !gpu_req_if.ready) begin
|
||||
perf_gpu_stalls <= perf_gpu_stalls + 44'd1;
|
||||
perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_req_if.valid & !fpu_req_if.ready) begin
|
||||
perf_fpu_stalls <= perf_fpu_stalls + 44'd1;
|
||||
perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
|
|
@ -44,10 +44,6 @@ module VX_lsu_unit #(
|
|||
end
|
||||
wire is_dup_load = lsu_req_if.wb && lsu_req_if.tmask[0] && (& addr_matches);
|
||||
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
reg [`LSUQ_SIZE-1:0][`LSUQ_ADDR_BITS-1:0] pending_tags;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
wire ready_in;
|
||||
wire stall_in = ~ready_in && req_valid;
|
||||
|
||||
|
@ -79,7 +75,7 @@ module VX_lsu_unit #(
|
|||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
||||
reg [`NUM_THREADS-1:0] req_sent_mask;
|
||||
wire sent_all_ready;
|
||||
wire req_ready_all;
|
||||
|
||||
wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr;
|
||||
wire mbuf_full;
|
||||
|
@ -118,13 +114,7 @@ module VX_lsu_unit #(
|
|||
`UNUSED_PIN (empty)
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (mbuf_push) begin
|
||||
pending_tags[mbuf_waddr] <= req_tag;
|
||||
end
|
||||
end
|
||||
|
||||
assign sent_all_ready = &(dcache_req_if.ready | req_sent_mask);
|
||||
assign req_ready_all = &(dcache_req_if.ready | req_sent_mask | ~req_tmask);
|
||||
|
||||
wire [`NUM_THREADS-1:0] req_sent_dup = {{(`NUM_THREADS-1){dcache_req_fire[0] && req_is_dup}}, 1'b0};
|
||||
|
||||
|
@ -132,19 +122,22 @@ module VX_lsu_unit #(
|
|||
if (reset) begin
|
||||
req_sent_mask <= 0;
|
||||
end else begin
|
||||
if (sent_all_ready)
|
||||
if (req_ready_all)
|
||||
req_sent_mask <= 0;
|
||||
else
|
||||
req_sent_mask <= req_sent_mask | dcache_req_fire | req_sent_dup;
|
||||
end
|
||||
end
|
||||
|
||||
wire is_req_start = (0 == req_sent_mask);
|
||||
|
||||
// need to hold the acquired tag index until the full request is submitted
|
||||
reg [`LSUQ_ADDR_BITS-1:0] req_tag_hold;
|
||||
wire [`LSUQ_ADDR_BITS-1:0] req_tag = (0 == req_sent_mask) ? mbuf_waddr : req_tag_hold;
|
||||
reg [`DCORE_TAG_ID_BITS-1:0] req_tag_hold;
|
||||
wire [`DCORE_TAG_ID_BITS-1:0] req_tag = is_req_start ? mbuf_waddr : req_tag_hold;
|
||||
always @(posedge clk) begin
|
||||
if (mbuf_push)
|
||||
if (mbuf_push) begin
|
||||
req_tag_hold <= mbuf_waddr;
|
||||
end
|
||||
end
|
||||
|
||||
wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1};
|
||||
|
@ -160,7 +153,8 @@ module VX_lsu_unit #(
|
|||
end
|
||||
end
|
||||
|
||||
wire req_ready_dep = (req_wb && ~mbuf_full)
|
||||
// ensure all dependencies for the requests are resolved
|
||||
wire req_dep_ready = (req_wb && (~mbuf_full || ~is_req_start))
|
||||
|| (~req_wb && st_commit_if.ready);
|
||||
|
||||
// DCache Request
|
||||
|
@ -193,7 +187,7 @@ module VX_lsu_unit #(
|
|||
end
|
||||
end
|
||||
|
||||
assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask_dup & ~req_sent_mask;
|
||||
assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_dep_ready}} & req_tmask_dup & ~req_sent_mask;
|
||||
assign dcache_req_if.rw = {`NUM_THREADS{~req_wb}};
|
||||
assign dcache_req_if.addr = mem_req_addr;
|
||||
assign dcache_req_if.byteen = mem_req_byteen;
|
||||
|
@ -205,11 +199,11 @@ module VX_lsu_unit #(
|
|||
assign dcache_req_if.tag = {`NUM_THREADS{req_tag}};
|
||||
`endif
|
||||
|
||||
assign ready_in = req_ready_dep && sent_all_ready;
|
||||
assign ready_in = req_dep_ready && req_ready_all;
|
||||
|
||||
// send store commit
|
||||
|
||||
wire is_store_rsp = req_valid && ~req_wb && sent_all_ready;
|
||||
wire is_store_rsp = req_valid && ~req_wb && req_ready_all;
|
||||
|
||||
assign st_commit_if.valid = is_store_rsp;
|
||||
assign st_commit_if.wid = req_wid;
|
||||
|
@ -280,23 +274,46 @@ module VX_lsu_unit #(
|
|||
`SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr);
|
||||
|
||||
`ifdef DBG_PRINT_CORE_DCACHE
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
reg [`LSUQ_SIZE-1:0][`DCORE_TAG_WIDTH:0] pending_reqs;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
pending_reqs <= '0;
|
||||
end else if (mbuf_push) begin
|
||||
pending_reqs[mbuf_waddr] <= {dcache_req_if.tag[0], 1'b1};
|
||||
end else if (mbuf_pop) begin
|
||||
pending_reqs[mbuf_raddr] <= '0;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if ((| dcache_req_fire)) begin
|
||||
if ((| dcache_req_if.rw))
|
||||
$display("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, data=%0h",
|
||||
$time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data);
|
||||
else
|
||||
$display("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, rd=%0d, is_dup=%b",
|
||||
$time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, req_rd, req_is_dup);
|
||||
if (dcache_req_if.rw[0]) begin
|
||||
$write("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
|
||||
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
$write(", tag=%0h, byteen=%0h, data=", dcache_req_if.tag[0], dcache_req_if.byteen);
|
||||
`PRINT_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
|
||||
$write("\n");
|
||||
end else begin
|
||||
$write("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
|
||||
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
$write(", tag=%0h, byteen=%0h, rd=%0d, is_dup=%b\n", dcache_req_if.tag[0], dcache_req_if.byteen, req_rd, req_is_dup);
|
||||
end
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
$display("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h, is_dup=%b",
|
||||
$time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd, dcache_rsp_if.data, rsp_is_dup);
|
||||
$write("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=",
|
||||
$time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd);
|
||||
`PRINT_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
|
||||
$write(", is_dup=%b\n", rsp_is_dup);
|
||||
end
|
||||
if (mbuf_full) begin
|
||||
$write("%t: D$%0d queue-full:", $time, CORE_ID);
|
||||
$write("%t: *** D$%0d queue-full:", $time, CORE_ID);
|
||||
for (integer j = 0; j < `LSUQ_SIZE; j++) begin
|
||||
$write(" tag%0d=%0h", j, pending_tags[j]);
|
||||
if (pending_reqs[j][0]) begin
|
||||
$write(" %0d->%0h", j, pending_reqs[j][1 +: `DCORE_TAG_WIDTH]);
|
||||
end
|
||||
end
|
||||
$write("\n");
|
||||
end
|
||||
|
|
|
@ -20,25 +20,25 @@ module VX_mem_unit # (
|
|||
VX_icache_core_req_if icache_core_req_if,
|
||||
VX_icache_core_rsp_if icache_core_rsp_if,
|
||||
|
||||
// DRAM
|
||||
VX_cache_dram_req_if dram_req_if,
|
||||
VX_cache_dram_rsp_if dram_rsp_if
|
||||
// Memory
|
||||
VX_cache_mem_req_if mem_req_if,
|
||||
VX_cache_mem_rsp_if mem_rsp_if
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if();
|
||||
`endif
|
||||
|
||||
VX_cache_dram_req_if #(
|
||||
.DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH),
|
||||
.DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH),
|
||||
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
|
||||
) dcache_dram_req_if(), icache_dram_req_if();
|
||||
VX_cache_mem_req_if #(
|
||||
.MEM_LINE_WIDTH (`DMEM_LINE_WIDTH),
|
||||
.MEM_ADDR_WIDTH (`DMEM_ADDR_WIDTH),
|
||||
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
|
||||
) dcache_mem_req_if(), icache_mem_req_if();
|
||||
|
||||
VX_cache_dram_rsp_if #(
|
||||
.DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH),
|
||||
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
|
||||
) dcache_dram_rsp_if(), icache_dram_rsp_if();
|
||||
VX_cache_mem_rsp_if #(
|
||||
.MEM_LINE_WIDTH (`DMEM_LINE_WIDTH),
|
||||
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
|
||||
) dcache_mem_rsp_if(), icache_mem_rsp_if();
|
||||
|
||||
VX_dcache_core_req_if #(
|
||||
.LANES (`DNUM_REQUESTS),
|
||||
|
@ -96,12 +96,12 @@ module VX_mem_unit # (
|
|||
.NUM_REQS (1),
|
||||
.CREQ_SIZE (`ICREQ_SIZE),
|
||||
.MSHR_SIZE (`IMSHR_SIZE),
|
||||
.DRSQ_SIZE (`IDRSQ_SIZE),
|
||||
.DREQ_SIZE (`IDREQ_SIZE),
|
||||
.MRSQ_SIZE (`IMRSQ_SIZE),
|
||||
.MREQ_SIZE (`IMREQ_SIZE),
|
||||
.WRITE_ENABLE (0),
|
||||
.CORE_TAG_WIDTH (`ICORE_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS),
|
||||
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
|
||||
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
|
||||
) icache (
|
||||
`SCOPE_BIND_VX_mem_unit_icache
|
||||
|
||||
|
@ -129,20 +129,20 @@ module VX_mem_unit # (
|
|||
.perf_cache_if (perf_icache_if),
|
||||
`endif
|
||||
|
||||
// DRAM Req
|
||||
.dram_req_valid (icache_dram_req_if.valid),
|
||||
.dram_req_rw (icache_dram_req_if.rw),
|
||||
.dram_req_byteen (icache_dram_req_if.byteen),
|
||||
.dram_req_addr (icache_dram_req_if.addr),
|
||||
.dram_req_data (icache_dram_req_if.data),
|
||||
.dram_req_tag (icache_dram_req_if.tag),
|
||||
.dram_req_ready (icache_dram_req_if.ready),
|
||||
// Memory Request
|
||||
.mem_req_valid (icache_mem_req_if.valid),
|
||||
.mem_req_rw (icache_mem_req_if.rw),
|
||||
.mem_req_byteen (icache_mem_req_if.byteen),
|
||||
.mem_req_addr (icache_mem_req_if.addr),
|
||||
.mem_req_data (icache_mem_req_if.data),
|
||||
.mem_req_tag (icache_mem_req_if.tag),
|
||||
.mem_req_ready (icache_mem_req_if.ready),
|
||||
|
||||
// DRAM response
|
||||
.dram_rsp_valid (icache_dram_rsp_if.valid),
|
||||
.dram_rsp_data (icache_dram_rsp_if.data),
|
||||
.dram_rsp_tag (icache_dram_rsp_if.tag),
|
||||
.dram_rsp_ready (icache_dram_rsp_if.ready)
|
||||
// Memory response
|
||||
.mem_rsp_valid (icache_mem_rsp_if.valid),
|
||||
.mem_rsp_data (icache_mem_rsp_if.data),
|
||||
.mem_rsp_tag (icache_mem_rsp_if.tag),
|
||||
.mem_rsp_ready (icache_mem_rsp_if.ready)
|
||||
);
|
||||
|
||||
VX_cache #(
|
||||
|
@ -155,12 +155,12 @@ module VX_mem_unit # (
|
|||
.NUM_REQS (`DNUM_REQUESTS),
|
||||
.CREQ_SIZE (`DCREQ_SIZE),
|
||||
.MSHR_SIZE (`DMSHR_SIZE),
|
||||
.DRSQ_SIZE (`DDRSQ_SIZE),
|
||||
.DREQ_SIZE (`DDREQ_SIZE),
|
||||
.MRSQ_SIZE (`DMRSQ_SIZE),
|
||||
.MREQ_SIZE (`DMREQ_SIZE),
|
||||
.WRITE_ENABLE (1),
|
||||
.CORE_TAG_WIDTH (`DCORE_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS),
|
||||
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
|
||||
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
|
||||
) dcache (
|
||||
`SCOPE_BIND_VX_mem_unit_dcache
|
||||
|
||||
|
@ -188,20 +188,20 @@ module VX_mem_unit # (
|
|||
.perf_cache_if (perf_dcache_if),
|
||||
`endif
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (dcache_dram_req_if.valid),
|
||||
.dram_req_rw (dcache_dram_req_if.rw),
|
||||
.dram_req_byteen (dcache_dram_req_if.byteen),
|
||||
.dram_req_addr (dcache_dram_req_if.addr),
|
||||
.dram_req_data (dcache_dram_req_if.data),
|
||||
.dram_req_tag (dcache_dram_req_if.tag),
|
||||
.dram_req_ready (dcache_dram_req_if.ready),
|
||||
// Memory request
|
||||
.mem_req_valid (dcache_mem_req_if.valid),
|
||||
.mem_req_rw (dcache_mem_req_if.rw),
|
||||
.mem_req_byteen (dcache_mem_req_if.byteen),
|
||||
.mem_req_addr (dcache_mem_req_if.addr),
|
||||
.mem_req_data (dcache_mem_req_if.data),
|
||||
.mem_req_tag (dcache_mem_req_if.tag),
|
||||
.mem_req_ready (dcache_mem_req_if.ready),
|
||||
|
||||
// DRAM response
|
||||
.dram_rsp_valid (dcache_dram_rsp_if.valid),
|
||||
.dram_rsp_data (dcache_dram_rsp_if.data),
|
||||
.dram_rsp_tag (dcache_dram_rsp_if.tag),
|
||||
.dram_rsp_ready (dcache_dram_rsp_if.ready)
|
||||
// Memory response
|
||||
.mem_rsp_valid (dcache_mem_rsp_if.valid),
|
||||
.mem_rsp_data (dcache_mem_rsp_if.data),
|
||||
.mem_rsp_tag (dcache_mem_rsp_if.tag),
|
||||
.mem_rsp_ready (dcache_mem_rsp_if.ready)
|
||||
);
|
||||
|
||||
if (`SM_ENABLE) begin
|
||||
|
@ -252,45 +252,45 @@ module VX_mem_unit # (
|
|||
|
||||
VX_mem_arb #(
|
||||
.NUM_REQS (2),
|
||||
.DATA_WIDTH (`DDRAM_LINE_WIDTH),
|
||||
.ADDR_WIDTH (`DDRAM_ADDR_WIDTH),
|
||||
.TAG_IN_WIDTH (`DDRAM_TAG_WIDTH),
|
||||
.TAG_OUT_WIDTH (`XDRAM_TAG_WIDTH),
|
||||
.DATA_WIDTH (`DMEM_LINE_WIDTH),
|
||||
.ADDR_WIDTH (`DMEM_ADDR_WIDTH),
|
||||
.TAG_IN_WIDTH (`DMEM_TAG_WIDTH),
|
||||
.TAG_OUT_WIDTH (`XMEM_TAG_WIDTH),
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (0)
|
||||
) dram_arb (
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Source request
|
||||
.req_valid_in ({dcache_dram_req_if.valid, icache_dram_req_if.valid}),
|
||||
.req_rw_in ({dcache_dram_req_if.rw, icache_dram_req_if.rw}),
|
||||
.req_byteen_in ({dcache_dram_req_if.byteen, icache_dram_req_if.byteen}),
|
||||
.req_addr_in ({dcache_dram_req_if.addr, icache_dram_req_if.addr}),
|
||||
.req_data_in ({dcache_dram_req_if.data, icache_dram_req_if.data}),
|
||||
.req_tag_in ({dcache_dram_req_if.tag, icache_dram_req_if.tag}),
|
||||
.req_ready_in ({dcache_dram_req_if.ready, icache_dram_req_if.ready}),
|
||||
.req_valid_in ({dcache_mem_req_if.valid, icache_mem_req_if.valid}),
|
||||
.req_rw_in ({dcache_mem_req_if.rw, icache_mem_req_if.rw}),
|
||||
.req_byteen_in ({dcache_mem_req_if.byteen, icache_mem_req_if.byteen}),
|
||||
.req_addr_in ({dcache_mem_req_if.addr, icache_mem_req_if.addr}),
|
||||
.req_data_in ({dcache_mem_req_if.data, icache_mem_req_if.data}),
|
||||
.req_tag_in ({dcache_mem_req_if.tag, icache_mem_req_if.tag}),
|
||||
.req_ready_in ({dcache_mem_req_if.ready, icache_mem_req_if.ready}),
|
||||
|
||||
// DRAM request
|
||||
.req_valid_out (dram_req_if.valid),
|
||||
.req_rw_out (dram_req_if.rw),
|
||||
.req_byteen_out (dram_req_if.byteen),
|
||||
.req_addr_out (dram_req_if.addr),
|
||||
.req_data_out (dram_req_if.data),
|
||||
.req_tag_out (dram_req_if.tag),
|
||||
.req_ready_out (dram_req_if.ready),
|
||||
// Memory request
|
||||
.req_valid_out (mem_req_if.valid),
|
||||
.req_rw_out (mem_req_if.rw),
|
||||
.req_byteen_out (mem_req_if.byteen),
|
||||
.req_addr_out (mem_req_if.addr),
|
||||
.req_data_out (mem_req_if.data),
|
||||
.req_tag_out (mem_req_if.tag),
|
||||
.req_ready_out (mem_req_if.ready),
|
||||
|
||||
// Source response
|
||||
.rsp_valid_out ({dcache_dram_rsp_if.valid, icache_dram_rsp_if.valid}),
|
||||
.rsp_data_out ({dcache_dram_rsp_if.data, icache_dram_rsp_if.data}),
|
||||
.rsp_tag_out ({dcache_dram_rsp_if.tag, icache_dram_rsp_if.tag}),
|
||||
.rsp_ready_out ({dcache_dram_rsp_if.ready, icache_dram_rsp_if.ready}),
|
||||
.rsp_valid_out ({dcache_mem_rsp_if.valid, icache_mem_rsp_if.valid}),
|
||||
.rsp_data_out ({dcache_mem_rsp_if.data, icache_mem_rsp_if.data}),
|
||||
.rsp_tag_out ({dcache_mem_rsp_if.tag, icache_mem_rsp_if.tag}),
|
||||
.rsp_ready_out ({dcache_mem_rsp_if.ready, icache_mem_rsp_if.ready}),
|
||||
|
||||
// DRAM response
|
||||
.rsp_valid_in (dram_rsp_if.valid),
|
||||
.rsp_tag_in (dram_rsp_if.tag),
|
||||
.rsp_data_in (dram_rsp_if.data),
|
||||
.rsp_ready_in (dram_rsp_if.ready)
|
||||
// Memory response
|
||||
.rsp_valid_in (mem_rsp_if.valid),
|
||||
.rsp_tag_in (mem_rsp_if.tag),
|
||||
.rsp_data_in (mem_rsp_if.data),
|
||||
.rsp_ready_in (mem_rsp_if.ready)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
@ -319,47 +319,47 @@ end else begin
|
|||
assign perf_memsys_if.smem_bank_stalls = 0;
|
||||
end
|
||||
|
||||
reg [43:0] perf_dram_lat_per_cycle;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_dram_lat_per_cycle <= 0;
|
||||
perf_mem_lat_per_cycle <= 0;
|
||||
end else begin
|
||||
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle +
|
||||
44'($signed(2'((dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) && !(dram_rsp_if.valid && dram_rsp_if.ready)) -
|
||||
2'((dram_rsp_if.valid && dram_rsp_if.ready) && !(dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready))));
|
||||
perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready))));
|
||||
end
|
||||
end
|
||||
|
||||
reg [43:0] perf_dram_reads;
|
||||
reg [43:0] perf_dram_writes;
|
||||
reg [43:0] perf_dram_lat;
|
||||
reg [43:0] perf_dram_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_dram_reads <= 0;
|
||||
perf_dram_writes <= 0;
|
||||
perf_dram_lat <= 0;
|
||||
perf_dram_stalls <= 0;
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_writes <= 0;
|
||||
perf_mem_lat <= 0;
|
||||
perf_mem_stalls <= 0;
|
||||
end else begin
|
||||
if (dram_req_if.valid && dram_req_if.ready && !dram_req_if.rw) begin
|
||||
perf_dram_reads <= perf_dram_reads + 44'd1;
|
||||
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (dram_req_if.valid && dram_req_if.ready && dram_req_if.rw) begin
|
||||
perf_dram_writes <= perf_dram_writes + 44'd1;
|
||||
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
|
||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (dram_req_if.valid && !dram_req_if.ready) begin
|
||||
perf_dram_stalls <= perf_dram_stalls + 44'd1;
|
||||
if (mem_req_if.valid && !mem_req_if.ready) begin
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle;
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle;
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_memsys_if.dram_reads = perf_dram_reads;
|
||||
assign perf_memsys_if.dram_writes = perf_dram_writes;
|
||||
assign perf_memsys_if.dram_latency = perf_dram_lat;
|
||||
assign perf_memsys_if.dram_stalls = perf_dram_stalls;
|
||||
assign perf_memsys_if.mem_reads = perf_mem_reads;
|
||||
assign perf_memsys_if.mem_writes = perf_mem_writes;
|
||||
assign perf_memsys_if.mem_latency = perf_mem_lat;
|
||||
assign perf_memsys_if.mem_stalls = perf_mem_stalls;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -70,6 +70,8 @@
|
|||
// Number of bits needed to index x entries, never less than 1: $clog2(x) when x > 1, otherwise 1.
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
|
||||
// True when x is non-zero and has exactly one bit set (x & (x - 1) clears the lowest set bit).
`define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
|
||||
|
||||
// Absolute value of x interpreted as a signed quantity.
// The expansion is a pure expression (no trailing ';') so it can be used inside
// larger expressions; the argument is parenthesized everywhere and is evaluated
// more than once, so pass side-effect-free expressions only.
`define ABS(x) (($signed(x) < 0) ? (-$signed(x)) : (x))
|
||||
|
||||
// Smaller of x and y. Arguments are fully parenthesized so compound expressions
// (e.g. `MIN(a | b, c)) compare as written; each argument may be evaluated twice.
`define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
// Larger of x and y. Arguments are fully parenthesized so compound expressions
// (e.g. `MAX(a | b, c)) compare as written; each argument may be evaluated twice.
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ module VX_scoreboard #(
|
|||
if (release_reg) begin
|
||||
inuse_regs[writeback_if.wid][writeback_if.rd] <= 0;
|
||||
assert(inuse_regs[writeback_if.wid][writeback_if.rd] != 0)
|
||||
else $error("*** %t: core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
|
||||
else $error("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
|
||||
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd);
|
||||
end
|
||||
end
|
||||
|
@ -40,7 +40,7 @@ module VX_scoreboard #(
|
|||
`ifdef DBG_PRINT_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin
|
||||
$display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
|
||||
$display("%t: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
|
||||
$time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb,
|
||||
deq_inuse_regs[ibuf_deq_if.rd], deq_inuse_regs[ibuf_deq_if.rs1], deq_inuse_regs[ibuf_deq_if.rs2], deq_inuse_regs[ibuf_deq_if.rs3]);
|
||||
end
|
||||
|
@ -54,7 +54,7 @@ module VX_scoreboard #(
|
|||
deadlock_ctr <= 0;
|
||||
end else if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin
|
||||
deadlock_ctr <= deadlock_ctr + 1;
|
||||
assert(deadlock_ctr < deadlock_timeout) else $error("*** %t: core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
|
||||
assert(deadlock_ctr < deadlock_timeout) else $error("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
|
||||
$time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb,
|
||||
deq_inuse_regs[ibuf_deq_if.rd], deq_inuse_regs[ibuf_deq_if.rs1], deq_inuse_regs[ibuf_deq_if.rs2], deq_inuse_regs[ibuf_deq_if.rs3]);
|
||||
end else if (ibuf_deq_if.valid && ibuf_deq_if.ready) begin
|
||||
|
|
|
@ -34,7 +34,7 @@ module VX_smem_arb (
|
|||
wire is_smem_addr_in, is_smem_addr_out;
|
||||
|
||||
// select shared memory bus
|
||||
assign is_smem_addr_in = core_req_if.valid[i] && `SM_ENABLE
|
||||
assign is_smem_addr_in = `SM_ENABLE
|
||||
&& (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT))
|
||||
&& (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT));
|
||||
|
||||
|
@ -51,13 +51,13 @@ module VX_smem_arb (
|
|||
.ready_out (cache_req_ready_out)
|
||||
);
|
||||
|
||||
if (`SM_ENABLE ) begin
|
||||
if (`SM_ENABLE) begin
|
||||
assign cache_req_if.valid[i] = cache_req_valid_out && ~is_smem_addr_out;
|
||||
assign smem_req_if.valid[i] = cache_req_valid_out && is_smem_addr_out;
|
||||
assign cache_req_ready_out = is_smem_addr_out ? smem_req_if.ready[i] : cache_req_if.ready[i];
|
||||
|
||||
assign smem_req_if.addr[i] = cache_req_if.addr[i];
|
||||
assign smem_req_if.rw[i] = cache_req_if.rw[i];
|
||||
assign smem_req_if.rw[i] = cache_req_if.rw[i];
|
||||
assign smem_req_if.byteen[i] = cache_req_if.byteen[i];
|
||||
assign smem_req_if.data[i] = cache_req_if.data[i];
|
||||
assign smem_req_if.tag[i] = cache_req_if.tag[i];
|
||||
|
|
220
hw/rtl/Vortex.v
220
hw/rtl/Vortex.v
|
@ -7,20 +7,20 @@ module Vortex (
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// DRAM request
|
||||
output wire dram_req_valid,
|
||||
output wire dram_req_rw,
|
||||
output wire [`VX_DRAM_BYTEEN_WIDTH-1:0] dram_req_byteen,
|
||||
output wire [`VX_DRAM_ADDR_WIDTH-1:0] dram_req_addr,
|
||||
output wire [`VX_DRAM_LINE_WIDTH-1:0] dram_req_data,
|
||||
output wire [`VX_DRAM_TAG_WIDTH-1:0] dram_req_tag,
|
||||
input wire dram_req_ready,
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`VX_MEM_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// DRAM response
|
||||
input wire dram_rsp_valid,
|
||||
input wire [`VX_DRAM_LINE_WIDTH-1:0] dram_rsp_data,
|
||||
input wire [`VX_DRAM_TAG_WIDTH-1:0] dram_rsp_tag,
|
||||
output wire dram_rsp_ready,
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`VX_MEM_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// CSR Request
|
||||
input wire csr_req_valid,
|
||||
|
@ -40,18 +40,18 @@ module Vortex (
|
|||
output wire ebreak
|
||||
);
|
||||
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] per_cluster_dram_req_byteen;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] per_cluster_dram_req_addr;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_req_data;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_req_tag;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_ready;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2MEM_LINE_WIDTH-1:0] per_cluster_mem_req_data;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_ready;
|
||||
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_valid;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_rsp_data;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_rsp_tag;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_valid;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2MEM_LINE_WIDTH-1:0] per_cluster_mem_rsp_data;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready;
|
||||
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_csr_req_valid;
|
||||
wire [`NUM_CLUSTERS-1:0][11:0] per_cluster_csr_req_addr;
|
||||
|
@ -88,18 +88,18 @@ module Vortex (
|
|||
.clk (clk),
|
||||
.reset (cluster_reset),
|
||||
|
||||
.dram_req_valid (per_cluster_dram_req_valid [i]),
|
||||
.dram_req_rw (per_cluster_dram_req_rw [i]),
|
||||
.dram_req_byteen(per_cluster_dram_req_byteen[i]),
|
||||
.dram_req_addr (per_cluster_dram_req_addr [i]),
|
||||
.dram_req_data (per_cluster_dram_req_data [i]),
|
||||
.dram_req_tag (per_cluster_dram_req_tag [i]),
|
||||
.dram_req_ready (per_cluster_dram_req_ready [i]),
|
||||
.mem_req_valid (per_cluster_mem_req_valid [i]),
|
||||
.mem_req_rw (per_cluster_mem_req_rw [i]),
|
||||
.mem_req_byteen (per_cluster_mem_req_byteen[i]),
|
||||
.mem_req_addr (per_cluster_mem_req_addr [i]),
|
||||
.mem_req_data (per_cluster_mem_req_data [i]),
|
||||
.mem_req_tag (per_cluster_mem_req_tag [i]),
|
||||
.mem_req_ready (per_cluster_mem_req_ready [i]),
|
||||
|
||||
.dram_rsp_valid (per_cluster_dram_rsp_valid [i]),
|
||||
.dram_rsp_data (per_cluster_dram_rsp_data [i]),
|
||||
.dram_rsp_tag (per_cluster_dram_rsp_tag [i]),
|
||||
.dram_rsp_ready (per_cluster_dram_rsp_ready [i]),
|
||||
.mem_rsp_valid (per_cluster_mem_rsp_valid [i]),
|
||||
.mem_rsp_data (per_cluster_mem_rsp_data [i]),
|
||||
.mem_rsp_tag (per_cluster_mem_rsp_tag [i]),
|
||||
.mem_rsp_ready (per_cluster_mem_rsp_ready [i]),
|
||||
|
||||
.csr_req_valid (per_cluster_csr_req_valid [i]),
|
||||
.csr_req_coreid (csr_core_id),
|
||||
|
@ -171,12 +171,12 @@ module Vortex (
|
|||
.NUM_REQS (`NUM_CLUSTERS),
|
||||
.CREQ_SIZE (`L3CREQ_SIZE),
|
||||
.MSHR_SIZE (`L3MSHR_SIZE),
|
||||
.DRSQ_SIZE (`L3DRSQ_SIZE),
|
||||
.DREQ_SIZE (`L3DREQ_SIZE),
|
||||
.MRSQ_SIZE (`L3MRSQ_SIZE),
|
||||
.MREQ_SIZE (`L3MREQ_SIZE),
|
||||
.WRITE_ENABLE (1),
|
||||
.CORE_TAG_WIDTH (`L2DRAM_TAG_WIDTH),
|
||||
.CORE_TAG_WIDTH (`L2MEM_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (0),
|
||||
.DRAM_TAG_WIDTH (`L3DRAM_TAG_WIDTH)
|
||||
.MEM_TAG_WIDTH (`L3MEM_TAG_WIDTH)
|
||||
) l3cache (
|
||||
`SCOPE_BIND_Vortex_l3cache
|
||||
|
||||
|
@ -190,105 +190,105 @@ module Vortex (
|
|||
`endif
|
||||
|
||||
// Core request
|
||||
.core_req_valid (per_cluster_dram_req_valid),
|
||||
.core_req_rw (per_cluster_dram_req_rw),
|
||||
.core_req_byteen (per_cluster_dram_req_byteen),
|
||||
.core_req_addr (per_cluster_dram_req_addr),
|
||||
.core_req_data (per_cluster_dram_req_data),
|
||||
.core_req_tag (per_cluster_dram_req_tag),
|
||||
.core_req_ready (per_cluster_dram_req_ready),
|
||||
.core_req_valid (per_cluster_mem_req_valid),
|
||||
.core_req_rw (per_cluster_mem_req_rw),
|
||||
.core_req_byteen (per_cluster_mem_req_byteen),
|
||||
.core_req_addr (per_cluster_mem_req_addr),
|
||||
.core_req_data (per_cluster_mem_req_data),
|
||||
.core_req_tag (per_cluster_mem_req_tag),
|
||||
.core_req_ready (per_cluster_mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.core_rsp_valid (per_cluster_dram_rsp_valid),
|
||||
.core_rsp_data (per_cluster_dram_rsp_data),
|
||||
.core_rsp_tag (per_cluster_dram_rsp_tag),
|
||||
.core_rsp_ready (per_cluster_dram_rsp_ready),
|
||||
.core_rsp_valid (per_cluster_mem_rsp_valid),
|
||||
.core_rsp_data (per_cluster_mem_rsp_data),
|
||||
.core_rsp_tag (per_cluster_mem_rsp_tag),
|
||||
.core_rsp_ready (per_cluster_mem_rsp_ready),
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (dram_req_valid),
|
||||
.dram_req_rw (dram_req_rw),
|
||||
.dram_req_byteen (dram_req_byteen),
|
||||
.dram_req_addr (dram_req_addr),
|
||||
.dram_req_data (dram_req_data),
|
||||
.dram_req_tag (dram_req_tag),
|
||||
.dram_req_ready (dram_req_ready),
|
||||
// Memory request
|
||||
.mem_req_valid (mem_req_valid),
|
||||
.mem_req_rw (mem_req_rw),
|
||||
.mem_req_byteen (mem_req_byteen),
|
||||
.mem_req_addr (mem_req_addr),
|
||||
.mem_req_data (mem_req_data),
|
||||
.mem_req_tag (mem_req_tag),
|
||||
.mem_req_ready (mem_req_ready),
|
||||
|
||||
// DRAM response
|
||||
.dram_rsp_valid (dram_rsp_valid),
|
||||
.dram_rsp_data (dram_rsp_data),
|
||||
.dram_rsp_tag (dram_rsp_tag),
|
||||
.dram_rsp_ready (dram_rsp_ready)
|
||||
// Memory response
|
||||
.mem_rsp_valid (mem_rsp_valid),
|
||||
.mem_rsp_data (mem_rsp_data),
|
||||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_ready (mem_rsp_ready)
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_REQS (`NUM_CLUSTERS),
|
||||
.DATA_WIDTH (`L3DRAM_LINE_WIDTH),
|
||||
.TAG_IN_WIDTH (`L2DRAM_TAG_WIDTH),
|
||||
.TAG_OUT_WIDTH (`L3DRAM_TAG_WIDTH),
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (1)
|
||||
) dram_arb (
|
||||
.NUM_REQS (`NUM_CLUSTERS),
|
||||
.DATA_WIDTH (`L3MEM_LINE_WIDTH),
|
||||
.TAG_IN_WIDTH (`L2MEM_TAG_WIDTH),
|
||||
.TAG_OUT_WIDTH (`L3MEM_TAG_WIDTH),
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (1)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Core request
|
||||
.req_valid_in (per_cluster_dram_req_valid),
|
||||
.req_rw_in (per_cluster_dram_req_rw),
|
||||
.req_byteen_in (per_cluster_dram_req_byteen),
|
||||
.req_addr_in (per_cluster_dram_req_addr),
|
||||
.req_data_in (per_cluster_dram_req_data),
|
||||
.req_tag_in (per_cluster_dram_req_tag),
|
||||
.req_ready_in (per_cluster_dram_req_ready),
|
||||
.req_valid_in (per_cluster_mem_req_valid),
|
||||
.req_rw_in (per_cluster_mem_req_rw),
|
||||
.req_byteen_in (per_cluster_mem_req_byteen),
|
||||
.req_addr_in (per_cluster_mem_req_addr),
|
||||
.req_data_in (per_cluster_mem_req_data),
|
||||
.req_tag_in (per_cluster_mem_req_tag),
|
||||
.req_ready_in (per_cluster_mem_req_ready),
|
||||
|
||||
// DRAM request
|
||||
.req_valid_out (dram_req_valid),
|
||||
.req_rw_out (dram_req_rw),
|
||||
.req_byteen_out (dram_req_byteen),
|
||||
.req_addr_out (dram_req_addr),
|
||||
.req_data_out (dram_req_data),
|
||||
.req_tag_out (dram_req_tag),
|
||||
.req_ready_out (dram_req_ready),
|
||||
// Memory request
|
||||
.req_valid_out (mem_req_valid),
|
||||
.req_rw_out (mem_req_rw),
|
||||
.req_byteen_out (mem_req_byteen),
|
||||
.req_addr_out (mem_req_addr),
|
||||
.req_data_out (mem_req_data),
|
||||
.req_tag_out (mem_req_tag),
|
||||
.req_ready_out (mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.rsp_valid_out (per_cluster_dram_rsp_valid),
|
||||
.rsp_data_out (per_cluster_dram_rsp_data),
|
||||
.rsp_tag_out (per_cluster_dram_rsp_tag),
|
||||
.rsp_ready_out (per_cluster_dram_rsp_ready),
|
||||
.rsp_valid_out (per_cluster_mem_rsp_valid),
|
||||
.rsp_data_out (per_cluster_mem_rsp_data),
|
||||
.rsp_tag_out (per_cluster_mem_rsp_tag),
|
||||
.rsp_ready_out (per_cluster_mem_rsp_ready),
|
||||
|
||||
// DRAM response
|
||||
.rsp_valid_in (dram_rsp_valid),
|
||||
.rsp_tag_in (dram_rsp_tag),
|
||||
.rsp_data_in (dram_rsp_data),
|
||||
.rsp_ready_in (dram_rsp_ready)
|
||||
// Memory response
|
||||
.rsp_valid_in (mem_rsp_valid),
|
||||
.rsp_tag_in (mem_rsp_tag),
|
||||
.rsp_data_in (mem_rsp_data),
|
||||
.rsp_ready_in (mem_rsp_ready)
|
||||
);
|
||||
|
||||
end
|
||||
|
||||
`SCOPE_ASSIGN (reset, reset);
|
||||
|
||||
`SCOPE_ASSIGN (dram_req_fire, dram_req_valid && dram_req_ready);
|
||||
`SCOPE_ASSIGN (dram_req_addr, `TO_FULL_ADDR(dram_req_addr));
|
||||
`SCOPE_ASSIGN (dram_req_rw, dram_req_rw);
|
||||
`SCOPE_ASSIGN (dram_req_byteen, dram_req_byteen);
|
||||
`SCOPE_ASSIGN (dram_req_data, dram_req_data);
|
||||
`SCOPE_ASSIGN (dram_req_tag, dram_req_tag);
|
||||
`SCOPE_ASSIGN (dram_rsp_fire, dram_rsp_valid && dram_rsp_ready);
|
||||
`SCOPE_ASSIGN (dram_rsp_data, dram_rsp_data);
|
||||
`SCOPE_ASSIGN (dram_rsp_tag, dram_rsp_tag);
|
||||
`SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready);
|
||||
`SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr));
|
||||
`SCOPE_ASSIGN (mem_req_rw, mem_req_rw);
|
||||
`SCOPE_ASSIGN (mem_req_byteen, mem_req_byteen);
|
||||
`SCOPE_ASSIGN (mem_req_data, mem_req_data);
|
||||
`SCOPE_ASSIGN (mem_req_tag, mem_req_tag);
|
||||
`SCOPE_ASSIGN (mem_rsp_fire, mem_rsp_valid && mem_rsp_ready);
|
||||
`SCOPE_ASSIGN (mem_rsp_data, mem_rsp_data);
|
||||
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
|
||||
`SCOPE_ASSIGN (busy, busy);
|
||||
|
||||
`ifdef DBG_PRINT_DRAM
|
||||
`ifdef DBG_PRINT_MEM
|
||||
always @(posedge clk) begin
|
||||
if (dram_req_valid && dram_req_ready) begin
|
||||
if (dram_req_rw)
|
||||
$display("%t: DRAM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_tag, dram_req_byteen, dram_req_data);
|
||||
if (mem_req_valid && mem_req_ready) begin
|
||||
if (mem_req_rw)
|
||||
$display("%t: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data);
|
||||
else
|
||||
$display("%t: DRAM Rd Req: addr=%0h, tag=%0h, byteen=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_tag, dram_req_byteen);
|
||||
$display("%t: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen);
|
||||
end
|
||||
if (dram_rsp_valid && dram_rsp_ready) begin
|
||||
$display("%t: DRAM Rsp: tag=%0h, data=%0h", $time, dram_rsp_tag, dram_rsp_data);
|
||||
if (mem_rsp_valid && mem_rsp_ready) begin
|
||||
$display("%t: MEM Rsp: tag=%0h, data=%0h", $time, mem_rsp_tag, mem_rsp_data);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -1,133 +1,166 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_avs_wrapper #(
|
||||
parameter AVS_DATAW = 1,
|
||||
parameter AVS_ADDRW = 1,
|
||||
parameter AVS_BURSTW = 1,
|
||||
parameter AVS_BANKS = 1,
|
||||
parameter REQ_TAGW = 1,
|
||||
parameter RD_QUEUE_SIZE = 1,
|
||||
parameter NUM_BANKS = 1,
|
||||
parameter AVS_DATA_WIDTH = 1,
|
||||
parameter AVS_ADDR_WIDTH = 1,
|
||||
parameter AVS_BURST_WIDTH = 1,
|
||||
parameter AVS_BANKS = 1,
|
||||
parameter REQ_TAG_WIDTH = 1,
|
||||
parameter RD_QUEUE_SIZE = 1,
|
||||
|
||||
parameter AVS_BYTEENW = (AVS_DATAW / 8),
|
||||
parameter RD_QUEUE_ADDRW= $clog2(RD_QUEUE_SIZE+1),
|
||||
parameter AVS_BANKS_BITS= $clog2(AVS_BANKS)
|
||||
parameter AVS_BYTEENW = (AVS_DATA_WIDTH / 8),
|
||||
parameter RD_QUEUE_ADDR_WIDTH = $clog2(RD_QUEUE_SIZE+1),
|
||||
parameter AVS_BANKS_BITS = $clog2(AVS_BANKS)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Memory request
|
||||
input wire mem_req_valid,
|
||||
input wire mem_req_rw,
|
||||
input wire [AVS_BYTEENW-1:0] mem_req_byteen,
|
||||
input wire [AVS_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
input wire [AVS_DATA_WIDTH-1:0] mem_req_data,
|
||||
input wire [REQ_TAG_WIDTH-1:0] mem_req_tag,
|
||||
output wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
output wire mem_rsp_valid,
|
||||
output wire [AVS_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
output wire [REQ_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
input wire mem_rsp_ready,
|
||||
|
||||
// AVS bus
|
||||
output wire [AVS_DATAW-1:0] avs_writedata,
|
||||
input wire [AVS_DATAW-1:0] avs_readdata,
|
||||
output wire [AVS_ADDRW-1:0] avs_address,
|
||||
input wire avs_waitrequest,
|
||||
output wire avs_write,
|
||||
output wire avs_read,
|
||||
output wire [AVS_BYTEENW-1:0] avs_byteenable,
|
||||
output wire [AVS_BURSTW-1:0] avs_burstcount,
|
||||
input avs_readdatavalid,
|
||||
output wire [AVS_BANKS_BITS-1:0] avs_bankselect,
|
||||
|
||||
// DRAM request
|
||||
input wire dram_req_valid,
|
||||
input wire dram_req_rw,
|
||||
input wire [AVS_BYTEENW-1:0] dram_req_byteen,
|
||||
input wire [AVS_ADDRW-1:0] dram_req_addr,
|
||||
input wire [AVS_DATAW-1:0] dram_req_data,
|
||||
input wire [REQ_TAGW-1:0] dram_req_tag,
|
||||
output wire dram_req_ready,
|
||||
|
||||
// DRAM response
|
||||
output wire dram_rsp_valid,
|
||||
output wire [AVS_DATAW-1:0] dram_rsp_data,
|
||||
output wire [REQ_TAGW-1:0] dram_rsp_tag,
|
||||
input wire dram_rsp_ready
|
||||
output wire [AVS_DATA_WIDTH-1:0] avs_writedata [NUM_BANKS],
|
||||
input wire [AVS_DATA_WIDTH-1:0] avs_readdata [NUM_BANKS],
|
||||
output wire [AVS_ADDR_WIDTH-1:0] avs_address [NUM_BANKS],
|
||||
input wire avs_waitrequest [NUM_BANKS],
|
||||
output wire avs_write [NUM_BANKS],
|
||||
output wire avs_read [NUM_BANKS],
|
||||
output wire [AVS_BYTEENW-1:0] avs_byteenable [NUM_BANKS],
|
||||
output wire [AVS_BURST_WIDTH-1:0] avs_burstcount [NUM_BANKS],
|
||||
input avs_readdatavalid [NUM_BANKS]
|
||||
);
|
||||
reg [AVS_BANKS_BITS-1:0] avs_bankselect_r;
|
||||
reg [AVS_BURSTW-1:0] avs_burstcount_r;
|
||||
|
||||
wire avs_reqq_push = dram_req_valid && dram_req_ready && !dram_req_rw;
|
||||
wire avs_reqq_pop = dram_rsp_valid && dram_rsp_ready;
|
||||
localparam BANK_ADDRW = `LOG2UP(NUM_BANKS);
|
||||
|
||||
wire avs_rspq_push = avs_readdatavalid;
|
||||
wire avs_rspq_pop = avs_reqq_pop;
|
||||
wire avs_rspq_empty;
|
||||
|
||||
wire rsp_queue_going_full;
|
||||
wire [RD_QUEUE_ADDRW-1:0] rsp_queue_size;
|
||||
VX_pending_size #(
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) pending_size (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_reqq_push),
|
||||
.pop (avs_rspq_pop),
|
||||
`UNUSED_PIN (empty),
|
||||
.full (rsp_queue_going_full),
|
||||
.size (rsp_queue_size)
|
||||
);
|
||||
`UNUSED_VAR (rsp_queue_size)
|
||||
|
||||
always @(posedge clk) begin
|
||||
avs_burstcount_r <= 1;
|
||||
avs_bankselect_r <= 0;
|
||||
end
|
||||
// Requests handling
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (REQ_TAGW),
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) rd_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_reqq_push),
|
||||
.pop (avs_reqq_pop),
|
||||
.data_in (dram_req_tag),
|
||||
.data_out (dram_rsp_tag),
|
||||
`UNUSED_PIN (empty),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (size)
|
||||
wire [NUM_BANKS-1:0] avs_reqq_push, avs_reqq_pop, avs_reqq_ready;
|
||||
wire [NUM_BANKS-1:0] req_queue_going_full;
|
||||
wire [NUM_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size;
|
||||
wire [NUM_BANKS-1:0][REQ_TAG_WIDTH-1:0] avs_reqq_data_out;
|
||||
|
||||
wire [BANK_ADDRW-1:0] req_bank_sel = (NUM_BANKS >= 2) ? mem_req_addr[BANK_ADDRW-1:0] : '0;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
assign avs_reqq_ready[i] = !req_queue_going_full[i] && !avs_waitrequest[i];
|
||||
assign avs_reqq_push[i] = mem_req_valid && !mem_req_rw && avs_reqq_ready[i] && (req_bank_sel == i);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
VX_pending_size #(
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) pending_size (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_reqq_push[i]),
|
||||
.pop (avs_reqq_pop[i]),
|
||||
.full (req_queue_going_full[i]),
|
||||
.size (req_queue_size[i]),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
`UNUSED_VAR (req_queue_size)
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (REQ_TAG_WIDTH),
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) rd_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_reqq_push[i]),
|
||||
.pop (avs_reqq_pop[i]),
|
||||
.data_in (mem_req_tag),
|
||||
.data_out (avs_reqq_data_out[i]),
|
||||
`UNUSED_PIN (empty),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
assign avs_read[i] = mem_req_valid && !mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
|
||||
assign avs_write[i] = mem_req_valid && mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
|
||||
assign avs_address[i] = mem_req_addr;
|
||||
assign avs_byteenable[i] = mem_req_byteen;
|
||||
assign avs_writedata[i] = mem_req_data;
|
||||
assign avs_burstcount[i] = AVS_BURST_WIDTH'(1);
|
||||
end
|
||||
|
||||
assign mem_req_ready = avs_reqq_ready[req_bank_sel];
|
||||
|
||||
// Responses handling
|
||||
|
||||
wire [NUM_BANKS-1:0] rsp_arb_valid_in;
|
||||
wire [NUM_BANKS-1:0][AVS_DATA_WIDTH+REQ_TAG_WIDTH-1:0] rsp_arb_data_in;
|
||||
wire [NUM_BANKS-1:0] rsp_arb_ready_in;
|
||||
|
||||
wire [NUM_BANKS-1:0][AVS_DATA_WIDTH-1:0] avs_rspq_data_out;
|
||||
wire [NUM_BANKS-1:0] avs_rspq_empty;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
VX_fifo_queue #(
|
||||
.DATAW (AVS_DATA_WIDTH),
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) rd_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_readdatavalid[i]),
|
||||
.pop (avs_reqq_pop[i]),
|
||||
.data_in (avs_readdata[i]),
|
||||
.data_out (avs_rspq_data_out[i]),
|
||||
.empty (avs_rspq_empty[i]),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
assign rsp_arb_valid_in[i] = !avs_rspq_empty[i];
|
||||
assign rsp_arb_data_in[i] = {avs_rspq_data_out[i], avs_reqq_data_out[i]};
|
||||
assign avs_reqq_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i];
|
||||
end
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_BANKS),
|
||||
.DATAW (AVS_DATA_WIDTH + REQ_TAG_WIDTH),
|
||||
.BUFFERED (NUM_BANKS > 2)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_arb_valid_in),
|
||||
.data_in (rsp_arb_data_in),
|
||||
.ready_in (rsp_arb_ready_in),
|
||||
.valid_out (mem_rsp_valid),
|
||||
.data_out ({mem_rsp_data, mem_rsp_tag}),
|
||||
.ready_out (mem_rsp_ready)
|
||||
);
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (AVS_DATAW),
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) rd_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_rspq_push),
|
||||
.pop (avs_rspq_pop),
|
||||
.data_in (avs_readdata),
|
||||
.data_out (dram_rsp_data),
|
||||
.empty (avs_rspq_empty),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
assign avs_read = dram_req_valid && !dram_req_rw && !rsp_queue_going_full;
|
||||
assign avs_write = dram_req_valid && dram_req_rw && !rsp_queue_going_full;
|
||||
assign avs_address = dram_req_addr;
|
||||
assign avs_byteenable = dram_req_byteen;
|
||||
assign avs_writedata = dram_req_data;
|
||||
assign avs_burstcount = avs_burstcount_r;
|
||||
assign avs_bankselect = avs_bankselect_r;
|
||||
|
||||
assign dram_req_ready = !avs_waitrequest && !rsp_queue_going_full;
|
||||
|
||||
assign dram_rsp_valid = !avs_rspq_empty;
|
||||
|
||||
`ifdef DBG_PRINT_AVS
|
||||
always @(posedge clk) begin
|
||||
if (dram_req_valid && dram_req_ready) begin
|
||||
if (dram_req_rw)
|
||||
$display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, dram_req_data);
|
||||
if (mem_req_valid && mem_req_ready) begin
|
||||
if (mem_req_rw)
|
||||
$display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, mem_req_data);
|
||||
else
|
||||
$display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, rsp_queue_size);
|
||||
$display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, req_queue_size);
|
||||
end
|
||||
if (dram_rsp_valid && dram_rsp_ready) begin
|
||||
$display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, dram_rsp_tag, dram_rsp_data, rsp_queue_size);
|
||||
if (mem_rsp_valid && mem_rsp_ready) begin
|
||||
$display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, mem_rsp_tag, mem_rsp_data, req_queue_size);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
178
hw/rtl/afu/VX_to_mem.v
Normal file
178
hw/rtl/afu/VX_to_mem.v
Normal file
|
@ -0,0 +1,178 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Memory-bus data-width adapter.
//
// Bridges a valid/ready memory request/response interface whose data width is
// SRC_DATA_WIDTH ("_in" request side, "_out" response side) to one whose data
// width is DST_DATA_WIDTH. Three cases are generated from the log2 data widths:
//
//   * DST wider than SRC : purely combinational. The low D source-address bits
//     select the sub-word inside the destination word; they are appended to
//     the request tag and used to pick the sub-word out of the response.
//   * DST narrower than SRC : each source request is issued as P destination
//     requests (req_ctr supplies the low address bits) and P destination
//     responses are gathered into one source response.
//   * Equal widths : pass-through, with only address/tag resizing.
//
// D is the absolute difference of the log2 data widths and P = 2**D.
module VX_to_mem #(
    parameter SRC_DATA_WIDTH = 1,
    parameter SRC_ADDR_WIDTH = 1,
    parameter DST_DATA_WIDTH = 1,
    parameter DST_ADDR_WIDTH = 1,
    parameter SRC_TAG_WIDTH  = 1,
    parameter DST_TAG_WIDTH  = 1,
    parameter SRC_DATA_SIZE  = (SRC_DATA_WIDTH / 8),
    parameter DST_DATA_SIZE  = (DST_DATA_WIDTH / 8)
) (
    input wire                          clk,
    input wire                          reset,

    // Source-side request
    input wire                          mem_req_valid_in,
    input wire [SRC_ADDR_WIDTH-1:0]     mem_req_addr_in,
    input wire                          mem_req_rw_in,
    input wire [SRC_DATA_SIZE-1:0]      mem_req_byteen_in,
    input wire [SRC_DATA_WIDTH-1:0]     mem_req_data_in,
    input wire [SRC_TAG_WIDTH-1:0]      mem_req_tag_in,
    output wire                         mem_req_ready_in,

    // Destination-side request
    output wire                         mem_req_valid_out,
    output wire [DST_ADDR_WIDTH-1:0]    mem_req_addr_out,
    output wire                         mem_req_rw_out,
    output wire [DST_DATA_SIZE-1:0]     mem_req_byteen_out,
    output wire [DST_DATA_WIDTH-1:0]    mem_req_data_out,
    output wire [DST_TAG_WIDTH-1:0]     mem_req_tag_out,
    input wire                          mem_req_ready_out,

    // Destination-side response
    input wire                          mem_rsp_valid_in,
    input wire [DST_DATA_WIDTH-1:0]     mem_rsp_data_in,
    input wire [DST_TAG_WIDTH-1:0]      mem_rsp_tag_in,
    output wire                         mem_rsp_ready_in,

    // Source-side response
    output wire                         mem_rsp_valid_out,
    output wire [SRC_DATA_WIDTH-1:0]    mem_rsp_data_out,
    output wire [SRC_TAG_WIDTH-1:0]     mem_rsp_tag_out,
    input wire                          mem_rsp_ready_out
);
    `STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))

    localparam DST_LDATAW = $clog2(DST_DATA_WIDTH);
    localparam SRC_LDATAW = $clog2(SRC_DATA_WIDTH);
    localparam D = `ABS(DST_LDATAW - SRC_LDATAW);   // log2 of the width ratio
    localparam P = 2**D;                            // narrow words per wide word

    `UNUSED_VAR (mem_rsp_tag_in)

    if (DST_LDATAW > SRC_LDATAW) begin

        // ---- Destination is wider: place the source word in the right lane ----

        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)

        // Lane of the source word inside the destination word (request side
        // from the address, response side recovered from the tag).
        wire [D-1:0] req_idx = mem_req_addr_in[D-1:0];
        wire [D-1:0] rsp_idx = mem_rsp_tag_in[D-1:0];

        // Source address with the lane bits dropped.
        wire [SRC_ADDR_WIDTH-D-1:0] mem_req_addr_in_qual = mem_req_addr_in[SRC_ADDR_WIDTH-1:D];

        // Response data viewed as P source-width lanes.
        wire [P-1:0][SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w = mem_rsp_data_in;

        if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH - D)) begin
            `UNUSED_VAR (mem_req_addr_in_qual)
            assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
        end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH - D)) begin
            assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
        end else begin
            assign mem_req_addr_out = mem_req_addr_in_qual;
        end

        assign mem_req_valid_out  = mem_req_valid_in;
        assign mem_req_rw_out     = mem_req_rw_in;
        // Shift byte enables / data by req_idx source-sized lanes.
        assign mem_req_byteen_out = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
        assign mem_req_data_out   = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW);
        // NOTE(review): the tag carries SRC_TAG_WIDTH + D bits here, while the
        // static assert above only requires DST_TAG_WIDTH >= SRC_TAG_WIDTH; the
        // size cast drops upper tag bits when DST_TAG_WIDTH < SRC_TAG_WIDTH + D.
        // Confirm every instantiation of this branch provides the extra D bits.
        assign mem_req_tag_out    = DST_TAG_WIDTH'({mem_req_tag_in, req_idx});
        assign mem_req_ready_in   = mem_req_ready_out;

        assign mem_rsp_valid_out  = mem_rsp_valid_in;
        assign mem_rsp_data_out   = mem_rsp_data_in_w[rsp_idx];
        assign mem_rsp_tag_out    = SRC_TAG_WIDTH'(mem_rsp_tag_in[SRC_TAG_WIDTH+D-1:D]);
        assign mem_rsp_ready_in   = mem_rsp_ready_out;

    end else if (DST_LDATAW < SRC_LDATAW) begin

        // ---- Destination is narrower: split requests, gather responses ----

        // Beat counters; D bits wide, so they wrap after P beats.
        reg [D-1:0] req_ctr, rsp_ctr;

        // Response beats gathered so far (_r) and including the current beat (_n).
        reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n;

        wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out;
        wire mem_rsp_in_fire  = mem_rsp_valid_in && mem_rsp_ready_in;

        // Source data / byte enables viewed as P destination-width beats.
        wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in;
        wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in;

        always @(*) begin
            mem_rsp_data_out_n = mem_rsp_data_out_r;
            mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_in;
        end

        always @(posedge clk) begin
            if (reset) begin
                req_ctr <= 0;
                rsp_ctr <= 0;
            end else begin
                if (mem_req_out_fire) begin
                    req_ctr <= req_ctr + 1;
                end
                if (mem_rsp_in_fire) begin
                    rsp_ctr <= rsp_ctr + 1;
                    mem_rsp_data_out_r <= mem_rsp_data_out_n;
                end
            end
        end

        // Tag of the most recently accepted response beat; later beats of the
        // same gathered response are expected to carry the same tag.
        reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r;
        wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_w;

        always @(posedge clk) begin
            if (mem_rsp_in_fire) begin
                mem_rsp_tag_in_r <= mem_rsp_tag_in;
            end
        end
        assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in;
        // Only check the tag on an accepted response beat: mem_rsp_tag_in is
        // meaningless while mem_rsp_valid_in is low, and an unqualified compare
        // could fail on an idle bus in the middle of a gathered response
        // (assuming RUNTIME_ASSERT is evaluated on every clock edge).
        `RUNTIME_ASSERT((!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in)), ("oops!"))

        // Destination address = source address extended with the beat index.
        wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};

        if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin
            `UNUSED_VAR (mem_req_addr_in_qual)
            assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
        end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH + D)) begin
            assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
        end else begin
            assign mem_req_addr_out = mem_req_addr_in_qual;
        end

        assign mem_req_valid_out  = mem_req_valid_in;
        assign mem_req_rw_out     = mem_req_rw_in;
        assign mem_req_byteen_out = mem_req_byteen_in_w[req_ctr];
        assign mem_req_data_out   = mem_req_data_in_w[req_ctr];
        assign mem_req_tag_out    = DST_TAG_WIDTH'(mem_req_tag_in);
        // The source request is consumed only when its last beat is accepted.
        assign mem_req_ready_in   = mem_req_ready_out && (req_ctr == (P-1));

        // The source response is presented on the last beat, using the
        // combinational view that already includes that beat's data.
        assign mem_rsp_valid_out  = mem_rsp_valid_in && (rsp_ctr == (P-1));
        assign mem_rsp_data_out   = mem_rsp_data_out_n;
        assign mem_rsp_tag_out    = SRC_TAG_WIDTH'(mem_rsp_tag_in);
        assign mem_rsp_ready_in   = mem_rsp_ready_out;

    end else begin

        // ---- Equal data widths: pass-through with address/tag resizing ----

        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)

        if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin
            `UNUSED_VAR (mem_req_addr_in)
            assign mem_req_addr_out = mem_req_addr_in[DST_ADDR_WIDTH-1:0];
        end else if (DST_ADDR_WIDTH > SRC_ADDR_WIDTH) begin
            assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in);
        end else begin
            assign mem_req_addr_out = mem_req_addr_in;
        end

        assign mem_req_valid_out  = mem_req_valid_in;
        assign mem_req_rw_out     = mem_req_rw_in;
        assign mem_req_byteen_out = mem_req_byteen_in;
        assign mem_req_data_out   = mem_req_data_in;
        assign mem_req_tag_out    = DST_TAG_WIDTH'(mem_req_tag_in);
        assign mem_req_ready_in   = mem_req_ready_out;

        assign mem_rsp_valid_out  = mem_rsp_valid_in;
        assign mem_rsp_data_out   = mem_rsp_data_in;
        assign mem_rsp_tag_out    = SRC_TAG_WIDTH'(mem_rsp_tag_in);
        assign mem_rsp_ready_in   = mem_rsp_ready_out;

    end

endmodule
|
|
@ -77,30 +77,28 @@ module ccip_std_afu #(
|
|||
// User AFU goes here
|
||||
// ====================================================================
|
||||
|
||||
//
|
||||
// vortex_afu depends on CCI-P and local memory being in the same
|
||||
// clock domain. This is accomplished by choosing a common clock
|
||||
// in the AFU's JSON description. The platform instantiates clock-
|
||||
// crossing shims automatically, as needed.
|
||||
//
|
||||
t_local_mem_byte_mask avs_byteenable [NUM_LOCAL_MEM_BANKS];
|
||||
logic avs_waitrequest [NUM_LOCAL_MEM_BANKS];
|
||||
t_local_mem_data avs_readdata [NUM_LOCAL_MEM_BANKS];
|
||||
logic avs_readdatavalid [NUM_LOCAL_MEM_BANKS];
|
||||
t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS];
|
||||
t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS];
|
||||
t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS];
|
||||
logic avs_write [NUM_LOCAL_MEM_BANKS];
|
||||
logic avs_read [NUM_LOCAL_MEM_BANKS];
|
||||
|
||||
//
|
||||
// Memory banks are used very simply here. Only one bank is active at
|
||||
// a time, selected by mem_bank_select. mem_bank_select is set
|
||||
// by a CSR from the host.
|
||||
//
|
||||
t_local_mem_byte_mask avs_byteenable;
|
||||
logic avs_waitrequest;
|
||||
t_local_mem_data avs_readdata;
|
||||
logic avs_readdatavalid;
|
||||
t_local_mem_burst_cnt avs_burstcount;
|
||||
t_local_mem_data avs_writedata;
|
||||
t_local_mem_addr avs_address;
|
||||
logic avs_write;
|
||||
logic avs_read;
|
||||
|
||||
// choose which memory bank to test
|
||||
logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select;
|
||||
for (genvar b = 0; b < NUM_LOCAL_MEM_BANKS; b++) begin
|
||||
assign local_mem[b].burstcount = avs_burstcount[b];
|
||||
assign local_mem[b].writedata = avs_writedata[b];
|
||||
assign local_mem[b].address = avs_address[b];
|
||||
assign local_mem[b].byteenable = avs_byteenable[b];
|
||||
assign local_mem[b].write = avs_write[b];
|
||||
assign local_mem[b].read = avs_read[b];
|
||||
|
||||
assign avs_waitrequest[b] = local_mem[b].waitrequest;
|
||||
assign avs_readdata[b] = local_mem[b].readdata;
|
||||
assign avs_readdatavalid[b] = local_mem[b].readdatavalid;
|
||||
end
|
||||
|
||||
vortex_afu #(
|
||||
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
|
||||
|
@ -108,6 +106,9 @@ module ccip_std_afu #(
|
|||
.clk (clk),
|
||||
.reset (reset_T1),
|
||||
|
||||
.cp2af_sRxPort (cp2af_sRx_T1),
|
||||
.af2cp_sTxPort (af2cp_sTx_T0),
|
||||
|
||||
.avs_writedata (avs_writedata),
|
||||
.avs_readdata (avs_readdata),
|
||||
.avs_address (avs_address),
|
||||
|
@ -116,52 +117,7 @@ module ccip_std_afu #(
|
|||
.avs_read (avs_read),
|
||||
.avs_byteenable (avs_byteenable),
|
||||
.avs_burstcount (avs_burstcount),
|
||||
.avs_readdatavalid (avs_readdatavalid),
|
||||
.mem_bank_select (mem_bank_select),
|
||||
|
||||
.cp2af_sRxPort (cp2af_sRx_T1),
|
||||
.af2cp_sTxPort (af2cp_sTx_T0)
|
||||
);
|
||||
|
||||
//
|
||||
// Export the local memory interface signals as vectors so that bank
|
||||
// selection can use array syntax.
|
||||
//
|
||||
logic avs_waitrequest_v[NUM_LOCAL_MEM_BANKS];
|
||||
t_local_mem_data avs_readdata_v[NUM_LOCAL_MEM_BANKS];
|
||||
logic avs_readdatavalid_v[NUM_LOCAL_MEM_BANKS];
|
||||
|
||||
genvar b;
|
||||
generate
|
||||
for (b = 0; b < NUM_LOCAL_MEM_BANKS; b = b + 1)
|
||||
begin : lmb
|
||||
always_comb
|
||||
begin
|
||||
// Local memory to AFU signals
|
||||
avs_waitrequest_v[b] = local_mem[b].waitrequest;
|
||||
avs_readdata_v[b] = local_mem[b].readdata;
|
||||
avs_readdatavalid_v[b] = local_mem[b].readdatavalid;
|
||||
|
||||
// Replicate address and write data to all banks. Only
|
||||
// the request signals have to be bank-specific.
|
||||
local_mem[b].burstcount = avs_burstcount;
|
||||
local_mem[b].writedata = avs_writedata;
|
||||
local_mem[b].address = avs_address;
|
||||
local_mem[b].byteenable = avs_byteenable;
|
||||
|
||||
// Request a write to this bank?
|
||||
local_mem[b].write = avs_write &&
|
||||
($bits(mem_bank_select)'(b) == mem_bank_select);
|
||||
|
||||
// Request a read from this bank?
|
||||
local_mem[b].read = avs_read &&
|
||||
($bits(mem_bank_select)'(b) == mem_bank_select);
|
||||
end
|
||||
end
|
||||
endgenerate
|
||||
|
||||
assign avs_waitrequest = avs_waitrequest_v[mem_bank_select];
|
||||
assign avs_readdata = avs_readdata_v[mem_bank_select];
|
||||
assign avs_readdatavalid = avs_readdatavalid_v[mem_bank_select];
|
||||
.avs_readdatavalid (avs_readdatavalid)
|
||||
);
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -1,13 +1,18 @@
|
|||
`include "VX_define.vh"
|
||||
`ifndef NOPAE
|
||||
`include "afu_json_info.vh"
|
||||
`else
|
||||
`include "VX_platform.vh"
|
||||
`ifdef NOPAE
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
`include "vortex_afu.vh"
|
||||
`IGNORE_WARNINGS_END
|
||||
`else
|
||||
`include "afu_json_info.vh"
|
||||
`endif
|
||||
|
||||
/* verilator lint_off IMPORTSTAR */
|
||||
import ccip_if_pkg::*;
|
||||
import local_mem_cfg_pkg::*;
|
||||
/* verilator lint_on IMPORTSTAR */
|
||||
/* verilator lint_on IMPORTSTAR */
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module vortex_afu #(
|
||||
parameter NUM_LOCAL_MEM_BANKS = 2
|
||||
|
@ -21,30 +26,32 @@ module vortex_afu #(
|
|||
output t_if_ccip_Tx af2cp_sTxPort,
|
||||
|
||||
// Avalon signals for local memory access
|
||||
output t_local_mem_data avs_writedata,
|
||||
input t_local_mem_data avs_readdata,
|
||||
output t_local_mem_addr avs_address,
|
||||
input logic avs_waitrequest,
|
||||
output logic avs_write,
|
||||
output logic avs_read,
|
||||
output t_local_mem_byte_mask avs_byteenable,
|
||||
output t_local_mem_burst_cnt avs_burstcount,
|
||||
input avs_readdatavalid,
|
||||
|
||||
output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
|
||||
output t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS],
|
||||
input t_local_mem_data avs_readdata [NUM_LOCAL_MEM_BANKS],
|
||||
output t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS],
|
||||
input logic avs_waitrequest [NUM_LOCAL_MEM_BANKS],
|
||||
output logic avs_write [NUM_LOCAL_MEM_BANKS],
|
||||
output logic avs_read [NUM_LOCAL_MEM_BANKS],
|
||||
output t_local_mem_byte_mask avs_byteenable [NUM_LOCAL_MEM_BANKS],
|
||||
output t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS],
|
||||
input avs_readdatavalid [NUM_LOCAL_MEM_BANKS]
|
||||
);
|
||||
|
||||
localparam RESET_DELAY = 3;
|
||||
|
||||
localparam DRAM_ADDR_WIDTH = $bits(t_local_mem_addr);
|
||||
localparam DRAM_LINE_WIDTH = $bits(t_local_mem_data);
|
||||
localparam DRAM_LINE_LW = $clog2(DRAM_LINE_WIDTH);
|
||||
localparam LMEM_LINE_WIDTH = $bits(t_local_mem_data);
|
||||
localparam LMEM_ADDR_WIDTH = $bits(t_local_mem_addr);
|
||||
localparam LMEM_BURST_CTRW = $bits(t_local_mem_burst_cnt);
|
||||
|
||||
localparam VX_DRAM_LINE_LW = $clog2(`VX_DRAM_LINE_WIDTH);
|
||||
localparam VX_DRAM_LINE_IDX = (DRAM_LINE_LW - VX_DRAM_LINE_LW);
|
||||
localparam CCI_LINE_WIDTH = $bits(t_ccip_clData);
|
||||
localparam CCI_LINE_SIZE = CCI_LINE_WIDTH / 8;
|
||||
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_LINE_WIDTH / 8);
|
||||
|
||||
localparam AVS_RD_QUEUE_SIZE = 16;
|
||||
localparam AVS_REQ_TAGW = `VX_DRAM_TAG_WIDTH + VX_DRAM_LINE_IDX;
|
||||
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, `VX_MEM_TAG_WIDTH + $clog2(LMEM_LINE_WIDTH) - $clog2(`VX_MEM_LINE_WIDTH));
|
||||
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LMEM_LINE_WIDTH) - $clog2(CCI_LINE_WIDTH));
|
||||
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI);
|
||||
|
||||
|
||||
localparam CCI_RD_WINDOW_SIZE = 8;
|
||||
localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE;
|
||||
|
@ -74,7 +81,7 @@ localparam MMIO_CSR_DATA = `AFU_IMAGE_MMIO_CSR_DATA;
|
|||
localparam MMIO_CSR_READ = `AFU_IMAGE_MMIO_CSR_READ;
|
||||
|
||||
localparam CCI_RD_RQ_TAGW = $clog2(CCI_RD_WINDOW_SIZE);
|
||||
localparam CCI_RD_RQ_DATAW = $bits(t_ccip_clData) + CCI_RD_RQ_TAGW;
|
||||
localparam CCI_RD_RQ_DATAW = CCI_LINE_WIDTH + CCI_RD_RQ_TAGW;
|
||||
|
||||
localparam STATE_IDLE = 0;
|
||||
localparam STATE_READ = 1;
|
||||
|
@ -96,18 +103,18 @@ reg [STATE_WIDTH-1:0] state;
|
|||
|
||||
// Vortex ports ///////////////////////////////////////////////////////////////
|
||||
|
||||
wire vx_dram_req_valid;
|
||||
wire vx_dram_req_rw;
|
||||
wire [`VX_DRAM_BYTEEN_WIDTH-1:0] vx_dram_req_byteen;
|
||||
wire [`VX_DRAM_ADDR_WIDTH-1:0] vx_dram_req_addr;
|
||||
wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_req_data;
|
||||
wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_req_tag;
|
||||
wire vx_dram_req_ready;
|
||||
wire vx_mem_req_valid;
|
||||
wire vx_mem_req_rw;
|
||||
wire [`VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen;
|
||||
wire [`VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr;
|
||||
wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_req_data;
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag;
|
||||
wire vx_mem_req_ready;
|
||||
|
||||
wire vx_dram_rsp_valid;
|
||||
wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data;
|
||||
wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag;
|
||||
wire vx_dram_rsp_ready;
|
||||
wire vx_mem_rsp_valid;
|
||||
wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_rsp_data;
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag;
|
||||
wire vx_mem_rsp_ready;
|
||||
|
||||
wire vx_csr_io_req_valid;
|
||||
wire [`VX_CSR_ID_WIDTH-1:0] vx_csr_io_req_coreid;
|
||||
|
@ -123,13 +130,13 @@ wire vx_csr_io_rsp_ready;
|
|||
wire vx_busy;
|
||||
|
||||
reg vx_reset;
|
||||
reg vx_dram_en;
|
||||
reg vx_mem_en;
|
||||
|
||||
// CMD variables //////////////////////////////////////////////////////////////
|
||||
|
||||
t_ccip_clAddr cmd_io_addr;
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cmd_mem_addr;
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cmd_data_size;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cmd_mem_addr;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cmd_data_size;
|
||||
|
||||
`ifdef SCOPE
|
||||
wire [63:0] cmd_scope_rdata;
|
||||
|
@ -216,9 +223,9 @@ always @(posedge clk) begin
|
|||
`endif
|
||||
end
|
||||
MMIO_MEM_ADDR: begin
|
||||
cmd_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data);
|
||||
cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data);
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, t_local_mem_addr'(cp2af_sRxPort.c0.data));
|
||||
$display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data));
|
||||
`endif
|
||||
end
|
||||
MMIO_DATA_SIZE: begin
|
||||
|
@ -335,7 +342,7 @@ always @(posedge clk) begin
|
|||
if (reset) begin
|
||||
state <= STATE_IDLE;
|
||||
vx_reset <= 0;
|
||||
vx_dram_en <= 0;
|
||||
vx_mem_en <= 0;
|
||||
end else begin
|
||||
case (state)
|
||||
STATE_IDLE: begin
|
||||
|
@ -399,14 +406,14 @@ always @(posedge clk) begin
|
|||
// vortex reset cycles
|
||||
if (vx_reset_ctr == $bits(vx_reset_ctr)'(RESET_DELAY)) begin
|
||||
vx_reset <= 0;
|
||||
vx_dram_en <= 1;
|
||||
vx_mem_en <= 1;
|
||||
state <= STATE_RUN;
|
||||
end
|
||||
end
|
||||
|
||||
STATE_RUN: begin
|
||||
if (cmd_run_done) begin
|
||||
vx_dram_en <= 0;
|
||||
vx_mem_en <= 0;
|
||||
state <= STATE_IDLE;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: STATE IDLE", $time);
|
||||
|
@ -442,187 +449,251 @@ end
|
|||
|
||||
// AVS Controller /////////////////////////////////////////////////////////////
|
||||
|
||||
wire dram_req_valid;
|
||||
wire dram_req_rw;
|
||||
t_local_mem_byte_mask dram_req_byteen;
|
||||
t_local_mem_addr dram_req_addr;
|
||||
t_local_mem_data dram_req_data;
|
||||
wire [AVS_REQ_TAGW:0] dram_req_tag;
|
||||
wire dram_req_ready;
|
||||
|
||||
wire dram_rsp_valid;
|
||||
t_local_mem_data dram_rsp_data;
|
||||
wire [AVS_REQ_TAGW:0] dram_rsp_tag;
|
||||
wire dram_rsp_ready;
|
||||
|
||||
wire cci_dram_req_valid;
|
||||
wire cci_dram_req_rw;
|
||||
t_local_mem_byte_mask cci_dram_req_byteen;
|
||||
t_local_mem_addr cci_dram_req_addr;
|
||||
t_local_mem_data cci_dram_req_data;
|
||||
wire [AVS_REQ_TAGW-1:0] cci_dram_req_tag;
|
||||
wire cci_dram_req_ready;
|
||||
|
||||
wire cci_dram_rsp_valid;
|
||||
t_local_mem_data cci_dram_rsp_data;
|
||||
wire [AVS_REQ_TAGW-1:0] cci_dram_rsp_tag;
|
||||
wire cci_dram_rsp_ready;
|
||||
|
||||
wire vx_dram_req_valid_qual;
|
||||
t_local_mem_addr vx_dram_req_addr_qual;
|
||||
t_local_mem_byte_mask vx_dram_req_byteen_qual;
|
||||
t_local_mem_data vx_dram_req_data_qual;
|
||||
wire [AVS_REQ_TAGW-1:0] vx_dram_req_tag_qual;
|
||||
|
||||
wire [(1 << VX_DRAM_LINE_IDX)-1:0][`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data_unqual;
|
||||
wire [AVS_REQ_TAGW-1:0] vx_dram_rsp_tag_unqual;
|
||||
|
||||
wire cci_dram_rd_req_valid, cci_dram_wr_req_valid;
|
||||
wire [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr, cci_dram_wr_req_addr;
|
||||
wire cci_mem_rd_req_valid;
|
||||
wire cci_mem_wr_req_valid;
|
||||
wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_dout;
|
||||
|
||||
//--
|
||||
wire cci_mem_req_valid;
|
||||
wire cci_mem_req_rw;
|
||||
wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_addr;
|
||||
wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_tag;
|
||||
wire cci_mem_req_ready;
|
||||
|
||||
assign cci_dram_req_valid = (CMD_MEM_WRITE == state) ? cci_dram_wr_req_valid : cci_dram_rd_req_valid;
|
||||
assign cci_dram_req_addr = (CMD_MEM_WRITE == state) ? cci_dram_wr_req_addr : cci_dram_rd_req_addr;
|
||||
assign cci_dram_req_rw = (CMD_MEM_WRITE == state);
|
||||
assign cci_dram_req_byteen = {64{1'b1}};
|
||||
assign cci_dram_req_data = cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW];
|
||||
assign cci_dram_req_tag = AVS_REQ_TAGW'(0);
|
||||
|
||||
`UNUSED_VAR (cci_dram_rsp_tag)
|
||||
wire cci_mem_rsp_valid;
|
||||
wire [CCI_LINE_WIDTH-1:0] cci_mem_rsp_data;
|
||||
wire [CCI_ADDR_WIDTH-1:0] cci_mem_rsp_tag;
|
||||
wire cci_mem_rsp_ready;
|
||||
|
||||
//--
|
||||
|
||||
assign vx_dram_req_valid_qual = vx_dram_req_valid && vx_dram_en;
|
||||
wire cci_mem_req_arb_valid;
|
||||
wire cci_mem_req_arb_rw;
|
||||
t_local_mem_byte_mask cci_mem_req_arb_byteen;
|
||||
t_local_mem_addr cci_mem_req_arb_addr;
|
||||
t_local_mem_data cci_mem_req_arb_data;
|
||||
wire [AVS_REQ_TAGW-1:0] cci_mem_req_arb_tag;
|
||||
wire cci_mem_req_arb_ready;
|
||||
|
||||
assign vx_dram_req_addr_qual = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH];
|
||||
wire cci_mem_rsp_arb_valid;
|
||||
t_local_mem_data cci_mem_rsp_arb_data;
|
||||
wire [AVS_REQ_TAGW-1:0] cci_mem_rsp_arb_tag;
|
||||
wire cci_mem_rsp_arb_ready;
|
||||
|
||||
if (`VX_DRAM_LINE_WIDTH != DRAM_LINE_WIDTH) begin
|
||||
wire [VX_DRAM_LINE_IDX-1:0] vx_dram_req_idx = vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0];
|
||||
wire [VX_DRAM_LINE_IDX-1:0] vx_dram_rsp_idx = vx_dram_rsp_tag_unqual[VX_DRAM_LINE_IDX-1:0];
|
||||
assign vx_dram_req_byteen_qual = 64'(vx_dram_req_byteen) << (6'(vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0]) << (VX_DRAM_LINE_LW-3));
|
||||
assign vx_dram_req_data_qual = DRAM_LINE_WIDTH'(vx_dram_req_data) << ((DRAM_LINE_LW'(vx_dram_req_idx)) << VX_DRAM_LINE_LW);
|
||||
assign vx_dram_req_tag_qual = {vx_dram_req_tag, vx_dram_req_idx};
|
||||
assign vx_dram_rsp_data = vx_dram_rsp_data_unqual[vx_dram_rsp_idx];
|
||||
end else begin
|
||||
assign vx_dram_req_byteen_qual = vx_dram_req_byteen;
|
||||
assign vx_dram_req_tag_qual = vx_dram_req_tag;
|
||||
assign vx_dram_req_data_qual = vx_dram_req_data;
|
||||
assign vx_dram_rsp_data = vx_dram_rsp_data_unqual;
|
||||
end
|
||||
VX_to_mem #(
|
||||
.SRC_DATA_WIDTH (CCI_LINE_WIDTH),
|
||||
.DST_DATA_WIDTH (LMEM_LINE_WIDTH),
|
||||
.SRC_ADDR_WIDTH (CCI_ADDR_WIDTH),
|
||||
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.SRC_TAG_WIDTH (CCI_ADDR_WIDTH),
|
||||
.DST_TAG_WIDTH (AVS_REQ_TAGW)
|
||||
) cci_to_mem (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
assign vx_dram_rsp_tag = vx_dram_rsp_tag_unqual[`VX_DRAM_TAG_WIDTH+VX_DRAM_LINE_IDX-1:VX_DRAM_LINE_IDX];
|
||||
.mem_req_valid_in (cci_mem_req_valid),
|
||||
.mem_req_addr_in (cci_mem_req_addr),
|
||||
.mem_req_rw_in (cci_mem_req_rw),
|
||||
.mem_req_byteen_in ({CCI_LINE_SIZE{1'b1}}),
|
||||
.mem_req_data_in (cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW]),
|
||||
.mem_req_tag_in (cci_mem_req_tag),
|
||||
.mem_req_ready_in (cci_mem_req_ready),
|
||||
|
||||
.mem_req_valid_out (cci_mem_req_arb_valid),
|
||||
.mem_req_addr_out (cci_mem_req_arb_addr),
|
||||
.mem_req_rw_out (cci_mem_req_arb_rw),
|
||||
.mem_req_byteen_out (cci_mem_req_arb_byteen),
|
||||
.mem_req_data_out (cci_mem_req_arb_data),
|
||||
.mem_req_tag_out (cci_mem_req_arb_tag),
|
||||
.mem_req_ready_out (cci_mem_req_arb_ready),
|
||||
|
||||
.mem_rsp_valid_in (cci_mem_rsp_arb_valid),
|
||||
.mem_rsp_data_in (cci_mem_rsp_arb_data),
|
||||
.mem_rsp_tag_in (cci_mem_rsp_arb_tag),
|
||||
.mem_rsp_ready_in (cci_mem_rsp_arb_ready),
|
||||
|
||||
.mem_rsp_valid_out (cci_mem_rsp_valid),
|
||||
.mem_rsp_data_out (cci_mem_rsp_data),
|
||||
.mem_rsp_tag_out (cci_mem_rsp_tag),
|
||||
.mem_rsp_ready_out (cci_mem_rsp_ready)
|
||||
);
|
||||
|
||||
//--
|
||||
|
||||
wire vx_mem_req_arb_valid;
|
||||
wire vx_mem_req_arb_rw;
|
||||
t_local_mem_byte_mask vx_mem_req_arb_byteen;
|
||||
t_local_mem_addr vx_mem_req_arb_addr;
|
||||
t_local_mem_data vx_mem_req_arb_data;
|
||||
wire [AVS_REQ_TAGW-1:0] vx_mem_req_arb_tag;
|
||||
wire vx_mem_req_arb_ready;
|
||||
|
||||
wire vx_mem_rsp_arb_valid;
|
||||
t_local_mem_data vx_mem_rsp_arb_data;
|
||||
wire [AVS_REQ_TAGW-1:0] vx_mem_rsp_arb_tag;
|
||||
wire vx_mem_rsp_arb_ready;
|
||||
|
||||
VX_to_mem #(
|
||||
.SRC_DATA_WIDTH (`VX_MEM_LINE_WIDTH),
|
||||
.DST_DATA_WIDTH (LMEM_LINE_WIDTH),
|
||||
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
|
||||
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
|
||||
.DST_TAG_WIDTH (AVS_REQ_TAGW)
|
||||
) vx_to_mem (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.mem_req_valid_in (vx_mem_req_valid && vx_mem_en),
|
||||
.mem_req_addr_in (vx_mem_req_addr),
|
||||
.mem_req_rw_in (vx_mem_req_rw),
|
||||
.mem_req_byteen_in (vx_mem_req_byteen),
|
||||
.mem_req_data_in (vx_mem_req_data),
|
||||
.mem_req_tag_in (vx_mem_req_tag),
|
||||
.mem_req_ready_in (vx_mem_req_ready),
|
||||
|
||||
.mem_req_valid_out (vx_mem_req_arb_valid),
|
||||
.mem_req_addr_out (vx_mem_req_arb_addr),
|
||||
.mem_req_rw_out (vx_mem_req_arb_rw),
|
||||
.mem_req_byteen_out (vx_mem_req_arb_byteen),
|
||||
.mem_req_data_out (vx_mem_req_arb_data),
|
||||
.mem_req_tag_out (vx_mem_req_arb_tag),
|
||||
.mem_req_ready_out (vx_mem_req_arb_ready),
|
||||
|
||||
.mem_rsp_valid_in (vx_mem_rsp_arb_valid),
|
||||
.mem_rsp_data_in (vx_mem_rsp_arb_data),
|
||||
.mem_rsp_tag_in (vx_mem_rsp_arb_tag),
|
||||
.mem_rsp_ready_in (vx_mem_rsp_arb_ready),
|
||||
|
||||
.mem_rsp_valid_out (vx_mem_rsp_valid),
|
||||
.mem_rsp_data_out (vx_mem_rsp_data),
|
||||
.mem_rsp_tag_out (vx_mem_rsp_tag),
|
||||
.mem_rsp_ready_out (vx_mem_rsp_ready)
|
||||
);
|
||||
|
||||
//--
|
||||
|
||||
wire mem_req_valid;
|
||||
wire mem_req_rw;
|
||||
t_local_mem_byte_mask mem_req_byteen;
|
||||
t_local_mem_addr mem_req_addr;
|
||||
t_local_mem_data mem_req_data;
|
||||
wire [AVS_REQ_TAGW:0] mem_req_tag;
|
||||
wire mem_req_ready;
|
||||
|
||||
wire mem_rsp_valid;
|
||||
t_local_mem_data mem_rsp_data;
|
||||
wire [AVS_REQ_TAGW:0] mem_rsp_tag;
|
||||
wire mem_rsp_ready;
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_REQS (2),
|
||||
.DATA_WIDTH ($bits(t_local_mem_data)),
|
||||
.ADDR_WIDTH ($bits(t_local_mem_addr)),
|
||||
.DATA_WIDTH (LMEM_LINE_WIDTH),
|
||||
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.TAG_IN_WIDTH (AVS_REQ_TAGW),
|
||||
.TAG_OUT_WIDTH (AVS_REQ_TAGW+1)
|
||||
) dram_arb (
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Source request
|
||||
.req_valid_in ({cci_dram_req_valid, vx_dram_req_valid_qual}),
|
||||
.req_rw_in ({cci_dram_req_rw, vx_dram_req_rw}),
|
||||
.req_byteen_in ({cci_dram_req_byteen, vx_dram_req_byteen_qual}),
|
||||
.req_addr_in ({cci_dram_req_addr, vx_dram_req_addr_qual}),
|
||||
.req_data_in ({cci_dram_req_data, vx_dram_req_data_qual}),
|
||||
.req_tag_in ({cci_dram_req_tag, vx_dram_req_tag_qual}),
|
||||
.req_ready_in ({cci_dram_req_ready, vx_dram_req_ready}),
|
||||
.req_valid_in ({cci_mem_req_arb_valid, vx_mem_req_arb_valid}),
|
||||
.req_rw_in ({cci_mem_req_arb_rw, vx_mem_req_arb_rw}),
|
||||
.req_byteen_in ({cci_mem_req_arb_byteen, vx_mem_req_arb_byteen}),
|
||||
.req_addr_in ({cci_mem_req_arb_addr, vx_mem_req_arb_addr}),
|
||||
.req_data_in ({cci_mem_req_arb_data, vx_mem_req_arb_data}),
|
||||
.req_tag_in ({cci_mem_req_arb_tag, vx_mem_req_arb_tag}),
|
||||
.req_ready_in ({cci_mem_req_arb_ready, vx_mem_req_arb_ready}),
|
||||
|
||||
// DRAM request
|
||||
.req_valid_out (dram_req_valid),
|
||||
.req_rw_out (dram_req_rw),
|
||||
.req_byteen_out (dram_req_byteen),
|
||||
.req_addr_out (dram_req_addr),
|
||||
.req_data_out (dram_req_data),
|
||||
.req_tag_out (dram_req_tag),
|
||||
.req_ready_out (dram_req_ready),
|
||||
// Memory request
|
||||
.req_valid_out (mem_req_valid),
|
||||
.req_rw_out (mem_req_rw),
|
||||
.req_byteen_out (mem_req_byteen),
|
||||
.req_addr_out (mem_req_addr),
|
||||
.req_data_out (mem_req_data),
|
||||
.req_tag_out (mem_req_tag),
|
||||
.req_ready_out (mem_req_ready),
|
||||
|
||||
// Source response
|
||||
.rsp_valid_out ({cci_dram_rsp_valid, vx_dram_rsp_valid}),
|
||||
.rsp_data_out ({cci_dram_rsp_data, vx_dram_rsp_data_unqual}),
|
||||
.rsp_tag_out ({cci_dram_rsp_tag, vx_dram_rsp_tag_unqual}),
|
||||
.rsp_ready_out ({cci_dram_rsp_ready, vx_dram_rsp_ready}),
|
||||
.rsp_valid_out ({cci_mem_rsp_arb_valid, vx_mem_rsp_arb_valid}),
|
||||
.rsp_data_out ({cci_mem_rsp_arb_data, vx_mem_rsp_arb_data}),
|
||||
.rsp_tag_out ({cci_mem_rsp_arb_tag, vx_mem_rsp_arb_tag}),
|
||||
.rsp_ready_out ({cci_mem_rsp_arb_ready, vx_mem_rsp_arb_ready}),
|
||||
|
||||
// DRAM response
|
||||
.rsp_valid_in (dram_rsp_valid),
|
||||
.rsp_tag_in (dram_rsp_tag),
|
||||
.rsp_data_in (dram_rsp_data),
|
||||
.rsp_ready_in (dram_rsp_ready)
|
||||
// Memory response
|
||||
.rsp_valid_in (mem_rsp_valid),
|
||||
.rsp_tag_in (mem_rsp_tag),
|
||||
.rsp_data_in (mem_rsp_data),
|
||||
.rsp_ready_in (mem_rsp_ready)
|
||||
);
|
||||
|
||||
//--
|
||||
|
||||
VX_avs_wrapper #(
|
||||
.AVS_DATAW ($bits(t_local_mem_data)),
|
||||
.AVS_ADDRW ($bits(t_local_mem_addr)),
|
||||
.AVS_BURSTW ($bits(t_local_mem_burst_cnt)),
|
||||
.AVS_BANKS (NUM_LOCAL_MEM_BANKS),
|
||||
.REQ_TAGW (AVS_REQ_TAGW+1),
|
||||
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE)
|
||||
.NUM_BANKS (NUM_LOCAL_MEM_BANKS),
|
||||
.AVS_DATA_WIDTH (LMEM_LINE_WIDTH),
|
||||
.AVS_ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.AVS_BURST_WIDTH (LMEM_BURST_CTRW),
|
||||
.AVS_BANKS (NUM_LOCAL_MEM_BANKS),
|
||||
.REQ_TAG_WIDTH (AVS_REQ_TAGW + 1),
|
||||
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE)
|
||||
) avs_wrapper (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (mem_req_valid),
|
||||
.mem_req_rw (mem_req_rw),
|
||||
.mem_req_byteen (mem_req_byteen),
|
||||
.mem_req_addr (mem_req_addr),
|
||||
.mem_req_data (mem_req_data),
|
||||
.mem_req_tag (mem_req_tag),
|
||||
.mem_req_ready (mem_req_ready),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (mem_rsp_valid),
|
||||
.mem_rsp_data (mem_rsp_data),
|
||||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_ready (mem_rsp_ready),
|
||||
|
||||
// AVS bus
|
||||
.avs_writedata (avs_writedata),
|
||||
.avs_readdata (avs_readdata),
|
||||
.avs_address (avs_address),
|
||||
.avs_waitrequest (avs_waitrequest),
|
||||
.avs_write (avs_write),
|
||||
.avs_read (avs_read),
|
||||
.avs_byteenable (avs_byteenable),
|
||||
.avs_burstcount (avs_burstcount),
|
||||
.avs_readdatavalid (avs_readdatavalid),
|
||||
.avs_bankselect (mem_bank_select),
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (dram_req_valid),
|
||||
.dram_req_rw (dram_req_rw),
|
||||
.dram_req_byteen (dram_req_byteen),
|
||||
.dram_req_addr (dram_req_addr),
|
||||
.dram_req_data (dram_req_data),
|
||||
.dram_req_tag (dram_req_tag),
|
||||
.dram_req_ready (dram_req_ready),
|
||||
|
||||
// DRAM response
|
||||
.dram_rsp_valid (dram_rsp_valid),
|
||||
.dram_rsp_data (dram_rsp_data),
|
||||
.dram_rsp_tag (dram_rsp_tag),
|
||||
.dram_rsp_ready (dram_rsp_ready)
|
||||
.avs_writedata (avs_writedata),
|
||||
.avs_readdata (avs_readdata),
|
||||
.avs_address (avs_address),
|
||||
.avs_waitrequest (avs_waitrequest),
|
||||
.avs_write (avs_write),
|
||||
.avs_read (avs_read),
|
||||
.avs_byteenable (avs_byteenable),
|
||||
.avs_burstcount (avs_burstcount),
|
||||
.avs_readdatavalid(avs_readdatavalid)
|
||||
);
|
||||
|
||||
// CCI-P Read Request ///////////////////////////////////////////////////////////
|
||||
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_wr_req_ctr;
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr;
|
||||
wire [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr_next;
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_wr_req_addr_unqual;
|
||||
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag, cci_rd_rsp_tag;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr;
|
||||
wire [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr_unqual;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr;
|
||||
wire [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr_next;
|
||||
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag;
|
||||
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_tag;
|
||||
reg [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_ctr;
|
||||
t_ccip_clAddr cci_rd_req_addr;
|
||||
|
||||
reg cci_rd_req_enable, cci_rd_req_wait;
|
||||
wire cci_rd_req_fire;
|
||||
t_ccip_clAddr cci_rd_req_addr;
|
||||
reg cci_rd_req_valid, cci_rd_req_wait;
|
||||
|
||||
wire cci_rdq_push, cci_rdq_pop;
|
||||
wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_din;
|
||||
wire cci_rdq_empty;
|
||||
|
||||
always @(*) begin
|
||||
af2cp_sTxPort.c0.valid = cci_rd_req_fire;
|
||||
af2cp_sTxPort.c0.hdr = t_ccip_c0_ReqMemHdr'(0);
|
||||
af2cp_sTxPort.c0.hdr.address = cci_rd_req_addr;
|
||||
af2cp_sTxPort.c0.hdr.mdata = t_ccip_mdata'(cci_rd_req_tag);
|
||||
end
|
||||
|
||||
wire cci_dram_wr_req_fire = cci_dram_wr_req_valid && cci_dram_req_ready;
|
||||
|
||||
wire cci_rd_req_fire = af2cp_sTxPort.c0.valid;
|
||||
wire cci_mem_wr_req_fire = cci_mem_wr_req_valid && cci_mem_req_ready;
|
||||
|
||||
wire cci_rd_rsp_fire = (STATE_WRITE == state)
|
||||
&& cp2af_sRxPort.c0.rspValid
|
||||
|
@ -631,10 +702,8 @@ wire cci_rd_rsp_fire = (STATE_WRITE == state)
|
|||
assign cci_rd_req_tag = CCI_RD_RQ_TAGW'(cci_rd_req_ctr);
|
||||
assign cci_rd_rsp_tag = CCI_RD_RQ_TAGW'(cp2af_sRxPort.c0.hdr.mdata);
|
||||
|
||||
assign cci_rd_req_ctr_next = cci_rd_req_ctr + DRAM_ADDR_WIDTH'(cci_rd_req_fire ? 1 : 0);
|
||||
|
||||
assign cci_rdq_pop = cci_dram_wr_req_fire;
|
||||
assign cci_rdq_push = cci_rd_rsp_fire;
|
||||
assign cci_rdq_pop = cci_mem_wr_req_fire;
|
||||
assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_rd_rsp_tag};
|
||||
|
||||
wire [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads;
|
||||
|
@ -646,79 +715,80 @@ VX_pending_size #(
|
|||
.reset (reset),
|
||||
.push (cci_rd_req_fire),
|
||||
.pop (cci_rdq_pop),
|
||||
`UNUSED_PIN (empty),
|
||||
.full (cci_pending_reads_full),
|
||||
.size (cci_pending_reads)
|
||||
.size (cci_pending_reads),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
`UNUSED_VAR (cci_pending_reads)
|
||||
|
||||
assign cci_dram_wr_req_valid = !cci_rdq_empty;
|
||||
assign cci_rd_req_ctr_next = cci_rd_req_ctr + CCI_ADDR_WIDTH'(cci_rd_req_fire ? 1 : 0);
|
||||
|
||||
assign cci_dram_wr_req_addr = cci_dram_wr_req_addr_unqual + (DRAM_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout)));
|
||||
assign cci_rd_req_fire = cci_rd_req_valid && !(cci_rd_req_wait || cci_pending_reads_full);
|
||||
|
||||
assign cci_mem_wr_req_valid = !cci_rdq_empty;
|
||||
|
||||
assign cci_mem_wr_req_addr = cci_mem_wr_req_addr_unqual + (CCI_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout)));
|
||||
|
||||
assign af2cp_sTxPort.c0.valid = cci_rd_req_enable && !cci_rd_req_wait;
|
||||
|
||||
assign cmd_write_done = (cci_dram_wr_req_ctr == cmd_data_size);
|
||||
assign cmd_write_done = (cci_mem_wr_req_ctr == cmd_data_size);
|
||||
|
||||
// Send read requests to CCI
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
cci_rd_req_addr <= 0;
|
||||
cci_rd_req_ctr <= 0;
|
||||
cci_rd_rsp_ctr <= 0;
|
||||
cci_rd_req_enable <= 0;
|
||||
cci_rd_req_wait <= 0;
|
||||
cci_dram_wr_req_ctr <= 0;
|
||||
cci_dram_wr_req_addr_unqual <= 0;
|
||||
end
|
||||
else begin
|
||||
cci_rd_req_valid <= 0;
|
||||
cci_rd_req_wait <= 0;
|
||||
end else begin
|
||||
if ((STATE_IDLE == state)
|
||||
&& (CMD_MEM_WRITE == cmd_type)) begin
|
||||
cci_rd_req_addr <= cmd_io_addr;
|
||||
cci_rd_req_ctr <= 0;
|
||||
cci_rd_rsp_ctr <= 0;
|
||||
cci_rd_req_enable <= (cmd_data_size != 0);
|
||||
cci_rd_req_wait <= 0;
|
||||
cci_dram_wr_req_ctr <= 0;
|
||||
cci_dram_wr_req_addr_unqual <= cmd_mem_addr;
|
||||
cci_rd_req_valid <= (cmd_data_size != 0);
|
||||
cci_rd_req_wait <= 0;
|
||||
end
|
||||
|
||||
cci_rd_req_enable <= (STATE_WRITE == state)
|
||||
&& (cci_rd_req_ctr_next != cmd_data_size)
|
||||
&& !cci_pending_reads_full
|
||||
&& !cp2af_sRxPort.c0TxAlmFull;
|
||||
cci_rd_req_valid <= (STATE_WRITE == state)
|
||||
&& (cci_rd_req_ctr_next != cmd_data_size)
|
||||
&& !cp2af_sRxPort.c0TxAlmFull;
|
||||
|
||||
if (cci_rd_req_fire) begin
|
||||
cci_rd_req_addr <= cci_rd_req_addr + 1;
|
||||
cci_rd_req_ctr <= cci_rd_req_ctr_next;
|
||||
if (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
|
||||
cci_rd_req_wait <= 1; // end current request batch
|
||||
end
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr_next), cci_pending_reads);
|
||||
`endif
|
||||
if (cci_rd_req_fire && (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
|
||||
cci_rd_req_wait <= 1; // end current request batch
|
||||
end
|
||||
|
||||
if (cci_rd_rsp_fire) begin
|
||||
cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1);
|
||||
if (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
|
||||
cci_rd_req_wait <= 0; // restart new request batch
|
||||
end
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
|
||||
`endif
|
||||
end
|
||||
if (cci_rd_rsp_fire && (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
|
||||
cci_rd_req_wait <= 0; // begin new request batch
|
||||
end
|
||||
end
|
||||
|
||||
/*if (cci_rdq_pop) begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads);
|
||||
`endif
|
||||
end*/
|
||||
if ((STATE_IDLE == state)
|
||||
&& (CMD_MEM_WRITE == cmd_type)) begin
|
||||
cci_rd_req_addr <= cmd_io_addr;
|
||||
cci_rd_req_ctr <= 0;
|
||||
cci_rd_rsp_ctr <= 0;
|
||||
cci_mem_wr_req_ctr <= 0;
|
||||
cci_mem_wr_req_addr_unqual <= cmd_mem_addr;
|
||||
end
|
||||
|
||||
if (cci_dram_wr_req_fire) begin
|
||||
cci_dram_wr_req_addr_unqual <= cci_dram_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_dram_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? DRAM_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : DRAM_ADDR_WIDTH'(0));
|
||||
cci_dram_wr_req_ctr <= cci_dram_wr_req_ctr + DRAM_ADDR_WIDTH'(1);
|
||||
end
|
||||
if (cci_rd_req_fire) begin
|
||||
cci_rd_req_addr <= cci_rd_req_addr + 1;
|
||||
cci_rd_req_ctr <= cci_rd_req_ctr + 1;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (cci_rd_rsp_fire) begin
|
||||
cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1);
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (cci_rdq_pop) begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (cci_mem_wr_req_fire) begin
|
||||
cci_mem_wr_req_addr_unqual <= cci_mem_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_mem_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : CCI_ADDR_WIDTH'(0));
|
||||
cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -761,22 +831,24 @@ VX_fifo_queue #(
|
|||
|
||||
// CCI-P Write Request //////////////////////////////////////////////////////////
|
||||
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_ctr;
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cci_wr_req_ctr;
|
||||
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr_r;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_ctr;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_addr;
|
||||
reg [CCI_ADDR_WIDTH-1:0] cci_wr_req_ctr;
|
||||
|
||||
reg cci_wr_req_fire;
|
||||
t_ccip_clAddr cci_wr_req_addr;
|
||||
t_ccip_clData cci_wr_req_data;
|
||||
|
||||
always @(*) begin
|
||||
af2cp_sTxPort.c1.valid = cci_wr_req_fire;
|
||||
af2cp_sTxPort.c1.hdr = t_ccip_c1_ReqMemHdr'(0);
|
||||
af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr;
|
||||
af2cp_sTxPort.c1.hdr.sop = 1; // single line write mode
|
||||
af2cp_sTxPort.c1.data = t_ccip_clData'(cci_dram_rsp_data);
|
||||
af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr;
|
||||
af2cp_sTxPort.c1.data = cci_wr_req_data;
|
||||
end
|
||||
|
||||
wire cci_dram_rd_req_fire = cci_dram_rd_req_valid && cci_dram_req_ready;
|
||||
wire cci_dram_rd_rsp_fire = cci_dram_rsp_valid && cci_dram_rsp_ready;
|
||||
|
||||
wire cci_wr_req_fire = cci_dram_rd_rsp_fire;
|
||||
wire cci_mem_rd_req_fire = cci_mem_rd_req_valid && cci_mem_req_ready;
|
||||
wire cci_mem_rd_rsp_fire = cci_mem_rsp_valid && cci_mem_rsp_ready;
|
||||
|
||||
wire cci_wr_rsp_fire = (STATE_READ == state)
|
||||
&& cp2af_sRxPort.c1.rspValid
|
||||
|
@ -785,12 +857,13 @@ wire cci_wr_rsp_fire = (STATE_READ == state)
|
|||
wire [$clog2(CCI_RW_PENDING_SIZE+1)-1:0] cci_pending_writes;
|
||||
wire cci_pending_writes_empty;
|
||||
wire cci_pending_writes_full;
|
||||
|
||||
VX_pending_size #(
|
||||
.SIZE (CCI_RW_PENDING_SIZE)
|
||||
) cci_wr_pending_size (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (cci_wr_req_fire),
|
||||
.push (cci_mem_rd_rsp_fire),
|
||||
.pop (cci_wr_rsp_fire),
|
||||
.empty (cci_pending_writes_empty),
|
||||
.full (cci_pending_writes_full),
|
||||
|
@ -798,54 +871,61 @@ VX_pending_size #(
|
|||
);
|
||||
`UNUSED_VAR (cci_pending_writes)
|
||||
|
||||
assign cci_dram_rd_req_valid = (cci_dram_rd_req_ctr != 0);
|
||||
assign cci_dram_rd_req_addr = cci_dram_rd_req_addr_r;
|
||||
assign cci_mem_rd_req_valid = (STATE_READ == state)
|
||||
&& (cci_mem_rd_req_ctr != cmd_data_size);
|
||||
|
||||
assign af2cp_sTxPort.c1.valid = cci_dram_rd_rsp_fire;
|
||||
assign cci_dram_rsp_ready = !cp2af_sRxPort.c1TxAlmFull && !cci_pending_writes_full;
|
||||
assign cci_mem_rsp_ready = !cp2af_sRxPort.c1TxAlmFull
|
||||
&& !cci_pending_writes_full;
|
||||
|
||||
assign cmd_read_done = (0 == cci_wr_req_ctr) && cci_pending_writes_empty;
|
||||
assign cmd_read_done = (0 == cci_wr_req_ctr)
|
||||
&& cci_pending_writes_empty;
|
||||
|
||||
// Send write requests to CCI
|
||||
always @(posedge clk)
|
||||
begin
|
||||
if (reset) begin
|
||||
cci_wr_req_addr <= 0;
|
||||
cci_wr_req_ctr <= 0;
|
||||
cci_dram_rd_req_ctr <= 0;
|
||||
cci_dram_rd_req_addr_r <= 0;
|
||||
cci_wr_req_fire <= 0;
|
||||
end else begin
|
||||
cci_wr_req_fire <= cci_mem_rd_rsp_fire;
|
||||
end
|
||||
else begin
|
||||
if ((STATE_IDLE == state)
|
||||
&& (CMD_MEM_READ == cmd_type)) begin
|
||||
cci_wr_req_addr <= cmd_io_addr;
|
||||
cci_wr_req_ctr <= cmd_data_size;
|
||||
cci_dram_rd_req_ctr <= cmd_data_size;
|
||||
cci_dram_rd_req_addr_r <= cmd_mem_addr;
|
||||
end
|
||||
|
||||
if ((STATE_IDLE == state)
|
||||
&& (CMD_MEM_READ == cmd_type)) begin
|
||||
cci_mem_rd_req_ctr <= 0;
|
||||
cci_mem_rd_req_addr <= cmd_mem_addr;
|
||||
cci_wr_req_ctr <= cmd_data_size;
|
||||
end
|
||||
|
||||
if (cci_wr_req_fire) begin
|
||||
assert(cci_wr_req_ctr != 0);
|
||||
cci_wr_req_addr <= cci_wr_req_addr + t_ccip_clAddr'(1);
|
||||
cci_wr_req_ctr <= cci_wr_req_ctr - DRAM_ADDR_WIDTH'(1);
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
|
||||
`endif
|
||||
end
|
||||
if (cci_mem_rd_req_fire) begin
|
||||
cci_mem_rd_req_addr <= cci_mem_rd_req_addr + CCI_ADDR_WIDTH'(1);
|
||||
cci_mem_rd_req_ctr <= cci_mem_rd_req_ctr + CCI_ADDR_WIDTH'(1);
|
||||
end
|
||||
|
||||
/*`ifdef DBG_PRINT_OPAE
|
||||
if (cci_wr_rsp_fire) begin
|
||||
$display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes);
|
||||
end
|
||||
`endif*/
|
||||
cci_wr_req_addr <= cmd_io_addr + t_ccip_clAddr'(cci_mem_rsp_tag);
|
||||
cci_wr_req_data <= t_ccip_clData'(cci_mem_rsp_data);
|
||||
|
||||
if (cci_dram_rd_req_fire) begin
|
||||
cci_dram_rd_req_addr_r <= cci_dram_rd_req_addr_r + DRAM_ADDR_WIDTH'(1);
|
||||
cci_dram_rd_req_ctr <= cci_dram_rd_req_ctr - DRAM_ADDR_WIDTH'(1);
|
||||
end
|
||||
if (cci_wr_req_fire) begin
|
||||
assert(cci_wr_req_ctr != 0);
|
||||
cci_wr_req_ctr <= cci_wr_req_ctr - CCI_ADDR_WIDTH'(1);
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (cci_wr_rsp_fire) begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
$display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
//--
|
||||
|
||||
assign cci_mem_req_rw = (CMD_MEM_WRITE == state);
|
||||
assign cci_mem_req_valid = cci_mem_req_rw ? cci_mem_wr_req_valid : cci_mem_rd_req_valid;
|
||||
assign cci_mem_req_addr = cci_mem_req_rw ? cci_mem_wr_req_addr : cci_mem_rd_req_addr;
|
||||
assign cci_mem_req_tag = cci_mem_req_rw ? cci_mem_wr_req_ctr : cci_mem_rd_req_ctr;
|
||||
|
||||
// CSRs ///////////////////////////////////////////////////////////////////////
|
||||
|
||||
reg csr_io_req_sent;
|
||||
|
@ -890,20 +970,20 @@ Vortex #() vortex (
|
|||
.clk (clk),
|
||||
.reset (reset | vx_reset),
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (vx_dram_req_valid),
|
||||
.dram_req_rw (vx_dram_req_rw),
|
||||
.dram_req_byteen(vx_dram_req_byteen),
|
||||
.dram_req_addr (vx_dram_req_addr),
|
||||
.dram_req_data (vx_dram_req_data),
|
||||
.dram_req_tag (vx_dram_req_tag),
|
||||
.dram_req_ready (vx_dram_req_ready),
|
||||
// Memory request
|
||||
.mem_req_valid (vx_mem_req_valid),
|
||||
.mem_req_rw (vx_mem_req_rw),
|
||||
.mem_req_byteen (vx_mem_req_byteen),
|
||||
.mem_req_addr (vx_mem_req_addr),
|
||||
.mem_req_data (vx_mem_req_data),
|
||||
.mem_req_tag (vx_mem_req_tag),
|
||||
.mem_req_ready (vx_mem_req_ready),
|
||||
|
||||
// DRAM response
|
||||
.dram_rsp_valid (vx_dram_rsp_valid),
|
||||
.dram_rsp_data (vx_dram_rsp_data),
|
||||
.dram_rsp_tag (vx_dram_rsp_tag),
|
||||
.dram_rsp_ready (vx_dram_rsp_ready),
|
||||
// Memory response
|
||||
.mem_rsp_valid (vx_mem_rsp_valid),
|
||||
.mem_rsp_data (vx_mem_rsp_data),
|
||||
.mem_rsp_tag (vx_mem_rsp_tag),
|
||||
.mem_rsp_ready (vx_mem_rsp_ready),
|
||||
|
||||
// CSR Request
|
||||
.csr_req_valid (vx_csr_io_req_valid),
|
||||
|
@ -944,16 +1024,15 @@ Vortex #() vortex (
|
|||
`SCOPE_ASSIGN (cci_sTxPort_c2_mmioRdValid, af2cp_sTxPort.c2.mmioRdValid);
|
||||
`SCOPE_ASSIGN (cci_sRxPort_c0TxAlmFull, cp2af_sRxPort.c0TxAlmFull);
|
||||
`SCOPE_ASSIGN (cci_sRxPort_c1TxAlmFull, cp2af_sRxPort.c1TxAlmFull);
|
||||
`SCOPE_ASSIGN (avs_address, avs_address);
|
||||
`SCOPE_ASSIGN (avs_waitrequest, avs_waitrequest);
|
||||
`SCOPE_ASSIGN (avs_write_fire, avs_write && !avs_waitrequest);
|
||||
`SCOPE_ASSIGN (avs_read_fire, avs_read && !avs_waitrequest);
|
||||
`SCOPE_ASSIGN (avs_byteenable, avs_byteenable);
|
||||
`SCOPE_ASSIGN (avs_burstcount, avs_burstcount);
|
||||
`SCOPE_ASSIGN (avs_readdatavalid, avs_readdatavalid);
|
||||
`SCOPE_ASSIGN (mem_bank_select, mem_bank_select);
|
||||
`SCOPE_ASSIGN (cci_dram_rd_req_ctr, cci_dram_rd_req_ctr);
|
||||
`SCOPE_ASSIGN (cci_dram_wr_req_ctr, cci_dram_wr_req_ctr);
|
||||
`SCOPE_ASSIGN (avs_address, avs_address[0]);
|
||||
`SCOPE_ASSIGN (avs_waitrequest, avs_waitrequest[0]);
|
||||
`SCOPE_ASSIGN (avs_write_fire, avs_write[0] && !avs_waitrequest[0]);
|
||||
`SCOPE_ASSIGN (avs_read_fire, avs_read[0] && !avs_waitrequest[0]);
|
||||
`SCOPE_ASSIGN (avs_byteenable, avs_byteenable[0]);
|
||||
`SCOPE_ASSIGN (avs_burstcount, avs_burstcount[0]);
|
||||
`SCOPE_ASSIGN (avs_readdatavalid, avs_readdatavalid[0]);
|
||||
`SCOPE_ASSIGN (cci_mem_rd_req_ctr, cci_mem_rd_req_ctr);
|
||||
`SCOPE_ASSIGN (cci_mem_wr_req_ctr, cci_mem_wr_req_ctr);
|
||||
`SCOPE_ASSIGN (cci_rd_req_ctr, cci_rd_req_ctr);
|
||||
`SCOPE_ASSIGN (cci_rd_rsp_ctr, cci_rd_rsp_ctr);
|
||||
`SCOPE_ASSIGN (cci_wr_req_ctr, cci_wr_req_ctr);
|
||||
|
@ -964,11 +1043,11 @@ Vortex #() vortex (
|
|||
`SCOPE_ASSIGN (cci_pending_reads_full, cci_pending_reads_full);
|
||||
`SCOPE_ASSIGN (cci_pending_writes_empty, cci_pending_writes_empty);
|
||||
`SCOPE_ASSIGN (cci_pending_writes_full, cci_pending_writes_full);
|
||||
`SCOPE_ASSIGN (afu_dram_req_fire, (dram_req_valid && dram_req_ready));
|
||||
`SCOPE_ASSIGN (afu_dram_req_addr, dram_req_addr);
|
||||
`SCOPE_ASSIGN (afu_dram_req_tag, dram_req_tag);
|
||||
`SCOPE_ASSIGN (afu_dram_rsp_fire, (dram_rsp_valid && dram_rsp_ready));
|
||||
`SCOPE_ASSIGN (afu_dram_rsp_tag, dram_rsp_tag);
|
||||
`SCOPE_ASSIGN (afu_mem_req_fire, (mem_req_valid && mem_req_ready));
|
||||
`SCOPE_ASSIGN (afu_mem_req_addr, mem_req_addr);
|
||||
`SCOPE_ASSIGN (afu_mem_req_tag, mem_req_tag);
|
||||
`SCOPE_ASSIGN (afu_mem_rsp_fire, (mem_rsp_valid && mem_rsp_ready));
|
||||
`SCOPE_ASSIGN (afu_mem_rsp_tag, mem_rsp_tag);
|
||||
|
||||
wire scope_changed = `SCOPE_TRIGGER;
|
||||
|
||||
|
|
|
@ -1,18 +1,27 @@
|
|||
`ifndef __VORTEX_AFU__
|
||||
`define __VORTEX_AFU__
|
||||
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
`include "ccip_if_pkg.sv"
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
`define PLATFORM_PROVIDES_LOCAL_MEMORY
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
|
||||
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BANKS 2
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
|
||||
`endif
|
||||
|
||||
`include "local_mem_cfg_pkg.sv"
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
`define AFU_ACCEL_NAME "vortex_afu"
|
||||
`define AFU_ACCEL_UUID 128'h35f9452b_25c2_434c_93d5_6f8c60db361c
|
||||
|
|
168
hw/rtl/cache/VX_bank.v
vendored
168
hw/rtl/cache/VX_bank.v
vendored
|
@ -22,8 +22,8 @@ module VX_bank #(
|
|||
parameter CREQ_SIZE = 1,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 1,
|
||||
// DRAM Request Queue Size
|
||||
parameter DREQ_SIZE = 1,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 1,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
@ -35,10 +35,7 @@ module VX_bank #(
|
|||
parameter CORE_TAG_ID_BITS = 0,
|
||||
|
||||
// bank offset from beginning of index range
|
||||
parameter BANK_ADDR_OFFSET = 0,
|
||||
|
||||
// in-order DRAM
|
||||
parameter IN_ORDER_DRAM = 0
|
||||
parameter BANK_ADDR_OFFSET = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_bank
|
||||
|
||||
|
@ -71,19 +68,19 @@ module VX_bank #(
|
|||
output wire [CORE_TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire core_rsp_ready,
|
||||
|
||||
// DRAM request
|
||||
output wire dram_req_valid,
|
||||
output wire dram_req_rw,
|
||||
output wire [CACHE_LINE_SIZE-1:0] dram_req_byteen,
|
||||
output wire [`LINE_ADDR_WIDTH-1:0] dram_req_addr,
|
||||
output wire [`CACHE_LINE_WIDTH-1:0] dram_req_data,
|
||||
input wire dram_req_ready,
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// DRAM response
|
||||
input wire dram_rsp_valid,
|
||||
input wire [`LINE_ADDR_WIDTH-1:0] dram_rsp_addr,
|
||||
input wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data,
|
||||
output wire dram_rsp_ready,
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`LINE_ADDR_WIDTH-1:0] mem_rsp_addr,
|
||||
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// flush
|
||||
input wire flush_enable,
|
||||
|
@ -93,10 +90,10 @@ module VX_bank #(
|
|||
`UNUSED_PARAM (CORE_TAG_ID_BITS)
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
/* verilator lint_off UNUSED */
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1;
|
||||
wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1;
|
||||
/* verilator lint_on UNUSED */
|
||||
`IGNORE_WARNINGS_END
|
||||
`endif
|
||||
|
||||
wire creq_pop;
|
||||
|
@ -167,8 +164,8 @@ module VX_bank #(
|
|||
wire is_flush_st0;
|
||||
|
||||
wire crsq_in_valid, crsq_in_ready, crsq_in_stall;
|
||||
wire dreq_alm_full;
|
||||
wire drsq_pop;
|
||||
wire mreq_alm_full;
|
||||
wire mrsq_pop;
|
||||
|
||||
wire crsq_in_fire = crsq_in_valid && crsq_in_ready;
|
||||
|
||||
|
@ -186,24 +183,24 @@ module VX_bank #(
|
|||
|
||||
// determine which queue to pop next in priority order
|
||||
wire mshr_pop_unqual = mshr_valid
|
||||
&& !dreq_alm_full; // ensure DRAM request queue not full (deadlock prevention)
|
||||
wire drsq_pop_unqual = !mshr_pop_unqual && dram_rsp_valid;
|
||||
wire creq_pop_unqual = !mshr_pop_unqual && !drsq_pop_unqual && !creq_empty && !flush_enable;
|
||||
&& !mreq_alm_full; // ensure memory request queue not full (deadlock prevention)
|
||||
wire mrsq_pop_unqual = !mshr_pop_unqual && mem_rsp_valid;
|
||||
wire creq_pop_unqual = !mshr_pop_unqual && !mrsq_pop_unqual && !creq_empty && !flush_enable;
|
||||
|
||||
wire is_miss_st1 = valid_st1 && (miss_st1 || force_miss_st1);
|
||||
assign mshr_pop = mshr_pop_unqual
|
||||
&& !(!IN_ORDER_DRAM && is_miss_st1 && is_mshr_st1) // do not schedule another mshr request if the previous one missed
|
||||
&& !(is_miss_st1 && is_mshr_st1) // do not schedule another mshr request if the previous one missed
|
||||
&& !crsq_in_stall; // ensure core response ready
|
||||
|
||||
assign drsq_pop = drsq_pop_unqual
|
||||
assign mrsq_pop = mrsq_pop_unqual
|
||||
&& !crsq_in_stall; // ensure core response ready
|
||||
|
||||
assign creq_pop = creq_pop_unqual
|
||||
&& !dreq_alm_full // ensure dram request ready
|
||||
&& !mreq_alm_full // ensure memory request ready
|
||||
&& !mshr_alm_full // ensure mshr enqueue ready
|
||||
&& !crsq_in_stall; // ensure core response ready
|
||||
|
||||
assign dram_rsp_ready = drsq_pop;
|
||||
assign mem_rsp_ready = mrsq_pop;
|
||||
|
||||
// we have a miss in mshr or entering it for the current address
|
||||
wire mshr_pending_sel = mshr_pending
|
||||
|
@ -237,15 +234,7 @@ module VX_bank #(
|
|||
end else begin
|
||||
assign creq_line_data = creq_data;
|
||||
end
|
||||
|
||||
wire [`LINE_ADDR_WIDTH-1:0] dram_rsp_addr_qual;
|
||||
if (IN_ORDER_DRAM) begin
|
||||
`UNUSED_VAR (dram_rsp_addr)
|
||||
assign dram_rsp_addr_qual = mshr_addr;
|
||||
end else begin
|
||||
assign dram_rsp_addr_qual = dram_rsp_addr;
|
||||
end
|
||||
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + 1 + 1),
|
||||
.RESETW (1)
|
||||
|
@ -254,13 +243,13 @@ module VX_bank #(
|
|||
.reset (reset),
|
||||
.enable (!crsq_in_stall),
|
||||
.data_in ({
|
||||
flush_enable || mshr_pop || drsq_pop || creq_pop,
|
||||
flush_enable || mshr_pop || mrsq_pop || creq_pop,
|
||||
flush_enable,
|
||||
mshr_pop_unqual,
|
||||
drsq_pop_unqual || flush_enable,
|
||||
mrsq_pop_unqual || flush_enable,
|
||||
mshr_pop_unqual ? 1'b0 : creq_rw,
|
||||
mshr_pop_unqual ? mshr_addr : (dram_rsp_valid ? dram_rsp_addr_qual : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)),
|
||||
dram_rsp_valid ? dram_rsp_data : creq_line_data,
|
||||
mshr_pop_unqual ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)),
|
||||
mem_rsp_valid ? mem_rsp_data : creq_line_data,
|
||||
mshr_pop_unqual ? mshr_wsel : creq_wsel,
|
||||
mshr_pop_unqual ? mshr_byteen : creq_byteen,
|
||||
mshr_pop_unqual ? mshr_tid : creq_tid,
|
||||
|
@ -307,7 +296,7 @@ module VX_bank #(
|
|||
);
|
||||
|
||||
// redundant fills
|
||||
wire is_redundant_fill_st0 = !IN_ORDER_DRAM && is_fill_st0 && tag_match_st0;
|
||||
wire is_redundant_fill_st0 = is_fill_st0 && tag_match_st0;
|
||||
|
||||
// we had a miss with prior request for the current address
|
||||
assign prev_miss_dep_st0 = is_miss_st1 && (addr_st0 == addr_st1);
|
||||
|
@ -322,9 +311,9 @@ module VX_bank #(
|
|||
assign writeen_unqual_st0 = (WRITE_ENABLE && !is_fill_st0 && tag_match_st0 && mem_rw_st0)
|
||||
|| (is_fill_st0 && !is_redundant_fill_st0);
|
||||
|
||||
assign incoming_fill_st0 = dram_rsp_valid && (addr_st0 == dram_rsp_addr_qual);
|
||||
assign incoming_fill_st0 = mem_rsp_valid && (addr_st0 == mem_rsp_addr);
|
||||
|
||||
assign fill_req_unqual_st0 = !mem_rw_st0 && (!force_miss_st0 || (!IN_ORDER_DRAM && is_mshr_st0 && !prev_miss_dep_st0));
|
||||
assign fill_req_unqual_st0 = !mem_rw_st0 && (!force_miss_st0 || (is_mshr_st0 && !prev_miss_dep_st0));
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH),
|
||||
|
@ -351,12 +340,12 @@ module VX_bank #(
|
|||
|
||||
wire mshr_push_st1 = !is_fill_st1 && !mem_rw_st1 && (miss_st1 || force_miss_st1);
|
||||
|
||||
wire incoming_fill_qual_st1 = (dram_rsp_valid && (addr_st1 == dram_rsp_addr_qual))
|
||||
wire incoming_fill_qual_st1 = (mem_rsp_valid && (addr_st1 == mem_rsp_addr))
|
||||
|| incoming_fill_st1;
|
||||
|
||||
wire do_writeback_st1 = !is_fill_st1 && mem_rw_st1;
|
||||
|
||||
wire dreq_push_st1 = (miss_st1 && fill_req_unqual_st1 && !incoming_fill_qual_st1)
|
||||
wire mreq_push_st1 = (miss_st1 && fill_req_unqual_st1 && !incoming_fill_qual_st1)
|
||||
|| do_writeback_st1;
|
||||
|
||||
wire [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] line_byteen_st1;
|
||||
|
@ -408,15 +397,14 @@ module VX_bank #(
|
|||
|
||||
assign mshr_push = valid_st1 && mshr_push_st1;
|
||||
wire mshr_dequeue = valid_st1 && is_mshr_st1 && !mshr_push_st1 && crsq_in_ready;
|
||||
wire mshr_restore = !IN_ORDER_DRAM && is_mshr_st1;
|
||||
`RUNTIME_ASSERT(!IN_ORDER_DRAM || !(mshr_push && mshr_restore), ("Oops!"))
|
||||
wire mshr_restore = is_mshr_st1;
|
||||
|
||||
// push a missed request as 'ready' if it was a forced miss that actually had a hit
|
||||
// or the fill request for this block is coming
|
||||
wire mshr_init_ready_state = !miss_st1 || incoming_fill_qual_st1;
|
||||
|
||||
// use dram rsp or core req address to lookup the mshr
|
||||
wire [`LINE_ADDR_WIDTH-1:0] lookup_addr = dram_rsp_valid ? dram_rsp_addr_qual : creq_addr;
|
||||
// use memory rsp or core req address to lookup the mshr
|
||||
wire [`LINE_ADDR_WIDTH-1:0] lookup_addr = mem_rsp_valid ? mem_rsp_addr : creq_addr;
|
||||
|
||||
VX_miss_resrv #(
|
||||
.BANK_ID (BANK_ID),
|
||||
|
@ -450,7 +438,7 @@ module VX_bank #(
|
|||
`UNUSED_PIN (enqueue_full),
|
||||
|
||||
// lookup
|
||||
.lookup_ready (drsq_pop),
|
||||
.lookup_ready (mrsq_pop),
|
||||
.lookup_addr (lookup_addr),
|
||||
.lookup_match (mshr_pending),
|
||||
|
||||
|
@ -500,41 +488,41 @@ module VX_bank #(
|
|||
.ready_out (core_rsp_ready)
|
||||
);
|
||||
|
||||
// Enqueue DRAM request
|
||||
// Enqueue memory request
|
||||
|
||||
wire [CACHE_LINE_SIZE-1:0] dreq_byteen;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] dreq_addr;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] dreq_data;
|
||||
wire dreq_push, dreq_pop, dreq_empty, dreq_rw;
|
||||
wire [CACHE_LINE_SIZE-1:0] mreq_byteen;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] mreq_addr;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] mreq_data;
|
||||
wire mreq_push, mreq_pop, mreq_empty, mreq_rw;
|
||||
|
||||
assign dreq_push = valid_st1 && dreq_push_st1;
|
||||
assign mreq_push = valid_st1 && mreq_push_st1;
|
||||
|
||||
assign dreq_pop = dram_req_valid && dram_req_ready;
|
||||
assign mreq_pop = mem_req_valid && mem_req_ready;
|
||||
|
||||
assign dreq_rw = WRITE_ENABLE && do_writeback_st1;
|
||||
assign dreq_byteen = dreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}};
|
||||
assign dreq_addr = addr_st1;
|
||||
assign dreq_data = wdata_st1;
|
||||
assign mreq_rw = WRITE_ENABLE && do_writeback_st1;
|
||||
assign mreq_byteen = mreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}};
|
||||
assign mreq_addr = addr_st1;
|
||||
assign mreq_data = wdata_st1;
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH),
|
||||
.SIZE (DREQ_SIZE),
|
||||
.ALM_FULL (DREQ_SIZE-2)
|
||||
) dram_req_queue (
|
||||
.SIZE (MREQ_SIZE),
|
||||
.ALM_FULL (MREQ_SIZE-2)
|
||||
) mem_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (dreq_push),
|
||||
.pop (dreq_pop),
|
||||
.data_in ({dreq_rw, dreq_byteen, dreq_addr, dreq_data}),
|
||||
.data_out ({dram_req_rw, dram_req_byteen, dram_req_addr, dram_req_data}),
|
||||
.empty (dreq_empty),
|
||||
.alm_full (dreq_alm_full),
|
||||
.push (mreq_push),
|
||||
.pop (mreq_pop),
|
||||
.data_in ({mreq_rw, mreq_byteen, mreq_addr, mreq_data}),
|
||||
.data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data}),
|
||||
.empty (mreq_empty),
|
||||
.alm_full (mreq_alm_full),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
assign dram_req_valid = !dreq_empty;
|
||||
assign mem_req_valid = !mreq_empty;
|
||||
|
||||
`SCOPE_ASSIGN (valid_st0, valid_st0);
|
||||
`SCOPE_ASSIGN (valid_st1, valid_st1);
|
||||
|
@ -544,7 +532,7 @@ module VX_bank #(
|
|||
`SCOPE_ASSIGN (force_miss_st0, force_miss_st0);
|
||||
`SCOPE_ASSIGN (mshr_push, mshr_push);
|
||||
`SCOPE_ASSIGN (crsq_in_stall, crsq_in_stall);
|
||||
`SCOPE_ASSIGN (dreq_alm_full, dreq_alm_full);
|
||||
`SCOPE_ASSIGN (mreq_alm_full, mreq_alm_full);
|
||||
`SCOPE_ASSIGN (mshr_alm_full, mshr_alm_full);
|
||||
`SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID));
|
||||
`SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
|
||||
|
@ -552,45 +540,45 @@ module VX_bank #(
|
|||
`ifdef PERF_ENABLE
|
||||
assign perf_read_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && !mem_rw_st1;
|
||||
assign perf_write_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && mem_rw_st1;
|
||||
assign perf_pipe_stalls = crsq_in_stall || dreq_alm_full || mshr_alm_full;
|
||||
assign perf_pipe_stalls = crsq_in_stall || mreq_alm_full || mshr_alm_full;
|
||||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_BANK
|
||||
always @(posedge clk) begin
|
||||
/*if (valid_st1 && pmask_st1 == {NUM_PORTS{1'b1}}) begin
|
||||
$display("%t: cache%0d:%0d full bank multi-porting - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
|
||||
end*/
|
||||
/*if (crsq_in_fire && (NUM_PORTS > 1) && $countones(crsq_pmask) > 1) begin
|
||||
$display("%t: *** cache%0d:%0d multi-port-out: pmask=%b, addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, crsq_pmask, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag);
|
||||
end */
|
||||
if (valid_st1 && !is_fill_st1 && miss_st1 && incoming_fill_qual_st1) begin
|
||||
$display("%t: cache%0d:%0d miss with incoming fill - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
|
||||
$display("%t: *** cache%0d:%0d miss with incoming fill - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
|
||||
assert(!is_mshr_st1);
|
||||
end
|
||||
if (crsq_in_stall || dreq_alm_full || mshr_alm_full) begin
|
||||
$display("%t: cache%0d:%0d pipeline-stall: cwbq=%b, dwbq=%b, mshr=%b", $time, CACHE_ID, BANK_ID, crsq_in_stall, dreq_alm_full, mshr_alm_full);
|
||||
if (crsq_in_stall || mreq_alm_full || mshr_alm_full) begin
|
||||
$display("%t: *** cache%0d:%0d pipeline-stall: cwbq=%b, dwbq=%b, mshr=%b", $time, CACHE_ID, BANK_ID, crsq_in_stall, mreq_alm_full, mshr_alm_full);
|
||||
end
|
||||
if (flush_enable) begin
|
||||
$display("%t: cache%0d:%0d flush: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(flush_addr, BANK_ID));
|
||||
end
|
||||
if (drsq_pop) begin
|
||||
$display("%t: cache%0d:%0d fill-rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_rsp_addr_qual, BANK_ID), dram_rsp_data);
|
||||
if (mrsq_pop) begin
|
||||
$display("%t: cache%0d:%0d fill-rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_data);
|
||||
end
|
||||
if (mshr_pop) begin
|
||||
$display("%t: cache%0d:%0d mshr-rd-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, mshr_byteen, debug_wid_sel, debug_pc_sel);
|
||||
$display("%t: cache%0d:%0d mshr-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, mshr_byteen, debug_wid_sel, debug_pc_sel);
|
||||
end
|
||||
if (creq_pop) begin
|
||||
if (creq_rw)
|
||||
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel);
|
||||
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel);
|
||||
else
|
||||
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel);
|
||||
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel);
|
||||
end
|
||||
if (crsq_in_fire) begin
|
||||
$display("%t: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1);
|
||||
$display("%t: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1);
|
||||
end
|
||||
if (dreq_push) begin
|
||||
if (mreq_push) begin
|
||||
if (do_writeback_st1)
|
||||
$display("%t: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dreq_addr, BANK_ID), dreq_data, dreq_byteen, debug_wid_st1, debug_pc_st1);
|
||||
$display("%t: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1);
|
||||
else
|
||||
$display("%t: cache%0d:%0d fill-req: addr=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dreq_addr, BANK_ID), debug_wid_st1, debug_pc_st1);
|
||||
$display("%t: cache%0d:%0d fill-req: addr=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), debug_wid_st1, debug_pc_st1);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
231
hw/rtl/cache/VX_cache.v
vendored
231
hw/rtl/cache/VX_cache.v
vendored
|
@ -21,10 +21,10 @@ module VX_cache #(
|
|||
parameter CREQ_SIZE = 4,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 8,
|
||||
// DRAM Response Queue Size
|
||||
parameter DRSQ_SIZE = 4,
|
||||
// DRAM Request Queue Size
|
||||
parameter DREQ_SIZE = 4,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 4,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
@ -35,22 +35,17 @@ module VX_cache #(
|
|||
// size of tag id in core request tag
|
||||
parameter CORE_TAG_ID_BITS = CORE_TAG_WIDTH,
|
||||
|
||||
// dram request tag size
|
||||
parameter DRAM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)),
|
||||
// Memory request tag size
|
||||
parameter MEM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)),
|
||||
|
||||
// bank offset from beginning of index range
|
||||
parameter BANK_ADDR_OFFSET = 0,
|
||||
|
||||
// in-order DRAM
|
||||
parameter IN_ORDER_DRAM = 0
|
||||
parameter BANK_ADDR_OFFSET = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_cache
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire flush,
|
||||
|
||||
// Core request
|
||||
input wire [NUM_REQS-1:0] core_req_valid,
|
||||
input wire [NUM_REQS-1:0] core_req_rw,
|
||||
|
@ -66,29 +61,32 @@ module VX_cache #(
|
|||
output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_cache_if,
|
||||
`endif
|
||||
|
||||
// DRAM request
|
||||
output wire dram_req_valid,
|
||||
output wire dram_req_rw,
|
||||
output wire [CACHE_LINE_SIZE-1:0] dram_req_byteen,
|
||||
output wire [`DRAM_ADDR_WIDTH-1:0] dram_req_addr,
|
||||
output wire [`CACHE_LINE_WIDTH-1:0] dram_req_data,
|
||||
output wire [DRAM_TAG_WIDTH-1:0] dram_req_tag,
|
||||
input wire dram_req_ready,
|
||||
|
||||
// DRAM response
|
||||
input wire dram_rsp_valid,
|
||||
input wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data,
|
||||
input wire [DRAM_TAG_WIDTH-1:0] dram_rsp_tag,
|
||||
output wire dram_rsp_ready
|
||||
// device flush
|
||||
input wire flush
|
||||
);
|
||||
|
||||
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value"))
|
||||
|
||||
|
||||
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_valid;
|
||||
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel;
|
||||
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
|
||||
|
@ -106,17 +104,17 @@ module VX_cache #(
|
|||
wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag;
|
||||
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_dram_req_valid;
|
||||
wire [NUM_BANKS-1:0] per_bank_dram_req_rw;
|
||||
wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_dram_req_byteen;
|
||||
wire [NUM_BANKS-1:0][`DRAM_ADDR_WIDTH-1:0] per_bank_dram_req_addr;
|
||||
wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_dram_req_data;
|
||||
wire [NUM_BANKS-1:0] per_bank_dram_req_ready;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
|
||||
wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_mem_req_byteen;
|
||||
wire [NUM_BANKS-1:0][`MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
|
||||
wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_mem_req_data;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_dram_rsp_ready;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
|
||||
|
||||
wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data_qual;
|
||||
wire [DRAM_TAG_WIDTH-1:0] dram_rsp_tag_qual;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_qual;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_qual;
|
||||
wire [`LINE_SELECT_BITS-1:0] flush_addr;
|
||||
wire flush_enable;
|
||||
|
||||
|
@ -129,35 +127,35 @@ module VX_cache #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire drsq_full, drsq_empty;
|
||||
wire drsq_push, drsq_pop;
|
||||
wire mrsq_full, mrsq_empty;
|
||||
wire mrsq_push, mrsq_pop;
|
||||
|
||||
assign drsq_push = dram_rsp_valid && dram_rsp_ready;
|
||||
assign dram_rsp_ready = !drsq_full;
|
||||
assign mrsq_push = mem_rsp_valid && mem_rsp_ready;
|
||||
assign mem_rsp_ready = !mrsq_full;
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (DRAM_TAG_WIDTH + `CACHE_LINE_WIDTH),
|
||||
.SIZE (DRSQ_SIZE),
|
||||
.DATAW (MEM_TAG_WIDTH + `CACHE_LINE_WIDTH),
|
||||
.SIZE (MRSQ_SIZE),
|
||||
.BUFFERED (1)
|
||||
) dram_rsp_queue (
|
||||
) mem_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (drsq_push),
|
||||
.pop (drsq_pop),
|
||||
.data_in ({dram_rsp_tag, dram_rsp_data}),
|
||||
.data_out ({dram_rsp_tag_qual, dram_rsp_data_qual}),
|
||||
.empty (drsq_empty),
|
||||
.full (drsq_full),
|
||||
.push (mrsq_push),
|
||||
.pop (mrsq_pop),
|
||||
.data_in ({mem_rsp_tag, mem_rsp_data}),
|
||||
.data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}),
|
||||
.empty (mrsq_empty),
|
||||
.full (mrsq_full),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
if (NUM_BANKS == 1) begin
|
||||
`UNUSED_VAR (dram_rsp_tag_qual)
|
||||
assign drsq_pop = !drsq_empty && per_bank_dram_rsp_ready;
|
||||
`UNUSED_VAR (mem_rsp_tag_qual)
|
||||
assign mrsq_pop = !mrsq_empty && per_bank_mem_rsp_ready;
|
||||
end else begin
|
||||
assign drsq_pop = !drsq_empty && per_bank_dram_rsp_ready[`DRAM_ADDR_BANK(dram_rsp_tag_qual)];
|
||||
assign mrsq_pop = !mrsq_empty && per_bank_mem_rsp_ready[`MEM_ADDR_BANK(mem_rsp_tag_qual)];
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -176,6 +174,7 @@ module VX_cache #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_cache_core_req_bank_sel #(
|
||||
.CACHE_ID (CACHE_ID),
|
||||
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_PORTS (NUM_PORTS),
|
||||
|
@ -227,17 +226,17 @@ module VX_cache #(
|
|||
wire [CORE_TAG_WIDTH-1:0] curr_bank_core_rsp_tag;
|
||||
wire curr_bank_core_rsp_ready;
|
||||
|
||||
wire curr_bank_dram_req_valid;
|
||||
wire curr_bank_dram_req_rw;
|
||||
wire [CACHE_LINE_SIZE-1:0] curr_bank_dram_req_byteen;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_req_addr;
|
||||
wire[`CACHE_LINE_WIDTH-1:0] curr_bank_dram_req_data;
|
||||
wire curr_bank_dram_req_ready;
|
||||
wire curr_bank_mem_req_valid;
|
||||
wire curr_bank_mem_req_rw;
|
||||
wire [CACHE_LINE_SIZE-1:0] curr_bank_mem_req_byteen;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
|
||||
wire[`CACHE_LINE_WIDTH-1:0] curr_bank_mem_req_data;
|
||||
wire curr_bank_mem_req_ready;
|
||||
|
||||
wire curr_bank_dram_rsp_valid;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_rsp_addr;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] curr_bank_dram_rsp_data;
|
||||
wire curr_bank_dram_rsp_ready;
|
||||
wire curr_bank_mem_rsp_valid;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_rsp_addr;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] curr_bank_mem_rsp_data;
|
||||
wire curr_bank_mem_rsp_ready;
|
||||
|
||||
// Core Req
|
||||
assign curr_bank_core_req_valid = per_bank_core_req_valid[i];
|
||||
|
@ -258,28 +257,28 @@ module VX_cache #(
|
|||
assign per_bank_core_rsp_tag [i] = curr_bank_core_rsp_tag;
|
||||
assign per_bank_core_rsp_data [i] = curr_bank_core_rsp_data;
|
||||
|
||||
// DRAM request
|
||||
assign per_bank_dram_req_valid[i] = curr_bank_dram_req_valid;
|
||||
assign per_bank_dram_req_rw[i] = curr_bank_dram_req_rw;
|
||||
assign per_bank_dram_req_byteen[i] = curr_bank_dram_req_byteen;
|
||||
// Memory request
|
||||
assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid;
|
||||
assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw;
|
||||
assign per_bank_mem_req_byteen[i] = curr_bank_mem_req_byteen;
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign per_bank_dram_req_addr[i] = curr_bank_dram_req_addr;
|
||||
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
|
||||
end else begin
|
||||
assign per_bank_dram_req_addr[i] = `LINE_TO_DRAM_ADDR(curr_bank_dram_req_addr, i);
|
||||
assign per_bank_mem_req_addr[i] = `LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
|
||||
end
|
||||
assign per_bank_dram_req_data[i] = curr_bank_dram_req_data;
|
||||
assign curr_bank_dram_req_ready = per_bank_dram_req_ready[i];
|
||||
assign per_bank_mem_req_data[i] = curr_bank_mem_req_data;
|
||||
assign curr_bank_mem_req_ready = per_bank_mem_req_ready[i];
|
||||
|
||||
// DRAM response
|
||||
// Memory response
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign curr_bank_dram_rsp_valid = !drsq_empty;
|
||||
assign curr_bank_dram_rsp_addr = dram_rsp_tag_qual;
|
||||
assign curr_bank_mem_rsp_valid = !mrsq_empty;
|
||||
assign curr_bank_mem_rsp_addr = mem_rsp_tag_qual;
|
||||
end else begin
|
||||
assign curr_bank_dram_rsp_valid = !drsq_empty && (`DRAM_ADDR_BANK(dram_rsp_tag_qual) == i);
|
||||
assign curr_bank_dram_rsp_addr = `DRAM_TO_LINE_ADDR(dram_rsp_tag_qual);
|
||||
assign curr_bank_mem_rsp_valid = !mrsq_empty && (`MEM_ADDR_BANK(mem_rsp_tag_qual) == i);
|
||||
assign curr_bank_mem_rsp_addr = `MEM_TO_LINE_ADDR(mem_rsp_tag_qual);
|
||||
end
|
||||
assign curr_bank_dram_rsp_data = dram_rsp_data_qual;
|
||||
assign per_bank_dram_rsp_ready[i] = curr_bank_dram_rsp_ready;
|
||||
assign curr_bank_mem_rsp_data = mem_rsp_data_qual;
|
||||
assign per_bank_mem_rsp_ready[i] = curr_bank_mem_rsp_ready;
|
||||
|
||||
VX_bank #(
|
||||
.BANK_ID (i),
|
||||
|
@ -292,12 +291,11 @@ module VX_cache #(
|
|||
.NUM_REQS (NUM_REQS),
|
||||
.CREQ_SIZE (CREQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.DREQ_SIZE (DREQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.CORE_TAG_WIDTH (CORE_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (CORE_TAG_ID_BITS),
|
||||
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET),
|
||||
.IN_ORDER_DRAM (IN_ORDER_DRAM)
|
||||
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
|
||||
) bank (
|
||||
`SCOPE_BIND_VX_cache_bank(i)
|
||||
|
||||
|
@ -330,19 +328,19 @@ module VX_cache #(
|
|||
.core_rsp_tag (curr_bank_core_rsp_tag),
|
||||
.core_rsp_ready (curr_bank_core_rsp_ready),
|
||||
|
||||
// DRAM request
|
||||
.dram_req_valid (curr_bank_dram_req_valid),
|
||||
.dram_req_rw (curr_bank_dram_req_rw),
|
||||
.dram_req_byteen (curr_bank_dram_req_byteen),
|
||||
.dram_req_addr (curr_bank_dram_req_addr),
|
||||
.dram_req_data (curr_bank_dram_req_data),
|
||||
.dram_req_ready (curr_bank_dram_req_ready),
|
||||
// Memory request
|
||||
.mem_req_valid (curr_bank_mem_req_valid),
|
||||
.mem_req_rw (curr_bank_mem_req_rw),
|
||||
.mem_req_byteen (curr_bank_mem_req_byteen),
|
||||
.mem_req_addr (curr_bank_mem_req_addr),
|
||||
.mem_req_data (curr_bank_mem_req_data),
|
||||
.mem_req_ready (curr_bank_mem_req_ready),
|
||||
|
||||
// DRAM response
|
||||
.dram_rsp_valid (curr_bank_dram_rsp_valid),
|
||||
.dram_rsp_addr (curr_bank_dram_rsp_addr),
|
||||
.dram_rsp_data (curr_bank_dram_rsp_data),
|
||||
.dram_rsp_ready (curr_bank_dram_rsp_ready),
|
||||
// Memory response
|
||||
.mem_rsp_valid (curr_bank_mem_rsp_valid),
|
||||
.mem_rsp_addr (curr_bank_mem_rsp_addr),
|
||||
.mem_rsp_data (curr_bank_mem_rsp_data),
|
||||
.mem_rsp_ready (curr_bank_mem_rsp_ready),
|
||||
|
||||
// flush
|
||||
.flush_enable (flush_enable),
|
||||
|
@ -351,6 +349,7 @@ module VX_cache #(
|
|||
end
|
||||
|
||||
VX_cache_core_rsp_merge #(
|
||||
.CACHE_ID (CACHE_ID),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_PORTS (NUM_PORTS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
|
@ -372,27 +371,27 @@ module VX_cache #(
|
|||
.core_rsp_ready (core_rsp_ready)
|
||||
);
|
||||
|
||||
wire [NUM_BANKS-1:0][(`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in;
|
||||
wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in;
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
assign data_in[i] = {per_bank_dram_req_addr[i], per_bank_dram_req_rw[i], per_bank_dram_req_byteen[i], per_bank_dram_req_data[i]};
|
||||
assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_data[i]};
|
||||
end
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_BANKS),
|
||||
.DATAW (`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH),
|
||||
.DATAW (`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH),
|
||||
.BUFFERED (1)
|
||||
) dram_req_arb (
|
||||
) mem_req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_bank_dram_req_valid),
|
||||
.valid_in (per_bank_mem_req_valid),
|
||||
.data_in (data_in),
|
||||
.ready_in (per_bank_dram_req_ready),
|
||||
.valid_out (dram_req_valid),
|
||||
.data_out ({dram_req_addr, dram_req_rw, dram_req_byteen, dram_req_data}),
|
||||
.ready_out (dram_req_ready)
|
||||
.ready_in (per_bank_mem_req_ready),
|
||||
.valid_out (mem_req_valid),
|
||||
.data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data}),
|
||||
.ready_out (mem_req_ready)
|
||||
);
|
||||
|
||||
assign dram_req_tag = dram_req_addr;
|
||||
assign mem_req_tag = mem_req_addr;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
// per cycle: core_reads, core_writes
|
||||
|
@ -420,13 +419,13 @@ module VX_cache #(
|
|||
assign perf_mshr_stall_per_cycle = $countones(perf_mshr_stall_per_bank);
|
||||
assign perf_pipe_stall_per_cycle = $countones(perf_pipe_stall_per_bank);
|
||||
|
||||
reg [43:0] perf_core_reads;
|
||||
reg [43:0] perf_core_writes;
|
||||
reg [43:0] perf_read_misses;
|
||||
reg [43:0] perf_write_misses;
|
||||
reg [43:0] perf_mshr_stalls;
|
||||
reg [43:0] perf_pipe_stalls;
|
||||
reg [43:0] perf_crsp_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_read_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -438,13 +437,13 @@ module VX_cache #(
|
|||
perf_pipe_stalls <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + 44'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses+ 44'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + 44'(perf_mshr_stall_per_cycle);
|
||||
perf_pipe_stalls <= perf_pipe_stalls + 44'(perf_pipe_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
|
||||
perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
11
hw/rtl/cache/VX_cache_core_req_bank_sel.v
vendored
11
hw/rtl/cache/VX_cache_core_req_bank_sel.v
vendored
|
@ -1,6 +1,8 @@
|
|||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_core_req_bank_sel #(
|
||||
parameter CACHE_ID = 0,
|
||||
|
||||
// Size of line inside a bank in bytes
|
||||
parameter CACHE_LINE_SIZE = 64,
|
||||
// Size of a word in bytes
|
||||
|
@ -22,7 +24,7 @@ module VX_cache_core_req_bank_sel #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire [43:0] bank_stalls,
|
||||
output wire [`PERF_CTR_BITS-1:0] bank_stalls,
|
||||
`endif
|
||||
|
||||
input wire [NUM_REQS-1:0] core_req_valid,
|
||||
|
@ -43,6 +45,7 @@ module VX_cache_core_req_bank_sel #(
|
|||
output wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag,
|
||||
input wire [`BANK_READY_COUNT-1:0] per_bank_core_req_ready
|
||||
);
|
||||
`UNUSED_PARAM (CACHE_ID)
|
||||
`STATIC_ASSERT (NUM_REQS >= NUM_BANKS, ("invalid number of banks"));
|
||||
|
||||
`UNUSED_VAR (clk)
|
||||
|
@ -148,7 +151,7 @@ module VX_cache_core_req_bank_sel #(
|
|||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
||||
end else begin
|
||||
|
||||
always @(*) begin
|
||||
|
@ -303,13 +306,13 @@ module VX_cache_core_req_bank_sel #(
|
|||
end
|
||||
end
|
||||
|
||||
reg [43:0] bank_stalls_r;
|
||||
reg [`PERF_CTR_BITS-1:0] bank_stalls_r;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
bank_stalls_r <= 0;
|
||||
end else begin
|
||||
bank_stalls_r <= bank_stalls_r + 44'($countones(core_req_sel_r & ~core_req_ready));
|
||||
bank_stalls_r <= bank_stalls_r + `PERF_CTR_BITS'($countones(core_req_sel_r & ~core_req_ready));
|
||||
end
|
||||
end
|
||||
|
||||
|
|
8
hw/rtl/cache/VX_cache_core_rsp_merge.v
vendored
8
hw/rtl/cache/VX_cache_core_rsp_merge.v
vendored
|
@ -1,6 +1,8 @@
|
|||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_core_rsp_merge #(
|
||||
parameter CACHE_ID = 0,
|
||||
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 1,
|
||||
// Number of banks
|
||||
|
@ -31,6 +33,8 @@ module VX_cache_core_rsp_merge #(
|
|||
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
|
||||
input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready
|
||||
);
|
||||
`UNUSED_PARAM (CACHE_ID)
|
||||
|
||||
if (NUM_BANKS > 1) begin
|
||||
|
||||
reg [NUM_REQS-1:0] core_rsp_valid_unqual;
|
||||
|
@ -39,6 +43,10 @@ module VX_cache_core_rsp_merge #(
|
|||
|
||||
if (CORE_TAG_ID_BITS != 0) begin
|
||||
|
||||
// The core response bus handles a single tag at the time
|
||||
// We first need to select the current tag to process,
|
||||
// then send all bank responses for that tag as a batch
|
||||
|
||||
reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
|
||||
wire core_rsp_ready_unqual;
|
||||
|
||||
|
|
10
hw/rtl/cache/VX_cache_define.vh
vendored
10
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -21,8 +21,8 @@
|
|||
`define WORDS_PER_LINE (CACHE_LINE_SIZE / WORD_SIZE)
|
||||
|
||||
`define WORD_ADDR_WIDTH (32-`CLOG2(WORD_SIZE))
|
||||
`define DRAM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE))
|
||||
`define LINE_ADDR_WIDTH (`DRAM_ADDR_WIDTH-`BANK_SELECT_BITS)
|
||||
`define MEM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE))
|
||||
`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`BANK_SELECT_BITS)
|
||||
|
||||
// Word select
|
||||
`define WORD_SELECT_BITS `CLOG2(`WORDS_PER_LINE)
|
||||
|
@ -59,11 +59,11 @@
|
|||
|
||||
`define BANK_READY_COUNT ((SHARED_BANK_READY != 0) ? 1 : NUM_BANKS)
|
||||
|
||||
`define DRAM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET]
|
||||
`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET]
|
||||
|
||||
`define DRAM_TO_LINE_ADDR(x) x[`DRAM_ADDR_WIDTH-1 : `BANK_SELECT_BITS]
|
||||
`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS]
|
||||
|
||||
`define LINE_TO_DRAM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
|
||||
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
|
||||
|
||||
`define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}
|
||||
|
||||
|
|
202
hw/rtl/cache/VX_shared_mem.v
vendored
202
hw/rtl/cache/VX_shared_mem.v
vendored
|
@ -4,25 +4,25 @@ module VX_shared_mem #(
|
|||
parameter CACHE_ID = 0,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 16384,
|
||||
parameter CACHE_SIZE = (1024*16),
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
parameter NUM_BANKS = 2,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 4,
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = NUM_BANKS,
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Core Request Queue Size
|
||||
parameter CREQ_SIZE = 4,
|
||||
|
||||
// core request tag size
|
||||
parameter CORE_TAG_WIDTH = 1,
|
||||
parameter CREQ_SIZE = 8,
|
||||
|
||||
// size of tag id in core request tag
|
||||
parameter CORE_TAG_ID_BITS = 0,
|
||||
parameter CORE_TAG_ID_BITS = 8,
|
||||
|
||||
// core request tag size
|
||||
parameter CORE_TAG_WIDTH = (2 + CORE_TAG_ID_BITS),
|
||||
|
||||
// bank offset from beginning of index range
|
||||
parameter BANK_ADDR_OFFSET = 0
|
||||
parameter BANK_ADDR_OFFSET = `CLOG2(256)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -54,13 +54,6 @@ module VX_shared_mem #(
|
|||
|
||||
localparam CACHE_LINE_SIZE = WORD_SIZE;
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
/* verilator lint_off UNUSED */
|
||||
wire [31:0] debug_pc_st0;
|
||||
wire [`NW_BITS-1:0] debug_wid_st0;
|
||||
/* verilator lint_on UNUSED */
|
||||
`endif
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_valid_unqual;
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_rw_unqual;
|
||||
wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr_unqual;
|
||||
|
@ -71,6 +64,7 @@ module VX_shared_mem #(
|
|||
wire per_bank_core_req_ready_unqual;
|
||||
|
||||
VX_cache_core_req_bank_sel #(
|
||||
.CACHE_ID (CACHE_ID),
|
||||
.CACHE_LINE_SIZE (WORD_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_PORTS (1),
|
||||
|
@ -108,20 +102,26 @@ module VX_shared_mem #(
|
|||
wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr;
|
||||
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
|
||||
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag;
|
||||
wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag;
|
||||
wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid;
|
||||
|
||||
wire creq_push, creq_pop, creq_empty, creq_full;
|
||||
wire crsq_in_ready;
|
||||
wire crsq_in_fire_last;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_rsp_valid = per_bank_core_req_valid & ~per_bank_core_req_rw;
|
||||
|
||||
wire core_req_has_read = (| per_bank_rsp_valid);
|
||||
|
||||
assign creq_push = (| core_req_valid) && !creq_full;
|
||||
assign creq_pop = ~creq_empty && crsq_in_ready;
|
||||
assign creq_push = (| core_req_valid) && ~creq_full;
|
||||
|
||||
assign creq_pop = (~creq_empty && ~core_req_has_read)
|
||||
|| crsq_in_fire_last;
|
||||
|
||||
assign per_bank_core_req_ready_unqual = ~creq_full;
|
||||
|
||||
wire [NUM_REQS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual;
|
||||
wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual;
|
||||
`UNUSED_VAR (per_bank_core_req_addr_unqual)
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
assign per_bank_core_req_addr_qual[i] = per_bank_core_req_addr_unqual[i][`LINE_SELECT_BITS-1:0];
|
||||
end
|
||||
|
||||
|
@ -155,9 +155,14 @@ module VX_shared_mem #(
|
|||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data;
|
||||
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; i++) begin
|
||||
|
||||
wire wren = per_bank_core_req_rw[i]
|
||||
&& per_bank_core_req_valid[i]
|
||||
&& creq_pop;
|
||||
|
||||
VX_sp_ram #(
|
||||
.DATAW (`WORD_WIDTH),
|
||||
.SIZE (`LINES_PER_BANK),
|
||||
|
@ -166,13 +171,41 @@ module VX_shared_mem #(
|
|||
) data (
|
||||
.clk (clk),
|
||||
.addr (per_bank_core_req_addr[i]),
|
||||
.wren (per_bank_core_req_valid[i] && per_bank_core_req_rw[i]),
|
||||
.wren (wren),
|
||||
.byteen (per_bank_core_req_byteen[i]),
|
||||
.rden (1'b1),
|
||||
.din (per_bank_core_req_data[i]),
|
||||
.dout (per_bank_core_rsp_data[i])
|
||||
);
|
||||
end
|
||||
|
||||
// The core response bus handles a single tag at the time
|
||||
// We first need to select the current tag to process,
|
||||
// then send all bank responses for that tag as a batch
|
||||
|
||||
wire crsq_in_valid, crsq_in_ready;
|
||||
|
||||
reg [NUM_BANKS-1:0] bank_rsp_sel, bank_rsp_sel_r;
|
||||
|
||||
wire [NUM_BANKS-1:0] bank_rsp_sel_n = bank_rsp_sel | bank_rsp_sel_r;
|
||||
|
||||
wire crsq_in_fire = crsq_in_valid && crsq_in_ready;
|
||||
|
||||
assign crsq_in_fire_last = crsq_in_fire && (bank_rsp_sel_n == per_bank_rsp_valid);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
bank_rsp_sel <= 0;
|
||||
end else begin
|
||||
if (crsq_in_fire) begin
|
||||
if (bank_rsp_sel_n == per_bank_rsp_valid) begin
|
||||
bank_rsp_sel <= 0;
|
||||
end else begin
|
||||
bank_rsp_sel <= bank_rsp_sel_n;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
reg [NUM_REQS-1:0] core_rsp_valids_in;
|
||||
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in;
|
||||
|
@ -180,31 +213,30 @@ module VX_shared_mem #(
|
|||
|
||||
always @(*) begin
|
||||
core_rsp_valids_in = 0;
|
||||
core_rsp_data_in = 'x;
|
||||
core_rsp_data_in = 'x;
|
||||
core_rsp_tag_in = 'x;
|
||||
for (integer i = 0; i < NUM_BANKS; i++) begin
|
||||
if (per_bank_core_req_valid[i]) begin
|
||||
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
|
||||
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
|
||||
bank_rsp_sel_r = 0;
|
||||
|
||||
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
|
||||
if (per_bank_rsp_valid[i] && ~bank_rsp_sel[i]) begin
|
||||
core_rsp_tag_in = per_bank_core_req_tag[i];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_pc_st0, debug_wid_st0} = core_rsp_tag_in[`CACHE_REQ_INFO_RNG];
|
||||
end else begin
|
||||
assign {debug_pc_st0, debug_wid_st0} = 0;
|
||||
|
||||
for (integer i = 0; i < NUM_BANKS; i++) begin
|
||||
if (per_bank_core_req_valid[i]
|
||||
&& (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
|
||||
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
|
||||
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
|
||||
bank_rsp_sel_r[i] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_valids_out;
|
||||
wire core_rsp_valid_out;
|
||||
|
||||
wire core_rsp_rw = | (per_bank_core_req_valid & per_bank_core_req_rw);
|
||||
|
||||
wire crsq_in_valid = ~creq_empty && ~core_rsp_rw;
|
||||
assign crsq_in_valid = ~creq_empty && core_req_has_read;
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (NUM_BANKS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH)
|
||||
|
@ -221,16 +253,82 @@ module VX_shared_mem #(
|
|||
|
||||
assign core_rsp_valid = core_rsp_valids_out & {NUM_REQS{core_rsp_valid_out}};
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1;
|
||||
wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_pc_st0[i], debug_wid_st0[i]} = per_bank_core_req_tag_unqual[i][CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS];
|
||||
assign {debug_pc_st1[i], debug_wid_st1[i]} = per_bank_core_req_tag[i][CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS];
|
||||
end else begin
|
||||
assign {debug_pc_st0[i], debug_wid_st0[i]} = 0;
|
||||
assign {debug_pc_st1[i], debug_wid_st1[i]} = 0;
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_BANK
|
||||
|
||||
reg is_multi_tag_req;
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
reg [CORE_TAG_WIDTH-1:0] core_req_tag_sel;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
always @(*) begin
|
||||
core_req_tag_sel ='x;
|
||||
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
|
||||
if (per_bank_core_req_valid[i]) begin
|
||||
core_req_tag_sel = per_bank_core_req_tag[i];
|
||||
end
|
||||
end
|
||||
is_multi_tag_req = 0;
|
||||
for (integer i = 0; i < NUM_BANKS; ++i) begin
|
||||
if (per_bank_core_req_valid[i]
|
||||
&& (core_req_tag_sel[CORE_TAG_ID_BITS-1:0] != per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
|
||||
is_multi_tag_req = !creq_empty;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (!crsq_in_ready) begin
|
||||
$display("%t: cache%0d pipeline-stall", $time, CACHE_ID);
|
||||
$display("%t: *** cache%0d pipeline-stall", $time, CACHE_ID);
|
||||
end
|
||||
if (is_multi_tag_req) begin
|
||||
$display("%t: *** cache%0d multi-tag request!", $time, CACHE_ID);
|
||||
end
|
||||
if (creq_push) begin
|
||||
for (integer i = 0; i < NUM_BANKS; ++i) begin
|
||||
if (per_bank_core_req_valid_unqual[i]) begin
|
||||
if (per_bank_core_req_rw_unqual[i]) begin
|
||||
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
|
||||
$time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i],
|
||||
debug_wid_st0[i], debug_pc_st0[i]);
|
||||
end else begin
|
||||
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h",
|
||||
$time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i],
|
||||
debug_wid_st0[i], debug_pc_st0[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
if (creq_pop) begin
|
||||
if (core_rsp_rw)
|
||||
$display("%t: cache%0d core-wr-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_req_data, debug_wid_st0, debug_pc_st0);
|
||||
else
|
||||
$display("%t: cache%0d core-rd-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_rsp_data, debug_wid_st0, debug_pc_st0);
|
||||
for (integer i = 0; i < NUM_BANKS; ++i) begin
|
||||
if (per_bank_core_req_valid[i]) begin
|
||||
if (per_bank_core_req_rw[i]) begin
|
||||
$display("%t: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
|
||||
$time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i],
|
||||
debug_wid_st1[i], debug_pc_st1[i]);
|
||||
end else begin
|
||||
$display("%t: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h",
|
||||
$time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i],
|
||||
debug_wid_st1[i], debug_pc_st1[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
@ -249,9 +347,9 @@ module VX_shared_mem #(
|
|||
assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready);
|
||||
end
|
||||
|
||||
reg [43:0] perf_core_reads;
|
||||
reg [43:0] perf_core_writes;
|
||||
reg [43:0] perf_crsp_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -259,9 +357,9 @@ module VX_shared_mem #(
|
|||
perf_core_writes <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -1,22 +0,0 @@
|
|||
`ifndef VX_CACHE_DRAM_REQ_IF
|
||||
`define VX_CACHE_DRAM_REQ_IF
|
||||
|
||||
`include "../cache/VX_cache_define.vh"
|
||||
|
||||
interface VX_cache_dram_req_if #(
|
||||
parameter DRAM_LINE_WIDTH = 1,
|
||||
parameter DRAM_ADDR_WIDTH = 1,
|
||||
parameter DRAM_TAG_WIDTH = 1
|
||||
) ();
|
||||
|
||||
wire valid;
|
||||
wire rw;
|
||||
wire [(DRAM_LINE_WIDTH/8)-1:0] byteen;
|
||||
wire [DRAM_ADDR_WIDTH-1:0] addr;
|
||||
wire [DRAM_LINE_WIDTH-1:0] data;
|
||||
wire [DRAM_TAG_WIDTH-1:0] tag;
|
||||
wire ready;
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
|
@ -1,18 +0,0 @@
|
|||
`ifndef VX_CACHE_DRAM_RSP_IF
|
||||
`define VX_CACHE_DRAM_RSP_IF
|
||||
|
||||
`include "../cache/VX_cache_define.vh"
|
||||
|
||||
interface VX_cache_dram_rsp_if #(
|
||||
parameter DRAM_LINE_WIDTH = 1,
|
||||
parameter DRAM_TAG_WIDTH = 1
|
||||
) ();
|
||||
|
||||
wire valid;
|
||||
wire [DRAM_LINE_WIDTH-1:0] data;
|
||||
wire [DRAM_TAG_WIDTH-1:0] tag;
|
||||
wire ready;
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
23
hw/rtl/interfaces/VX_cache_mem_req_if.v
Normal file
23
hw/rtl/interfaces/VX_cache_mem_req_if.v
Normal file
|
@ -0,0 +1,23 @@
|
|||
`ifndef VX_CACHE_MEM_REQ_IF
|
||||
`define VX_CACHE_MEM_REQ_IF
|
||||
|
||||
`include "../cache/VX_cache_config.vh"
|
||||
|
||||
interface VX_cache_mem_req_if #(
|
||||
parameter MEM_LINE_WIDTH = 1,
|
||||
parameter MEM_ADDR_WIDTH = 1,
|
||||
parameter MEM_TAG_WIDTH = 1,
|
||||
parameter MEM_LINE_SIZE = MEM_LINE_WIDTH / 8
|
||||
) ();
|
||||
|
||||
wire valid;
|
||||
wire rw;
|
||||
wire [MEM_LINE_SIZE-1:0] byteen;
|
||||
wire [MEM_ADDR_WIDTH-1:0] addr;
|
||||
wire [MEM_LINE_WIDTH-1:0] data;
|
||||
wire [MEM_TAG_WIDTH-1:0] tag;
|
||||
wire ready;
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
18
hw/rtl/interfaces/VX_cache_mem_rsp_if.v
Normal file
18
hw/rtl/interfaces/VX_cache_mem_rsp_if.v
Normal file
|
@ -0,0 +1,18 @@
|
|||
`ifndef VX_CACHE_MEM_RSP_IF
|
||||
`define VX_CACHE_MEM_RSP_IF
|
||||
|
||||
`include "../cache/VX_cache_config.vh"
|
||||
|
||||
interface VX_cache_mem_rsp_if #(
|
||||
parameter MEM_LINE_WIDTH = 1,
|
||||
parameter MEM_TAG_WIDTH = 1
|
||||
) ();
|
||||
|
||||
wire valid;
|
||||
wire [MEM_LINE_WIDTH-1:0] data;
|
||||
wire [MEM_TAG_WIDTH-1:0] tag;
|
||||
wire ready;
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
|
@ -5,14 +5,14 @@
|
|||
|
||||
interface VX_perf_cache_if ();
|
||||
|
||||
wire [43:0] reads;
|
||||
wire [43:0] writes;
|
||||
wire [43:0] read_misses;
|
||||
wire [43:0] write_misses;
|
||||
wire [43:0] bank_stalls;
|
||||
wire [43:0] mshr_stalls;
|
||||
wire [43:0] pipe_stalls;
|
||||
wire [43:0] crsp_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] reads;
|
||||
wire [`PERF_CTR_BITS-1:0] writes;
|
||||
wire [`PERF_CTR_BITS-1:0] read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mshr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
|
||||
endinterface
|
||||
|
||||
|
|
|
@ -5,28 +5,28 @@
|
|||
|
||||
interface VX_perf_memsys_if ();
|
||||
|
||||
wire [43:0] icache_reads;
|
||||
wire [43:0] icache_read_misses;
|
||||
wire [43:0] icache_pipe_stalls;
|
||||
wire [43:0] icache_crsp_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls;
|
||||
|
||||
wire [43:0] dcache_reads;
|
||||
wire [43:0] dcache_writes;
|
||||
wire [43:0] dcache_read_misses;
|
||||
wire [43:0] dcache_write_misses;
|
||||
wire [43:0] dcache_bank_stalls;
|
||||
wire [43:0] dcache_mshr_stalls;
|
||||
wire [43:0] dcache_pipe_stalls;
|
||||
wire [43:0] dcache_crsp_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls;
|
||||
|
||||
wire [43:0] smem_reads;
|
||||
wire [43:0] smem_writes;
|
||||
wire [43:0] smem_bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_bank_stalls;
|
||||
|
||||
wire [43:0] dram_reads;
|
||||
wire [43:0] dram_writes;
|
||||
wire [43:0] dram_stalls;
|
||||
wire [43:0] dram_latency;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_latency;
|
||||
|
||||
endinterface
|
||||
|
||||
|
|
|
@ -4,14 +4,14 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
interface VX_perf_pipeline_if ();
|
||||
wire [43:0] ibf_stalls;
|
||||
wire [43:0] scb_stalls;
|
||||
wire [43:0] lsu_stalls;
|
||||
wire [43:0] csr_stalls;
|
||||
wire [43:0] alu_stalls;
|
||||
wire [43:0] gpu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] csr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] alu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [43:0] fpu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] fpu_stalls;
|
||||
`endif
|
||||
endinterface
|
||||
|
||||
|
|
|
@ -94,13 +94,13 @@ module VX_scope #(
|
|||
delay_val <= $bits(delay_val)'(cmd_data);
|
||||
cmd_start <= 1;
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
$display("*** scope:CMD_SET_START: delay_val=%0d", $bits(delay_val)'(cmd_data));
|
||||
$display("%t: *** scope: CMD_SET_START: delay_val=%0d", $time, $bits(delay_val)'(cmd_data));
|
||||
`endif
|
||||
end
|
||||
CMD_SET_STOP: begin
|
||||
waddr_end <= $bits(waddr)'(cmd_data);
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
$display("*** scope:CMD_SET_STOP: waddr_end=%0d", $bits(waddr)'(cmd_data));
|
||||
$display("%t: *** scope: CMD_SET_STOP: waddr_end=%0d", $time, $bits(waddr)'(cmd_data));
|
||||
`endif
|
||||
end
|
||||
default:;
|
||||
|
@ -117,7 +117,7 @@ module VX_scope #(
|
|||
delay_cntr <= 0;
|
||||
start_time <= timestamp;
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
$display("*** scope: recording start - start_time=%0d", timestamp);
|
||||
$display("%t: *** scope: recording start - start_time=%0d", $time, timestamp);
|
||||
`endif
|
||||
end else begin
|
||||
start_wait <= 1;
|
||||
|
@ -133,7 +133,7 @@ module VX_scope #(
|
|||
delta <= 0;
|
||||
start_time <= timestamp;
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
$display("*** scope: recording start - start_time=%0d", timestamp);
|
||||
$display("%t: *** scope: recording start - start_time=%0d", $time, timestamp);
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
@ -162,7 +162,7 @@ module VX_scope #(
|
|||
if (stop
|
||||
|| (waddr >= waddr_end)) begin
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
$display("*** scope: recording stop - waddr=(%0d, %0d)", waddr, waddr_end);
|
||||
$display("%t: *** scope: recording stop - waddr=(%0d, %0d)", $time, waddr, waddr_end);
|
||||
`endif
|
||||
waddr <= waddr; // keep last address
|
||||
recording <= 0;
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
# coding=utf-8
|
||||
from __future__ import print_function
|
||||
|
||||
import sys
|
||||
import os
|
||||
import os.path as path
|
||||
import re
|
||||
|
@ -10,55 +11,19 @@ from datetime import datetime
|
|||
|
||||
script_dir = path.dirname(path.realpath(__file__))
|
||||
|
||||
defines = {}
|
||||
for k, v in os.environ.items():
|
||||
if k.upper().startswith('V_'):
|
||||
defines[k[2:]] = v
|
||||
|
||||
print('Custom params:', ', '.join(['='.join(x) for x in defines.items()]))
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--outc', default='none', help='Output C header')
|
||||
parser.add_argument('--outv', default='none', help='Output Verilog header')
|
||||
parser.add_argument('-i', "--input", default='none', help='Verilog header')
|
||||
parser.add_argument('-o', "--output", default='none', help='C header')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.outc == 'none' and args.outv == 'none':
|
||||
print('Warning: not emitting any files. Specify arguments')
|
||||
|
||||
if args.outv != 'none':
|
||||
with open(args.outv, 'w') as f:
|
||||
print('''
|
||||
// auto-generated by gen_config.py. DO NOT EDIT
|
||||
// Generated at {date}
|
||||
|
||||
`ifndef VX_USER_CONFIG
|
||||
`define VX_USER_CONFIG
|
||||
'''[1:].format(date=datetime.now()), file=f)
|
||||
|
||||
for k, v in defines.items():
|
||||
print('`define {} {}'.format(k, v), file=f)
|
||||
|
||||
print('\n`endif', file=f)
|
||||
|
||||
if args.outc != 'none':
|
||||
with open(args.outc, 'w') as f:
|
||||
print('''
|
||||
// auto-generated by gen_config.py. DO NOT EDIT
|
||||
// Generated at {date}
|
||||
|
||||
#ifndef VX_USER_CONFIG
|
||||
#define VX_USER_CONFIG
|
||||
'''[1:].format(date=datetime.now()), file=f)
|
||||
|
||||
for k, v in defines.items():
|
||||
print('#define {} {}'.format(k, v), file=f)
|
||||
|
||||
print('\n#endif', file=f)
|
||||
if args.input == 'none' or args.output == 'none':
|
||||
print('Error: invalid arguments')
|
||||
sys.exit()
|
||||
|
||||
translation_rules = [
|
||||
# preprocessor directives
|
||||
(re.compile(r'^\s*`include .*$'), r''),
|
||||
(re.compile(r'`include\s+.*$'), r''),
|
||||
(re.compile(r'`ifdef'), r'#ifdef'),
|
||||
(re.compile(r'`ifndef'), r'#ifndef'),
|
||||
(re.compile(r'`elif'), r'#elif'),
|
||||
|
@ -75,25 +40,24 @@ translation_rules = [
|
|||
(re.compile(r"\d+'h([\da-fA-F]+)"), r'0x\1')
|
||||
]
|
||||
|
||||
if args.outc != 'none':
|
||||
with open(args.outc, 'a') as f:
|
||||
print('''
|
||||
with open(args.output, 'w') as f:
|
||||
print('''
|
||||
// auto-generated by gen_config.py. DO NOT EDIT
|
||||
// Generated at {date}
|
||||
|
||||
// Translated from VX_config.vh:
|
||||
'''[1:].format(date=datetime.now()), file=f)
|
||||
with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r:
|
||||
lineno = 0
|
||||
for line in r:
|
||||
for pat, repl in translation_rules:
|
||||
match = pat.search(line)
|
||||
if match:
|
||||
line = re.sub(pat, repl, line)
|
||||
#print("*** match @" + str(lineno) + ": " + match.group() + " => " + line)
|
||||
f.write(line)
|
||||
lineno = lineno + 1
|
||||
print('''
|
||||
with open(args.input, 'r') as r:
|
||||
lineno = 0
|
||||
for line in r:
|
||||
for pat, repl in translation_rules:
|
||||
match = pat.search(line)
|
||||
if match:
|
||||
line = re.sub(pat, repl, line)
|
||||
#print("*** match @" + str(lineno) + ": " + match.group() + " => " + line)
|
||||
f.write(line)
|
||||
lineno = lineno + 1
|
||||
print('''
|
||||
'''[1:], file=f)
|
||||
|
||||
|
||||
|
|
|
@ -97,9 +97,8 @@
|
|||
"avs_byteenable":64,
|
||||
"avs_burstcount":4,
|
||||
"avs_readdatavalid":1,
|
||||
"mem_bank_select":1,
|
||||
"cci_dram_rd_req_ctr":26,
|
||||
"cci_dram_wr_req_ctr":26,
|
||||
"cci_mem_rd_req_ctr":26,
|
||||
"cci_mem_wr_req_ctr":26,
|
||||
"cci_rd_req_ctr":26,
|
||||
"cci_rd_rsp_ctr":3,
|
||||
"cci_wr_req_ctr":26,
|
||||
|
@ -110,23 +109,23 @@
|
|||
"!cci_pending_reads_full":1,
|
||||
"!cci_pending_writes_empty":1,
|
||||
"!cci_pending_writes_full": 1,
|
||||
"?afu_dram_req_fire": 1,
|
||||
"afu_dram_req_addr": 26,
|
||||
"afu_dram_req_tag": 28,
|
||||
"?afu_dram_rsp_fire": 1,
|
||||
"afu_dram_rsp_tag": 28
|
||||
"?afu_mem_req_fire": 1,
|
||||
"afu_mem_req_addr": 26,
|
||||
"afu_mem_req_tag": 28,
|
||||
"?afu_mem_rsp_fire": 1,
|
||||
"afu_mem_rsp_tag": 28
|
||||
},
|
||||
"afu/vortex": {
|
||||
"!reset": 1,
|
||||
"?dram_req_fire": 1,
|
||||
"dram_req_addr": 32,
|
||||
"dram_req_rw": 1,
|
||||
"dram_req_byteen":"`VX_DRAM_BYTEEN_WIDTH",
|
||||
"dram_req_data":"`VX_DRAM_LINE_WIDTH",
|
||||
"dram_req_tag":"`VX_DRAM_TAG_WIDTH",
|
||||
"?dram_rsp_fire": 1,
|
||||
"dram_rsp_data":"`VX_DRAM_LINE_WIDTH",
|
||||
"dram_rsp_tag":"`VX_DRAM_TAG_WIDTH",
|
||||
"?mem_req_fire": 1,
|
||||
"mem_req_addr": 32,
|
||||
"mem_req_rw": 1,
|
||||
"mem_req_byteen":"`VX_MEM_BYTEEN_WIDTH",
|
||||
"mem_req_data":"`VX_MEM_LINE_WIDTH",
|
||||
"mem_req_tag":"`VX_MEM_TAG_WIDTH",
|
||||
"?mem_rsp_fire": 1,
|
||||
"mem_rsp_data":"`VX_MEM_LINE_WIDTH",
|
||||
"mem_rsp_tag":"`VX_MEM_TAG_WIDTH",
|
||||
"busy": 1
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/fetch/icache_stage": {
|
||||
|
@ -207,7 +206,7 @@
|
|||
"force_miss_st0": 1,
|
||||
"mshr_push": 1,
|
||||
"?crsq_in_stall": 1,
|
||||
"?dreq_alm_full": 1,
|
||||
"?mreq_alm_full": 1,
|
||||
"?mshr_alm_full": 1
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
|
||||
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CFLAGS += -Wno-aligned-new -Wno-maybe-uninitialized
|
||||
CFLAGS += -Wno-maybe-uninitialized
|
||||
|
||||
CFLAGS += -I../..
|
||||
|
||||
|
@ -13,7 +13,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
|||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
|
@ -22,11 +22,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
|
|||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
||||
SINGLECORE += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0
|
||||
SINGLECORE = -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0
|
||||
|
||||
#MULTICORE ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
|
||||
#MULTICORE = -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
#MULTICORE = -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
|
||||
MULTICORE = -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
|
||||
|
||||
SINGLECORE += $(CONFIGS)
|
||||
MULTICORE += $(CONFIGS)
|
||||
|
@ -44,15 +44,16 @@ SRCS = simulator.cpp testbench.cpp
|
|||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
VL_FLAGS += --x-initial unique --x-assign unique
|
||||
VL_FLAGS += verilator.vlt
|
||||
|
||||
VL_FLAGS += --exe $(SRCS) $(RTL_INCLUDE)
|
||||
VL_FLAGS += --cc Vortex.v --top-module $(TOP)
|
||||
|
||||
# Use FPNEW PFU core
|
||||
VL_FLAGS += -DFPU_FPNEW
|
||||
# FPU backend
|
||||
FPU_CORE ?= FPU_FPNEW
|
||||
VL_FLAGS += -D$(FPU_CORE)
|
||||
|
||||
DBG_FLAGS += -DVCD_OUTPUT
|
||||
|
||||
|
|
|
@ -5,10 +5,23 @@
|
|||
|
||||
#define RESET_DELAY 4
|
||||
|
||||
#define ENABLE_DRAM_STALLS
|
||||
#define DRAM_LATENCY 24
|
||||
#define DRAM_RQ_SIZE 16
|
||||
#define DRAM_STALLS_MODULO 16
|
||||
#define ENABLE_MEM_STALLS
|
||||
|
||||
#ifndef MEM_LATENCY
|
||||
#define MEM_LATENCY 24
|
||||
#endif
|
||||
|
||||
#ifndef MEM_RQ_SIZE
|
||||
#define MEM_RQ_SIZE 16
|
||||
#endif
|
||||
|
||||
#ifndef MEM_STALLS_MODULO
|
||||
#define MEM_STALLS_MODULO 16
|
||||
#endif
|
||||
|
||||
#ifndef VERILATOR_RESET_VALUE
|
||||
#define VERILATOR_RESET_VALUE 2
|
||||
#endif
|
||||
|
||||
#define VL_WDATA_GETW(lwp, i, n, w) \
|
||||
VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)
|
||||
|
@ -21,7 +34,7 @@ double sc_time_stamp() {
|
|||
|
||||
Simulator::Simulator() {
|
||||
// force random values for uninitialized signals
|
||||
Verilated::randReset(2);
|
||||
Verilated::randReset(VERILATOR_RESET_VALUE);
|
||||
Verilated::randSeed(50);
|
||||
|
||||
// Turn off assertion before reset
|
||||
|
@ -56,19 +69,19 @@ Simulator::~Simulator() {
|
|||
|
||||
void Simulator::attach_ram(RAM* ram) {
|
||||
ram_ = ram;
|
||||
dram_rsp_vec_.clear();
|
||||
mem_rsp_vec_.clear();
|
||||
}
|
||||
|
||||
void Simulator::reset() {
|
||||
print_bufs_.clear();
|
||||
dram_rsp_vec_.clear();
|
||||
mem_rsp_vec_.clear();
|
||||
|
||||
dram_rsp_active_ = false;
|
||||
mem_rsp_active_ = false;
|
||||
csr_req_active_ = false;
|
||||
csr_rsp_value_ = nullptr;
|
||||
|
||||
vortex_->dram_rsp_valid = 0;
|
||||
vortex_->dram_req_ready = 0;
|
||||
vortex_->mem_rsp_valid = 0;
|
||||
vortex_->mem_req_ready = 0;
|
||||
//vortex_->io_req_ready = 0;
|
||||
//vortex_->io_rsp_valid = 0;
|
||||
vortex_->csr_req_valid = 0;
|
||||
|
@ -94,13 +107,13 @@ void Simulator::step() {
|
|||
vortex_->clk = 0;
|
||||
this->eval();
|
||||
|
||||
dram_rsp_ready_ = vortex_->dram_rsp_ready;
|
||||
mem_rsp_ready_ = vortex_->mem_rsp_ready;
|
||||
csr_req_ready_ = vortex_->csr_req_ready;
|
||||
|
||||
vortex_->clk = 1;
|
||||
this->eval();
|
||||
|
||||
this->eval_dram_bus();
|
||||
this->eval_mem_bus();
|
||||
this->eval_io_bus();
|
||||
this->eval_csr_bus();
|
||||
|
||||
|
@ -117,83 +130,83 @@ void Simulator::eval() {
|
|||
++timestamp;
|
||||
}
|
||||
|
||||
void Simulator::eval_dram_bus() {
|
||||
void Simulator::eval_mem_bus() {
|
||||
if (ram_ == nullptr) {
|
||||
vortex_->dram_req_ready = 0;
|
||||
vortex_->mem_req_ready = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
// update DRAM responses schedule
|
||||
for (auto& rsp : dram_rsp_vec_) {
|
||||
// update memory responses schedule
|
||||
for (auto& rsp : mem_rsp_vec_) {
|
||||
if (rsp.cycles_left > 0)
|
||||
rsp.cycles_left -= 1;
|
||||
}
|
||||
|
||||
// schedule DRAM responses in FIFO order
|
||||
std::list<dram_req_t>::iterator dram_rsp_it(dram_rsp_vec_.end());
|
||||
if (!dram_rsp_vec_.empty()
|
||||
&& (0 == dram_rsp_vec_.begin()->cycles_left)) {
|
||||
dram_rsp_it = dram_rsp_vec_.begin();
|
||||
// schedule memory responses in FIFO order
|
||||
std::list<mem_req_t>::iterator mem_rsp_it(mem_rsp_vec_.end());
|
||||
if (!mem_rsp_vec_.empty()
|
||||
&& (0 == mem_rsp_vec_.begin()->cycles_left)) {
|
||||
mem_rsp_it = mem_rsp_vec_.begin();
|
||||
}
|
||||
|
||||
// send DRAM response
|
||||
if (dram_rsp_active_
|
||||
&& vortex_->dram_rsp_valid && dram_rsp_ready_) {
|
||||
dram_rsp_active_ = false;
|
||||
// send memory response
|
||||
if (mem_rsp_active_
|
||||
&& vortex_->mem_rsp_valid && mem_rsp_ready_) {
|
||||
mem_rsp_active_ = false;
|
||||
}
|
||||
if (!dram_rsp_active_) {
|
||||
if (dram_rsp_it != dram_rsp_vec_.end()) {
|
||||
vortex_->dram_rsp_valid = 1;
|
||||
memcpy((uint8_t*)vortex_->dram_rsp_data, dram_rsp_it->block.data(), GLOBAL_BLOCK_SIZE);
|
||||
vortex_->dram_rsp_tag = dram_rsp_it->tag;
|
||||
dram_rsp_vec_.erase(dram_rsp_it);
|
||||
dram_rsp_active_ = true;
|
||||
if (!mem_rsp_active_) {
|
||||
if (mem_rsp_it != mem_rsp_vec_.end()) {
|
||||
vortex_->mem_rsp_valid = 1;
|
||||
memcpy((uint8_t*)vortex_->mem_rsp_data, mem_rsp_it->block.data(), MEM_BLOCK_SIZE);
|
||||
vortex_->mem_rsp_tag = mem_rsp_it->tag;
|
||||
mem_rsp_vec_.erase(mem_rsp_it);
|
||||
mem_rsp_active_ = true;
|
||||
} else {
|
||||
vortex_->dram_rsp_valid = 0;
|
||||
vortex_->mem_rsp_valid = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// handle DRAM stalls
|
||||
bool dram_stalled = false;
|
||||
#ifdef ENABLE_DRAM_STALLS
|
||||
if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) {
|
||||
dram_stalled = true;
|
||||
// handle memory stalls
|
||||
bool mem_stalled = false;
|
||||
#ifdef ENABLE_MEM_STALLS
|
||||
if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) {
|
||||
mem_stalled = true;
|
||||
} else
|
||||
if (dram_rsp_vec_.size() >= DRAM_RQ_SIZE) {
|
||||
dram_stalled = true;
|
||||
if (mem_rsp_vec_.size() >= MEM_RQ_SIZE) {
|
||||
mem_stalled = true;
|
||||
}
|
||||
#endif
|
||||
|
||||
// process DRAM requests
|
||||
if (!dram_stalled) {
|
||||
if (vortex_->dram_req_valid) {
|
||||
if (vortex_->dram_req_rw) {
|
||||
uint64_t byteen = vortex_->dram_req_byteen;
|
||||
unsigned base_addr = (vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE);
|
||||
uint8_t* data = (uint8_t*)(vortex_->dram_req_data);
|
||||
for (int i = 0; i < GLOBAL_BLOCK_SIZE; i++) {
|
||||
// process memory requests
|
||||
if (!mem_stalled) {
|
||||
if (vortex_->mem_req_valid) {
|
||||
if (vortex_->mem_req_rw) {
|
||||
uint64_t byteen = vortex_->mem_req_byteen;
|
||||
unsigned base_addr = (vortex_->mem_req_addr * MEM_BLOCK_SIZE);
|
||||
uint8_t* data = (uint8_t*)(vortex_->mem_req_data);
|
||||
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
|
||||
if ((byteen >> i) & 0x1) {
|
||||
(*ram_)[base_addr + i] = data[i];
|
||||
}
|
||||
}
|
||||
} else {
|
||||
dram_req_t dram_req;
|
||||
dram_req.tag = vortex_->dram_req_tag;
|
||||
dram_req.addr = vortex_->dram_req_addr;
|
||||
ram_->read(vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE, GLOBAL_BLOCK_SIZE, dram_req.block.data());
|
||||
dram_req.cycles_left = DRAM_LATENCY;
|
||||
for (auto& rsp : dram_rsp_vec_) {
|
||||
if (dram_req.addr == rsp.addr) {
|
||||
dram_req.cycles_left = rsp.cycles_left;
|
||||
mem_req_t mem_req;
|
||||
mem_req.tag = vortex_->mem_req_tag;
|
||||
mem_req.addr = vortex_->mem_req_addr;
|
||||
ram_->read(vortex_->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.block.data());
|
||||
mem_req.cycles_left = MEM_LATENCY;
|
||||
for (auto& rsp : mem_rsp_vec_) {
|
||||
if (mem_req.addr == rsp.addr) {
|
||||
mem_req.cycles_left = rsp.cycles_left;
|
||||
break;
|
||||
}
|
||||
}
|
||||
dram_rsp_vec_.emplace_back(dram_req);
|
||||
mem_rsp_vec_.emplace_back(mem_req);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vortex_->dram_req_ready = !dram_stalled;
|
||||
vortex_->mem_req_ready = !mem_stalled;
|
||||
}
|
||||
|
||||
void Simulator::eval_io_bus() {
|
||||
|
|
|
@ -48,23 +48,23 @@ private:
|
|||
|
||||
typedef struct {
|
||||
int cycles_left;
|
||||
std::array<uint8_t, GLOBAL_BLOCK_SIZE> block;
|
||||
std::array<uint8_t, MEM_BLOCK_SIZE> block;
|
||||
uint32_t addr;
|
||||
uint32_t tag;
|
||||
} dram_req_t;
|
||||
} mem_req_t;
|
||||
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
|
||||
void eval();
|
||||
|
||||
void eval_dram_bus();
|
||||
void eval_mem_bus();
|
||||
void eval_io_bus();
|
||||
void eval_csr_bus();
|
||||
|
||||
std::list<dram_req_t> dram_rsp_vec_;
|
||||
bool dram_rsp_active_;
|
||||
std::list<mem_req_t> mem_rsp_vec_;
|
||||
bool mem_rsp_active_;
|
||||
|
||||
bool dram_rsp_ready_;
|
||||
bool mem_rsp_ready_;
|
||||
bool csr_req_ready_;
|
||||
bool csr_req_active_;
|
||||
uint32_t* csr_rsp_value_;
|
||||
|
|
|
@ -1,32 +1,114 @@
|
|||
ASE_BUILD_DIR=build_ase
|
||||
FPGA_BUILD_DIR=build_fpga
|
||||
DEVICE_FAMILY ?= arria10
|
||||
ASE_BUILD_DIR ?= build_ase_$(DEVICE_FAMILY)
|
||||
FPGA_BUILD_DIR ?= build_fpga_$(DEVICE_FAMILY)
|
||||
RTL_DIR=../../rtl
|
||||
|
||||
ifeq (, $(shell which qsub-synth))
|
||||
ifeq ($(shell which qsub-synth),)
|
||||
RUN_SYNTH=$(OPAE_PLATFORM_ROOT)/bin/run.sh > build.log 2>&1 &
|
||||
else
|
||||
RUN_SYNTH=qsub-synth
|
||||
endif
|
||||
|
||||
# control RTL debug print states
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
||||
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
CONFIG4 := -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
CONFIG8 := -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
CONFIG16 := -DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
CONFIG32 := -DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)
|
||||
RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu
|
||||
|
||||
CFLAGS += $(RTL_INCLUDE)
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CFLAGS += $(DBG_FLAGS)
|
||||
else
|
||||
CFLAGS += -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable scope analyzer
|
||||
ifdef SCOPE
|
||||
CFLAGS += -DSCOPE
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
all: ase-1c
|
||||
|
||||
gen_sources_a10:
|
||||
./gen_sources.sh arria10 > sources.txt
|
||||
$(ASE_BUILD_DIR)_1c/Makefile:
|
||||
afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_1c
|
||||
|
||||
gen_sources_s10:
|
||||
./gen_sources.sh stratix10 > sources.txt
|
||||
$(ASE_BUILD_DIR)_2c/Makefile:
|
||||
afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_2c
|
||||
|
||||
ase-1c: gen_sources_a10 setup-ase-1c
|
||||
make -C $(ASE_BUILD_DIR)_1c
|
||||
cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_1c/work
|
||||
$(ASE_BUILD_DIR)_4c/Makefile:
|
||||
afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_4c
|
||||
|
||||
ase-2c: gen_sources_a10 setup-ase-2c
|
||||
make -C $(ASE_BUILD_DIR)_2c
|
||||
cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_2c/work
|
||||
$(FPGA_BUILD_DIR)_1c/build/dcp.qpf:
|
||||
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_1c
|
||||
|
||||
ase-4c: gen_sources_a10 setup-ase-4c
|
||||
make -C $(ASE_BUILD_DIR)_4c
|
||||
cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_4c/work
|
||||
$(FPGA_BUILD_DIR)_2c/build/dcp.qpf:
|
||||
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_2c
|
||||
|
||||
$(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
|
||||
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_4c
|
||||
|
||||
$(FPGA_BUILD_DIR)_8c/build/dcp.qpf:
|
||||
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_8c
|
||||
|
||||
$(FPGA_BUILD_DIR)_16c/build/dcp.qpf:
|
||||
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_16c
|
||||
|
||||
$(FPGA_BUILD_DIR)_32c/build/dcp.qpf:
|
||||
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_32c
|
||||
|
||||
$(FPGA_BUILD_DIR)_64c/build/dcp.qpf:
|
||||
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_64c
|
||||
|
||||
gen-sources-1c:
|
||||
./gen_sources.sh $(CFLAGS) $(CONFIG1) > sources.txt
|
||||
|
||||
gen-sources-2c:
|
||||
./gen_sources.sh $(CFLAGS) $(CONFIG2) > sources.txt
|
||||
|
||||
gen-sources-4c:
|
||||
./gen_sources.sh $(CFLAGS) $(CONFIG4) > sources.txt
|
||||
|
||||
gen-sources-8c:
|
||||
./gen_sources.sh $(CFLAGS) $(CONFIG8) > sources.txt
|
||||
|
||||
gen-sources-16c:
|
||||
./gen_sources.sh $(CFLAGS) $(CONFIG16) > sources.txt
|
||||
|
||||
gen-sources-32c:
|
||||
./gen_sources.sh $(CFLAGS) $(CONFIG32) > sources.txt
|
||||
|
||||
gen-sources-64c:
|
||||
./gen_sources.sh $(CFLAGS) $(CONFIG64) > sources.txt
|
||||
|
||||
# setup
|
||||
|
||||
setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile
|
||||
|
||||
|
@ -34,36 +116,6 @@ setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile
|
|||
|
||||
setup-ase-4c: $(ASE_BUILD_DIR)_4c/Makefile
|
||||
|
||||
$(ASE_BUILD_DIR)_1c/Makefile:
|
||||
afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c
|
||||
|
||||
$(ASE_BUILD_DIR)_2c/Makefile:
|
||||
afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c
|
||||
|
||||
$(ASE_BUILD_DIR)_4c/Makefile:
|
||||
afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c
|
||||
|
||||
fpga-1c: gen_sources_a10 setup-fpga-1c
|
||||
cd $(FPGA_BUILD_DIR)_1c && $(RUN_SYNTH)
|
||||
|
||||
fpga-2c: gen_sources_a10 setup-fpga-2c
|
||||
cd $(FPGA_BUILD_DIR)_2c && $(RUN_SYNTH)
|
||||
|
||||
fpga-4c: gen_sources_a10 setup-fpga-4c
|
||||
cd $(FPGA_BUILD_DIR)_4c && $(RUN_SYNTH)
|
||||
|
||||
fpga-8c: gen_sources_a10 setup-fpga-8c
|
||||
cd $(FPGA_BUILD_DIR)_8c && $(RUN_SYNTH)
|
||||
|
||||
fpga-16c: gen_sources_a10 setup-fpga-16c
|
||||
cd $(FPGA_BUILD_DIR)_16c && $(RUN_SYNTH)
|
||||
|
||||
fpga-32c: gen_sources_s10 setup-fpga-32c
|
||||
cd $(FPGA_BUILD_DIR)_32c && $(RUN_SYNTH)
|
||||
|
||||
fpga-64c: gen_sources_s10 setup-fpga-64c
|
||||
cd $(FPGA_BUILD_DIR)_64c && $(RUN_SYNTH)
|
||||
|
||||
setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf
|
||||
|
||||
setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf
|
||||
|
@ -78,35 +130,42 @@ setup-fpga-32c: $(FPGA_BUILD_DIR)_32c/build/dcp.qpf
|
|||
|
||||
setup-fpga-64c: $(FPGA_BUILD_DIR)_64c/build/dcp.qpf
|
||||
|
||||
$(FPGA_BUILD_DIR)_1c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c
|
||||
# build
|
||||
|
||||
$(FPGA_BUILD_DIR)_2c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c
|
||||
ase-1c: gen-sources-1c setup-ase-1c
|
||||
make -C $(ASE_BUILD_DIR)_1c
|
||||
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_1c/work
|
||||
|
||||
$(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_4c.txt $(FPGA_BUILD_DIR)_4c
|
||||
ase-2c: gen-sources-2c setup-ase-2c
|
||||
make -C $(ASE_BUILD_DIR)_2c
|
||||
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_2c/work
|
||||
|
||||
$(FPGA_BUILD_DIR)_8c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_8c.txt $(FPGA_BUILD_DIR)_8c
|
||||
ase-4c: gen-sources-4c setup-ase-4c
|
||||
make -C $(ASE_BUILD_DIR)_4c
|
||||
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_4c/work
|
||||
|
||||
$(FPGA_BUILD_DIR)_16c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_16c.txt $(FPGA_BUILD_DIR)_16c
|
||||
fpga-1c: gen-sources-1c setup-fpga-1c
|
||||
cd $(FPGA_BUILD_DIR)_1c && $(RUN_SYNTH)
|
||||
|
||||
$(FPGA_BUILD_DIR)_32c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_32c.txt $(FPGA_BUILD_DIR)_32c
|
||||
fpga-2c: gen-sources-2c setup-fpga-2c
|
||||
cd $(FPGA_BUILD_DIR)_2c && $(RUN_SYNTH)
|
||||
|
||||
$(FPGA_BUILD_DIR)_64c/build/dcp.qpf:
|
||||
afu_synth_setup -s sources_64c.txt $(FPGA_BUILD_DIR)_64c
|
||||
fpga-4c: gen-sources-4c setup-fpga-4c
|
||||
cd $(FPGA_BUILD_DIR)_4c && $(RUN_SYNTH)
|
||||
|
||||
run-ase-1c:
|
||||
cd $(ASE_BUILD_DIR)_1c && make sim
|
||||
fpga-8c: gen-sources-8c setup-fpga-8c
|
||||
cd $(FPGA_BUILD_DIR)_8c && $(RUN_SYNTH)
|
||||
|
||||
run-ase-2c:
|
||||
cd $(ASE_BUILD_DIR)_2c && make sim
|
||||
fpga-16c: gen-sources-16c setup-fpga-16c
|
||||
cd $(FPGA_BUILD_DIR)_16c && $(RUN_SYNTH)
|
||||
|
||||
run-ase-4c:
|
||||
cd $(ASE_BUILD_DIR)_4c && make sim
|
||||
fpga-32c: gen-sources-32c setup-fpga-32c
|
||||
cd $(FPGA_BUILD_DIR)_32c && $(RUN_SYNTH)
|
||||
|
||||
fpga-64c: gen-sources-64c setup-fpga-64c
|
||||
cd $(FPGA_BUILD_DIR)_64c && $(RUN_SYNTH)
|
||||
|
||||
# cleanup
|
||||
|
||||
clean-ase-1c:
|
||||
rm -rf $(ASE_BUILD_DIR)_1c sources.txt
|
||||
|
|
|
@ -44,6 +44,9 @@ fpgaconf vortex_afu.gbs
|
|||
# If this reports multiple ports, use --bus with fpgaconf. The bus info can be found with "fpgainfo port".
|
||||
fpgaconf --bus 0xaf vortex_afu.gbs
|
||||
|
||||
# get portid
|
||||
fpgainfo port
|
||||
|
||||
# Running the Test case
|
||||
cd /driver/tests/basic
|
||||
make run-fpga
|
||||
|
@ -60,11 +63,13 @@ qsub-sim
|
|||
make ase
|
||||
|
||||
# tests
|
||||
./run_ase.sh build_ase_1c ../../../driver/tests/basic/basic -n16
|
||||
./run_ase.sh build_ase_1c ../../../driver/tests/demo/demo -n16
|
||||
./run_ase.sh build_ase_1c ../../../driver/tests/dogfood/dogfood -n16
|
||||
./run_ase.sh build_ase_1c ../../../benchmarks/opencl/vecadd/vecadd
|
||||
./run_ase.sh build_ase_1c ../../../benchmarks/opencl/sgemm/sgemm -n4
|
||||
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n1 -t0
|
||||
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n1 -t1
|
||||
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n16
|
||||
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/demo/demo -n16
|
||||
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/dogfood/dogfood -n16
|
||||
./run_ase.sh build_ase_arria10_1c ../../../benchmarks/opencl/vecadd/vecadd
|
||||
./run_ase.sh build_ase_arria10_1c ../../../benchmarks/opencl/sgemm/sgemm -n4
|
||||
|
||||
# modify "vsim_run.tcl" to dump VCD trace
|
||||
vcd file trace.vcd
|
||||
|
@ -75,17 +80,10 @@ run -all
|
|||
tar -zcvf output_files_1c.tar.gz `find ./build_fpga_1c -type f \( -iname \*.rpt -o -iname \*.txt -o -iname \*summary -o -iname \*.log \)`
|
||||
|
||||
# compress VCD trace
|
||||
tar -zcvf vortex.vcd.tar.gz ./build_ase_1c/work/vortex.vcd
|
||||
tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd
|
||||
tar -zcvf trace.fst.tar.gz trace.fst run.log
|
||||
tar -zcvf run.log.tar.gz run.log
|
||||
tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd
|
||||
tar -zcvf vortex.vcd.tar.gz build_ase_1c/work/vortex.vcd
|
||||
tar -zcvf run.log.tar.gz build_ase_1c/work/run.log
|
||||
tar -zcvf vx_scope.vcd.tar.gz vx_scope.vcd
|
||||
tar -cvjf vx_scope.vcd.tar.bz2 vx_scope.vcd
|
||||
tar -cvjf trace.fst.tar.bz2 trace.fst run.log
|
||||
tar -cvjf trace.vcd.tar.bz2 trace.vcd run.log
|
||||
tar -cvjf trace.vcd.tar.bz2 build_ase_arria10_1c/work/run.log build_ase_arria10_1c/work/trace.vcd
|
||||
|
||||
# decompress VCD trace
|
||||
tar -zxvf vortex.vcd.tar.gz
|
||||
|
@ -95,15 +93,4 @@ tar -xvf vortex.vcd.tar.bz2
|
|||
lsof +D build_ase_1c
|
||||
|
||||
# kick off synthesis
|
||||
make -C unittest clean && make -C unittest > unittest/build.log 2>&1 &
|
||||
make -C pipeline clean && make -C pipeline > pipeline/build.log 2>&1 &
|
||||
make -C cache clean && make -C cache > cache/build.log 2>&1 &
|
||||
make -C core clean && make -C core > core/build.log 2>&1 &
|
||||
make -C vortex clean && make -C vortex > vortex/build.log 2>&1 &
|
||||
make -C top1 clean && make -C top1 > top1/build.log 2>&1 &
|
||||
make -C top2 clean && make -C top2 > top2/build.log 2>&1 &
|
||||
make -C top4 clean && make -C top4 > top4/build.log 2>&1 &
|
||||
make -C top8 clean && make -C top8 > top8/build.log 2>&1 &
|
||||
make -C top16 clean && make -C top16 > top16/build.log 2>&1 &
|
||||
make -C top32 clean && make -C top32 > top32/build.log 2>&1 &
|
||||
make -C top64 clean && make -C top64 > top64/build.log 2>&1 &
|
||||
make core
|
|
@ -1,39 +1,46 @@
|
|||
#!/bin/bash
|
||||
|
||||
rtl_dir="../../rtl"
|
||||
exclude_list="VX_fpu_fpnew.v"
|
||||
file_list=""
|
||||
macros=()
|
||||
includes=()
|
||||
|
||||
add_dirs()
|
||||
{
|
||||
for dir in $*; do
|
||||
echo "+incdir+$dir"
|
||||
for file in $(find $dir -maxdepth 1 -name '*.v' -o -name '*.sv' -type f); do
|
||||
exclude=0
|
||||
for fe in $exclude_list; do
|
||||
if [[ $file =~ $fe ]]; then
|
||||
exclude=1
|
||||
fi
|
||||
done
|
||||
if [[ $exclude == 0 ]]; then
|
||||
file_list="$file_list $file"
|
||||
# parse command arguments
|
||||
while getopts D:I:h flag
|
||||
do
|
||||
case "${flag}" in
|
||||
D) macros+=( ${OPTARG} );;
|
||||
I) includes+=( ${OPTARG} );;
|
||||
h) echo "Usage: [-D macro] [-I include] [-h help]"
|
||||
exit 0
|
||||
;;
|
||||
\?)
|
||||
echo "Invalid option: -$OPTARG" 1>&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
# dump macros
|
||||
for value in ${macros[@]}; do
|
||||
echo "+define+$value"
|
||||
done
|
||||
|
||||
# dump include directories
|
||||
for dir in ${includes[@]}; do
|
||||
echo "+incdir+$dir"
|
||||
done
|
||||
|
||||
# dump source files
|
||||
for dir in ${includes[@]}; do
|
||||
for file in $(find $dir -maxdepth 1 -name '*.v' -o -name '*.sv' -type f); do
|
||||
exclude=0
|
||||
for fe in $exclude_list; do
|
||||
if [[ $file =~ $fe ]]; then
|
||||
exclude=1
|
||||
fi
|
||||
done
|
||||
if [[ $exclude == 0 ]]; then
|
||||
echo $file
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
add_files()
|
||||
{
|
||||
for file in $*; do
|
||||
file_list="$file_list $file"
|
||||
done
|
||||
}
|
||||
|
||||
add_dirs $rtl_dir/fp_cores/altera/$1
|
||||
|
||||
add_dirs $rtl_dir/libs $rtl_dir/interfaces $rtl_dir/fp_cores $rtl_dir/cache $rtl_dir/tex_unit $rtl_dir $rtl_dir/afu
|
||||
|
||||
# dump file list
|
||||
for file in $file_list; do
|
||||
echo $file
|
||||
done
|
|
@ -1,8 +1,5 @@
|
|||
+define+NUM_CORES=2
|
||||
|
||||
+define+SYNTHESIS
|
||||
+define+QUARTUS
|
||||
#+define+PERF_ENABLE
|
||||
|
||||
vortex_afu.json
|
||||
QI:vortex_afu.qsf
|
|
@ -1,12 +0,0 @@
|
|||
+define+NUM_CORES=4
|
||||
+define+NUM_CLUSTERS=4
|
||||
#+define+L3_ENABLE=1
|
||||
|
||||
+define+SYNTHESIS
|
||||
+define+QUARTUS
|
||||
#+define+PERF_ENABLE
|
||||
|
||||
vortex_afu16.json
|
||||
QI:vortex_afu.qsf
|
||||
|
||||
C:sources.txt
|
|
@ -1,24 +0,0 @@
|
|||
+define+NUM_CORES=1
|
||||
|
||||
+define+SYNTHESIS
|
||||
+define+QUARTUS
|
||||
#+define+SCOPE
|
||||
#+define+PERF_ENABLE
|
||||
|
||||
#+define+DBG_PRINT_CORE_ICACHE
|
||||
#+define+DBG_PRINT_CORE_DCACHE
|
||||
#+define+DBG_PRINT_CACHE_BANK
|
||||
#+define+DBG_PRINT_CACHE_MSHR
|
||||
#+define+DBG_PRINT_CACHE_TAG
|
||||
#+define+DBG_PRINT_CACHE_DATA
|
||||
#+define+DBG_PRINT_DRAM
|
||||
#+define+DBG_PRINT_PIPELINE
|
||||
#+define+DBG_PRINT_OPAE
|
||||
#+define+DBG_PRINT_AVS
|
||||
#+define+DBG_PRINT_SCOPE
|
||||
#+define+DBG_CACHE_REQ_INFO
|
||||
|
||||
vortex_afu.json
|
||||
QI:vortex_afu.qsf
|
||||
|
||||
C:sources.txt
|
|
@ -1,14 +0,0 @@
|
|||
+define+NUM_CORES=8
|
||||
+define+NUM_CLUSTERS=4
|
||||
#+define+L3_ENABLE=1
|
||||
|
||||
+define+GLOBAL_BLOCK_SIZE=16
|
||||
|
||||
+define+SYNTHESIS
|
||||
+define+QUARTUS
|
||||
#+define+PERF_ENABLE
|
||||
|
||||
vortex_afu.json
|
||||
QI:vortex_afu.qsf
|
||||
|
||||
C:sources.txt
|
|
@ -1,10 +0,0 @@
|
|||
+define+NUM_CORES=4
|
||||
|
||||
+define+SYNTHESIS
|
||||
+define+QUARTUS
|
||||
#+define+PERF_ENABLE
|
||||
|
||||
vortex_afu.json
|
||||
QI:vortex_afu.qsf
|
||||
|
||||
C:sources.txt
|
|
@ -1,14 +0,0 @@
|
|||
+define+NUM_CORES=8
|
||||
+define+NUM_CLUSTERS=8
|
||||
#+define+L3_ENABLE=1
|
||||
|
||||
+define+GLOBAL_BLOCK_SIZE=16
|
||||
|
||||
+define+SYNTHESIS
|
||||
+define+QUARTUS
|
||||
#+define+PERF_ENABLE
|
||||
|
||||
vortex_afu.json
|
||||
QI:vortex_afu.qsf
|
||||
|
||||
C:sources.txt
|
|
@ -1,12 +0,0 @@
|
|||
+define+NUM_CORES=4
|
||||
+define+NUM_CLUSTERS=2
|
||||
#+define+L3_ENABLE=1
|
||||
|
||||
+define+SYNTHESIS
|
||||
+define+QUARTUS
|
||||
#+define+PERF_ENABLE
|
||||
|
||||
vortex_afu8.json
|
||||
QI:vortex_afu.qsf
|
||||
|
||||
C:sources.txt
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue