Merge branch 'master' into graphics

This commit is contained in:
Blaise Tine 2021-05-26 23:33:06 -07:00
commit d42171d2ed
120 changed files with 4269 additions and 2329 deletions

View file

@ -18,10 +18,10 @@ install:
- export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
- export VERILATOR_ROOT=/opt/verilator
- export PATH=$VERILATOR_ROOT/bin:$PATH
- make -s
script:
- ./ci/regression.sh
- ./ci/test_compiler.sh
after_success:
# Gather code coverage

View file

@ -233,7 +233,7 @@ free(allPlatforms);*/
//--cambine-4: Create an OpenCL command queue
oclHandles.queue = clCreateCommandQueue(
oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL);
printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
//printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
@ -383,8 +383,8 @@ void _clRelease() {
errorFlag = true;
}
oclHandles.kernel[nKernel] = NULL;
printf("clReleaseKernel()\n");
}
oclHandles.kernel.clear();
}
if (oclHandles.program != NULL) {
@ -394,6 +394,7 @@ void _clRelease() {
errorFlag = true;
}
oclHandles.program = NULL;
printf("clReleaseProgram()\n");
}
if (oclHandles.queue != NULL) {
@ -403,10 +404,9 @@ void _clRelease() {
errorFlag = true;
}
oclHandles.queue = NULL;
printf("clReleaseCommandQueue()\n");
}
free(oclHandles.devices);
if (oclHandles.context != NULL) {
cl_int resultCL = clReleaseContext(oclHandles.context);
if (resultCL != CL_SUCCESS) {
@ -414,6 +414,17 @@ void _clRelease() {
errorFlag = true;
}
oclHandles.context = NULL;
printf("clReleaseContext()\n");
}
if (oclHandles.devices != NULL) {
cl_int resultCL = clReleaseDevice(oclHandles.devices[0]);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseDevice" << endl;
errorFlag = true;
}
free(oclHandles.devices);
printf("clReleaseDevice()\n");
}
if (errorFlag)
@ -675,7 +686,7 @@ void _clFinish() throw(string) {
void _clInvokeKernel(int kernel_id, int work_items,
int work_group_size) throw(string) {
cl_uint work_dim = WORK_DIM;
cl_event e[1];
//cl_event e[1];
if (work_items % work_group_size != 0) // process situations that work_items
// cannot be divided by work_group_size
work_items =
@ -684,7 +695,7 @@ void _clInvokeKernel(int kernel_id, int work_items,
size_t global_work_size[] = {work_items, 1};
oclHandles.cl_status = clEnqueueNDRangeKernel(
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
global_work_size, local_work_size, 0, 0, &(e[0]));
global_work_size, local_work_size, 0, 0, NULL);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch (oclHandles.cl_status) {
@ -749,13 +760,13 @@ void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x,
cl_uint work_dim = WORK_DIM;
size_t local_work_size[] = {group_x, group_y};
size_t global_work_size[] = {range_x, range_y};
cl_event e[1];
//cl_event e[1];
/*if(work_items%work_group_size != 0) //process situations that work_items
cannot be divided by work_group_size
work_items = work_items + (work_group_size-(work_items%work_group_size));*/
oclHandles.cl_status = clEnqueueNDRangeKernel(
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
global_work_size, local_work_size, 0, 0, &(e[0]));
global_work_size, local_work_size, 0, 0, NULL);
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch (oclHandles.cl_status) {

View file

@ -78,14 +78,15 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
char h_over;
cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask,
d_graph_visited, d_cost, d_over;
try {
//--1 transfer data from host to device
_clInit();
d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
d_updating_graph_mask =
_clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
d_updating_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
@ -94,8 +95,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
_clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
_clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
_clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
h_updating_graph_mask);
_clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char), h_updating_graph_mask);
_clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
_clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
@ -106,6 +106,7 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
kernel_timer.reset();
kernel_timer.start();
#endif
do {
h_over = false;
_clMemcpyH2D(d_over, sizeof(char), &h_over);
@ -136,9 +137,8 @@ void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
_clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
_clMemcpyD2H(d_over, sizeof(char), &h_over);
} while (h_over);
} while (h_over);
_clFinish();
#ifdef PROFILING
kernel_timer.stop();
kernel_time = kernel_timer.getTimeInSeconds();

View file

@ -60,10 +60,10 @@ void compare_results(const datatype *cpu_results, const datatype *gpu_results, c
}
}
if (passed){
std::cout << "--cambine:passed:-)" << endl;
std::cout << "--cambine: passed: -)" << endl;
}
else{
std::cout << "--cambine: failed:-(" << endl;
std::cout << "--cambine: failed :-(" << endl;
}
return ;
}

View file

@ -69,7 +69,7 @@ static cl_uint numPlatforms;
//! All discoverable OpenCL devices (one pointer per platform)
static cl_device_id* devices = NULL;
static cl_uint* numDevices;
static cl_uint* numDevices = NULL;
//! The chosen OpenCL platform
static cl_platform_id platform = NULL;
@ -88,7 +88,6 @@ static cl_command_queue commandQueueNoProf = NULL;
//! Global status of events
static bool eventsEnabled = false;
//-------------------------------------------------------
// Initialization and Cleanup
//-------------------------------------------------------
@ -239,14 +238,34 @@ static bool eventsEnabled = false;
return context;
}*/
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
cl_context cl_init_context(int platform, int dev,int quiet) {
int printInfo=1;
if (platform >= 0 && dev >= 0) printInfo = 0;
cl_int status;
// Used to iterate through the platforms and devices, respectively
cl_uint numPlatforms;
cl_uint numDevices;
// These will hold the platform and device we select (can potentially be
// multiple, but we're just doing one for now)
// cl_platform_id platform = NULL;
@ -376,23 +395,24 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
// Getting platform and device information
numPlatforms = 1;
numDevices = 1;
int platform_touse = 0;
int device_touse = 0;
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices);
status = clGetPlatformIDs(1, platforms, NULL);
numDevices = (cl_uint*)malloc(sizeof(cl_uint)*numPlatforms);
numDevices[0] = 1;
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices[0]);
int platform_touse = 0;
int device_touse = 0;
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
cl_errChk(status, "Oops!", true);
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1, devices, NULL);
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, numDevices[0], devices, NULL);
cl_errChk(status, "Oops!", true);
context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
cl_errChk(status, "Oops!", true);
device=devices[device_touse];
#define PROFILING
#ifdef PROFILING
commandQueue = clCreateCommandQueue(context,
@ -400,7 +420,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
#else
clCommandQueue = clCreateCommandQueue(clGPUContext,
commandQueue = clCreateCommandQueue(context,
devices[device_touse], NULL, &status);
#endif // PROFILING
@ -413,22 +433,34 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
/*!
Release all resources that the user doesn't have access to.
*/
void cl_cleanup()
void cl_cleanup()
{
cl_int status;
// Free the command queue
if(commandQueue) {
clReleaseCommandQueue(commandQueue);
if (commandQueue) {
status = clReleaseCommandQueue(commandQueue);
cl_errChk(status, "Oops!", true);
printf("clReleaseCommandQueue()\n");
}
// Free the context
if(context) {
clReleaseContext(context);
if (context) {
status = clReleaseContext(context);
cl_errChk(status, "Oops!", true);
printf("clReleaseContext()\n");
}
for (int p = 0; p < numPlatforms; ++p) {
for (int d = 0; d < numDevices[p]; ++d) {
status = clReleaseDevice(devices[d]);
cl_errChk(status, "Oops!", true);
printf("clReleaseDevice()\n");
}
}
free(devices);
free(numDevices);
// Free the platforms
free(platforms);
}
@ -443,6 +475,7 @@ void cl_freeKernel(cl_kernel kernel)
if(kernel != NULL) {
status = clReleaseKernel(kernel);
cl_errChk(status, "Releasing kernel object", true);
printf("clReleaseKernel()\n");
}
}
@ -457,6 +490,7 @@ void cl_freeMem(cl_mem mem)
if(mem != NULL) {
status = clReleaseMemObject(mem);
cl_errChk(status, "Releasing mem object", true);
printf("clReleaseMemObject()\n");
}
}
@ -471,6 +505,7 @@ void cl_freeProgram(cl_program program)
if(program != NULL) {
status = clReleaseProgram(program);
cl_errChk(status, "Releasing program object", true);
printf("clReleaseProgram()\n");
}
}
@ -782,27 +817,6 @@ void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size)
cl_unmapBuffer(mem, ptr);
}
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
//-------------------------------------------------------
// Program and kernels
//-------------------------------------------------------
@ -858,17 +872,17 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
fread(source, 1, size, fp);
source[size] = '\0';*/
// Create the program object
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
//cl_program clProgramReturn = clCreateProgramWithBuiltInKernels(context, 1, &device, "Fan1;Fan2", &status);
// read kernel binary from file
// read kernel binary from file
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
status = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(status, "read_kernel_file", true);
cl_int binary_status = 0;
int err = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(err, "read_kernel_file", true);
// Create the program object
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
cl_program clProgramReturn = clCreateProgramWithBinary(
context, 1, &device, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
context, 1, devices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &status);
free(kernel_bin);
cl_errChk(status, "Creating program", true);
@ -1440,4 +1454,4 @@ char* itoa_portable(int value, char* result, int base) {
}
return result;
}
}

View file

@ -76,6 +76,9 @@ int main(int argc, char *argv[]) {
free(b);
free(finalVec);
// OpenClGaussianElimination(context,timing);
cl_cleanup();
printf("Passed!\n");
return 0;
}
@ -142,7 +145,8 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
writeTime += eventTime(writeEvent, command_queue);
clReleaseEvent(writeEvent);
error = clEnqueueWriteBuffer(command_queue, m_dev,
error = clEnqueueWriteBuffer(command_queue,
m_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, m, 0, NULL,
@ -258,6 +262,13 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
printf("%f\n\n", writeTime + kernelTime + readTime);
}
cl_freeMem(a_dev);
cl_freeMem(b_dev);
cl_freeMem(m_dev);
cl_freeKernel(fan1_kernel);
cl_freeKernel(fan2_kernel);
cl_freeProgram(gaussianElim_program);
}
float eventTime(cl_event event, cl_command_queue command_queue) {

View file

@ -69,7 +69,7 @@ static cl_uint numPlatforms;
//! All discoverable OpenCL devices (one pointer per platform)
static cl_device_id* devices = NULL;
static cl_uint* numDevices;
static cl_uint* numDevices = NULL;
//! The chosen OpenCL platform
static cl_platform_id platform = NULL;
@ -265,9 +265,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
if (platform >= 0 && dev >= 0) printInfo = 0;
cl_int status;
// Used to iterate through the platforms and devices, respectively
cl_uint numPlatforms;
cl_uint numDevices;
// These will hold the platform and device we select (can potentially be
// multiple, but we're just doing one for now)
// cl_platform_id platform = NULL;
@ -397,23 +395,24 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
// Getting platform and device information
numPlatforms = 1;
numDevices = 1;
int platform_touse = 0;
int device_touse = 0;
platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices);
status = clGetPlatformIDs(1, platforms, NULL);
numDevices = (cl_uint*)malloc(sizeof(cl_uint)*numPlatforms);
numDevices[0] = 1;
devices = (cl_device_id*)malloc(sizeof(cl_device_id)*numDevices[0]);
int platform_touse = 0;
int device_touse = 0;
status = clGetPlatformIDs(numPlatforms, platforms, NULL);
cl_errChk(status, "Oops!", true);
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, 1, devices, NULL);
status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_DEFAULT, numDevices[0], devices, NULL);
cl_errChk(status, "Oops!", true);
context = clCreateContext(NULL, 1, devices, NULL, NULL, &status);
context = clCreateContext(NULL, numDevices[0], devices, NULL, NULL, &status);
cl_errChk(status, "Oops!", true);
device=devices[device_touse];
#define PROFILING
#ifdef PROFILING
commandQueue = clCreateCommandQueue(context,
@ -421,7 +420,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
#else
clCommandQueue = clCreateCommandQueue(clGPUContext,
commandQueue = clCreateCommandQueue(context,
devices[device_touse], NULL, &status);
#endif // PROFILING
@ -434,22 +433,34 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
/*!
Release all resources that the user doesn't have access to.
*/
void cl_cleanup()
void cl_cleanup()
{
cl_int status;
// Free the command queue
if(commandQueue) {
clReleaseCommandQueue(commandQueue);
if (commandQueue) {
status = clReleaseCommandQueue(commandQueue);
cl_errChk(status, "Oops!", true);
printf("clReleaseCommandQueue()\n");
}
// Free the context
if(context) {
clReleaseContext(context);
if (context) {
status = clReleaseContext(context);
cl_errChk(status, "Oops!", true);
printf("clReleaseContext()\n");
}
for (int p = 0; p < numPlatforms; ++p) {
for (int d = 0; d < numDevices[p]; ++d) {
status = clReleaseDevice(devices[d]);
cl_errChk(status, "Oops!", true);
printf("clReleaseDevice()\n");
}
}
free(devices);
free(numDevices);
// Free the platforms
free(platforms);
}
@ -464,6 +475,7 @@ void cl_freeKernel(cl_kernel kernel)
if(kernel != NULL) {
status = clReleaseKernel(kernel);
cl_errChk(status, "Releasing kernel object", true);
printf("clReleaseKernel()\n");
}
}
@ -478,6 +490,7 @@ void cl_freeMem(cl_mem mem)
if(mem != NULL) {
status = clReleaseMemObject(mem);
cl_errChk(status, "Releasing mem object", true);
printf("clReleaseMemObject()\n");
}
}
@ -492,6 +505,7 @@ void cl_freeProgram(cl_program program)
if(program != NULL) {
status = clReleaseProgram(program);
cl_errChk(status, "Releasing program object", true);
printf("clReleaseProgram()\n");
}
}

View file

@ -49,25 +49,27 @@ int main(int argc, char *argv[]) {
printf("%s --> Distance=%f\n", records[i].recString, records[i].distance);
}
free(recordDistances);
cl_cleanup();
printf("Passed!\n");
return 0;
}
float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
std::vector<LatLong> &locations, float lat,
float lng, int timing) {
// 1. set up kernel
cl_kernel NN_kernel;
cl_int status;
// 1. set up kernel
cl_kernel NN_kernel;
cl_program cl_NN_program;
cl_NN_program = cl_compileProgram((char *)"nearestNeighbor_kernel.cl", NULL);
NN_kernel = clCreateKernel(cl_NN_program, "NearestNeighbor", &status);
status =
cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel", true);
if (status)
exit(1);
cl_errChk(status, (char *)"Error Creating Nearest Neighbor kernel", true);
// 2. set up memory on device and send ipts data to device
// copy ipts(1,2) to device
// also need to alloate memory for the distancePoints
@ -78,9 +80,11 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
d_locations = clCreateBuffer(context, CL_MEM_READ_ONLY,
sizeof(LatLong) * numRecords, NULL, &error);
cl_errChk(error, "ERROR: clCreateBuffer() failed", true);
d_distances = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float) * numRecords, NULL, &error);
cl_errChk(error, "ERROR: clCreateBuffer() failed", true);
cl_command_queue command_queue = cl_getCommandQueue();
cl_event writeEvent, kernelEvent, readEvent;
@ -89,6 +93,7 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
0, // offset
sizeof(LatLong) * numRecords, &locations[0], 0,
NULL, &writeEvent);
cl_errChk(error, "ERROR: clEnqueueWriteBuffer() failed", true);
// 3. send arguments to device
cl_int argchk;
@ -124,8 +129,10 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
&readEvent);
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
if (timing) {
clFinish(command_queue);
clFinish(command_queue);
if (timing) {
cl_ulong eventStart, eventEnd, totalTime = 0;
printf("# Records\tWrite(s) [size]\t\tKernel(s)\tRead(s) "
"[size]\t\tTotal(s)\n");
@ -166,8 +173,14 @@ float *OpenClFindNearestNeighbors(cl_context context, int numRecords,
printf("%f\n\n", (float)(totalTime / 1e9));
}
// 6. return finalized data and release buffers
clReleaseMemObject(d_locations);
clReleaseMemObject(d_distances);
clReleaseEvent(writeEvent);
clReleaseEvent(kernelEvent);
clReleaseEvent(readEvent);
cl_freeMem(d_locations);
cl_freeMem(d_distances);
cl_freeKernel(NN_kernel);
cl_freeProgram(cl_NN_program);
return distances;
}

View file

@ -7,6 +7,8 @@ POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n1024
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
@ -33,19 +35,19 @@ $(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;

View file

@ -29,11 +29,9 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <chrono>
//#define NUM_DATA 65536
#define NUM_DATA 1024
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
@ -85,14 +83,18 @@ uint8_t *kernel_bin = NULL;
///
// Cleanup any created OpenCL resources
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObjects[3]) {
for (int i = 0; i < 3; i++) {
void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
if (kernel_bin)
free(kernel_bin);
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
for (int i = 0; i < 2; i++) {
if (memObjects[i] != 0)
clReleaseMemObject(memObjects[i]);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
@ -103,11 +105,40 @@ void Cleanup(cl_context context, cl_command_queue commandQueue,
if (context != 0)
clReleaseContext(context);
if (kernel_bin) free(kernel_bin);
if (device_id != 0)
clReleaseDevice(device_id);
}
int size = 1024;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
printf("Workload size=%d\n", size);
}
int main(int argc, char **argv) {
printf("enter demo main\n");
// parse command arguments
parse_args(argc, argv);
cl_platform_id platform_id;
cl_device_id device_id;
@ -126,7 +157,7 @@ int main(int argc, char **argv) {
context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err));
cl_command_queue queue;
queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &_err));
queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, NULL, &_err));
cl_kernel kernel = 0;
cl_mem memObjects[2] = {0, 0};
@ -139,7 +170,7 @@ int main(int argc, char **argv) {
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(context, queue, program, kernel, memObjects);
Cleanup(device_id, context, queue, program, kernel, memObjects);
return 1;
} else {
std::cout << "Read program from binary." << std::endl;
@ -148,7 +179,7 @@ int main(int argc, char **argv) {
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
size_t nbytes = sizeof(float) * NUM_DATA;
size_t nbytes = sizeof(float) * size;
printf("attempting to create input buffer\n");
cl_mem input_buffer;
@ -175,13 +206,13 @@ int main(int argc, char **argv) {
printf("attempting to enqueue write buffer\n");
float* h_src = (float*)malloc(nbytes);
for (int i = 0; i < NUM_DATA; i++) {
for (int i = 0; i < size; i++) {
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
}
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
free(h_src);
size_t global_work_size[] = {NUM_DATA/2, NUM_DATA/2};
size_t global_work_size[] = {size/2, size/2};
printf("attempting to enqueue kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
@ -196,18 +227,13 @@ int main(int argc, char **argv) {
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
/*printf("Result:");
for (int i = 0; i < NUM_DATA; i++) {
for (int i = 0; i < size; i++) {
float data = h_dst[i];
printf(" %f", data);
}*/
free(h_dst);
CL_CHECK(clReleaseMemObject(memObjects[0]));
CL_CHECK(clReleaseMemObject(memObjects[1]));
CL_CHECK(clReleaseKernel(kernel));
CL_CHECK(clReleaseProgram(program));
CL_CHECK(clReleaseContext(context));
Cleanup(device_id, context, queue, program, kernel, memObjects);
return 0;
}

View file

@ -7,6 +7,8 @@ POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n16
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
@ -33,19 +35,19 @@ $(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;

View file

@ -35,8 +35,6 @@
#include <unistd.h>
#include <chrono>
#define NUM_DATA (16+2)
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
@ -159,14 +157,18 @@ float poclu_cl_half_to_float(cl_half value) {
///
// Cleanup any created OpenCL resources
//
void Cleanup(cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObjects[3]) {
for (int i = 0; i < 3; i++) {
void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
if (kernel_bin)
free(kernel_bin);
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
for (int i = 0; i < 2; i++) {
if (memObjects[i] != 0)
clReleaseMemObject(memObjects[i]);
}
if (commandQueue != 0)
clReleaseCommandQueue(commandQueue);
if (kernel != 0)
clReleaseKernel(kernel);
@ -177,11 +179,40 @@ void Cleanup(cl_context context, cl_command_queue commandQueue,
if (context != 0)
clReleaseContext(context);
if (kernel_bin) free(kernel_bin);
if (device_id != 0)
clReleaseDevice(device_id);
}
int size = 16+2;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg)+2;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
printf("Workload size=%d\n", size);
}
int main(int argc, char **argv) {
printf("enter demo main\n");
// parse command arguments
parse_args(argc, argv);
cl_platform_id platform_id;
cl_device_id device_id;
@ -213,7 +244,7 @@ int main(int argc, char **argv) {
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(context, queue, program, kernel, memObjects);
Cleanup(device_id, context, queue, program, kernel, memObjects);
return 1;
} else {
std::cout << "Read program from binary." << std::endl;
@ -222,7 +253,7 @@ int main(int argc, char **argv) {
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
size_t nbytes = sizeof(float) * NUM_DATA * NUM_DATA;
size_t nbytes = sizeof(float) * size * size;
printf("attempting to create input buffer\n");
cl_mem input_buffer;
@ -235,7 +266,7 @@ int main(int argc, char **argv) {
memObjects[0] = input_buffer;
memObjects[1] = output_buffer;
long long ldc = NUM_DATA;
long long ldc = size;
float m0 = 1.0;
float m1 = 1.0;
@ -265,15 +296,15 @@ int main(int argc, char **argv) {
printf("attempting to enqueue write buffer\n");
float* h_src = (float*)malloc(nbytes);
for (int i = 0; i < NUM_DATA * NUM_DATA; i++) {
for (int i = 0; i < size * size; i++) {
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
}
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
free(h_src);
size_t global_offset[2] = {1, 1};
size_t global_work_size[2] = {NUM_DATA - 2, NUM_DATA - 2}; // avoid the edges
const size_t local_work_size[2] = {NUM_DATA - 2, 1};
size_t global_work_size[2] = {size - 2, size - 2}; // avoid the edges
const size_t local_work_size[2] = {size - 2, 1};
printf("attempting to enqueue kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, global_offset,
@ -286,20 +317,15 @@ int main(int argc, char **argv) {
printf("Download destination buffer\n");
float* h_dst = (float*)malloc(nbytes);
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
/*printf("Result:");
for (int i = 0; i < NUM_DATA * NUM_DATA; i++) {
for (int i = 0; i < size; i++) {
float data = h_dst[i];
printf(" %f", data);
}*/
free(h_dst);
CL_CHECK(clReleaseMemObject(memObjects[0]));
CL_CHECK(clReleaseMemObject(memObjects[1]));
CL_CHECK(clReleaseKernel(kernel));
CL_CHECK(clReleaseProgram(program));
CL_CHECK(clReleaseContext(context));
Cleanup(device_id, context, queue, program, kernel, memObjects);
return 0;
}

View file

@ -129,6 +129,8 @@ static void parse_args(int argc, char **argv) {
printf("Error: invalid size!\n");
exit(-1);
}
printf("Workload size=%d\n", size);
}
int main (int argc, char **argv) {
@ -218,7 +220,8 @@ int main (int argc, char **argv) {
matmul(h_ref, h_a, h_b, size, size, size);
for (int i = 0; i < (size * size); i++) {
if (!almost_equal(h_c[i], h_ref[i])) {
printf("*** error: [%d] expected=%f, actual=%f\n", i, h_ref[i], h_c[i]);
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f\n", i, h_ref[i], h_c[i]);
++errors;
}
}

View file

@ -112,6 +112,8 @@ static void parse_args(int argc, char **argv) {
exit(-1);
}
}
printf("Workload size=%d\n", size);
}
int main (int argc, char **argv) {
@ -196,7 +198,8 @@ int main (int argc, char **argv) {
for (int i = 0; i < size; ++i) {
float ref = h_a[i] + h_b[i];
if (!almost_equal(h_c[i], ref)) {
printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f, a=%f, b=%f\n", i, ref, h_c[i], h_a[i], h_b[i]);
++errors;
}
}

View file

@ -123,11 +123,15 @@ esac
if [ -d "$VORTEX_HOME/driver/tests/$APP" ];
then
APP_PATH=$VORTEX_HOME/driver/tests/$APP
else
elif [ -d "$VORTEX_HOME/benchmarks/opencl/$APP" ];
then
APP_PATH=$VORTEX_HOME/benchmarks/opencl/$APP
else
echo "Application folder found: $APP"
exit -1
fi
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF_FLAG"
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS"

View file

@ -3,25 +3,62 @@
# exit when any command fails
set -e
make -s
# Dogfood tests
./ci/test_runtime.sh
./ci/test_riscv_isa.sh
./ci/test_opencl.sh
./ci/test_driver.sh
./ci/test_simx.sh
./ci/test_compiler.sh
# Build tests disabling extensions
# warp/threads configurations
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=demo
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo
# cores clustering
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=1 --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
# L2/L3
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=demo --args="-n1"
# build flags
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1"
# disabling M extension
CONFIGS=-DEXT_M_DISABLE make -C hw/simulate
# disabling F extension
CONFIGS=-DEXT_F_DISABLE make -C hw/simulate
# disable shared memory
CONFIGS=-DSM_ENABLE=0 make -C hw/simulate
# Blackbox tests
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=demo --args="-n1"
# using FPNEW core
FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
# test 128-bit MEM block
CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
# test 128-bit MEM and DRAM block
CONFIGS="-DMEM_BLOCK_SIZE=16 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
# test 128-bit DRAM block
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm
# test vlsim memory stress
CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=4 --app=sgemm

70
doc/Cache_Subsystem.md Normal file
View file

@ -0,0 +1,70 @@
# Vortex Cache Subsystem
The Vortex Cache Sub-system has the following main properties:
- High-bandwidth with bank parallelism
- Snoop protocol to flush data for CPU access
- Generic design: Dcache, Icache, Shared Memory, L2 cache, L3 cache
### Cache Hierarchy
![Image of Cache Hierarchy](./Images/cache_hierarchy.png)
- Cache can be configured to be any level in the hierarchy
- Caches communicate via snooping
- Cache flush from AFU is passed down the hierarchy
### VX_cache.v (Top Module)
VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache](./Images/vortex_cache_top_module.png)
- Configurable (Cache size, number of banks, bank line size, etc.)
- I/O signals
- Core Request
- Core Rsp
- DRAM Req
- DRAM Rsp
- Snoop Rsp
- Snoop Rsp
- Snoop Forwarding Out
- Snoop Forwarding In
- Bank Select
- Assigns valid and ready signals for each bank
- Snoop Forwarder
- DRAM Request Arbiter
- Prepares cache response for communication with DRAM
- Snoop Response Arbiter
- Sends snoop response
- Core Response Merge
- Cache accesses one line at a time. As a result, each request may not come back in the same response. This module tries to recombine the responses by thread ID.
### VX_bank.v
VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache Bank](./Images/vortex_bank.png)
- Allows for high throughput
- Each bank contains queues to hold requests to the cache
- I/O signals
- Core request
- Core Response
- DRAM Fill Requests
- DRAM Fill Response
- DRAM WB Requests
- Snp Request
- Snp Response
- Request Priority: DRAM fill, miss reserve, core request, snoop request
- Snoop Request Queue
- DRAM Fill Queue
- Core Req Arbiter
- Requests to be processed by the bank
- Tag Data Store
- Registers for valid, dirty, dirtyb, tag, and data
- Length of registers determined by lines in the bank
- Tag Data Access:
- I/O: stall, snoop info, force request miss
- Writes to cache or sends read response; hit or miss determined here
- A missed request goes to the miss reserve if it is not a snoop request or DRAM fill

35
doc/Codebase.md Normal file
View file

@ -0,0 +1,35 @@
# Vortex Codebase
The directory/file layout of the Vortex codebase is as followed:
- `benchmark`: contains opencl, risc-v, and vector tests
- `opencl`: contains basic kernel operation tests (i.e. vector add, transpose, dot product)
- `riscv`: contains official riscv tests which are pre-compiled into binaries
- `vector`: tests for vector instructions (not yet implemented)
- `ci`: contain tests to be run during continuous integration (Travis CI)
- driver, opencl, riscv_isa, and runtime tests
- `driver`: contains driver software implementation (software that is run on the host to communicate with the vortex processor)
- `opae`: contains code for driver that runs on FPGA
- `rtlsim`: contains code for driver that runs on local machine (driver built using verilator which converts rtl to c++ binary)
- `simx`: contains code for driver that runs on local machine (vortex)
- `include`: contains vortex.h which has the vortex API that is used by the drivers
- `runtime`: contains software used inside kernel programs to expose GPGPU capabilities
- `include`: contains vortex API needed for runtime
- `linker`: contains linker file for compiling kernels
- `src`: contains implementation of vortex API (from include folder)
- `tests`: contains runtime tests
- `simple`: contains test for GPGPU functionality allowed in vortex
- `simx`: contains simX, the cycle approximate simulator for vortex
- `miscs`: contains old code that is no longer used
- `hw`:
- `unit_tests`: contains unit test for RTL of cache and queue
- `syn`: contains all synthesis scripts (quartus and yosys)
- `quartus`: contains code to synthesis cache, core, pipeline, top, and vortex stand-alone
- `simulate`: contains RTL simulator (verilator)
- `testbench.cpp`: runs either the riscv, runtime, or opencl tests
- `opae`: contains source code for the accelerator functional unit (AFU) and code which programs the fpga
- `rtl`: contains rtl source code
- `cache`: contains cache subsystem code
- `fp_cores`: contains floating point unit code
- `interfaces`: contains code that handles communication for each of the units of the microarchitecture
- `libs`: contains general-purpose modules (i.e., buffers, encoders, arbiters, pipe registers)

Binary file not shown.

After

Width:  |  Height:  |  Size: 60 KiB

BIN
doc/Images/vortex_bank.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 77 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 517 KiB

94
doc/Microarchitecture.md Normal file
View file

@ -0,0 +1,94 @@
# Vortex Microarchitecture
### Vortex GPGPU Execution Model
Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with a single warp issued per cycle.
- **Threads**
- Smallest unit of computation
- Each thread has its own register file (32 int + 32 fp registers)
- Threads execute in parallel
- **Warps**
- A logical clster of threads
- Each thread in a warp execute the same instruction
- The PC is shared; maintain thread mask for Writeback
- Warp's execution is time-multiplexed at log steps
- Ex. warp 0 executes at cycle 0, warp 1 executes at cycle 1
### Vortex RISC-V ISA Extension
- **Thread Mask Control**
- Control the number of warps to activate during execution
- `TMC` *count*: activate count threads
- **Warp Scheduling**
- Control the number of warps to activate during execution
- `WSPAWN` *count, addr*: activate count warps and jump to addr location
- **Control-Flow Divergence**
- Control threads to activate when a branch diverges
- `SPLIT` *predicate*: apply 'taken' predicate thread mask adn save 'not-taken' into IPDOM stack
- `JOIN`: restore 'not-taken' thread mask
- **Warp Synchronization**
- `BAR` *id, count*: stall warps entering barrier *id* until count is reached
### Vortex Pipeline/Datapath
![Image of Vortex Microarchitecture](./Images/vortex_microarchitecture_v2.png)
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB.
- **Fetch**
- Warp Scheduler
- Track stalled & active warps, resolve branches and barriers, maintain split/join IPDOM stack
- Instruction Cache
- Retrieve instruction from cache, issue I-cache requests/responses
- **Decode**
- Decode fetched instructions, notify warp scheduler when the following instructions are decoded:
- Branch, tmc, split/join, wspawn
- Precompute used_regs mask (needed for Issue stage)
- **Issue**
- Scheduling
- In-order issue (operands/execute unit ready), out-of-order commit
- IBuffer
- Store fetched instructions, separate queues per-warp, selects next warp through round-robin scheduling
- Scoreboard
- Track in-use registers
- GPRs (General-Purpose Registers) stage
- Fetch issued instruction operands and send operands to execute unit
- **Execute**
- ALU Unit
- Single-cycle operations (+,-,>>,<<,&,|,^), Branch instructions (Share ALU resources)
- MULDIV Unit
- Multiplier - done in 2 cycles
- Divider - division and remainder, done in 32 cycles
- Implements serial alogrithm (Stalls the pipeline)
- FPU Unit
- Multi-cycle operations, uses `FPnew` Library on ASIC, uses hard DSPs on FPGA
- CSR Unit
- Store constant status registers - device caps, FPU status flags, performance counters
- Handle external CSR requests (requests from host CPU)
- LSU Unit
- Handle load/store operations, issue D-cache requests, handle D-cache responses
- Commit load responses - saves storage, Scoreboard tracks completion
- GPGPU Unit
- Handle GPGPU instructions
- TMC, WSPAWN, SPLIT, BAR
- JOIN is handled by Warp Scheduler (upon SPLIT response)
- **Commit**
- Commit
- Update CSR flags, update performance counters
- Writeback
- Write result back to GPRs, notify Scoreboard (release in-use register), select candidate instruction (ALU unit has highest priority)
- **Clustering**
- Group mulitple cores into clusters (optionally share L2 cache)
- Group multiple clusters (optionally share L3 cache)
- Configurable at build time
- Default configuration:
- #Clusters = 1
- #Cores = 4
- #Warps = 4
- #Threads = 4
- **FPGA AFU Interface**
- Manage CPU-GPU comunication
- Query devices caps, load kernel instructions and resource buffers, start kernel execution, read destination buffers
- Local Memory - GPU access to local DRAM
- Reserved I/O addresses - redirect to host CPU, console output

View file

@ -24,10 +24,9 @@ Running tests under specific drivers (rtlsim,simx,fpga) is done using the script
- *L3cache* - used to enable the shared l3cache among the Vortex clusters.
- *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, vlsim, fpga, or simx).
- *Debug* - used to enable debug mode for the Vortex simulation.
- *Scope* -
- *Perf* - is used to enable the detailed performance counters within the Vortex simulation.
- *App* - is used to specify which test/benchmark to run in the Vortex simulation. The main choices are vecadd, sgemm, basic, demo, and dogfood. Other tests/benchmarks are located in the `/benchmarks/opencl` folder though not all of them work wit the current version of Vortex.
- *Args* -
- *Perf* - used to enable the detailed performance counters within the Vortex simulation.
- *App* - used to specify which test/benchmark to run in the Vortex simulation. The main choices are vecadd, sgemm, basic, demo, and dogfood. Other tests/benchmarks are located in the `/benchmarks/opencl` folder though not all of them work wit the current version of Vortex.
- *Args* - used to pass additional arguments to the application.
Example use of command line arguments: Run the sgemm benchmark using the vlsim driver with a Vortex configuration of 1 cluster, 4 cores, 4 warps, and 4 threads.

View file

@ -2,10 +2,12 @@
### Table of Contents
- Vortex Architecture
- [Vortex Codebase Layout](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Codebase.md)
- [Vortex Microarchitecture and Extended RISC-V ISA](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Microarchitecture.md)
- [Vortex Cache Subsystem](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Cache_Subsystem.md)
- Vortex Software
- [Vortex Simulation](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Simulation.md)
- [FPGA](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Flubber_FPGA_Startup_Guide.md)
- [FPGA Configuration, Program and Test](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/Flubber_FPGA_Startup_Guide.md)
- Debugging
- Useful Links

View file

@ -1,4 +1,4 @@
all: stub rtlsim simx opae
all: stub rtlsim simx opae tests
stub:
$(MAKE) -C stub
@ -12,10 +12,14 @@ rtlsim:
simx:
$(MAKE) -C simx
tests:
$(MAKE) -C tests
clean:
$(MAKE) clean -C stub
$(MAKE) clean -C opae
$(MAKE) clean -C rtlsim
$(MAKE) clean -C simx
$(MAKE) clean -C tests
.PHONY: all stub opae rtlsim simx clean
.PHONY: all stub opae rtlsim simx tests clean

View file

@ -33,6 +33,13 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_
while (offset < size) {
auto chunk_size = std::min<size_t>(buffer_transfer_size, size - offset);
std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size);
/*printf("** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset);
for (int i = 0, n = ((chunk_size+7)/8); i < n; ++i) {
printf("%08x", ((uint64_t*)((uint8_t*)content + offset))[n-1-i]);
}
printf("\n");*/
err = vx_copy_to_dev(buffer, kernel_base_addr + offset, chunk_size, 0);
if (err != 0) {
vx_buf_release(buffer);
@ -115,10 +122,10 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
uint64_t smem_writes = 0;
uint64_t smem_bank_stalls = 0;
// PERF: memory
uint64_t dram_reads = 0;
uint64_t dram_writes = 0;
uint64_t dram_stalls = 0;
uint64_t dram_lat = 0;
uint64_t mem_reads = 0;
uint64_t mem_writes = 0;
uint64_t mem_stalls = 0;
uint64_t mem_lat = 0;
#endif
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
@ -255,21 +262,21 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization);
smem_bank_stalls += smem_bank_st_per_core;
// PERF: DRAM
uint64_t dram_reads_per_core, dram_writes_per_core, dram_stalls_per_core, dram_lat_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_READS, CSR_MPM_DRAM_READS_H, &dram_reads_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_WRITES, CSR_MPM_DRAM_WRITES_H, &dram_writes_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_ST, CSR_MPM_DRAM_ST_H, &dram_stalls_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_DRAM_LAT, CSR_MPM_DRAM_LAT_H, &dram_lat_per_core);
int dram_utilization = (int)((double(dram_reads_per_core + dram_writes_per_core) / double(dram_reads_per_core + dram_writes_per_core + dram_stalls_per_core)) * 100);
int dram_avg_lat = (int)(double(dram_lat_per_core) / double(dram_reads_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram requests=%ld (reads=%ld, writes=%ld)\n", core_id, (dram_reads_per_core + dram_writes_per_core), dram_reads_per_core, dram_writes_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram stalls=%ld (utilization=%d%%)\n", core_id, dram_stalls_per_core, dram_utilization);
if (num_cores > 1) fprintf(stream, "PERF: core%d: dram average latency=%d cycles\n", core_id, dram_avg_lat);
dram_reads += dram_reads_per_core;
dram_writes += dram_writes_per_core;
dram_stalls += dram_stalls_per_core;
dram_lat += dram_lat_per_core;
// PERF: memory
uint64_t mem_reads_per_core, mem_writes_per_core, mem_stalls_per_core, mem_lat_per_core;
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_READS, CSR_MPM_MEM_READS_H, &mem_reads_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_WRITES, CSR_MPM_MEM_WRITES_H, &mem_writes_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_ST, CSR_MPM_MEM_ST_H, &mem_stalls_per_core);
ret |= vx_csr_get_l(device, core_id, CSR_MPM_MEM_LAT, CSR_MPM_MEM_LAT_H, &mem_lat_per_core);
int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100);
int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core));
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core);
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization);
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat);
mem_reads += mem_reads_per_core;
mem_writes += mem_writes_per_core;
mem_stalls += mem_stalls_per_core;
mem_lat += mem_lat_per_core;
#endif
}
@ -282,8 +289,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
int dram_utilization = (int)((double(dram_reads + dram_writes) / double(dram_reads + dram_writes + dram_stalls)) * 100);
int dram_avg_lat = (int)(double(dram_lat) / double(dram_reads));
int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100);
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
@ -306,9 +313,9 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
fprintf(stream, "PERF: dram requests=%ld (reads=%ld, writes=%ld)\n", (dram_reads + dram_writes), dram_reads, dram_writes);
fprintf(stream, "PERF: dram stalls=%ld (utilization=%d%%)\n", dram_stalls, dram_utilization);
fprintf(stream, "PERF: dram average latency=%d cycles\n", dram_avg_lat);
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization);
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
#endif
return ret;

View file

@ -1,8 +1,7 @@
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CFLAGS += -Wno-aligned-new -Wno-maybe-uninitialized
CFLAGS += -DUSE_VLSIM -fPIC -Wno-maybe-uninitialized
CFLAGS += -I../../../../hw
# control RTL debug print states
@ -13,7 +12,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
@ -22,15 +21,9 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
CFLAGS += -fPIC
CFLAGS += -DUSE_VLSIM $(CONFIGS)
CFLAGS += $(CONFIGS)
CFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
@ -49,10 +42,11 @@ TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
@ -83,16 +77,20 @@ endif
VL_FLAGS += -DNOPAE
CFLAGS += -DNOPAE
# use DPI FPU
VL_FLAGS += -DFPU_DPI
# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)
PROJECT = libopae-c-vlsim.so
all: $(PROJECT)
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
../../../hw/scripts/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
$(PROJECT): $(SRCS)
$(PROJECT): $(SRCS) vortex_afu.h
verilator --exe --cc $(TOP) --top-module $(TOP) $(RTL_INCLUDE) $(VL_FLAGS) $(SRCS) -CFLAGS '$(CFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT)
make -j -C obj_dir -f V$(TOP).mk
clean:
rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh
rm -rf $(PROJECT) obj_dir ../scope-defs.h $(RTL_DIR)/scope-defs.vh vortex_afu.h

View file

@ -10,10 +10,23 @@
#define RESET_DELAY 4
#define ENABLE_DRAM_STALLS
#define DRAM_LATENCY 24
#define DRAM_RQ_SIZE 16
#define DRAM_STALLS_MODULO 16
#define ENABLE_MEM_STALLS
#ifndef MEM_LATENCY
#define MEM_LATENCY 24
#endif
#ifndef MEM_RQ_SIZE
#define MEM_RQ_SIZE 16
#endif
#ifndef MEM_STALLS_MODULO
#define MEM_STALLS_MODULO 16
#endif
#ifndef VERILATOR_RESET_VALUE
#define VERILATOR_RESET_VALUE 2
#endif
uint64_t timestamp = 0;
@ -23,7 +36,7 @@ double sc_time_stamp() {
opae_sim::opae_sim() {
// force random values for unitialized signals
Verilated::randReset(2);
Verilated::randReset(VERILATOR_RESET_VALUE);
Verilated::randSeed(50);
// Turn off assertion before reset
@ -137,16 +150,19 @@ void opae_sim::flush() {
void opae_sim::reset() {
host_buffers_.clear();
dram_reads_.clear();
host_buffers_.clear();
cci_reads_.clear();
cci_writes_.clear();
vortex_afu_->vcp2af_sRxPort_c0_rspValid = 0;
vortex_afu_->vcp2af_sRxPort_c1_rspValid = 0;
vortex_afu_->vcp2af_sRxPort_c0_TxAlmFull = 0;
vortex_afu_->vcp2af_sRxPort_c1_TxAlmFull = 0;
vortex_afu_->avs_readdatavalid = 0;
vortex_afu_->avs_waitrequest = 0;
for (int b = 0; b < PLATFORM_PARAM_LOCAL_MEMORY_BANKS; ++b) {
mem_reads_[b].clear();
vortex_afu_->avs_readdatavalid[b] = 0;
vortex_afu_->avs_waitrequest[b] = 0;
}
vortex_afu_->reset = 1;
@ -268,84 +284,89 @@ void opae_sim::sTxPort_bus() {
}
void opae_sim::avs_bus() {
// update DRAM responses schedule
for (auto& rsp : dram_reads_) {
if (rsp.cycles_left > 0)
rsp.cycles_left -= 1;
}
// schedule DRAM responses in FIFO order
std::list<dram_rd_req_t>::iterator dram_rd_it(dram_reads_.end());
if (!dram_reads_.empty()
&& (0 == dram_reads_.begin()->cycles_left)) {
dram_rd_it = dram_reads_.begin();
}
// send DRAM response
vortex_afu_->avs_readdatavalid = 0;
if (dram_rd_it != dram_reads_.end()) {
vortex_afu_->avs_readdatavalid = 1;
memcpy(vortex_afu_->avs_readdata, dram_rd_it->data.data(), CACHE_BLOCK_SIZE);
uint32_t addr = dram_rd_it->addr;
dram_reads_.erase(dram_rd_it);
/*printf("%0ld: [sim] DRAM Rd Rsp: addr=%x, pending={", timestamp, addr * CACHE_BLOCK_SIZE);
for (auto& req : dram_reads_) {
if (req.cycles_left != 0)
printf(" !%0x", req.addr * CACHE_BLOCK_SIZE);
else
printf(" %0x", req.addr * CACHE_BLOCK_SIZE);
for (int b = 0; b < PLATFORM_PARAM_LOCAL_MEMORY_BANKS; ++b) {
// update memory responses schedule
for (auto& rsp : mem_reads_[b]) {
if (rsp.cycles_left > 0)
rsp.cycles_left -= 1;
}
printf("}\n");*/
}
// handle DRAM stalls
bool dram_stalled = false;
#ifdef ENABLE_DRAM_STALLS
if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) {
dram_stalled = true;
} else
if (dram_reads_.size() >= DRAM_RQ_SIZE) {
dram_stalled = true;
}
#endif
// process DRAM requests
if (!dram_stalled) {
assert(!vortex_afu_->avs_read || !vortex_afu_->avs_write);
if (vortex_afu_->avs_write) {
assert(0 == vortex_afu_->mem_bank_select);
uint64_t byteen = vortex_afu_->avs_byteenable;
unsigned base_addr = (vortex_afu_->avs_address * CACHE_BLOCK_SIZE);
uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata);
for (int i = 0; i < CACHE_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
ram_[base_addr + i] = data[i];
}
}
// schedule memory responses in FIFO order
std::list<mem_rd_req_t>::iterator mem_rd_it(mem_reads_[b].end());
if (!mem_reads_[b].empty()
&& (0 == mem_reads_[b].begin()->cycles_left)) {
mem_rd_it = mem_reads_[b].begin();
}
if (vortex_afu_->avs_read) {
assert(0 == vortex_afu_->mem_bank_select);
dram_rd_req_t dram_req;
dram_req.addr = vortex_afu_->avs_address;
ram_.read(vortex_afu_->avs_address * CACHE_BLOCK_SIZE, CACHE_BLOCK_SIZE, dram_req.data.data());
dram_req.cycles_left = DRAM_LATENCY;
for (auto& rsp : dram_reads_) {
if (dram_req.addr == rsp.addr) {
dram_req.cycles_left = rsp.cycles_left;
break;
}
}
dram_reads_.emplace_back(dram_req);
/*printf("%0ld: [sim] DRAM Rd Req: addr=%x, pending={", timestamp, dram_req.addr * CACHE_BLOCK_SIZE);
for (auto& req : dram_reads_) {
// send memory response
vortex_afu_->avs_readdatavalid[b] = 0;
if (mem_rd_it != mem_reads_[b].end()) {
vortex_afu_->avs_readdatavalid[b] = 1;
memcpy(vortex_afu_->avs_readdata[b], mem_rd_it->data.data(), MEM_BLOCK_SIZE);
uint32_t addr = mem_rd_it->addr;
mem_reads_[b].erase(mem_rd_it);
/*printf("%0ld: [sim] MEM Rd Rsp: addr=%x, pending={", timestamp, addr * MEM_BLOCK_SIZE);
for (auto& req : mem_reads_[b]) {
if (req.cycles_left != 0)
printf(" !%0x", req.addr * CACHE_BLOCK_SIZE);
printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
else
printf(" %0x", req.addr * CACHE_BLOCK_SIZE);
printf(" %0x", req.addr * MEM_BLOCK_SIZE);
}
printf("}\n");*/
}
}
}
vortex_afu_->avs_waitrequest = dram_stalled;
// handle memory stalls
bool mem_stalled = false;
#ifdef ENABLE_MEM_STALLS
if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) {
mem_stalled = true;
} else
if (mem_reads_[b].size() >= MEM_RQ_SIZE) {
mem_stalled = true;
}
#endif
// process memory requests
if (!mem_stalled) {
assert(!vortex_afu_->avs_read[b] || !vortex_afu_->avs_write[b]);
if (vortex_afu_->avs_write[b]) {
uint64_t byteen = vortex_afu_->avs_byteenable[b];
unsigned base_addr = vortex_afu_->avs_address[b] * MEM_BLOCK_SIZE;
uint8_t* data = (uint8_t*)(vortex_afu_->avs_writedata[b]);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
ram_[base_addr + i] = data[i];
}
}
/*printf("%0ld: [sim] MEM Wr Req: addr=%x, data=", timestamp, base_addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%0x", data[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");*/
}
if (vortex_afu_->avs_read[b]) {
mem_rd_req_t mem_req;
mem_req.addr = vortex_afu_->avs_address[b];
ram_.read(vortex_afu_->avs_address[b] * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.data.data());
mem_req.cycles_left = MEM_LATENCY;
for (auto& rsp : mem_reads_[b]) {
if (mem_req.addr == rsp.addr) {
mem_req.cycles_left = rsp.cycles_left;
break;
}
}
mem_reads_[b].emplace_back(mem_req);
/*printf("%0ld: [sim] MEM Rd Req: addr=%x, pending={", timestamp, mem_req.addr * MEM_BLOCK_SIZE);
for (auto& req : mem_reads_[b]) {
if (req.cycles_left != 0)
printf(" !%0x", req.addr * MEM_BLOCK_SIZE);
else
printf(" %0x", req.addr * MEM_BLOCK_SIZE);
}
printf("}\n");*/
}
}
vortex_afu_->avs_waitrequest[b] = mem_stalled;
}
}

View file

@ -1,14 +1,16 @@
#pragma once
#include "verilated.h"
//#include "verilated_stub.h"
#include "Vvortex_afu_shim.h"
#include "Vvortex_afu_shim__Syms.h"
#include "verilated.h"
#ifdef VCD_OUTPUT
#include <verilated_vcd_c.h>
#endif
#include <VX_config.h>
#include "vortex_afu.h"
#include "ram.h"
#include <ostream>
@ -16,7 +18,10 @@
#include <list>
#include <unordered_map>
#define CACHE_BLOCK_SIZE 64
#undef MEM_BLOCK_SIZE
#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8)
#define CACHE_BLOCK_SIZE 64
class opae_sim {
public:
@ -40,9 +45,9 @@ private:
typedef struct {
int cycles_left;
std::array<uint8_t, CACHE_BLOCK_SIZE> data;
std::array<uint8_t, MEM_BLOCK_SIZE> data;
uint32_t addr;
} dram_rd_req_t;
} mem_rd_req_t;
typedef struct {
int cycles_left;
@ -77,7 +82,7 @@ private:
std::unordered_map<int64_t, host_buffer_t> host_buffers_;
std::list<dram_rd_req_t> dram_reads_;
std::list<mem_rd_req_t> mem_reads_ [PLATFORM_PARAM_LOCAL_MEMORY_BANKS];
std::list<cci_rd_req_t> cci_reads_;

View file

@ -1,13 +1,16 @@
`include "VX_define.vh"
`include "VX_platform.vh"
`IGNORE_WARNINGS_BEGIN
`include "vortex_afu.vh"
`IGNORE_WARNINGS_END
/* verilator lint_off IMPORTSTAR */
import ccip_if_pkg::*;
import local_mem_cfg_pkg::*;
/* verilator lint_on IMPORTSTAR */
/* verilator lint_on IMPORTSTAR */
module vortex_afu_shim #(
parameter NUM_LOCAL_MEM_BANKS = 2
) (
`include "VX_define.vh"
module vortex_afu_shim (
// global signals
input clk,
input reset,
@ -69,24 +72,22 @@ module vortex_afu_shim #(
output t_ccip_mmioData af2cp_sTxPort_c2_data,
// Avalon signals for local memory access
output t_local_mem_data avs_writedata,
input t_local_mem_data avs_readdata,
output t_local_mem_addr avs_address,
input logic avs_waitrequest,
output logic avs_write,
output logic avs_read,
output t_local_mem_byte_mask avs_byteenable,
output t_local_mem_burst_cnt avs_burstcount,
input avs_readdatavalid,
output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
output t_local_mem_data avs_writedata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
input t_local_mem_data avs_readdata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output t_local_mem_addr avs_address [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
input logic avs_waitrequest [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output logic avs_write [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output logic avs_read [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output t_local_mem_byte_mask avs_byteenable [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS],
input avs_readdatavalid [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS]
);
t_if_ccip_Rx cp2af_sRxPort;
t_if_ccip_Tx af2cp_sTxPort;
vortex_afu #(
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
.NUM_LOCAL_MEM_BANKS(`PLATFORM_PARAM_LOCAL_MEMORY_BANKS)
) afu (
.clk(clk),
.reset(reset),
@ -100,8 +101,7 @@ vortex_afu #(
.avs_read(avs_read),
.avs_byteenable(avs_byteenable),
.avs_burstcount(avs_burstcount),
.avs_readdatavalid(avs_readdatavalid),
.mem_bank_select(mem_bank_select)
.avs_readdatavalid(avs_readdatavalid)
);
t_if_ccip_c0_RxHdr c0_RxHdr;

View file

@ -1,33 +1,29 @@
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CFLAGS += -fPIC -Wno-aligned-new -Wno-maybe-uninitialized
CFLAGS += -DUSE_RTLSIM -fPIC -Wno-maybe-uninitialized
CFLAGS += -I../../include -I../../../hw/simulate -I../../../hw
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
#DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
#DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
#DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
#DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
#DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
CFLAGS += $(CONFIGS)
CFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
@ -45,10 +41,11 @@ FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += $(CONFIGS)
# Enable Verilator multithreaded simulation
#THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(max(1, mp.cpu_count() // 2))')
@ -69,8 +66,9 @@ ifdef PERF
CFLAGS += -DPERF_ENABLE
endif
# use DPI FPU
VL_FLAGS += -DFPU_DPI
# FPU backend
FPU_CORE ?= FPU_DPI
VL_FLAGS += -D$(FPU_CORE)
PROJECT = libvortex.so
# PROJECT = libvortex.dylib

View file

@ -6,16 +6,13 @@ SIMX_DIR = ../../simX
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -fPIC -Wno-aligned-new -Wno-maybe-uninitialized
CXXFLAGS += -DUSE_SIMX -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR)
CXXFLAGS += -DDUMP_PERF_STATS
#CONFIGS ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
#CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
CONFIGS ?= -DNUM_CLUSTERS=1 -DNUM_CORES=1
CXXFLAGS += $(CONFIGS)
CXXFLAGS += -DDUMP_PERF_STATS
LDFLAGS += -shared -pthread
#LDFLAGS += -dynamiclib -pthread

View file

@ -2,19 +2,23 @@ all:
$(MAKE) -C basic
$(MAKE) -C demo
$(MAKE) -C dogfood
$(MAKE) -C stress
run:
$(MAKE) -C basic run-vlsim
$(MAKE) -C demo run-vlsim
$(MAKE) -C dogfood run-vlsim
$(MAKE) -C stress run-vlsim
clean:
$(MAKE) -C basic clean
$(MAKE) -C demo clean
$(MAKE) -C dogfood clean
$(MAKE) -C stress clean
clean-all:
$(MAKE) -C basic clean-all
$(MAKE) -C demo clean-all
$(MAKE) -C dogfood clean-all
$(MAKE) -C stress clean-all

View file

@ -23,7 +23,7 @@ int test = -1;
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h buffer = nullptr;
vx_buffer_h staging_buf = nullptr;
static void show_usage() {
std::cout << "Vortex Driver Test." << std::endl;
@ -56,8 +56,8 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (buffer) {
vx_buf_release(buffer);
if (staging_buf) {
vx_buf_release(staging_buf);
}
if (device) {
vx_dev_close(device);
@ -77,38 +77,38 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
// update source buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(buffer))[i] = shuffle(i, value);
((uint64_t*)vx_host_ptr(staging_buf))[i] = shuffle(i, value);
}
/*for (int i = 0; i < num_blocks; ++i) {
std::cout << "data[" << i << "]=0x";
for (int j = 7; j >= 0; --j) {
std::cout << std::hex << ((uint64_t*)vx_host_ptr(buffer))[i * 8 +j];
std::cout << std::hex << ((uint64_t*)vx_host_ptr(staging_buf))[i * 8 +j];
}
std::cout << std::endl;
}*/
// write buffer to local memory
std::cout << "write buffer to local memory" << std::endl;
// write source buffer to local memory
std::cout << "write source buffer to local memory" << std::endl;
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(buffer, dev_addr, 64 * num_blocks, 0));
RT_CHECK(vx_copy_to_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(buffer))[i] = 0;
((uint64_t*)vx_host_ptr(staging_buf))[i] = 0;
}
// read buffer from local memory
std::cout << "read buffer from local memory" << std::endl;
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(buffer, dev_addr, 64 * num_blocks, 0));
RT_CHECK(vx_copy_from_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
auto t3 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (int i = 0; i < num_blocks_8; ++i) {
auto curr = ((uint64_t*)vx_host_ptr(buffer))[i];
auto curr = ((uint64_t*)vx_host_ptr(staging_buf))[i];
auto ref = shuffle(i, value);
if (curr != ref) {
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
@ -145,25 +145,25 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
// update source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
}
std::cout << "upload source buffer" << std::endl;
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, buf_size, 0));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
// start device
std::cout << "start execution" << std::endl;
@ -172,17 +172,17 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
RT_CHECK(vx_ready_wait(device, -1));
auto t3 = std::chrono::high_resolution_clock::now();
// read buffer from local memory
std::cout << "read buffer from local memory" << std::endl;
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t4 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
auto t5 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (uint32_t i = 0; i < num_points; ++i) {
int32_t curr = ((int32_t*)vx_host_ptr(buffer))[i];
int32_t curr = ((int32_t*)vx_host_ptr(staging_buf))[i];
int32_t ref = i;
if (curr != ref) {
std::cout << "error at result #" << i
@ -233,8 +233,8 @@ int main(int argc, char *argv[]) {
unsigned max_cores;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
uint32_t num_points = 1 * count;
uint32_t num_blocks = (num_points * sizeof(uint32_t) + 63) / 64;
uint32_t buf_size = num_blocks * 64;
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
uint32_t buf_size = num_blocks * 64;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
@ -253,7 +253,7 @@ int main(int argc, char *argv[]) {
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
// run tests
if (0 == test || -1 == test) {
@ -269,9 +269,9 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (void*)vx_host_ptr(buffer);
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
std::cout << "run kernel test" << std::endl;

View file

@ -20,7 +20,7 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h buffer = nullptr;
vx_buffer_h staging_buf = nullptr;
static void show_usage() {
std::cout << "Vortex Driver Test." << std::endl;
@ -50,8 +50,8 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (buffer) {
vx_buf_release(buffer);
if (staging_buf) {
vx_buf_release(staging_buf);
}
if (device) {
vx_dev_close(device);
@ -71,13 +71,13 @@ int run_test(const kernel_arg_t& kernel_arg,
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
@ -119,7 +119,7 @@ int main(int argc, char *argv[]) {
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(uint32_t);
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
@ -148,45 +148,45 @@ int main(int argc, char *argv[]) {
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(buffer);
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src0_ptr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_ptr, buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src1_ptr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_ptr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View file

@ -0,0 +1,67 @@
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I../../include
PROJECT = stress
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -L../../stub -lvortex -o $@
run-fpga: $(PROJECT)
LD_LIBRARY_PATH=../../opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT)
LD_LIBRARY_PATH=../../opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT)
LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT)
LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -0,0 +1,17 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#define NUM_LOADS 8
struct kernel_arg_t {
uint32_t num_tasks;
uint32_t size;
uint32_t stride;
uint32_t addr_ptr;
uint32_t src_ptr;
uint32_t dst_ptr;
};
#endif

BIN
driver/tests/stress/kernel.bin Executable file

Binary file not shown.

View file

@ -0,0 +1,29 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, void* arg) {
struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg);
uint32_t stride = _arg->stride;
uint32_t* addr_ptr = (uint32_t*)_arg->addr_ptr;
float* src_ptr = (float*)_arg->src_ptr;
float* dst_ptr = (float*)_arg->dst_ptr;
uint32_t offset = task_id * stride;
for (uint32_t i = 0; i < stride; ++i) {
float value = 0.0f;
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
uint32_t addr = offset + i + j;
uint32_t index = addr_ptr[addr];
value *= src_ptr[index];
}
dst_ptr[offset+i] = value;
}
}
void main() {
struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, kernel_body, arg);
}

View file

@ -0,0 +1,596 @@
kernel.elf: file format elf32-littleriscv
Disassembly of section .init:
80000000 <_start>:
80000000: 00000597 auipc a1,0x0
80000004: 17058593 addi a1,a1,368 # 80000170 <vx_set_sp>
80000008: fc102573 csrr a0,0xfc1
8000000c: 00b5106b 0xb5106b
80000010: 160000ef jal ra,80000170 <vx_set_sp>
80000014: 00100513 li a0,1
80000018: 0005006b 0x5006b
8000001c: 00002517 auipc a0,0x2
80000020: ba850513 addi a0,a0,-1112 # 80001bc4 <g_wspawn_args>
80000024: 00002617 auipc a2,0x2
80000028: c2060613 addi a2,a2,-992 # 80001c44 <__BSS_END__>
8000002c: 40a60633 sub a2,a2,a0
80000030: 00000593 li a1,0
80000034: 4c0000ef jal ra,800004f4 <memset>
80000038: 00000517 auipc a0,0x0
8000003c: 3c450513 addi a0,a0,964 # 800003fc <__libc_fini_array>
80000040: 374000ef jal ra,800003b4 <atexit>
80000044: 414000ef jal ra,80000458 <__libc_init_array>
80000048: 008000ef jal ra,80000050 <main>
8000004c: 37c0006f j 800003c8 <exit>
Disassembly of section .text:
80000050 <main>:
80000050: 7ffff7b7 lui a5,0x7ffff
80000054: 0007a503 lw a0,0(a5) # 7ffff000 <__stack_size+0x7fffec00>
80000058: 800005b7 lui a1,0x80000
8000005c: 7ffff637 lui a2,0x7ffff
80000060: 08058593 addi a1,a1,128 # 80000080 <__stack_top+0x81000080>
80000064: 2080006f j 8000026c <vx_spawn_tasks>
80000068 <register_fini>:
80000068: 00000793 li a5,0
8000006c: 00078863 beqz a5,8000007c <register_fini+0x14>
80000070: 80000537 lui a0,0x80000
80000074: 3fc50513 addi a0,a0,1020 # 800003fc <__stack_top+0x810003fc>
80000078: 33c0006f j 800003b4 <atexit>
8000007c: 00008067 ret
80000080 <kernel_body>:
80000080: 0085a783 lw a5,8(a1)
80000084: 00c5a603 lw a2,12(a1)
80000088: 0105a703 lw a4,16(a1)
8000008c: 02f506b3 mul a3,a0,a5
80000090: 0145a883 lw a7,20(a1)
80000094: 0c078863 beqz a5,80000164 <kernel_body+0xe4>
80000098: 00d78833 add a6,a5,a3
8000009c: f0000653 fmv.w.x fa2,zero
800000a0: 00269693 slli a3,a3,0x2
800000a4: 00281813 slli a6,a6,0x2
800000a8: 00c686b3 add a3,a3,a2
800000ac: 00c80833 add a6,a6,a2
800000b0: 40c888b3 sub a7,a7,a2
800000b4: 0006a583 lw a1,0(a3)
800000b8: 0086a603 lw a2,8(a3)
800000bc: 00c6a503 lw a0,12(a3)
800000c0: 00259593 slli a1,a1,0x2
800000c4: 00b705b3 add a1,a4,a1
800000c8: 0005a787 flw fa5,0(a1)
800000cc: 0046a583 lw a1,4(a3)
800000d0: 00261613 slli a2,a2,0x2
800000d4: 10f677d3 fmul.s fa5,fa2,fa5
800000d8: 00259593 slli a1,a1,0x2
800000dc: 00b705b3 add a1,a4,a1
800000e0: 0005a687 flw fa3,0(a1)
800000e4: 00c70633 add a2,a4,a2
800000e8: 00062707 flw fa4,0(a2) # 7ffff000 <__stack_size+0x7fffec00>
800000ec: 10d7f7d3 fmul.s fa5,fa5,fa3
800000f0: 00251513 slli a0,a0,0x2
800000f4: 00a70533 add a0,a4,a0
800000f8: 0106a583 lw a1,16(a3)
800000fc: 0146a603 lw a2,20(a3)
80000100: 10e7f7d3 fmul.s fa5,fa5,fa4
80000104: 00052707 flw fa4,0(a0)
80000108: 00259593 slli a1,a1,0x2
8000010c: 00b705b3 add a1,a4,a1
80000110: 0005a687 flw fa3,0(a1)
80000114: 10e7f7d3 fmul.s fa5,fa5,fa4
80000118: 00261613 slli a2,a2,0x2
8000011c: 00c70633 add a2,a4,a2
80000120: 00062707 flw fa4,0(a2)
80000124: 0186a583 lw a1,24(a3)
80000128: 10d7f7d3 fmul.s fa5,fa5,fa3
8000012c: 01c6a603 lw a2,28(a3)
80000130: 00259593 slli a1,a1,0x2
80000134: 00b705b3 add a1,a4,a1
80000138: 00261613 slli a2,a2,0x2
8000013c: 10e7f7d3 fmul.s fa5,fa5,fa4
80000140: 0005a707 flw fa4,0(a1)
80000144: 00c70633 add a2,a4,a2
80000148: 00d887b3 add a5,a7,a3
8000014c: 00468693 addi a3,a3,4
80000150: 10e7f7d3 fmul.s fa5,fa5,fa4
80000154: 00062707 flw fa4,0(a2)
80000158: 10f777d3 fmul.s fa5,fa4,fa5
8000015c: 00f7a027 fsw fa5,0(a5)
80000160: f4d81ae3 bne a6,a3,800000b4 <kernel_body+0x34>
80000164: 00008067 ret
80000168 <_exit>:
80000168: 00000513 li a0,0
8000016c: 0005006b 0x5006b
80000170 <vx_set_sp>:
80000170: fc002573 csrr a0,0xfc0
80000174: 0005006b 0x5006b
80000178: 00002197 auipc gp,0x2
8000017c: e2018193 addi gp,gp,-480 # 80001f98 <__global_pointer>
80000180: 7f000117 auipc sp,0x7f000
80000184: e8010113 addi sp,sp,-384 # ff000000 <__stack_top>
80000188: 40000593 li a1,1024
8000018c: cc102673 csrr a2,0xcc1
80000190: 02c585b3 mul a1,a1,a2
80000194: 40b10133 sub sp,sp,a1
80000198: cc3026f3 csrr a3,0xcc3
8000019c: 00068663 beqz a3,800001a8 <RETURN>
800001a0: 00000513 li a0,0
800001a4: 0005006b 0x5006b
800001a8 <RETURN>:
800001a8: 00008067 ret
800001ac <spawn_tasks_callback>:
800001ac: fe010113 addi sp,sp,-32
800001b0: 00112e23 sw ra,28(sp)
800001b4: 00812c23 sw s0,24(sp)
800001b8: 00912a23 sw s1,20(sp)
800001bc: 01212823 sw s2,16(sp)
800001c0: 01312623 sw s3,12(sp)
800001c4: fc0027f3 csrr a5,0xfc0
800001c8: 0007806b 0x7806b
800001cc: cc5027f3 csrr a5,0xcc5
800001d0: cc3029f3 csrr s3,0xcc3
800001d4: cc002773 csrr a4,0xcc0
800001d8: fc002673 csrr a2,0xfc0
800001dc: 00279693 slli a3,a5,0x2
800001e0: 800027b7 lui a5,0x80002
800001e4: bc478793 addi a5,a5,-1084 # 80001bc4 <__stack_top+0x81001bc4>
800001e8: 00d787b3 add a5,a5,a3
800001ec: 0007a483 lw s1,0(a5)
800001f0: 0104a403 lw s0,16(s1)
800001f4: 00c4a683 lw a3,12(s1)
800001f8: 0089a933 slt s2,s3,s0
800001fc: 00040793 mv a5,s0
80000200: 00d90933 add s2,s2,a3
80000204: 03368433 mul s0,a3,s3
80000208: 00f9d463 bge s3,a5,80000210 <spawn_tasks_callback+0x64>
8000020c: 00098793 mv a5,s3
80000210: 00f40433 add s0,s0,a5
80000214: 0084a683 lw a3,8(s1)
80000218: 02c40433 mul s0,s0,a2
8000021c: 02e907b3 mul a5,s2,a4
80000220: 00d40433 add s0,s0,a3
80000224: 00f40433 add s0,s0,a5
80000228: 00890933 add s2,s2,s0
8000022c: 01245e63 bge s0,s2,80000248 <spawn_tasks_callback+0x9c>
80000230: 0004a783 lw a5,0(s1)
80000234: 0044a583 lw a1,4(s1)
80000238: 00040513 mv a0,s0
8000023c: 00140413 addi s0,s0,1
80000240: 000780e7 jalr a5
80000244: fe8916e3 bne s2,s0,80000230 <spawn_tasks_callback+0x84>
80000248: 0019b993 seqz s3,s3
8000024c: 0009806b 0x9806b
80000250: 01c12083 lw ra,28(sp)
80000254: 01812403 lw s0,24(sp)
80000258: 01412483 lw s1,20(sp)
8000025c: 01012903 lw s2,16(sp)
80000260: 00c12983 lw s3,12(sp)
80000264: 02010113 addi sp,sp,32
80000268: 00008067 ret
8000026c <vx_spawn_tasks>:
8000026c: fc010113 addi sp,sp,-64
80000270: 02112e23 sw ra,60(sp)
80000274: 02812c23 sw s0,56(sp)
80000278: 02912a23 sw s1,52(sp)
8000027c: 03212823 sw s2,48(sp)
80000280: 03312623 sw s3,44(sp)
80000284: fc2026f3 csrr a3,0xfc2
80000288: fc102873 csrr a6,0xfc1
8000028c: fc002473 csrr s0,0xfc0
80000290: cc5027f3 csrr a5,0xcc5
80000294: 01f00713 li a4,31
80000298: 0cf74463 blt a4,a5,80000360 <vx_spawn_tasks+0xf4>
8000029c: 030408b3 mul a7,s0,a6
800002a0: 00100713 li a4,1
800002a4: 00a8d463 bge a7,a0,800002ac <vx_spawn_tasks+0x40>
800002a8: 03154733 div a4,a0,a7
800002ac: 0ce6c863 blt a3,a4,8000037c <vx_spawn_tasks+0x110>
800002b0: 0ae7d863 bge a5,a4,80000360 <vx_spawn_tasks+0xf4>
800002b4: fff68693 addi a3,a3,-1
800002b8: 02e54333 div t1,a0,a4
800002bc: 00030893 mv a7,t1
800002c0: 00f69663 bne a3,a5,800002cc <vx_spawn_tasks+0x60>
800002c4: 02e56533 rem a0,a0,a4
800002c8: 006508b3 add a7,a0,t1
800002cc: 0288c4b3 div s1,a7,s0
800002d0: 0288e933 rem s2,a7,s0
800002d4: 0b04ca63 blt s1,a6,80000388 <vx_spawn_tasks+0x11c>
800002d8: 00100693 li a3,1
800002dc: 0304c733 div a4,s1,a6
800002e0: 00070663 beqz a4,800002ec <vx_spawn_tasks+0x80>
800002e4: 00070693 mv a3,a4
800002e8: 0304e733 rem a4,s1,a6
800002ec: 800029b7 lui s3,0x80002
800002f0: bc498993 addi s3,s3,-1084 # 80001bc4 <__stack_top+0x81001bc4>
800002f4: 00e12e23 sw a4,28(sp)
800002f8: 00c10713 addi a4,sp,12
800002fc: 00b12623 sw a1,12(sp)
80000300: 00c12823 sw a2,16(sp)
80000304: 00d12c23 sw a3,24(sp)
80000308: 02f30333 mul t1,t1,a5
8000030c: 00279793 slli a5,a5,0x2
80000310: 00f987b3 add a5,s3,a5
80000314: 00e7a023 sw a4,0(a5)
80000318: 00612a23 sw t1,20(sp)
8000031c: 06904c63 bgtz s1,80000394 <vx_spawn_tasks+0x128>
80000320: 04090063 beqz s2,80000360 <vx_spawn_tasks+0xf4>
80000324: 02848433 mul s0,s1,s0
80000328: 00812a23 sw s0,20(sp)
8000032c: 0009006b 0x9006b
80000330: cc5027f3 csrr a5,0xcc5
80000334: cc202573 csrr a0,0xcc2
80000338: 00279793 slli a5,a5,0x2
8000033c: 00f989b3 add s3,s3,a5
80000340: 0009a783 lw a5,0(s3)
80000344: 0087a683 lw a3,8(a5)
80000348: 0007a703 lw a4,0(a5)
8000034c: 0047a583 lw a1,4(a5)
80000350: 00d50533 add a0,a0,a3
80000354: 000700e7 jalr a4
80000358: 00100793 li a5,1
8000035c: 0007806b 0x7806b
80000360: 03c12083 lw ra,60(sp)
80000364: 03812403 lw s0,56(sp)
80000368: 03412483 lw s1,52(sp)
8000036c: 03012903 lw s2,48(sp)
80000370: 02c12983 lw s3,44(sp)
80000374: 04010113 addi sp,sp,64
80000378: 00008067 ret
8000037c: 00068713 mv a4,a3
80000380: f2e7cae3 blt a5,a4,800002b4 <vx_spawn_tasks+0x48>
80000384: fddff06f j 80000360 <vx_spawn_tasks+0xf4>
80000388: 00000713 li a4,0
8000038c: 00100693 li a3,1
80000390: f5dff06f j 800002ec <vx_spawn_tasks+0x80>
80000394: 00048713 mv a4,s1
80000398: 00985463 bge a6,s1,800003a0 <vx_spawn_tasks+0x134>
8000039c: 00080713 mv a4,a6
800003a0: 800007b7 lui a5,0x80000
800003a4: 1ac78793 addi a5,a5,428 # 800001ac <__stack_top+0x810001ac>
800003a8: 00f7106b 0xf7106b
800003ac: e01ff0ef jal ra,800001ac <spawn_tasks_callback>
800003b0: f71ff06f j 80000320 <vx_spawn_tasks+0xb4>
800003b4 <atexit>:
800003b4: 00050593 mv a1,a0
800003b8: 00000693 li a3,0
800003bc: 00000613 li a2,0
800003c0: 00000513 li a0,0
800003c4: 20c0006f j 800005d0 <__register_exitproc>
800003c8 <exit>:
800003c8: ff010113 addi sp,sp,-16
800003cc: 00000593 li a1,0
800003d0: 00812423 sw s0,8(sp)
800003d4: 00112623 sw ra,12(sp)
800003d8: 00050413 mv s0,a0
800003dc: 290000ef jal ra,8000066c <__call_exitprocs>
800003e0: 800027b7 lui a5,0x80002
800003e4: bc07a503 lw a0,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0>
800003e8: 03c52783 lw a5,60(a0)
800003ec: 00078463 beqz a5,800003f4 <exit+0x2c>
800003f0: 000780e7 jalr a5
800003f4: 00040513 mv a0,s0
800003f8: d71ff0ef jal ra,80000168 <_exit>
800003fc <__libc_fini_array>:
800003fc: ff010113 addi sp,sp,-16
80000400: 00812423 sw s0,8(sp)
80000404: 800017b7 lui a5,0x80001
80000408: 80001437 lui s0,0x80001
8000040c: 79440413 addi s0,s0,1940 # 80001794 <__stack_top+0x81001794>
80000410: 79478793 addi a5,a5,1940 # 80001794 <__stack_top+0x81001794>
80000414: 408787b3 sub a5,a5,s0
80000418: 00912223 sw s1,4(sp)
8000041c: 00112623 sw ra,12(sp)
80000420: 4027d493 srai s1,a5,0x2
80000424: 02048063 beqz s1,80000444 <__libc_fini_array+0x48>
80000428: ffc78793 addi a5,a5,-4
8000042c: 00878433 add s0,a5,s0
80000430: 00042783 lw a5,0(s0)
80000434: fff48493 addi s1,s1,-1
80000438: ffc40413 addi s0,s0,-4
8000043c: 000780e7 jalr a5
80000440: fe0498e3 bnez s1,80000430 <__libc_fini_array+0x34>
80000444: 00c12083 lw ra,12(sp)
80000448: 00812403 lw s0,8(sp)
8000044c: 00412483 lw s1,4(sp)
80000450: 01010113 addi sp,sp,16
80000454: 00008067 ret
80000458 <__libc_init_array>:
80000458: ff010113 addi sp,sp,-16
8000045c: 00812423 sw s0,8(sp)
80000460: 01212023 sw s2,0(sp)
80000464: 80001437 lui s0,0x80001
80000468: 80001937 lui s2,0x80001
8000046c: 79040793 addi a5,s0,1936 # 80001790 <__stack_top+0x81001790>
80000470: 79090913 addi s2,s2,1936 # 80001790 <__stack_top+0x81001790>
80000474: 40f90933 sub s2,s2,a5
80000478: 00112623 sw ra,12(sp)
8000047c: 00912223 sw s1,4(sp)
80000480: 40295913 srai s2,s2,0x2
80000484: 02090063 beqz s2,800004a4 <__libc_init_array+0x4c>
80000488: 79040413 addi s0,s0,1936
8000048c: 00000493 li s1,0
80000490: 00042783 lw a5,0(s0)
80000494: 00148493 addi s1,s1,1
80000498: 00440413 addi s0,s0,4
8000049c: 000780e7 jalr a5
800004a0: fe9918e3 bne s2,s1,80000490 <__libc_init_array+0x38>
800004a4: 80001437 lui s0,0x80001
800004a8: 80001937 lui s2,0x80001
800004ac: 79040793 addi a5,s0,1936 # 80001790 <__stack_top+0x81001790>
800004b0: 79490913 addi s2,s2,1940 # 80001794 <__stack_top+0x81001794>
800004b4: 40f90933 sub s2,s2,a5
800004b8: 40295913 srai s2,s2,0x2
800004bc: 02090063 beqz s2,800004dc <__libc_init_array+0x84>
800004c0: 79040413 addi s0,s0,1936
800004c4: 00000493 li s1,0
800004c8: 00042783 lw a5,0(s0)
800004cc: 00148493 addi s1,s1,1
800004d0: 00440413 addi s0,s0,4
800004d4: 000780e7 jalr a5
800004d8: fe9918e3 bne s2,s1,800004c8 <__libc_init_array+0x70>
800004dc: 00c12083 lw ra,12(sp)
800004e0: 00812403 lw s0,8(sp)
800004e4: 00412483 lw s1,4(sp)
800004e8: 00012903 lw s2,0(sp)
800004ec: 01010113 addi sp,sp,16
800004f0: 00008067 ret
800004f4 <memset>:
800004f4: 00f00313 li t1,15
800004f8: 00050713 mv a4,a0
800004fc: 02c37e63 bgeu t1,a2,80000538 <memset+0x44>
80000500: 00f77793 andi a5,a4,15
80000504: 0a079063 bnez a5,800005a4 <memset+0xb0>
80000508: 08059263 bnez a1,8000058c <memset+0x98>
8000050c: ff067693 andi a3,a2,-16
80000510: 00f67613 andi a2,a2,15
80000514: 00e686b3 add a3,a3,a4
80000518: 00b72023 sw a1,0(a4)
8000051c: 00b72223 sw a1,4(a4)
80000520: 00b72423 sw a1,8(a4)
80000524: 00b72623 sw a1,12(a4)
80000528: 01070713 addi a4,a4,16
8000052c: fed766e3 bltu a4,a3,80000518 <memset+0x24>
80000530: 00061463 bnez a2,80000538 <memset+0x44>
80000534: 00008067 ret
80000538: 40c306b3 sub a3,t1,a2
8000053c: 00269693 slli a3,a3,0x2
80000540: 00000297 auipc t0,0x0
80000544: 005686b3 add a3,a3,t0
80000548: 00c68067 jr 12(a3)
8000054c: 00b70723 sb a1,14(a4)
80000550: 00b706a3 sb a1,13(a4)
80000554: 00b70623 sb a1,12(a4)
80000558: 00b705a3 sb a1,11(a4)
8000055c: 00b70523 sb a1,10(a4)
80000560: 00b704a3 sb a1,9(a4)
80000564: 00b70423 sb a1,8(a4)
80000568: 00b703a3 sb a1,7(a4)
8000056c: 00b70323 sb a1,6(a4)
80000570: 00b702a3 sb a1,5(a4)
80000574: 00b70223 sb a1,4(a4)
80000578: 00b701a3 sb a1,3(a4)
8000057c: 00b70123 sb a1,2(a4)
80000580: 00b700a3 sb a1,1(a4)
80000584: 00b70023 sb a1,0(a4)
80000588: 00008067 ret
8000058c: 0ff5f593 andi a1,a1,255
80000590: 00859693 slli a3,a1,0x8
80000594: 00d5e5b3 or a1,a1,a3
80000598: 01059693 slli a3,a1,0x10
8000059c: 00d5e5b3 or a1,a1,a3
800005a0: f6dff06f j 8000050c <memset+0x18>
800005a4: 00279693 slli a3,a5,0x2
800005a8: 00000297 auipc t0,0x0
800005ac: 005686b3 add a3,a3,t0
800005b0: 00008293 mv t0,ra
800005b4: fa0680e7 jalr -96(a3)
800005b8: 00028093 mv ra,t0
800005bc: ff078793 addi a5,a5,-16
800005c0: 40f70733 sub a4,a4,a5
800005c4: 00f60633 add a2,a2,a5
800005c8: f6c378e3 bgeu t1,a2,80000538 <memset+0x44>
800005cc: f3dff06f j 80000508 <memset+0x14>
800005d0 <__register_exitproc>:
800005d0: 800027b7 lui a5,0x80002
800005d4: bc07a703 lw a4,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0>
800005d8: 14872783 lw a5,328(a4)
800005dc: 04078c63 beqz a5,80000634 <__register_exitproc+0x64>
800005e0: 0047a703 lw a4,4(a5)
800005e4: 01f00813 li a6,31
800005e8: 06e84e63 blt a6,a4,80000664 <__register_exitproc+0x94>
800005ec: 00271813 slli a6,a4,0x2
800005f0: 02050663 beqz a0,8000061c <__register_exitproc+0x4c>
800005f4: 01078333 add t1,a5,a6
800005f8: 08c32423 sw a2,136(t1)
800005fc: 1887a883 lw a7,392(a5)
80000600: 00100613 li a2,1
80000604: 00e61633 sll a2,a2,a4
80000608: 00c8e8b3 or a7,a7,a2
8000060c: 1917a423 sw a7,392(a5)
80000610: 10d32423 sw a3,264(t1)
80000614: 00200693 li a3,2
80000618: 02d50463 beq a0,a3,80000640 <__register_exitproc+0x70>
8000061c: 00170713 addi a4,a4,1
80000620: 00e7a223 sw a4,4(a5)
80000624: 010787b3 add a5,a5,a6
80000628: 00b7a423 sw a1,8(a5)
8000062c: 00000513 li a0,0
80000630: 00008067 ret
80000634: 14c70793 addi a5,a4,332
80000638: 14f72423 sw a5,328(a4)
8000063c: fa5ff06f j 800005e0 <__register_exitproc+0x10>
80000640: 18c7a683 lw a3,396(a5)
80000644: 00170713 addi a4,a4,1
80000648: 00e7a223 sw a4,4(a5)
8000064c: 00c6e633 or a2,a3,a2
80000650: 18c7a623 sw a2,396(a5)
80000654: 010787b3 add a5,a5,a6
80000658: 00b7a423 sw a1,8(a5)
8000065c: 00000513 li a0,0
80000660: 00008067 ret
80000664: fff00513 li a0,-1
80000668: 00008067 ret
8000066c <__call_exitprocs>:
8000066c: fd010113 addi sp,sp,-48
80000670: 800027b7 lui a5,0x80002
80000674: 01412c23 sw s4,24(sp)
80000678: bc07aa03 lw s4,-1088(a5) # 80001bc0 <__stack_top+0x81001bc0>
8000067c: 03212023 sw s2,32(sp)
80000680: 02112623 sw ra,44(sp)
80000684: 148a2903 lw s2,328(s4)
80000688: 02812423 sw s0,40(sp)
8000068c: 02912223 sw s1,36(sp)
80000690: 01312e23 sw s3,28(sp)
80000694: 01512a23 sw s5,20(sp)
80000698: 01612823 sw s6,16(sp)
8000069c: 01712623 sw s7,12(sp)
800006a0: 01812423 sw s8,8(sp)
800006a4: 04090063 beqz s2,800006e4 <__call_exitprocs+0x78>
800006a8: 00050b13 mv s6,a0
800006ac: 00058b93 mv s7,a1
800006b0: 00100a93 li s5,1
800006b4: fff00993 li s3,-1
800006b8: 00492483 lw s1,4(s2)
800006bc: fff48413 addi s0,s1,-1
800006c0: 02044263 bltz s0,800006e4 <__call_exitprocs+0x78>
800006c4: 00249493 slli s1,s1,0x2
800006c8: 009904b3 add s1,s2,s1
800006cc: 040b8463 beqz s7,80000714 <__call_exitprocs+0xa8>
800006d0: 1044a783 lw a5,260(s1)
800006d4: 05778063 beq a5,s7,80000714 <__call_exitprocs+0xa8>
800006d8: fff40413 addi s0,s0,-1
800006dc: ffc48493 addi s1,s1,-4
800006e0: ff3416e3 bne s0,s3,800006cc <__call_exitprocs+0x60>
800006e4: 02c12083 lw ra,44(sp)
800006e8: 02812403 lw s0,40(sp)
800006ec: 02412483 lw s1,36(sp)
800006f0: 02012903 lw s2,32(sp)
800006f4: 01c12983 lw s3,28(sp)
800006f8: 01812a03 lw s4,24(sp)
800006fc: 01412a83 lw s5,20(sp)
80000700: 01012b03 lw s6,16(sp)
80000704: 00c12b83 lw s7,12(sp)
80000708: 00812c03 lw s8,8(sp)
8000070c: 03010113 addi sp,sp,48
80000710: 00008067 ret
80000714: 00492783 lw a5,4(s2)
80000718: 0044a683 lw a3,4(s1)
8000071c: fff78793 addi a5,a5,-1
80000720: 04878e63 beq a5,s0,8000077c <__call_exitprocs+0x110>
80000724: 0004a223 sw zero,4(s1)
80000728: fa0688e3 beqz a3,800006d8 <__call_exitprocs+0x6c>
8000072c: 18892783 lw a5,392(s2)
80000730: 008a9733 sll a4,s5,s0
80000734: 00492c03 lw s8,4(s2)
80000738: 00f777b3 and a5,a4,a5
8000073c: 02079263 bnez a5,80000760 <__call_exitprocs+0xf4>
80000740: 000680e7 jalr a3
80000744: 00492703 lw a4,4(s2)
80000748: 148a2783 lw a5,328(s4)
8000074c: 01871463 bne a4,s8,80000754 <__call_exitprocs+0xe8>
80000750: f92784e3 beq a5,s2,800006d8 <__call_exitprocs+0x6c>
80000754: f80788e3 beqz a5,800006e4 <__call_exitprocs+0x78>
80000758: 00078913 mv s2,a5
8000075c: f5dff06f j 800006b8 <__call_exitprocs+0x4c>
80000760: 18c92783 lw a5,396(s2)
80000764: 0844a583 lw a1,132(s1)
80000768: 00f77733 and a4,a4,a5
8000076c: 00071c63 bnez a4,80000784 <__call_exitprocs+0x118>
80000770: 000b0513 mv a0,s6
80000774: 000680e7 jalr a3
80000778: fcdff06f j 80000744 <__call_exitprocs+0xd8>
8000077c: 00892223 sw s0,4(s2)
80000780: fa9ff06f j 80000728 <__call_exitprocs+0xbc>
80000784: 00058513 mv a0,a1
80000788: 000680e7 jalr a3
8000078c: fb9ff06f j 80000744 <__call_exitprocs+0xd8>
Disassembly of section .init_array:
80001790 <__init_array_start>:
80001790: 0068 addi a0,sp,12
80001792: 8000 0x8000
Disassembly of section .data:
80001798 <impure_data>:
80001798: 0000 unimp
8000179a: 0000 unimp
8000179c: 1a84 addi s1,sp,368
8000179e: 8000 0x8000
800017a0: 1aec addi a1,sp,380
800017a2: 8000 0x8000
800017a4: 1b54 addi a3,sp,436
800017a6: 8000 0x8000
...
80001840: 0001 nop
80001842: 0000 unimp
80001844: 0000 unimp
80001846: 0000 unimp
80001848: 330e fld ft6,224(sp)
8000184a: abcd j 80001e3c <__BSS_END__+0x1f8>
8000184c: 1234 addi a3,sp,296
8000184e: e66d bnez a2,80001938 <impure_data+0x1a0>
80001850: deec sw a1,124(a3)
80001852: 0005 c.nop 1
80001854: 0000000b 0xb
...
Disassembly of section .sdata:
80001bc0 <_global_impure_ptr>:
80001bc0: 1798 addi a4,sp,992
80001bc2: 8000 0x8000
Disassembly of section .bss:
80001bc4 <g_wspawn_args>:
...
Disassembly of section .comment:
00000000 <.comment>:
0: 3a434347 fmsub.d ft6,ft6,ft4,ft7,rmm
4: 2820 fld fs0,80(s0)
6: 29554e47 fmsub.s ft8,fa0,fs5,ft5,rmm
a: 3120 fld fs0,96(a0)
c: 2e30 fld fa2,88(a2)
e: 2e32 fld ft8,264(sp)
10: 0030 addi a2,sp,8
Disassembly of section .riscv.attributes:
00000000 <.riscv.attributes>:
0: 2941 jal 490 <__stack_size+0x90>
2: 0000 unimp
4: 7200 flw fs0,32(a2)
6: 7369 lui t1,0xffffa
8: 01007663 bgeu zero,a6,14 <__stack_usage+0x14>
c: 001f 0000 1004 0x10040000001f
12: 7205 lui tp,0xfffe1
14: 3376 fld ft6,376(sp)
16: 6932 flw fs2,12(sp)
18: 7032 flw ft0,44(sp)
1a: 5f30 lw a2,120(a4)
1c: 326d jal fffff9c6 <__stack_top+0xfff9c6>
1e: 3070 fld fa2,224(s0)
20: 665f 7032 0030 0x307032665f
26: 0108 addi a0,sp,128
28: 0b0a slli s6,s6,0x2

BIN
driver/tests/stress/kernel.elf Executable file

Binary file not shown.

View file

@ -0,0 +1,293 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include "common.h"
#include <assert.h>
#include <limits>
#include <math.h>
#include <vector>
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
union Float_t {
float f;
int i;
struct {
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};
inline float fround(float x, int32_t precision = 8) {
auto power_of_10 = std::pow(10, precision);
return std::round(x * power_of_10) / power_of_10;
}
inline bool almost_equal_eps(float a, float b, int ulp = 128) {
auto eps = std::numeric_limits<float>::epsilon() * (std::max(fabs(a), fabs(b)) * ulp);
auto d = fabs(a - b);
if (d > eps) {
std::cout << "*** almost_equal_eps: d=" << d << ", eps=" << eps << std::endl;
return false;
}
return true;
}
inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) {
Float_t fa{a}, fb{b};
auto d = std::abs(fa.i - fb.i);
if (d > ulp) {
std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
return false;
}
return true;
}
inline bool almost_equal(float a, float b) {
if (a == b)
return true;
/*if (almost_equal_eps(a, b))
return true;*/
return almost_equal_ulp(a, b);
}
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> test_data;
std::vector<uint32_t> addr_table;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
static void show_usage() {
std::cout << "Vortex Driver Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (staging_buf) {
vx_buf_release(staging_buf);
}
if (device) {
vx_dev_close(device);
}
}
void gen_input_data(uint32_t num_points) {
test_data.resize(num_points);
addr_table.resize(num_points + NUM_LOADS - 1);
for (uint32_t i = 0; i < test_data.size(); ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
test_data[i] = r;
}
for (uint32_t i = 0; i < addr_table.size(); ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
uint32_t index = static_cast<uint32_t>(r * num_points);
assert(index < num_points);
addr_table[i] = index;
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t dst_buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, -1));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, dst_buf_size, 0));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
float ref = 0.0f;
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
uint32_t addr = i + j;
uint32_t index = addr_table.at(addr);
float value = test_data.at(index);
//printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value);
ref *= value;
}
float cur = buf_ptr[i];
if (!almost_equal(cur, ref)) {
std::cout << "error at result #" << std::dec << i
<< ": actual " << cur << ", expected " << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
kernel_arg_t kernel_arg;
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
unsigned max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_points = count * num_tasks;
// generate input data
gen_input_data(num_points);
uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t);
uint32_t src_buf_size = test_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = test_data.size() * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_alloc_dev_mem(device, addr_buf_size, &value));
kernel_arg.addr_ptr = value;
RT_CHECK(vx_alloc_dev_mem(device, src_buf_size, &value));
kernel_arg.src_ptr = value;
RT_CHECK(vx_alloc_dev_mem(device, dst_buf_size, &value));
kernel_arg.dst_ptr = value;
kernel_arg.num_tasks = num_tasks;
kernel_arg.stride = count;
std::cout << "dev_addr=" << std::hex << kernel_arg.addr_ptr << std::endl;
std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
RT_CHECK(vx_alloc_shared_mem(device, staging_buf_size, &staging_buf));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < addr_table.size(); ++i) {
buf_ptr[i] = addr_table.at(i);
}
}
std::cout << "upload address buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.addr_ptr, addr_buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = test_data.at(i);
}
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View file

@ -5,19 +5,16 @@ Description: Makes the build in the opae directory with the specified core
exists, a make clean command is ran before the build. Script waits
until the inteldev script or quartus program is finished running.
Usage: ./build.sh -c [1|2|4|8|16] [-p perf] [-w wait]
Usage: ./build.sh -c [1|2|4|8|16] [-p [y|n]]
Options:
-c
Core count (1, 2, 4, 8, or 16).
-p
Performance profiling enable. Changes the source file in the
Performance profiling enable (y or n). Changes the source file in the
opae directory to include/exclude "+define+PERF_ENABLE".
-w
Wait for the build to complete
_______________________________________________________________________________
@ -27,6 +24,7 @@ Description: Runs build.sh with performance profiling enabled for all valid
core configurations.
_______________________________________________________________________________
_______________________________________________________________________________
-program_fpga.sh-
@ -41,6 +39,7 @@ Options:
Core count (1, 2, 4, 8, or 16).
_______________________________________________________________________________
_______________________________________________________________________________
-gather_perf_results.sh-
@ -65,3 +64,53 @@ _______________________________________________________________________________
Description: Programs fpga and runs gather_perf_results.sh for all valid core
configurations. All builds should already be made before running
this.
_______________________________________________________________________________
_______________________________________________________________________________
-export_csv.sh-
Description: Creates specified .csv output file from an input directory, file,
and parameter. The .csv file contains two columns: cores, and the input
parameter. The output file is located within the directory specified with -d.
Usage: ./export_csv.sh -c [cores] -d [directory] -i [input filename] -o
[output filename] -p '[parameter]'
Example: ./export_csv.sh -c 16 -d perf_2021_03_07 -i sgemm.result -o output.csv
-p 'PERF: scoreboard stalls'
Options:
-c
Upper limit of cores to be read in. Core directories should exist in
the directory specified by -d e.g. 1c, 2c, 4c for -c 4.
-d
The directory of the form perf_{date} located in the evaluation
directory.
-i
The input filename located in each core directory within the
directory specified by -d.
-o
The output filename to be created within the directory specified
by -d.
-p
The parameter corresponding to the core count in the .csv file. The
full name of the parameter from the start of the line should be
inputted to avoid the parameter name being matched multiple times.
_______________________________________________________________________________
-export_ipc_csv.sh-
Description: Runs export_csv.sh for the parameter IPC.
Usage: ./export_csv.sh -c [cores] -d [directory] -i [input filename] -o
[output filename]
Example: ./export_ipc.sh -c 16 -d perf_2021_03_07 -i sgemm.result -o output.csv

View file

@ -28,26 +28,15 @@ fi
cd ${BUILD_DIR}
sources_file="./sources_${cores}c.txt"
if [ ${perf} = 1 ]; then
if grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then
sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file}
elif ! grep -Fxq '+define+PERF_ENABLE' ${sources_file}; then
sed -i '1s/^/+define+PERF_ENABLE\n/' ${sources_file}
fi
else
if grep -v '^ *#' ${sources_file} | grep -Fxq '+define+SYNTHESIS'; then
sed -i 's/+define+PERF_ENABLE/#+define+PERF_ENABLE/' ${sources_file}
elif ! grep -Fxq '#+define+PERF_ENABLE' ${sources_file}; then
sed -i '1s/^/#+define+PERF_ENABLE\n/' ${sources_file}
fi
fi
if [ -d "./build_fpga_{$cores}c" ]; then
make "clean-fpga-${cores}c"
fi
make "fpga-${cores}c"
if [ ${perf} = 1 ]; then
PERF=1 make "fpga-${cores}c"
else
make "fpga-${cores}c"
fi
if [ ${wait} = 1 ]; then
sleep 30

View file

@ -0,0 +1,33 @@
#!/bin/bash
while getopts c:d:i:o:p: flag
do
case "${flag}" in
c) cores=${OPTARG};; #1, 2, 4, 8, 16
d) dir=${OPTARG};; #directory name (e.g. perf_2021_03_07)
i) ifile=${OPTARG};; #input filename
o) ofile=${OPTARG};; #output filename
p) param=${OPTARG};; #parameter to be made into csv
esac
done
if [[ ! "$cores" =~ ^(1|2|4|8|16)$ ]]; then
echo 'Invalid parameter for argument -c (1, 2, 4, 8, or 16 expected)'
exit 1
fi
if [ -z "$ifile" ]; then
echo 'No input filename given for argument -f'
exit 1
fi
if [ -z "$dir" ]; then
echo 'No directory given for argument -d'
exit 1
fi
printf "cores,${param}\n" > "../${dir}/${ofile}"
for ((i=1; i<=$cores; i=i*2)); do
printf "${i}," >> "../${dir}/${ofile}"
(sed -n "s/${param}=\(.*\)/\1/p" < "../${dir}/${i}c/${ifile}") >> "../${dir}/${ofile}"
done

View file

@ -0,0 +1,32 @@
#!/bin/bash
while getopts c:d:f:o: flag
do
case "${flag}" in
c) cores=${OPTARG};; #1, 2, 4, 8, 16
d) dir=${OPTARG};; #directory name (e.g. perf_2021_03_07)
i) ifile=${OPTARG};; #input filename
o) ofile=${OPTARG};; #output filename
esac
done
if [[ ! "$cores" =~ ^(1|2|4|8|16)$ ]]; then
echo 'Invalid parameter for argument -c (1, 2, 4, 8, or 16 expected)'
exit 1
fi
if [ -z "$ifile" ]; then
echo 'No input filename given for argument -f'
exit 1
fi
if [ -z "$dir" ]; then
echo 'No directory given for argument -d'
exit 1
fi
printf "cores,IPC" > "../${dir}/${ofile}"
for ((i=1; i<=$cores; i=i*2)); do
printf "${i}," >> "../${dir}/${ofile}"
(sed -n "s/IPC=\(.*\)/\1/p" < "../${dir}/${i}c/${ifile}" | awk 'END {print $NF}') >> "../${dir}/${ofile}"
done

View file

@ -1,9 +1,9 @@
.PHONY: build_config
build_config:
./scripts/gen_config.py --outv ./rtl/VX_user_config.vh --outc ./VX_config.h
build_config: ./rtl/VX_config.vh
./scripts/gen_config.py -i ./rtl/VX_config.vh -o ./VX_config.h
$(MAKE) -C simulate
clean:
rm -f ./rtl/VX_user_config.vh ./VX_config.h
rm -f ./VX_config.h
$(MAKE) -C simulate clean

View file

@ -9,20 +9,20 @@ module VX_cluster #(
input wire clk,
input wire reset,
// DRAM request
output wire dram_req_valid,
output wire dram_req_rw,
output wire [`L2DRAM_BYTEEN_WIDTH-1:0] dram_req_byteen,
output wire [`L2DRAM_ADDR_WIDTH-1:0] dram_req_addr,
output wire [`L2DRAM_LINE_WIDTH-1:0] dram_req_data,
output wire [`L2DRAM_TAG_WIDTH-1:0] dram_req_tag,
input wire dram_req_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`L2MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`L2MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`L2MEM_LINE_WIDTH-1:0] mem_req_data,
output wire [`L2MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// DRAM response
input wire dram_rsp_valid,
input wire [`L2DRAM_LINE_WIDTH-1:0] dram_rsp_data,
input wire [`L2DRAM_TAG_WIDTH-1:0] dram_rsp_tag,
output wire dram_rsp_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`L2MEM_LINE_WIDTH-1:0] mem_rsp_data,
input wire [`L2MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// CSR Request
input wire csr_req_valid,
@ -42,31 +42,31 @@ module VX_cluster #(
output wire ebreak
);
wire [`NUM_CORES-1:0] per_core_dram_req_valid;
wire [`NUM_CORES-1:0] per_core_dram_req_rw;
wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen;
wire [`NUM_CORES-1:0][`DDRAM_ADDR_WIDTH-1:0] per_core_dram_req_addr;
wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_req_data;
wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_req_tag;
wire [`NUM_CORES-1:0] per_core_dram_req_ready;
wire [`NUM_CORES-1:0] per_core_mem_req_valid;
wire [`NUM_CORES-1:0] per_core_mem_req_rw;
wire [`NUM_CORES-1:0][`DMEM_BYTEEN_WIDTH-1:0] per_core_mem_req_byteen;
wire [`NUM_CORES-1:0][`DMEM_ADDR_WIDTH-1:0] per_core_mem_req_addr;
wire [`NUM_CORES-1:0][`DMEM_LINE_WIDTH-1:0] per_core_mem_req_data;
wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_req_tag;
wire [`NUM_CORES-1:0] per_core_mem_req_ready;
wire [`NUM_CORES-1:0] per_core_dram_rsp_valid;
wire [`NUM_CORES-1:0][`DDRAM_LINE_WIDTH-1:0] per_core_dram_rsp_data;
wire [`NUM_CORES-1:0][`XDRAM_TAG_WIDTH-1:0] per_core_dram_rsp_tag;
wire [`NUM_CORES-1:0] per_core_dram_rsp_ready;
wire [`NUM_CORES-1:0] per_core_mem_rsp_valid;
wire [`NUM_CORES-1:0][`DMEM_LINE_WIDTH-1:0] per_core_mem_rsp_data;
wire [`NUM_CORES-1:0][`XMEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag;
wire [`NUM_CORES-1:0] per_core_mem_rsp_ready;
wire [`NUM_CORES-1:0] per_core_csr_req_valid;
wire [`NUM_CORES-1:0][11:0] per_core_csr_req_addr;
wire [`NUM_CORES-1:0] per_core_csr_req_rw;
wire [`NUM_CORES-1:0][31:0] per_core_csr_req_data;
wire [`NUM_CORES-1:0] per_core_csr_req_ready;
wire [`NUM_CORES-1:0] per_core_csr_req_valid;
wire [`NUM_CORES-1:0][11:0] per_core_csr_req_addr;
wire [`NUM_CORES-1:0] per_core_csr_req_rw;
wire [`NUM_CORES-1:0][31:0] per_core_csr_req_data;
wire [`NUM_CORES-1:0] per_core_csr_req_ready;
wire [`NUM_CORES-1:0] per_core_csr_rsp_valid;
wire [`NUM_CORES-1:0][31:0] per_core_csr_rsp_data;
wire [`NUM_CORES-1:0] per_core_csr_rsp_ready;
wire [`NUM_CORES-1:0] per_core_csr_rsp_valid;
wire [`NUM_CORES-1:0][31:0] per_core_csr_rsp_data;
wire [`NUM_CORES-1:0] per_core_csr_rsp_ready;
wire [`NUM_CORES-1:0] per_core_busy;
wire [`NUM_CORES-1:0] per_core_ebreak;
wire [`NUM_CORES-1:0] per_core_busy;
wire [`NUM_CORES-1:0] per_core_ebreak;
for (genvar i = 0; i < `NUM_CORES; i++) begin
@ -87,18 +87,18 @@ module VX_cluster #(
.clk (clk),
.reset (core_reset),
.dram_req_valid (per_core_dram_req_valid[i]),
.dram_req_rw (per_core_dram_req_rw [i]),
.dram_req_byteen(per_core_dram_req_byteen[i]),
.dram_req_addr (per_core_dram_req_addr [i]),
.dram_req_data (per_core_dram_req_data [i]),
.dram_req_tag (per_core_dram_req_tag [i]),
.dram_req_ready (per_core_dram_req_ready[i]),
.mem_req_valid (per_core_mem_req_valid[i]),
.mem_req_rw (per_core_mem_req_rw [i]),
.mem_req_byteen (per_core_mem_req_byteen[i]),
.mem_req_addr (per_core_mem_req_addr [i]),
.mem_req_data (per_core_mem_req_data [i]),
.mem_req_tag (per_core_mem_req_tag [i]),
.mem_req_ready (per_core_mem_req_ready[i]),
.dram_rsp_valid (per_core_dram_rsp_valid[i]),
.dram_rsp_data (per_core_dram_rsp_data [i]),
.dram_rsp_tag (per_core_dram_rsp_tag [i]),
.dram_rsp_ready (per_core_dram_rsp_ready[i]),
.mem_rsp_valid (per_core_mem_rsp_valid[i]),
.mem_rsp_data (per_core_mem_rsp_data [i]),
.mem_rsp_tag (per_core_mem_rsp_tag [i]),
.mem_rsp_ready (per_core_mem_rsp_ready[i]),
.csr_req_valid (per_core_csr_req_valid [i]),
.csr_req_rw (per_core_csr_req_rw [i]),
@ -169,12 +169,12 @@ module VX_cluster #(
.NUM_REQS (`NUM_CORES),
.CREQ_SIZE (`L2CREQ_SIZE),
.MSHR_SIZE (`L2MSHR_SIZE),
.DRSQ_SIZE (`L2DRSQ_SIZE),
.DREQ_SIZE (`L2DREQ_SIZE),
.MRSQ_SIZE (`L2MRSQ_SIZE),
.MREQ_SIZE (`L2MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`XDRAM_TAG_WIDTH),
.CORE_TAG_WIDTH (`XMEM_TAG_WIDTH),
.CORE_TAG_ID_BITS (0),
.DRAM_TAG_WIDTH (`L2DRAM_TAG_WIDTH)
.MEM_TAG_WIDTH (`L2MEM_TAG_WIDTH)
) l2cache (
`SCOPE_BIND_VX_cluster_l2cache
@ -188,78 +188,78 @@ module VX_cluster #(
`endif
// Core request
.core_req_valid (per_core_dram_req_valid),
.core_req_rw (per_core_dram_req_rw),
.core_req_byteen (per_core_dram_req_byteen),
.core_req_addr (per_core_dram_req_addr),
.core_req_data (per_core_dram_req_data),
.core_req_tag (per_core_dram_req_tag),
.core_req_ready (per_core_dram_req_ready),
.core_req_valid (per_core_mem_req_valid),
.core_req_rw (per_core_mem_req_rw),
.core_req_byteen (per_core_mem_req_byteen),
.core_req_addr (per_core_mem_req_addr),
.core_req_data (per_core_mem_req_data),
.core_req_tag (per_core_mem_req_tag),
.core_req_ready (per_core_mem_req_ready),
// Core response
.core_rsp_valid (per_core_dram_rsp_valid),
.core_rsp_data (per_core_dram_rsp_data),
.core_rsp_tag (per_core_dram_rsp_tag),
.core_rsp_ready (per_core_dram_rsp_ready),
.core_rsp_valid (per_core_mem_rsp_valid),
.core_rsp_data (per_core_mem_rsp_data),
.core_rsp_tag (per_core_mem_rsp_tag),
.core_rsp_ready (per_core_mem_rsp_ready),
// DRAM request
.dram_req_valid (dram_req_valid),
.dram_req_rw (dram_req_rw),
.dram_req_byteen (dram_req_byteen),
.dram_req_addr (dram_req_addr),
.dram_req_data (dram_req_data),
.dram_req_tag (dram_req_tag),
.dram_req_ready (dram_req_ready),
// Memory request
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
// DRAM response
.dram_rsp_valid (dram_rsp_valid),
.dram_rsp_tag (dram_rsp_tag),
.dram_rsp_data (dram_rsp_data),
.dram_rsp_ready (dram_rsp_ready)
// Memory response
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_ready (mem_rsp_ready)
);
end else begin
VX_mem_arb #(
.NUM_REQS (`NUM_CORES),
.DATA_WIDTH (`L2DRAM_LINE_WIDTH),
.TAG_IN_WIDTH (`XDRAM_TAG_WIDTH),
.TAG_OUT_WIDTH (`L2DRAM_TAG_WIDTH),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) dram_arb (
.NUM_REQS (`NUM_CORES),
.DATA_WIDTH (`L2MEM_LINE_WIDTH),
.TAG_IN_WIDTH (`XMEM_TAG_WIDTH),
.TAG_OUT_WIDTH (`L2MEM_TAG_WIDTH),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (reset),
// Core request
.req_valid_in (per_core_dram_req_valid),
.req_rw_in (per_core_dram_req_rw),
.req_byteen_in (per_core_dram_req_byteen),
.req_addr_in (per_core_dram_req_addr),
.req_data_in (per_core_dram_req_data),
.req_tag_in (per_core_dram_req_tag),
.req_ready_in (per_core_dram_req_ready),
.req_valid_in (per_core_mem_req_valid),
.req_rw_in (per_core_mem_req_rw),
.req_byteen_in (per_core_mem_req_byteen),
.req_addr_in (per_core_mem_req_addr),
.req_data_in (per_core_mem_req_data),
.req_tag_in (per_core_mem_req_tag),
.req_ready_in (per_core_mem_req_ready),
// DRAM request
.req_valid_out (dram_req_valid),
.req_rw_out (dram_req_rw),
.req_byteen_out (dram_req_byteen),
.req_addr_out (dram_req_addr),
.req_data_out (dram_req_data),
.req_tag_out (dram_req_tag),
.req_ready_out (dram_req_ready),
// Memory request
.req_valid_out (mem_req_valid),
.req_rw_out (mem_req_rw),
.req_byteen_out (mem_req_byteen),
.req_addr_out (mem_req_addr),
.req_data_out (mem_req_data),
.req_tag_out (mem_req_tag),
.req_ready_out (mem_req_ready),
// Core response
.rsp_valid_out (per_core_dram_rsp_valid),
.rsp_data_out (per_core_dram_rsp_data),
.rsp_tag_out (per_core_dram_rsp_tag),
.rsp_ready_out (per_core_dram_rsp_ready),
.rsp_valid_out (per_core_mem_rsp_valid),
.rsp_data_out (per_core_mem_rsp_data),
.rsp_tag_out (per_core_mem_rsp_tag),
.rsp_ready_out (per_core_mem_rsp_ready),
// DRAM response
.rsp_valid_in (dram_rsp_valid),
.rsp_tag_in (dram_rsp_tag),
.rsp_data_in (dram_rsp_data),
.rsp_ready_in (dram_rsp_ready)
// Memory response
.rsp_valid_in (mem_rsp_valid),
.rsp_tag_in (mem_rsp_tag),
.rsp_data_in (mem_rsp_data),
.rsp_ready_in (mem_rsp_ready)
);
end

View file

@ -1,8 +1,6 @@
`ifndef VX_CONFIG
`define VX_CONFIG
`include "VX_user_config.vh"
`ifndef NUM_CLUSTERS
`define NUM_CLUSTERS 1
`endif
@ -35,8 +33,8 @@
`define SM_ENABLE 1
`endif
`ifndef GLOBAL_BLOCK_SIZE
`define GLOBAL_BLOCK_SIZE 64
`ifndef MEM_BLOCK_SIZE
`define MEM_BLOCK_SIZE 64
`endif
`ifndef L1_BLOCK_SIZE
@ -209,14 +207,14 @@
`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls
`define CSR_MPM_SMEM_BANK_ST_H 12'hB98
// PERF: memory
`define CSR_MPM_DRAM_READS 12'hB19 // dram reads
`define CSR_MPM_DRAM_READS_H 12'hB99
`define CSR_MPM_DRAM_WRITES 12'hB1A // dram writes
`define CSR_MPM_DRAM_WRITES_H 12'hB9A
`define CSR_MPM_DRAM_ST 12'hB1B // dram request stalls
`define CSR_MPM_DRAM_ST_H 12'hB9B
`define CSR_MPM_DRAM_LAT 12'hB1C // dram latency (total)
`define CSR_MPM_DRAM_LAT_H 12'hB9C
`define CSR_MPM_MEM_READS 12'hB19 // memory reads
`define CSR_MPM_MEM_READS_H 12'hB99
`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes
`define CSR_MPM_MEM_WRITES_H 12'hB9A
`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls
`define CSR_MPM_MEM_ST_H 12'hB9B
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total)
`define CSR_MPM_MEM_LAT_H 12'hB9C
// Machine Information Registers
`define CSR_MVENDORID 12'hF11
@ -281,14 +279,14 @@
`define IMSHR_SIZE `NUM_WARPS
`endif
// DRAM Request Queue Size
`ifndef IDREQ_SIZE
`define IDREQ_SIZE 4
// Memory Request Queue Size
`ifndef IMREQ_SIZE
`define IMREQ_SIZE 4
`endif
// DRAM Response Queue Size
`ifndef IDRSQ_SIZE
`define IDRSQ_SIZE 4
// Memory Response Queue Size
`ifndef IMRSQ_SIZE
`define IMRSQ_SIZE 4
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
@ -318,14 +316,14 @@
`define DMSHR_SIZE `LSUQ_SIZE
`endif
// DRAM Request Queue Size
`ifndef DDREQ_SIZE
`define DDREQ_SIZE 4
// Memory Request Queue Size
`ifndef DMREQ_SIZE
`define DMREQ_SIZE 4
`endif
// DRAM Response Queue Size
`ifndef DDRSQ_SIZE
`define DDRSQ_SIZE `MAX(4, (`DNUM_BANKS * 2))
// Memory Response Queue Size
`ifndef DMRSQ_SIZE
`define DMRSQ_SIZE `MAX(4, (`DNUM_BANKS * 2))
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
@ -372,14 +370,14 @@
`define L2MSHR_SIZE 16
`endif
// DRAM Request Queue Size
`ifndef L2DREQ_SIZE
`define L2DREQ_SIZE 4
// L2 Request Queue Size
`ifndef L2MREQ_SIZE
`define L2MREQ_SIZE 4
`endif
// DRAM Response Queue Size
`ifndef L2DRSQ_SIZE
`define L2DRSQ_SIZE `MAX(4, (`L2NUM_BANKS * 2))
// L2 Response Queue Size
`ifndef L2MRSQ_SIZE
`define L2MRSQ_SIZE `MAX(4, (`L2NUM_BANKS * 2))
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
@ -404,14 +402,14 @@
`define L3MSHR_SIZE 16
`endif
// DRAM Request Queue Size
`ifndef L3DREQ_SIZE
`define L3DREQ_SIZE 4
// L3 Request Queue Size
`ifndef L3MREQ_SIZE
`define L3MREQ_SIZE 4
`endif
// DRAM Response Queue Size
`ifndef L3DRSQ_SIZE
`define L3DRSQ_SIZE `MAX(4, (`L3NUM_BANKS * 2))
// L3 Response Queue Size
`ifndef L3MRSQ_SIZE
`define L3MRSQ_SIZE `MAX(4, (`L3NUM_BANKS * 2))
`endif
`endif

View file

@ -9,20 +9,20 @@ module VX_core #(
input wire clk,
input wire reset,
// DRAM request
output wire dram_req_valid,
output wire dram_req_rw,
output wire [`DDRAM_BYTEEN_WIDTH-1:0] dram_req_byteen,
output wire [`DDRAM_ADDR_WIDTH-1:0] dram_req_addr,
output wire [`DDRAM_LINE_WIDTH-1:0] dram_req_data,
output wire [`XDRAM_TAG_WIDTH-1:0] dram_req_tag,
input wire dram_req_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`DMEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`DMEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`DMEM_LINE_WIDTH-1:0] mem_req_data,
output wire [`XMEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// DRAM reponse
input wire dram_rsp_valid,
input wire [`DDRAM_LINE_WIDTH-1:0] dram_rsp_data,
input wire [`XDRAM_TAG_WIDTH-1:0] dram_rsp_tag,
output wire dram_rsp_ready,
// Memory reponse
input wire mem_rsp_valid,
input wire [`DMEM_LINE_WIDTH-1:0] mem_rsp_data,
input wire [`XMEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// CSR request
input wire csr_req_valid,
@ -44,29 +44,29 @@ module VX_core #(
VX_perf_memsys_if perf_memsys_if();
`endif
VX_cache_dram_req_if #(
.DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH),
.DRAM_ADDR_WIDTH(`DDRAM_ADDR_WIDTH),
.DRAM_TAG_WIDTH(`XDRAM_TAG_WIDTH)
) dram_req_if();
VX_cache_mem_req_if #(
.MEM_LINE_WIDTH(`DMEM_LINE_WIDTH),
.MEM_ADDR_WIDTH(`DMEM_ADDR_WIDTH),
.MEM_TAG_WIDTH(`XMEM_TAG_WIDTH)
) mem_req_if();
VX_cache_dram_rsp_if #(
.DRAM_LINE_WIDTH(`DDRAM_LINE_WIDTH),
.DRAM_TAG_WIDTH(`XDRAM_TAG_WIDTH)
) dram_rsp_if();
VX_cache_mem_rsp_if #(
.MEM_LINE_WIDTH(`DMEM_LINE_WIDTH),
.MEM_TAG_WIDTH(`XMEM_TAG_WIDTH)
) mem_rsp_if();
assign dram_req_valid = dram_req_if.valid;
assign dram_req_rw = dram_req_if.rw;
assign dram_req_byteen= dram_req_if.byteen;
assign dram_req_addr = dram_req_if.addr;
assign dram_req_data = dram_req_if.data;
assign dram_req_tag = dram_req_if.tag;
assign dram_req_if.ready = dram_req_ready;
assign mem_req_valid = mem_req_if.valid;
assign mem_req_rw = mem_req_if.rw;
assign mem_req_byteen= mem_req_if.byteen;
assign mem_req_addr = mem_req_if.addr;
assign mem_req_data = mem_req_if.data;
assign mem_req_tag = mem_req_if.tag;
assign mem_req_if.ready = mem_req_ready;
assign dram_rsp_if.valid = dram_rsp_valid;
assign dram_rsp_if.data = dram_rsp_data;
assign dram_rsp_if.tag = dram_rsp_tag;
assign dram_rsp_ready = dram_rsp_if.ready;
assign mem_rsp_if.valid = mem_rsp_valid;
assign mem_rsp_if.data = mem_rsp_data;
assign mem_rsp_if.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_rsp_if.ready;
//--
@ -168,9 +168,9 @@ module VX_core #(
.icache_core_req_if (icache_core_req_if),
.icache_core_rsp_if (icache_core_rsp_if),
// DRAM
.dram_req_if (dram_req_if),
.dram_rsp_if (dram_rsp_if)
// Memory
.mem_req_if (mem_req_if),
.mem_rsp_if (mem_rsp_if)
);
endmodule

View file

@ -123,61 +123,61 @@ module VX_csr_data #(
`ifdef PERF_ENABLE
// PERF: pipeline
`CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibf_stalls[31:0];
`CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[43:32]);
`CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scb_stalls[31:0];
`CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[43:32]);
`CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0];
`CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[43:32]);
`CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[43:32]);
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[43:32]);
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[43:32]);
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[43:32]);
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
// PERF: icache
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0];
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[43:32]);
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[43:32]);
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0];
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[43:32]);
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0];
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[43:32]);
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]);
// PERF: dcache
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0];
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[43:32]);
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0];
`CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[43:32]);
`CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0];
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[43:32]);
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[`PERF_CTR_BITS-1:32]);
`CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0];
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[43:32]);
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[`PERF_CTR_BITS-1:32]);
`CSR_MPM_DCACHE_BANK_ST : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0];
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[43:32]);
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[43:32]);
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0];
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[43:32]);
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[43:32]);
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[43:32]);
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[43:32]);
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[43:32]);
// PERF: DRAM
`CSR_MPM_DRAM_READS : read_data_r = perf_memsys_if.dram_reads[31:0];
`CSR_MPM_DRAM_READS_H : read_data_r = 32'(perf_memsys_if.dram_reads[43:32]);
`CSR_MPM_DRAM_WRITES : read_data_r = perf_memsys_if.dram_writes[31:0];
`CSR_MPM_DRAM_WRITES_H : read_data_r = 32'(perf_memsys_if.dram_writes[43:32]);
`CSR_MPM_DRAM_ST : read_data_r = perf_memsys_if.dram_stalls[31:0];
`CSR_MPM_DRAM_ST_H : read_data_r = 32'(perf_memsys_if.dram_stalls[43:32]);
`CSR_MPM_DRAM_LAT : read_data_r = perf_memsys_if.dram_latency[31:0];
`CSR_MPM_DRAM_LAT_H : read_data_r = 32'(perf_memsys_if.dram_latency[43:32]);
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: MEM
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0];
`CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0];
`CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_ST : read_data_r = perf_memsys_if.mem_stalls[31:0];
`CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]);
`CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0];
`CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
`endif
`CSR_SATP : read_data_r = 32'(csr_satp);
@ -195,9 +195,9 @@ module VX_csr_data #(
`CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]);
`CSR_CYCLE : read_data_r = csr_cycle[31:0];
`CSR_CYCLE_H : read_data_r = 32'(csr_cycle[43:32]);
`CSR_CYCLE_H : read_data_r = 32'(csr_cycle[`PERF_CTR_BITS-1:32]);
`CSR_INSTRET : read_data_r = csr_instret[31:0];
`CSR_INSTRET_H : read_data_r = 32'(csr_instret[43:32]);
`CSR_INSTRET_H : read_data_r = 32'(csr_instret[`PERF_CTR_BITS-1:32]);
`CSR_MVENDORID : read_data_r = `VENDOR_ID;
`CSR_MARCHID : read_data_r = `ARCHITECTURE_ID;

View file

@ -30,6 +30,8 @@
`define CSR_WIDTH 12
`define PERF_CTR_BITS 44
///////////////////////////////////////////////////////////////////////////////
`define INST_LUI 7'b0110111
@ -244,7 +246,7 @@
`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0)
// Block size in bytes
`define ICACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE)
`define ICACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE)
// Word size in bytes
`define IWORD_SIZE 4
@ -264,11 +266,11 @@
// Core request tag bits
`define ICORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICORE_TAG_ID_BITS)
// DRAM request data bits
`define IDRAM_LINE_WIDTH (`ICACHE_LINE_SIZE * 8)
// Memory request data bits
`define IMEM_LINE_WIDTH (`ICACHE_LINE_SIZE * 8)
// DRAM byte enable bits
`define IDRAM_BYTEEN_WIDTH `ICACHE_LINE_SIZE
// Memory byte enable bits
`define IMEM_BYTEEN_WIDTH `ICACHE_LINE_SIZE
////////////////////////// Dcache Configurable Knobs //////////////////////////
@ -276,7 +278,7 @@
`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1)
// Block size in bytes
`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `GLOBAL_BLOCK_SIZE)
`define DCACHE_LINE_SIZE (`L2_ENABLE ? `L1_BLOCK_SIZE : `MEM_BLOCK_SIZE)
// Word size in bytes
`define DWORD_SIZE 4
@ -299,14 +301,14 @@
// DRAM request data bits
`define DDRAM_LINE_WIDTH (`DCACHE_LINE_SIZE * 8)
// DRAM request address bits
`define DDRAM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
// Memory request address bits
`define DMEM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
// DRAM byte enable bits
`define DDRAM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
// Memory byte enable bits
`define DMEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
// DRAM request tag bits
`define DDRAM_TAG_WIDTH `DDRAM_ADDR_WIDTH
// Memory request tag bits
`define DMEM_TAG_WIDTH `DMEM_ADDR_WIDTH
// Core request size
`define DNUM_REQUESTS `NUM_THREADS
@ -334,7 +336,7 @@
`define L2CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID)
// Block size in bytes
`define L2CACHE_LINE_SIZE `GLOBAL_BLOCK_SIZE
`define L2CACHE_LINE_SIZE `MEM_BLOCK_SIZE
// Word size in bytes
`define L2WORD_SIZE `DCACHE_LINE_SIZE
@ -342,17 +344,17 @@
// Core request tag bits
`define L2CORE_TAG_WIDTH (`DCORE_TAG_WIDTH + `CLOG2(`NUM_CORES))
// DRAM request data bits
`define L2DRAM_LINE_WIDTH (`L2CACHE_LINE_SIZE * 8)
// Memory request data bits
`define L2MEM_LINE_WIDTH (`L2CACHE_LINE_SIZE * 8)
// DRAM request address bits
`define L2DRAM_ADDR_WIDTH (32 - `CLOG2(`L2CACHE_LINE_SIZE))
// Memory request address bits
`define L2MEM_ADDR_WIDTH (32 - `CLOG2(`L2CACHE_LINE_SIZE))
// DRAM byte enable bits
`define L2DRAM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE
// Memory byte enable bits
`define L2MEM_BYTEEN_WIDTH `L2CACHE_LINE_SIZE
// DRAM request tag bits
`define L2DRAM_TAG_WIDTH (`L2_ENABLE ? `L2DRAM_ADDR_WIDTH : (`XDRAM_TAG_WIDTH+`CLOG2(`NUM_CORES)))
// Memory request tag bits
`define L2MEM_TAG_WIDTH (`L2_ENABLE ? `L2MEM_ADDR_WIDTH : (`XMEM_TAG_WIDTH+`CLOG2(`NUM_CORES)))
////////////////////////// L3cache Configurable Knobs /////////////////////////
@ -360,7 +362,7 @@
`define L3CACHE_ID 0
// Block size in bytes
`define L3CACHE_LINE_SIZE `GLOBAL_BLOCK_SIZE
`define L3CACHE_LINE_SIZE `MEM_BLOCK_SIZE
// Word size in bytes
`define L3WORD_SIZE `L2CACHE_LINE_SIZE
@ -368,30 +370,30 @@
// Core request tag bits
`define L3CORE_TAG_WIDTH (`L2CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS))
// DRAM request data bits
`define L3DRAM_LINE_WIDTH (`L3CACHE_LINE_SIZE * 8)
// Memory request data bits
`define L3MEM_LINE_WIDTH (`L3CACHE_LINE_SIZE * 8)
// DRAM request address bits
`define L3DRAM_ADDR_WIDTH (32 - `CLOG2(`L3CACHE_LINE_SIZE))
// Memory request address bits
`define L3MEM_ADDR_WIDTH (32 - `CLOG2(`L3CACHE_LINE_SIZE))
// DRAM byte enable bits
`define L3DRAM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE
// Memory byte enable bits
`define L3MEM_BYTEEN_WIDTH `L3CACHE_LINE_SIZE
// DRAM request tag bits
`define L3DRAM_TAG_WIDTH (`L3_ENABLE ? `L3DRAM_ADDR_WIDTH : (`L2DRAM_TAG_WIDTH+`CLOG2(`NUM_CLUSTERS)))
// Memory request tag bits
`define L3MEM_TAG_WIDTH (`L3_ENABLE ? `L3MEM_ADDR_WIDTH : (`L2MEM_TAG_WIDTH+`CLOG2(`NUM_CLUSTERS)))
///////////////////////////////////////////////////////////////////////////////
`define VX_DRAM_BYTEEN_WIDTH `L3DRAM_BYTEEN_WIDTH
`define VX_DRAM_ADDR_WIDTH `L3DRAM_ADDR_WIDTH
`define VX_DRAM_LINE_WIDTH `L3DRAM_LINE_WIDTH
`define VX_DRAM_TAG_WIDTH `L3DRAM_TAG_WIDTH
`define VX_MEM_BYTEEN_WIDTH `L3MEM_BYTEEN_WIDTH
`define VX_MEM_ADDR_WIDTH `L3MEM_ADDR_WIDTH
`define VX_MEM_LINE_WIDTH `L3MEM_LINE_WIDTH
`define VX_MEM_TAG_WIDTH `L3MEM_TAG_WIDTH
`define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH
`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}
`define XDRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH+`CLOG2(2))
`define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH+`CLOG2(2))
///////////////////////////////////////////////////////////////////////////////

View file

@ -7,7 +7,6 @@ module VX_ibuffer #(
input wire reset,
// inputs
input wire freeze, // keep current warp
VX_decode_if ibuf_enq_if,
// outputs
@ -117,18 +116,9 @@ module VX_ibuffer #(
deq_valid_n = 0;
deq_wid_n = 'x;
deq_instr_n = 'x;
schedule_table_n = 'x;
if ((0 == num_warps)
|| (1 == num_warps && deq_fire && q_alm_empty[deq_wid])) begin
deq_valid_n = enq_fire;
deq_wid_n = ibuf_enq_if.wid;
deq_instr_n = q_data_in;
end else if ((1 == num_warps) || freeze) begin
deq_valid_n = 1;
deq_wid_n = deq_wid;
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
end else begin
schedule_table_n = 'x;
if (num_warps > 1) begin
deq_valid_n = (| schedule_table);
schedule_table_n = schedule_table;
for (integer i = 0; i < `NUM_WARPS; i++) begin
@ -139,6 +129,14 @@ module VX_ibuffer #(
break;
end
end
end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin
deq_valid_n = 1;
deq_wid_n = deq_wid;
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
end else begin
deq_valid_n = enq_fire;
deq_wid_n = ibuf_enq_if.wid;
deq_instr_n = q_data_in;
end
end

View file

@ -33,7 +33,6 @@ module VX_issue #(
) ibuffer (
.clk (clk),
.reset (reset),
.freeze (1'b0),
.ibuf_enq_if (decode_if),
.ibuf_deq_if (ibuf_deq_if)
);
@ -121,14 +120,14 @@ module VX_issue #(
`SCOPE_ASSIGN (writeback_eop, writeback_if.eop);
`ifdef PERF_ENABLE
reg [43:0] perf_ibf_stalls;
reg [43:0] perf_scb_stalls;
reg [43:0] perf_alu_stalls;
reg [43:0] perf_lsu_stalls;
reg [43:0] perf_csr_stalls;
reg [43:0] perf_gpu_stalls;
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
reg [`PERF_CTR_BITS-1:0] perf_alu_stalls;
reg [`PERF_CTR_BITS-1:0] perf_lsu_stalls;
reg [`PERF_CTR_BITS-1:0] perf_csr_stalls;
reg [`PERF_CTR_BITS-1:0] perf_gpu_stalls;
`ifdef EXT_F_ENABLE
reg [43:0] perf_fpu_stalls;
reg [`PERF_CTR_BITS-1:0] perf_fpu_stalls;
`endif
always @(posedge clk) begin
@ -144,26 +143,26 @@ module VX_issue #(
`endif
end else begin
if (decode_if.valid & !decode_if.ready) begin
perf_ibf_stalls <= perf_ibf_stalls + 44'd1;
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
end
if (ibuf_deq_if.valid & scoreboard_delay) begin
perf_scb_stalls <= perf_scb_stalls + 44'd1;
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
end
if (alu_req_if.valid & !alu_req_if.ready) begin
perf_alu_stalls <= perf_alu_stalls + 44'd1;
perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
end
if (lsu_req_if.valid & !lsu_req_if.ready) begin
perf_lsu_stalls <= perf_lsu_stalls + 44'd1;
perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
end
if (csr_req_if.valid & !csr_req_if.ready) begin
perf_csr_stalls <= perf_csr_stalls + 44'd1;
perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
end
if (gpu_req_if.valid & !gpu_req_if.ready) begin
perf_gpu_stalls <= perf_gpu_stalls + 44'd1;
perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
end
`ifdef EXT_F_ENABLE
if (fpu_req_if.valid & !fpu_req_if.ready) begin
perf_fpu_stalls <= perf_fpu_stalls + 44'd1;
perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
end
`endif
end

View file

@ -44,10 +44,6 @@ module VX_lsu_unit #(
end
wire is_dup_load = lsu_req_if.wb && lsu_req_if.tmask[0] && (& addr_matches);
`IGNORE_WARNINGS_BEGIN
reg [`LSUQ_SIZE-1:0][`LSUQ_ADDR_BITS-1:0] pending_tags;
`IGNORE_WARNINGS_END
wire ready_in;
wire stall_in = ~ready_in && req_valid;
@ -79,7 +75,7 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0] rsp_tmask;
reg [`NUM_THREADS-1:0] req_sent_mask;
wire sent_all_ready;
wire req_ready_all;
wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr;
wire mbuf_full;
@ -118,13 +114,7 @@ module VX_lsu_unit #(
`UNUSED_PIN (empty)
);
always @(posedge clk) begin
if (mbuf_push) begin
pending_tags[mbuf_waddr] <= req_tag;
end
end
assign sent_all_ready = &(dcache_req_if.ready | req_sent_mask);
assign req_ready_all = &(dcache_req_if.ready | req_sent_mask | ~req_tmask);
wire [`NUM_THREADS-1:0] req_sent_dup = {{(`NUM_THREADS-1){dcache_req_fire[0] && req_is_dup}}, 1'b0};
@ -132,19 +122,22 @@ module VX_lsu_unit #(
if (reset) begin
req_sent_mask <= 0;
end else begin
if (sent_all_ready)
if (req_ready_all)
req_sent_mask <= 0;
else
req_sent_mask <= req_sent_mask | dcache_req_fire | req_sent_dup;
end
end
wire is_req_start = (0 == req_sent_mask);
// need to hold the acquired tag index until the full request is submitted
reg [`LSUQ_ADDR_BITS-1:0] req_tag_hold;
wire [`LSUQ_ADDR_BITS-1:0] req_tag = (0 == req_sent_mask) ? mbuf_waddr : req_tag_hold;
reg [`DCORE_TAG_ID_BITS-1:0] req_tag_hold;
wire [`DCORE_TAG_ID_BITS-1:0] req_tag = is_req_start ? mbuf_waddr : req_tag_hold;
always @(posedge clk) begin
if (mbuf_push)
if (mbuf_push) begin
req_tag_hold <= mbuf_waddr;
end
end
wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1};
@ -160,7 +153,8 @@ module VX_lsu_unit #(
end
end
wire req_ready_dep = (req_wb && ~mbuf_full)
// ensure all dependencies for the requests are resolved
wire req_dep_ready = (req_wb && (~mbuf_full || ~is_req_start))
|| (~req_wb && st_commit_if.ready);
// DCache Request
@ -193,7 +187,7 @@ module VX_lsu_unit #(
end
end
assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_ready_dep}} & req_tmask_dup & ~req_sent_mask;
assign dcache_req_if.valid = {`NUM_THREADS{req_valid && req_dep_ready}} & req_tmask_dup & ~req_sent_mask;
assign dcache_req_if.rw = {`NUM_THREADS{~req_wb}};
assign dcache_req_if.addr = mem_req_addr;
assign dcache_req_if.byteen = mem_req_byteen;
@ -205,11 +199,11 @@ module VX_lsu_unit #(
assign dcache_req_if.tag = {`NUM_THREADS{req_tag}};
`endif
assign ready_in = req_ready_dep && sent_all_ready;
assign ready_in = req_dep_ready && req_ready_all;
// send store commit
wire is_store_rsp = req_valid && ~req_wb && sent_all_ready;
wire is_store_rsp = req_valid && ~req_wb && req_ready_all;
assign st_commit_if.valid = is_store_rsp;
assign st_commit_if.wid = req_wid;
@ -280,23 +274,46 @@ module VX_lsu_unit #(
`SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr);
`ifdef DBG_PRINT_CORE_DCACHE
`IGNORE_WARNINGS_BEGIN
reg [`LSUQ_SIZE-1:0][`DCORE_TAG_WIDTH:0] pending_reqs;
`IGNORE_WARNINGS_END
always @(posedge clk) begin
if (reset) begin
pending_reqs <= '0;
end else if (mbuf_push) begin
pending_reqs[mbuf_waddr] <= {dcache_req_if.tag[0], 1'b1};
end else if (mbuf_pop) begin
pending_reqs[mbuf_raddr] <= '0;
end
end
always @(posedge clk) begin
if ((| dcache_req_fire)) begin
if ((| dcache_req_if.rw))
$display("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, data=%0h",
$time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, dcache_req_if.data);
else
$display("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, byteen=%0h, rd=%0d, is_dup=%b",
$time, CORE_ID, req_wid, req_pc, dcache_req_fire, req_addr, dcache_req_if.tag, dcache_req_if.byteen, req_rd, req_is_dup);
if (dcache_req_if.rw[0]) begin
$write("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
$write(", tag=%0h, byteen=%0h, data=", dcache_req_if.tag[0], dcache_req_if.byteen);
`PRINT_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
$write("\n");
end else begin
$write("%t: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);
$write(", tag=%0h, byteen=%0h, rd=%0d, is_dup=%b\n", dcache_req_if.tag[0], dcache_req_if.byteen, req_rd, req_is_dup);
end
end
if (dcache_rsp_fire) begin
$display("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h, is_dup=%b",
$time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd, dcache_rsp_if.data, rsp_is_dup);
$write("%t: D$%0d Rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=",
$time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_pc, dcache_rsp_if.tag, rsp_rd);
`PRINT_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
$write(", is_dup=%b\n", rsp_is_dup);
end
if (mbuf_full) begin
$write("%t: D$%0d queue-full:", $time, CORE_ID);
$write("%t: *** D$%0d queue-full:", $time, CORE_ID);
for (integer j = 0; j < `LSUQ_SIZE; j++) begin
$write(" tag%0d=%0h", j, pending_tags[j]);
if (pending_reqs[j][0]) begin
$write(" %0d->%0h", j, pending_reqs[j][1 +: `DCORE_TAG_WIDTH]);
end
end
$write("\n");
end

View file

@ -20,25 +20,25 @@ module VX_mem_unit # (
VX_icache_core_req_if icache_core_req_if,
VX_icache_core_rsp_if icache_core_rsp_if,
// DRAM
VX_cache_dram_req_if dram_req_if,
VX_cache_dram_rsp_if dram_rsp_if
// Memory
VX_cache_mem_req_if mem_req_if,
VX_cache_mem_rsp_if mem_rsp_if
);
`ifdef PERF_ENABLE
VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if();
`endif
VX_cache_dram_req_if #(
.DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH),
.DRAM_ADDR_WIDTH (`DDRAM_ADDR_WIDTH),
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
) dcache_dram_req_if(), icache_dram_req_if();
VX_cache_mem_req_if #(
.MEM_LINE_WIDTH (`DMEM_LINE_WIDTH),
.MEM_ADDR_WIDTH (`DMEM_ADDR_WIDTH),
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
) dcache_mem_req_if(), icache_mem_req_if();
VX_cache_dram_rsp_if #(
.DRAM_LINE_WIDTH (`DDRAM_LINE_WIDTH),
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
) dcache_dram_rsp_if(), icache_dram_rsp_if();
VX_cache_mem_rsp_if #(
.MEM_LINE_WIDTH (`DMEM_LINE_WIDTH),
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
) dcache_mem_rsp_if(), icache_mem_rsp_if();
VX_dcache_core_req_if #(
.LANES (`DNUM_REQUESTS),
@ -96,12 +96,12 @@ module VX_mem_unit # (
.NUM_REQS (1),
.CREQ_SIZE (`ICREQ_SIZE),
.MSHR_SIZE (`IMSHR_SIZE),
.DRSQ_SIZE (`IDRSQ_SIZE),
.DREQ_SIZE (`IDREQ_SIZE),
.MRSQ_SIZE (`IMRSQ_SIZE),
.MREQ_SIZE (`IMREQ_SIZE),
.WRITE_ENABLE (0),
.CORE_TAG_WIDTH (`ICORE_TAG_WIDTH),
.CORE_TAG_ID_BITS (`ICORE_TAG_ID_BITS),
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
) icache (
`SCOPE_BIND_VX_mem_unit_icache
@ -129,20 +129,20 @@ module VX_mem_unit # (
.perf_cache_if (perf_icache_if),
`endif
// DRAM Req
.dram_req_valid (icache_dram_req_if.valid),
.dram_req_rw (icache_dram_req_if.rw),
.dram_req_byteen (icache_dram_req_if.byteen),
.dram_req_addr (icache_dram_req_if.addr),
.dram_req_data (icache_dram_req_if.data),
.dram_req_tag (icache_dram_req_if.tag),
.dram_req_ready (icache_dram_req_if.ready),
// Memory Request
.mem_req_valid (icache_mem_req_if.valid),
.mem_req_rw (icache_mem_req_if.rw),
.mem_req_byteen (icache_mem_req_if.byteen),
.mem_req_addr (icache_mem_req_if.addr),
.mem_req_data (icache_mem_req_if.data),
.mem_req_tag (icache_mem_req_if.tag),
.mem_req_ready (icache_mem_req_if.ready),
// DRAM response
.dram_rsp_valid (icache_dram_rsp_if.valid),
.dram_rsp_data (icache_dram_rsp_if.data),
.dram_rsp_tag (icache_dram_rsp_if.tag),
.dram_rsp_ready (icache_dram_rsp_if.ready)
// Memory response
.mem_rsp_valid (icache_mem_rsp_if.valid),
.mem_rsp_data (icache_mem_rsp_if.data),
.mem_rsp_tag (icache_mem_rsp_if.tag),
.mem_rsp_ready (icache_mem_rsp_if.ready)
);
VX_cache #(
@ -155,12 +155,12 @@ module VX_mem_unit # (
.NUM_REQS (`DNUM_REQUESTS),
.CREQ_SIZE (`DCREQ_SIZE),
.MSHR_SIZE (`DMSHR_SIZE),
.DRSQ_SIZE (`DDRSQ_SIZE),
.DREQ_SIZE (`DDREQ_SIZE),
.MRSQ_SIZE (`DMRSQ_SIZE),
.MREQ_SIZE (`DMREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`DCORE_TAG_WIDTH),
.CORE_TAG_ID_BITS (`DCORE_TAG_ID_BITS),
.DRAM_TAG_WIDTH (`DDRAM_TAG_WIDTH)
.MEM_TAG_WIDTH (`DMEM_TAG_WIDTH)
) dcache (
`SCOPE_BIND_VX_mem_unit_dcache
@ -188,20 +188,20 @@ module VX_mem_unit # (
.perf_cache_if (perf_dcache_if),
`endif
// DRAM request
.dram_req_valid (dcache_dram_req_if.valid),
.dram_req_rw (dcache_dram_req_if.rw),
.dram_req_byteen (dcache_dram_req_if.byteen),
.dram_req_addr (dcache_dram_req_if.addr),
.dram_req_data (dcache_dram_req_if.data),
.dram_req_tag (dcache_dram_req_if.tag),
.dram_req_ready (dcache_dram_req_if.ready),
// Memory request
.mem_req_valid (dcache_mem_req_if.valid),
.mem_req_rw (dcache_mem_req_if.rw),
.mem_req_byteen (dcache_mem_req_if.byteen),
.mem_req_addr (dcache_mem_req_if.addr),
.mem_req_data (dcache_mem_req_if.data),
.mem_req_tag (dcache_mem_req_if.tag),
.mem_req_ready (dcache_mem_req_if.ready),
// DRAM response
.dram_rsp_valid (dcache_dram_rsp_if.valid),
.dram_rsp_data (dcache_dram_rsp_if.data),
.dram_rsp_tag (dcache_dram_rsp_if.tag),
.dram_rsp_ready (dcache_dram_rsp_if.ready)
// Memory response
.mem_rsp_valid (dcache_mem_rsp_if.valid),
.mem_rsp_data (dcache_mem_rsp_if.data),
.mem_rsp_tag (dcache_mem_rsp_if.tag),
.mem_rsp_ready (dcache_mem_rsp_if.ready)
);
if (`SM_ENABLE) begin
@ -252,45 +252,45 @@ module VX_mem_unit # (
VX_mem_arb #(
.NUM_REQS (2),
.DATA_WIDTH (`DDRAM_LINE_WIDTH),
.ADDR_WIDTH (`DDRAM_ADDR_WIDTH),
.TAG_IN_WIDTH (`DDRAM_TAG_WIDTH),
.TAG_OUT_WIDTH (`XDRAM_TAG_WIDTH),
.DATA_WIDTH (`DMEM_LINE_WIDTH),
.ADDR_WIDTH (`DMEM_ADDR_WIDTH),
.TAG_IN_WIDTH (`DMEM_TAG_WIDTH),
.TAG_OUT_WIDTH (`XMEM_TAG_WIDTH),
.BUFFERED_REQ (1),
.BUFFERED_RSP (0)
) dram_arb (
) mem_arb (
.clk (clk),
.reset (reset),
// Source request
.req_valid_in ({dcache_dram_req_if.valid, icache_dram_req_if.valid}),
.req_rw_in ({dcache_dram_req_if.rw, icache_dram_req_if.rw}),
.req_byteen_in ({dcache_dram_req_if.byteen, icache_dram_req_if.byteen}),
.req_addr_in ({dcache_dram_req_if.addr, icache_dram_req_if.addr}),
.req_data_in ({dcache_dram_req_if.data, icache_dram_req_if.data}),
.req_tag_in ({dcache_dram_req_if.tag, icache_dram_req_if.tag}),
.req_ready_in ({dcache_dram_req_if.ready, icache_dram_req_if.ready}),
.req_valid_in ({dcache_mem_req_if.valid, icache_mem_req_if.valid}),
.req_rw_in ({dcache_mem_req_if.rw, icache_mem_req_if.rw}),
.req_byteen_in ({dcache_mem_req_if.byteen, icache_mem_req_if.byteen}),
.req_addr_in ({dcache_mem_req_if.addr, icache_mem_req_if.addr}),
.req_data_in ({dcache_mem_req_if.data, icache_mem_req_if.data}),
.req_tag_in ({dcache_mem_req_if.tag, icache_mem_req_if.tag}),
.req_ready_in ({dcache_mem_req_if.ready, icache_mem_req_if.ready}),
// DRAM request
.req_valid_out (dram_req_if.valid),
.req_rw_out (dram_req_if.rw),
.req_byteen_out (dram_req_if.byteen),
.req_addr_out (dram_req_if.addr),
.req_data_out (dram_req_if.data),
.req_tag_out (dram_req_if.tag),
.req_ready_out (dram_req_if.ready),
// Memory request
.req_valid_out (mem_req_if.valid),
.req_rw_out (mem_req_if.rw),
.req_byteen_out (mem_req_if.byteen),
.req_addr_out (mem_req_if.addr),
.req_data_out (mem_req_if.data),
.req_tag_out (mem_req_if.tag),
.req_ready_out (mem_req_if.ready),
// Source response
.rsp_valid_out ({dcache_dram_rsp_if.valid, icache_dram_rsp_if.valid}),
.rsp_data_out ({dcache_dram_rsp_if.data, icache_dram_rsp_if.data}),
.rsp_tag_out ({dcache_dram_rsp_if.tag, icache_dram_rsp_if.tag}),
.rsp_ready_out ({dcache_dram_rsp_if.ready, icache_dram_rsp_if.ready}),
.rsp_valid_out ({dcache_mem_rsp_if.valid, icache_mem_rsp_if.valid}),
.rsp_data_out ({dcache_mem_rsp_if.data, icache_mem_rsp_if.data}),
.rsp_tag_out ({dcache_mem_rsp_if.tag, icache_mem_rsp_if.tag}),
.rsp_ready_out ({dcache_mem_rsp_if.ready, icache_mem_rsp_if.ready}),
// DRAM response
.rsp_valid_in (dram_rsp_if.valid),
.rsp_tag_in (dram_rsp_if.tag),
.rsp_data_in (dram_rsp_if.data),
.rsp_ready_in (dram_rsp_if.ready)
// Memory response
.rsp_valid_in (mem_rsp_if.valid),
.rsp_tag_in (mem_rsp_if.tag),
.rsp_data_in (mem_rsp_if.data),
.rsp_ready_in (mem_rsp_if.ready)
);
`ifdef PERF_ENABLE
@ -319,47 +319,47 @@ end else begin
assign perf_memsys_if.smem_bank_stalls = 0;
end
reg [43:0] perf_dram_lat_per_cycle;
reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle;
always @(posedge clk) begin
if (reset) begin
perf_dram_lat_per_cycle <= 0;
perf_mem_lat_per_cycle <= 0;
end else begin
perf_dram_lat_per_cycle <= perf_dram_lat_per_cycle +
44'($signed(2'((dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready) && !(dram_rsp_if.valid && dram_rsp_if.ready)) -
2'((dram_rsp_if.valid && dram_rsp_if.ready) && !(dram_req_if.valid && !dram_req_if.rw && dram_req_if.ready))));
perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle +
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready))));
end
end
reg [43:0] perf_dram_reads;
reg [43:0] perf_dram_writes;
reg [43:0] perf_dram_lat;
reg [43:0] perf_dram_stalls;
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
always @(posedge clk) begin
if (reset) begin
perf_dram_reads <= 0;
perf_dram_writes <= 0;
perf_dram_lat <= 0;
perf_dram_stalls <= 0;
perf_mem_reads <= 0;
perf_mem_writes <= 0;
perf_mem_lat <= 0;
perf_mem_stalls <= 0;
end else begin
if (dram_req_if.valid && dram_req_if.ready && !dram_req_if.rw) begin
perf_dram_reads <= perf_dram_reads + 44'd1;
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1;
end
if (dram_req_if.valid && dram_req_if.ready && dram_req_if.rw) begin
perf_dram_writes <= perf_dram_writes + 44'd1;
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1;
end
if (dram_req_if.valid && !dram_req_if.ready) begin
perf_dram_stalls <= perf_dram_stalls + 44'd1;
if (mem_req_if.valid && !mem_req_if.ready) begin
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1;
end
perf_dram_lat <= perf_dram_lat + perf_dram_lat_per_cycle;
perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle;
end
end
assign perf_memsys_if.dram_reads = perf_dram_reads;
assign perf_memsys_if.dram_writes = perf_dram_writes;
assign perf_memsys_if.dram_latency = perf_dram_lat;
assign perf_memsys_if.dram_stalls = perf_dram_stalls;
assign perf_memsys_if.mem_reads = perf_mem_reads;
assign perf_memsys_if.mem_writes = perf_mem_writes;
assign perf_memsys_if.mem_latency = perf_mem_lat;
assign perf_memsys_if.mem_stalls = perf_mem_stalls;
`endif
endmodule

View file

@ -70,6 +70,8 @@
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
`define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
`define ABS(x) (($signed(x) < 0) ? (-$signed(x)) : x);
`define MIN(x, y) ((x < y) ? (x) : (y))
`define MAX(x, y) ((x > y) ? (x) : (y))

View file

@ -31,7 +31,7 @@ module VX_scoreboard #(
if (release_reg) begin
inuse_regs[writeback_if.wid][writeback_if.rd] <= 0;
assert(inuse_regs[writeback_if.wid][writeback_if.rd] != 0)
else $error("*** %t: core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
else $error("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd);
end
end
@ -40,7 +40,7 @@ module VX_scoreboard #(
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin
$display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
$display("%t: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
$time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb,
deq_inuse_regs[ibuf_deq_if.rd], deq_inuse_regs[ibuf_deq_if.rs1], deq_inuse_regs[ibuf_deq_if.rs2], deq_inuse_regs[ibuf_deq_if.rs3]);
end
@ -54,7 +54,7 @@ module VX_scoreboard #(
deadlock_ctr <= 0;
end else if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin
deadlock_ctr <= deadlock_ctr + 1;
assert(deadlock_ctr < deadlock_timeout) else $error("*** %t: core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
assert(deadlock_ctr < deadlock_timeout) else $error("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
$time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.PC, ibuf_deq_if.rd, ibuf_deq_if.wb,
deq_inuse_regs[ibuf_deq_if.rd], deq_inuse_regs[ibuf_deq_if.rs1], deq_inuse_regs[ibuf_deq_if.rs2], deq_inuse_regs[ibuf_deq_if.rs3]);
end else if (ibuf_deq_if.valid && ibuf_deq_if.ready) begin

View file

@ -34,7 +34,7 @@ module VX_smem_arb (
wire is_smem_addr_in, is_smem_addr_out;
// select shared memory bus
assign is_smem_addr_in = core_req_if.valid[i] && `SM_ENABLE
assign is_smem_addr_in = `SM_ENABLE
&& (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] >= (32-SMEM_ASHIFT)'((`SHARED_MEM_BASE_ADDR - `SMEM_SIZE) >> SMEM_ASHIFT))
&& (core_req_if.addr[i][REQ_ADDRW-1:SMEM_ASHIFT-REQ_ASHIFT] < (32-SMEM_ASHIFT)'(`SHARED_MEM_BASE_ADDR >> SMEM_ASHIFT));
@ -51,13 +51,13 @@ module VX_smem_arb (
.ready_out (cache_req_ready_out)
);
if (`SM_ENABLE ) begin
if (`SM_ENABLE) begin
assign cache_req_if.valid[i] = cache_req_valid_out && ~is_smem_addr_out;
assign smem_req_if.valid[i] = cache_req_valid_out && is_smem_addr_out;
assign cache_req_ready_out = is_smem_addr_out ? smem_req_if.ready[i] : cache_req_if.ready[i];
assign smem_req_if.addr[i] = cache_req_if.addr[i];
assign smem_req_if.rw[i] = cache_req_if.rw[i];
assign smem_req_if.rw[i] = cache_req_if.rw[i];
assign smem_req_if.byteen[i] = cache_req_if.byteen[i];
assign smem_req_if.data[i] = cache_req_if.data[i];
assign smem_req_if.tag[i] = cache_req_if.tag[i];

View file

@ -7,20 +7,20 @@ module Vortex (
input wire clk,
input wire reset,
// DRAM request
output wire dram_req_valid,
output wire dram_req_rw,
output wire [`VX_DRAM_BYTEEN_WIDTH-1:0] dram_req_byteen,
output wire [`VX_DRAM_ADDR_WIDTH-1:0] dram_req_addr,
output wire [`VX_DRAM_LINE_WIDTH-1:0] dram_req_data,
output wire [`VX_DRAM_TAG_WIDTH-1:0] dram_req_tag,
input wire dram_req_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`VX_MEM_LINE_WIDTH-1:0] mem_req_data,
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// DRAM response
input wire dram_rsp_valid,
input wire [`VX_DRAM_LINE_WIDTH-1:0] dram_rsp_data,
input wire [`VX_DRAM_TAG_WIDTH-1:0] dram_rsp_tag,
output wire dram_rsp_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`VX_MEM_LINE_WIDTH-1:0] mem_rsp_data,
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// CSR Request
input wire csr_req_valid,
@ -40,18 +40,18 @@ module Vortex (
output wire ebreak
);
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_valid;
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_rw;
wire [`NUM_CLUSTERS-1:0][`L2DRAM_BYTEEN_WIDTH-1:0] per_cluster_dram_req_byteen;
wire [`NUM_CLUSTERS-1:0][`L2DRAM_ADDR_WIDTH-1:0] per_cluster_dram_req_addr;
wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_req_data;
wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_req_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_req_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw;
wire [`NUM_CLUSTERS-1:0][`L2MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen;
wire [`NUM_CLUSTERS-1:0][`L2MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr;
wire [`NUM_CLUSTERS-1:0][`L2MEM_LINE_WIDTH-1:0] per_cluster_mem_req_data;
wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_valid;
wire [`NUM_CLUSTERS-1:0][`L2DRAM_LINE_WIDTH-1:0] per_cluster_dram_rsp_data;
wire [`NUM_CLUSTERS-1:0][`L2DRAM_TAG_WIDTH-1:0] per_cluster_dram_rsp_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_valid;
wire [`NUM_CLUSTERS-1:0][`L2MEM_LINE_WIDTH-1:0] per_cluster_mem_rsp_data;
wire [`NUM_CLUSTERS-1:0][`L2MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready;
wire [`NUM_CLUSTERS-1:0] per_cluster_csr_req_valid;
wire [`NUM_CLUSTERS-1:0][11:0] per_cluster_csr_req_addr;
@ -88,18 +88,18 @@ module Vortex (
.clk (clk),
.reset (cluster_reset),
.dram_req_valid (per_cluster_dram_req_valid [i]),
.dram_req_rw (per_cluster_dram_req_rw [i]),
.dram_req_byteen(per_cluster_dram_req_byteen[i]),
.dram_req_addr (per_cluster_dram_req_addr [i]),
.dram_req_data (per_cluster_dram_req_data [i]),
.dram_req_tag (per_cluster_dram_req_tag [i]),
.dram_req_ready (per_cluster_dram_req_ready [i]),
.mem_req_valid (per_cluster_mem_req_valid [i]),
.mem_req_rw (per_cluster_mem_req_rw [i]),
.mem_req_byteen (per_cluster_mem_req_byteen[i]),
.mem_req_addr (per_cluster_mem_req_addr [i]),
.mem_req_data (per_cluster_mem_req_data [i]),
.mem_req_tag (per_cluster_mem_req_tag [i]),
.mem_req_ready (per_cluster_mem_req_ready [i]),
.dram_rsp_valid (per_cluster_dram_rsp_valid [i]),
.dram_rsp_data (per_cluster_dram_rsp_data [i]),
.dram_rsp_tag (per_cluster_dram_rsp_tag [i]),
.dram_rsp_ready (per_cluster_dram_rsp_ready [i]),
.mem_rsp_valid (per_cluster_mem_rsp_valid [i]),
.mem_rsp_data (per_cluster_mem_rsp_data [i]),
.mem_rsp_tag (per_cluster_mem_rsp_tag [i]),
.mem_rsp_ready (per_cluster_mem_rsp_ready [i]),
.csr_req_valid (per_cluster_csr_req_valid [i]),
.csr_req_coreid (csr_core_id),
@ -171,12 +171,12 @@ module Vortex (
.NUM_REQS (`NUM_CLUSTERS),
.CREQ_SIZE (`L3CREQ_SIZE),
.MSHR_SIZE (`L3MSHR_SIZE),
.DRSQ_SIZE (`L3DRSQ_SIZE),
.DREQ_SIZE (`L3DREQ_SIZE),
.MRSQ_SIZE (`L3MRSQ_SIZE),
.MREQ_SIZE (`L3MREQ_SIZE),
.WRITE_ENABLE (1),
.CORE_TAG_WIDTH (`L2DRAM_TAG_WIDTH),
.CORE_TAG_WIDTH (`L2MEM_TAG_WIDTH),
.CORE_TAG_ID_BITS (0),
.DRAM_TAG_WIDTH (`L3DRAM_TAG_WIDTH)
.MEM_TAG_WIDTH (`L3MEM_TAG_WIDTH)
) l3cache (
`SCOPE_BIND_Vortex_l3cache
@ -190,105 +190,105 @@ module Vortex (
`endif
// Core request
.core_req_valid (per_cluster_dram_req_valid),
.core_req_rw (per_cluster_dram_req_rw),
.core_req_byteen (per_cluster_dram_req_byteen),
.core_req_addr (per_cluster_dram_req_addr),
.core_req_data (per_cluster_dram_req_data),
.core_req_tag (per_cluster_dram_req_tag),
.core_req_ready (per_cluster_dram_req_ready),
.core_req_valid (per_cluster_mem_req_valid),
.core_req_rw (per_cluster_mem_req_rw),
.core_req_byteen (per_cluster_mem_req_byteen),
.core_req_addr (per_cluster_mem_req_addr),
.core_req_data (per_cluster_mem_req_data),
.core_req_tag (per_cluster_mem_req_tag),
.core_req_ready (per_cluster_mem_req_ready),
// Core response
.core_rsp_valid (per_cluster_dram_rsp_valid),
.core_rsp_data (per_cluster_dram_rsp_data),
.core_rsp_tag (per_cluster_dram_rsp_tag),
.core_rsp_ready (per_cluster_dram_rsp_ready),
.core_rsp_valid (per_cluster_mem_rsp_valid),
.core_rsp_data (per_cluster_mem_rsp_data),
.core_rsp_tag (per_cluster_mem_rsp_tag),
.core_rsp_ready (per_cluster_mem_rsp_ready),
// DRAM request
.dram_req_valid (dram_req_valid),
.dram_req_rw (dram_req_rw),
.dram_req_byteen (dram_req_byteen),
.dram_req_addr (dram_req_addr),
.dram_req_data (dram_req_data),
.dram_req_tag (dram_req_tag),
.dram_req_ready (dram_req_ready),
// Memory request
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
// DRAM response
.dram_rsp_valid (dram_rsp_valid),
.dram_rsp_data (dram_rsp_data),
.dram_rsp_tag (dram_rsp_tag),
.dram_rsp_ready (dram_rsp_ready)
// Memory response
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready)
);
end else begin
VX_mem_arb #(
.NUM_REQS (`NUM_CLUSTERS),
.DATA_WIDTH (`L3DRAM_LINE_WIDTH),
.TAG_IN_WIDTH (`L2DRAM_TAG_WIDTH),
.TAG_OUT_WIDTH (`L3DRAM_TAG_WIDTH),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) dram_arb (
.NUM_REQS (`NUM_CLUSTERS),
.DATA_WIDTH (`L3MEM_LINE_WIDTH),
.TAG_IN_WIDTH (`L2MEM_TAG_WIDTH),
.TAG_OUT_WIDTH (`L3MEM_TAG_WIDTH),
.BUFFERED_REQ (1),
.BUFFERED_RSP (1)
) mem_arb (
.clk (clk),
.reset (reset),
// Core request
.req_valid_in (per_cluster_dram_req_valid),
.req_rw_in (per_cluster_dram_req_rw),
.req_byteen_in (per_cluster_dram_req_byteen),
.req_addr_in (per_cluster_dram_req_addr),
.req_data_in (per_cluster_dram_req_data),
.req_tag_in (per_cluster_dram_req_tag),
.req_ready_in (per_cluster_dram_req_ready),
.req_valid_in (per_cluster_mem_req_valid),
.req_rw_in (per_cluster_mem_req_rw),
.req_byteen_in (per_cluster_mem_req_byteen),
.req_addr_in (per_cluster_mem_req_addr),
.req_data_in (per_cluster_mem_req_data),
.req_tag_in (per_cluster_mem_req_tag),
.req_ready_in (per_cluster_mem_req_ready),
// DRAM request
.req_valid_out (dram_req_valid),
.req_rw_out (dram_req_rw),
.req_byteen_out (dram_req_byteen),
.req_addr_out (dram_req_addr),
.req_data_out (dram_req_data),
.req_tag_out (dram_req_tag),
.req_ready_out (dram_req_ready),
// Memory request
.req_valid_out (mem_req_valid),
.req_rw_out (mem_req_rw),
.req_byteen_out (mem_req_byteen),
.req_addr_out (mem_req_addr),
.req_data_out (mem_req_data),
.req_tag_out (mem_req_tag),
.req_ready_out (mem_req_ready),
// Core response
.rsp_valid_out (per_cluster_dram_rsp_valid),
.rsp_data_out (per_cluster_dram_rsp_data),
.rsp_tag_out (per_cluster_dram_rsp_tag),
.rsp_ready_out (per_cluster_dram_rsp_ready),
.rsp_valid_out (per_cluster_mem_rsp_valid),
.rsp_data_out (per_cluster_mem_rsp_data),
.rsp_tag_out (per_cluster_mem_rsp_tag),
.rsp_ready_out (per_cluster_mem_rsp_ready),
// DRAM response
.rsp_valid_in (dram_rsp_valid),
.rsp_tag_in (dram_rsp_tag),
.rsp_data_in (dram_rsp_data),
.rsp_ready_in (dram_rsp_ready)
// Memory response
.rsp_valid_in (mem_rsp_valid),
.rsp_tag_in (mem_rsp_tag),
.rsp_data_in (mem_rsp_data),
.rsp_ready_in (mem_rsp_ready)
);
end
`SCOPE_ASSIGN (reset, reset);
`SCOPE_ASSIGN (dram_req_fire, dram_req_valid && dram_req_ready);
`SCOPE_ASSIGN (dram_req_addr, `TO_FULL_ADDR(dram_req_addr));
`SCOPE_ASSIGN (dram_req_rw, dram_req_rw);
`SCOPE_ASSIGN (dram_req_byteen, dram_req_byteen);
`SCOPE_ASSIGN (dram_req_data, dram_req_data);
`SCOPE_ASSIGN (dram_req_tag, dram_req_tag);
`SCOPE_ASSIGN (dram_rsp_fire, dram_rsp_valid && dram_rsp_ready);
`SCOPE_ASSIGN (dram_rsp_data, dram_rsp_data);
`SCOPE_ASSIGN (dram_rsp_tag, dram_rsp_tag);
`SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready);
`SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr));
`SCOPE_ASSIGN (mem_req_rw, mem_req_rw);
`SCOPE_ASSIGN (mem_req_byteen, mem_req_byteen);
`SCOPE_ASSIGN (mem_req_data, mem_req_data);
`SCOPE_ASSIGN (mem_req_tag, mem_req_tag);
`SCOPE_ASSIGN (mem_rsp_fire, mem_rsp_valid && mem_rsp_ready);
`SCOPE_ASSIGN (mem_rsp_data, mem_rsp_data);
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
`SCOPE_ASSIGN (busy, busy);
`ifdef DBG_PRINT_DRAM
`ifdef DBG_PRINT_MEM
always @(posedge clk) begin
if (dram_req_valid && dram_req_ready) begin
if (dram_req_rw)
$display("%t: DRAM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_tag, dram_req_byteen, dram_req_data);
if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw)
$display("%t: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data);
else
$display("%t: DRAM Rd Req: addr=%0h, tag=%0h, byteen=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_tag, dram_req_byteen);
$display("%t: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen);
end
if (dram_rsp_valid && dram_rsp_ready) begin
$display("%t: DRAM Rsp: tag=%0h, data=%0h", $time, dram_rsp_tag, dram_rsp_data);
if (mem_rsp_valid && mem_rsp_ready) begin
$display("%t: MEM Rsp: tag=%0h, data=%0h", $time, mem_rsp_tag, mem_rsp_data);
end
end
`endif

View file

@ -1,133 +1,166 @@
`include "VX_define.vh"
module VX_avs_wrapper #(
parameter AVS_DATAW = 1,
parameter AVS_ADDRW = 1,
parameter AVS_BURSTW = 1,
parameter AVS_BANKS = 1,
parameter REQ_TAGW = 1,
parameter RD_QUEUE_SIZE = 1,
parameter NUM_BANKS = 1,
parameter AVS_DATA_WIDTH = 1,
parameter AVS_ADDR_WIDTH = 1,
parameter AVS_BURST_WIDTH = 1,
parameter AVS_BANKS = 1,
parameter REQ_TAG_WIDTH = 1,
parameter RD_QUEUE_SIZE = 1,
parameter AVS_BYTEENW = (AVS_DATAW / 8),
parameter RD_QUEUE_ADDRW= $clog2(RD_QUEUE_SIZE+1),
parameter AVS_BANKS_BITS= $clog2(AVS_BANKS)
parameter AVS_BYTEENW = (AVS_DATA_WIDTH / 8),
parameter RD_QUEUE_ADDR_WIDTH = $clog2(RD_QUEUE_SIZE+1),
parameter AVS_BANKS_BITS = $clog2(AVS_BANKS)
) (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Memory request
input wire mem_req_valid,
input wire mem_req_rw,
input wire [AVS_BYTEENW-1:0] mem_req_byteen,
input wire [AVS_ADDR_WIDTH-1:0] mem_req_addr,
input wire [AVS_DATA_WIDTH-1:0] mem_req_data,
input wire [REQ_TAG_WIDTH-1:0] mem_req_tag,
output wire mem_req_ready,
// Memory response
output wire mem_rsp_valid,
output wire [AVS_DATA_WIDTH-1:0] mem_rsp_data,
output wire [REQ_TAG_WIDTH-1:0] mem_rsp_tag,
input wire mem_rsp_ready,
// AVS bus
output wire [AVS_DATAW-1:0] avs_writedata,
input wire [AVS_DATAW-1:0] avs_readdata,
output wire [AVS_ADDRW-1:0] avs_address,
input wire avs_waitrequest,
output wire avs_write,
output wire avs_read,
output wire [AVS_BYTEENW-1:0] avs_byteenable,
output wire [AVS_BURSTW-1:0] avs_burstcount,
input avs_readdatavalid,
output wire [AVS_BANKS_BITS-1:0] avs_bankselect,
// DRAM request
input wire dram_req_valid,
input wire dram_req_rw,
input wire [AVS_BYTEENW-1:0] dram_req_byteen,
input wire [AVS_ADDRW-1:0] dram_req_addr,
input wire [AVS_DATAW-1:0] dram_req_data,
input wire [REQ_TAGW-1:0] dram_req_tag,
output wire dram_req_ready,
// DRAM response
output wire dram_rsp_valid,
output wire [AVS_DATAW-1:0] dram_rsp_data,
output wire [REQ_TAGW-1:0] dram_rsp_tag,
input wire dram_rsp_ready
output wire [AVS_DATA_WIDTH-1:0] avs_writedata [NUM_BANKS],
input wire [AVS_DATA_WIDTH-1:0] avs_readdata [NUM_BANKS],
output wire [AVS_ADDR_WIDTH-1:0] avs_address [NUM_BANKS],
input wire avs_waitrequest [NUM_BANKS],
output wire avs_write [NUM_BANKS],
output wire avs_read [NUM_BANKS],
output wire [AVS_BYTEENW-1:0] avs_byteenable [NUM_BANKS],
output wire [AVS_BURST_WIDTH-1:0] avs_burstcount [NUM_BANKS],
input avs_readdatavalid [NUM_BANKS]
);
reg [AVS_BANKS_BITS-1:0] avs_bankselect_r;
reg [AVS_BURSTW-1:0] avs_burstcount_r;
wire avs_reqq_push = dram_req_valid && dram_req_ready && !dram_req_rw;
wire avs_reqq_pop = dram_rsp_valid && dram_rsp_ready;
localparam BANK_ADDRW = `LOG2UP(NUM_BANKS);
wire avs_rspq_push = avs_readdatavalid;
wire avs_rspq_pop = avs_reqq_pop;
wire avs_rspq_empty;
wire rsp_queue_going_full;
wire [RD_QUEUE_ADDRW-1:0] rsp_queue_size;
VX_pending_size #(
.SIZE (RD_QUEUE_SIZE)
) pending_size (
.clk (clk),
.reset (reset),
.push (avs_reqq_push),
.pop (avs_rspq_pop),
`UNUSED_PIN (empty),
.full (rsp_queue_going_full),
.size (rsp_queue_size)
);
`UNUSED_VAR (rsp_queue_size)
always @(posedge clk) begin
avs_burstcount_r <= 1;
avs_bankselect_r <= 0;
end
// Requests handling
VX_fifo_queue #(
.DATAW (REQ_TAGW),
.SIZE (RD_QUEUE_SIZE)
) rd_req_queue (
.clk (clk),
.reset (reset),
.push (avs_reqq_push),
.pop (avs_reqq_pop),
.data_in (dram_req_tag),
.data_out (dram_rsp_tag),
`UNUSED_PIN (empty),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
wire [NUM_BANKS-1:0] avs_reqq_push, avs_reqq_pop, avs_reqq_ready;
wire [NUM_BANKS-1:0] req_queue_going_full;
wire [NUM_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size;
wire [NUM_BANKS-1:0][REQ_TAG_WIDTH-1:0] avs_reqq_data_out;
wire [BANK_ADDRW-1:0] req_bank_sel = (NUM_BANKS >= 2) ? mem_req_addr[BANK_ADDRW-1:0] : '0;
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign avs_reqq_ready[i] = !req_queue_going_full[i] && !avs_waitrequest[i];
assign avs_reqq_push[i] = mem_req_valid && !mem_req_rw && avs_reqq_ready[i] && (req_bank_sel == i);
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
VX_pending_size #(
.SIZE (RD_QUEUE_SIZE)
) pending_size (
.clk (clk),
.reset (reset),
.push (avs_reqq_push[i]),
.pop (avs_reqq_pop[i]),
.full (req_queue_going_full[i]),
.size (req_queue_size[i]),
`UNUSED_PIN (empty)
);
`UNUSED_VAR (req_queue_size)
VX_fifo_queue #(
.DATAW (REQ_TAG_WIDTH),
.SIZE (RD_QUEUE_SIZE)
) rd_req_queue (
.clk (clk),
.reset (reset),
.push (avs_reqq_push[i]),
.pop (avs_reqq_pop[i]),
.data_in (mem_req_tag),
.data_out (avs_reqq_data_out[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign avs_read[i] = mem_req_valid && !mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
assign avs_write[i] = mem_req_valid && mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
assign avs_address[i] = mem_req_addr;
assign avs_byteenable[i] = mem_req_byteen;
assign avs_writedata[i] = mem_req_data;
assign avs_burstcount[i] = AVS_BURST_WIDTH'(1);
end
assign mem_req_ready = avs_reqq_ready[req_bank_sel];
// Responses handling
wire [NUM_BANKS-1:0] rsp_arb_valid_in;
wire [NUM_BANKS-1:0][AVS_DATA_WIDTH+REQ_TAG_WIDTH-1:0] rsp_arb_data_in;
wire [NUM_BANKS-1:0] rsp_arb_ready_in;
wire [NUM_BANKS-1:0][AVS_DATA_WIDTH-1:0] avs_rspq_data_out;
wire [NUM_BANKS-1:0] avs_rspq_empty;
for (genvar i = 0; i < NUM_BANKS; i++) begin
VX_fifo_queue #(
.DATAW (AVS_DATA_WIDTH),
.SIZE (RD_QUEUE_SIZE)
) rd_rsp_queue (
.clk (clk),
.reset (reset),
.push (avs_readdatavalid[i]),
.pop (avs_reqq_pop[i]),
.data_in (avs_readdata[i]),
.data_out (avs_rspq_data_out[i]),
.empty (avs_rspq_empty[i]),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign rsp_arb_valid_in[i] = !avs_rspq_empty[i];
assign rsp_arb_data_in[i] = {avs_rspq_data_out[i], avs_reqq_data_out[i]};
assign avs_reqq_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i];
end
VX_stream_arbiter #(
.NUM_REQS (NUM_BANKS),
.DATAW (AVS_DATA_WIDTH + REQ_TAG_WIDTH),
.BUFFERED (NUM_BANKS > 2)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_arb_valid_in),
.data_in (rsp_arb_data_in),
.ready_in (rsp_arb_ready_in),
.valid_out (mem_rsp_valid),
.data_out ({mem_rsp_data, mem_rsp_tag}),
.ready_out (mem_rsp_ready)
);
VX_fifo_queue #(
.DATAW (AVS_DATAW),
.SIZE (RD_QUEUE_SIZE)
) rd_rsp_queue (
.clk (clk),
.reset (reset),
.push (avs_rspq_push),
.pop (avs_rspq_pop),
.data_in (avs_readdata),
.data_out (dram_rsp_data),
.empty (avs_rspq_empty),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
assign avs_read = dram_req_valid && !dram_req_rw && !rsp_queue_going_full;
assign avs_write = dram_req_valid && dram_req_rw && !rsp_queue_going_full;
assign avs_address = dram_req_addr;
assign avs_byteenable = dram_req_byteen;
assign avs_writedata = dram_req_data;
assign avs_burstcount = avs_burstcount_r;
assign avs_bankselect = avs_bankselect_r;
assign dram_req_ready = !avs_waitrequest && !rsp_queue_going_full;
assign dram_rsp_valid = !avs_rspq_empty;
`ifdef DBG_PRINT_AVS
always @(posedge clk) begin
if (dram_req_valid && dram_req_ready) begin
if (dram_req_rw)
$display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, dram_req_data);
if (mem_req_valid && mem_req_ready) begin
if (mem_req_rw)
$display("%t: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, mem_req_data);
else
$display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `TO_FULL_ADDR(dram_req_addr), dram_req_byteen, dram_req_tag, rsp_queue_size);
$display("%t: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, req_queue_size);
end
if (dram_rsp_valid && dram_rsp_ready) begin
$display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, dram_rsp_tag, dram_rsp_data, rsp_queue_size);
if (mem_rsp_valid && mem_rsp_ready) begin
$display("%t: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d", $time, mem_rsp_tag, mem_rsp_data, req_queue_size);
end
end
`endif

178
hw/rtl/afu/VX_to_mem.v Normal file
View file

@ -0,0 +1,178 @@
`include "VX_define.vh"
module VX_to_mem #(
parameter SRC_DATA_WIDTH = 1,
parameter SRC_ADDR_WIDTH = 1,
parameter DST_DATA_WIDTH = 1,
parameter DST_ADDR_WIDTH = 1,
parameter SRC_TAG_WIDTH = 1,
parameter DST_TAG_WIDTH = 1,
parameter SRC_DATA_SIZE = (SRC_DATA_WIDTH / 8),
parameter DST_DATA_SIZE = (DST_DATA_WIDTH / 8)
) (
input wire clk,
input wire reset,
input wire mem_req_valid_in,
input wire [SRC_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire mem_req_rw_in,
input wire [SRC_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [SRC_DATA_WIDTH-1:0] mem_req_data_in,
input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
output wire mem_req_valid_out,
output wire [DST_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire mem_req_rw_out,
output wire [DST_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [DST_DATA_WIDTH-1:0] mem_req_data_out,
output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
input wire mem_rsp_valid_in,
input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_in,
input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_ready_in,
output wire mem_rsp_valid_out,
output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_out,
output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_out,
input wire mem_rsp_ready_out
);
`STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))
localparam DST_LDATAW = $clog2(DST_DATA_WIDTH);
localparam SRC_LDATAW = $clog2(SRC_DATA_WIDTH);
localparam D = `ABS(DST_LDATAW - SRC_LDATAW);
localparam P = 2**D;
`UNUSED_VAR (mem_rsp_tag_in)
if (DST_LDATAW > SRC_LDATAW) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
wire [D-1:0] req_idx = mem_req_addr_in[D-1:0];
wire [D-1:0] rsp_idx = mem_rsp_tag_in[D-1:0];
wire [SRC_ADDR_WIDTH-D-1:0] mem_req_addr_in_qual = mem_req_addr_in[SRC_ADDR_WIDTH-1:D];
wire [P-1:0][SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w = mem_rsp_data_in;
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH - D)) begin
`UNUSED_VAR (mem_req_addr_in_qual)
assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH - D)) begin
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
end else begin
assign mem_req_addr_out = mem_req_addr_in_qual;
end
assign mem_req_valid_out = mem_req_valid_in;
assign mem_req_rw_out = mem_req_rw_in;
assign mem_req_byteen_out = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
assign mem_req_data_out = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW);
assign mem_req_tag_out = DST_TAG_WIDTH'({mem_req_tag_in, req_idx});
assign mem_req_ready_in = mem_req_ready_out;
assign mem_rsp_valid_out = mem_rsp_valid_in;
assign mem_rsp_data_out = mem_rsp_data_in_w[rsp_idx];
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in[SRC_TAG_WIDTH+D-1:D]);
assign mem_rsp_ready_in = mem_rsp_ready_out;
end else if (DST_LDATAW < SRC_LDATAW) begin
reg [D-1:0] req_ctr, rsp_ctr;
reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n;
wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out;
wire mem_rsp_in_fire = mem_rsp_valid_in && mem_rsp_ready_in;
wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in;
wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in;
always @(*) begin
mem_rsp_data_out_n = mem_rsp_data_out_r;
mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_in;
end
always @(posedge clk) begin
if (reset) begin
req_ctr <= 0;
rsp_ctr <= 0;
end else begin
if (mem_req_out_fire) begin
req_ctr <= req_ctr + 1;
end
if (mem_rsp_in_fire) begin
rsp_ctr <= rsp_ctr + 1;
mem_rsp_data_out_r <= mem_rsp_data_out_n;
end
end
end
reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r;
wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_w;
always @(posedge clk) begin
if (mem_rsp_in_fire) begin
mem_rsp_tag_in_r <= mem_rsp_tag_in;
end
end
assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in;
`RUNTIME_ASSERT((mem_rsp_tag_in_w == mem_rsp_tag_in), ("oops!"))
wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin
`UNUSED_VAR (mem_req_addr_in_qual)
assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH + D)) begin
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
end else begin
assign mem_req_addr_out = mem_req_addr_in_qual;
end
assign mem_req_valid_out = mem_req_valid_in;
assign mem_req_rw_out = mem_req_rw_in;
assign mem_req_byteen_out = mem_req_byteen_in_w[req_ctr];
assign mem_req_data_out = mem_req_data_in_w[req_ctr];
assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in);
assign mem_req_ready_in = mem_req_ready_out && (req_ctr == (P-1));
assign mem_rsp_valid_out = mem_rsp_valid_in && (rsp_ctr == (P-1));
assign mem_rsp_data_out = mem_rsp_data_out_n;
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in);
assign mem_rsp_ready_in = mem_rsp_ready_out;
end else begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin
`UNUSED_VAR (mem_req_addr_in)
assign mem_req_addr_out = mem_req_addr_in[DST_ADDR_WIDTH-1:0];
end else if (DST_ADDR_WIDTH > SRC_ADDR_WIDTH) begin
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in);
end else begin
assign mem_req_addr_out = mem_req_addr_in;
end
assign mem_req_valid_out = mem_req_valid_in;
assign mem_req_rw_out = mem_req_rw_in;
assign mem_req_byteen_out = mem_req_byteen_in;
assign mem_req_data_out = mem_req_data_in;
assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in);
assign mem_req_ready_in = mem_req_ready_out;
assign mem_rsp_valid_out = mem_rsp_valid_in;
assign mem_rsp_data_out = mem_rsp_data_in;
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in);
assign mem_rsp_ready_in = mem_rsp_ready_out;
end
endmodule

View file

@ -77,30 +77,28 @@ module ccip_std_afu #(
// User AFU goes here
// ====================================================================
//
// vortex_afu depends on CCI-P and local memory being in the same
// clock domain. This is accomplished by choosing a common clock
// in the AFU's JSON description. The platform instantiates clock-
// crossing shims automatically, as needed.
//
t_local_mem_byte_mask avs_byteenable [NUM_LOCAL_MEM_BANKS];
logic avs_waitrequest [NUM_LOCAL_MEM_BANKS];
t_local_mem_data avs_readdata [NUM_LOCAL_MEM_BANKS];
logic avs_readdatavalid [NUM_LOCAL_MEM_BANKS];
t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS];
t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS];
t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS];
logic avs_write [NUM_LOCAL_MEM_BANKS];
logic avs_read [NUM_LOCAL_MEM_BANKS];
//
// Memory banks are used very simply here. Only bank is active at
// a time, selected by mem_bank_select. mem_bank_select is set
// by a CSR from the host.
//
t_local_mem_byte_mask avs_byteenable;
logic avs_waitrequest;
t_local_mem_data avs_readdata;
logic avs_readdatavalid;
t_local_mem_burst_cnt avs_burstcount;
t_local_mem_data avs_writedata;
t_local_mem_addr avs_address;
logic avs_write;
logic avs_read;
// choose which memory bank to test
logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select;
for (genvar b = 0; b < NUM_LOCAL_MEM_BANKS; b++) begin
assign local_mem[b].burstcount = avs_burstcount[b];
assign local_mem[b].writedata = avs_writedata[b];
assign local_mem[b].address = avs_address[b];
assign local_mem[b].byteenable = avs_byteenable[b];
assign local_mem[b].write = avs_write[b];
assign local_mem[b].read = avs_read[b];
assign avs_waitrequest[b] = local_mem[b].waitrequest;
assign avs_readdata[b] = local_mem[b].readdata;
assign avs_readdatavalid[b] = local_mem[b].readdatavalid;
end
vortex_afu #(
.NUM_LOCAL_MEM_BANKS(NUM_LOCAL_MEM_BANKS)
@ -108,6 +106,9 @@ module ccip_std_afu #(
.clk (clk),
.reset (reset_T1),
.cp2af_sRxPort (cp2af_sRx_T1),
.af2cp_sTxPort (af2cp_sTx_T0),
.avs_writedata (avs_writedata),
.avs_readdata (avs_readdata),
.avs_address (avs_address),
@ -116,52 +117,7 @@ module ccip_std_afu #(
.avs_read (avs_read),
.avs_byteenable (avs_byteenable),
.avs_burstcount (avs_burstcount),
.avs_readdatavalid (avs_readdatavalid),
.mem_bank_select (mem_bank_select),
.cp2af_sRxPort (cp2af_sRx_T1),
.af2cp_sTxPort (af2cp_sTx_T0)
);
//
// Export the local memory interface signals as vectors so that bank
// selection can use array syntax.
//
logic avs_waitrequest_v[NUM_LOCAL_MEM_BANKS];
t_local_mem_data avs_readdata_v[NUM_LOCAL_MEM_BANKS];
logic avs_readdatavalid_v[NUM_LOCAL_MEM_BANKS];
genvar b;
generate
for (b = 0; b < NUM_LOCAL_MEM_BANKS; b = b + 1)
begin : lmb
always_comb
begin
// Local memory to AFU signals
avs_waitrequest_v[b] = local_mem[b].waitrequest;
avs_readdata_v[b] = local_mem[b].readdata;
avs_readdatavalid_v[b] = local_mem[b].readdatavalid;
// Replicate address and write data to all banks. Only
// the request signals have to be bank-specific.
local_mem[b].burstcount = avs_burstcount;
local_mem[b].writedata = avs_writedata;
local_mem[b].address = avs_address;
local_mem[b].byteenable = avs_byteenable;
// Request a write to this bank?
local_mem[b].write = avs_write &&
($bits(mem_bank_select)'(b) == mem_bank_select);
// Request a read from this bank?
local_mem[b].read = avs_read &&
($bits(mem_bank_select)'(b) == mem_bank_select);
end
end
endgenerate
assign avs_waitrequest = avs_waitrequest_v[mem_bank_select];
assign avs_readdata = avs_readdata_v[mem_bank_select];
assign avs_readdatavalid = avs_readdatavalid_v[mem_bank_select];
.avs_readdatavalid (avs_readdatavalid)
);
endmodule

View file

@ -1,13 +1,18 @@
`include "VX_define.vh"
`ifndef NOPAE
`include "afu_json_info.vh"
`else
`include "VX_platform.vh"
`ifdef NOPAE
`IGNORE_WARNINGS_BEGIN
`include "vortex_afu.vh"
`IGNORE_WARNINGS_END
`else
`include "afu_json_info.vh"
`endif
/* verilator lint_off IMPORTSTAR */
import ccip_if_pkg::*;
import local_mem_cfg_pkg::*;
/* verilator lint_on IMPORTSTAR */
/* verilator lint_on IMPORTSTAR */
`include "VX_define.vh"
module vortex_afu #(
parameter NUM_LOCAL_MEM_BANKS = 2
@ -21,30 +26,32 @@ module vortex_afu #(
output t_if_ccip_Tx af2cp_sTxPort,
// Avalon signals for local memory access
output t_local_mem_data avs_writedata,
input t_local_mem_data avs_readdata,
output t_local_mem_addr avs_address,
input logic avs_waitrequest,
output logic avs_write,
output logic avs_read,
output t_local_mem_byte_mask avs_byteenable,
output t_local_mem_burst_cnt avs_burstcount,
input avs_readdatavalid,
output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
output t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS],
input t_local_mem_data avs_readdata [NUM_LOCAL_MEM_BANKS],
output t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS],
input logic avs_waitrequest [NUM_LOCAL_MEM_BANKS],
output logic avs_write [NUM_LOCAL_MEM_BANKS],
output logic avs_read [NUM_LOCAL_MEM_BANKS],
output t_local_mem_byte_mask avs_byteenable [NUM_LOCAL_MEM_BANKS],
output t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS],
input avs_readdatavalid [NUM_LOCAL_MEM_BANKS]
);
localparam RESET_DELAY = 3;
localparam DRAM_ADDR_WIDTH = $bits(t_local_mem_addr);
localparam DRAM_LINE_WIDTH = $bits(t_local_mem_data);
localparam DRAM_LINE_LW = $clog2(DRAM_LINE_WIDTH);
localparam LMEM_LINE_WIDTH = $bits(t_local_mem_data);
localparam LMEM_ADDR_WIDTH = $bits(t_local_mem_addr);
localparam LMEM_BURST_CTRW = $bits(t_local_mem_burst_cnt);
localparam VX_DRAM_LINE_LW = $clog2(`VX_DRAM_LINE_WIDTH);
localparam VX_DRAM_LINE_IDX = (DRAM_LINE_LW - VX_DRAM_LINE_LW);
localparam CCI_LINE_WIDTH = $bits(t_ccip_clData);
localparam CCI_LINE_SIZE = CCI_LINE_WIDTH / 8;
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_LINE_WIDTH / 8);
localparam AVS_RD_QUEUE_SIZE = 16;
localparam AVS_REQ_TAGW = `VX_DRAM_TAG_WIDTH + VX_DRAM_LINE_IDX;
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, `VX_MEM_TAG_WIDTH + $clog2(LMEM_LINE_WIDTH) - $clog2(`VX_MEM_LINE_WIDTH));
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LMEM_LINE_WIDTH) - $clog2(CCI_LINE_WIDTH));
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI);
localparam CCI_RD_WINDOW_SIZE = 8;
localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE;
@ -74,7 +81,7 @@ localparam MMIO_CSR_DATA = `AFU_IMAGE_MMIO_CSR_DATA;
localparam MMIO_CSR_READ = `AFU_IMAGE_MMIO_CSR_READ;
localparam CCI_RD_RQ_TAGW = $clog2(CCI_RD_WINDOW_SIZE);
localparam CCI_RD_RQ_DATAW = $bits(t_ccip_clData) + CCI_RD_RQ_TAGW;
localparam CCI_RD_RQ_DATAW = CCI_LINE_WIDTH + CCI_RD_RQ_TAGW;
localparam STATE_IDLE = 0;
localparam STATE_READ = 1;
@ -96,18 +103,18 @@ reg [STATE_WIDTH-1:0] state;
// Vortex ports ///////////////////////////////////////////////////////////////
wire vx_dram_req_valid;
wire vx_dram_req_rw;
wire [`VX_DRAM_BYTEEN_WIDTH-1:0] vx_dram_req_byteen;
wire [`VX_DRAM_ADDR_WIDTH-1:0] vx_dram_req_addr;
wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_req_data;
wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_req_tag;
wire vx_dram_req_ready;
wire vx_mem_req_valid;
wire vx_mem_req_rw;
wire [`VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen;
wire [`VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr;
wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_req_data;
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag;
wire vx_mem_req_ready;
wire vx_dram_rsp_valid;
wire [`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data;
wire [`VX_DRAM_TAG_WIDTH-1:0] vx_dram_rsp_tag;
wire vx_dram_rsp_ready;
wire vx_mem_rsp_valid;
wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag;
wire vx_mem_rsp_ready;
wire vx_csr_io_req_valid;
wire [`VX_CSR_ID_WIDTH-1:0] vx_csr_io_req_coreid;
@ -123,13 +130,13 @@ wire vx_csr_io_rsp_ready;
wire vx_busy;
reg vx_reset;
reg vx_dram_en;
reg vx_mem_en;
// CMD variables //////////////////////////////////////////////////////////////
t_ccip_clAddr cmd_io_addr;
reg [DRAM_ADDR_WIDTH-1:0] cmd_mem_addr;
reg [DRAM_ADDR_WIDTH-1:0] cmd_data_size;
reg [CCI_ADDR_WIDTH-1:0] cmd_mem_addr;
reg [CCI_ADDR_WIDTH-1:0] cmd_data_size;
`ifdef SCOPE
wire [63:0] cmd_scope_rdata;
@ -216,9 +223,9 @@ always @(posedge clk) begin
`endif
end
MMIO_MEM_ADDR: begin
cmd_mem_addr <= t_local_mem_addr'(cp2af_sRxPort.c0.data);
cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data);
`ifdef DBG_PRINT_OPAE
$display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, t_local_mem_addr'(cp2af_sRxPort.c0.data));
$display("%t: MMIO_MEM_ADDR: addr=%0h, data=0x%0h", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data));
`endif
end
MMIO_DATA_SIZE: begin
@ -335,7 +342,7 @@ always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
vx_reset <= 0;
vx_dram_en <= 0;
vx_mem_en <= 0;
end else begin
case (state)
STATE_IDLE: begin
@ -399,14 +406,14 @@ always @(posedge clk) begin
// vortex reset cycles
if (vx_reset_ctr == $bits(vx_reset_ctr)'(RESET_DELAY)) begin
vx_reset <= 0;
vx_dram_en <= 1;
vx_mem_en <= 1;
state <= STATE_RUN;
end
end
STATE_RUN: begin
if (cmd_run_done) begin
vx_dram_en <= 0;
vx_mem_en <= 0;
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
@ -442,187 +449,251 @@ end
// AVS Controller /////////////////////////////////////////////////////////////
wire dram_req_valid;
wire dram_req_rw;
t_local_mem_byte_mask dram_req_byteen;
t_local_mem_addr dram_req_addr;
t_local_mem_data dram_req_data;
wire [AVS_REQ_TAGW:0] dram_req_tag;
wire dram_req_ready;
wire dram_rsp_valid;
t_local_mem_data dram_rsp_data;
wire [AVS_REQ_TAGW:0] dram_rsp_tag;
wire dram_rsp_ready;
wire cci_dram_req_valid;
wire cci_dram_req_rw;
t_local_mem_byte_mask cci_dram_req_byteen;
t_local_mem_addr cci_dram_req_addr;
t_local_mem_data cci_dram_req_data;
wire [AVS_REQ_TAGW-1:0] cci_dram_req_tag;
wire cci_dram_req_ready;
wire cci_dram_rsp_valid;
t_local_mem_data cci_dram_rsp_data;
wire [AVS_REQ_TAGW-1:0] cci_dram_rsp_tag;
wire cci_dram_rsp_ready;
wire vx_dram_req_valid_qual;
t_local_mem_addr vx_dram_req_addr_qual;
t_local_mem_byte_mask vx_dram_req_byteen_qual;
t_local_mem_data vx_dram_req_data_qual;
wire [AVS_REQ_TAGW-1:0] vx_dram_req_tag_qual;
wire [(1 << VX_DRAM_LINE_IDX)-1:0][`VX_DRAM_LINE_WIDTH-1:0] vx_dram_rsp_data_unqual;
wire [AVS_REQ_TAGW-1:0] vx_dram_rsp_tag_unqual;
wire cci_dram_rd_req_valid, cci_dram_wr_req_valid;
wire [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr, cci_dram_wr_req_addr;
wire cci_mem_rd_req_valid;
wire cci_mem_wr_req_valid;
wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_dout;
//--
wire cci_mem_req_valid;
wire cci_mem_req_rw;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_addr;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_tag;
wire cci_mem_req_ready;
assign cci_dram_req_valid = (CMD_MEM_WRITE == state) ? cci_dram_wr_req_valid : cci_dram_rd_req_valid;
assign cci_dram_req_addr = (CMD_MEM_WRITE == state) ? cci_dram_wr_req_addr : cci_dram_rd_req_addr;
assign cci_dram_req_rw = (CMD_MEM_WRITE == state);
assign cci_dram_req_byteen = {64{1'b1}};
assign cci_dram_req_data = cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW];
assign cci_dram_req_tag = AVS_REQ_TAGW'(0);
`UNUSED_VAR (cci_dram_rsp_tag)
wire cci_mem_rsp_valid;
wire [CCI_LINE_WIDTH-1:0] cci_mem_rsp_data;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_rsp_tag;
wire cci_mem_rsp_ready;
//--
assign vx_dram_req_valid_qual = vx_dram_req_valid && vx_dram_en;
wire cci_mem_req_arb_valid;
wire cci_mem_req_arb_rw;
t_local_mem_byte_mask cci_mem_req_arb_byteen;
t_local_mem_addr cci_mem_req_arb_addr;
t_local_mem_data cci_mem_req_arb_data;
wire [AVS_REQ_TAGW-1:0] cci_mem_req_arb_tag;
wire cci_mem_req_arb_ready;
assign vx_dram_req_addr_qual = vx_dram_req_addr[`VX_DRAM_ADDR_WIDTH-1:`VX_DRAM_ADDR_WIDTH-DRAM_ADDR_WIDTH];
wire cci_mem_rsp_arb_valid;
t_local_mem_data cci_mem_rsp_arb_data;
wire [AVS_REQ_TAGW-1:0] cci_mem_rsp_arb_tag;
wire cci_mem_rsp_arb_ready;
if (`VX_DRAM_LINE_WIDTH != DRAM_LINE_WIDTH) begin
wire [VX_DRAM_LINE_IDX-1:0] vx_dram_req_idx = vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0];
wire [VX_DRAM_LINE_IDX-1:0] vx_dram_rsp_idx = vx_dram_rsp_tag_unqual[VX_DRAM_LINE_IDX-1:0];
assign vx_dram_req_byteen_qual = 64'(vx_dram_req_byteen) << (6'(vx_dram_req_addr[VX_DRAM_LINE_IDX-1:0]) << (VX_DRAM_LINE_LW-3));
assign vx_dram_req_data_qual = DRAM_LINE_WIDTH'(vx_dram_req_data) << ((DRAM_LINE_LW'(vx_dram_req_idx)) << VX_DRAM_LINE_LW);
assign vx_dram_req_tag_qual = {vx_dram_req_tag, vx_dram_req_idx};
assign vx_dram_rsp_data = vx_dram_rsp_data_unqual[vx_dram_rsp_idx];
end else begin
assign vx_dram_req_byteen_qual = vx_dram_req_byteen;
assign vx_dram_req_tag_qual = vx_dram_req_tag;
assign vx_dram_req_data_qual = vx_dram_req_data;
assign vx_dram_rsp_data = vx_dram_rsp_data_unqual;
end
VX_to_mem #(
.SRC_DATA_WIDTH (CCI_LINE_WIDTH),
.DST_DATA_WIDTH (LMEM_LINE_WIDTH),
.SRC_ADDR_WIDTH (CCI_ADDR_WIDTH),
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.SRC_TAG_WIDTH (CCI_ADDR_WIDTH),
.DST_TAG_WIDTH (AVS_REQ_TAGW)
) cci_to_mem (
.clk (clk),
.reset (reset),
assign vx_dram_rsp_tag = vx_dram_rsp_tag_unqual[`VX_DRAM_TAG_WIDTH+VX_DRAM_LINE_IDX-1:VX_DRAM_LINE_IDX];
.mem_req_valid_in (cci_mem_req_valid),
.mem_req_addr_in (cci_mem_req_addr),
.mem_req_rw_in (cci_mem_req_rw),
.mem_req_byteen_in ({CCI_LINE_SIZE{1'b1}}),
.mem_req_data_in (cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW]),
.mem_req_tag_in (cci_mem_req_tag),
.mem_req_ready_in (cci_mem_req_ready),
.mem_req_valid_out (cci_mem_req_arb_valid),
.mem_req_addr_out (cci_mem_req_arb_addr),
.mem_req_rw_out (cci_mem_req_arb_rw),
.mem_req_byteen_out (cci_mem_req_arb_byteen),
.mem_req_data_out (cci_mem_req_arb_data),
.mem_req_tag_out (cci_mem_req_arb_tag),
.mem_req_ready_out (cci_mem_req_arb_ready),
.mem_rsp_valid_in (cci_mem_rsp_arb_valid),
.mem_rsp_data_in (cci_mem_rsp_arb_data),
.mem_rsp_tag_in (cci_mem_rsp_arb_tag),
.mem_rsp_ready_in (cci_mem_rsp_arb_ready),
.mem_rsp_valid_out (cci_mem_rsp_valid),
.mem_rsp_data_out (cci_mem_rsp_data),
.mem_rsp_tag_out (cci_mem_rsp_tag),
.mem_rsp_ready_out (cci_mem_rsp_ready)
);
//--
wire vx_mem_req_arb_valid;
wire vx_mem_req_arb_rw;
t_local_mem_byte_mask vx_mem_req_arb_byteen;
t_local_mem_addr vx_mem_req_arb_addr;
t_local_mem_data vx_mem_req_arb_data;
wire [AVS_REQ_TAGW-1:0] vx_mem_req_arb_tag;
wire vx_mem_req_arb_ready;
wire vx_mem_rsp_arb_valid;
t_local_mem_data vx_mem_rsp_arb_data;
wire [AVS_REQ_TAGW-1:0] vx_mem_rsp_arb_tag;
wire vx_mem_rsp_arb_ready;
VX_to_mem #(
.SRC_DATA_WIDTH (`VX_MEM_LINE_WIDTH),
.DST_DATA_WIDTH (LMEM_LINE_WIDTH),
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.DST_TAG_WIDTH (AVS_REQ_TAGW)
) vx_to_mem (
.clk (clk),
.reset (reset),
.mem_req_valid_in (vx_mem_req_valid && vx_mem_en),
.mem_req_addr_in (vx_mem_req_addr),
.mem_req_rw_in (vx_mem_req_rw),
.mem_req_byteen_in (vx_mem_req_byteen),
.mem_req_data_in (vx_mem_req_data),
.mem_req_tag_in (vx_mem_req_tag),
.mem_req_ready_in (vx_mem_req_ready),
.mem_req_valid_out (vx_mem_req_arb_valid),
.mem_req_addr_out (vx_mem_req_arb_addr),
.mem_req_rw_out (vx_mem_req_arb_rw),
.mem_req_byteen_out (vx_mem_req_arb_byteen),
.mem_req_data_out (vx_mem_req_arb_data),
.mem_req_tag_out (vx_mem_req_arb_tag),
.mem_req_ready_out (vx_mem_req_arb_ready),
.mem_rsp_valid_in (vx_mem_rsp_arb_valid),
.mem_rsp_data_in (vx_mem_rsp_arb_data),
.mem_rsp_tag_in (vx_mem_rsp_arb_tag),
.mem_rsp_ready_in (vx_mem_rsp_arb_ready),
.mem_rsp_valid_out (vx_mem_rsp_valid),
.mem_rsp_data_out (vx_mem_rsp_data),
.mem_rsp_tag_out (vx_mem_rsp_tag),
.mem_rsp_ready_out (vx_mem_rsp_ready)
);
//--
wire mem_req_valid;
wire mem_req_rw;
t_local_mem_byte_mask mem_req_byteen;
t_local_mem_addr mem_req_addr;
t_local_mem_data mem_req_data;
wire [AVS_REQ_TAGW:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
t_local_mem_data mem_rsp_data;
wire [AVS_REQ_TAGW:0] mem_rsp_tag;
wire mem_rsp_ready;
VX_mem_arb #(
.NUM_REQS (2),
.DATA_WIDTH ($bits(t_local_mem_data)),
.ADDR_WIDTH ($bits(t_local_mem_addr)),
.DATA_WIDTH (LMEM_LINE_WIDTH),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.TAG_IN_WIDTH (AVS_REQ_TAGW),
.TAG_OUT_WIDTH (AVS_REQ_TAGW+1)
) dram_arb (
) mem_arb (
.clk (clk),
.reset (reset),
// Source request
.req_valid_in ({cci_dram_req_valid, vx_dram_req_valid_qual}),
.req_rw_in ({cci_dram_req_rw, vx_dram_req_rw}),
.req_byteen_in ({cci_dram_req_byteen, vx_dram_req_byteen_qual}),
.req_addr_in ({cci_dram_req_addr, vx_dram_req_addr_qual}),
.req_data_in ({cci_dram_req_data, vx_dram_req_data_qual}),
.req_tag_in ({cci_dram_req_tag, vx_dram_req_tag_qual}),
.req_ready_in ({cci_dram_req_ready, vx_dram_req_ready}),
.req_valid_in ({cci_mem_req_arb_valid, vx_mem_req_arb_valid}),
.req_rw_in ({cci_mem_req_arb_rw, vx_mem_req_arb_rw}),
.req_byteen_in ({cci_mem_req_arb_byteen, vx_mem_req_arb_byteen}),
.req_addr_in ({cci_mem_req_arb_addr, vx_mem_req_arb_addr}),
.req_data_in ({cci_mem_req_arb_data, vx_mem_req_arb_data}),
.req_tag_in ({cci_mem_req_arb_tag, vx_mem_req_arb_tag}),
.req_ready_in ({cci_mem_req_arb_ready, vx_mem_req_arb_ready}),
// DRAM request
.req_valid_out (dram_req_valid),
.req_rw_out (dram_req_rw),
.req_byteen_out (dram_req_byteen),
.req_addr_out (dram_req_addr),
.req_data_out (dram_req_data),
.req_tag_out (dram_req_tag),
.req_ready_out (dram_req_ready),
// Memory request
.req_valid_out (mem_req_valid),
.req_rw_out (mem_req_rw),
.req_byteen_out (mem_req_byteen),
.req_addr_out (mem_req_addr),
.req_data_out (mem_req_data),
.req_tag_out (mem_req_tag),
.req_ready_out (mem_req_ready),
// Source response
.rsp_valid_out ({cci_dram_rsp_valid, vx_dram_rsp_valid}),
.rsp_data_out ({cci_dram_rsp_data, vx_dram_rsp_data_unqual}),
.rsp_tag_out ({cci_dram_rsp_tag, vx_dram_rsp_tag_unqual}),
.rsp_ready_out ({cci_dram_rsp_ready, vx_dram_rsp_ready}),
.rsp_valid_out ({cci_mem_rsp_arb_valid, vx_mem_rsp_arb_valid}),
.rsp_data_out ({cci_mem_rsp_arb_data, vx_mem_rsp_arb_data}),
.rsp_tag_out ({cci_mem_rsp_arb_tag, vx_mem_rsp_arb_tag}),
.rsp_ready_out ({cci_mem_rsp_arb_ready, vx_mem_rsp_arb_ready}),
// DRAM response
.rsp_valid_in (dram_rsp_valid),
.rsp_tag_in (dram_rsp_tag),
.rsp_data_in (dram_rsp_data),
.rsp_ready_in (dram_rsp_ready)
// Memory response
.rsp_valid_in (mem_rsp_valid),
.rsp_tag_in (mem_rsp_tag),
.rsp_data_in (mem_rsp_data),
.rsp_ready_in (mem_rsp_ready)
);
//--
VX_avs_wrapper #(
.AVS_DATAW ($bits(t_local_mem_data)),
.AVS_ADDRW ($bits(t_local_mem_addr)),
.AVS_BURSTW ($bits(t_local_mem_burst_cnt)),
.AVS_BANKS (NUM_LOCAL_MEM_BANKS),
.REQ_TAGW (AVS_REQ_TAGW+1),
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE)
.NUM_BANKS (NUM_LOCAL_MEM_BANKS),
.AVS_DATA_WIDTH (LMEM_LINE_WIDTH),
.AVS_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.AVS_BURST_WIDTH (LMEM_BURST_CTRW),
.AVS_BANKS (NUM_LOCAL_MEM_BANKS),
.REQ_TAG_WIDTH (AVS_REQ_TAGW + 1),
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE)
) avs_wrapper (
.clk (clk),
.reset (reset),
.clk (clk),
.reset (reset),
// Memory request
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
// Memory response
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
// AVS bus
.avs_writedata (avs_writedata),
.avs_readdata (avs_readdata),
.avs_address (avs_address),
.avs_waitrequest (avs_waitrequest),
.avs_write (avs_write),
.avs_read (avs_read),
.avs_byteenable (avs_byteenable),
.avs_burstcount (avs_burstcount),
.avs_readdatavalid (avs_readdatavalid),
.avs_bankselect (mem_bank_select),
// DRAM request
.dram_req_valid (dram_req_valid),
.dram_req_rw (dram_req_rw),
.dram_req_byteen (dram_req_byteen),
.dram_req_addr (dram_req_addr),
.dram_req_data (dram_req_data),
.dram_req_tag (dram_req_tag),
.dram_req_ready (dram_req_ready),
// DRAM response
.dram_rsp_valid (dram_rsp_valid),
.dram_rsp_data (dram_rsp_data),
.dram_rsp_tag (dram_rsp_tag),
.dram_rsp_ready (dram_rsp_ready)
.avs_writedata (avs_writedata),
.avs_readdata (avs_readdata),
.avs_address (avs_address),
.avs_waitrequest (avs_waitrequest),
.avs_write (avs_write),
.avs_read (avs_read),
.avs_byteenable (avs_byteenable),
.avs_burstcount (avs_burstcount),
.avs_readdatavalid(avs_readdatavalid)
);
// CCI-P Read Request ///////////////////////////////////////////////////////////
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_wr_req_ctr;
reg [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr;
wire [DRAM_ADDR_WIDTH-1:0] cci_rd_req_ctr_next;
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_wr_req_addr_unqual;
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag, cci_rd_rsp_tag;
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr;
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr_unqual;
reg [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr;
wire [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr_next;
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag;
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_tag;
reg [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_ctr;
t_ccip_clAddr cci_rd_req_addr;
reg cci_rd_req_enable, cci_rd_req_wait;
wire cci_rd_req_fire;
t_ccip_clAddr cci_rd_req_addr;
reg cci_rd_req_valid, cci_rd_req_wait;
wire cci_rdq_push, cci_rdq_pop;
wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_din;
wire cci_rdq_empty;
always @(*) begin
af2cp_sTxPort.c0.valid = cci_rd_req_fire;
af2cp_sTxPort.c0.hdr = t_ccip_c0_ReqMemHdr'(0);
af2cp_sTxPort.c0.hdr.address = cci_rd_req_addr;
af2cp_sTxPort.c0.hdr.mdata = t_ccip_mdata'(cci_rd_req_tag);
end
wire cci_dram_wr_req_fire = cci_dram_wr_req_valid && cci_dram_req_ready;
wire cci_rd_req_fire = af2cp_sTxPort.c0.valid;
wire cci_mem_wr_req_fire = cci_mem_wr_req_valid && cci_mem_req_ready;
wire cci_rd_rsp_fire = (STATE_WRITE == state)
&& cp2af_sRxPort.c0.rspValid
@ -631,10 +702,8 @@ wire cci_rd_rsp_fire = (STATE_WRITE == state)
assign cci_rd_req_tag = CCI_RD_RQ_TAGW'(cci_rd_req_ctr);
assign cci_rd_rsp_tag = CCI_RD_RQ_TAGW'(cp2af_sRxPort.c0.hdr.mdata);
assign cci_rd_req_ctr_next = cci_rd_req_ctr + DRAM_ADDR_WIDTH'(cci_rd_req_fire ? 1 : 0);
assign cci_rdq_pop = cci_dram_wr_req_fire;
assign cci_rdq_push = cci_rd_rsp_fire;
assign cci_rdq_pop = cci_mem_wr_req_fire;
assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_rd_rsp_tag};
wire [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads;
@ -646,79 +715,80 @@ VX_pending_size #(
.reset (reset),
.push (cci_rd_req_fire),
.pop (cci_rdq_pop),
`UNUSED_PIN (empty),
.full (cci_pending_reads_full),
.size (cci_pending_reads)
.size (cci_pending_reads),
`UNUSED_PIN (empty)
);
`UNUSED_VAR (cci_pending_reads)
assign cci_dram_wr_req_valid = !cci_rdq_empty;
assign cci_rd_req_ctr_next = cci_rd_req_ctr + CCI_ADDR_WIDTH'(cci_rd_req_fire ? 1 : 0);
assign cci_dram_wr_req_addr = cci_dram_wr_req_addr_unqual + (DRAM_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout)));
assign cci_rd_req_fire = cci_rd_req_valid && !(cci_rd_req_wait || cci_pending_reads_full);
assign cci_mem_wr_req_valid = !cci_rdq_empty;
assign cci_mem_wr_req_addr = cci_mem_wr_req_addr_unqual + (CCI_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout)));
assign af2cp_sTxPort.c0.valid = cci_rd_req_enable && !cci_rd_req_wait;
assign cmd_write_done = (cci_dram_wr_req_ctr == cmd_data_size);
assign cmd_write_done = (cci_mem_wr_req_ctr == cmd_data_size);
// Send read requests to CCI
always @(posedge clk) begin
if (reset) begin
cci_rd_req_addr <= 0;
cci_rd_req_ctr <= 0;
cci_rd_rsp_ctr <= 0;
cci_rd_req_enable <= 0;
cci_rd_req_wait <= 0;
cci_dram_wr_req_ctr <= 0;
cci_dram_wr_req_addr_unqual <= 0;
end
else begin
cci_rd_req_valid <= 0;
cci_rd_req_wait <= 0;
end else begin
if ((STATE_IDLE == state)
&& (CMD_MEM_WRITE == cmd_type)) begin
cci_rd_req_addr <= cmd_io_addr;
cci_rd_req_ctr <= 0;
cci_rd_rsp_ctr <= 0;
cci_rd_req_enable <= (cmd_data_size != 0);
cci_rd_req_wait <= 0;
cci_dram_wr_req_ctr <= 0;
cci_dram_wr_req_addr_unqual <= cmd_mem_addr;
cci_rd_req_valid <= (cmd_data_size != 0);
cci_rd_req_wait <= 0;
end
cci_rd_req_enable <= (STATE_WRITE == state)
&& (cci_rd_req_ctr_next != cmd_data_size)
&& !cci_pending_reads_full
&& !cp2af_sRxPort.c0TxAlmFull;
cci_rd_req_valid <= (STATE_WRITE == state)
&& (cci_rd_req_ctr_next != cmd_data_size)
&& !cp2af_sRxPort.c0TxAlmFull;
if (cci_rd_req_fire) begin
cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr_next;
if (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
cci_rd_req_wait <= 1; // end current request batch
end
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr_next), cci_pending_reads);
`endif
if (cci_rd_req_fire && (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
cci_rd_req_wait <= 1; // end current request batch
end
if (cci_rd_rsp_fire) begin
cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1);
if (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
cci_rd_req_wait <= 0; // restart new request batch
end
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
`endif
end
if (cci_rd_rsp_fire && (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
cci_rd_req_wait <= 0; // begin new request batch
end
end
/*if (cci_rdq_pop) begin
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads);
`endif
end*/
if ((STATE_IDLE == state)
&& (CMD_MEM_WRITE == cmd_type)) begin
cci_rd_req_addr <= cmd_io_addr;
cci_rd_req_ctr <= 0;
cci_rd_rsp_ctr <= 0;
cci_mem_wr_req_ctr <= 0;
cci_mem_wr_req_addr_unqual <= cmd_mem_addr;
end
if (cci_dram_wr_req_fire) begin
cci_dram_wr_req_addr_unqual <= cci_dram_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_dram_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? DRAM_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : DRAM_ADDR_WIDTH'(0));
cci_dram_wr_req_ctr <= cci_dram_wr_req_ctr + DRAM_ADDR_WIDTH'(1);
end
if (cci_rd_req_fire) begin
cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + 1;
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads);
`endif
end
if (cci_rd_rsp_fire) begin
cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1);
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
`endif
end
if (cci_rdq_pop) begin
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Queue Pop: pending=%0d", $time, cci_pending_reads);
`endif
end
if (cci_mem_wr_req_fire) begin
cci_mem_wr_req_addr_unqual <= cci_mem_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_mem_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : CCI_ADDR_WIDTH'(0));
cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1);
end
end
@ -761,22 +831,24 @@ VX_fifo_queue #(
// CCI-P Write Request //////////////////////////////////////////////////////////
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_ctr;
reg [DRAM_ADDR_WIDTH-1:0] cci_wr_req_ctr;
reg [DRAM_ADDR_WIDTH-1:0] cci_dram_rd_req_addr_r;
reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_ctr;
reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_addr;
reg [CCI_ADDR_WIDTH-1:0] cci_wr_req_ctr;
reg cci_wr_req_fire;
t_ccip_clAddr cci_wr_req_addr;
t_ccip_clData cci_wr_req_data;
always @(*) begin
af2cp_sTxPort.c1.valid = cci_wr_req_fire;
af2cp_sTxPort.c1.hdr = t_ccip_c1_ReqMemHdr'(0);
af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr;
af2cp_sTxPort.c1.hdr.sop = 1; // single line write mode
af2cp_sTxPort.c1.data = t_ccip_clData'(cci_dram_rsp_data);
af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr;
af2cp_sTxPort.c1.data = cci_wr_req_data;
end
wire cci_dram_rd_req_fire = cci_dram_rd_req_valid && cci_dram_req_ready;
wire cci_dram_rd_rsp_fire = cci_dram_rsp_valid && cci_dram_rsp_ready;
wire cci_wr_req_fire = cci_dram_rd_rsp_fire;
wire cci_mem_rd_req_fire = cci_mem_rd_req_valid && cci_mem_req_ready;
wire cci_mem_rd_rsp_fire = cci_mem_rsp_valid && cci_mem_rsp_ready;
wire cci_wr_rsp_fire = (STATE_READ == state)
&& cp2af_sRxPort.c1.rspValid
@ -785,12 +857,13 @@ wire cci_wr_rsp_fire = (STATE_READ == state)
wire [$clog2(CCI_RW_PENDING_SIZE+1)-1:0] cci_pending_writes;
wire cci_pending_writes_empty;
wire cci_pending_writes_full;
VX_pending_size #(
.SIZE (CCI_RW_PENDING_SIZE)
) cci_wr_pending_size (
.clk (clk),
.reset (reset),
.push (cci_wr_req_fire),
.push (cci_mem_rd_rsp_fire),
.pop (cci_wr_rsp_fire),
.empty (cci_pending_writes_empty),
.full (cci_pending_writes_full),
@ -798,54 +871,61 @@ VX_pending_size #(
);
`UNUSED_VAR (cci_pending_writes)
assign cci_dram_rd_req_valid = (cci_dram_rd_req_ctr != 0);
assign cci_dram_rd_req_addr = cci_dram_rd_req_addr_r;
assign cci_mem_rd_req_valid = (STATE_READ == state)
&& (cci_mem_rd_req_ctr != cmd_data_size);
assign af2cp_sTxPort.c1.valid = cci_dram_rd_rsp_fire;
assign cci_dram_rsp_ready = !cp2af_sRxPort.c1TxAlmFull && !cci_pending_writes_full;
assign cci_mem_rsp_ready = !cp2af_sRxPort.c1TxAlmFull
&& !cci_pending_writes_full;
assign cmd_read_done = (0 == cci_wr_req_ctr) && cci_pending_writes_empty;
assign cmd_read_done = (0 == cci_wr_req_ctr)
&& cci_pending_writes_empty;
// Send write requests to CCI
always @(posedge clk)
begin
if (reset) begin
cci_wr_req_addr <= 0;
cci_wr_req_ctr <= 0;
cci_dram_rd_req_ctr <= 0;
cci_dram_rd_req_addr_r <= 0;
cci_wr_req_fire <= 0;
end else begin
cci_wr_req_fire <= cci_mem_rd_rsp_fire;
end
else begin
if ((STATE_IDLE == state)
&& (CMD_MEM_READ == cmd_type)) begin
cci_wr_req_addr <= cmd_io_addr;
cci_wr_req_ctr <= cmd_data_size;
cci_dram_rd_req_ctr <= cmd_data_size;
cci_dram_rd_req_addr_r <= cmd_mem_addr;
end
if ((STATE_IDLE == state)
&& (CMD_MEM_READ == cmd_type)) begin
cci_mem_rd_req_ctr <= 0;
cci_mem_rd_req_addr <= cmd_mem_addr;
cci_wr_req_ctr <= cmd_data_size;
end
if (cci_wr_req_fire) begin
assert(cci_wr_req_ctr != 0);
cci_wr_req_addr <= cci_wr_req_addr + t_ccip_clAddr'(1);
cci_wr_req_ctr <= cci_wr_req_ctr - DRAM_ADDR_WIDTH'(1);
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
`endif
end
if (cci_mem_rd_req_fire) begin
cci_mem_rd_req_addr <= cci_mem_rd_req_addr + CCI_ADDR_WIDTH'(1);
cci_mem_rd_req_ctr <= cci_mem_rd_req_ctr + CCI_ADDR_WIDTH'(1);
end
/*`ifdef DBG_PRINT_OPAE
if (cci_wr_rsp_fire) begin
$display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes);
end
`endif*/
cci_wr_req_addr <= cmd_io_addr + t_ccip_clAddr'(cci_mem_rsp_tag);
cci_wr_req_data <= t_ccip_clData'(cci_mem_rsp_data);
if (cci_dram_rd_req_fire) begin
cci_dram_rd_req_addr_r <= cci_dram_rd_req_addr_r + DRAM_ADDR_WIDTH'(1);
cci_dram_rd_req_ctr <= cci_dram_rd_req_ctr - DRAM_ADDR_WIDTH'(1);
end
if (cci_wr_req_fire) begin
assert(cci_wr_req_ctr != 0);
cci_wr_req_ctr <= cci_wr_req_ctr - CCI_ADDR_WIDTH'(1);
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
`endif
end
if (cci_wr_rsp_fire) begin
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Wr Rsp: pending=%0d", $time, cci_pending_writes);
`endif
end
end
//--
assign cci_mem_req_rw = (CMD_MEM_WRITE == state);
assign cci_mem_req_valid = cci_mem_req_rw ? cci_mem_wr_req_valid : cci_mem_rd_req_valid;
assign cci_mem_req_addr = cci_mem_req_rw ? cci_mem_wr_req_addr : cci_mem_rd_req_addr;
assign cci_mem_req_tag = cci_mem_req_rw ? cci_mem_wr_req_ctr : cci_mem_rd_req_ctr;
// CSRs ///////////////////////////////////////////////////////////////////////
reg csr_io_req_sent;
@ -890,20 +970,20 @@ Vortex #() vortex (
.clk (clk),
.reset (reset | vx_reset),
// DRAM request
.dram_req_valid (vx_dram_req_valid),
.dram_req_rw (vx_dram_req_rw),
.dram_req_byteen(vx_dram_req_byteen),
.dram_req_addr (vx_dram_req_addr),
.dram_req_data (vx_dram_req_data),
.dram_req_tag (vx_dram_req_tag),
.dram_req_ready (vx_dram_req_ready),
// Memory request
.mem_req_valid (vx_mem_req_valid),
.mem_req_rw (vx_mem_req_rw),
.mem_req_byteen (vx_mem_req_byteen),
.mem_req_addr (vx_mem_req_addr),
.mem_req_data (vx_mem_req_data),
.mem_req_tag (vx_mem_req_tag),
.mem_req_ready (vx_mem_req_ready),
// DRAM response
.dram_rsp_valid (vx_dram_rsp_valid),
.dram_rsp_data (vx_dram_rsp_data),
.dram_rsp_tag (vx_dram_rsp_tag),
.dram_rsp_ready (vx_dram_rsp_ready),
// Memory response
.mem_rsp_valid (vx_mem_rsp_valid),
.mem_rsp_data (vx_mem_rsp_data),
.mem_rsp_tag (vx_mem_rsp_tag),
.mem_rsp_ready (vx_mem_rsp_ready),
// CSR Request
.csr_req_valid (vx_csr_io_req_valid),
@ -944,16 +1024,15 @@ Vortex #() vortex (
`SCOPE_ASSIGN (cci_sTxPort_c2_mmioRdValid, af2cp_sTxPort.c2.mmioRdValid);
`SCOPE_ASSIGN (cci_sRxPort_c0TxAlmFull, cp2af_sRxPort.c0TxAlmFull);
`SCOPE_ASSIGN (cci_sRxPort_c1TxAlmFull, cp2af_sRxPort.c1TxAlmFull);
`SCOPE_ASSIGN (avs_address, avs_address);
`SCOPE_ASSIGN (avs_waitrequest, avs_waitrequest);
`SCOPE_ASSIGN (avs_write_fire, avs_write && !avs_waitrequest);
`SCOPE_ASSIGN (avs_read_fire, avs_read && !avs_waitrequest);
`SCOPE_ASSIGN (avs_byteenable, avs_byteenable);
`SCOPE_ASSIGN (avs_burstcount, avs_burstcount);
`SCOPE_ASSIGN (avs_readdatavalid, avs_readdatavalid);
`SCOPE_ASSIGN (mem_bank_select, mem_bank_select);
`SCOPE_ASSIGN (cci_dram_rd_req_ctr, cci_dram_rd_req_ctr);
`SCOPE_ASSIGN (cci_dram_wr_req_ctr, cci_dram_wr_req_ctr);
`SCOPE_ASSIGN (avs_address, avs_address[0]);
`SCOPE_ASSIGN (avs_waitrequest, avs_waitrequest[0]);
`SCOPE_ASSIGN (avs_write_fire, avs_write[0] && !avs_waitrequest[0]);
`SCOPE_ASSIGN (avs_read_fire, avs_read[0] && !avs_waitrequest[0]);
`SCOPE_ASSIGN (avs_byteenable, avs_byteenable[0]);
`SCOPE_ASSIGN (avs_burstcount, avs_burstcount[0]);
`SCOPE_ASSIGN (avs_readdatavalid, avs_readdatavalid[0]);
`SCOPE_ASSIGN (cci_mem_rd_req_ctr, cci_mem_rd_req_ctr);
`SCOPE_ASSIGN (cci_mem_wr_req_ctr, cci_mem_wr_req_ctr);
`SCOPE_ASSIGN (cci_rd_req_ctr, cci_rd_req_ctr);
`SCOPE_ASSIGN (cci_rd_rsp_ctr, cci_rd_rsp_ctr);
`SCOPE_ASSIGN (cci_wr_req_ctr, cci_wr_req_ctr);
@ -964,11 +1043,11 @@ Vortex #() vortex (
`SCOPE_ASSIGN (cci_pending_reads_full, cci_pending_reads_full);
`SCOPE_ASSIGN (cci_pending_writes_empty, cci_pending_writes_empty);
`SCOPE_ASSIGN (cci_pending_writes_full, cci_pending_writes_full);
`SCOPE_ASSIGN (afu_dram_req_fire, (dram_req_valid && dram_req_ready));
`SCOPE_ASSIGN (afu_dram_req_addr, dram_req_addr);
`SCOPE_ASSIGN (afu_dram_req_tag, dram_req_tag);
`SCOPE_ASSIGN (afu_dram_rsp_fire, (dram_rsp_valid && dram_rsp_ready));
`SCOPE_ASSIGN (afu_dram_rsp_tag, dram_rsp_tag);
`SCOPE_ASSIGN (afu_mem_req_fire, (mem_req_valid && mem_req_ready));
`SCOPE_ASSIGN (afu_mem_req_addr, mem_req_addr);
`SCOPE_ASSIGN (afu_mem_req_tag, mem_req_tag);
`SCOPE_ASSIGN (afu_mem_rsp_fire, (mem_rsp_valid && mem_rsp_ready));
`SCOPE_ASSIGN (afu_mem_rsp_tag, mem_rsp_tag);
wire scope_changed = `SCOPE_TRIGGER;

View file

@ -1,18 +1,27 @@
`ifndef __VORTEX_AFU__
`define __VORTEX_AFU__
`IGNORE_WARNINGS_BEGIN
`include "ccip_if_pkg.sv"
`IGNORE_WARNINGS_END
`define PLATFORM_PROVIDES_LOCAL_MEMORY
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
`IGNORE_WARNINGS_BEGIN
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
`define PLATFORM_PARAM_LOCAL_MEMORY_BANKS 2
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
`endif
`include "local_mem_cfg_pkg.sv"
`IGNORE_WARNINGS_END
`define AFU_ACCEL_NAME "vortex_afu"
`define AFU_ACCEL_UUID 128'h35f9452b_25c2_434c_93d5_6f8c60db361c

168
hw/rtl/cache/VX_bank.v vendored
View file

@ -22,8 +22,8 @@ module VX_bank #(
parameter CREQ_SIZE = 1,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 1,
// DRAM Request Queue Size
parameter DREQ_SIZE = 1,
// Memory Request Queue Size
parameter MREQ_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
@ -35,10 +35,7 @@ module VX_bank #(
parameter CORE_TAG_ID_BITS = 0,
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0,
// in-order DRAN
parameter IN_ORDER_DRAM = 0
parameter BANK_ADDR_OFFSET = 0
) (
`SCOPE_IO_VX_bank
@ -71,19 +68,19 @@ module VX_bank #(
output wire [CORE_TAG_WIDTH-1:0] core_rsp_tag,
input wire core_rsp_ready,
// DRAM request
output wire dram_req_valid,
output wire dram_req_rw,
output wire [CACHE_LINE_SIZE-1:0] dram_req_byteen,
output wire [`LINE_ADDR_WIDTH-1:0] dram_req_addr,
output wire [`CACHE_LINE_WIDTH-1:0] dram_req_data,
input wire dram_req_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen,
output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data,
input wire mem_req_ready,
// DRAM response
input wire dram_rsp_valid,
input wire [`LINE_ADDR_WIDTH-1:0] dram_rsp_addr,
input wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data,
output wire dram_rsp_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`LINE_ADDR_WIDTH-1:0] mem_rsp_addr,
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
output wire mem_rsp_ready,
// flush
input wire flush_enable,
@ -93,10 +90,10 @@ module VX_bank #(
`UNUSED_PARAM (CORE_TAG_ID_BITS)
`ifdef DBG_CACHE_REQ_INFO
/* verilator lint_off UNUSED */
`IGNORE_WARNINGS_BEGIN
wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1;
wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1;
/* verilator lint_on UNUSED */
`IGNORE_WARNINGS_END
`endif
wire creq_pop;
@ -167,8 +164,8 @@ module VX_bank #(
wire is_flush_st0;
wire crsq_in_valid, crsq_in_ready, crsq_in_stall;
wire dreq_alm_full;
wire drsq_pop;
wire mreq_alm_full;
wire mrsq_pop;
wire crsq_in_fire = crsq_in_valid && crsq_in_ready;
@ -186,24 +183,24 @@ module VX_bank #(
// determine which queue to pop next in priority order
wire mshr_pop_unqual = mshr_valid
&& !dreq_alm_full; // ensure DRAM request queue not full (deadlock prevention)
wire drsq_pop_unqual = !mshr_pop_unqual && dram_rsp_valid;
wire creq_pop_unqual = !mshr_pop_unqual && !drsq_pop_unqual && !creq_empty && !flush_enable;
&& !mreq_alm_full; // ensure memory request queue not full (deadlock prevention)
wire mrsq_pop_unqual = !mshr_pop_unqual && mem_rsp_valid;
wire creq_pop_unqual = !mshr_pop_unqual && !mrsq_pop_unqual && !creq_empty && !flush_enable;
wire is_miss_st1 = valid_st1 && (miss_st1 || force_miss_st1);
assign mshr_pop = mshr_pop_unqual
&& !(!IN_ORDER_DRAM && is_miss_st1 && is_mshr_st1) // do not schedule another mshr request if the previous one missed
&& !(is_miss_st1 && is_mshr_st1) // do not schedule another mshr request if the previous one missed
&& !crsq_in_stall; // ensure core response ready
assign drsq_pop = drsq_pop_unqual
assign mrsq_pop = mrsq_pop_unqual
&& !crsq_in_stall; // ensure core response ready
assign creq_pop = creq_pop_unqual
&& !dreq_alm_full // ensure dram request ready
&& !mreq_alm_full // ensure memory request ready
&& !mshr_alm_full // ensure mshr enqueue ready
&& !crsq_in_stall; // ensure core response ready
assign dram_rsp_ready = drsq_pop;
assign mem_rsp_ready = mrsq_pop;
// we have a miss in mshr or entering it for the current address
wire mshr_pending_sel = mshr_pending
@ -237,15 +234,7 @@ module VX_bank #(
end else begin
assign creq_line_data = creq_data;
end
wire [`LINE_ADDR_WIDTH-1:0] dram_rsp_addr_qual;
if (IN_ORDER_DRAM) begin
`UNUSED_VAR (dram_rsp_addr)
assign dram_rsp_addr_qual = mshr_addr;
end else begin
assign dram_rsp_addr_qual = dram_rsp_addr;
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH + 1 + 1),
.RESETW (1)
@ -254,13 +243,13 @@ module VX_bank #(
.reset (reset),
.enable (!crsq_in_stall),
.data_in ({
flush_enable || mshr_pop || drsq_pop || creq_pop,
flush_enable || mshr_pop || mrsq_pop || creq_pop,
flush_enable,
mshr_pop_unqual,
drsq_pop_unqual || flush_enable,
mrsq_pop_unqual || flush_enable,
mshr_pop_unqual ? 1'b0 : creq_rw,
mshr_pop_unqual ? mshr_addr : (dram_rsp_valid ? dram_rsp_addr_qual : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)),
dram_rsp_valid ? dram_rsp_data : creq_line_data,
mshr_pop_unqual ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : (flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : creq_addr)),
mem_rsp_valid ? mem_rsp_data : creq_line_data,
mshr_pop_unqual ? mshr_wsel : creq_wsel,
mshr_pop_unqual ? mshr_byteen : creq_byteen,
mshr_pop_unqual ? mshr_tid : creq_tid,
@ -307,7 +296,7 @@ module VX_bank #(
);
// redundant fills
wire is_redundant_fill_st0 = !IN_ORDER_DRAM && is_fill_st0 && tag_match_st0;
wire is_redundant_fill_st0 = is_fill_st0 && tag_match_st0;
// we had a miss with prior request for the current address
assign prev_miss_dep_st0 = is_miss_st1 && (addr_st0 == addr_st1);
@ -322,9 +311,9 @@ module VX_bank #(
assign writeen_unqual_st0 = (WRITE_ENABLE && !is_fill_st0 && tag_match_st0 && mem_rw_st0)
|| (is_fill_st0 && !is_redundant_fill_st0);
assign incoming_fill_st0 = dram_rsp_valid && (addr_st0 == dram_rsp_addr_qual);
assign incoming_fill_st0 = mem_rsp_valid && (addr_st0 == mem_rsp_addr);
assign fill_req_unqual_st0 = !mem_rw_st0 && (!force_miss_st0 || (!IN_ORDER_DRAM && is_mshr_st0 && !prev_miss_dep_st0));
assign fill_req_unqual_st0 = !mem_rw_st0 && (!force_miss_st0 || (is_mshr_st0 && !prev_miss_dep_st0));
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + (`UP(`WORD_SELECT_BITS) + WORD_SIZE + `REQS_BITS + 1) * NUM_PORTS + CORE_TAG_WIDTH),
@ -351,12 +340,12 @@ module VX_bank #(
wire mshr_push_st1 = !is_fill_st1 && !mem_rw_st1 && (miss_st1 || force_miss_st1);
wire incoming_fill_qual_st1 = (dram_rsp_valid && (addr_st1 == dram_rsp_addr_qual))
wire incoming_fill_qual_st1 = (mem_rsp_valid && (addr_st1 == mem_rsp_addr))
|| incoming_fill_st1;
wire do_writeback_st1 = !is_fill_st1 && mem_rw_st1;
wire dreq_push_st1 = (miss_st1 && fill_req_unqual_st1 && !incoming_fill_qual_st1)
wire mreq_push_st1 = (miss_st1 && fill_req_unqual_st1 && !incoming_fill_qual_st1)
|| do_writeback_st1;
wire [`WORDS_PER_LINE-1:0][WORD_SIZE-1:0] line_byteen_st1;
@ -408,15 +397,14 @@ module VX_bank #(
assign mshr_push = valid_st1 && mshr_push_st1;
wire mshr_dequeue = valid_st1 && is_mshr_st1 && !mshr_push_st1 && crsq_in_ready;
wire mshr_restore = !IN_ORDER_DRAM && is_mshr_st1;
`RUNTIME_ASSERT(!IN_ORDER_DRAM || !(mshr_push && mshr_restore), ("Oops!"))
wire mshr_restore = is_mshr_st1;
// push a missed request as 'ready' if it was a forced miss that actually had a hit
// or the fill request for this block is comming
wire mshr_init_ready_state = !miss_st1 || incoming_fill_qual_st1;
// use dram rsp or core req address to lookup the mshr
wire [`LINE_ADDR_WIDTH-1:0] lookup_addr = dram_rsp_valid ? dram_rsp_addr_qual : creq_addr;
// use memory rsp or core req address to lookup the mshr
wire [`LINE_ADDR_WIDTH-1:0] lookup_addr = mem_rsp_valid ? mem_rsp_addr : creq_addr;
VX_miss_resrv #(
.BANK_ID (BANK_ID),
@ -450,7 +438,7 @@ module VX_bank #(
`UNUSED_PIN (enqueue_full),
// lookup
.lookup_ready (drsq_pop),
.lookup_ready (mrsq_pop),
.lookup_addr (lookup_addr),
.lookup_match (mshr_pending),
@ -500,41 +488,41 @@ module VX_bank #(
.ready_out (core_rsp_ready)
);
// Enqueue DRAM request
// Enqueue memory request
wire [CACHE_LINE_SIZE-1:0] dreq_byteen;
wire [`LINE_ADDR_WIDTH-1:0] dreq_addr;
wire [`CACHE_LINE_WIDTH-1:0] dreq_data;
wire dreq_push, dreq_pop, dreq_empty, dreq_rw;
wire [CACHE_LINE_SIZE-1:0] mreq_byteen;
wire [`LINE_ADDR_WIDTH-1:0] mreq_addr;
wire [`CACHE_LINE_WIDTH-1:0] mreq_data;
wire mreq_push, mreq_pop, mreq_empty, mreq_rw;
assign dreq_push = valid_st1 && dreq_push_st1;
assign mreq_push = valid_st1 && mreq_push_st1;
assign dreq_pop = dram_req_valid && dram_req_ready;
assign mreq_pop = mem_req_valid && mem_req_ready;
assign dreq_rw = WRITE_ENABLE && do_writeback_st1;
assign dreq_byteen = dreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}};
assign dreq_addr = addr_st1;
assign dreq_data = wdata_st1;
assign mreq_rw = WRITE_ENABLE && do_writeback_st1;
assign mreq_byteen = mreq_rw ? line_byteen_st1 : {CACHE_LINE_SIZE{1'b1}};
assign mreq_addr = addr_st1;
assign mreq_data = wdata_st1;
VX_fifo_queue #(
.DATAW (1 + CACHE_LINE_SIZE + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH),
.SIZE (DREQ_SIZE),
.ALM_FULL (DREQ_SIZE-2)
) dram_req_queue (
.SIZE (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2)
) mem_req_queue (
.clk (clk),
.reset (reset),
.push (dreq_push),
.pop (dreq_pop),
.data_in ({dreq_rw, dreq_byteen, dreq_addr, dreq_data}),
.data_out ({dram_req_rw, dram_req_byteen, dram_req_addr, dram_req_data}),
.empty (dreq_empty),
.alm_full (dreq_alm_full),
.push (mreq_push),
.pop (mreq_pop),
.data_in ({mreq_rw, mreq_byteen, mreq_addr, mreq_data}),
.data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data}),
.empty (mreq_empty),
.alm_full (mreq_alm_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign dram_req_valid = !dreq_empty;
assign mem_req_valid = !mreq_empty;
`SCOPE_ASSIGN (valid_st0, valid_st0);
`SCOPE_ASSIGN (valid_st1, valid_st1);
@ -544,7 +532,7 @@ module VX_bank #(
`SCOPE_ASSIGN (force_miss_st0, force_miss_st0);
`SCOPE_ASSIGN (mshr_push, mshr_push);
`SCOPE_ASSIGN (crsq_in_stall, crsq_in_stall);
`SCOPE_ASSIGN (dreq_alm_full, dreq_alm_full);
`SCOPE_ASSIGN (mreq_alm_full, mreq_alm_full);
`SCOPE_ASSIGN (mshr_alm_full, mshr_alm_full);
`SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID));
`SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
@ -552,45 +540,45 @@ module VX_bank #(
`ifdef PERF_ENABLE
assign perf_read_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && !mem_rw_st1;
assign perf_write_misses = valid_st1 && !is_fill_st1 && !is_mshr_st1 && miss_st1 && mem_rw_st1;
assign perf_pipe_stalls = crsq_in_stall || dreq_alm_full || mshr_alm_full;
assign perf_pipe_stalls = crsq_in_stall || mreq_alm_full || mshr_alm_full;
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_PRINT_CACHE_BANK
always @(posedge clk) begin
/*if (valid_st1 && pmask_st1 == {NUM_PORTS{1'b1}}) begin
$display("%t: cache%0d:%0d full bank multi-porting - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
end*/
/*if (crsq_in_fire && (NUM_PORTS > 1) && $countones(crsq_pmask) > 1) begin
$display("%t: *** cache%0d:%0d multi-port-out: pmask=%b, addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, crsq_pmask, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag);
end */
if (valid_st1 && !is_fill_st1 && miss_st1 && incoming_fill_qual_st1) begin
$display("%t: cache%0d:%0d miss with incoming fill - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
$display("%t: *** cache%0d:%0d miss with incoming fill - addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
assert(!is_mshr_st1);
end
if (crsq_in_stall || dreq_alm_full || mshr_alm_full) begin
$display("%t: cache%0d:%0d pipeline-stall: cwbq=%b, dwbq=%b, mshr=%b", $time, CACHE_ID, BANK_ID, crsq_in_stall, dreq_alm_full, mshr_alm_full);
if (crsq_in_stall || mreq_alm_full || mshr_alm_full) begin
$display("%t: *** cache%0d:%0d pipeline-stall: cwbq=%b, dwbq=%b, mshr=%b", $time, CACHE_ID, BANK_ID, crsq_in_stall, mreq_alm_full, mshr_alm_full);
end
if (flush_enable) begin
$display("%t: cache%0d:%0d flush: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(flush_addr, BANK_ID));
end
if (drsq_pop) begin
$display("%t: cache%0d:%0d fill-rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_rsp_addr_qual, BANK_ID), dram_rsp_data);
if (mrsq_pop) begin
$display("%t: cache%0d:%0d fill-rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_data);
end
if (mshr_pop) begin
$display("%t: cache%0d:%0d mshr-rd-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, mshr_byteen, debug_wid_sel, debug_pc_sel);
$display("%t: cache%0d:%0d mshr-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, mshr_byteen, debug_wid_sel, debug_pc_sel);
end
if (creq_pop) begin
if (creq_rw)
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel);
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel);
else
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel);
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel);
end
if (crsq_in_fire) begin
$display("%t: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%0b, tid=%0d, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1);
$display("%t: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1);
end
if (dreq_push) begin
if (mreq_push) begin
if (do_writeback_st1)
$display("%t: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dreq_addr, BANK_ID), dreq_data, dreq_byteen, debug_wid_st1, debug_pc_st1);
$display("%t: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1);
else
$display("%t: cache%0d:%0d fill-req: addr=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dreq_addr, BANK_ID), debug_wid_st1, debug_pc_st1);
$display("%t: cache%0d:%0d fill-req: addr=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), debug_wid_st1, debug_pc_st1);
end
end
`endif

View file

@ -21,10 +21,10 @@ module VX_cache #(
parameter CREQ_SIZE = 4,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
// DRAM Response Queue Size
parameter DRSQ_SIZE = 4,
// DRAM Request Queue Size
parameter DREQ_SIZE = 4,
// Memory Response Queue Size
parameter MRSQ_SIZE = 4,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
@ -35,22 +35,17 @@ module VX_cache #(
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = CORE_TAG_WIDTH,
// dram request tag size
parameter DRAM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)),
// Memory request tag size
parameter MEM_TAG_WIDTH = (32 - $clog2(CACHE_LINE_SIZE)),
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0,
// in-order DRAN
parameter IN_ORDER_DRAM = 0
parameter BANK_ADDR_OFFSET = 0
) (
`SCOPE_IO_VX_cache
input wire clk,
input wire reset,
input wire flush,
// Core request
input wire [NUM_REQS-1:0] core_req_valid,
input wire [NUM_REQS-1:0] core_req_rw,
@ -66,29 +61,32 @@ module VX_cache #(
output wire [`CORE_REQ_TAG_COUNT-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [CACHE_LINE_SIZE-1:0] mem_req_byteen,
output wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CACHE_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// PERF
`ifdef PERF_ENABLE
VX_perf_cache_if perf_cache_if,
`endif
// DRAM request
output wire dram_req_valid,
output wire dram_req_rw,
output wire [CACHE_LINE_SIZE-1:0] dram_req_byteen,
output wire [`DRAM_ADDR_WIDTH-1:0] dram_req_addr,
output wire [`CACHE_LINE_WIDTH-1:0] dram_req_data,
output wire [DRAM_TAG_WIDTH-1:0] dram_req_tag,
input wire dram_req_ready,
// DRAM response
input wire dram_rsp_valid,
input wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data,
input wire [DRAM_TAG_WIDTH-1:0] dram_rsp_tag,
output wire dram_rsp_ready
// device flush
input wire flush
);
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid value"))
wire [NUM_BANKS-1:0][NUM_PORTS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][`UP(`WORD_SELECT_BITS)-1:0] per_bank_core_req_wsel;
wire [NUM_BANKS-1:0][NUM_PORTS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
@ -106,17 +104,17 @@ module VX_cache #(
wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_rsp_tag;
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_dram_req_valid;
wire [NUM_BANKS-1:0] per_bank_dram_req_rw;
wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_dram_req_byteen;
wire [NUM_BANKS-1:0][`DRAM_ADDR_WIDTH-1:0] per_bank_dram_req_addr;
wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_dram_req_data;
wire [NUM_BANKS-1:0] per_bank_dram_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][CACHE_LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0][`CACHE_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_dram_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
wire [`CACHE_LINE_WIDTH-1:0] dram_rsp_data_qual;
wire [DRAM_TAG_WIDTH-1:0] dram_rsp_tag_qual;
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_qual;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_qual;
wire [`LINE_SELECT_BITS-1:0] flush_addr;
wire flush_enable;
@ -129,35 +127,35 @@ module VX_cache #(
///////////////////////////////////////////////////////////////////////////
wire drsq_full, drsq_empty;
wire drsq_push, drsq_pop;
wire mrsq_full, mrsq_empty;
wire mrsq_push, mrsq_pop;
assign drsq_push = dram_rsp_valid && dram_rsp_ready;
assign dram_rsp_ready = !drsq_full;
assign mrsq_push = mem_rsp_valid && mem_rsp_ready;
assign mem_rsp_ready = !mrsq_full;
VX_fifo_queue #(
.DATAW (DRAM_TAG_WIDTH + `CACHE_LINE_WIDTH),
.SIZE (DRSQ_SIZE),
.DATAW (MEM_TAG_WIDTH + `CACHE_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.BUFFERED (1)
) dram_rsp_queue (
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.push (drsq_push),
.pop (drsq_pop),
.data_in ({dram_rsp_tag, dram_rsp_data}),
.data_out ({dram_rsp_tag_qual, dram_rsp_data_qual}),
.empty (drsq_empty),
.full (drsq_full),
.push (mrsq_push),
.pop (mrsq_pop),
.data_in ({mem_rsp_tag, mem_rsp_data}),
.data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}),
.empty (mrsq_empty),
.full (mrsq_full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
if (NUM_BANKS == 1) begin
`UNUSED_VAR (dram_rsp_tag_qual)
assign drsq_pop = !drsq_empty && per_bank_dram_rsp_ready;
`UNUSED_VAR (mem_rsp_tag_qual)
assign mrsq_pop = !mrsq_empty && per_bank_mem_rsp_ready;
end else begin
assign drsq_pop = !drsq_empty && per_bank_dram_rsp_ready[`DRAM_ADDR_BANK(dram_rsp_tag_qual)];
assign mrsq_pop = !mrsq_empty && per_bank_mem_rsp_ready[`MEM_ADDR_BANK(mem_rsp_tag_qual)];
end
///////////////////////////////////////////////////////////////////////////
@ -176,6 +174,7 @@ module VX_cache #(
///////////////////////////////////////////////////////////////////////////
VX_cache_core_req_bank_sel #(
.CACHE_ID (CACHE_ID),
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (NUM_PORTS),
@ -227,17 +226,17 @@ module VX_cache #(
wire [CORE_TAG_WIDTH-1:0] curr_bank_core_rsp_tag;
wire curr_bank_core_rsp_ready;
wire curr_bank_dram_req_valid;
wire curr_bank_dram_req_rw;
wire [CACHE_LINE_SIZE-1:0] curr_bank_dram_req_byteen;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_req_addr;
wire[`CACHE_LINE_WIDTH-1:0] curr_bank_dram_req_data;
wire curr_bank_dram_req_ready;
wire curr_bank_mem_req_valid;
wire curr_bank_mem_req_rw;
wire [CACHE_LINE_SIZE-1:0] curr_bank_mem_req_byteen;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire[`CACHE_LINE_WIDTH-1:0] curr_bank_mem_req_data;
wire curr_bank_mem_req_ready;
wire curr_bank_dram_rsp_valid;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_dram_rsp_addr;
wire [`CACHE_LINE_WIDTH-1:0] curr_bank_dram_rsp_data;
wire curr_bank_dram_rsp_ready;
wire curr_bank_mem_rsp_valid;
wire [`LINE_ADDR_WIDTH-1:0] curr_bank_mem_rsp_addr;
wire [`CACHE_LINE_WIDTH-1:0] curr_bank_mem_rsp_data;
wire curr_bank_mem_rsp_ready;
// Core Req
assign curr_bank_core_req_valid = per_bank_core_req_valid[i];
@ -258,28 +257,28 @@ module VX_cache #(
assign per_bank_core_rsp_tag [i] = curr_bank_core_rsp_tag;
assign per_bank_core_rsp_data [i] = curr_bank_core_rsp_data;
// DRAM request
assign per_bank_dram_req_valid[i] = curr_bank_dram_req_valid;
assign per_bank_dram_req_rw[i] = curr_bank_dram_req_rw;
assign per_bank_dram_req_byteen[i] = curr_bank_dram_req_byteen;
// Memory request
assign per_bank_mem_req_valid[i] = curr_bank_mem_req_valid;
assign per_bank_mem_req_rw[i] = curr_bank_mem_req_rw;
assign per_bank_mem_req_byteen[i] = curr_bank_mem_req_byteen;
if (NUM_BANKS == 1) begin
assign per_bank_dram_req_addr[i] = curr_bank_dram_req_addr;
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
end else begin
assign per_bank_dram_req_addr[i] = `LINE_TO_DRAM_ADDR(curr_bank_dram_req_addr, i);
assign per_bank_mem_req_addr[i] = `LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
end
assign per_bank_dram_req_data[i] = curr_bank_dram_req_data;
assign curr_bank_dram_req_ready = per_bank_dram_req_ready[i];
assign per_bank_mem_req_data[i] = curr_bank_mem_req_data;
assign curr_bank_mem_req_ready = per_bank_mem_req_ready[i];
// DRAM response
// Memory response
if (NUM_BANKS == 1) begin
assign curr_bank_dram_rsp_valid = !drsq_empty;
assign curr_bank_dram_rsp_addr = dram_rsp_tag_qual;
assign curr_bank_mem_rsp_valid = !mrsq_empty;
assign curr_bank_mem_rsp_addr = mem_rsp_tag_qual;
end else begin
assign curr_bank_dram_rsp_valid = !drsq_empty && (`DRAM_ADDR_BANK(dram_rsp_tag_qual) == i);
assign curr_bank_dram_rsp_addr = `DRAM_TO_LINE_ADDR(dram_rsp_tag_qual);
assign curr_bank_mem_rsp_valid = !mrsq_empty && (`MEM_ADDR_BANK(mem_rsp_tag_qual) == i);
assign curr_bank_mem_rsp_addr = `MEM_TO_LINE_ADDR(mem_rsp_tag_qual);
end
assign curr_bank_dram_rsp_data = dram_rsp_data_qual;
assign per_bank_dram_rsp_ready[i] = curr_bank_dram_rsp_ready;
assign curr_bank_mem_rsp_data = mem_rsp_data_qual;
assign per_bank_mem_rsp_ready[i] = curr_bank_mem_rsp_ready;
VX_bank #(
.BANK_ID (i),
@ -292,12 +291,11 @@ module VX_cache #(
.NUM_REQS (NUM_REQS),
.CREQ_SIZE (CREQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.DREQ_SIZE (DREQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.CORE_TAG_WIDTH (CORE_TAG_WIDTH),
.CORE_TAG_ID_BITS (CORE_TAG_ID_BITS),
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET),
.IN_ORDER_DRAM (IN_ORDER_DRAM)
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
) bank (
`SCOPE_BIND_VX_cache_bank(i)
@ -330,19 +328,19 @@ module VX_cache #(
.core_rsp_tag (curr_bank_core_rsp_tag),
.core_rsp_ready (curr_bank_core_rsp_ready),
// DRAM request
.dram_req_valid (curr_bank_dram_req_valid),
.dram_req_rw (curr_bank_dram_req_rw),
.dram_req_byteen (curr_bank_dram_req_byteen),
.dram_req_addr (curr_bank_dram_req_addr),
.dram_req_data (curr_bank_dram_req_data),
.dram_req_ready (curr_bank_dram_req_ready),
// Memory request
.mem_req_valid (curr_bank_mem_req_valid),
.mem_req_rw (curr_bank_mem_req_rw),
.mem_req_byteen (curr_bank_mem_req_byteen),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_data (curr_bank_mem_req_data),
.mem_req_ready (curr_bank_mem_req_ready),
// DRAM response
.dram_rsp_valid (curr_bank_dram_rsp_valid),
.dram_rsp_addr (curr_bank_dram_rsp_addr),
.dram_rsp_data (curr_bank_dram_rsp_data),
.dram_rsp_ready (curr_bank_dram_rsp_ready),
// Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_addr (curr_bank_mem_rsp_addr),
.mem_rsp_data (curr_bank_mem_rsp_data),
.mem_rsp_ready (curr_bank_mem_rsp_ready),
// flush
.flush_enable (flush_enable),
@ -351,6 +349,7 @@ module VX_cache #(
end
VX_cache_core_rsp_merge #(
.CACHE_ID (CACHE_ID),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (NUM_PORTS),
.WORD_SIZE (WORD_SIZE),
@ -372,27 +371,27 @@ module VX_cache #(
.core_rsp_ready (core_rsp_ready)
);
wire [NUM_BANKS-1:0][(`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in;
wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign data_in[i] = {per_bank_dram_req_addr[i], per_bank_dram_req_rw[i], per_bank_dram_req_byteen[i], per_bank_dram_req_data[i]};
assign data_in[i] = {per_bank_mem_req_addr[i], per_bank_mem_req_rw[i], per_bank_mem_req_byteen[i], per_bank_mem_req_data[i]};
end
VX_stream_arbiter #(
.NUM_REQS (NUM_BANKS),
.DATAW (`DRAM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH),
.DATAW (`MEM_ADDR_WIDTH + 1 + CACHE_LINE_SIZE + `CACHE_LINE_WIDTH),
.BUFFERED (1)
) dram_req_arb (
) mem_req_arb (
.clk (clk),
.reset (reset),
.valid_in (per_bank_dram_req_valid),
.valid_in (per_bank_mem_req_valid),
.data_in (data_in),
.ready_in (per_bank_dram_req_ready),
.valid_out (dram_req_valid),
.data_out ({dram_req_addr, dram_req_rw, dram_req_byteen, dram_req_data}),
.ready_out (dram_req_ready)
.ready_in (per_bank_mem_req_ready),
.valid_out (mem_req_valid),
.data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data}),
.ready_out (mem_req_ready)
);
assign dram_req_tag = dram_req_addr;
assign mem_req_tag = mem_req_addr;
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes
@ -420,13 +419,13 @@ module VX_cache #(
assign perf_mshr_stall_per_cycle = $countones(perf_mshr_stall_per_bank);
assign perf_pipe_stall_per_cycle = $countones(perf_pipe_stall_per_bank);
reg [43:0] perf_core_reads;
reg [43:0] perf_core_writes;
reg [43:0] perf_read_misses;
reg [43:0] perf_write_misses;
reg [43:0] perf_mshr_stalls;
reg [43:0] perf_pipe_stalls;
reg [43:0] perf_crsp_stalls;
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
reg [`PERF_CTR_BITS-1:0] perf_read_misses;
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
@ -438,13 +437,13 @@ module VX_cache #(
perf_pipe_stalls <= 0;
perf_crsp_stalls <= 0;
end else begin
perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle);
perf_read_misses <= perf_read_misses + 44'(perf_read_miss_per_cycle);
perf_write_misses <= perf_write_misses+ 44'(perf_write_miss_per_cycle);
perf_mshr_stalls <= perf_mshr_stalls + 44'(perf_mshr_stall_per_cycle);
perf_pipe_stalls <= perf_pipe_stalls + 44'(perf_pipe_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle);
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle);
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end
end

View file

@ -1,6 +1,8 @@
`include "VX_cache_define.vh"
module VX_cache_core_req_bank_sel #(
parameter CACHE_ID = 0,
// Size of line inside a bank in bytes
parameter CACHE_LINE_SIZE = 64,
// Size of a word in bytes
@ -22,7 +24,7 @@ module VX_cache_core_req_bank_sel #(
input wire reset,
`ifdef PERF_ENABLE
output wire [43:0] bank_stalls,
output wire [`PERF_CTR_BITS-1:0] bank_stalls,
`endif
input wire [NUM_REQS-1:0] core_req_valid,
@ -43,6 +45,7 @@ module VX_cache_core_req_bank_sel #(
output wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag,
input wire [`BANK_READY_COUNT-1:0] per_bank_core_req_ready
);
`UNUSED_PARAM (CACHE_ID)
`STATIC_ASSERT (NUM_REQS >= NUM_BANKS, ("invalid number of banks"));
`UNUSED_VAR (clk)
@ -148,7 +151,7 @@ module VX_cache_core_req_bank_sel #(
end
end
end
end else begin
always @(*) begin
@ -303,13 +306,13 @@ module VX_cache_core_req_bank_sel #(
end
end
reg [43:0] bank_stalls_r;
reg [`PERF_CTR_BITS-1:0] bank_stalls_r;
always @(posedge clk) begin
if (reset) begin
bank_stalls_r <= 0;
end else begin
bank_stalls_r <= bank_stalls_r + 44'($countones(core_req_sel_r & ~core_req_ready));
bank_stalls_r <= bank_stalls_r + `PERF_CTR_BITS'($countones(core_req_sel_r & ~core_req_ready));
end
end

View file

@ -1,6 +1,8 @@
`include "VX_cache_define.vh"
module VX_cache_core_rsp_merge #(
parameter CACHE_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
// Number of banks
@ -31,6 +33,8 @@ module VX_cache_core_rsp_merge #(
output wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
input wire [`CORE_REQ_TAG_COUNT-1:0] core_rsp_ready
);
`UNUSED_PARAM (CACHE_ID)
if (NUM_BANKS > 1) begin
reg [NUM_REQS-1:0] core_rsp_valid_unqual;
@ -39,6 +43,10 @@ module VX_cache_core_rsp_merge #(
if (CORE_TAG_ID_BITS != 0) begin
// The core response bus handles a single tag at the time
// We first need to select the current tag to process,
// then send all bank responses for that tag as a batch
reg [CORE_TAG_WIDTH-1:0] core_rsp_tag_unqual;
wire core_rsp_ready_unqual;

View file

@ -21,8 +21,8 @@
`define WORDS_PER_LINE (CACHE_LINE_SIZE / WORD_SIZE)
`define WORD_ADDR_WIDTH (32-`CLOG2(WORD_SIZE))
`define DRAM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE))
`define LINE_ADDR_WIDTH (`DRAM_ADDR_WIDTH-`BANK_SELECT_BITS)
`define MEM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE))
`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`BANK_SELECT_BITS)
// Word select
`define WORD_SELECT_BITS `CLOG2(`WORDS_PER_LINE)
@ -59,11 +59,11 @@
`define BANK_READY_COUNT ((SHARED_BANK_READY != 0) ? 1 : NUM_BANKS)
`define DRAM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET]
`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET]
`define DRAM_TO_LINE_ADDR(x) x[`DRAM_ADDR_WIDTH-1 : `BANK_SELECT_BITS]
`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS]
`define LINE_TO_DRAM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
`define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}

View file

@ -4,25 +4,25 @@ module VX_shared_mem #(
parameter CACHE_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = (1024*16),
// Number of banks
parameter NUM_BANKS = 4,
parameter NUM_BANKS = 2,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Number of Word requests per cycle
parameter NUM_REQS = NUM_BANKS,
parameter NUM_REQS = 4,
// Core Request Queue Size
parameter CREQ_SIZE = 4,
// core request tag size
parameter CORE_TAG_WIDTH = 1,
parameter CREQ_SIZE = 8,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0,
parameter CORE_TAG_ID_BITS = 8,
// core request tag size
parameter CORE_TAG_WIDTH = (2 + CORE_TAG_ID_BITS),
// bank offset from beginning of index range
parameter BANK_ADDR_OFFSET = 0
parameter BANK_ADDR_OFFSET = `CLOG2(256)
) (
input wire clk,
input wire reset,
@ -54,13 +54,6 @@ module VX_shared_mem #(
localparam CACHE_LINE_SIZE = WORD_SIZE;
`ifdef DBG_CACHE_REQ_INFO
/* verilator lint_off UNUSED */
wire [31:0] debug_pc_st0;
wire [`NW_BITS-1:0] debug_wid_st0;
/* verilator lint_on UNUSED */
`endif
wire [NUM_BANKS-1:0] per_bank_core_req_valid_unqual;
wire [NUM_BANKS-1:0] per_bank_core_req_rw_unqual;
wire [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr_unqual;
@ -71,6 +64,7 @@ module VX_shared_mem #(
wire per_bank_core_req_ready_unqual;
VX_cache_core_req_bank_sel #(
.CACHE_ID (CACHE_ID),
.CACHE_LINE_SIZE (WORD_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_PORTS (1),
@ -108,20 +102,26 @@ module VX_shared_mem #(
wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_core_req_byteen;
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][CORE_TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][`REQS_BITS-1:0] per_bank_core_req_tid;
wire creq_push, creq_pop, creq_empty, creq_full;
wire crsq_in_ready;
wire crsq_in_fire_last;
wire [NUM_BANKS-1:0] per_bank_rsp_valid = per_bank_core_req_valid & ~per_bank_core_req_rw;
wire core_req_has_read = (| per_bank_rsp_valid);
assign creq_push = (| core_req_valid) && !creq_full;
assign creq_pop = ~creq_empty && crsq_in_ready;
assign creq_push = (| core_req_valid) && ~creq_full;
assign creq_pop = (~creq_empty && ~core_req_has_read)
|| crsq_in_fire_last;
assign per_bank_core_req_ready_unqual = ~creq_full;
wire [NUM_REQS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual;
wire [NUM_BANKS-1:0][`LINE_SELECT_BITS-1:0] per_bank_core_req_addr_qual;
`UNUSED_VAR (per_bank_core_req_addr_unqual)
for (genvar i = 0; i < NUM_REQS; i++) begin
for (genvar i = 0; i < NUM_BANKS; i++) begin
assign per_bank_core_req_addr_qual[i] = per_bank_core_req_addr_unqual[i][`LINE_SELECT_BITS-1:0];
end
@ -155,9 +155,14 @@ module VX_shared_mem #(
`UNUSED_PIN (size)
);
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data;
wire [NUM_BANKS-1:0][`WORD_WIDTH-1:0] per_bank_core_rsp_data;
for (genvar i = 0; i < NUM_BANKS; i++) begin
wire wren = per_bank_core_req_rw[i]
&& per_bank_core_req_valid[i]
&& creq_pop;
VX_sp_ram #(
.DATAW (`WORD_WIDTH),
.SIZE (`LINES_PER_BANK),
@ -166,13 +171,41 @@ module VX_shared_mem #(
) data (
.clk (clk),
.addr (per_bank_core_req_addr[i]),
.wren (per_bank_core_req_valid[i] && per_bank_core_req_rw[i]),
.wren (wren),
.byteen (per_bank_core_req_byteen[i]),
.rden (1'b1),
.din (per_bank_core_req_data[i]),
.dout (per_bank_core_rsp_data[i])
);
end
// The core response bus handles a single tag at the time
// We first need to select the current tag to process,
// then send all bank responses for that tag as a batch
wire crsq_in_valid, crsq_in_ready;
reg [NUM_BANKS-1:0] bank_rsp_sel, bank_rsp_sel_r;
wire [NUM_BANKS-1:0] bank_rsp_sel_n = bank_rsp_sel | bank_rsp_sel_r;
wire crsq_in_fire = crsq_in_valid && crsq_in_ready;
assign crsq_in_fire_last = crsq_in_fire && (bank_rsp_sel_n == per_bank_rsp_valid);
always @(posedge clk) begin
if (reset) begin
bank_rsp_sel <= 0;
end else begin
if (crsq_in_fire) begin
if (bank_rsp_sel_n == per_bank_rsp_valid) begin
bank_rsp_sel <= 0;
end else begin
bank_rsp_sel <= bank_rsp_sel_n;
end
end
end
end
reg [NUM_REQS-1:0] core_rsp_valids_in;
reg [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_in;
@ -180,31 +213,30 @@ module VX_shared_mem #(
always @(*) begin
core_rsp_valids_in = 0;
core_rsp_data_in = 'x;
core_rsp_data_in = 'x;
core_rsp_tag_in = 'x;
for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_req_valid[i]) begin
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
bank_rsp_sel_r = 0;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_rsp_valid[i] && ~bank_rsp_sel[i]) begin
core_rsp_tag_in = per_bank_core_req_tag[i];
end
end
end
`ifdef DBG_CACHE_REQ_INFO
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_pc_st0, debug_wid_st0} = core_rsp_tag_in[`CACHE_REQ_INFO_RNG];
end else begin
assign {debug_pc_st0, debug_wid_st0} = 0;
for (integer i = 0; i < NUM_BANKS; i++) begin
if (per_bank_core_req_valid[i]
&& (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
bank_rsp_sel_r[i] = 1;
end
end
end
`endif
wire [NUM_REQS-1:0] core_rsp_valids_out;
wire core_rsp_valid_out;
wire core_rsp_rw = | (per_bank_core_req_valid & per_bank_core_req_rw);
wire crsq_in_valid = ~creq_empty && ~core_rsp_rw;
assign crsq_in_valid = ~creq_empty && core_req_has_read;
VX_skid_buffer #(
.DATAW (NUM_BANKS * (1 + `WORD_WIDTH) + CORE_TAG_WIDTH)
@ -221,16 +253,82 @@ module VX_shared_mem #(
assign core_rsp_valid = core_rsp_valids_out & {NUM_REQS{core_rsp_valid_out}};
`ifdef DBG_CACHE_REQ_INFO
`IGNORE_WARNINGS_BEGIN
wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1;
wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1;
`IGNORE_WARNINGS_END
for (genvar i = 0; i < NUM_BANKS; ++i) begin
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
assign {debug_pc_st0[i], debug_wid_st0[i]} = per_bank_core_req_tag_unqual[i][CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS];
assign {debug_pc_st1[i], debug_wid_st1[i]} = per_bank_core_req_tag[i][CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS];
end else begin
assign {debug_pc_st0[i], debug_wid_st0[i]} = 0;
assign {debug_pc_st1[i], debug_wid_st1[i]} = 0;
end
end
`endif
`ifdef DBG_PRINT_CACHE_BANK
reg is_multi_tag_req;
`IGNORE_WARNINGS_BEGIN
reg [CORE_TAG_WIDTH-1:0] core_req_tag_sel;
`IGNORE_WARNINGS_END
always @(*) begin
core_req_tag_sel ='x;
for (integer i = NUM_BANKS-1; i >= 0; --i) begin
if (per_bank_core_req_valid[i]) begin
core_req_tag_sel = per_bank_core_req_tag[i];
end
end
is_multi_tag_req = 0;
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid[i]
&& (core_req_tag_sel[CORE_TAG_ID_BITS-1:0] != per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
is_multi_tag_req = !creq_empty;
end
end
end
always @(posedge clk) begin
if (!crsq_in_ready) begin
$display("%t: cache%0d pipeline-stall", $time, CACHE_ID);
$display("%t: *** cache%0d pipeline-stall", $time, CACHE_ID);
end
if (is_multi_tag_req) begin
$display("%t: *** cache%0d multi-tag request!", $time, CACHE_ID);
end
if (creq_push) begin
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid_unqual[i]) begin
if (per_bank_core_req_rw_unqual[i]) begin
$display("%t: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i],
debug_wid_st0[i], debug_pc_st0[i]);
end else begin
$display("%t: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr_unqual[i], per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i],
debug_wid_st0[i], debug_pc_st0[i]);
end
end
end
end
if (creq_pop) begin
if (core_rsp_rw)
$display("%t: cache%0d core-wr-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_req_data, debug_wid_st0, debug_pc_st0);
else
$display("%t: cache%0d core-rd-req: tmask=%0b, addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h", $time, CACHE_ID, per_bank_core_req_valid, per_bank_core_req_addr, per_bank_core_req_tag, per_bank_core_req_byteen, per_bank_core_rsp_data, debug_wid_st0, debug_pc_st0);
for (integer i = 0; i < NUM_BANKS; ++i) begin
if (per_bank_core_req_valid[i]) begin
if (per_bank_core_req_rw[i]) begin
$display("%t: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i],
debug_wid_st1[i], debug_pc_st1[i]);
end else begin
$display("%t: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h",
$time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i],
debug_wid_st1[i], debug_pc_st1[i]);
end
end
end
end
end
`endif
@ -249,9 +347,9 @@ module VX_shared_mem #(
assign perf_crsp_stall_per_cycle = $countones(core_rsp_valid & ~core_rsp_ready);
end
reg [43:0] perf_core_reads;
reg [43:0] perf_core_writes;
reg [43:0] perf_crsp_stalls;
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
@ -259,9 +357,9 @@ module VX_shared_mem #(
perf_core_writes <= 0;
perf_crsp_stalls <= 0;
end else begin
perf_core_reads <= perf_core_reads + 44'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + 44'(perf_core_writes_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + 44'(perf_crsp_stall_per_cycle);
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end
end

View file

@ -1,22 +0,0 @@
`ifndef VX_CACHE_DRAM_REQ_IF
`define VX_CACHE_DRAM_REQ_IF
`include "../cache/VX_cache_define.vh"
interface VX_cache_dram_req_if #(
parameter DRAM_LINE_WIDTH = 1,
parameter DRAM_ADDR_WIDTH = 1,
parameter DRAM_TAG_WIDTH = 1
) ();
wire valid;
wire rw;
wire [(DRAM_LINE_WIDTH/8)-1:0] byteen;
wire [DRAM_ADDR_WIDTH-1:0] addr;
wire [DRAM_LINE_WIDTH-1:0] data;
wire [DRAM_TAG_WIDTH-1:0] tag;
wire ready;
endinterface
`endif

View file

@ -1,18 +0,0 @@
`ifndef VX_CACHE_DRAM_RSP_IF
`define VX_CACHE_DRAM_RSP_IF
`include "../cache/VX_cache_define.vh"
interface VX_cache_dram_rsp_if #(
parameter DRAM_LINE_WIDTH = 1,
parameter DRAM_TAG_WIDTH = 1
) ();
wire valid;
wire [DRAM_LINE_WIDTH-1:0] data;
wire [DRAM_TAG_WIDTH-1:0] tag;
wire ready;
endinterface
`endif

View file

@ -0,0 +1,23 @@
`ifndef VX_CACHE_MEM_REQ_IF
`define VX_CACHE_MEM_REQ_IF
`include "../cache/VX_cache_config.vh"
interface VX_cache_mem_req_if #(
parameter MEM_LINE_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_TAG_WIDTH = 1,
parameter MEM_LINE_SIZE = MEM_LINE_WIDTH / 8
) ();
wire valid;
wire rw;
wire [MEM_LINE_SIZE-1:0] byteen;
wire [MEM_ADDR_WIDTH-1:0] addr;
wire [MEM_LINE_WIDTH-1:0] data;
wire [MEM_TAG_WIDTH-1:0] tag;
wire ready;
endinterface
`endif

View file

@ -0,0 +1,18 @@
`ifndef VX_CACHE_MEM_RSP_IF
`define VX_CACHE_MEM_RSP_IF
`include "../cache/VX_cache_config.vh"
interface VX_cache_mem_rsp_if #(
parameter MEM_LINE_WIDTH = 1,
parameter MEM_TAG_WIDTH = 1
) ();
wire valid;
wire [MEM_LINE_WIDTH-1:0] data;
wire [MEM_TAG_WIDTH-1:0] tag;
wire ready;
endinterface
`endif

View file

@ -5,14 +5,14 @@
interface VX_perf_cache_if ();
wire [43:0] reads;
wire [43:0] writes;
wire [43:0] read_misses;
wire [43:0] write_misses;
wire [43:0] bank_stalls;
wire [43:0] mshr_stalls;
wire [43:0] pipe_stalls;
wire [43:0] crsp_stalls;
wire [`PERF_CTR_BITS-1:0] reads;
wire [`PERF_CTR_BITS-1:0] writes;
wire [`PERF_CTR_BITS-1:0] read_misses;
wire [`PERF_CTR_BITS-1:0] write_misses;
wire [`PERF_CTR_BITS-1:0] bank_stalls;
wire [`PERF_CTR_BITS-1:0] mshr_stalls;
wire [`PERF_CTR_BITS-1:0] pipe_stalls;
wire [`PERF_CTR_BITS-1:0] crsp_stalls;
endinterface

View file

@ -5,28 +5,28 @@
interface VX_perf_memsys_if ();
wire [43:0] icache_reads;
wire [43:0] icache_read_misses;
wire [43:0] icache_pipe_stalls;
wire [43:0] icache_crsp_stalls;
wire [`PERF_CTR_BITS-1:0] icache_reads;
wire [`PERF_CTR_BITS-1:0] icache_read_misses;
wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls;
wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls;
wire [43:0] dcache_reads;
wire [43:0] dcache_writes;
wire [43:0] dcache_read_misses;
wire [43:0] dcache_write_misses;
wire [43:0] dcache_bank_stalls;
wire [43:0] dcache_mshr_stalls;
wire [43:0] dcache_pipe_stalls;
wire [43:0] dcache_crsp_stalls;
wire [`PERF_CTR_BITS-1:0] dcache_reads;
wire [`PERF_CTR_BITS-1:0] dcache_writes;
wire [`PERF_CTR_BITS-1:0] dcache_read_misses;
wire [`PERF_CTR_BITS-1:0] dcache_write_misses;
wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls;
wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls;
wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls;
wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls;
wire [43:0] smem_reads;
wire [43:0] smem_writes;
wire [43:0] smem_bank_stalls;
wire [`PERF_CTR_BITS-1:0] smem_reads;
wire [`PERF_CTR_BITS-1:0] smem_writes;
wire [`PERF_CTR_BITS-1:0] smem_bank_stalls;
wire [43:0] dram_reads;
wire [43:0] dram_writes;
wire [43:0] dram_stalls;
wire [43:0] dram_latency;
wire [`PERF_CTR_BITS-1:0] mem_reads;
wire [`PERF_CTR_BITS-1:0] mem_writes;
wire [`PERF_CTR_BITS-1:0] mem_stalls;
wire [`PERF_CTR_BITS-1:0] mem_latency;
endinterface

View file

@ -4,14 +4,14 @@
`include "VX_define.vh"
interface VX_perf_pipeline_if ();
wire [43:0] ibf_stalls;
wire [43:0] scb_stalls;
wire [43:0] lsu_stalls;
wire [43:0] csr_stalls;
wire [43:0] alu_stalls;
wire [43:0] gpu_stalls;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
wire [`PERF_CTR_BITS-1:0] csr_stalls;
wire [`PERF_CTR_BITS-1:0] alu_stalls;
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
`ifdef EXT_F_ENABLE
wire [43:0] fpu_stalls;
wire [`PERF_CTR_BITS-1:0] fpu_stalls;
`endif
endinterface

View file

@ -94,13 +94,13 @@ module VX_scope #(
delay_val <= $bits(delay_val)'(cmd_data);
cmd_start <= 1;
`ifdef DBG_PRINT_SCOPE
$display("*** scope:CMD_SET_START: delay_val=%0d", $bits(delay_val)'(cmd_data));
$display("%t: *** scope: CMD_SET_START: delay_val=%0d", $time, $bits(delay_val)'(cmd_data));
`endif
end
CMD_SET_STOP: begin
waddr_end <= $bits(waddr)'(cmd_data);
`ifdef DBG_PRINT_SCOPE
$display("*** scope:CMD_SET_STOP: waddr_end=%0d", $bits(waddr)'(cmd_data));
$display("%t: *** scope: CMD_SET_STOP: waddr_end=%0d", $time, $bits(waddr)'(cmd_data));
`endif
end
default:;
@ -117,7 +117,7 @@ module VX_scope #(
delay_cntr <= 0;
start_time <= timestamp;
`ifdef DBG_PRINT_SCOPE
$display("*** scope: recording start - start_time=%0d", timestamp);
$display("%t: *** scope: recording start - start_time=%0d", $time, timestamp);
`endif
end else begin
start_wait <= 1;
@ -133,7 +133,7 @@ module VX_scope #(
delta <= 0;
start_time <= timestamp;
`ifdef DBG_PRINT_SCOPE
$display("*** scope: recording start - start_time=%0d", timestamp);
$display("%t: *** scope: recording start - start_time=%0d", $time, timestamp);
`endif
end
end
@ -162,7 +162,7 @@ module VX_scope #(
if (stop
|| (waddr >= waddr_end)) begin
`ifdef DBG_PRINT_SCOPE
$display("*** scope: recording stop - waddr=(%0d, %0d)", waddr, waddr_end);
$display("%t: *** scope: recording stop - waddr=(%0d, %0d)", $time, waddr, waddr_end);
`endif
waddr <= waddr; // keep last address
recording <= 0;

View file

@ -2,6 +2,7 @@
# coding=utf-8
from __future__ import print_function
import sys
import os
import os.path as path
import re
@ -10,55 +11,19 @@ from datetime import datetime
script_dir = path.dirname(path.realpath(__file__))
defines = {}
for k, v in os.environ.items():
if k.upper().startswith('V_'):
defines[k[2:]] = v
print('Custom params:', ', '.join(['='.join(x) for x in defines.items()]))
parser = argparse.ArgumentParser()
parser.add_argument('--outc', default='none', help='Output C header')
parser.add_argument('--outv', default='none', help='Output Verilog header')
parser.add_argument('-i', "--input", default='none', help='Verilog header')
parser.add_argument('-o', "--output", default='none', help='C header')
args = parser.parse_args()
if args.outc == 'none' and args.outv == 'none':
print('Warning: not emitting any files. Specify arguments')
if args.outv != 'none':
with open(args.outv, 'w') as f:
print('''
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at {date}
`ifndef VX_USER_CONFIG
`define VX_USER_CONFIG
'''[1:].format(date=datetime.now()), file=f)
for k, v in defines.items():
print('`define {} {}'.format(k, v), file=f)
print('\n`endif', file=f)
if args.outc != 'none':
with open(args.outc, 'w') as f:
print('''
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at {date}
#ifndef VX_USER_CONFIG
#define VX_USER_CONFIG
'''[1:].format(date=datetime.now()), file=f)
for k, v in defines.items():
print('#define {} {}'.format(k, v), file=f)
print('\n#endif', file=f)
if args.input == 'none' or args.output == 'none':
print('Error: invalid arguments')
sys.exit()
translation_rules = [
# preprocessor directives
(re.compile(r'^\s*`include .*$'), r''),
(re.compile(r'`include\s+.*$'), r''),
(re.compile(r'`ifdef'), r'#ifdef'),
(re.compile(r'`ifndef'), r'#ifndef'),
(re.compile(r'`elif'), r'#elif'),
@ -75,25 +40,24 @@ translation_rules = [
(re.compile(r"\d+'h([\da-fA-F]+)"), r'0x\1')
]
if args.outc != 'none':
with open(args.outc, 'a') as f:
print('''
with open(args.output, 'w') as f:
print('''
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at {date}
// Translated from VX_config.vh:
'''[1:].format(date=datetime.now()), file=f)
with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r:
lineno = 0
for line in r:
for pat, repl in translation_rules:
match = pat.search(line)
if match:
line = re.sub(pat, repl, line)
#print("*** match @" + str(lineno) + ": " + match.group() + " => " + line)
f.write(line)
lineno = lineno + 1
print('''
with open(args.input, 'r') as r:
lineno = 0
for line in r:
for pat, repl in translation_rules:
match = pat.search(line)
if match:
line = re.sub(pat, repl, line)
#print("*** match @" + str(lineno) + ": " + match.group() + " => " + line)
f.write(line)
lineno = lineno + 1
print('''
'''[1:], file=f)

View file

@ -97,9 +97,8 @@
"avs_byteenable":64,
"avs_burstcount":4,
"avs_readdatavalid":1,
"mem_bank_select":1,
"cci_dram_rd_req_ctr":26,
"cci_dram_wr_req_ctr":26,
"cci_mem_rd_req_ctr":26,
"cci_mem_wr_req_ctr":26,
"cci_rd_req_ctr":26,
"cci_rd_rsp_ctr":3,
"cci_wr_req_ctr":26,
@ -110,23 +109,23 @@
"!cci_pending_reads_full":1,
"!cci_pending_writes_empty":1,
"!cci_pending_writes_full": 1,
"?afu_dram_req_fire": 1,
"afu_dram_req_addr": 26,
"afu_dram_req_tag": 28,
"?afu_dram_rsp_fire": 1,
"afu_dram_rsp_tag": 28
"?afu_mem_req_fire": 1,
"afu_mem_req_addr": 26,
"afu_mem_req_tag": 28,
"?afu_mem_rsp_fire": 1,
"afu_mem_rsp_tag": 28
},
"afu/vortex": {
"!reset": 1,
"?dram_req_fire": 1,
"dram_req_addr": 32,
"dram_req_rw": 1,
"dram_req_byteen":"`VX_DRAM_BYTEEN_WIDTH",
"dram_req_data":"`VX_DRAM_LINE_WIDTH",
"dram_req_tag":"`VX_DRAM_TAG_WIDTH",
"?dram_rsp_fire": 1,
"dram_rsp_data":"`VX_DRAM_LINE_WIDTH",
"dram_rsp_tag":"`VX_DRAM_TAG_WIDTH",
"?mem_req_fire": 1,
"mem_req_addr": 32,
"mem_req_rw": 1,
"mem_req_byteen":"`VX_MEM_BYTEEN_WIDTH",
"mem_req_data":"`VX_MEM_LINE_WIDTH",
"mem_req_tag":"`VX_MEM_TAG_WIDTH",
"?mem_rsp_fire": 1,
"mem_rsp_data":"`VX_MEM_LINE_WIDTH",
"mem_rsp_tag":"`VX_MEM_TAG_WIDTH",
"busy": 1
},
"afu/vortex/cluster/core/pipeline/fetch/icache_stage": {
@ -207,7 +206,7 @@
"force_miss_st0": 1,
"mshr_push": 1,
"?crsq_in_stall": 1,
"?dreq_alm_full": 1,
"?mreq_alm_full": 1,
"?mshr_alm_full": 1
}
}

View file

@ -1,7 +1,7 @@
CFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
#CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CFLAGS += -Wno-aligned-new -Wno-maybe-uninitialized
CFLAGS += -Wno-maybe-uninitialized
CFLAGS += -I../..
@ -13,7 +13,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_DRAM
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
@ -22,11 +22,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
SINGLECORE += -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0
SINGLECORE = -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0
#MULTICORE ?= -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
#MULTICORE = -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=1
#MULTICORE = -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1
MULTICORE = -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
SINGLECORE += $(CONFIGS)
MULTICORE += $(CONFIGS)
@ -44,15 +44,16 @@ SRCS = simulator.cpp testbench.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += verilator.vlt
VL_FLAGS += --exe $(SRCS) $(RTL_INCLUDE)
VL_FLAGS += --cc Vortex.v --top-module $(TOP)
# Use FPNEW PFU core
VL_FLAGS += -DFPU_FPNEW
# FPU backend
FPU_CORE ?= FPU_FPNEW
VL_FLAGS += -D$(FPU_CORE)
DBG_FLAGS += -DVCD_OUTPUT

View file

@ -5,10 +5,23 @@
#define RESET_DELAY 4
#define ENABLE_DRAM_STALLS
#define DRAM_LATENCY 24
#define DRAM_RQ_SIZE 16
#define DRAM_STALLS_MODULO 16
#define ENABLE_MEM_STALLS
#ifndef MEM_LATENCY
#define MEM_LATENCY 24
#endif
#ifndef MEM_RQ_SIZE
#define MEM_RQ_SIZE 16
#endif
#ifndef MEM_STALLS_MODULO
#define MEM_STALLS_MODULO 16
#endif
#ifndef VERILATOR_RESET_VALUE
#define VERILATOR_RESET_VALUE 2
#endif
#define VL_WDATA_GETW(lwp, i, n, w) \
VL_SEL_IWII(0, n * w, 0, 0, lwp, i * w, w)
@ -21,7 +34,7 @@ double sc_time_stamp() {
Simulator::Simulator() {
// force random values for unitialized signals
Verilated::randReset(2);
Verilated::randReset(VERILATOR_RESET_VALUE);
Verilated::randSeed(50);
// Turn off assertion before reset
@ -56,19 +69,19 @@ Simulator::~Simulator() {
void Simulator::attach_ram(RAM* ram) {
ram_ = ram;
dram_rsp_vec_.clear();
mem_rsp_vec_.clear();
}
void Simulator::reset() {
print_bufs_.clear();
dram_rsp_vec_.clear();
mem_rsp_vec_.clear();
dram_rsp_active_ = false;
mem_rsp_active_ = false;
csr_req_active_ = false;
csr_rsp_value_ = nullptr;
vortex_->dram_rsp_valid = 0;
vortex_->dram_req_ready = 0;
vortex_->mem_rsp_valid = 0;
vortex_->mem_req_ready = 0;
//vortex_->io_req_ready = 0;
//vortex_->io_rsp_valid = 0;
vortex_->csr_req_valid = 0;
@ -94,13 +107,13 @@ void Simulator::step() {
vortex_->clk = 0;
this->eval();
dram_rsp_ready_ = vortex_->dram_rsp_ready;
mem_rsp_ready_ = vortex_->mem_rsp_ready;
csr_req_ready_ = vortex_->csr_req_ready;
vortex_->clk = 1;
this->eval();
this->eval_dram_bus();
this->eval_mem_bus();
this->eval_io_bus();
this->eval_csr_bus();
@ -117,83 +130,83 @@ void Simulator::eval() {
++timestamp;
}
void Simulator::eval_dram_bus() {
void Simulator::eval_mem_bus() {
if (ram_ == nullptr) {
vortex_->dram_req_ready = 0;
vortex_->mem_req_ready = 0;
return;
}
// update DRAM responses schedule
for (auto& rsp : dram_rsp_vec_) {
// update memory responses schedule
for (auto& rsp : mem_rsp_vec_) {
if (rsp.cycles_left > 0)
rsp.cycles_left -= 1;
}
// schedule DRAM responses in FIFO order
std::list<dram_req_t>::iterator dram_rsp_it(dram_rsp_vec_.end());
if (!dram_rsp_vec_.empty()
&& (0 == dram_rsp_vec_.begin()->cycles_left)) {
dram_rsp_it = dram_rsp_vec_.begin();
// schedule memory responses in FIFO order
std::list<mem_req_t>::iterator mem_rsp_it(mem_rsp_vec_.end());
if (!mem_rsp_vec_.empty()
&& (0 == mem_rsp_vec_.begin()->cycles_left)) {
mem_rsp_it = mem_rsp_vec_.begin();
}
// send DRAM response
if (dram_rsp_active_
&& vortex_->dram_rsp_valid && dram_rsp_ready_) {
dram_rsp_active_ = false;
// send memory response
if (mem_rsp_active_
&& vortex_->mem_rsp_valid && mem_rsp_ready_) {
mem_rsp_active_ = false;
}
if (!dram_rsp_active_) {
if (dram_rsp_it != dram_rsp_vec_.end()) {
vortex_->dram_rsp_valid = 1;
memcpy((uint8_t*)vortex_->dram_rsp_data, dram_rsp_it->block.data(), GLOBAL_BLOCK_SIZE);
vortex_->dram_rsp_tag = dram_rsp_it->tag;
dram_rsp_vec_.erase(dram_rsp_it);
dram_rsp_active_ = true;
if (!mem_rsp_active_) {
if (mem_rsp_it != mem_rsp_vec_.end()) {
vortex_->mem_rsp_valid = 1;
memcpy((uint8_t*)vortex_->mem_rsp_data, mem_rsp_it->block.data(), MEM_BLOCK_SIZE);
vortex_->mem_rsp_tag = mem_rsp_it->tag;
mem_rsp_vec_.erase(mem_rsp_it);
mem_rsp_active_ = true;
} else {
vortex_->dram_rsp_valid = 0;
vortex_->mem_rsp_valid = 0;
}
}
// handle DRAM stalls
bool dram_stalled = false;
#ifdef ENABLE_DRAM_STALLS
if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) {
dram_stalled = true;
// handle memory stalls
bool mem_stalled = false;
#ifdef ENABLE_MEM_STALLS
if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) {
mem_stalled = true;
} else
if (dram_rsp_vec_.size() >= DRAM_RQ_SIZE) {
dram_stalled = true;
if (mem_rsp_vec_.size() >= MEM_RQ_SIZE) {
mem_stalled = true;
}
#endif
// process DRAM requests
if (!dram_stalled) {
if (vortex_->dram_req_valid) {
if (vortex_->dram_req_rw) {
uint64_t byteen = vortex_->dram_req_byteen;
unsigned base_addr = (vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE);
uint8_t* data = (uint8_t*)(vortex_->dram_req_data);
for (int i = 0; i < GLOBAL_BLOCK_SIZE; i++) {
// process memory requests
if (!mem_stalled) {
if (vortex_->mem_req_valid) {
if (vortex_->mem_req_rw) {
uint64_t byteen = vortex_->mem_req_byteen;
unsigned base_addr = (vortex_->mem_req_addr * MEM_BLOCK_SIZE);
uint8_t* data = (uint8_t*)(vortex_->mem_req_data);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
(*ram_)[base_addr + i] = data[i];
}
}
} else {
dram_req_t dram_req;
dram_req.tag = vortex_->dram_req_tag;
dram_req.addr = vortex_->dram_req_addr;
ram_->read(vortex_->dram_req_addr * GLOBAL_BLOCK_SIZE, GLOBAL_BLOCK_SIZE, dram_req.block.data());
dram_req.cycles_left = DRAM_LATENCY;
for (auto& rsp : dram_rsp_vec_) {
if (dram_req.addr == rsp.addr) {
dram_req.cycles_left = rsp.cycles_left;
mem_req_t mem_req;
mem_req.tag = vortex_->mem_req_tag;
mem_req.addr = vortex_->mem_req_addr;
ram_->read(vortex_->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.block.data());
mem_req.cycles_left = MEM_LATENCY;
for (auto& rsp : mem_rsp_vec_) {
if (mem_req.addr == rsp.addr) {
mem_req.cycles_left = rsp.cycles_left;
break;
}
}
dram_rsp_vec_.emplace_back(dram_req);
mem_rsp_vec_.emplace_back(mem_req);
}
}
}
vortex_->dram_req_ready = !dram_stalled;
vortex_->mem_req_ready = !mem_stalled;
}
void Simulator::eval_io_bus() {

View file

@ -48,23 +48,23 @@ private:
typedef struct {
int cycles_left;
std::array<uint8_t, GLOBAL_BLOCK_SIZE> block;
std::array<uint8_t, MEM_BLOCK_SIZE> block;
uint32_t addr;
uint32_t tag;
} dram_req_t;
} mem_req_t;
std::unordered_map<int, std::stringstream> print_bufs_;
void eval();
void eval_dram_bus();
void eval_mem_bus();
void eval_io_bus();
void eval_csr_bus();
std::list<dram_req_t> dram_rsp_vec_;
bool dram_rsp_active_;
std::list<mem_req_t> mem_rsp_vec_;
bool mem_rsp_active_;
bool dram_rsp_ready_;
bool mem_rsp_ready_;
bool csr_req_ready_;
bool csr_req_active_;
uint32_t* csr_rsp_value_;

View file

@ -1,32 +1,114 @@
ASE_BUILD_DIR=build_ase
FPGA_BUILD_DIR=build_fpga
DEVICE_FAMILY ?= arria10
ASE_BUILD_DIR ?= build_ase_$(DEVICE_FAMILY)
FPGA_BUILD_DIR ?= build_fpga_$(DEVICE_FAMILY)
RTL_DIR=../../rtl
ifeq (, $(shell which qsub-synth))
ifeq ($(shell which qsub-synth),)
RUN_SYNTH=$(OPAE_PLATFORM_ROOT)/bin/run.sh > build.log 2>&1 &
else
RUN_SYNTH=qsub-synth
endif
# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG4 := -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG8 := -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG16 := -DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG32 := -DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)
RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu
CFLAGS += $(RTL_INCLUDE)
# Debugigng
ifdef DEBUG
CFLAGS += $(DBG_FLAGS)
else
CFLAGS += -DNDEBUG
endif
# Enable scope analyzer
ifdef SCOPE
CFLAGS += -DSCOPE
endif
# Enable perf counters
ifdef PERF
CFLAGS += -DPERF_ENABLE
endif
all: ase-1c
gen_sources_a10:
./gen_sources.sh arria10 > sources.txt
$(ASE_BUILD_DIR)_1c/Makefile:
afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_1c
gen_sources_s10:
./gen_sources.sh stratix10 > sources.txt
$(ASE_BUILD_DIR)_2c/Makefile:
afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_2c
ase-1c: gen_sources_a10 setup-ase-1c
make -C $(ASE_BUILD_DIR)_1c
cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_1c/work
$(ASE_BUILD_DIR)_4c/Makefile:
afu_sim_setup -s setup.cfg $(ASE_BUILD_DIR)_4c
ase-2c: gen_sources_a10 setup-ase-2c
make -C $(ASE_BUILD_DIR)_2c
cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_2c/work
$(FPGA_BUILD_DIR)_1c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_1c
ase-4c: gen_sources_a10 setup-ase-4c
make -C $(ASE_BUILD_DIR)_4c
cp $(RTL_DIR)/fp_cores/altera/arria10/*.hex $(ASE_BUILD_DIR)_4c/work
$(FPGA_BUILD_DIR)_2c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_2c
$(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_4c
$(FPGA_BUILD_DIR)_8c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_8c
$(FPGA_BUILD_DIR)_16c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_16c
$(FPGA_BUILD_DIR)_32c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_32c
$(FPGA_BUILD_DIR)_64c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_64c
gen-sources-1c:
./gen_sources.sh $(CFLAGS) $(CONFIG1) > sources.txt
gen-sources-2c:
./gen_sources.sh $(CFLAGS) $(CONFIG2) > sources.txt
gen-sources-4c:
./gen_sources.sh $(CFLAGS) $(CONFIG4) > sources.txt
gen-sources-8c:
./gen_sources.sh $(CFLAGS) $(CONFIG8) > sources.txt
gen-sources-16c:
./gen_sources.sh $(CFLAGS) $(CONFIG16) > sources.txt
gen-sources-32c:
./gen_sources.sh $(CFLAGS) $(CONFIG32) > sources.txt
gen-sources-64c:
./gen_sources.sh $(CFLAGS) $(CONFIG64) > sources.txt
# setup
setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile
@ -34,36 +116,6 @@ setup-ase-2c: $(ASE_BUILD_DIR)_2c/Makefile
setup-ase-4c: $(ASE_BUILD_DIR)_4c/Makefile
$(ASE_BUILD_DIR)_1c/Makefile:
afu_sim_setup -s sources_1c.txt $(ASE_BUILD_DIR)_1c
$(ASE_BUILD_DIR)_2c/Makefile:
afu_sim_setup -s sources_2c.txt $(ASE_BUILD_DIR)_2c
$(ASE_BUILD_DIR)_4c/Makefile:
afu_sim_setup -s sources_4c.txt $(ASE_BUILD_DIR)_4c
fpga-1c: gen_sources_a10 setup-fpga-1c
cd $(FPGA_BUILD_DIR)_1c && $(RUN_SYNTH)
fpga-2c: gen_sources_a10 setup-fpga-2c
cd $(FPGA_BUILD_DIR)_2c && $(RUN_SYNTH)
fpga-4c: gen_sources_a10 setup-fpga-4c
cd $(FPGA_BUILD_DIR)_4c && $(RUN_SYNTH)
fpga-8c: gen_sources_a10 setup-fpga-8c
cd $(FPGA_BUILD_DIR)_8c && $(RUN_SYNTH)
fpga-16c: gen_sources_a10 setup-fpga-16c
cd $(FPGA_BUILD_DIR)_16c && $(RUN_SYNTH)
fpga-32c: gen_sources_s10 setup-fpga-32c
cd $(FPGA_BUILD_DIR)_32c && $(RUN_SYNTH)
fpga-64c: gen_sources_s10 setup-fpga-64c
cd $(FPGA_BUILD_DIR)_64c && $(RUN_SYNTH)
setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf
setup-fpga-2c: $(FPGA_BUILD_DIR)_2c/build/dcp.qpf
@ -78,35 +130,42 @@ setup-fpga-32c: $(FPGA_BUILD_DIR)_32c/build/dcp.qpf
setup-fpga-64c: $(FPGA_BUILD_DIR)_64c/build/dcp.qpf
$(FPGA_BUILD_DIR)_1c/build/dcp.qpf:
afu_synth_setup -s sources_1c.txt $(FPGA_BUILD_DIR)_1c
# build
$(FPGA_BUILD_DIR)_2c/build/dcp.qpf:
afu_synth_setup -s sources_2c.txt $(FPGA_BUILD_DIR)_2c
ase-1c: gen-sources-1c setup-ase-1c
make -C $(ASE_BUILD_DIR)_1c
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_1c/work
$(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
afu_synth_setup -s sources_4c.txt $(FPGA_BUILD_DIR)_4c
ase-2c: gen-sources-2c setup-ase-2c
make -C $(ASE_BUILD_DIR)_2c
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_2c/work
$(FPGA_BUILD_DIR)_8c/build/dcp.qpf:
afu_synth_setup -s sources_8c.txt $(FPGA_BUILD_DIR)_8c
ase-4c: gen-sources-4c setup-ase-4c
make -C $(ASE_BUILD_DIR)_4c
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(ASE_BUILD_DIR)_4c/work
$(FPGA_BUILD_DIR)_16c/build/dcp.qpf:
afu_synth_setup -s sources_16c.txt $(FPGA_BUILD_DIR)_16c
fpga-1c: gen-sources-1c setup-fpga-1c
cd $(FPGA_BUILD_DIR)_1c && $(RUN_SYNTH)
$(FPGA_BUILD_DIR)_32c/build/dcp.qpf:
afu_synth_setup -s sources_32c.txt $(FPGA_BUILD_DIR)_32c
fpga-2c: gen-sources-2c setup-fpga-2c
cd $(FPGA_BUILD_DIR)_2c && $(RUN_SYNTH)
$(FPGA_BUILD_DIR)_64c/build/dcp.qpf:
afu_synth_setup -s sources_64c.txt $(FPGA_BUILD_DIR)_64c
fpga-4c: gen-sources-4c setup-fpga-4c
cd $(FPGA_BUILD_DIR)_4c && $(RUN_SYNTH)
run-ase-1c:
cd $(ASE_BUILD_DIR)_1c && make sim
fpga-8c: gen-sources-8c setup-fpga-8c
cd $(FPGA_BUILD_DIR)_8c && $(RUN_SYNTH)
run-ase-2c:
cd $(ASE_BUILD_DIR)_2c && make sim
fpga-16c: gen-sources-16c setup-fpga-16c
cd $(FPGA_BUILD_DIR)_16c && $(RUN_SYNTH)
run-ase-4c:
cd $(ASE_BUILD_DIR)_4c && make sim
fpga-32c: gen-sources-32c setup-fpga-32c
cd $(FPGA_BUILD_DIR)_32c && $(RUN_SYNTH)
fpga-64c: gen-sources-64c setup-fpga-64c
cd $(FPGA_BUILD_DIR)_64c && $(RUN_SYNTH)
# cleanup
clean-ase-1c:
rm -rf $(ASE_BUILD_DIR)_1c sources.txt

View file

@ -44,6 +44,9 @@ fpgaconf vortex_afu.gbs
# If this says Multiple ports. Then use --bus with fpgaconf. #bus info can be found by fpgainfo port
fpgaconf --bus 0xaf vortex_afu.gbs
# get portid
fpgainfo port
# Running the Test case
cd /driver/tests/basic
make run-fpga
@ -60,11 +63,13 @@ qsub-sim
make ase
# tests
./run_ase.sh build_ase_1c ../../../driver/tests/basic/basic -n16
./run_ase.sh build_ase_1c ../../../driver/tests/demo/demo -n16
./run_ase.sh build_ase_1c ../../../driver/tests/dogfood/dogfood -n16
./run_ase.sh build_ase_1c ../../../benchmarks/opencl/vecadd/vecadd
./run_ase.sh build_ase_1c ../../../benchmarks/opencl/sgemm/sgemm -n4
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n1 -t0
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n1 -t1
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/basic/basic -n16
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/demo/demo -n16
./run_ase.sh build_ase_arria10_1c ../../../driver/tests/dogfood/dogfood -n16
./run_ase.sh build_ase_arria10_1c ../../../benchmarks/opencl/vecadd/vecadd
./run_ase.sh build_ase_arria10_1c ../../../benchmarks/opencl/sgemm/sgemm -n4
# modify "vsim_run.tcl" to dump VCD trace
vcd file trace.vcd
@ -75,17 +80,10 @@ run -all
tar -zcvf output_files_1c.tar.gz `find ./build_fpga_1c -type f \( -iname \*.rpt -o -iname \*.txt -o -iname \*summary -o -iname \*.log \)`
# compress VCD trace
tar -zcvf vortex.vcd.tar.gz ./build_ase_1c/work/vortex.vcd
tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd
tar -zcvf trace.fst.tar.gz trace.fst run.log
tar -zcvf run.log.tar.gz run.log
tar -cvjf vortex.vcd.tar.bz2 build_ase_1c/work/vortex.vcd
tar -zcvf vortex.vcd.tar.gz build_ase_1c/work/vortex.vcd
tar -zcvf run.log.tar.gz build_ase_1c/work/run.log
tar -zcvf vx_scope.vcd.tar.gz vx_scope.vcd
tar -cvjf vx_scope.vcd.tar.bz2 vx_scope.vcd
tar -cvjf trace.fst.tar.bz2 trace.fst run.log
tar -cvjf trace.vcd.tar.bz2 trace.vcd run.log
tar -cvjf trace.vcd.tar.bz2 build_ase_arria10_1c/work/run.log build_ase_arria10_1c/work/trace.vcd
# decompress VCD trace
tar -zxvf vortex.vcd.tar.gz
@ -95,15 +93,4 @@ tar -xvf vortex.vcd.tar.bz2
lsof +D build_ase_1c
# quick off synthesis
make -C unittest clean && make -C unittest > unittest/build.log 2>&1 &
make -C pipeline clean && make -C pipeline > pipeline/build.log 2>&1 &
make -C cache clean && make -C cache > cache/build.log 2>&1 &
make -C core clean && make -C core > core/build.log 2>&1 &
make -C vortex clean && make -C vortex > vortex/build.log 2>&1 &
make -C top1 clean && make -C top1 > top1/build.log 2>&1 &
make -C top2 clean && make -C top2 > top2/build.log 2>&1 &
make -C top4 clean && make -C top4 > top4/build.log 2>&1 &
make -C top8 clean && make -C top8 > top8/build.log 2>&1 &
make -C top16 clean && make -C top16 > top16/build.log 2>&1 &
make -C top32 clean && make -C top32 > top32/build.log 2>&1 &
make -C top64 clean && make -C top64 > top64/build.log 2>&1 &
make core

View file

@ -1,39 +1,46 @@
#!/bin/bash
rtl_dir="../../rtl"
exclude_list="VX_fpu_fpnew.v"
file_list=""
macros=()
includes=()
add_dirs()
{
for dir in $*; do
echo "+incdir+$dir"
for file in $(find $dir -maxdepth 1 -name '*.v' -o -name '*.sv' -type f); do
exclude=0
for fe in $exclude_list; do
if [[ $file =~ $fe ]]; then
exclude=1
fi
done
if [[ $exclude == 0 ]]; then
file_list="$file_list $file"
# parse command arguments
while getopts D:I:h flag
do
case "${flag}" in
D) macros+=( ${OPTARG} );;
I) includes+=( ${OPTARG} );;
h) echo "Usage: [-D macro] [-I include] [-h help]"
exit 0
;;
\?)
echo "Invalid option: -$OPTARG" 1>&2
exit 1
;;
esac
done
# dump macros
for value in ${macros[@]}; do
echo "+define+$value"
done
# dump include directories
for dir in ${includes[@]}; do
echo "+incdir+$dir"
done
# dump source files
for dir in ${includes[@]}; do
for file in $(find $dir -maxdepth 1 -name '*.v' -o -name '*.sv' -type f); do
exclude=0
for fe in $exclude_list; do
if [[ $file =~ $fe ]]; then
exclude=1
fi
done
if [[ $exclude == 0 ]]; then
echo $file
fi
done
}
add_files()
{
for file in $*; do
file_list="$file_list $file"
done
}
add_dirs $rtl_dir/fp_cores/altera/$1
add_dirs $rtl_dir/libs $rtl_dir/interfaces $rtl_dir/fp_cores $rtl_dir/cache $rtl_dir/tex_unit $rtl_dir $rtl_dir/afu
# dump file list
for file in $file_list; do
echo $file
done

View file

@ -1,8 +1,5 @@
+define+NUM_CORES=2
+define+SYNTHESIS
+define+QUARTUS
#+define+PERF_ENABLE
vortex_afu.json
QI:vortex_afu.qsf

View file

@ -1,12 +0,0 @@
+define+NUM_CORES=4
+define+NUM_CLUSTERS=4
#+define+L3_ENABLE=1
+define+SYNTHESIS
+define+QUARTUS
#+define+PERF_ENABLE
vortex_afu16.json
QI:vortex_afu.qsf
C:sources.txt

View file

@ -1,24 +0,0 @@
+define+NUM_CORES=1
+define+SYNTHESIS
+define+QUARTUS
#+define+SCOPE
#+define+PERF_ENABLE
#+define+DBG_PRINT_CORE_ICACHE
#+define+DBG_PRINT_CORE_DCACHE
#+define+DBG_PRINT_CACHE_BANK
#+define+DBG_PRINT_CACHE_MSHR
#+define+DBG_PRINT_CACHE_TAG
#+define+DBG_PRINT_CACHE_DATA
#+define+DBG_PRINT_DRAM
#+define+DBG_PRINT_PIPELINE
#+define+DBG_PRINT_OPAE
#+define+DBG_PRINT_AVS
#+define+DBG_PRINT_SCOPE
#+define+DBG_CACHE_REQ_INFO
vortex_afu.json
QI:vortex_afu.qsf
C:sources.txt

View file

@ -1,14 +0,0 @@
+define+NUM_CORES=8
+define+NUM_CLUSTERS=4
#+define+L3_ENABLE=1
+define+GLOBAL_BLOCK_SIZE=16
+define+SYNTHESIS
+define+QUARTUS
#+define+PERF_ENABLE
vortex_afu.json
QI:vortex_afu.qsf
C:sources.txt

View file

@ -1,10 +0,0 @@
+define+NUM_CORES=4
+define+SYNTHESIS
+define+QUARTUS
#+define+PERF_ENABLE
vortex_afu.json
QI:vortex_afu.qsf
C:sources.txt

View file

@ -1,14 +0,0 @@
+define+NUM_CORES=8
+define+NUM_CLUSTERS=8
#+define+L3_ENABLE=1
+define+GLOBAL_BLOCK_SIZE=16
+define+SYNTHESIS
+define+QUARTUS
#+define+PERF_ENABLE
vortex_afu.json
QI:vortex_afu.qsf
C:sources.txt

View file

@ -1,12 +0,0 @@
+define+NUM_CORES=4
+define+NUM_CLUSTERS=2
#+define+L3_ENABLE=1
+define+SYNTHESIS
+define+QUARTUS
#+define+PERF_ENABLE
vortex_afu8.json
QI:vortex_afu.qsf
C:sources.txt

Some files were not shown because too many files have changed in this diff Show more