mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Merge branch 'vortexgpgpu-master'
Merged updated codebase with master branch.
This commit is contained in:
commit
b8b3405efe
215 changed files with 10742 additions and 8715 deletions
14
.gitmodules
vendored
14
.gitmodules
vendored
|
@ -1,6 +1,12 @@
|
|||
[submodule "hw/rtl/fp_cores/fpnew"]
|
||||
path = hw/rtl/fp_cores/fpnew
|
||||
[submodule "third_party/fpnew"]
|
||||
path = third_party/fpnew
|
||||
url = https://github.com/pulp-platform/fpnew.git
|
||||
[submodule "sim/common/softfloat"]
|
||||
path = sim/common/softfloat
|
||||
[submodule "third_party/softfloat"]
|
||||
path = third_party/softfloat
|
||||
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
|
||||
[submodule "third_party/cocogfx"]
|
||||
path = third_party/cocogfx
|
||||
url = https://github.com/gtcasl/cocogfx.git
|
||||
[submodule "third_party/ramulator"]
|
||||
path = third_party/ramulator
|
||||
url = https://github.com/CMU-SAFARI/ramulator.git
|
||||
|
|
3
Makefile
3
Makefile
|
@ -1,10 +1,11 @@
|
|||
all:
|
||||
$(MAKE) -C third_party
|
||||
$(MAKE) -C hw
|
||||
$(MAKE) -C sim
|
||||
$(MAKE) -C driver
|
||||
$(MAKE) -C runtime
|
||||
$(MAKE) -C tests
|
||||
|
||||
|
||||
clean:
|
||||
$(MAKE) -C hw clean
|
||||
$(MAKE) -C sim clean
|
||||
|
|
|
@ -124,7 +124,19 @@ CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH
|
|||
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
|
||||
make -C $DRIVER_PATH clean
|
||||
BLACKBOX_CACHE=blackbox.$DRIVER.cache
|
||||
|
||||
if [ -f "$BLACKBOX_CACHE" ]
|
||||
then
|
||||
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
|
||||
fi
|
||||
|
||||
if [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
|
||||
then
|
||||
make -C $DRIVER_PATH clean
|
||||
fi
|
||||
|
||||
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
|
||||
|
||||
status=0
|
||||
|
||||
|
|
|
@ -27,8 +27,11 @@ tex()
|
|||
echo "begin texture tests..."
|
||||
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-isoccer.png -osoccer_result.png -g0"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" --perf
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-itoad.png -otoad_result.png -g1" --perf
|
||||
|
||||
echo "coverage texture done!"
|
||||
}
|
||||
|
@ -40,15 +43,21 @@ echo "begin clustering tests..."
|
|||
# warp/threads configurations
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=demo
|
||||
|
||||
# cores clustering
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=demo --args="-n1"
|
||||
|
||||
# L2/L3
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=demo --args="-n1"
|
||||
|
||||
echo "clustering tests done!"
|
||||
}
|
||||
|
@ -58,7 +67,9 @@ debug()
|
|||
echo "begin debugging tests..."
|
||||
|
||||
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
|
||||
echo "debugging tests done!"
|
||||
|
@ -73,9 +84,13 @@ CONFIGS=-DEXT_M_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_e
|
|||
|
||||
# disabling F extension
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf
|
||||
|
||||
# disable shared memory
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf
|
||||
|
||||
# using Default FPU core
|
||||
FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
|
||||
|
@ -87,33 +102,29 @@ FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
|
|||
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
|
||||
|
||||
# adjust l1 block size to match l2
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16 -DL1_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
|
||||
CONFIGS="-DL1_BLOCK_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
|
||||
|
||||
# test cache banking
|
||||
CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
|
||||
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
|
||||
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=io_addr
|
||||
|
||||
# test cache multi-porting
|
||||
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
|
||||
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1"
|
||||
CONFIGS="-DL2_NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr
|
||||
CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr
|
||||
CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=io_addr
|
||||
|
||||
# test 128-bit MEM block
|
||||
CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test 128-bit MEM and DRAM block
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16 -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
# test single-bank DRAM
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test 27-bit DRAM address
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test 128-bit DRAM block
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=128 -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=28 -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
# test long memory latency
|
||||
CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
|
||||
echo "configuration tests done!"
|
||||
}
|
||||
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
#include <fpga.h>
|
||||
#endif
|
||||
|
||||
#include "vx_utils.h"
|
||||
#include <vortex.h>
|
||||
#include <VX_config.h>
|
||||
#include "vortex_afu.h"
|
||||
|
@ -52,7 +53,7 @@
|
|||
|
||||
typedef struct vx_device_ {
|
||||
fpga_handle fpga;
|
||||
size_t mem_allocation;
|
||||
uint64_t mem_allocation;
|
||||
unsigned version;
|
||||
unsigned num_cores;
|
||||
unsigned num_warps;
|
||||
|
@ -64,19 +65,9 @@ typedef struct vx_buffer_ {
|
|||
void* host_ptr;
|
||||
uint64_t io_addr;
|
||||
vx_device_h hdevice;
|
||||
size_t size;
|
||||
uint64_t size;
|
||||
} vx_buffer_t;
|
||||
|
||||
inline size_t align_size(size_t size, size_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return (size + alignment - 1) & ~(alignment - 1);
|
||||
}
|
||||
|
||||
inline bool is_aligned(size_t addr, size_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return 0 == (addr & (alignment - 1));
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
|
@ -107,7 +98,7 @@ AutoPerfDump gAutoPerfDump;
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
|
@ -279,7 +270,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
|
||||
extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_maddr
|
||||
|| 0 >= size)
|
||||
|
@ -288,7 +279,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr)
|
|||
vx_device_t *device = ((vx_device_t*)hdevice);
|
||||
|
||||
size_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
size_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
if (device->mem_allocation + asize > dev_mem_size)
|
||||
return -1;
|
||||
|
@ -299,7 +290,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr)
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
|
||||
extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
fpga_result res;
|
||||
void* host_ptr;
|
||||
uint64_t wsid;
|
||||
|
@ -313,7 +304,7 @@ extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hb
|
|||
|
||||
vx_device_t *device = ((vx_device_t*)hdevice);
|
||||
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
size_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
res = fpgaPrepareBuffer(device->fpga, asize, &host_ptr, &wsid, 0);
|
||||
if (FPGA_OK != res) {
|
||||
|
@ -367,7 +358,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
|
@ -386,7 +377,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
|
|||
#endif
|
||||
|
||||
// to milliseconds
|
||||
long long sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);
|
||||
uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);
|
||||
|
||||
for (;;) {
|
||||
uint64_t status;
|
||||
|
@ -430,7 +421,7 @@ extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
@ -438,8 +429,8 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si
|
|||
vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer);
|
||||
vx_device_t *device = ((vx_device_t*)buffer->hdevice);
|
||||
|
||||
size_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
uint64_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
// check alignment
|
||||
if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE))
|
||||
|
@ -454,7 +445,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si
|
|||
return -1;
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(buffer->hdevice, -1) != 0)
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
|
||||
|
@ -465,13 +456,13 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si
|
|||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE));
|
||||
|
||||
// Wait for the write operation to finish
|
||||
if (vx_ready_wait(buffer->hdevice, -1) != 0)
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
@ -479,8 +470,8 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size,
|
|||
vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer);
|
||||
vx_device_t *device = ((vx_device_t*)buffer->hdevice);
|
||||
|
||||
size_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
uint64_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
// check alignment
|
||||
if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE))
|
||||
|
@ -495,7 +486,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size,
|
|||
return -1;
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(buffer->hdevice, -1) != 0)
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
|
||||
|
@ -506,7 +497,7 @@ extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size,
|
|||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ));
|
||||
|
||||
// Wait for the write operation to finish
|
||||
if (vx_ready_wait(buffer->hdevice, -1) != 0)
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
|
@ -519,7 +510,7 @@ extern int vx_start(vx_device_h hdevice) {
|
|||
vx_device_t *device = ((vx_device_t*)hdevice);
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(hdevice, -1) != 0)
|
||||
if (vx_ready_wait(hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
// start execution
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef USE_VLSIM
|
||||
#include <fpga.h>
|
||||
|
|
|
@ -1,17 +1,29 @@
|
|||
#include "vx_utils.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <cstring>
|
||||
#include <vortex.h>
|
||||
#include <VX_config.h>
|
||||
#include <assert.h>
|
||||
|
||||
extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size) {
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return (size + alignment - 1) & ~(alignment - 1);
|
||||
}
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return 0 == (addr & (alignment - 1));
|
||||
}
|
||||
|
||||
extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size) {
|
||||
int err = 0;
|
||||
|
||||
if (NULL == content || 0 == size)
|
||||
return -1;
|
||||
|
||||
uint32_t buffer_transfer_size = 65536;
|
||||
unsigned kernel_base_addr;
|
||||
uint64_t kernel_base_addr;
|
||||
err = vx_dev_caps(device, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr);
|
||||
if (err != 0)
|
||||
return -1;
|
||||
|
@ -29,9 +41,9 @@ extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_
|
|||
// upload content
|
||||
//
|
||||
|
||||
size_t offset = 0;
|
||||
uint64_t offset = 0;
|
||||
while (offset < size) {
|
||||
auto chunk_size = std::min<size_t>(buffer_transfer_size, size - offset);
|
||||
auto chunk_size = std::min<uint64_t>(buffer_transfer_size, size - offset);
|
||||
std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size);
|
||||
|
||||
/*printf("*** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset);
|
||||
|
@ -102,11 +114,13 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t csr_stalls = 0;
|
||||
uint64_t alu_stalls = 0;
|
||||
uint64_t gpu_stalls = 0;
|
||||
// PERF: decode
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
uint64_t branches = 0;
|
||||
// PERF: Icache
|
||||
uint64_t icache_reads = 0;
|
||||
uint64_t icache_read_misses = 0;
|
||||
uint64_t icache_pipe_stalls = 0;
|
||||
uint64_t icache_rsp_stalls = 0;
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads = 0;
|
||||
uint64_t dcache_writes = 0;
|
||||
|
@ -114,20 +128,22 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t dcache_write_misses = 0;
|
||||
uint64_t dcache_bank_stalls = 0;
|
||||
uint64_t dcache_mshr_stalls = 0;
|
||||
uint64_t dcache_pipe_stalls = 0;
|
||||
uint64_t dcache_rsp_stalls = 0;
|
||||
// PERF: SMEM
|
||||
// PERF: shared memory
|
||||
uint64_t smem_reads = 0;
|
||||
uint64_t smem_writes = 0;
|
||||
uint64_t smem_bank_stalls = 0;
|
||||
// PERF: memory
|
||||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_stalls = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// PERF: texunit
|
||||
uint64_t tex_mem_reads = 0;
|
||||
uint64_t tex_mem_lat = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
unsigned num_cores;
|
||||
uint64_t num_cores;
|
||||
ret = vx_dev_caps(device, VX_CAPS_MAX_CORES, &num_cores);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
@ -184,6 +200,20 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core);
|
||||
gpu_stalls += gpu_stalls_per_core;
|
||||
|
||||
// PERF: decode
|
||||
// loads
|
||||
uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
// stores
|
||||
uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
// branches
|
||||
uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: branches=%ld\n", core_id, branches_per_core);
|
||||
branches += branches_per_core;
|
||||
|
||||
// PERF: Icache
|
||||
// total reads
|
||||
uint64_t icache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS);
|
||||
|
@ -192,16 +222,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
// read misses
|
||||
uint64_t icache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R);
|
||||
int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio);
|
||||
icache_read_misses += icache_miss_r_per_core;
|
||||
// pipeline stalls
|
||||
uint64_t icache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_PIPE_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core);
|
||||
icache_pipe_stalls += icache_pipe_st_per_core;
|
||||
// response stalls
|
||||
uint64_t icache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_CRSP_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core);
|
||||
icache_rsp_stalls += icache_crsp_st_per_core;
|
||||
|
||||
// PERF: Dcache
|
||||
// total reads
|
||||
|
@ -231,14 +253,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t dcache_mshr_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core);
|
||||
dcache_mshr_stalls += dcache_mshr_st_per_core;
|
||||
// pipeline stalls
|
||||
uint64_t dcache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_PIPE_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core);
|
||||
dcache_pipe_stalls += dcache_pipe_st_per_core;
|
||||
// response stalls
|
||||
uint64_t dcache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_CRSP_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core);
|
||||
dcache_rsp_stalls += dcache_crsp_st_per_core;
|
||||
|
||||
// PERF: SMEM
|
||||
// total reads
|
||||
|
@ -258,17 +272,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
// PERF: memory
|
||||
uint64_t mem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_READS);
|
||||
uint64_t mem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES);
|
||||
uint64_t mem_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_ST);
|
||||
uint64_t mem_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT);
|
||||
int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
mem_reads += mem_reads_per_core;
|
||||
mem_writes += mem_writes_per_core;
|
||||
mem_stalls += mem_stalls_per_core;
|
||||
mem_lat += mem_lat_per_core;
|
||||
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// total reads
|
||||
uint64_t tex_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_READS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory reads=%ld\n", core_id, tex_reads_per_core);
|
||||
tex_mem_reads += tex_reads_per_core;
|
||||
|
||||
// read latency
|
||||
uint64_t tex_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_LAT);
|
||||
int tex_avg_lat = (int)(double(tex_lat_per_core) / double(tex_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory latency=%d cycles\n", core_id, tex_avg_lat);
|
||||
tex_mem_lat += tex_lat_per_core;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -281,7 +304,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
||||
int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
|
||||
|
@ -290,24 +312,27 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls);
|
||||
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
|
||||
fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
fprintf(stream, "PERF: branches=%ld\n", branches);
|
||||
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
|
||||
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls);
|
||||
fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls);
|
||||
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
|
||||
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
|
||||
fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
|
||||
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
|
||||
fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls);
|
||||
fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls);
|
||||
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
|
||||
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
|
||||
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization);
|
||||
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads));
|
||||
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
|
||||
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// release allocated resources
|
||||
|
|
11
driver/common/vx_utils.h
Normal file
11
driver/common/vx_utils.h
Normal file
|
@ -0,0 +1,11 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment);
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment);
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR 0x00000000
|
||||
#define LOCAL_MEM_SIZE 4294967296 // 4 GB
|
|
@ -2,6 +2,7 @@
|
|||
#define __VX_DRIVER_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -22,9 +23,7 @@ typedef void* vx_buffer_h;
|
|||
#define VX_CAPS_ALLOC_BASE_ADDR 0x6
|
||||
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR 0x00000000
|
||||
#define LOCAL_MEM_SIZE 0xffffffff
|
||||
#define MAX_TIMEOUT (60*60*1000) // 1hr
|
||||
|
||||
// open the device and connect to it
|
||||
int vx_dev_open(vx_device_h* hdevice);
|
||||
|
@ -33,10 +32,10 @@ int vx_dev_open(vx_device_h* hdevice);
|
|||
int vx_dev_close(vx_device_h hdevice);
|
||||
|
||||
// return device configurations
|
||||
int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value);
|
||||
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
|
||||
|
||||
// Allocate shared buffer with device
|
||||
int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer);
|
||||
int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer);
|
||||
|
||||
// Get host pointer address
|
||||
void* vx_host_ptr(vx_buffer_h hbuffer);
|
||||
|
@ -45,24 +44,24 @@ void* vx_host_ptr(vx_buffer_h hbuffer);
|
|||
int vx_buf_release(vx_buffer_h hbuffer);
|
||||
|
||||
// allocate device memory and return address
|
||||
int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr);
|
||||
int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr);
|
||||
|
||||
// Copy bytes from buffer to device local memory
|
||||
int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset);
|
||||
int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset);
|
||||
|
||||
// Copy bytes from device local memory to buffer
|
||||
int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dst_offset);
|
||||
int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dst_offset);
|
||||
|
||||
// Start device execution
|
||||
int vx_start(vx_device_h hdevice);
|
||||
|
||||
// Wait for device ready with milliseconds timeout
|
||||
int vx_ready_wait(vx_device_h hdevice, long long timeout);
|
||||
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
|
||||
|
||||
////////////////////////////// UTILITY FUNCIONS ///////////////////////////////
|
||||
|
||||
// upload kernel bytes to device
|
||||
int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size);
|
||||
int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size);
|
||||
|
||||
// upload kernel file to device
|
||||
int vx_upload_kernel_file(vx_device_h device, const char* filename);
|
||||
|
|
|
@ -3,9 +3,7 @@ RTLSIM_DIR = ../../sim/rtlsim
|
|||
CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
|
||||
#CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I../include -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common
|
||||
|
||||
LDFLAGS += $(RTLSIM_DIR)/librtlsim.a
|
||||
CXXFLAGS += -I../include -I../common -I../../hw -I$(RTLSIM_DIR) -I$(RTLSIM_DIR)/../common
|
||||
|
||||
# Position independent code
|
||||
CXXFLAGS += -fPIC
|
||||
|
@ -17,6 +15,7 @@ CXXFLAGS += $(CONFIGS)
|
|||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L. -lrtlsim
|
||||
|
||||
SRCS = vortex.cpp ../common/vx_utils.cpp
|
||||
|
||||
|
@ -30,9 +29,9 @@ PROJECT = libvortex.so
|
|||
all: $(PROJECT)
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(MAKE) -C $(RTLSIM_DIR) static
|
||||
DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) ../../driver/rtlsim/librtlsim.so
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
clean:
|
||||
$(MAKE) -C $(RTLSIM_DIR) clean-static
|
||||
DESTDIR=../../driver/rtlsim $(MAKE) -C $(RTLSIM_DIR) clean
|
||||
rm -rf $(PROJECT) *.o
|
|
@ -4,13 +4,17 @@
|
|||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <future>
|
||||
#include <list>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <vx_utils.h>
|
||||
#include <VX_config.h>
|
||||
#include <mem.h>
|
||||
#include <util.h>
|
||||
#include <simulator.h>
|
||||
#include <processor.h>
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -19,10 +23,10 @@ using namespace vortex;
|
|||
class vx_device;
|
||||
class vx_buffer {
|
||||
public:
|
||||
vx_buffer(size_t size, vx_device* device)
|
||||
vx_buffer(uint64_t size, vx_device* device)
|
||||
: size_(size)
|
||||
, device_(device) {
|
||||
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
data_ = malloc(aligned_asize);
|
||||
}
|
||||
|
||||
|
@ -36,7 +40,7 @@ public:
|
|||
return data_;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
uint64_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
|
@ -45,7 +49,7 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
size_t size_;
|
||||
uint64_t size_;
|
||||
vx_device* device_;
|
||||
void* data_;
|
||||
};
|
||||
|
@ -54,9 +58,12 @@ private:
|
|||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device() : ram_((1<<12), (1<<20)) {
|
||||
mem_allocation_ = ALLOC_BASE_ADDR;
|
||||
}
|
||||
vx_device()
|
||||
: ram_(RAM_PAGE_SIZE)
|
||||
, mem_allocation_(ALLOC_BASE_ADDR)
|
||||
{
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
|
@ -64,9 +71,9 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
||||
auto dev_mem_size = LOCAL_MEM_SIZE;
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) {
|
||||
uint64_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (mem_allocation_ + asize > dev_mem_size)
|
||||
return -1;
|
||||
*dev_maddr = mem_allocation_;
|
||||
|
@ -74,9 +81,9 @@ public:
|
|||
return 0;
|
||||
}
|
||||
|
||||
int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > ram_.size())
|
||||
int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src + src_offset));
|
||||
|
@ -92,9 +99,9 @@ public:
|
|||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > ram_.size())
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
|
||||
|
@ -112,26 +119,25 @@ public:
|
|||
}
|
||||
|
||||
int start() {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait(); // ensure prior run completed
|
||||
future_.wait();
|
||||
}
|
||||
simulator_.attach_ram(&ram_);
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
simulator_.reset();
|
||||
while (simulator_.is_busy()) {
|
||||
simulator_.step();
|
||||
}
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wait(long long timeout) {
|
||||
int wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000);
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
auto status = future_.wait_for(wait_time); // wait for 1 sec and check status
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
|
@ -141,9 +147,9 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
size_t mem_allocation_;
|
||||
RAM ram_;
|
||||
Simulator simulator_;
|
||||
Processor processor_;
|
||||
uint64_t mem_allocation_;
|
||||
std::future<void> future_;
|
||||
};
|
||||
|
||||
|
@ -177,7 +183,7 @@ AutoPerfDump gAutoPerfDump;
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
|
@ -198,10 +204,10 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
|||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
*value = 0xffffffff;
|
||||
*value = LOCAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_ALLOC_BASE_ADDR:
|
||||
*value = 0x10000000;
|
||||
*value = ALLOC_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_KERNEL_BASE_ADDR:
|
||||
*value = STARTUP_ADDR;
|
||||
|
@ -244,7 +250,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
|
||||
extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_maddr
|
||||
|| 0 >= size)
|
||||
|
@ -255,7 +261,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr)
|
|||
}
|
||||
|
||||
|
||||
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
|
||||
extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| 0 >= size
|
||||
|| nullptr == hbuffer)
|
||||
|
@ -294,7 +300,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
@ -307,7 +313,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si
|
|||
return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
@ -329,7 +335,7 @@ extern int vx_start(vx_device_h hdevice) {
|
|||
return device->start();
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
|
|
|
@ -1,15 +1,15 @@
|
|||
SIMX_DIR = ../../sim/simX
|
||||
SIMX_DIR = ../../sim/simx
|
||||
|
||||
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
|
||||
#CXXFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
|
||||
CXXFLAGS += -I../include -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common
|
||||
CXXFLAGS += -I../include -I../common -I../../hw -I$(SIMX_DIR) -I$(SIMX_DIR)/../common
|
||||
CXXFLAGS += $(CONFIGS)
|
||||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += $(SIMX_DIR)/libsimX.a
|
||||
LDFLAGS += -L. -lsimx
|
||||
|
||||
SRCS = vortex.cpp ../common/vx_utils.cpp
|
||||
|
||||
|
@ -18,9 +18,9 @@ PROJECT = libvortex.so
|
|||
all: $(PROJECT)
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(MAKE) -C $(SIMX_DIR) static
|
||||
DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) ../../driver/simx/libsimx.so
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
clean:
|
||||
$(MAKE) -C $(SIMX_DIR) clean-static
|
||||
rm -rf $(PROJECT) *.o
|
||||
DESTDIR=../../driver/simx $(MAKE) -C $(SIMX_DIR) clean
|
||||
rm -rf libsimx.so $(PROJECT) *.o
|
|
@ -3,16 +3,21 @@
|
|||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <future>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <core.h>
|
||||
#include <vx_utils.h>
|
||||
|
||||
#include <VX_config.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#define PAGE_SIZE 4096
|
||||
#include <processor.h>
|
||||
#include <archdef.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
|
@ -22,10 +27,10 @@ class vx_device;
|
|||
|
||||
class vx_buffer {
|
||||
public:
|
||||
vx_buffer(size_t size, vx_device* device)
|
||||
vx_buffer(uint64_t size, vx_device* device)
|
||||
: size_(size)
|
||||
, device_(device) {
|
||||
auto aligned_asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
data_ = malloc(aligned_asize);
|
||||
}
|
||||
|
||||
|
@ -39,7 +44,7 @@ public:
|
|||
return data_;
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
uint64_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
|
@ -48,7 +53,7 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
size_t size_;
|
||||
uint64_t size_;
|
||||
vx_device* device_;
|
||||
void* data_;
|
||||
};
|
||||
|
@ -58,33 +63,24 @@ private:
|
|||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: arch_("rv32i", NUM_CORES, NUM_WARPS, NUM_THREADS)
|
||||
, decoder_(arch_)
|
||||
, mmu_(PAGE_SIZE, arch_.wsize(), true)
|
||||
, cores_(arch_.num_cores())
|
||||
, is_done_(false)
|
||||
, is_running_(false)
|
||||
, thread_(__thread_proc__, this)
|
||||
, ram_((1<<12), (1<<20)) {
|
||||
|
||||
mem_allocation_ = ALLOC_BASE_ADDR;
|
||||
mmu_.attach(ram_, 0, 0xffffffff);
|
||||
for (int i = 0; i < arch_.num_cores(); ++i) {
|
||||
cores_[i] = std::make_shared<Core>(arch_, decoder_, mmu_, i);
|
||||
}
|
||||
: arch_("rv32i", NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS)
|
||||
, ram_(RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, mem_allocation_(ALLOC_BASE_ADDR)
|
||||
{
|
||||
// attach memory module
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
mutex_.lock();
|
||||
is_done_ = true;
|
||||
mutex_.unlock();
|
||||
|
||||
thread_.join();
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
|
||||
int alloc_local_mem(size_t size, size_t* dev_maddr) {
|
||||
auto dev_mem_size = LOCAL_MEM_SIZE;
|
||||
auto asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) {
|
||||
uint64_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (mem_allocation_ + asize > dev_mem_size)
|
||||
return -1;
|
||||
*dev_maddr = mem_allocation_;
|
||||
|
@ -92,9 +88,9 @@ public:
|
|||
return 0;
|
||||
}
|
||||
|
||||
int upload(const void* src, size_t dest_addr, size_t size, size_t src_offset) {
|
||||
auto asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > ram_.size())
|
||||
int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.write((const uint8_t*)src + src_offset, dest_addr, asize);
|
||||
|
@ -107,9 +103,9 @@ public:
|
|||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, size_t src_addr, size_t size, size_t dest_offset) {
|
||||
size_t asize = align_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > ram_.size())
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
|
||||
|
@ -123,98 +119,40 @@ public:
|
|||
}
|
||||
|
||||
int start() {
|
||||
|
||||
mutex_.lock();
|
||||
for (int i = 0; i < arch_.num_cores(); ++i) {
|
||||
cores_[i]->clear();
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
is_running_ = true;
|
||||
mutex_.unlock();
|
||||
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wait(long long timeout) {
|
||||
auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000);
|
||||
int wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
mutex_.lock();
|
||||
bool is_running = is_running_;
|
||||
mutex_.unlock();
|
||||
|
||||
if (!is_running || 0 == timeout_sec--)
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int get_csr(int core_id, int addr, unsigned *value) {
|
||||
*value = cores_.at(core_id)->get_csr(addr, 0, 0);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int set_csr(int core_id, int addr, unsigned value) {
|
||||
cores_.at(core_id)->set_csr(addr, value, 0, 0);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
void run() {
|
||||
bool running;
|
||||
do {
|
||||
running = false;
|
||||
for (auto& core : cores_) {
|
||||
core->step();
|
||||
if (core->running())
|
||||
running = true;
|
||||
}
|
||||
} while (running);
|
||||
}
|
||||
|
||||
void thread_proc() {
|
||||
std::cout << "Device ready..." << std::flush << std::endl;
|
||||
|
||||
for (;;) {
|
||||
mutex_.lock();
|
||||
bool is_done = is_done_;
|
||||
bool is_running = is_running_;
|
||||
mutex_.unlock();
|
||||
|
||||
if (is_done)
|
||||
break;
|
||||
|
||||
if (is_running) {
|
||||
std::cout << "Device running..." << std::flush << std::endl;
|
||||
|
||||
this->run();
|
||||
|
||||
mutex_.lock();
|
||||
is_running_ = false;
|
||||
mutex_.unlock();
|
||||
|
||||
std::cout << "Device ready..." << std::flush << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "Device shutdown..." << std::flush << std::endl;
|
||||
}
|
||||
|
||||
static void __thread_proc__(vx_device* device) {
|
||||
device->thread_proc();
|
||||
}
|
||||
|
||||
ArchDef arch_;
|
||||
Decoder decoder_;
|
||||
MemoryUnit mmu_;
|
||||
std::vector<std::shared_ptr<Core>> cores_;
|
||||
bool is_done_;
|
||||
bool is_running_;
|
||||
size_t mem_allocation_;
|
||||
std::thread thread_;
|
||||
RAM ram_;
|
||||
std::mutex mutex_;
|
||||
Processor processor_;
|
||||
uint64_t mem_allocation_;
|
||||
std::future<void> future_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -276,7 +214,7 @@ extern int vx_dev_close(vx_device_h hdevice) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
|
@ -314,7 +252,7 @@ extern int vx_dev_caps(vx_device_h hdevice, unsigned caps_id, unsigned *value) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr) {
|
||||
extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_maddr
|
||||
|| 0 >= size)
|
||||
|
@ -324,7 +262,7 @@ extern int vx_alloc_dev_mem(vx_device_h hdevice, size_t size, size_t* dev_maddr)
|
|||
return device->alloc_local_mem(size, dev_maddr);
|
||||
}
|
||||
|
||||
extern int vx_alloc_shared_mem(vx_device_h hdevice, size_t size, vx_buffer_h* hbuffer) {
|
||||
extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| 0 >= size
|
||||
|| nullptr == hbuffer)
|
||||
|
@ -363,7 +301,7 @@ extern int vx_buf_release(vx_buffer_h hbuffer) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t src_offset) {
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
@ -376,7 +314,7 @@ extern int vx_copy_to_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, si
|
|||
return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, size_t dev_maddr, size_t size, size_t dest_offset) {
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
@ -398,7 +336,7 @@ extern int vx_start(vx_device_h hdevice) {
|
|||
return device->start();
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, long long timeout) {
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
|
|
|
@ -8,15 +8,15 @@ extern int vx_dev_close(vx_device_h /*hdevice*/) {
|
|||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_dev_caps(vx_device_h /*hdevice*/, unsigned /*caps_id*/, unsigned* /*value*/) {
|
||||
extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, size_t /*size*/, size_t* /*dev_maddr*/) {
|
||||
extern int vx_alloc_dev_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, uint64_t* /*dev_maddr*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, size_t /*size*/, vx_buffer_h* /*hbuffer*/) {
|
||||
extern int vx_alloc_shared_mem(vx_device_h /*hdevice*/, uint64_t /*size*/, vx_buffer_h* /*hbuffer*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -28,11 +28,11 @@ extern int vx_buf_release(vx_buffer_h /*hbuffer*/) {
|
|||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*src_offset*/) {
|
||||
extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*src_offset*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, size_t /*dev_maddr*/, size_t /*size*/, size_t /*dest_offset*/) {
|
||||
extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*dest_offset*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -40,6 +40,6 @@ extern int vx_start(vx_device_h /*hdevice*/) {
|
|||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h /*hdevice*/, long long /*timeout*/) {
|
||||
extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) {
|
||||
return -1;
|
||||
}
|
|
@ -9,8 +9,6 @@ CXXFLAGS += -std=c++11 -O2 -DNDEBUG -Wall -Wextra -pedantic -Wfatal-errors
|
|||
|
||||
CXXFLAGS += -I. -I../include -I../../hw -I$(VLSIM_DIR)
|
||||
|
||||
LDFLAGS += $(VLSIM_DIR)/libopae-c-vlsim.a
|
||||
|
||||
# Position independent code
|
||||
CXXFLAGS += -fPIC
|
||||
|
||||
|
@ -21,6 +19,7 @@ CXXFLAGS += $(CONFIGS)
|
|||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L. -lopae-c-vlsim
|
||||
|
||||
SRCS = ../common/opae.cpp ../common/vx_utils.cpp
|
||||
|
||||
|
@ -47,9 +46,9 @@ scope-defs.h: $(SCRIPT_DIR)/scope.json
|
|||
scope: scope-defs.h
|
||||
|
||||
$(PROJECT): $(SRCS) $(SCOPE_H)
|
||||
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static
|
||||
DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) ../../driver/vlsim/libopae-c-vlsim.so
|
||||
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
clean:
|
||||
$(MAKE) -C $(VLSIM_DIR) clean-static
|
||||
rm -rf $(PROJECT) *.o scope-defs.h
|
||||
DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) clean
|
||||
rm -rf libopae-c-vlsim.so $(PROJECT) *.o scope-defs.h
|
|
@ -96,6 +96,7 @@ module VX_alu_unit #(
|
|||
wire alu_ready_in;
|
||||
wire alu_valid_out;
|
||||
wire alu_ready_out;
|
||||
wire [`UUID_BITS-1:0] alu_uuid;
|
||||
wire [`NW_BITS-1:0] alu_wid;
|
||||
wire [`NUM_THREADS-1:0] alu_tmask;
|
||||
wire [31:0] alu_PC;
|
||||
|
@ -112,14 +113,14 @@ module VX_alu_unit #(
|
|||
assign alu_ready_in = alu_ready_out || ~alu_valid_out;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (alu_ready_in),
|
||||
.data_in ({alu_valid_in, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
|
||||
.data_out ({alu_valid_out, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r})
|
||||
.data_in ({alu_valid_in, alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
|
||||
.data_out ({alu_valid_out, alu_uuid, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r})
|
||||
);
|
||||
|
||||
`UNUSED_VAR (br_op_r)
|
||||
|
@ -138,6 +139,7 @@ module VX_alu_unit #(
|
|||
wire mul_ready_in;
|
||||
wire mul_valid_out;
|
||||
wire mul_ready_out;
|
||||
wire [`UUID_BITS-1:0] mul_uuid;
|
||||
wire [`NW_BITS-1:0] mul_wid;
|
||||
wire [`NUM_THREADS-1:0] mul_tmask;
|
||||
wire [31:0] mul_PC;
|
||||
|
@ -153,6 +155,7 @@ module VX_alu_unit #(
|
|||
|
||||
// Inputs
|
||||
.alu_op (mul_op),
|
||||
.uuid_in (alu_req_if.uuid),
|
||||
.wid_in (alu_req_if.wid),
|
||||
.tmask_in (alu_req_if.tmask),
|
||||
.PC_in (alu_req_if.PC),
|
||||
|
@ -163,6 +166,7 @@ module VX_alu_unit #(
|
|||
|
||||
// Outputs
|
||||
.wid_out (mul_wid),
|
||||
.uuid_out (mul_uuid),
|
||||
.tmask_out (mul_tmask),
|
||||
.PC_out (mul_PC),
|
||||
.rd_out (mul_rd),
|
||||
|
@ -184,6 +188,7 @@ module VX_alu_unit #(
|
|||
assign mul_valid_in = alu_req_if.valid && is_mul_op;
|
||||
|
||||
assign alu_commit_if.valid = alu_valid_out || mul_valid_out;
|
||||
assign alu_commit_if.uuid = alu_valid_out ? alu_uuid : mul_uuid;
|
||||
assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid;
|
||||
assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask;
|
||||
assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC;
|
||||
|
@ -201,6 +206,7 @@ module VX_alu_unit #(
|
|||
assign alu_valid_in = alu_req_if.valid;
|
||||
|
||||
assign alu_commit_if.valid = alu_valid_out;
|
||||
assign alu_commit_if.uuid = alu_uuid;
|
||||
assign alu_commit_if.wid = alu_wid;
|
||||
assign alu_commit_if.tmask = alu_tmask;
|
||||
assign alu_commit_if.PC = alu_PC;
|
||||
|
@ -220,8 +226,8 @@ module VX_alu_unit #(
|
|||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (branch_ctl_if.valid) begin
|
||||
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n",
|
||||
$time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest);
|
||||
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h (#%0d)\n",
|
||||
$time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest, alu_uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -40,27 +40,35 @@ module VX_commit #(
|
|||
`endif
|
||||
|| gpu_commit_fire;
|
||||
|
||||
wire [`NUM_THREADS-1:0] commit_tmask;
|
||||
assign commit_tmask = alu_commit_fire ? alu_commit_if.tmask:
|
||||
ld_commit_fire ? ld_commit_if.tmask:
|
||||
st_commit_fire ? st_commit_if.tmask:
|
||||
csr_commit_fire ? csr_commit_if.tmask:
|
||||
`ifdef EXT_F_ENABLE
|
||||
fpu_commit_fire ? fpu_commit_if.tmask:
|
||||
`endif
|
||||
/*gpu_commit_fire ?*/ gpu_commit_if.tmask;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [(6*`NUM_THREADS)-1:0] commit_tmask;
|
||||
`else
|
||||
wire [(5*`NUM_THREADS)-1:0] commit_tmask;
|
||||
`endif
|
||||
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] commit_cnt;
|
||||
`POP_COUNT(commit_cnt, commit_tmask);
|
||||
wire [$clog2($bits(commit_tmask)+1)-1:0] commit_size;
|
||||
|
||||
assign commit_tmask = {
|
||||
{`NUM_THREADS{alu_commit_fire}} & alu_commit_if.tmask,
|
||||
{`NUM_THREADS{ld_commit_fire}} & ld_commit_if.tmask,
|
||||
{`NUM_THREADS{st_commit_fire}} & st_commit_if.tmask,
|
||||
{`NUM_THREADS{csr_commit_fire}} & csr_commit_if.tmask,
|
||||
`ifdef EXT_F_ENABLE
|
||||
{`NUM_THREADS{fpu_commit_fire}} & fpu_commit_if.tmask,
|
||||
`endif
|
||||
{`NUM_THREADS{gpu_commit_fire}} & gpu_commit_if.tmask
|
||||
};
|
||||
|
||||
`POP_COUNT(commit_size, commit_tmask);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + $clog2(`NUM_THREADS+1)),
|
||||
.DATAW (1 + $bits(commit_size)),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({commit_fire, commit_cnt}),
|
||||
.data_in ({commit_fire, commit_size}),
|
||||
.data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size})
|
||||
);
|
||||
|
||||
|
@ -90,32 +98,32 @@ module VX_commit #(
|
|||
if (alu_commit_if.valid && alu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", alu_commit_if.uuid);
|
||||
end
|
||||
if (ld_commit_if.valid && ld_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd);
|
||||
`TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", ld_commit_if.uuid);
|
||||
end
|
||||
if (st_commit_if.valid && st_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd);
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d (#%0d)\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd, st_commit_if.uuid);
|
||||
end
|
||||
if (csr_commit_if.valid && csr_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd);
|
||||
`TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", csr_commit_if.uuid);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", fpu_commit_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (gpu_commit_if.valid && gpu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", gpu_commit_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -1,6 +1,10 @@
|
|||
`ifndef VX_CONFIG
|
||||
`define VX_CONFIG
|
||||
|
||||
`ifndef XLEN
|
||||
`define XLEN 32
|
||||
`endif
|
||||
|
||||
`ifndef NUM_CLUSTERS
|
||||
`define NUM_CLUSTERS 1
|
||||
`endif
|
||||
|
@ -167,48 +171,50 @@
|
|||
`define CSR_MPM_FPU_ST_H 12'hB88
|
||||
`define CSR_MPM_GPU_ST 12'hB09
|
||||
`define CSR_MPM_GPU_ST_H 12'hB89
|
||||
// PERF: decode
|
||||
`define CSR_MPM_LOADS 12'hB0A
|
||||
`define CSR_MPM_LOADS_H 12'hB8A
|
||||
`define CSR_MPM_STORES 12'hB0B
|
||||
`define CSR_MPM_STORES_H 12'hB8B
|
||||
`define CSR_MPM_BRANCHES 12'hB0C
|
||||
`define CSR_MPM_BRANCHES_H 12'hB8C
|
||||
// PERF: icache
|
||||
`define CSR_MPM_ICACHE_READS 12'hB0A // total reads
|
||||
`define CSR_MPM_ICACHE_READS_H 12'hB8A
|
||||
`define CSR_MPM_ICACHE_MISS_R 12'hB0B // total misses
|
||||
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8B
|
||||
`define CSR_MPM_ICACHE_PIPE_ST 12'hB0C // pipeline stalls
|
||||
`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB8C
|
||||
`define CSR_MPM_ICACHE_CRSP_ST 12'hB0D // core response stalls
|
||||
`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8D
|
||||
`define CSR_MPM_ICACHE_READS 12'hB0D // total reads
|
||||
`define CSR_MPM_ICACHE_READS_H 12'hB8D
|
||||
`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses
|
||||
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E
|
||||
// PERF: dcache
|
||||
`define CSR_MPM_DCACHE_READS 12'hB0E // total reads
|
||||
`define CSR_MPM_DCACHE_READS_H 12'hB8E
|
||||
`define CSR_MPM_DCACHE_WRITES 12'hB0F // total writes
|
||||
`define CSR_MPM_DCACHE_WRITES_H 12'hB8F
|
||||
`define CSR_MPM_DCACHE_MISS_R 12'hB10 // read misses
|
||||
`define CSR_MPM_DCACHE_MISS_R_H 12'hB90
|
||||
`define CSR_MPM_DCACHE_MISS_W 12'hB11 // write misses
|
||||
`define CSR_MPM_DCACHE_MISS_W_H 12'hB91
|
||||
`define CSR_MPM_DCACHE_BANK_ST 12'hB12 // bank conflicts stalls
|
||||
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB92
|
||||
`define CSR_MPM_DCACHE_MSHR_ST 12'hB13 // MSHR stalls
|
||||
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB93
|
||||
`define CSR_MPM_DCACHE_PIPE_ST 12'hB14 // pipeline stalls
|
||||
`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB94
|
||||
`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls
|
||||
`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95
|
||||
`define CSR_MPM_DCACHE_READS 12'hB0F // total reads
|
||||
`define CSR_MPM_DCACHE_READS_H 12'hB8F
|
||||
`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes
|
||||
`define CSR_MPM_DCACHE_WRITES_H 12'hB90
|
||||
`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses
|
||||
`define CSR_MPM_DCACHE_MISS_R_H 12'hB91
|
||||
`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses
|
||||
`define CSR_MPM_DCACHE_MISS_W_H 12'hB92
|
||||
`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts
|
||||
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93
|
||||
`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls
|
||||
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94
|
||||
// PERF: smem
|
||||
`define CSR_MPM_SMEM_READS 12'hB16 // total reads
|
||||
`define CSR_MPM_SMEM_READS_H 12'hB96
|
||||
`define CSR_MPM_SMEM_WRITES 12'hB17 // total writes
|
||||
`define CSR_MPM_SMEM_WRITES_H 12'hB97
|
||||
`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB98
|
||||
`define CSR_MPM_SMEM_READS 12'hB15 // total reads
|
||||
`define CSR_MPM_SMEM_READS_H 12'hB95
|
||||
`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes
|
||||
`define CSR_MPM_SMEM_WRITES_H 12'hB96
|
||||
`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB97
|
||||
// PERF: memory
|
||||
`define CSR_MPM_MEM_READS 12'hB19 // memory reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB99
|
||||
`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB9A
|
||||
`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls
|
||||
`define CSR_MPM_MEM_ST_H 12'hB9B
|
||||
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total)
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9C
|
||||
`define CSR_MPM_MEM_READS 12'hB18 // memory reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB98
|
||||
`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define CSR_MPM_MEM_LAT 12'hB1A // memory latency
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9A
|
||||
// PERF: texunit
|
||||
`define CSR_MPM_TEX_READS 12'hB1B // texture accesses
|
||||
`define CSR_MPM_TEX_READS_H 12'hB9B
|
||||
`define CSR_MPM_TEX_LAT 12'hB1C // texture latency
|
||||
`define CSR_MPM_TEX_LAT_H 12'hB9C
|
||||
|
||||
// Machine Information Registers
|
||||
`define CSR_MVENDORID 12'hF11
|
||||
|
@ -232,18 +238,40 @@
|
|||
|
||||
////////// Texture Units //////////////////////////////////////////////////////
|
||||
|
||||
`define NUM_TEX_UNITS 2
|
||||
`define NUM_TEX_UNITS 2
|
||||
`define TEX_SUBPIXEL_BITS 8
|
||||
|
||||
`define CSR_TEX_STATES 7
|
||||
`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES)
|
||||
`define TEX_DIM_BITS 15
|
||||
`define TEX_LOD_MAX `TEX_DIM_BITS
|
||||
`define TEX_LOD_BITS 4
|
||||
|
||||
`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00)
|
||||
`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01)
|
||||
`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02)
|
||||
`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03)
|
||||
`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04)
|
||||
`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05)
|
||||
`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06)
|
||||
`define TEX_FXD_BITS 32
|
||||
`define TEX_FXD_FRAC (`TEX_DIM_BITS+`TEX_SUBPIXEL_BITS)
|
||||
|
||||
`define TEX_STATE_ADDR 0
|
||||
`define TEX_STATE_WIDTH 1
|
||||
`define TEX_STATE_HEIGHT 2
|
||||
`define TEX_STATE_FORMAT 3
|
||||
`define TEX_STATE_FILTER 4
|
||||
`define TEX_STATE_WRAPU 5
|
||||
`define TEX_STATE_WRAPV 6
|
||||
`define TEX_STATE_MIPOFF(lod) (7+(lod))
|
||||
`define NUM_TEX_STATES (`TEX_STATE_MIPOFF(`TEX_LOD_MAX)+1)
|
||||
|
||||
`define CSR_TEX_UNIT 12'hFD0
|
||||
|
||||
`define CSR_TEX_STATE_BEGIN 12'hFD1
|
||||
`define CSR_TEX_ADDR (`CSR_TEX_STATE_BEGIN+`TEX_STATE_ADDR)
|
||||
`define CSR_TEX_WIDTH (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WIDTH)
|
||||
`define CSR_TEX_HEIGHT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_HEIGHT)
|
||||
`define CSR_TEX_FORMAT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FORMAT)
|
||||
`define CSR_TEX_FILTER (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FILTER)
|
||||
`define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU)
|
||||
`define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV)
|
||||
`define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod))
|
||||
`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN+`NUM_TEX_STATES)
|
||||
|
||||
`define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN)
|
||||
|
||||
// Pipeline Queues ////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -262,6 +290,11 @@
|
|||
`define FPUQ_SIZE 8
|
||||
`endif
|
||||
|
||||
// Texture Unit Request Queue
|
||||
`ifndef TEXQ_SIZE
|
||||
`define TEXQ_SIZE (`NUM_WARPS * 2)
|
||||
`endif
|
||||
|
||||
// Icache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Size of cache in bytes
|
||||
|
@ -373,7 +406,7 @@
|
|||
|
||||
// Number of banks
|
||||
`ifndef L2_NUM_BANKS
|
||||
`define L2_NUM_BANKS `MIN(`NUM_CORES, 4)
|
||||
`define L2_NUM_BANKS ((`NUM_CORES < 4) ? `NUM_CORES : 4)
|
||||
`endif
|
||||
|
||||
// Number of ports per bank
|
||||
|
@ -415,7 +448,7 @@
|
|||
|
||||
// Number of banks
|
||||
`ifndef L3_NUM_BANKS
|
||||
`define L3_NUM_BANKS `MIN(`NUM_CLUSTERS, 4)
|
||||
`define L3_NUM_BANKS ((`NUM_CLUSTERS < 4) ? `NUM_CORES : 4)
|
||||
`endif
|
||||
|
||||
// Number of ports per bank
|
||||
|
|
|
@ -7,6 +7,9 @@ module VX_csr_data #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_perf_tex_if.slave perf_tex_if,
|
||||
`endif
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
VX_perf_pipeline_if.slave perf_pipeline_if,
|
||||
`endif
|
||||
|
@ -22,11 +25,13 @@ module VX_csr_data #(
|
|||
`endif
|
||||
|
||||
input wire read_enable,
|
||||
input wire [`UUID_BITS-1:0] read_uuid,
|
||||
input wire[`CSR_ADDR_BITS-1:0] read_addr,
|
||||
input wire[`NW_BITS-1:0] read_wid,
|
||||
output wire[31:0] read_data,
|
||||
|
||||
input wire write_enable,
|
||||
input wire [`UUID_BITS-1:0] write_uuid,
|
||||
input wire[`CSR_ADDR_BITS-1:0] write_addr,
|
||||
input wire[`NW_BITS-1:0] write_wid,
|
||||
input wire[31:0] write_data,
|
||||
|
@ -50,35 +55,41 @@ module VX_csr_data #(
|
|||
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (reset) begin
|
||||
fcsr <= '0;
|
||||
end
|
||||
if (fpu_to_csr_if.write_enable) begin
|
||||
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
|
||||
| fpu_to_csr_if.write_fflags;
|
||||
end
|
||||
`endif
|
||||
if (write_enable) begin
|
||||
case (write_addr)
|
||||
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
|
||||
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
|
||||
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
|
||||
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
default: begin
|
||||
`ASSERT(write_addr >= `CSR_TEX_BEGIN(0)
|
||||
&& write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES),
|
||||
("%t: invalid CSR write address: %0h", $time, write_addr));
|
||||
end
|
||||
endcase
|
||||
end else begin
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_to_csr_if.write_enable) begin
|
||||
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
|
||||
| fpu_to_csr_if.write_fflags;
|
||||
end
|
||||
`endif
|
||||
if (write_enable) begin
|
||||
case (write_addr)
|
||||
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
|
||||
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
|
||||
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
|
||||
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
default: begin
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ASSERT((write_addr == `CSR_TEX_UNIT)
|
||||
|| (write_addr >= `CSR_TEX_STATE_BEGIN
|
||||
&& write_addr < `CSR_TEX_STATE_END),
|
||||
("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
|
||||
`else
|
||||
`ASSERT(~write_enable, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
|
||||
`endif
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -89,6 +100,7 @@ module VX_csr_data #(
|
|||
assign tex_csr_if.write_enable = write_enable;
|
||||
assign tex_csr_if.write_addr = write_addr;
|
||||
assign tex_csr_if.write_data = write_data;
|
||||
assign tex_csr_if.write_uuid = write_uuid;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -147,20 +159,28 @@ module VX_csr_data #(
|
|||
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
|
||||
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`else
|
||||
`CSR_MPM_FPU_ST : read_data_r = '0;
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = '0;
|
||||
`endif
|
||||
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
|
||||
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: decode
|
||||
`CSR_MPM_LOADS : read_data_r = perf_pipeline_if.loads[31:0];
|
||||
`CSR_MPM_LOADS_H : read_data_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_STORES : read_data_r = perf_pipeline_if.stores[31:0];
|
||||
`CSR_MPM_STORES_H : read_data_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_BRANCHES : read_data_r = perf_pipeline_if.branches[31:0];
|
||||
`CSR_MPM_BRANCHES_H : read_data_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: icache
|
||||
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0];
|
||||
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
|
||||
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0];
|
||||
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0];
|
||||
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
// PERF: dcache
|
||||
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0];
|
||||
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0];
|
||||
|
@ -173,26 +193,27 @@ module VX_csr_data #(
|
|||
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
// PERF: smem
|
||||
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
|
||||
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
|
||||
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
|
||||
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: MEM
|
||||
// PERF: memory
|
||||
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0];
|
||||
`CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0];
|
||||
`CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_ST : read_data_r = perf_memsys_if.mem_stalls[31:0];
|
||||
`CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0];
|
||||
`CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
// PERF: texunit
|
||||
`CSR_MPM_TEX_READS : read_data_r = perf_tex_if.mem_reads[31:0];
|
||||
`CSR_MPM_TEX_READS_H : read_data_r = 32'(perf_tex_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_TEX_LAT : read_data_r = perf_tex_if.mem_latency[31:0];
|
||||
`CSR_MPM_TEX_LAT_H : read_data_r = 32'(perf_tex_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
`endif
|
||||
// PERF: reserved
|
||||
`CSR_MPM_RESERVED : read_data_r = '0;
|
||||
`CSR_MPM_RESERVED_H : read_data_r = '0;
|
||||
|
@ -217,16 +238,23 @@ module VX_csr_data #(
|
|||
`CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID;
|
||||
|
||||
default: begin
|
||||
if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
|
||||
|| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)
|
||||
|| (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin
|
||||
if ((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
|
||||
|| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32))) begin
|
||||
read_addr_valid_r = 1;
|
||||
end else
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
if ((read_addr == `CSR_TEX_UNIT)
|
||||
|| (read_addr >= `CSR_TEX_STATE_BEGIN
|
||||
&& read_addr < `CSR_TEX_STATE_END)) begin
|
||||
read_addr_valid_r = 1;
|
||||
end else
|
||||
`endif
|
||||
read_addr_valid_r = 0;
|
||||
end
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("invalid CSR read address: %0h", read_addr))
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: %0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
assign read_data = read_data_r;
|
||||
|
||||
|
|
|
@ -7,6 +7,9 @@ module VX_csr_unit #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_perf_tex_if.slave perf_tex_if,
|
||||
`endif
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
VX_perf_pipeline_if.slave perf_pipeline_if,
|
||||
`endif
|
||||
|
@ -29,7 +32,8 @@ module VX_csr_unit #(
|
|||
);
|
||||
wire csr_we_s1;
|
||||
wire [`CSR_ADDR_BITS-1:0] csr_addr_s1;
|
||||
wire [31:0] csr_read_data, csr_read_data_s1;
|
||||
wire [31:0] csr_read_data;
|
||||
wire [31:0] csr_read_data_s1;
|
||||
wire [31:0] csr_updated_data_s1;
|
||||
|
||||
wire write_enable = csr_commit_if.valid && csr_we_s1;
|
||||
|
@ -42,8 +46,11 @@ module VX_csr_unit #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if (perf_pipeline_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
|
@ -54,10 +61,12 @@ module VX_csr_unit #(
|
|||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.read_enable (csr_req_if.valid),
|
||||
.read_uuid (csr_req_if.uuid),
|
||||
.read_addr (csr_req_if.addr),
|
||||
.read_wid (csr_req_if.wid),
|
||||
.read_data (csr_read_data),
|
||||
.write_enable (write_enable),
|
||||
.write_uuid (csr_commit_if.uuid),
|
||||
.write_addr (csr_addr_s1),
|
||||
.write_wid (csr_commit_if.wid),
|
||||
.write_data (csr_updated_data_s1),
|
||||
|
@ -101,14 +110,14 @@ module VX_csr_unit #(
|
|||
wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({csr_req_valid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}),
|
||||
.data_out ({csr_commit_if.valid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1})
|
||||
.data_in ({csr_req_valid, csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}),
|
||||
.data_out ({csr_commit_if.valid, csr_commit_if.uuid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1})
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
|
|
|
@ -20,6 +20,10 @@ module VX_decode #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if.decode perf_decode_if,
|
||||
`endif
|
||||
|
||||
// inputs
|
||||
VX_ifetch_rsp_if.slave ifetch_rsp_if,
|
||||
|
||||
|
@ -57,7 +61,6 @@ module VX_decode #(
|
|||
wire [11:0] s_imm = {func7, rd};
|
||||
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
|
||||
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
|
||||
wire [11:0] jalr_imm = {func7, rs2};
|
||||
|
||||
`UNUSED_VAR (rs3)
|
||||
|
||||
|
@ -169,7 +172,7 @@ module VX_decode #(
|
|||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
is_wstall = 1;
|
||||
imm = {{20{jalr_imm[11]}}, jalr_imm};
|
||||
imm = {{20{u_12[11]}}, u_12};
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
|
@ -192,7 +195,7 @@ module VX_decode #(
|
|||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`INST_F: begin
|
||||
`INST_FENCE: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_mod = `INST_MOD_BITS'(1);
|
||||
end
|
||||
|
@ -214,9 +217,9 @@ module VX_decode #(
|
|||
case (u_12)
|
||||
12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL);
|
||||
12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK);
|
||||
12'h002: op_type = `INST_OP_BITS'(`INST_BR_URET);
|
||||
12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET);
|
||||
12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET);
|
||||
12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET);
|
||||
12'h7B2: op_type = `INST_OP_BITS'(`INST_BR_DRET);
|
||||
default:;
|
||||
endcase
|
||||
op_mod = 1;
|
||||
|
@ -347,7 +350,7 @@ module VX_decode #(
|
|||
endcase
|
||||
end
|
||||
`endif
|
||||
`INST_GPU: begin
|
||||
`INST_GPGPU: begin
|
||||
ex_type = `EX_GPU;
|
||||
case (func3)
|
||||
3'h0: begin
|
||||
|
@ -374,9 +377,21 @@ module VX_decode #(
|
|||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
end
|
||||
3'h5: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(`INST_LSU_LW);
|
||||
op_mod = `INST_MOD_BITS'(2);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`INST_GPU: begin
|
||||
case (func3)
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
3'h0: begin
|
||||
ex_type = `EX_GPU;
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_TEX);
|
||||
op_mod = `INST_MOD_BITS'(func2);
|
||||
use_rd = 1;
|
||||
|
@ -386,12 +401,6 @@ module VX_decode #(
|
|||
`USED_IREG (rs3);
|
||||
end
|
||||
`endif
|
||||
3'h6: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(`INST_LSU_LW);
|
||||
op_mod = `INST_MOD_BITS'(2);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
@ -405,6 +414,7 @@ module VX_decode #(
|
|||
wire wb = use_rd && (| rd_r);
|
||||
|
||||
assign decode_if.valid = ifetch_rsp_if.valid;
|
||||
assign decode_if.uuid = ifetch_rsp_if.uuid;
|
||||
assign decode_if.wid = ifetch_rsp_if.wid;
|
||||
assign decode_if.tmask = ifetch_rsp_if.tmask;
|
||||
assign decode_if.PC = ifetch_rsp_if.PC;
|
||||
|
@ -433,6 +443,42 @@ module VX_decode #(
|
|||
|
||||
assign ifetch_rsp_if.ready = decode_if.ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle;
|
||||
|
||||
wire [`NUM_THREADS-1:0] perf_loads_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_stores_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_branches_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}};
|
||||
|
||||
`POP_COUNT(perf_loads_per_cycle, perf_loads_per_mask);
|
||||
`POP_COUNT(perf_stores_per_cycle, perf_stores_per_mask);
|
||||
`POP_COUNT(perf_branches_per_cycle, perf_branches_per_mask);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_branches;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_loads <= 0;
|
||||
perf_stores <= 0;
|
||||
perf_branches <= 0;
|
||||
end else begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle);
|
||||
perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_decode_if.loads = perf_loads;
|
||||
assign perf_decode_if.stores = perf_stores;
|
||||
assign perf_decode_if.branches = perf_branches;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
|
@ -440,7 +486,8 @@ module VX_decode #(
|
|||
trace_ex_type(decode_if.ex_type);
|
||||
dpi_trace(", op=");
|
||||
trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b (#%0d)\n",
|
||||
decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -34,6 +34,8 @@
|
|||
|
||||
`define PERF_CTR_BITS 44
|
||||
|
||||
`define UUID_BITS 44
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define EX_NOP 3'h0
|
||||
|
@ -55,7 +57,7 @@
|
|||
`define INST_S 7'b0100011 // store instructions
|
||||
`define INST_I 7'b0010011 // immediate instructions
|
||||
`define INST_R 7'b0110011 // register instructions
|
||||
`define INST_F 7'b0001111 // Fence instructions
|
||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||
`define INST_SYS 7'b1110011 // system instructions
|
||||
|
||||
`define INST_FL 7'b0000111 // float load instruction
|
||||
|
@ -66,7 +68,8 @@
|
|||
`define INST_FNMADD 7'b1001111
|
||||
`define INST_FCI 7'b1010011 // float common instructions
|
||||
|
||||
`define INST_GPU 7'b1101011
|
||||
`define INST_GPGPU 7'b1101011
|
||||
`define INST_GPU 7'b1011011
|
||||
|
||||
`define INST_TEX 7'b0101011
|
||||
|
||||
|
@ -117,9 +120,9 @@
|
|||
`define INST_BR_JALR 4'b1001
|
||||
`define INST_BR_ECALL 4'b1010
|
||||
`define INST_BR_EBREAK 4'b1011
|
||||
`define INST_BR_MRET 4'b1100
|
||||
`define INST_BR_URET 4'b1100
|
||||
`define INST_BR_SRET 4'b1101
|
||||
`define INST_BR_DRET 4'b1110
|
||||
`define INST_BR_MRET 4'b1110
|
||||
`define INST_BR_OTHER 4'b1111
|
||||
`define INST_BR_BITS 4
|
||||
`define INST_BR_NEG(x) x[1]
|
||||
|
@ -154,6 +157,7 @@
|
|||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(x) x[2:0]
|
||||
`define INST_LSU_WSIZE(x) x[1:0]
|
||||
`define INST_LSU_IS_MEM(x) (3'h0 == x)
|
||||
`define INST_LSU_IS_FENCE(x) (3'h1 == x)
|
||||
`define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
|
||||
|
||||
|
@ -185,14 +189,14 @@
|
|||
`define INST_FPU_NMADD 4'hF
|
||||
`define INST_FPU_BITS 4
|
||||
|
||||
`define INST_GPU_TMC 3'h0
|
||||
`define INST_GPU_WSPAWN 3'h1
|
||||
`define INST_GPU_SPLIT 3'h2
|
||||
`define INST_GPU_JOIN 3'h3
|
||||
`define INST_GPU_BAR 3'h4
|
||||
`define INST_GPU_PRED 3'h5
|
||||
`define INST_GPU_TEX 3'h6
|
||||
`define INST_GPU_BITS 3
|
||||
`define INST_GPU_TMC 4'h0
|
||||
`define INST_GPU_WSPAWN 4'h1
|
||||
`define INST_GPU_SPLIT 4'h2
|
||||
`define INST_GPU_JOIN 4'h3
|
||||
`define INST_GPU_BAR 4'h4
|
||||
`define INST_GPU_PRED 4'h5
|
||||
`define INST_GPU_TEX 4'h6
|
||||
`define INST_GPU_BITS 4
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -237,18 +241,15 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO // wid PC
|
||||
`define DBG_CACHE_REQ_MDATAW (`NW_BITS + 32)
|
||||
`else
|
||||
`define DBG_CACHE_REQ_MDATAW 0
|
||||
`endif
|
||||
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BIT 1
|
||||
|
||||
// texture tag bits
|
||||
`define TEX_TAG_BIT 1
|
||||
|
||||
// cache address type bits
|
||||
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BIT + `SM_ENABLE)
|
||||
|
||||
////////////////////////// Icache Configurable Knobs //////////////////////////
|
||||
|
||||
// Cache ID
|
||||
|
@ -264,7 +265,7 @@
|
|||
`define ICACHE_CORE_TAG_ID_BITS `NW_BITS
|
||||
|
||||
// Core request tag bits
|
||||
`define ICACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `ICACHE_CORE_TAG_ID_BITS)
|
||||
`define ICACHE_CORE_TAG_WIDTH (`UUID_BITS + `ICACHE_CORE_TAG_ID_BITS)
|
||||
|
||||
// Memory request data bits
|
||||
`define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8)
|
||||
|
@ -289,17 +290,14 @@
|
|||
// Core request tag bits
|
||||
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
|
||||
`define TEX_TAG_ID_BITS (2)
|
||||
`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS)
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT)
|
||||
`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS)
|
||||
`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS)
|
||||
`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS)
|
||||
`define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2)
|
||||
`define LSU_TEX_DCACHE_TAG_BITS (`UUID_BITS + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT)
|
||||
`else
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
|
||||
`define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
|
||||
`endif
|
||||
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS)
|
||||
`define DCACHE_CORE_TAG_WIDTH (`UUID_BITS + `DCACHE_CORE_TAG_ID_BITS)
|
||||
|
||||
// Memory request data bits
|
||||
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
|
||||
|
|
|
@ -42,15 +42,15 @@ module VX_dispatch (
|
|||
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) alu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (alu_req_valid),
|
||||
.ready_in (alu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
|
||||
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
|
||||
.data_out ({alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
|
||||
.valid_out (alu_req_if.valid),
|
||||
.ready_out (alu_req_if.ready)
|
||||
);
|
||||
|
@ -63,15 +63,15 @@ module VX_dispatch (
|
|||
wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
|
||||
.OUT_REG (1)
|
||||
) lsu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (lsu_req_valid),
|
||||
.ready_in (lsu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}),
|
||||
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}),
|
||||
.data_out ({lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}),
|
||||
.valid_out (lsu_req_if.valid),
|
||||
.ready_out (lsu_req_if.ready)
|
||||
);
|
||||
|
@ -85,15 +85,15 @@ module VX_dispatch (
|
|||
wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid];
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
|
||||
.OUT_REG (1)
|
||||
) csr_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (csr_req_valid),
|
||||
.ready_in (csr_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}),
|
||||
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}),
|
||||
.data_out ({csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}),
|
||||
.valid_out (csr_req_if.valid),
|
||||
.ready_out (csr_req_if.ready)
|
||||
);
|
||||
|
@ -105,15 +105,15 @@ module VX_dispatch (
|
|||
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) fpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (fpu_req_valid),
|
||||
.ready_in (fpu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
|
||||
.valid_out (fpu_req_if.valid),
|
||||
.ready_out (fpu_req_if.ready)
|
||||
);
|
||||
|
@ -127,15 +127,15 @@ module VX_dispatch (
|
|||
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) gpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (gpu_req_valid),
|
||||
.ready_in (gpu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({gpu_req_if.uuid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
|
||||
.valid_out (gpu_req_if.valid),
|
||||
.ready_out (gpu_req_if.ready)
|
||||
);
|
||||
|
|
|
@ -52,49 +52,31 @@ module VX_execute #(
|
|||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_rsp_if();
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_rsp_if();
|
||||
|
||||
VX_tex_csr_if tex_csr_if();
|
||||
|
||||
wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in;
|
||||
wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out;
|
||||
|
||||
`UNUSED_VAR (tex_tag_out)
|
||||
`UNUSED_VAR (lsu_tag_out)
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]);
|
||||
assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]);
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS];
|
||||
assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS];
|
||||
`endif
|
||||
end
|
||||
|
||||
assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0];
|
||||
assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0];
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
|
||||
assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if perf_tex_if();
|
||||
`endif
|
||||
|
||||
VX_cache_arb #(
|
||||
|
@ -113,7 +95,7 @@ module VX_execute #(
|
|||
.req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
|
||||
.req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}),
|
||||
.req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}),
|
||||
.req_tag_in ({tex_tag_in, lsu_tag_in}),
|
||||
.req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}),
|
||||
.req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}),
|
||||
|
||||
// Dcache request
|
||||
|
@ -136,7 +118,7 @@ module VX_execute #(
|
|||
.rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
|
||||
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
|
||||
.rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}),
|
||||
.rsp_tag_out ({tex_tag_out, lsu_tag_out}),
|
||||
.rsp_tag_out ({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}),
|
||||
.rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
|
||||
);
|
||||
|
||||
|
@ -187,6 +169,9 @@ module VX_execute #(
|
|||
.clk (clk),
|
||||
.reset (csr_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
|
@ -231,6 +216,9 @@ module VX_execute #(
|
|||
.reset (gpu_reset),
|
||||
.gpu_req_if (gpu_req_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.dcache_req_if (tex_dcache_req_if),
|
||||
.dcache_rsp_if (tex_dcache_rsp_if),
|
||||
|
|
|
@ -22,6 +22,7 @@ module VX_fpu_unit #(
|
|||
wire valid_out;
|
||||
wire ready_out;
|
||||
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
|
@ -39,7 +40,7 @@ module VX_fpu_unit #(
|
|||
wire fpuq_pop = valid_out && ready_out;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
|
||||
.SIZE (`FPUQ_SIZE)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
|
@ -48,8 +49,8 @@ module VX_fpu_unit #(
|
|||
.write_addr (tag_in),
|
||||
.read_addr (tag_out),
|
||||
.release_addr (tag_out),
|
||||
.write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}),
|
||||
.read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}),
|
||||
.write_data ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}),
|
||||
.read_data ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}),
|
||||
.release_slot (fpuq_pop),
|
||||
.full (fpuq_full),
|
||||
`UNUSED_PIN (empty)
|
||||
|
@ -180,14 +181,14 @@ module VX_fpu_unit #(
|
|||
wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({valid_out, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}),
|
||||
.data_out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r})
|
||||
.data_in ({valid_out, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}),
|
||||
.data_out ({fpu_commit_if.valid, fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r})
|
||||
);
|
||||
|
||||
assign fpu_commit_if.eop = 1'b1;
|
||||
|
|
|
@ -12,6 +12,10 @@ module VX_gpu_unit #(
|
|||
VX_gpu_req_if.slave gpu_req_if,
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if.master perf_tex_if,
|
||||
`endif
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
VX_tex_csr_if.slave tex_csr_if,
|
||||
|
@ -28,12 +32,13 @@ module VX_gpu_unit #(
|
|||
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
|
||||
localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW);
|
||||
|
||||
wire rsp_valid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
wire rsp_valid;
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
|
||||
wire [RSP_DATAW-1:0] rsp_data, rsp_data_r;
|
||||
|
||||
|
@ -112,6 +117,7 @@ module VX_gpu_unit #(
|
|||
wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX);
|
||||
|
||||
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
|
||||
assign tex_req_if.uuid = gpu_req_if.uuid;
|
||||
assign tex_req_if.wid = gpu_req_if.wid;
|
||||
assign tex_req_if.tmask = gpu_req_if.tmask;
|
||||
assign tex_req_if.PC = gpu_req_if.PC;
|
||||
|
@ -128,6 +134,9 @@ module VX_gpu_unit #(
|
|||
) tex_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.tex_req_if (tex_req_if),
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.tex_rsp_if (tex_rsp_if),
|
||||
|
@ -143,6 +152,7 @@ module VX_gpu_unit #(
|
|||
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
|
||||
|
||||
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
|
||||
assign rsp_uuid = tex_rsp_if.valid ? tex_rsp_if.uuid : gpu_req_if.uuid;
|
||||
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
|
||||
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
|
||||
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
|
||||
|
@ -161,6 +171,7 @@ module VX_gpu_unit #(
|
|||
assign is_warp_ctl = 1;
|
||||
|
||||
assign rsp_valid = gpu_req_if.valid;
|
||||
assign rsp_uuid = gpu_req_if.uuid;
|
||||
assign rsp_wid = gpu_req_if.wid;
|
||||
assign rsp_tmask = gpu_req_if.tmask;
|
||||
assign rsp_PC = gpu_req_if.PC;
|
||||
|
@ -176,14 +187,14 @@ module VX_gpu_unit #(
|
|||
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
|
||||
.data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
|
||||
);
|
||||
|
||||
assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0];
|
||||
|
@ -200,7 +211,7 @@ module VX_gpu_unit #(
|
|||
assign gpu_req_if.ready = ~stall_in;
|
||||
|
||||
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_uuid, gpu_commit_if.uuid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid);
|
||||
|
|
|
@ -15,7 +15,7 @@ module VX_ibuffer #(
|
|||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
|
||||
localparam DATAW = `UUID_BITS + `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
|
||||
localparam ADDRW = $clog2(`IBUF_SIZE+1);
|
||||
localparam NWARPSW = $clog2(`NUM_WARPS+1);
|
||||
|
||||
|
@ -168,7 +168,8 @@ module VX_ibuffer #(
|
|||
|
||||
assign decode_if.ready = ~q_full[decode_if.wid];
|
||||
|
||||
assign q_data_in = {decode_if.tmask,
|
||||
assign q_data_in = {decode_if.uuid,
|
||||
decode_if.tmask,
|
||||
decode_if.PC,
|
||||
decode_if.ex_type,
|
||||
decode_if.op_type,
|
||||
|
@ -184,7 +185,8 @@ module VX_ibuffer #(
|
|||
|
||||
assign ibuffer_if.valid = deq_valid;
|
||||
assign ibuffer_if.wid = deq_wid;
|
||||
assign {ibuffer_if.tmask,
|
||||
assign {ibuffer_if.uuid,
|
||||
ibuffer_if.tmask,
|
||||
ibuffer_if.PC,
|
||||
ibuffer_if.ex_type,
|
||||
ibuffer_if.op_type,
|
||||
|
|
|
@ -24,77 +24,77 @@ module VX_icache_stage #(
|
|||
|
||||
localparam OUT_REG = 0;
|
||||
|
||||
wire [`NW_BITS-1:0] req_tag, rsp_tag;
|
||||
|
||||
wire icache_req_fire = icache_req_if.valid && icache_req_if.ready;
|
||||
|
||||
wire [`NW_BITS-1:0] req_tag = ifetch_req_if.wid;
|
||||
wire [`NW_BITS-1:0] rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0];
|
||||
assign req_tag = ifetch_req_if.wid;
|
||||
assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0];
|
||||
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32 + `NUM_THREADS),
|
||||
.DATAW (32 + `NUM_THREADS + `UUID_BITS),
|
||||
.SIZE (`NUM_WARPS),
|
||||
.LUTRAM (1)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
.wren (icache_req_fire),
|
||||
.waddr (req_tag),
|
||||
.wdata ({ifetch_req_if.PC, ifetch_req_if.tmask}),
|
||||
.wdata ({ifetch_req_if.PC, ifetch_req_if.tmask, ifetch_req_if.uuid}),
|
||||
.raddr (rsp_tag),
|
||||
.rdata ({rsp_PC, rsp_tmask})
|
||||
.rdata ({rsp_PC, rsp_tmask, rsp_uuid})
|
||||
);
|
||||
|
||||
`RUNTIME_ASSERT((!ifetch_req_if.valid || ifetch_req_if.PC >= `STARTUP_ADDR),
|
||||
("invalid PC=%0h, wid=%0d, tmask=%b", ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask))
|
||||
("%t: *** invalid PC=%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid))
|
||||
|
||||
// Icache Request
|
||||
assign icache_req_if.valid = ifetch_req_if.valid;
|
||||
assign icache_req_if.addr = ifetch_req_if.PC[31:2];
|
||||
assign icache_req_if.tag = {ifetch_req_if.uuid, req_tag};
|
||||
|
||||
// Can accept new request?
|
||||
assign ifetch_req_if.ready = icache_req_if.ready;
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign icache_req_if.tag = {ifetch_req_if.wid, ifetch_req_if.PC, req_tag};
|
||||
`else
|
||||
assign icache_req_if.tag = req_tag;
|
||||
`endif
|
||||
|
||||
wire [`NW_BITS-1:0] rsp_wid = rsp_tag;
|
||||
|
||||
wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32),
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `UUID_BITS),
|
||||
.RESETW (1),
|
||||
.DEPTH (OUT_REG)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data}),
|
||||
.data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data})
|
||||
.data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data, rsp_uuid}),
|
||||
.data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid})
|
||||
);
|
||||
|
||||
// Can accept new response?
|
||||
assign icache_rsp_if.ready = ~stall_out;
|
||||
|
||||
`SCOPE_ASSIGN (icache_req_fire, icache_req_fire);
|
||||
`SCOPE_ASSIGN (icache_req_wid, ifetch_req_if.wid);
|
||||
`SCOPE_ASSIGN (icache_req_uuid, ifetch_req_if.uuid);
|
||||
`SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0});
|
||||
`SCOPE_ASSIGN (icache_req_tag, req_tag);
|
||||
|
||||
`SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready);
|
||||
`SCOPE_ASSIGN (icache_rsp_uuid, rsp_uuid);
|
||||
`SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data);
|
||||
`SCOPE_ASSIGN (icache_rsp_tag, rsp_tag);
|
||||
|
||||
`ifdef DBG_TRACE_CORE_ICACHE
|
||||
always @(posedge clk) begin
|
||||
if (icache_req_if.valid && icache_req_if.ready) begin
|
||||
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC);
|
||||
if (icache_req_fire) begin
|
||||
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, ifetch_req_if.uuid);
|
||||
end
|
||||
if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin
|
||||
dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data);
|
||||
dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -9,7 +9,7 @@ module VX_issue #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if.master perf_pipeline_if,
|
||||
VX_perf_pipeline_if.issue perf_issue_if,
|
||||
`endif
|
||||
|
||||
VX_decode_if.slave decode_if,
|
||||
|
@ -38,6 +38,7 @@ module VX_issue #(
|
|||
|
||||
// scoreboard writeback interface
|
||||
assign sboard_wb_if.valid = writeback_if.valid;
|
||||
assign sboard_wb_if.uuid = writeback_if.uuid;
|
||||
assign sboard_wb_if.wid = writeback_if.wid;
|
||||
assign sboard_wb_if.PC = writeback_if.PC;
|
||||
assign sboard_wb_if.rd = writeback_if.rd;
|
||||
|
@ -45,6 +46,7 @@ module VX_issue #(
|
|||
|
||||
// scoreboard interface
|
||||
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
|
||||
assign scoreboard_if.uuid = ibuffer_if.uuid;
|
||||
assign scoreboard_if.wid = ibuffer_if.wid;
|
||||
assign scoreboard_if.PC = ibuffer_if.PC;
|
||||
assign scoreboard_if.wb = ibuffer_if.wb;
|
||||
|
@ -57,6 +59,7 @@ module VX_issue #(
|
|||
|
||||
// dispatch interface
|
||||
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
|
||||
assign dispatch_if.uuid = ibuffer_if.uuid;
|
||||
assign dispatch_if.wid = ibuffer_if.wid;
|
||||
assign dispatch_if.tmask = ibuffer_if.tmask;
|
||||
assign dispatch_if.PC = ibuffer_if.PC;
|
||||
|
@ -121,9 +124,8 @@ module VX_issue #(
|
|||
);
|
||||
|
||||
`SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready);
|
||||
`SCOPE_ASSIGN (issue_wid, ibuffer_if.wid);
|
||||
`SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid);
|
||||
`SCOPE_ASSIGN (issue_tmask, ibuffer_if.tmask);
|
||||
`SCOPE_ASSIGN (issue_pc, ibuffer_if.PC);
|
||||
`SCOPE_ASSIGN (issue_ex_type, ibuffer_if.ex_type);
|
||||
`SCOPE_ASSIGN (issue_op_type, ibuffer_if.op_type);
|
||||
`SCOPE_ASSIGN (issue_op_mod, ibuffer_if.op_mod);
|
||||
|
@ -140,10 +142,9 @@ module VX_issue #(
|
|||
`SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data);
|
||||
`SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data);
|
||||
`SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data);
|
||||
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
|
||||
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
|
||||
`SCOPE_ASSIGN (writeback_uuid, writeback_if.uuid);
|
||||
`SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask);
|
||||
`SCOPE_ASSIGN (writeback_wid, writeback_if.wid);
|
||||
`SCOPE_ASSIGN (writeback_pc, writeback_if.PC);
|
||||
`SCOPE_ASSIGN (writeback_rd, writeback_if.rd);
|
||||
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
|
||||
`SCOPE_ASSIGN (writeback_eop, writeback_if.eop);
|
||||
|
@ -171,40 +172,35 @@ module VX_issue #(
|
|||
perf_fpu_stalls <= 0;
|
||||
`endif
|
||||
end else begin
|
||||
if (decode_if.valid & !decode_if.ready) begin
|
||||
if (decode_if.valid & ~decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (scoreboard_if.valid & !scoreboard_if.ready) begin
|
||||
if (scoreboard_if.valid & ~scoreboard_if.ready) begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (alu_req_if.valid & !alu_req_if.ready) begin
|
||||
perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
|
||||
if (dispatch_if.valid & ~dispatch_if.ready) begin
|
||||
case (dispatch_if.ex_type)
|
||||
`EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
`EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
|
||||
`endif
|
||||
`EX_LSU: perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
|
||||
`EX_CSR: perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
|
||||
//`EX_GPU:
|
||||
default: perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
|
||||
endcase
|
||||
end
|
||||
if (lsu_req_if.valid & !lsu_req_if.ready) begin
|
||||
perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (csr_req_if.valid & !csr_req_if.ready) begin
|
||||
perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (gpu_req_if.valid & !gpu_req_if.ready) begin
|
||||
perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_req_if.valid & !fpu_req_if.ready) begin
|
||||
perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_pipeline_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_pipeline_if.scb_stalls = perf_scb_stalls;
|
||||
assign perf_pipeline_if.alu_stalls = perf_alu_stalls;
|
||||
assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls;
|
||||
assign perf_pipeline_if.csr_stalls = perf_csr_stalls;
|
||||
assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls;
|
||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
||||
assign perf_issue_if.alu_stalls = perf_alu_stalls;
|
||||
assign perf_issue_if.lsu_stalls = perf_lsu_stalls;
|
||||
assign perf_issue_if.csr_stalls = perf_csr_stalls;
|
||||
assign perf_issue_if.gpu_stalls = perf_gpu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls;
|
||||
assign perf_issue_if.fpu_stalls = perf_fpu_stalls;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
@ -216,7 +212,7 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace(", rs2_data=");
|
||||
`TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", alu_req_if.uuid);
|
||||
end
|
||||
if (lsu_req_if.valid && lsu_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=",
|
||||
|
@ -224,13 +220,13 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS);
|
||||
dpi_trace(", data=");
|
||||
`TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", lsu_req_if.uuid);
|
||||
end
|
||||
if (csr_req_if.valid && csr_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=",
|
||||
$time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr);
|
||||
`TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", csr_req_if.uuid);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_req_if.valid && fpu_req_if.ready) begin
|
||||
|
@ -241,7 +237,7 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(", rs3_data=");
|
||||
`TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", fpu_req_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (gpu_req_if.valid && gpu_req_if.ready) begin
|
||||
|
@ -252,7 +248,7 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(", rs3_data=");
|
||||
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", gpu_req_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -21,16 +21,14 @@ module VX_lsu_unit #(
|
|||
);
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = 32 - MEM_ASHIFT;
|
||||
|
||||
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
|
||||
|
||||
localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE;
|
||||
|
||||
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
|
||||
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
|
||||
`STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
|
||||
wire req_valid;
|
||||
wire [`UUID_BITS-1:0] req_uuid;
|
||||
wire [`NUM_THREADS-1:0] req_tmask;
|
||||
wire [`NUM_THREADS-1:0][31:0] req_addr;
|
||||
wire [`INST_LSU_BITS-1:0] req_type;
|
||||
|
@ -44,8 +42,9 @@ module VX_lsu_unit #(
|
|||
|
||||
wire mbuf_empty;
|
||||
|
||||
wire [`NUM_THREADS-1:0][ADDR_TYPEW-1:0] lsu_addr_type, req_addr_type;
|
||||
wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type;
|
||||
|
||||
// full address calculation
|
||||
wire [`NUM_THREADS-1:0][31:0] full_addr;
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset;
|
||||
|
@ -56,16 +55,16 @@ module VX_lsu_unit #(
|
|||
for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin
|
||||
assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1];
|
||||
end
|
||||
|
||||
wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
// is non-cacheable address
|
||||
wire is_addr_nc = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT));
|
||||
|
||||
if (`SM_ENABLE) begin
|
||||
// is shared memory address
|
||||
wire is_addr_sm = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT))
|
||||
& (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT));
|
||||
& (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT));
|
||||
assign lsu_addr_type[i] = {is_addr_nc, is_addr_sm};
|
||||
end else begin
|
||||
assign lsu_addr_type[i] = is_addr_nc;
|
||||
|
@ -83,19 +82,20 @@ module VX_lsu_unit #(
|
|||
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + 1 + 1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
) req_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_in),
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
);
|
||||
|
||||
// Can accept new request?
|
||||
assign lsu_req_if.ready = ~stall_in && ~fence_wait;
|
||||
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [31:0] rsp_pc;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
|
@ -104,9 +104,6 @@ module VX_lsu_unit #(
|
|||
wire rsp_is_dup;
|
||||
wire rsp_is_prefetch;
|
||||
|
||||
`UNUSED_VAR (rsp_type)
|
||||
`UNUSED_VAR (rsp_is_prefetch)
|
||||
|
||||
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
|
||||
wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
@ -117,6 +114,9 @@ module VX_lsu_unit #(
|
|||
wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr;
|
||||
wire mbuf_full;
|
||||
|
||||
`UNUSED_VAR (rsp_type)
|
||||
`UNUSED_VAR (rsp_is_prefetch)
|
||||
|
||||
wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset;
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign req_offset[i] = req_addr[i][1:0];
|
||||
|
@ -135,14 +135,14 @@ module VX_lsu_unit #(
|
|||
|
||||
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
|
||||
|
||||
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS];
|
||||
assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS];
|
||||
`UNUSED_VAR (dcache_rsp_if.tag)
|
||||
|
||||
// do not writeback from software prefetch
|
||||
wire req_wb2 = req_wb && ~req_is_prefetch;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
|
||||
.DATAW (`UUID_BITS + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
|
||||
.SIZE (`LSUQ_SIZE)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
|
@ -150,8 +150,8 @@ module VX_lsu_unit #(
|
|||
.write_addr (mbuf_waddr),
|
||||
.acquire_slot (mbuf_push),
|
||||
.read_addr (mbuf_raddr),
|
||||
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.write_data ({req_uuid, req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_uuid, rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.release_addr (mbuf_raddr),
|
||||
.release_slot (mbuf_pop),
|
||||
.full (mbuf_full),
|
||||
|
@ -214,7 +214,7 @@ module VX_lsu_unit #(
|
|||
0: mem_req_byteen[req_offset[i]] = 1;
|
||||
1: begin
|
||||
mem_req_byteen[req_offset[i]] = 1;
|
||||
mem_req_byteen[{req_addr[i][1], 1'b1}] = 1;
|
||||
mem_req_byteen[{req_offset[i][1], 1'b1}] = 1;
|
||||
end
|
||||
default : mem_req_byteen = {4{1'b1}};
|
||||
endcase
|
||||
|
@ -235,14 +235,9 @@ module VX_lsu_unit #(
|
|||
assign dcache_req_if.addr[i] = req_addr[i][31:2];
|
||||
assign dcache_req_if.byteen[i] = mem_req_byteen;
|
||||
assign dcache_req_if.data[i] = mem_req_data;
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign dcache_req_if.tag[i] = {req_wid, req_pc, req_tag, req_addr_type[i]};
|
||||
`else
|
||||
assign dcache_req_if.tag[i] = {req_tag, req_addr_type[i]};
|
||||
`endif
|
||||
assign dcache_req_if.tag[i] = {req_uuid, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]};
|
||||
end
|
||||
|
||||
|
||||
assign ready_in = req_dep_ready && dcache_req_ready;
|
||||
|
||||
// send store commit
|
||||
|
@ -250,6 +245,7 @@ module VX_lsu_unit #(
|
|||
wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready;
|
||||
|
||||
assign st_commit_if.valid = is_store_rsp;
|
||||
assign st_commit_if.uuid = req_uuid;
|
||||
assign st_commit_if.wid = req_wid;
|
||||
assign st_commit_if.tmask = req_tmask;
|
||||
assign st_commit_if.PC = req_pc;
|
||||
|
@ -286,14 +282,14 @@ module VX_lsu_unit #(
|
|||
wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1),
|
||||
.RESETW (1)
|
||||
) rsp_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!load_rsp_stall),
|
||||
.data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
|
||||
.data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
|
||||
.data_in ({dcache_rsp_if.valid, rsp_uuid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
|
||||
.data_out ({ld_commit_if.valid, ld_commit_if.uuid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
|
||||
);
|
||||
|
||||
// Can accept new cache response?
|
||||
|
@ -301,19 +297,19 @@ module VX_lsu_unit #(
|
|||
|
||||
// scope registration
|
||||
`SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire);
|
||||
`SCOPE_ASSIGN (dcache_req_wid, req_wid);
|
||||
`SCOPE_ASSIGN (dcache_req_pc, req_pc);
|
||||
`SCOPE_ASSIGN (dcache_req_uuid, req_uuid);
|
||||
`SCOPE_ASSIGN (dcache_req_addr, req_addr);
|
||||
`SCOPE_ASSIGN (dcache_req_rw, ~req_wb);
|
||||
`SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen);
|
||||
`SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data);
|
||||
`SCOPE_ASSIGN (dcache_req_tag, req_tag);
|
||||
`SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_fire}});
|
||||
`SCOPE_ASSIGN (dcache_rsp_uuid, rsp_uuid);
|
||||
`SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data);
|
||||
`SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr);
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 1)-1:0] pending_reqs;
|
||||
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs;
|
||||
wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -321,7 +317,7 @@ module VX_lsu_unit #(
|
|||
pending_reqs <= '0;
|
||||
end begin
|
||||
if (mbuf_push) begin
|
||||
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, $time, 1'b1};
|
||||
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, req_uuid, $time, 1'b1};
|
||||
end
|
||||
if (mbuf_pop) begin
|
||||
pending_reqs[mbuf_raddr] <= '0;
|
||||
|
@ -331,8 +327,11 @@ module VX_lsu_unit #(
|
|||
for (integer i = 0; i < `LSUQ_SIZE; ++i) begin
|
||||
if (pending_reqs[i][0]) begin
|
||||
`ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout,
|
||||
("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d",
|
||||
$time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+32+`NR_BITS +: `NW_BITS], pending_reqs[i][1+64+`NR_BITS +: 32], pending_reqs[i][1+64 +: `NR_BITS]));
|
||||
("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+`UUID_BITS+`NR_BITS+32 +: `NW_BITS],
|
||||
pending_reqs[i][1+64+`UUID_BITS+`NR_BITS +: 32],
|
||||
pending_reqs[i][1+64+`UUID_BITS +: `NR_BITS],
|
||||
pending_reqs[i][1+64 +: `UUID_BITS]));
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -352,20 +351,20 @@ module VX_lsu_unit #(
|
|||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
dpi_trace(", data=");
|
||||
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(", (#%0d)\n", req_uuid);
|
||||
end else begin
|
||||
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
|
||||
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
|
||||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
dpi_trace(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup);
|
||||
dpi_trace(", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid);
|
||||
end
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
|
||||
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
|
||||
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
|
||||
dpi_trace(", is_dup=%b\n", rsp_is_dup);
|
||||
dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -358,19 +358,17 @@ module VX_mem_unit # (
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
`UNUSED_VAR (perf_dcache_if.mem_stalls)
|
||||
`UNUSED_VAR (perf_dcache_if.crsp_stalls)
|
||||
|
||||
assign perf_memsys_if.icache_reads = perf_icache_if.reads;
|
||||
assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses;
|
||||
assign perf_memsys_if.icache_pipe_stalls = perf_icache_if.pipe_stalls;
|
||||
assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls;
|
||||
|
||||
assign perf_memsys_if.dcache_reads = perf_dcache_if.reads;
|
||||
assign perf_memsys_if.dcache_writes = perf_dcache_if.writes;
|
||||
assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses;
|
||||
assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses;
|
||||
assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls;
|
||||
assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls;
|
||||
assign perf_memsys_if.dcache_pipe_stalls = perf_dcache_if.pipe_stalls;
|
||||
assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls;
|
||||
|
||||
if (`SM_ENABLE) begin
|
||||
assign perf_memsys_if.smem_reads = perf_smem_if.reads;
|
||||
|
@ -382,47 +380,41 @@ end else begin
|
|||
assign perf_memsys_if.smem_bank_stalls = 0;
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_lat_per_cycle <= 0;
|
||||
perf_mem_pending_reads <= 0;
|
||||
end else begin
|
||||
perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready))));
|
||||
perf_mem_pending_reads <= perf_mem_pending_reads +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw))));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_writes <= 0;
|
||||
perf_mem_lat <= 0;
|
||||
perf_mem_stalls <= 0;
|
||||
end else begin
|
||||
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
|
||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (mem_req_if.valid && !mem_req_if.ready) begin
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle;
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_memsys_if.mem_reads = perf_mem_reads;
|
||||
assign perf_memsys_if.mem_writes = perf_mem_writes;
|
||||
assign perf_memsys_if.mem_latency = perf_mem_lat;
|
||||
assign perf_memsys_if.mem_stalls = perf_mem_stalls;
|
||||
assign perf_memsys_if.mem_latency = perf_mem_lat;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -6,6 +6,7 @@ module VX_muldiv (
|
|||
|
||||
// Inputs
|
||||
input wire [`INST_MUL_BITS-1:0] alu_op,
|
||||
input wire [`UUID_BITS-1:0] uuid_in,
|
||||
input wire [`NW_BITS-1:0] wid_in,
|
||||
input wire [`NUM_THREADS-1:0] tmask_in,
|
||||
input wire [31:0] PC_in,
|
||||
|
@ -15,6 +16,7 @@ module VX_muldiv (
|
|||
input wire [`NUM_THREADS-1:0][31:0] alu_in2,
|
||||
|
||||
// Outputs
|
||||
output wire [`UUID_BITS-1:0] uuid_out,
|
||||
output wire [`NW_BITS-1:0] wid_out,
|
||||
output wire [`NUM_THREADS-1:0] tmask_out,
|
||||
output wire [31:0] PC_out,
|
||||
|
@ -32,6 +34,7 @@ module VX_muldiv (
|
|||
wire is_div_op = `INST_MUL_IS_DIV(alu_op);
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] mul_result;
|
||||
wire [`UUID_BITS-1:0] mul_uuid_out;
|
||||
wire [`NW_BITS-1:0] mul_wid_out;
|
||||
wire [`NUM_THREADS-1:0] mul_tmask_out;
|
||||
wire [31:0] mul_PC_out;
|
||||
|
@ -63,15 +66,15 @@ module VX_muldiv (
|
|||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (mul_ready_in),
|
||||
.data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}),
|
||||
.data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result})
|
||||
.data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}),
|
||||
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result})
|
||||
);
|
||||
|
||||
`else
|
||||
|
@ -103,15 +106,15 @@ module VX_muldiv (
|
|||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (mul_ready_in),
|
||||
.data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}),
|
||||
.data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out})
|
||||
.data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}),
|
||||
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out})
|
||||
);
|
||||
|
||||
`endif
|
||||
|
@ -119,6 +122,7 @@ module VX_muldiv (
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result;
|
||||
wire [`UUID_BITS-1:0] div_uuid_out;
|
||||
wire [`NW_BITS-1:0] div_wid_out;
|
||||
wire [`NUM_THREADS-1:0] div_tmask_out;
|
||||
wire [31:0] div_PC_out;
|
||||
|
@ -147,15 +151,15 @@ module VX_muldiv (
|
|||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) div_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (div_ready_in),
|
||||
.data_in ({div_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}),
|
||||
.data_out ({div_valid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result})
|
||||
.data_in ({div_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}),
|
||||
.data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result})
|
||||
);
|
||||
|
||||
assign div_ready_in = div_ready_out || ~div_valid_out;
|
||||
|
@ -171,21 +175,21 @@ module VX_muldiv (
|
|||
.WIDTHQ (32),
|
||||
.WIDTHR (32),
|
||||
.LANES (`NUM_THREADS),
|
||||
.TAGW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
|
||||
.TAGW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
|
||||
) divide (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (div_valid_in),
|
||||
.ready_in (div_ready_in),
|
||||
.signed_mode(is_signed_div),
|
||||
.tag_in ({wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}),
|
||||
.tag_in ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}),
|
||||
.numer (alu_in1),
|
||||
.denom (alu_in2),
|
||||
.quotient (div_result_tmp),
|
||||
.remainder (rem_result_tmp),
|
||||
.ready_out (div_ready_out),
|
||||
.valid_out (div_valid_out),
|
||||
.tag_out ({div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
|
||||
.tag_out ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
|
||||
);
|
||||
|
||||
assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;
|
||||
|
@ -195,6 +199,7 @@ module VX_muldiv (
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire rsp_valid = mul_valid_out || div_valid_out;
|
||||
wire [`UUID_BITS-1:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out;
|
||||
wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out;
|
||||
wire [31:0] rsp_PC = mul_valid_out ? mul_PC_out : div_PC_out;
|
||||
|
@ -205,14 +210,14 @@ module VX_muldiv (
|
|||
assign stall_out = ~ready_out && valid_out;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
|
||||
.data_out ({valid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out})
|
||||
.data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
|
||||
.data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out})
|
||||
);
|
||||
|
||||
// can accept new request?
|
||||
|
|
|
@ -165,6 +165,9 @@ module VX_pipeline #(
|
|||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_decode_if (perf_pipeline_if.decode),
|
||||
`endif
|
||||
.ifetch_rsp_if (ifetch_rsp_if),
|
||||
.decode_if (decode_if),
|
||||
.wstall_if (wstall_if),
|
||||
|
@ -180,7 +183,7 @@ module VX_pipeline #(
|
|||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_pipeline_if (perf_pipeline_if),
|
||||
.perf_issue_if (perf_pipeline_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
|
|
|
@ -60,22 +60,22 @@ module VX_scoreboard #(
|
|||
end else begin
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
|
||||
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3);
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (release_reg) begin
|
||||
`ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0,
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
|
||||
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd));
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd,writeback_if.uuid));
|
||||
end
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
deadlock_ctr <= deadlock_ctr + 1;
|
||||
`ASSERT(deadlock_ctr < deadlock_timeout,
|
||||
("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
|
||||
("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3));
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid));
|
||||
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
|
||||
deadlock_ctr <= 0;
|
||||
end
|
||||
|
|
|
@ -35,9 +35,9 @@ task trace_ex_op (
|
|||
`INST_BR_JALR: dpi_trace("JALR");
|
||||
`INST_BR_ECALL: dpi_trace("ECALL");
|
||||
`INST_BR_EBREAK:dpi_trace("EBREAK");
|
||||
`INST_BR_MRET: dpi_trace("MRET");
|
||||
`INST_BR_URET: dpi_trace("URET");
|
||||
`INST_BR_SRET: dpi_trace("SRET");
|
||||
`INST_BR_DRET: dpi_trace("DRET");
|
||||
`INST_BR_MRET: dpi_trace("MRET");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end else if (`INST_ALU_IS_MUL(op_mod)) begin
|
||||
|
|
|
@ -46,6 +46,8 @@ module VX_warp_sched #(
|
|||
wire schedule_valid;
|
||||
wire warp_scheduled;
|
||||
|
||||
reg [`UUID_BITS-1:0] issued_instrs;
|
||||
|
||||
wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready;
|
||||
|
||||
wire tmc_active = (warp_ctl_if.tmc.tmask != 0);
|
||||
|
@ -62,12 +64,13 @@ module VX_warp_sched #(
|
|||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
barrier_masks <= 0;
|
||||
use_wspawn <= 0;
|
||||
stalled_warps <= 0;
|
||||
barrier_masks <= '0;
|
||||
use_wspawn <= '0;
|
||||
stalled_warps <= '0;
|
||||
warp_pcs <= '0;
|
||||
active_warps <= '0;
|
||||
thread_masks <= '0;
|
||||
issued_instrs <= '0;
|
||||
|
||||
// activate first warp
|
||||
warp_pcs[0] <= `STARTUP_ADDR;
|
||||
|
@ -117,6 +120,8 @@ module VX_warp_sched #(
|
|||
if (use_wspawn[schedule_wid]) begin
|
||||
thread_masks[schedule_wid] <= 1;
|
||||
end
|
||||
|
||||
issued_instrs <= issued_instrs + 1;
|
||||
end
|
||||
|
||||
if (ifetch_req_fire) begin
|
||||
|
@ -223,20 +228,23 @@ module VX_warp_sched #(
|
|||
|
||||
assign warp_scheduled = schedule_valid && ~stall_out;
|
||||
|
||||
wire [`UUID_BITS-1:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + `UUID_BITS'(CORE_ID);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NUM_THREADS + 32 + `NW_BITS),
|
||||
.DATAW (1 + `UUID_BITS + `NUM_THREADS + 32 + `NW_BITS),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({schedule_valid, schedule_tmask, schedule_pc, schedule_wid}),
|
||||
.data_out ({ifetch_req_if.valid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
|
||||
.data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}),
|
||||
.data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
|
||||
);
|
||||
|
||||
assign busy = (active_warps != 0);
|
||||
|
||||
`SCOPE_ASSIGN (wsched_scheduled, warp_scheduled);
|
||||
`SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid);
|
||||
`SCOPE_ASSIGN (wsched_active_warps, active_warps);
|
||||
`SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps);
|
||||
`SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid);
|
||||
|
|
|
@ -23,17 +23,9 @@ module VX_writeback #(
|
|||
|
||||
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
localparam NUM_RSPS = 5;
|
||||
`else
|
||||
localparam NUM_RSPS = 4;
|
||||
`endif
|
||||
`else
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
localparam NUM_RSPS = 4;
|
||||
`else
|
||||
localparam NUM_RSPS = 3;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
wire wb_valid;
|
||||
|
@ -50,9 +42,7 @@ module VX_writeback #(
|
|||
wire stall;
|
||||
|
||||
assign rsp_valid = {
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
gpu_commit_if.valid && gpu_commit_if.wb,
|
||||
`endif
|
||||
csr_commit_if.valid && csr_commit_if.wb,
|
||||
alu_commit_if.valid && alu_commit_if.wb,
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -62,9 +52,7 @@ module VX_writeback #(
|
|||
};
|
||||
|
||||
assign rsp_data = {
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
|
||||
`endif
|
||||
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
|
||||
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -76,7 +64,8 @@ module VX_writeback #(
|
|||
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_RSPS),
|
||||
.DATAW (DATAW),
|
||||
.TYPE ("P")
|
||||
.BUFFERED (1),
|
||||
.TYPE ("R")
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -88,28 +77,17 @@ module VX_writeback #(
|
|||
.ready_out (~stall)
|
||||
);
|
||||
|
||||
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
|
||||
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb;
|
||||
assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb;
|
||||
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
|
||||
`else
|
||||
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
|
||||
`else
|
||||
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
|
||||
`endif
|
||||
`else
|
||||
assign gpu_commit_if.ready = 1;
|
||||
`endif
|
||||
|
||||
assign stall = ~writeback_if.ready && writeback_if.valid;
|
||||
|
||||
|
|
|
@ -124,7 +124,8 @@ module VX_to_mem #(
|
|||
end
|
||||
end
|
||||
assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in;
|
||||
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), ("out-of-order memory reponse! cur=%d, expected=%d", mem_rsp_tag_in_w, mem_rsp_tag_in))
|
||||
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in),
|
||||
("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_w, mem_rsp_tag_in))
|
||||
|
||||
wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};
|
||||
|
||||
|
|
73
hw/rtl/cache/VX_bank.sv
vendored
73
hw/rtl/cache/VX_bank.sv
vendored
|
@ -33,9 +33,6 @@ module VX_bank #(
|
|||
// core request tag size
|
||||
parameter CORE_TAG_WIDTH = 1,
|
||||
|
||||
// size of tag id in core request tag
|
||||
parameter CORE_TAG_ID_BITS = 0,
|
||||
|
||||
// bank offset from beginning of index range
|
||||
parameter BANK_ADDR_OFFSET = 0,
|
||||
|
||||
|
@ -51,7 +48,6 @@ module VX_bank #(
|
|||
output wire perf_read_misses,
|
||||
output wire perf_write_misses,
|
||||
output wire perf_mshr_stalls,
|
||||
output wire perf_pipe_stalls,
|
||||
`endif
|
||||
|
||||
// Core Request
|
||||
|
@ -96,14 +92,9 @@ module VX_bank #(
|
|||
input wire [`LINE_SELECT_BITS-1:0] flush_addr
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_TAG_ID_BITS)
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
wire [31:0] debug_pc_sel, debug_pc_st0, debug_pc_st1;
|
||||
wire [`NW_BITS-1:0] debug_wid_sel, debug_wid_st0, debug_wid_st1;
|
||||
wire [`DBG_CACHE_REQ_IDW-1:0] req_id_sel, req_id_st0, req_id_st1;
|
||||
`IGNORE_UNUSED_END
|
||||
`endif
|
||||
|
||||
wire [NUM_PORTS-1:0] creq_pmask;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel;
|
||||
|
@ -197,13 +188,7 @@ module VX_bank #(
|
|||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
wire creq_fire = creq_valid && creq_ready;
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_wid_sel, debug_pc_sel} = mshr_enable ? mshr_tag[0][`CACHE_REQ_INFO_RNG] : creq_tag[0][`CACHE_REQ_INFO_RNG];
|
||||
end else begin
|
||||
assign {debug_wid_sel, debug_pc_sel} = 0;
|
||||
end
|
||||
`endif
|
||||
assign req_id_sel = mshr_enable ? mshr_tag[0][`CACHE_REQ_ID_RNG] : creq_tag[0][`CACHE_REQ_ID_RNG];
|
||||
|
||||
wire [`CACHE_LINE_WIDTH-1:0] wdata_sel;
|
||||
assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data;
|
||||
|
@ -237,13 +222,7 @@ module VX_bank #(
|
|||
.data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0})
|
||||
);
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_wid_st0, debug_pc_st0} = tag_st0[0][`CACHE_REQ_INFO_RNG];
|
||||
end else begin
|
||||
assign {debug_wid_st0, debug_pc_st0} = 0;
|
||||
end
|
||||
`endif
|
||||
assign req_id_st0 = tag_st0[0][`CACHE_REQ_ID_RNG];
|
||||
|
||||
wire do_fill_st0 = valid_st0 && is_fill_st0;
|
||||
wire do_flush_st0 = valid_st0 && is_flush_st0;
|
||||
|
@ -263,11 +242,9 @@ module VX_bank #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
.debug_pc (debug_pc_st0),
|
||||
.debug_wid (debug_wid_st0),
|
||||
`endif
|
||||
.stall (crsq_stall),
|
||||
.req_id (req_id_st0),
|
||||
|
||||
.stall (crsq_stall),
|
||||
|
||||
// read/Fill
|
||||
.lookup (do_lookup_st0),
|
||||
|
@ -293,13 +270,7 @@ module VX_bank #(
|
|||
.data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1})
|
||||
);
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_wid_st1, debug_pc_st1} = tag_st1[0][`CACHE_REQ_INFO_RNG];
|
||||
end else begin
|
||||
assign {debug_wid_st1, debug_pc_st1} = 0;
|
||||
end
|
||||
`endif
|
||||
assign req_id_st1 = tag_st1[0][`CACHE_REQ_ID_RNG];
|
||||
|
||||
wire do_read_st0 = valid_st0 && is_read_st0;
|
||||
wire do_read_st1 = valid_st1 && is_read_st1;
|
||||
|
@ -323,10 +294,8 @@ module VX_bank #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
.debug_pc (debug_pc_st1),
|
||||
.debug_wid (debug_wid_st1),
|
||||
`endif
|
||||
.req_id (req_id_st1),
|
||||
|
||||
.stall (crsq_stall),
|
||||
|
||||
.read (do_read_st1 || do_mshr_st1),
|
||||
|
@ -372,14 +341,9 @@ module VX_bank #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
.deq_debug_pc (debug_pc_sel),
|
||||
.deq_debug_wid (debug_wid_sel),
|
||||
.lkp_debug_pc (debug_pc_st0),
|
||||
.lkp_debug_wid (debug_wid_st0),
|
||||
.rel_debug_pc (debug_pc_st1),
|
||||
.rel_debug_wid (debug_wid_st1),
|
||||
`endif
|
||||
.deq_req_id (req_id_sel),
|
||||
.lkp_req_id (req_id_st0),
|
||||
.rel_req_id (req_id_st1),
|
||||
|
||||
// allocate
|
||||
.allocate_valid (mshr_allocate),
|
||||
|
@ -505,7 +469,6 @@ module VX_bank #(
|
|||
`ifdef PERF_ENABLE
|
||||
assign perf_read_misses = do_read_st1 && miss_st1;
|
||||
assign perf_write_misses = do_write_st1 && miss_st1;
|
||||
assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full;
|
||||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
|
@ -525,22 +488,22 @@ module VX_bank #(
|
|||
dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data);
|
||||
end
|
||||
if (mshr_fire) begin
|
||||
dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, debug_wid_sel, debug_pc_sel);
|
||||
dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel);
|
||||
end
|
||||
if (creq_fire) begin
|
||||
if (creq_rw)
|
||||
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, debug_wid_sel, debug_pc_sel);
|
||||
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel);
|
||||
else
|
||||
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, debug_wid_sel, debug_pc_sel);
|
||||
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel);
|
||||
end
|
||||
if (crsq_fire) begin
|
||||
dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, debug_wid_st1, debug_pc_st1);
|
||||
dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1);
|
||||
end
|
||||
if (mreq_push) begin
|
||||
if (is_write_st1)
|
||||
dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, debug_wid_st1, debug_pc_st1);
|
||||
dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1);
|
||||
else
|
||||
dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, debug_wid_st1, debug_pc_st1);
|
||||
dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
247
hw/rtl/cache/VX_cache.sv
vendored
247
hw/rtl/cache/VX_cache.sv
vendored
|
@ -102,7 +102,6 @@ module VX_cache #(
|
|||
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank;
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -219,37 +218,37 @@ module VX_cache #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Core request
|
||||
wire [NUM_REQS-1:0] core_req_valid_nc;
|
||||
wire [NUM_REQS-1:0] core_req_rw_nc;
|
||||
wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_nc;
|
||||
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_nc;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_nc;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_nc;
|
||||
wire [NUM_REQS-1:0] core_req_ready_nc;
|
||||
wire [NUM_REQS-1:0] core_req_valid_c;
|
||||
wire [NUM_REQS-1:0] core_req_rw_c;
|
||||
wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_c;
|
||||
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_c;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_c;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_c;
|
||||
wire [NUM_REQS-1:0] core_req_ready_c;
|
||||
|
||||
// Core response
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_nc;
|
||||
wire [NUM_REQS-1:0] core_rsp_tmask_nc;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_nc;
|
||||
wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_nc;
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_nc;
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_c;
|
||||
wire [NUM_REQS-1:0] core_rsp_tmask_c;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_c;
|
||||
wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_c;
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_c;
|
||||
|
||||
// Memory request
|
||||
wire mem_req_valid_nc;
|
||||
wire mem_req_rw_nc;
|
||||
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc;
|
||||
wire [NUM_PORTS-1:0] mem_req_pmask_nc;
|
||||
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc;
|
||||
wire mem_req_ready_nc;
|
||||
wire mem_req_valid_c;
|
||||
wire mem_req_rw_c;
|
||||
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_c;
|
||||
wire [NUM_PORTS-1:0] mem_req_pmask_c;
|
||||
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_c;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_c;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_c;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_c;
|
||||
wire mem_req_ready_c;
|
||||
|
||||
// Memory response
|
||||
wire mem_rsp_valid_nc;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_nc;
|
||||
wire mem_rsp_ready_nc;
|
||||
wire mem_rsp_valid_c;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_c;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c;
|
||||
wire mem_rsp_ready_c;
|
||||
|
||||
if (NC_ENABLE) begin
|
||||
VX_nc_bypass #(
|
||||
|
@ -280,20 +279,20 @@ module VX_cache #(
|
|||
.core_req_ready_in (core_req_ready),
|
||||
|
||||
// Core request out
|
||||
.core_req_valid_out (core_req_valid_nc),
|
||||
.core_req_rw_out (core_req_rw_nc),
|
||||
.core_req_byteen_out(core_req_byteen_nc),
|
||||
.core_req_addr_out (core_req_addr_nc),
|
||||
.core_req_data_out (core_req_data_nc),
|
||||
.core_req_tag_out (core_req_tag_nc),
|
||||
.core_req_ready_out (core_req_ready_nc),
|
||||
.core_req_valid_out (core_req_valid_c),
|
||||
.core_req_rw_out (core_req_rw_c),
|
||||
.core_req_byteen_out(core_req_byteen_c),
|
||||
.core_req_addr_out (core_req_addr_c),
|
||||
.core_req_data_out (core_req_data_c),
|
||||
.core_req_tag_out (core_req_tag_c),
|
||||
.core_req_ready_out (core_req_ready_c),
|
||||
|
||||
// Core response in
|
||||
.core_rsp_valid_in (core_rsp_valid_nc),
|
||||
.core_rsp_tmask_in (core_rsp_tmask_nc),
|
||||
.core_rsp_data_in (core_rsp_data_nc),
|
||||
.core_rsp_tag_in (core_rsp_tag_nc),
|
||||
.core_rsp_ready_in (core_rsp_ready_nc),
|
||||
.core_rsp_valid_in (core_rsp_valid_c),
|
||||
.core_rsp_tmask_in (core_rsp_tmask_c),
|
||||
.core_rsp_data_in (core_rsp_data_c),
|
||||
.core_rsp_tag_in (core_rsp_tag_c),
|
||||
.core_rsp_ready_in (core_rsp_ready_c),
|
||||
|
||||
// Core response out
|
||||
.core_rsp_valid_out (core_rsp_valid_sb),
|
||||
|
@ -303,15 +302,15 @@ module VX_cache #(
|
|||
.core_rsp_ready_out (core_rsp_ready_sb),
|
||||
|
||||
// Memory request in
|
||||
.mem_req_valid_in (mem_req_valid_nc),
|
||||
.mem_req_rw_in (mem_req_rw_nc),
|
||||
.mem_req_addr_in (mem_req_addr_nc),
|
||||
.mem_req_pmask_in (mem_req_pmask_nc),
|
||||
.mem_req_byteen_in (mem_req_byteen_nc),
|
||||
.mem_req_wsel_in (mem_req_wsel_nc),
|
||||
.mem_req_data_in (mem_req_data_nc),
|
||||
.mem_req_tag_in (mem_req_tag_nc),
|
||||
.mem_req_ready_in (mem_req_ready_nc),
|
||||
.mem_req_valid_in (mem_req_valid_c),
|
||||
.mem_req_rw_in (mem_req_rw_c),
|
||||
.mem_req_addr_in (mem_req_addr_c),
|
||||
.mem_req_pmask_in (mem_req_pmask_c),
|
||||
.mem_req_byteen_in (mem_req_byteen_c),
|
||||
.mem_req_wsel_in (mem_req_wsel_c),
|
||||
.mem_req_data_in (mem_req_data_c),
|
||||
.mem_req_tag_in (mem_req_tag_c),
|
||||
.mem_req_ready_in (mem_req_ready_c),
|
||||
|
||||
// Memory request out
|
||||
.mem_req_valid_out (mem_req_valid_sb),
|
||||
|
@ -331,40 +330,40 @@ module VX_cache #(
|
|||
.mem_rsp_ready_in (mem_rsp_ready),
|
||||
|
||||
// Memory response out
|
||||
.mem_rsp_valid_out (mem_rsp_valid_nc),
|
||||
.mem_rsp_data_out (mem_rsp_data_nc),
|
||||
.mem_rsp_tag_out (mem_rsp_tag_nc),
|
||||
.mem_rsp_ready_out (mem_rsp_ready_nc)
|
||||
.mem_rsp_valid_out (mem_rsp_valid_c),
|
||||
.mem_rsp_data_out (mem_rsp_data_c),
|
||||
.mem_rsp_tag_out (mem_rsp_tag_c),
|
||||
.mem_rsp_ready_out (mem_rsp_ready_c)
|
||||
);
|
||||
end else begin
|
||||
assign core_req_valid_nc = core_req_valid;
|
||||
assign core_req_rw_nc = core_req_rw;
|
||||
assign core_req_addr_nc = core_req_addr;
|
||||
assign core_req_byteen_nc = core_req_byteen;
|
||||
assign core_req_data_nc = core_req_data;
|
||||
assign core_req_tag_nc = core_req_tag;
|
||||
assign core_req_ready = core_req_ready_nc;
|
||||
assign core_req_valid_c = core_req_valid;
|
||||
assign core_req_rw_c = core_req_rw;
|
||||
assign core_req_addr_c = core_req_addr;
|
||||
assign core_req_byteen_c = core_req_byteen;
|
||||
assign core_req_data_c = core_req_data;
|
||||
assign core_req_tag_c = core_req_tag;
|
||||
assign core_req_ready = core_req_ready_c;
|
||||
|
||||
assign core_rsp_valid_sb = core_rsp_valid_nc;
|
||||
assign core_rsp_tmask_sb = core_rsp_tmask_nc;
|
||||
assign core_rsp_data_sb = core_rsp_data_nc;
|
||||
assign core_rsp_tag_sb = core_rsp_tag_nc;
|
||||
assign core_rsp_ready_nc = core_rsp_ready_sb;
|
||||
assign core_rsp_valid_sb = core_rsp_valid_c;
|
||||
assign core_rsp_tmask_sb = core_rsp_tmask_c;
|
||||
assign core_rsp_data_sb = core_rsp_data_c;
|
||||
assign core_rsp_tag_sb = core_rsp_tag_c;
|
||||
assign core_rsp_ready_c = core_rsp_ready_sb;
|
||||
|
||||
assign mem_req_valid_sb = mem_req_valid_nc;
|
||||
assign mem_req_addr_sb = mem_req_addr_nc;
|
||||
assign mem_req_rw_p = mem_req_rw_nc;
|
||||
assign mem_req_pmask_p = mem_req_pmask_nc;
|
||||
assign mem_req_byteen_p = mem_req_byteen_nc;
|
||||
assign mem_req_wsel_p = mem_req_wsel_nc;
|
||||
assign mem_req_data_p = mem_req_data_nc;
|
||||
assign mem_req_tag_sb = mem_req_tag_nc;
|
||||
assign mem_req_ready_nc = mem_req_ready_sb;
|
||||
assign mem_req_valid_sb = mem_req_valid_c;
|
||||
assign mem_req_addr_sb = mem_req_addr_c;
|
||||
assign mem_req_rw_p = mem_req_rw_c;
|
||||
assign mem_req_pmask_p = mem_req_pmask_c;
|
||||
assign mem_req_byteen_p = mem_req_byteen_c;
|
||||
assign mem_req_wsel_p = mem_req_wsel_c;
|
||||
assign mem_req_data_p = mem_req_data_c;
|
||||
assign mem_req_tag_sb = mem_req_tag_c;
|
||||
assign mem_req_ready_c = mem_req_ready_sb;
|
||||
|
||||
assign mem_rsp_valid_nc = mem_rsp_valid;
|
||||
assign mem_rsp_data_nc = mem_rsp_data;
|
||||
assign mem_rsp_tag_nc = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_rsp_ready_nc;
|
||||
assign mem_rsp_valid_c = mem_rsp_valid;
|
||||
assign mem_rsp_data_c = mem_rsp_data;
|
||||
assign mem_rsp_tag_c = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_rsp_ready_c;
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -383,15 +382,15 @@ module VX_cache #(
|
|||
) mem_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (mrsq_reset),
|
||||
.ready_in (mem_rsp_ready_nc),
|
||||
.valid_in (mem_rsp_valid_nc),
|
||||
.data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}),
|
||||
.ready_in (mem_rsp_ready_c),
|
||||
.valid_in (mem_rsp_valid_c),
|
||||
.data_in ({mem_rsp_tag_c, mem_rsp_data_c}),
|
||||
.data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}),
|
||||
.ready_out (mrsq_out_ready),
|
||||
.valid_out (mrsq_out_valid)
|
||||
);
|
||||
|
||||
`UNUSED_VAR (mem_rsp_tag_nc)
|
||||
`UNUSED_VAR (mem_rsp_tag_c)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -464,13 +463,13 @@ module VX_cache #(
|
|||
`ifdef PERF_ENABLE
|
||||
.bank_stalls(perf_cache_if.bank_stalls),
|
||||
`endif
|
||||
.core_req_valid (core_req_valid_nc),
|
||||
.core_req_rw (core_req_rw_nc),
|
||||
.core_req_addr (core_req_addr_nc),
|
||||
.core_req_byteen (core_req_byteen_nc),
|
||||
.core_req_data (core_req_data_nc),
|
||||
.core_req_tag (core_req_tag_nc),
|
||||
.core_req_ready (core_req_ready_nc),
|
||||
.core_req_valid (core_req_valid_c),
|
||||
.core_req_rw (core_req_rw_c),
|
||||
.core_req_addr (core_req_addr_c),
|
||||
.core_req_byteen (core_req_byteen_c),
|
||||
.core_req_data (core_req_data_c),
|
||||
.core_req_tag (core_req_tag_c),
|
||||
.core_req_ready (core_req_ready_c),
|
||||
.per_bank_core_req_valid (per_bank_core_req_valid),
|
||||
.per_bank_core_req_pmask (per_bank_core_req_pmask),
|
||||
.per_bank_core_req_rw (per_bank_core_req_rw),
|
||||
|
@ -580,8 +579,7 @@ module VX_cache #(
|
|||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.CORE_TAG_WIDTH (CORE_TAG_X_WIDTH),
|
||||
.CORE_TAG_ID_BITS (CORE_TAG_ID_X_BITS),
|
||||
.CORE_TAG_WIDTH (CORE_TAG_X_WIDTH),
|
||||
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
|
||||
) bank (
|
||||
`SCOPE_BIND_VX_cache_bank(i)
|
||||
|
@ -593,7 +591,6 @@ module VX_cache #(
|
|||
.perf_read_misses (perf_read_miss_per_bank[i]),
|
||||
.perf_write_misses (perf_write_miss_per_bank[i]),
|
||||
.perf_mshr_stalls (perf_mshr_stall_per_bank[i]),
|
||||
.perf_pipe_stalls (perf_pipe_stall_per_bank[i]),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
|
@ -656,11 +653,11 @@ module VX_cache #(
|
|||
.per_bank_core_rsp_tag (per_bank_core_rsp_tag),
|
||||
.per_bank_core_rsp_tid (per_bank_core_rsp_tid),
|
||||
.per_bank_core_rsp_ready (per_bank_core_rsp_ready),
|
||||
.core_rsp_valid (core_rsp_valid_nc),
|
||||
.core_rsp_tmask (core_rsp_tmask_nc),
|
||||
.core_rsp_tag (core_rsp_tag_nc),
|
||||
.core_rsp_data (core_rsp_data_nc),
|
||||
.core_rsp_ready (core_rsp_ready_nc)
|
||||
.core_rsp_valid (core_rsp_valid_c),
|
||||
.core_rsp_tmask (core_rsp_tmask_c),
|
||||
.core_rsp_tag (core_rsp_tag_c),
|
||||
.core_rsp_data (core_rsp_data_c),
|
||||
.core_rsp_ready (core_rsp_ready_c)
|
||||
);
|
||||
|
||||
wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in;
|
||||
|
@ -682,15 +679,15 @@ module VX_cache #(
|
|||
.valid_in (per_bank_mem_req_valid),
|
||||
.data_in (data_in),
|
||||
.ready_in (per_bank_mem_req_ready),
|
||||
.valid_out (mem_req_valid_nc),
|
||||
.data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_pmask_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}),
|
||||
.ready_out (mem_req_ready_nc)
|
||||
.valid_out (mem_req_valid_c),
|
||||
.data_out ({mem_req_addr_c, mem_req_id, mem_req_rw_c, mem_req_pmask_c, mem_req_byteen_c, mem_req_wsel_c, mem_req_data_c}),
|
||||
.ready_out (mem_req_ready_c)
|
||||
);
|
||||
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'(mem_req_id);
|
||||
assign mem_req_tag_c = MEM_TAG_IN_WIDTH'(mem_req_id);
|
||||
end else begin
|
||||
assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_nc), mem_req_id});
|
||||
assign mem_req_tag_c = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_c), mem_req_id});
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
@ -698,12 +695,21 @@ module VX_cache #(
|
|||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid_c & core_req_ready_c & ~core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid_c & core_req_ready_c & core_req_rw;
|
||||
|
||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask);
|
||||
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
|
||||
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
|
||||
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
|
||||
|
||||
if (CORE_TAG_ID_BITS != 0) begin
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}};
|
||||
|
@ -713,23 +719,14 @@ module VX_cache #(
|
|||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
|
||||
end
|
||||
|
||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_pipe_stall_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
|
||||
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
|
||||
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
|
||||
`POP_COUNT(perf_pipe_stall_per_cycle, perf_pipe_stall_per_bank);
|
||||
wire perf_mem_stall_per_cycle = mem_req_valid & ~mem_req_ready;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_read_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -739,16 +736,16 @@ module VX_cache #(
|
|||
perf_read_misses <= 0;
|
||||
perf_write_misses <= 0;
|
||||
perf_mshr_stalls <= 0;
|
||||
perf_pipe_stalls <= 0;
|
||||
perf_mem_stalls <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
|
||||
perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -757,7 +754,7 @@ module VX_cache #(
|
|||
assign perf_cache_if.read_misses = perf_read_misses;
|
||||
assign perf_cache_if.write_misses = perf_write_misses;
|
||||
assign perf_cache_if.mshr_stalls = perf_mshr_stalls;
|
||||
assign perf_cache_if.pipe_stalls = perf_pipe_stalls;
|
||||
assign perf_cache_if.mem_stalls = perf_mem_stalls;
|
||||
assign perf_cache_if.crsp_stalls = perf_crsp_stalls;
|
||||
`endif
|
||||
|
||||
|
|
16
hw/rtl/cache/VX_cache_define.vh
vendored
16
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -3,9 +3,8 @@
|
|||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
`include "VX_define.vh"
|
||||
`endif
|
||||
// cache request identifier
|
||||
`define DBG_CACHE_REQ_IDW 44
|
||||
|
||||
`define REQS_BITS `LOG2UP(NUM_REQS)
|
||||
|
||||
|
@ -24,7 +23,7 @@
|
|||
|
||||
`define WORD_ADDR_WIDTH (32-`CLOG2(WORD_SIZE))
|
||||
`define MEM_ADDR_WIDTH (32-`CLOG2(CACHE_LINE_SIZE))
|
||||
`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`BANK_SELECT_BITS)
|
||||
`define LINE_ADDR_WIDTH (`MEM_ADDR_WIDTH-`CLOG2(NUM_BANKS))
|
||||
|
||||
// Word select
|
||||
`define WORD_SELECT_BITS `CLOG2(`WORDS_PER_LINE)
|
||||
|
@ -46,14 +45,13 @@
|
|||
`define TAG_SELECT_ADDR_START (1+`LINE_SELECT_ADDR_END)
|
||||
`define TAG_SELECT_ADDR_END (`WORD_ADDR_WIDTH-1)
|
||||
|
||||
`define BANK_SELECT_ADDR(x) x[`BANK_SELECT_ADDR_END : `BANK_SELECT_ADDR_START]
|
||||
|
||||
`define LINE_SELECT_ADDR0(x) x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START]
|
||||
`define LINE_SELECT_ADDRX(x) {x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START], x[`BANK_SELECT_ADDR_START-1 : 1+`WORD_SELECT_ADDR_END]}
|
||||
`define SELECT_BANK_ID(x) x[`BANK_SELECT_ADDR_END : `BANK_SELECT_ADDR_START]
|
||||
`define SELECT_LINE_ADDR0(x) x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START]
|
||||
`define SELECT_LINE_ADDRX(x) {x[`WORD_ADDR_WIDTH-1 : `LINE_SELECT_ADDR_START], x[`BANK_SELECT_ADDR_START-1 : 1+`WORD_SELECT_ADDR_END]}
|
||||
|
||||
`define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS]
|
||||
|
||||
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW)
|
||||
`define CACHE_REQ_ID_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_IDW)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
11
hw/rtl/cache/VX_core_req_bank_sel.sv
vendored
11
hw/rtl/cache/VX_core_req_bank_sel.sv
vendored
|
@ -57,16 +57,16 @@ module VX_core_req_bank_sel #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
if (BANK_ADDR_OFFSET == 0) begin
|
||||
assign core_req_line_addr[i] = `LINE_SELECT_ADDR0(core_req_addr[i]);
|
||||
assign core_req_line_addr[i] = `SELECT_LINE_ADDR0(core_req_addr[i]);
|
||||
end else begin
|
||||
assign core_req_line_addr[i] = `LINE_SELECT_ADDRX(core_req_addr[i]);
|
||||
assign core_req_line_addr[i] = `SELECT_LINE_ADDRX(core_req_addr[i]);
|
||||
end
|
||||
assign core_req_wsel[i] = core_req_addr[i][`UP(`WORD_SELECT_BITS)-1:0];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (NUM_BANKS > 1) begin
|
||||
assign core_req_bid[i] = `BANK_SELECT_ADDR(core_req_addr[i]);
|
||||
assign core_req_bid[i] = `SELECT_BANK_ID(core_req_addr[i]);
|
||||
end else begin
|
||||
assign core_req_bid[i] = 0;
|
||||
end
|
||||
|
@ -88,6 +88,7 @@ module VX_core_req_bank_sel #(
|
|||
if (NUM_PORTS > 1) begin
|
||||
|
||||
reg [NUM_BANKS-1:0][`LINE_ADDR_WIDTH-1:0] per_bank_line_addr_r;
|
||||
reg [NUM_BANKS-1:0] per_bank_rw_r;
|
||||
wire [NUM_REQS-1:0] core_req_line_match;
|
||||
|
||||
always @(*) begin
|
||||
|
@ -95,12 +96,14 @@ module VX_core_req_bank_sel #(
|
|||
for (integer i = NUM_REQS-1; i >= 0; --i) begin
|
||||
if (core_req_valid[i]) begin
|
||||
per_bank_line_addr_r[core_req_bid[i]] = core_req_line_addr[i];
|
||||
per_bank_rw_r[core_req_bid[i]] = core_req_rw[i];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]]);
|
||||
assign core_req_line_match[i] = (core_req_line_addr[i] == per_bank_line_addr_r[core_req_bid[i]])
|
||||
&& (core_req_rw[i] == per_bank_rw_r[core_req_bid[i]]);
|
||||
end
|
||||
|
||||
if (NUM_PORTS < NUM_REQS) begin
|
||||
|
|
9
hw/rtl/cache/VX_data_access.sv
vendored
9
hw/rtl/cache/VX_data_access.sv
vendored
|
@ -21,12 +21,9 @@ module VX_data_access #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
input wire[31:0] debug_pc,
|
||||
input wire[`NW_BITS-1:0] debug_wid,
|
||||
input wire[`DBG_CACHE_REQ_IDW-1:0] req_id,
|
||||
`IGNORE_UNUSED_END
|
||||
`endif
|
||||
|
||||
input wire stall,
|
||||
|
||||
|
@ -125,10 +122,10 @@ module VX_data_access #(
|
|||
dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data);
|
||||
end
|
||||
if (read && ~stall) begin
|
||||
dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, read_data);
|
||||
dpi_trace("%d: cache%0d:%0d data-read: addr=%0h, blk_addr=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, read_data, req_id);
|
||||
end
|
||||
if (write && ~stall) begin
|
||||
dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, wid=%0d, PC=%0h, byteen=%b, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, byteen, line_addr, write_data);
|
||||
dpi_trace("%d: cache%0d:%0d data-write: addr=%0h, byteen=%b, blk_addr=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), byteen, line_addr, write_data, req_id);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
30
hw/rtl/cache/VX_miss_resrv.sv
vendored
30
hw/rtl/cache/VX_miss_resrv.sv
vendored
|
@ -25,16 +25,11 @@ module VX_miss_resrv #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
input wire[31:0] deq_debug_pc,
|
||||
input wire[`NW_BITS-1:0] deq_debug_wid,
|
||||
input wire[31:0] lkp_debug_pc,
|
||||
input wire[`NW_BITS-1:0] lkp_debug_wid,
|
||||
input wire[31:0] rel_debug_pc,
|
||||
input wire[`NW_BITS-1:0] rel_debug_wid,
|
||||
input wire[`DBG_CACHE_REQ_IDW-1:0] deq_req_id,
|
||||
input wire[`DBG_CACHE_REQ_IDW-1:0] lkp_req_id,
|
||||
input wire[`DBG_CACHE_REQ_IDW-1:0] rel_req_id,
|
||||
`IGNORE_UNUSED_END
|
||||
`endif
|
||||
|
||||
// allocate
|
||||
input wire allocate_valid,
|
||||
|
@ -206,23 +201,22 @@ module VX_miss_resrv #(
|
|||
always @(posedge clk) begin
|
||||
if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin
|
||||
if (allocate_fire)
|
||||
dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_debug_wid, deq_debug_pc);
|
||||
dpi_trace("%d: cache%0d:%0d mshr-allocate: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(allocate_addr, BANK_ID), allocate_id, deq_req_id);
|
||||
if (fill_valid)
|
||||
dpi_trace("%d: cache%0d:%0d mshr-fill: addr=%0h, id=%0d, addr=%0h\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(addr_table[fill_id], BANK_ID), fill_id, `LINE_TO_BYTE_ADDR(fill_addr, BANK_ID));
|
||||
if (dequeue_fire)
|
||||
dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_debug_wid, deq_debug_pc);
|
||||
dpi_trace("%d: cache%0d:%0d mshr-dequeue: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_id);
|
||||
if (lookup_replay)
|
||||
dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id);
|
||||
dpi_trace("%d: cache%0d:%0d mshr-replay: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lkp_req_id);
|
||||
if (lookup_valid)
|
||||
dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_debug_wid, lkp_debug_pc);
|
||||
dpi_trace("%d: cache%0d:%0d mshr-lookup: addr=%0h, id=%0d, match=%b (#%0d)\n", $time, CACHE_ID, BANK_ID,
|
||||
`LINE_TO_BYTE_ADDR(lookup_addr, BANK_ID), lookup_id, lookup_match, lkp_req_id);
|
||||
if (release_valid)
|
||||
dpi_trace("%d: cache%0d:%0d mshr-release id=%0d, wid=%0d, PC=%0h\n", $time, CACHE_ID, BANK_ID,
|
||||
release_id, rel_debug_wid, rel_debug_pc);
|
||||
dpi_trace("%d: cache%0d:%0d mshr-release id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, release_id, rel_req_id);
|
||||
dpi_trace("%d: cache%0d:%0d mshr-table", $time, CACHE_ID, BANK_ID);
|
||||
for (integer i = 0; i < MSHR_SIZE; ++i) begin
|
||||
if (valid_table[i]) begin
|
||||
|
|
54
hw/rtl/cache/VX_shared_mem.sv
vendored
54
hw/rtl/cache/VX_shared_mem.sv
vendored
|
@ -254,22 +254,19 @@ module VX_shared_mem #(
|
|||
.ready_out (core_rsp_ready)
|
||||
);
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
wire [NUM_BANKS-1:0][31:0] debug_pc_st0, debug_pc_st1;
|
||||
wire [NUM_BANKS-1:0][`NW_BITS-1:0] debug_wid_st0, debug_wid_st1;
|
||||
wire [NUM_BANKS-1:0][`DBG_CACHE_REQ_IDW-1:0] req_id_st0, req_id_st1;
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_wid_st0[i], debug_pc_st0[i]} = per_bank_core_req_tag_unqual[i][`CACHE_REQ_INFO_RNG];
|
||||
assign {debug_wid_st1[i], debug_pc_st1[i]} = per_bank_core_req_tag[i][`CACHE_REQ_INFO_RNG];
|
||||
assign req_id_st0[i] = per_bank_core_req_tag_unqual[i][`CACHE_REQ_ID_RNG];
|
||||
assign req_id_st1[i] = per_bank_core_req_tag[i][`CACHE_REQ_ID_RNG];
|
||||
end else begin
|
||||
assign {debug_wid_st0[i], debug_pc_st0[i]} = 0;
|
||||
assign {debug_wid_st1[i], debug_pc_st1[i]} = 0;
|
||||
assign req_id_st0[i] = 0;
|
||||
assign req_id_st1[i] = 0;
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CACHE_BANK
|
||||
|
||||
|
@ -309,11 +306,11 @@ module VX_shared_mem #(
|
|||
for (integer i = 0; i < NUM_BANKS; ++i) begin
|
||||
if (per_bank_core_req_valid_unqual[i]) begin
|
||||
if (per_bank_core_req_rw_unqual[i]) begin
|
||||
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], debug_wid_st0[i], debug_pc_st0[i]);
|
||||
dpi_trace("%d: smem%0d:%0d core-wr-req: addr=%0h, tag=%0h, byteen=%b, data=%0h (#%0d)\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], per_bank_core_req_data_unqual[i], req_id_st0[i]);
|
||||
end else begin
|
||||
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, byteen=%b, wid=%0d, PC=%0h\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], per_bank_core_req_byteen_unqual[i], debug_wid_st0[i], debug_pc_st0[i]);
|
||||
dpi_trace("%d: smem%0d:%0d core-rd-req: addr=%0h, tag=%0h (#%0d)\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr_unqual[i], i), per_bank_core_req_tag_unqual[i], req_id_st0[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -322,11 +319,11 @@ module VX_shared_mem #(
|
|||
for (integer i = 0; i < NUM_BANKS; ++i) begin
|
||||
if (per_bank_core_req_valid[i]) begin
|
||||
if (per_bank_core_req_rw[i]) begin
|
||||
dpi_trace("%d: cache%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_req_data[i], debug_wid_st1[i], debug_pc_st1[i]);
|
||||
dpi_trace("%d: smem%0d:%0d core-wr-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_data[i], req_id_st1[i]);
|
||||
end else begin
|
||||
dpi_trace("%d: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i], debug_wid_st1[i], debug_pc_st1[i]);
|
||||
dpi_trace("%d: smem%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, data=%0h (#%0d)\n",
|
||||
$time, CACHE_ID, i, `LINE_TO_BYTE_ADDR(per_bank_core_req_addr[i], i), per_bank_core_req_tag[i], per_bank_core_rsp_data[i], req_id_st1[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -338,21 +335,13 @@ module VX_shared_mem #(
|
|||
// per cycle: core_reads, core_writes
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask);
|
||||
|
||||
if (CORE_TAG_ID_BITS != 0) begin
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}};
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
|
||||
end else begin
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_valid & ~core_rsp_ready;
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
|
||||
end
|
||||
wire perf_crsp_stall_per_cycle = core_rsp_valid & ~core_rsp_ready;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
|
@ -360,13 +349,13 @@ module VX_shared_mem #(
|
|||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_core_reads <= 0;
|
||||
perf_core_writes <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
perf_core_reads <= 0;
|
||||
perf_core_writes <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -374,7 +363,8 @@ module VX_shared_mem #(
|
|||
assign perf_cache_if.writes = perf_core_writes;
|
||||
assign perf_cache_if.read_misses = '0;
|
||||
assign perf_cache_if.write_misses = '0;
|
||||
assign perf_cache_if.pipe_stalls = '0;
|
||||
assign perf_cache_if.mshr_stalls = '0;
|
||||
assign perf_cache_if.mem_stalls = '0;
|
||||
assign perf_cache_if.crsp_stalls = perf_crsp_stalls;
|
||||
`endif
|
||||
|
||||
|
|
9
hw/rtl/cache/VX_tag_access.sv
vendored
9
hw/rtl/cache/VX_tag_access.sv
vendored
|
@ -17,12 +17,9 @@ module VX_tag_access #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
input wire[31:0] debug_pc,
|
||||
input wire[`NW_BITS-1:0] debug_wid,
|
||||
input wire[`DBG_CACHE_REQ_IDW-1:0] req_id,
|
||||
`IGNORE_UNUSED_END
|
||||
`endif
|
||||
|
||||
input wire stall,
|
||||
|
||||
|
@ -71,9 +68,9 @@ module VX_tag_access #(
|
|||
end
|
||||
if (lookup && ~stall) begin
|
||||
if (tag_match) begin
|
||||
dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag);
|
||||
dpi_trace("%d: cache%0d:%0d tag-hit: addr=%0h, blk_addr=%0d, tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag, req_id);
|
||||
end else begin
|
||||
dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, wid=%0d, PC=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), debug_wid, debug_pc, line_addr, line_tag, read_tag);
|
||||
dpi_trace("%d: cache%0d:%0d tag-miss: addr=%0h, blk_addr=%0d, tag_id=%0h, old_tag_id=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag, read_tag, req_id);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1 +0,0 @@
|
|||
Subproject commit 1def7bb630ceae2ebc58921f6b5ee3e686fb6d5a
|
|
@ -5,7 +5,8 @@
|
|||
|
||||
interface VX_alu_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -24,6 +25,7 @@ interface VX_alu_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -43,6 +45,7 @@ interface VX_alu_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -5,9 +5,12 @@
|
|||
|
||||
interface VX_cmt_to_csr_if ();
|
||||
|
||||
wire valid;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] commit_size;
|
||||
|
||||
wire valid;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [$clog2(6*`NUM_THREADS+1)-1:0] commit_size;
|
||||
`else
|
||||
wire [$clog2(5*`NUM_THREADS+1)-1:0] commit_size;
|
||||
`endif
|
||||
modport master (
|
||||
output valid,
|
||||
output commit_size
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_commit_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -17,6 +18,7 @@ interface VX_commit_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -29,6 +31,7 @@ interface VX_commit_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_csr_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -20,6 +21,7 @@ interface VX_csr_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -35,6 +37,7 @@ interface VX_csr_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_decode_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -23,7 +24,8 @@ interface VX_decode_if ();
|
|||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -42,7 +44,8 @@ interface VX_decode_if ();
|
|||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_fpu_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -20,6 +21,7 @@ interface VX_fpu_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -35,6 +37,7 @@ interface VX_fpu_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
interface VX_gpu_req_if();
|
||||
|
||||
wire valid;
|
||||
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -19,11 +19,11 @@ interface VX_gpu_req_if();
|
|||
wire [`NUM_THREADS-1:0][31:0] rs3_data;
|
||||
wire [`NR_BITS-1:0] rd;
|
||||
wire wb;
|
||||
|
||||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -41,6 +41,7 @@ interface VX_gpu_req_if();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_ibuffer_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -31,6 +32,7 @@ interface VX_ibuffer_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -55,6 +57,7 @@ interface VX_ibuffer_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -5,14 +5,16 @@
|
|||
|
||||
interface VX_ifetch_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [31:0] PC;
|
||||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output valid,
|
||||
output uuid,
|
||||
output tmask,
|
||||
output wid,
|
||||
output PC,
|
||||
|
@ -20,7 +22,8 @@ interface VX_ifetch_req_if ();
|
|||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input valid,
|
||||
input uuid,
|
||||
input tmask,
|
||||
input wid,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_ifetch_rsp_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [31:0] PC;
|
||||
|
@ -13,7 +14,8 @@ interface VX_ifetch_rsp_if ();
|
|||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output valid,
|
||||
output uuid,
|
||||
output tmask,
|
||||
output wid,
|
||||
output PC,
|
||||
|
@ -22,7 +24,8 @@ interface VX_ifetch_rsp_if ();
|
|||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input valid,
|
||||
input uuid,
|
||||
input tmask,
|
||||
input wid,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_lsu_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -21,6 +22,7 @@ interface VX_lsu_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -37,6 +39,7 @@ interface VX_lsu_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -11,7 +11,7 @@ interface VX_perf_cache_if ();
|
|||
wire [`PERF_CTR_BITS-1:0] write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mshr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
|
||||
modport master (
|
||||
|
@ -21,7 +21,7 @@ interface VX_perf_cache_if ();
|
|||
output write_misses,
|
||||
output bank_stalls,
|
||||
output mshr_stalls,
|
||||
output pipe_stalls,
|
||||
output mem_stalls,
|
||||
output crsp_stalls
|
||||
);
|
||||
|
||||
|
@ -32,7 +32,7 @@ interface VX_perf_cache_if ();
|
|||
input write_misses,
|
||||
input bank_stalls,
|
||||
input mshr_stalls,
|
||||
input pipe_stalls,
|
||||
input mem_stalls,
|
||||
input crsp_stalls
|
||||
);
|
||||
|
||||
|
|
|
@ -7,68 +7,50 @@ interface VX_perf_memsys_if ();
|
|||
|
||||
wire [`PERF_CTR_BITS-1:0] icache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] smem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_bank_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] mem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_latency;
|
||||
|
||||
modport master (
|
||||
output icache_reads,
|
||||
output icache_read_misses,
|
||||
output icache_pipe_stalls,
|
||||
output icache_crsp_stalls,
|
||||
output dcache_reads,
|
||||
output dcache_writes,
|
||||
output dcache_writes,
|
||||
output dcache_read_misses,
|
||||
output dcache_write_misses,
|
||||
output dcache_bank_stalls,
|
||||
output dcache_mshr_stalls,
|
||||
output dcache_pipe_stalls,
|
||||
output dcache_crsp_stalls,
|
||||
output smem_reads,
|
||||
output smem_writes,
|
||||
output smem_bank_stalls,
|
||||
output mem_reads,
|
||||
output mem_writes,
|
||||
output mem_stalls,
|
||||
output mem_latency
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input icache_reads,
|
||||
input icache_read_misses,
|
||||
input icache_pipe_stalls,
|
||||
input icache_crsp_stalls,
|
||||
input dcache_reads,
|
||||
input dcache_writes,
|
||||
input dcache_writes,
|
||||
input dcache_read_misses,
|
||||
input dcache_write_misses,
|
||||
input dcache_bank_stalls,
|
||||
input dcache_mshr_stalls,
|
||||
input dcache_pipe_stalls,
|
||||
input dcache_crsp_stalls,
|
||||
input smem_reads,
|
||||
input smem_writes,
|
||||
input smem_bank_stalls,
|
||||
input mem_reads,
|
||||
input mem_writes,
|
||||
input mem_stalls,
|
||||
input mem_latency
|
||||
);
|
||||
|
||||
|
|
|
@ -4,18 +4,27 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
interface VX_perf_pipeline_if ();
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] csr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] alu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] branches;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] csr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] alu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] fpu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] fpu_stalls;
|
||||
`endif
|
||||
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
|
||||
|
||||
modport master (
|
||||
modport decode (
|
||||
output loads,
|
||||
output stores,
|
||||
output branches
|
||||
);
|
||||
|
||||
modport issue (
|
||||
output ibf_stalls,
|
||||
output scb_stalls,
|
||||
output lsu_stalls,
|
||||
|
@ -25,9 +34,12 @@ interface VX_perf_pipeline_if ();
|
|||
output fpu_stalls,
|
||||
`endif
|
||||
output gpu_stalls
|
||||
);
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input loads,
|
||||
input stores,
|
||||
input branches,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input lsu_stalls,
|
||||
|
|
23
hw/rtl/interfaces/VX_perf_tex_if.sv
Normal file
23
hw/rtl/interfaces/VX_perf_tex_if.sv
Normal file
|
@ -0,0 +1,23 @@
|
|||
`ifndef VX_PERF_TEX_IF
|
||||
`define VX_PERF_TEX_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_perf_tex_if ();
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] mem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_latency;
|
||||
|
||||
modport master (
|
||||
output mem_reads,
|
||||
output mem_latency
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input mem_reads,
|
||||
input mem_latency
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
|
@ -8,17 +8,20 @@ interface VX_tex_csr_if ();
|
|||
wire write_enable;
|
||||
wire [`CSR_ADDR_BITS-1:0] write_addr;
|
||||
wire [31:0] write_data;
|
||||
wire [`UUID_BITS-1:0] write_uuid;
|
||||
|
||||
modport master (
|
||||
output write_enable,
|
||||
output write_addr,
|
||||
output write_data
|
||||
output write_data,
|
||||
output write_uuid
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input write_enable,
|
||||
input write_addr,
|
||||
input write_data
|
||||
input write_data,
|
||||
input write_uuid
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_tex_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -20,6 +21,7 @@ interface VX_tex_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -33,6 +35,7 @@ interface VX_tex_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_tex_rsp_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -16,6 +17,7 @@ interface VX_tex_rsp_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -27,6 +29,7 @@ interface VX_tex_rsp_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_writeback_if ();
|
||||
|
||||
wire valid;
|
||||
wire [`UUID_BITS-1:0] uuid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [31:0] PC;
|
||||
|
@ -16,6 +17,7 @@ interface VX_writeback_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output tmask,
|
||||
output wid,
|
||||
output PC,
|
||||
|
@ -27,6 +29,7 @@ interface VX_writeback_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input tmask,
|
||||
input wid,
|
||||
input PC,
|
||||
|
|
|
@ -125,7 +125,7 @@ module VX_axi_adapter #(
|
|||
|
||||
// AXI write response channel
|
||||
`UNUSED_VAR (m_axi_bid);
|
||||
`RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("AXI response error"));
|
||||
`RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("%t: *** AXI response error", $time));
|
||||
assign m_axi_bready = 1'b1;
|
||||
|
||||
// AXI read request channel
|
||||
|
@ -144,7 +144,7 @@ module VX_axi_adapter #(
|
|||
assign mem_rsp_valid = m_axi_rvalid;
|
||||
assign mem_rsp_tag = m_axi_rid;
|
||||
assign mem_rsp_data = m_axi_rdata;
|
||||
`RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("AXI response error"));
|
||||
`RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("%t: *** AXI response error", $time));
|
||||
`UNUSED_VAR (m_axi_rlast);
|
||||
assign m_axi_rready = mem_rsp_ready;
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ module VX_index_queue #(
|
|||
assign enqueue = push;
|
||||
assign dequeue = !empty && !valid[rd_a]; // auto-remove when head is invalid
|
||||
|
||||
`RUNTIME_ASSERT(!push || !full, ("invalid inputs"));
|
||||
`RUNTIME_ASSERT(!push || !full, ("%t: *** invalid inputs", $time));
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
|
|
@ -4,12 +4,17 @@
|
|||
module VX_popcount #(
|
||||
parameter MODEL = 1,
|
||||
parameter N = 1,
|
||||
parameter LOGN = $clog2(N),
|
||||
parameter M = LOGN+1
|
||||
parameter M = $clog2(N+1)
|
||||
) (
|
||||
input wire [N-1:0] in_i,
|
||||
output wire [M-1:0] cnt_o
|
||||
);
|
||||
`ifndef SYNTHESIS
|
||||
assign cnt_o = $countones(in_i);
|
||||
`else
|
||||
`ifdef QUARTUS
|
||||
assign cnt_o = $countones(in_i);
|
||||
`else
|
||||
if (N == 1) begin
|
||||
|
||||
assign cnt_o = in_i;
|
||||
|
@ -53,6 +58,8 @@ module VX_popcount #(
|
|||
assign cnt_o = cnt_r;
|
||||
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
|
@ -30,7 +30,7 @@ module VX_skid_buffer #(
|
|||
|
||||
end else if (NOBACKPRESSURE) begin
|
||||
|
||||
`RUNTIME_ASSERT(ready_out, ("ready_out should always be asserted"))
|
||||
`RUNTIME_ASSERT(ready_out, ("%t: *** ready_out should always be asserted", $time))
|
||||
|
||||
wire stall = valid_out && ~ready_out;
|
||||
|
||||
|
|
|
@ -12,13 +12,14 @@ module VX_tex_addr #(
|
|||
|
||||
input wire req_valid,
|
||||
input wire [NUM_REQS-1:0] req_tmask,
|
||||
input wire [1:0][NUM_REQS-1:0][31:0] req_coords,
|
||||
input wire [1:0][NUM_REQS-1:0][`TEX_FXD_BITS-1:0] req_coords,
|
||||
input wire [`TEX_FORMAT_BITS-1:0] req_format,
|
||||
input wire [`TEX_FILTER_BITS-1:0] req_filter,
|
||||
input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps,
|
||||
input wire [`TEX_ADDR_BITS-1:0] req_baseaddr,
|
||||
input wire [NUM_REQS-1:0][`TEX_LOD_BITS-1:0] mip_level,
|
||||
input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff,
|
||||
input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims,
|
||||
input wire [NUM_REQS-1:0][1:0][`TEX_LOD_BITS-1:0] req_logdims,
|
||||
input wire [REQ_INFOW-1:0] req_info,
|
||||
output wire req_ready,
|
||||
|
||||
|
@ -27,31 +28,35 @@ module VX_tex_addr #(
|
|||
output wire rsp_valid,
|
||||
output wire [NUM_REQS-1:0] rsp_tmask,
|
||||
output wire [`TEX_FILTER_BITS-1:0] rsp_filter,
|
||||
output wire [`TEX_STRIDE_BITS-1:0] rsp_stride,
|
||||
output wire [`TEX_LGSTRIDE_BITS-1:0] rsp_lgstride,
|
||||
output wire [NUM_REQS-1:0][31:0] rsp_baseaddr,
|
||||
output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr,
|
||||
output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends,
|
||||
output wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] rsp_blends,
|
||||
output wire [REQ_INFOW-1:0] rsp_info,
|
||||
input wire rsp_ready
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1;
|
||||
localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS;
|
||||
localparam SCALED_X_W = (2 * `FIXED_INT);
|
||||
localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS;
|
||||
localparam SHIFT_BITS = $clog2(`TEX_FXD_FRAC+1);
|
||||
localparam PITCH_BITS = `MAX(`TEX_LOD_BITS, `TEX_LGSTRIDE_BITS) + 1;
|
||||
localparam SCALED_DIM = `TEX_FXD_FRAC + `TEX_DIM_BITS;
|
||||
localparam SCALED_X_W = `TEX_DIM_BITS + `TEX_BLEND_FRAC;
|
||||
localparam OFFSET_U_W = `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX;
|
||||
localparam OFFSET_V_W = `TEX_DIM_BITS + `TEX_DIM_BITS + `TEX_LGSTRIDE_MAX;
|
||||
|
||||
wire valid_s0;
|
||||
wire [NUM_REQS-1:0] tmask_s0;
|
||||
wire [`TEX_FILTER_BITS-1:0] filter_s0;
|
||||
wire [REQ_INFOW-1:0] req_info_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0;
|
||||
wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_lo, clamped_lo_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`TEX_FXD_FRAC-1:0] clamped_hi, clamped_hi_s0;
|
||||
wire [NUM_REQS-1:0][1:0][SHIFT_BITS-1:0] dim_shift, dim_shift_s0;
|
||||
wire [`TEX_LGSTRIDE_BITS-1:0] log_stride, log_stride_s0;
|
||||
wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0;
|
||||
wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0;
|
||||
|
||||
wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0;
|
||||
|
||||
wire stall_out;
|
||||
|
||||
// stride
|
||||
|
@ -67,9 +72,9 @@ module VX_tex_addr #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 2; ++j) begin
|
||||
wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]);
|
||||
wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i];
|
||||
wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i];
|
||||
wire [`TEX_FXD_FRAC-1:0] delta = `TEX_FXD_FRAC'((SCALED_DIM'(`TEX_FXD_HALF) << mip_level[i]) >> req_logdims[i][j]);
|
||||
wire [`TEX_FXD_BITS-1:0] coord_lo = req_filter ? (req_coords[j][i] - `TEX_FXD_BITS'(delta)) : req_coords[j][i];
|
||||
wire [`TEX_FXD_BITS-1:0] coord_hi = req_filter ? (req_coords[j][i] + `TEX_FXD_BITS'(delta)) : req_coords[j][i];
|
||||
|
||||
VX_tex_wrap #(
|
||||
.CORE_ID (CORE_ID)
|
||||
|
@ -86,66 +91,67 @@ module VX_tex_addr #(
|
|||
.coord_i (coord_hi),
|
||||
.coord_o (clamped_hi[i][j])
|
||||
);
|
||||
|
||||
assign dim_shift[i][j] = (`TEX_FXD_FRAC - `TEX_BLEND_FRAC - (req_logdims[i][j] - mip_level[i]));
|
||||
end
|
||||
assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride);
|
||||
assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]);
|
||||
assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0] - mip_level[i]) + PITCH_BITS'(log_stride);
|
||||
assign mip_addr[i] = req_baseaddr + `TEX_ADDR_BITS'(req_mipoff[i]);
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)),
|
||||
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * SHIFT_BITS + `TEX_ADDR_BITS + 2 * 2 * `TEX_FXD_FRAC)),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, req_logdims, mip_addr, clamped_lo, clamped_hi}),
|
||||
.data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
|
||||
.data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, dim_shift, mip_addr, clamped_lo, clamped_hi}),
|
||||
.data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, dim_shift_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
|
||||
);
|
||||
|
||||
// addresses generation
|
||||
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_lo;
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_hi;
|
||||
wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends;
|
||||
wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_lo;
|
||||
wire [NUM_REQS-1:0][1:0][SCALED_X_W-1:0] scaled_hi;
|
||||
wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_lo;
|
||||
wire [NUM_REQS-1:0][OFFSET_U_W-1:0] offset_u_hi;
|
||||
wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_lo;
|
||||
wire [NUM_REQS-1:0][OFFSET_V_W-1:0] offset_v_hi;
|
||||
wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] blends;
|
||||
wire [NUM_REQS-1:0][3:0][31:0] addr;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 2; ++j) begin
|
||||
assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]);
|
||||
assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]);
|
||||
assign blends[i][j] = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0);
|
||||
assign scaled_lo[i][j] = SCALED_X_W'(clamped_lo_s0[i][j] >> dim_shift_s0[i][j]);
|
||||
assign scaled_hi[i][j] = SCALED_X_W'(clamped_hi_s0[i][j] >> dim_shift_s0[i][j]);
|
||||
assign blends[i][j] = filter_s0 ? scaled_lo[i][j][`TEX_BLEND_FRAC-1:0] : `TEX_BLEND_FRAC'(0);
|
||||
end
|
||||
end
|
||||
|
||||
`UNUSED_VAR (log_pitch_s0)
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0;
|
||||
wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0;
|
||||
assign offset_u_lo[i] = OFFSET_U_W'(scaled_lo[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0;
|
||||
assign offset_u_hi[i] = OFFSET_U_W'(scaled_hi[i][0][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_stride_s0;
|
||||
|
||||
wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i];
|
||||
wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i];
|
||||
assign offset_v_lo[i] = OFFSET_V_W'(scaled_lo[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i];
|
||||
assign offset_v_hi[i] = OFFSET_V_W'(scaled_hi[i][1][`TEX_BLEND_FRAC +: `TEX_DIM_BITS]) << log_pitch_s0[i];
|
||||
|
||||
wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo);
|
||||
wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi);
|
||||
|
||||
assign addr[i][0] = base_addr_lo + 32'(offset_u_lo);
|
||||
assign addr[i][1] = base_addr_lo + 32'(offset_u_hi);
|
||||
assign addr[i][2] = base_addr_hi + 32'(offset_u_lo);
|
||||
assign addr[i][3] = base_addr_hi + 32'(offset_u_hi);
|
||||
assign addr[i][0] = 32'(offset_v_lo[i]) + 32'(offset_u_lo[i]);
|
||||
assign addr[i][1] = 32'(offset_v_lo[i]) + 32'(offset_u_hi[i]);
|
||||
assign addr[i][2] = 32'(offset_v_hi[i]) + 32'(offset_u_lo[i]);
|
||||
assign addr[i][3] = 32'(offset_v_hi[i]) + 32'(offset_u_hi[i]);
|
||||
end
|
||||
|
||||
assign stall_out = rsp_valid && ~rsp_ready;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW),
|
||||
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (NUM_REQS * 32) + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `TEX_BLEND_FRAC) + REQ_INFOW),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}),
|
||||
.data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info})
|
||||
.data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, mip_addr_s0, addr, blends, req_info_s0}),
|
||||
.data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_lgstride, rsp_baseaddr, rsp_addr, rsp_blends, rsp_info})
|
||||
);
|
||||
|
||||
assign req_ready = ~stall_out;
|
||||
|
@ -157,22 +163,45 @@ module VX_tex_addr #(
|
|||
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (req_valid && ~stall_out) begin
|
||||
dpi_trace("%d: *** log_pitch=", $time);
|
||||
`TRACE_ARRAY1D(log_pitch, NUM_REQS);
|
||||
dpi_trace(", mip_addr=");
|
||||
`TRACE_ARRAY1D(mip_addr, NUM_REQS);
|
||||
dpi_trace(", req_logdims=");
|
||||
`TRACE_ARRAY2D(req_logdims, 2, NUM_REQS);
|
||||
dpi_trace(", clamped_lo=");
|
||||
`TRACE_ARRAY2D(clamped_lo, 2, NUM_REQS);
|
||||
dpi_trace(", clamped_hi=");
|
||||
`TRACE_ARRAY2D(clamped_hi, 2, NUM_REQS);
|
||||
dpi_trace(", mip_addr=");
|
||||
`TRACE_ARRAY1D(mip_addr, NUM_REQS);
|
||||
dpi_trace("\n");
|
||||
end
|
||||
|
||||
if (valid_s0 && ~stall_out) begin
|
||||
dpi_trace("%d: *** scaled_lo=", $time);
|
||||
`TRACE_ARRAY2D(scaled_lo, 2, NUM_REQS);
|
||||
dpi_trace(", scaled_hi=");
|
||||
`TRACE_ARRAY2D(scaled_hi, 2, NUM_REQS);
|
||||
dpi_trace(", offset_u_lo=");
|
||||
`TRACE_ARRAY1D(offset_u_lo, NUM_REQS);
|
||||
dpi_trace(", offset_u_hi=");
|
||||
`TRACE_ARRAY1D(offset_u_hi, NUM_REQS);
|
||||
dpi_trace(", offset_v_lo=");
|
||||
`TRACE_ARRAY1D(offset_v_lo, NUM_REQS);
|
||||
dpi_trace(", offset_v_hi=");
|
||||
`TRACE_ARRAY1D(offset_v_hi, NUM_REQS);
|
||||
dpi_trace("\n");
|
||||
end
|
||||
|
||||
if (rsp_valid && rsp_ready) begin
|
||||
dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, tride=%0d, addr=",
|
||||
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride);
|
||||
dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, lgstride=%0d, addr=",
|
||||
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_lgstride);
|
||||
`TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS);
|
||||
dpi_trace("\n");
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src,
|
||||
input logic [`TEX_DIM_BITS-1:0] dim);
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
logic [`FIXED_BITS-1:0] out;
|
||||
`IGNORE_WARNINGS_END
|
||||
out = `FIXED_BITS'(src) << dim;
|
||||
return out[`FIXED_FRAC +: `FIXED_INT];
|
||||
endfunction
|
||||
|
||||
endmodule
|
|
@ -3,37 +3,49 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`define FIXED_BITS 32
|
||||
`define FIXED_FRAC 20
|
||||
`define FIXED_INT (`FIXED_BITS - `FIXED_FRAC)
|
||||
`define FIXED_ONE (2 ** `FIXED_FRAC)
|
||||
`define FIXED_HALF (`FIXED_ONE >> 1)
|
||||
`define FIXED_MASK (`FIXED_ONE - 1)
|
||||
`define TEX_FXD_INT (`TEX_FXD_BITS - `TEX_FXD_FRAC)
|
||||
`define TEX_FXD_ONE (2 ** `TEX_FXD_FRAC)
|
||||
`define TEX_FXD_HALF (`TEX_FXD_ONE >> 1)
|
||||
`define TEX_FXD_MASK (`TEX_FXD_ONE - 1)
|
||||
|
||||
`define TEX_ADDR_BITS 32
|
||||
`define TEX_FORMAT_BITS 3
|
||||
`define TEX_WRAP_BITS 2
|
||||
`define TEX_DIM_BITS 4
|
||||
`define TEX_FILTER_BITS 1
|
||||
`define TEX_MIPOFF_BITS (2*`TEX_DIM_BITS+1)
|
||||
|
||||
`define TEX_MIPOFF_BITS (2*12+1)
|
||||
`define TEX_STRIDE_BITS 2
|
||||
|
||||
`define TEX_LOD_BITS 4
|
||||
`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS)
|
||||
`define TEX_LGSTRIDE_MAX 2
|
||||
`define TEX_LGSTRIDE_BITS 2
|
||||
|
||||
`define TEX_WRAP_CLAMP 0
|
||||
`define TEX_WRAP_REPEAT 1
|
||||
`define TEX_WRAP_MIRROR 2
|
||||
|
||||
`define BLEND_FRAC 8
|
||||
`define BLEND_ONE (2 ** `BLEND_FRAC)
|
||||
`define TEX_BLEND_FRAC 8
|
||||
`define TEX_BLEND_ONE (2 ** `TEX_BLEND_FRAC)
|
||||
|
||||
`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0)
|
||||
`define TEX_FORMAT_A8R8G8B8 `TEX_FORMAT_BITS'(0)
|
||||
`define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1)
|
||||
`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2)
|
||||
`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3)
|
||||
`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4)
|
||||
`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5)
|
||||
`define TEX_FORMAT_A1R5G5B5 `TEX_FORMAT_BITS'(2)
|
||||
`define TEX_FORMAT_A4R4G4B4 `TEX_FORMAT_BITS'(3)
|
||||
`define TEX_FORMAT_A8L8 `TEX_FORMAT_BITS'(4)
|
||||
`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(5)
|
||||
`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(6)
|
||||
|
||||
task trace_tex_state (
|
||||
input [`CSR_ADDR_BITS-1:0] state
|
||||
);
|
||||
case (state)
|
||||
`CSR_TEX_ADDR: dpi_trace("ADDR");
|
||||
`CSR_TEX_WIDTH: dpi_trace("WIDTH");
|
||||
`CSR_TEX_HEIGHT: dpi_trace("HEIGHT");
|
||||
`CSR_TEX_FORMAT: dpi_trace("FORMAT");
|
||||
`CSR_TEX_FILTER: dpi_trace("FILTER");
|
||||
`CSR_TEX_WRAPU: dpi_trace("WRAPU");
|
||||
`CSR_TEX_WRAPV: dpi_trace("WRAPV");
|
||||
//`CSR_TEX_MIPOFF
|
||||
default: dpi_trace("MIPOFF");
|
||||
endcase
|
||||
endtask
|
||||
|
||||
`endif
|
|
@ -13,25 +13,31 @@ module VX_tex_format #(
|
|||
|
||||
always @(*) begin
|
||||
case (format)
|
||||
`TEX_FORMAT_R8G8B8A8: begin
|
||||
`TEX_FORMAT_A8R8G8B8: begin
|
||||
texel_out_r[07:00] = texel_in[7:0];
|
||||
texel_out_r[15:08] = texel_in[15:8];
|
||||
texel_out_r[23:16] = texel_in[23:16];
|
||||
texel_out_r[31:24] = texel_in[31:24];
|
||||
end
|
||||
`TEX_FORMAT_R5G6B5: begin
|
||||
texel_out_r[07:00] = {texel_in[15:11], texel_in[15:13]};
|
||||
texel_out_r[07:00] = {texel_in[4:0], texel_in[4:2]};
|
||||
texel_out_r[15:08] = {texel_in[10:5], texel_in[10:9]};
|
||||
texel_out_r[23:16] = {texel_in[4:0], texel_in[4:2]};
|
||||
texel_out_r[23:16] = {texel_in[15:11], texel_in[15:13]};
|
||||
texel_out_r[31:24] = 8'hff;
|
||||
end
|
||||
`TEX_FORMAT_R4G4B4A4: begin
|
||||
texel_out_r[07:00] = {texel_in[11:8], texel_in[15:12]};
|
||||
`TEX_FORMAT_A1R5G5B5: begin
|
||||
texel_out_r[07:00] = {texel_in[4:0], texel_in[4:2]};
|
||||
texel_out_r[15:08] = {texel_in[9:5], texel_in[9:7]};
|
||||
texel_out_r[23:16] = {texel_in[14:10], texel_in[14:12]};
|
||||
texel_out_r[31:24] = {8{texel_in[15]}};
|
||||
end
|
||||
`TEX_FORMAT_A4R4G4B4: begin
|
||||
texel_out_r[07:00] = {2{texel_in[3:0]}};
|
||||
texel_out_r[15:08] = {2{texel_in[7:4]}};
|
||||
texel_out_r[23:16] = {2{texel_in[3:0]}};
|
||||
texel_out_r[23:16] = {2{texel_in[11:8]}};
|
||||
texel_out_r[31:24] = {2{texel_in[15:12]}};
|
||||
end
|
||||
`TEX_FORMAT_L8A8: begin
|
||||
`TEX_FORMAT_A8L8: begin
|
||||
texel_out_r[07:00] = texel_in[7:0];
|
||||
texel_out_r[15:08] = texel_in[7:0];
|
||||
texel_out_r[23:16] = texel_in[7:0];
|
||||
|
@ -45,9 +51,9 @@ module VX_tex_format #(
|
|||
end
|
||||
//`TEX_FORMAT_A8
|
||||
default: begin
|
||||
texel_out_r[07:00] = 0;
|
||||
texel_out_r[15:08] = 0;
|
||||
texel_out_r[23:16] = 0;
|
||||
texel_out_r[07:00] = 8'hff;
|
||||
texel_out_r[15:08] = 8'hff;
|
||||
texel_out_r[23:16] = 8'hff;
|
||||
texel_out_r[31:24] = texel_in[7:0];
|
||||
end
|
||||
endcase
|
||||
|
|
|
@ -3,12 +3,11 @@
|
|||
module VX_tex_lerp (
|
||||
input wire [3:0][7:0] in1,
|
||||
input wire [3:0][7:0] in2,
|
||||
input wire [8:0] alpha,
|
||||
input wire [7:0] beta,
|
||||
input wire [7:0] frac,
|
||||
output wire [3:0][7:0] out
|
||||
);
|
||||
);
|
||||
for (genvar i = 0; i < 4; ++i) begin
|
||||
wire [16:0] sum = in1[i] * alpha + in2[i] * beta;
|
||||
wire [16:0] sum = in1[i] * 8'(8'hff - frac) + in2[i] * frac;
|
||||
`UNUSED_VAR (sum)
|
||||
assign out[i] = sum[15:8];
|
||||
end
|
||||
|
|
|
@ -15,7 +15,8 @@ module VX_tex_mem #(
|
|||
input wire req_valid,
|
||||
input wire [NUM_REQS-1:0] req_tmask,
|
||||
input wire [`TEX_FILTER_BITS-1:0] req_filter,
|
||||
input wire [`TEX_STRIDE_BITS-1:0] req_stride,
|
||||
input wire [`TEX_LGSTRIDE_BITS-1:0] req_lgstride,
|
||||
input wire [NUM_REQS-1:0][31:0] req_baseaddr,
|
||||
input wire [NUM_REQS-1:0][3:0][31:0] req_addr,
|
||||
input wire [REQ_INFOW-1:0] req_info,
|
||||
output wire req_ready,
|
||||
|
@ -32,6 +33,14 @@ module VX_tex_mem #(
|
|||
|
||||
localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1);
|
||||
|
||||
// full address calculation
|
||||
wire [NUM_REQS-1:0][3:0][31:0] full_addr;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 4; ++j) begin
|
||||
assign full_addr[i][j] = req_baseaddr[i] + req_addr[i][j];
|
||||
end
|
||||
end
|
||||
|
||||
wire [3:0] dup_reqs;
|
||||
wire [3:0][NUM_REQS-1:0][29:0] req_addr_w;
|
||||
wire [3:0][NUM_REQS-1:0][1:0] align_offs;
|
||||
|
@ -40,17 +49,17 @@ module VX_tex_mem #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 4; ++j) begin
|
||||
assign req_addr_w[j][i] = req_addr[i][j][31:2];
|
||||
assign align_offs[j][i] = req_addr[i][j][1:0];
|
||||
assign req_addr_w[j][i] = full_addr[i][j][31:2];
|
||||
assign align_offs[j][i] = full_addr[i][j][1:0];
|
||||
end
|
||||
end
|
||||
|
||||
// find duplicate addresses
|
||||
// detect duplicate addresses
|
||||
|
||||
for (genvar i = 0; i < 4; ++i) begin
|
||||
wire [NUM_REQS-1:0] addr_matches;
|
||||
for (genvar j = 0; j < NUM_REQS; j++) begin
|
||||
assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j];
|
||||
wire [NUM_REQS-2:0] addr_matches;
|
||||
for (genvar j = 0; j < (NUM_REQS-1); ++j) begin
|
||||
assign addr_matches[j] = (req_addr_w[i][j+1] == req_addr_w[i][0]) || ~req_tmask[j+1];
|
||||
end
|
||||
assign dup_reqs[i] = req_tmask[0] && (& addr_matches);
|
||||
end
|
||||
|
@ -63,23 +72,26 @@ module VX_tex_mem #(
|
|||
wire [NUM_REQS-1:0] q_req_tmask;
|
||||
wire [`TEX_FILTER_BITS-1:0] q_req_filter;
|
||||
wire [REQ_INFOW-1:0] q_req_info;
|
||||
wire [`TEX_STRIDE_BITS-1:0] q_req_stride;
|
||||
wire [`TEX_LGSTRIDE_BITS-1:0] q_req_lgstride;
|
||||
wire [3:0][NUM_REQS-1:0][1:0] q_align_offs;
|
||||
wire [3:0] q_dup_reqs;
|
||||
wire [`NW_BITS-1:0] q_req_wid;
|
||||
wire [31:0] q_req_PC;
|
||||
wire [`UUID_BITS-1:0] q_req_uuid;
|
||||
|
||||
assign reqq_push = req_valid && req_ready;
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4),
|
||||
.SIZE (`LSUQ_SIZE),
|
||||
.DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_LGSTRIDE_BITS + (4 * NUM_REQS * 2) + 4),
|
||||
.SIZE (`TEXQ_SIZE),
|
||||
.OUT_REG (1)
|
||||
) req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (reqq_push),
|
||||
.pop (reqq_pop),
|
||||
.data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}),
|
||||
.data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}),
|
||||
.data_in ({req_addr_w, req_tmask, req_info, req_filter, req_lgstride, align_offs, dup_reqs}),
|
||||
.data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_lgstride, q_align_offs, q_dup_reqs}),
|
||||
.empty (reqq_empty),
|
||||
.full (reqq_full),
|
||||
`UNUSED_PIN (alm_full),
|
||||
|
@ -143,17 +155,16 @@ module VX_tex_mem #(
|
|||
|
||||
wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1};
|
||||
|
||||
assign {q_req_wid, q_req_PC, q_req_uuid} = q_req_info[`NW_BITS+32+`UUID_BITS-1:0];
|
||||
`UNUSED_VAR (q_req_wid)
|
||||
`UNUSED_VAR (q_req_PC)
|
||||
|
||||
assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask;
|
||||
assign dcache_req_if.rw = {NUM_REQS{1'b0}};
|
||||
assign dcache_req_if.addr = req_texel_addr;
|
||||
assign dcache_req_if.byteen = {NUM_REQS{4'b1111}};
|
||||
assign dcache_req_if.byteen = {NUM_REQS{4'b0}};
|
||||
assign dcache_req_if.data = 'x;
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}};
|
||||
`else
|
||||
assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}};
|
||||
`endif
|
||||
assign dcache_req_if.tag = {NUM_REQS{q_req_uuid, `LSU_TAG_ID_BITS'(req_texel_idx), `CACHE_ADDR_TYPE_BITS'(0)}};
|
||||
|
||||
// Dcache Response
|
||||
|
||||
|
@ -162,14 +173,18 @@ module VX_tex_mem #(
|
|||
reg [NUM_REQS-1:0][31:0] rsp_data_qual;
|
||||
reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init;
|
||||
wire [RSP_CTR_W-1:0] rsp_rem_ctr_n;
|
||||
wire [NUM_REQS-1:0][1:0] rsp_align_offs;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] q_req_size;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] dcache_rsp_size;
|
||||
wire dcache_rsp_fire;
|
||||
wire [1:0] rsp_texel_idx;
|
||||
wire rsp_texel_dup;
|
||||
|
||||
assign rsp_texel_idx = dcache_rsp_if.tag[1:0];
|
||||
|
||||
assign rsp_texel_idx = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: 2];
|
||||
`UNUSED_VAR (dcache_rsp_if.tag)
|
||||
|
||||
assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx];
|
||||
assign rsp_align_offs = q_align_offs[rsp_texel_idx];
|
||||
|
||||
assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
|
||||
|
||||
|
@ -180,12 +195,12 @@ module VX_tex_mem #(
|
|||
reg [31:0] rsp_data_shifted;
|
||||
always @(*) begin
|
||||
rsp_data_shifted[31:16] = src_data[31:16];
|
||||
rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0];
|
||||
rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0];
|
||||
rsp_data_shifted[15:0] = rsp_align_offs[i][1] ? src_data[31:16] : src_data[15:0];
|
||||
rsp_data_shifted[7:0] = rsp_align_offs[i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0];
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
case (q_req_stride)
|
||||
case (q_req_lgstride)
|
||||
0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]);
|
||||
1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]);
|
||||
default: rsp_data_qual[i] = rsp_data_shifted;
|
||||
|
@ -206,16 +221,21 @@ module VX_tex_mem #(
|
|||
end
|
||||
end
|
||||
|
||||
`POP_COUNT(q_req_size, q_req_tmask);
|
||||
|
||||
always @(*) begin
|
||||
rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask));
|
||||
rsp_rem_ctr_init = q_dup_reqs[0] ? RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size);
|
||||
if (q_req_filter) begin
|
||||
for (integer i = 1; i < 4; ++i) begin
|
||||
rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask));
|
||||
rsp_rem_ctr_init += q_dup_reqs[i] ? RSP_CTR_W'(1) : RSP_CTR_W'(q_req_size);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask));
|
||||
wire [NUM_REQS-1:0] dcache_rsp_tmask = dcache_rsp_if.tmask;
|
||||
`POP_COUNT(dcache_rsp_size, dcache_rsp_tmask);
|
||||
|
||||
assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'(dcache_rsp_size);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -237,7 +257,7 @@ module VX_tex_mem #(
|
|||
|
||||
wire stall_out = rsp_valid && ~rsp_ready;
|
||||
|
||||
wire is_last_rsp = (0 == rsp_rem_ctr_n);
|
||||
wire is_last_rsp = (rsp_rem_ctr == RSP_CTR_W'(dcache_rsp_size));
|
||||
|
||||
wire rsp_texels_done = dcache_rsp_fire && is_last_rsp;
|
||||
|
||||
|
@ -257,37 +277,39 @@ module VX_tex_mem #(
|
|||
// Can accept new cache response?
|
||||
assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out);
|
||||
|
||||
`ifdef DBG_TRACE_TEX
|
||||
wire [`NW_BITS-1:0] q_req_wid, req_wid, rsp_wid;
|
||||
wire [31:0] q_req_PC, req_PC, rsp_PC;
|
||||
assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0];
|
||||
assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
|
||||
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
|
||||
`ifdef DBG_TRACE_TEX
|
||||
wire [`NW_BITS-1:0] req_wid, rsp_wid;
|
||||
wire [31:0] req_PC, rsp_PC;
|
||||
wire [`UUID_BITS-1:0] req_uuid, rsp_uuid;
|
||||
assign {req_wid, req_PC, req_uuid} = req_info[`NW_BITS+32+`UUID_BITS-1:0];
|
||||
assign {rsp_wid, rsp_PC, rsp_uuid} = rsp_info[`NW_BITS+32+`UUID_BITS-1:0];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (dcache_req_fire_any) begin
|
||||
dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=",
|
||||
$time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx);
|
||||
`TRACE_ARRAY1D(req_texel_addr, NUM_REQS);
|
||||
dpi_trace(", is_dup=%b\n", req_texel_dup);
|
||||
dpi_trace(", is_dup=%b (#%0d)\n", req_texel_dup, q_req_uuid);
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=",
|
||||
$time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx);
|
||||
`TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", q_req_uuid);
|
||||
end
|
||||
if (req_valid && req_ready) begin
|
||||
dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=",
|
||||
$time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride);
|
||||
dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, lgstride=%0d, baseaddr=",
|
||||
$time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_lgstride);
|
||||
`TRACE_ARRAY1D(req_baseaddr, NUM_REQS);
|
||||
dpi_trace(", addr=");
|
||||
`TRACE_ARRAY2D(req_addr, 4, NUM_REQS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", req_uuid);
|
||||
end
|
||||
if (rsp_valid && rsp_ready) begin
|
||||
dpi_trace("%d: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
|
||||
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
|
||||
`TRACE_ARRAY2D(rsp_data, 4, NUM_REQS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", rsp_uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -12,7 +12,7 @@ module VX_tex_sampler #(
|
|||
input wire req_valid,
|
||||
input wire [NUM_REQS-1:0] req_tmask,
|
||||
input wire [`TEX_FORMAT_BITS-1:0] req_format,
|
||||
input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] req_blends,
|
||||
input wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends,
|
||||
input wire [NUM_REQS-1:0][3:0][31:0] req_data,
|
||||
input wire [REQ_INFOW-1:0] req_info,
|
||||
output wire req_ready,
|
||||
|
@ -27,75 +27,78 @@ module VX_tex_sampler #(
|
|||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
wire valid_s0;
|
||||
wire [NUM_REQS-1:0] tmask_s0;
|
||||
wire [REQ_INFOW-1:0] req_info_s0;
|
||||
wire valid_s0, valid_s1;
|
||||
wire [NUM_REQS-1:0] req_tmask_s0, req_tmask_s1;
|
||||
wire [REQ_INFOW-1:0] req_info_s0, req_info_s1;
|
||||
wire [NUM_REQS-1:0][31:0] texel_ul, texel_uh;
|
||||
wire [NUM_REQS-1:0][31:0] texel_ul_s0, texel_uh_s0;
|
||||
wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0;
|
||||
wire [NUM_REQS-1:0][31:0] texel_ul_s1, texel_uh_s1;
|
||||
wire [NUM_REQS-1:0][1:0][`TEX_BLEND_FRAC-1:0] req_blends_s0;
|
||||
wire [NUM_REQS-1:0][`TEX_BLEND_FRAC-1:0] blend_v, blend_v_s1;
|
||||
wire [NUM_REQS-1:0][31:0] texel_v;
|
||||
wire [NUM_REQS-1:0][3:0][31:0] fmt_texels, fmt_texels_s0;
|
||||
|
||||
wire stall_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
|
||||
wire [3:0][31:0] fmt_texels;
|
||||
|
||||
for (genvar j = 0; j < 4; ++j) begin
|
||||
VX_tex_format #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tex_format (
|
||||
.format (req_format),
|
||||
.texel_in (req_data[i][j]),
|
||||
.texel_out (fmt_texels[j])
|
||||
.texel_out (fmt_texels[i][j])
|
||||
);
|
||||
end
|
||||
|
||||
wire [7:0] beta = req_blends[i][0];
|
||||
wire [8:0] alpha = `BLEND_ONE - beta;
|
||||
|
||||
VX_tex_lerp #(
|
||||
) tex_lerp_ul (
|
||||
.in1 (fmt_texels[0]),
|
||||
.in2 (fmt_texels[1]),
|
||||
.alpha (alpha),
|
||||
.beta (beta),
|
||||
.out (texel_ul[i])
|
||||
);
|
||||
|
||||
VX_tex_lerp #(
|
||||
) tex_lerp_uh (
|
||||
.in1 (fmt_texels[2]),
|
||||
.in2 (fmt_texels[3]),
|
||||
.alpha (alpha),
|
||||
.beta (beta),
|
||||
.out (texel_uh[i])
|
||||
);
|
||||
|
||||
assign blend_v[i] = req_blends[i][1];
|
||||
end
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)),
|
||||
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 2 * `TEX_BLEND_FRAC) + (NUM_REQS * 4 * 32)),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({req_valid, req_tmask, req_info, blend_v, texel_ul, texel_uh}),
|
||||
.data_out ({valid_s0, tmask_s0, req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0})
|
||||
.data_in ({req_valid, req_tmask, req_info, req_blends, fmt_texels}),
|
||||
.data_out ({valid_s0, req_tmask_s0, req_info_s0, req_blends_s0, fmt_texels_s0})
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_tex_lerp #(
|
||||
) tex_lerp_ul (
|
||||
.in1 (fmt_texels_s0[i][0]),
|
||||
.in2 (fmt_texels_s0[i][1]),
|
||||
.frac (req_blends_s0[i][0]),
|
||||
.out (texel_ul[i])
|
||||
);
|
||||
|
||||
VX_tex_lerp #(
|
||||
) tex_lerp_uh (
|
||||
.in1 (fmt_texels_s0[i][2]),
|
||||
.in2 (fmt_texels_s0[i][3]),
|
||||
.frac (req_blends_s0[i][0]),
|
||||
.out (texel_uh[i])
|
||||
);
|
||||
|
||||
assign blend_v[i] = req_blends_s0[i][1];
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `TEX_BLEND_FRAC) + (2 * NUM_REQS * 32)),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({valid_s0, req_tmask_s0, req_info_s0, blend_v, texel_ul, texel_uh}),
|
||||
.data_out ({valid_s1, req_tmask_s1, req_info_s1, blend_v_s1, texel_ul_s1, texel_uh_s1})
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
wire [7:0] beta = blend_v_s0[i];
|
||||
wire [8:0] alpha = `BLEND_ONE - beta;
|
||||
|
||||
VX_tex_lerp #(
|
||||
) tex_lerp_v (
|
||||
.in1 (texel_ul_s0[i]),
|
||||
.in2 (texel_uh_s0[i]),
|
||||
.alpha (alpha),
|
||||
.beta (beta),
|
||||
.in1 (texel_ul_s1[i]),
|
||||
.in2 (texel_uh_s1[i]),
|
||||
.frac (blend_v_s1[i]),
|
||||
.out (texel_v[i])
|
||||
);
|
||||
end
|
||||
|
@ -105,12 +108,12 @@ module VX_tex_sampler #(
|
|||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
) pipe_reg2 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({valid_s0, tmask_s0, req_info_s0, texel_v}),
|
||||
.data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data})
|
||||
.data_in ({valid_s1, req_tmask_s1, req_info_s1, texel_v}),
|
||||
.data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data})
|
||||
);
|
||||
|
||||
// can accept new request?
|
||||
|
|
|
@ -4,21 +4,22 @@ module VX_tex_stride #(
|
|||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire [`TEX_FORMAT_BITS-1:0] format,
|
||||
output wire [`TEX_STRIDE_BITS-1:0] log_stride
|
||||
output wire [`TEX_LGSTRIDE_BITS-1:0] log_stride
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
reg [`TEX_STRIDE_BITS-1:0] log_stride_r;
|
||||
reg [`TEX_LGSTRIDE_BITS-1:0] log_stride_r;
|
||||
|
||||
always @(*) begin
|
||||
case (format)
|
||||
`TEX_FORMAT_A8: log_stride_r = 0;
|
||||
`TEX_FORMAT_L8: log_stride_r = 0;
|
||||
`TEX_FORMAT_L8A8: log_stride_r = 1;
|
||||
`TEX_FORMAT_R5G6B5: log_stride_r = 1;
|
||||
`TEX_FORMAT_R4G4B4A4: log_stride_r = 1;
|
||||
//`TEX_FORMAT_R8G8B8A8
|
||||
default: log_stride_r = 2;
|
||||
`TEX_FORMAT_A8R8G8B8: log_stride_r = 2;
|
||||
`TEX_FORMAT_R5G6B5,
|
||||
`TEX_FORMAT_A1R5G5B5,
|
||||
`TEX_FORMAT_A4R4G4B4,
|
||||
`TEX_FORMAT_A8L8: log_stride_r = 1;
|
||||
// `TEX_FORMAT_L8:
|
||||
// `TEX_FORMAT_A8:
|
||||
default: log_stride_r = 0;
|
||||
endcase
|
||||
end
|
||||
|
||||
|
|
|
@ -6,6 +6,11 @@ module VX_tex_unit #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if.master perf_tex_if,
|
||||
`endif
|
||||
|
||||
// Texture unit <-> Memory Unit
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
|
@ -18,74 +23,73 @@ module VX_tex_unit #(
|
|||
VX_tex_rsp_if.master tex_rsp_if
|
||||
);
|
||||
|
||||
localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32;
|
||||
localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S;
|
||||
localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A;
|
||||
localparam REQ_INFO_W = `NR_BITS + 1 + `NW_BITS + 32 + `UUID_BITS;
|
||||
localparam BLEND_FRAC_W = (2 * `NUM_THREADS * `TEX_BLEND_FRAC);
|
||||
|
||||
reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
|
||||
reg [1:0][`TEX_DIM_BITS-1:0] tex_dims [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
|
||||
reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit;
|
||||
reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(`TEX_LOD_MAX+1)-1:0];
|
||||
reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0];
|
||||
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0];
|
||||
reg [`TEX_ADDR_BITS-1:0] tex_baddr [`NUM_TEX_UNITS-1:0];
|
||||
reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0];
|
||||
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0];
|
||||
reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0];
|
||||
|
||||
// CSRs programming
|
||||
// CSRs programming
|
||||
|
||||
reg [`NUM_TEX_UNITS-1:0] csrs_dirty;
|
||||
`UNUSED_VAR (csrs_dirty)
|
||||
|
||||
for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
|
||||
wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS];
|
||||
always @(posedge clk) begin
|
||||
if (tex_csr_if.write_enable) begin
|
||||
case (tex_csr_if.write_addr)
|
||||
`CSR_TEX_ADDR(i) : begin
|
||||
tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
always @(posedge clk) begin
|
||||
if (tex_csr_if.write_enable) begin
|
||||
case (tex_csr_if.write_addr)
|
||||
`CSR_TEX_UNIT: begin
|
||||
csr_tex_unit <= tex_csr_if.write_data[$clog2(`NUM_TEX_UNITS)-1:0];
|
||||
end
|
||||
`CSR_TEX_ADDR: begin
|
||||
tex_baddr[csr_tex_unit] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
|
||||
end
|
||||
`CSR_TEX_FORMAT: begin
|
||||
tex_format[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
|
||||
end
|
||||
`CSR_TEX_WRAPU: begin
|
||||
tex_wraps[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
|
||||
end
|
||||
`CSR_TEX_WRAPV: begin
|
||||
tex_wraps[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
|
||||
end
|
||||
`CSR_TEX_FILTER: begin
|
||||
tex_filter[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
|
||||
end
|
||||
`CSR_TEX_WIDTH: begin
|
||||
tex_logdims[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
|
||||
end
|
||||
`CSR_TEX_HEIGHT: begin
|
||||
tex_logdims[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
|
||||
end
|
||||
default: begin
|
||||
for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
if (tex_csr_if.write_addr == `CSR_TEX_MIPOFF(j)) begin
|
||||
`IGNORE_WARNINGS_END
|
||||
tex_mipoff[csr_tex_unit][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
|
||||
end
|
||||
end
|
||||
`CSR_TEX_FORMAT(i) : begin
|
||||
tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX_WRAP(i) : begin
|
||||
tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS];
|
||||
tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX_FILTER(i) : begin
|
||||
tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX_MIPOFF(i) : begin
|
||||
tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX_WIDTH(i) : begin
|
||||
tex_dims[i][mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX_HEIGHT(i) : begin
|
||||
tex_dims[i][mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
if (reset || (tex_req_if.valid && tex_req_if.ready)) begin
|
||||
csrs_dirty[i] <= '0;
|
||||
end
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
wire [`UUID_BITS-1:0] write_uuid = tex_csr_if.write_uuid;
|
||||
`UNUSED_VAR (write_uuid);
|
||||
|
||||
// mipmap attributes
|
||||
|
||||
wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff;
|
||||
wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims;
|
||||
wire [`NUM_THREADS-1:0][`TEX_LOD_BITS-1:0] mip_level;
|
||||
wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0] sel_mipoff;
|
||||
wire [`NUM_THREADS-1:0][1:0][`TEX_LOD_BITS-1:0] sel_logdims;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
wire [`NTEX_BITS-1:0] unit = tex_req_if.unit[`NTEX_BITS-1:0];
|
||||
wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS];
|
||||
assign sel_mipoff[i] = tex_mipoff[unit][mip_level];
|
||||
assign sel_dims[i] = tex_dims[unit][mip_level];
|
||||
assign mip_level[i] = tex_req_if.lod[i][`TEX_LOD_BITS-1:0];
|
||||
assign sel_mipoff[i] = tex_mipoff[unit][mip_level[i]];
|
||||
assign sel_logdims[i][0] = tex_logdims[unit][0];
|
||||
assign sel_logdims[i][1] = tex_logdims[unit][1];
|
||||
end
|
||||
|
||||
// address generation
|
||||
|
@ -93,15 +97,16 @@ module VX_tex_unit #(
|
|||
wire mem_req_valid;
|
||||
wire [`NUM_THREADS-1:0] mem_req_tmask;
|
||||
wire [`TEX_FILTER_BITS-1:0] mem_req_filter;
|
||||
wire [`TEX_STRIDE_BITS-1:0] mem_req_stride;
|
||||
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends;
|
||||
wire [`TEX_LGSTRIDE_BITS-1:0] mem_req_lgstride;
|
||||
wire [`NUM_THREADS-1:0][1:0][`TEX_BLEND_FRAC-1:0] mem_req_blends;
|
||||
wire [`NUM_THREADS-1:0][3:0][31:0] mem_req_addr;
|
||||
wire [REQ_INFOW_A-1:0] mem_req_info;
|
||||
wire [`NUM_THREADS-1:0][31:0] mem_req_baseaddr;
|
||||
wire [(`TEX_FORMAT_BITS + REQ_INFO_W)-1:0] mem_req_info;
|
||||
wire mem_req_ready;
|
||||
|
||||
VX_tex_addr #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.REQ_INFOW (REQ_INFOW_A),
|
||||
.REQ_INFOW (`TEX_FORMAT_BITS + REQ_INFO_W),
|
||||
.NUM_REQS (`NUM_THREADS)
|
||||
) tex_addr (
|
||||
.clk (clk),
|
||||
|
@ -113,16 +118,18 @@ module VX_tex_unit #(
|
|||
.req_format (tex_format[tex_req_if.unit]),
|
||||
.req_filter (tex_filter[tex_req_if.unit]),
|
||||
.req_wraps (tex_wraps[tex_req_if.unit]),
|
||||
.req_baseaddr (tex_baddr[tex_req_if.unit]),
|
||||
.req_baseaddr(tex_baddr[tex_req_if.unit]),
|
||||
.mip_level (mip_level),
|
||||
.req_mipoff (sel_mipoff),
|
||||
.req_logdims (sel_dims),
|
||||
.req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
|
||||
.req_logdims(sel_logdims),
|
||||
.req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC, tex_req_if.uuid}),
|
||||
.req_ready (tex_req_if.ready),
|
||||
|
||||
.rsp_valid (mem_req_valid),
|
||||
.rsp_tmask (mem_req_tmask),
|
||||
.rsp_filter (mem_req_filter),
|
||||
.rsp_stride (mem_req_stride),
|
||||
.rsp_lgstride(mem_req_lgstride),
|
||||
.rsp_baseaddr(mem_req_baseaddr),
|
||||
.rsp_addr (mem_req_addr),
|
||||
.rsp_blends (mem_req_blends),
|
||||
.rsp_info (mem_req_info),
|
||||
|
@ -134,16 +141,16 @@ module VX_tex_unit #(
|
|||
wire mem_rsp_valid;
|
||||
wire [`NUM_THREADS-1:0] mem_rsp_tmask;
|
||||
wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data;
|
||||
wire [REQ_INFOW_M-1:0] mem_rsp_info;
|
||||
wire [(BLEND_FRAC_W + `TEX_FORMAT_BITS + REQ_INFO_W)-1:0] mem_rsp_info;
|
||||
wire mem_rsp_ready;
|
||||
|
||||
VX_tex_mem #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.REQ_INFOW (REQ_INFOW_M),
|
||||
.REQ_INFOW (BLEND_FRAC_W + `TEX_FORMAT_BITS + REQ_INFO_W),
|
||||
.NUM_REQS (`NUM_THREADS)
|
||||
) tex_mem (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// memory interface
|
||||
.dcache_req_if (dcache_req_if),
|
||||
|
@ -153,7 +160,8 @@ module VX_tex_unit #(
|
|||
.req_valid (mem_req_valid),
|
||||
.req_tmask (mem_req_tmask),
|
||||
.req_filter(mem_req_filter),
|
||||
.req_stride(mem_req_stride),
|
||||
.req_lgstride(mem_req_lgstride),
|
||||
.req_baseaddr(mem_req_baseaddr),
|
||||
.req_addr (mem_req_addr),
|
||||
.req_info ({mem_req_blends, mem_req_info}),
|
||||
.req_ready (mem_req_ready),
|
||||
|
@ -168,15 +176,9 @@ module VX_tex_unit #(
|
|||
|
||||
// apply sampler
|
||||
|
||||
wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends;
|
||||
wire [`TEX_FORMAT_BITS-1:0] rsp_format;
|
||||
wire [REQ_INFOW_S-1:0] rsp_info;
|
||||
|
||||
assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info;
|
||||
|
||||
VX_tex_sampler #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.REQ_INFOW (REQ_INFOW_S),
|
||||
.REQ_INFOW (REQ_INFO_W),
|
||||
.NUM_REQS (`NUM_THREADS)
|
||||
) tex_sampler (
|
||||
.clk (clk),
|
||||
|
@ -186,47 +188,77 @@ module VX_tex_unit #(
|
|||
.req_valid (mem_rsp_valid),
|
||||
.req_tmask (mem_rsp_tmask),
|
||||
.req_data (mem_rsp_data),
|
||||
.req_format (rsp_format),
|
||||
.req_blends (rsp_blends),
|
||||
.req_info (rsp_info),
|
||||
.req_blends (mem_rsp_info[(REQ_INFO_W+`TEX_FORMAT_BITS) +: BLEND_FRAC_W]),
|
||||
.req_format (mem_rsp_info[REQ_INFO_W +: `TEX_FORMAT_BITS]),
|
||||
.req_info (mem_rsp_info[0 +: REQ_INFO_W]),
|
||||
.req_ready (mem_rsp_ready),
|
||||
|
||||
// outputs
|
||||
.rsp_valid (tex_rsp_if.valid),
|
||||
.rsp_tmask (tex_rsp_if.tmask),
|
||||
.rsp_data (tex_rsp_if.data),
|
||||
.rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}),
|
||||
.rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.uuid}),
|
||||
.rsp_ready (tex_rsp_if.ready)
|
||||
);
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_req_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_rsp_per_cycle;
|
||||
|
||||
wire [`NUM_THREADS-1:0] perf_mem_req_per_mask = dcache_req_if.valid & dcache_req_if.ready;
|
||||
wire [`NUM_THREADS-1:0] perf_mem_rsp_per_mask = dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_if.valid & dcache_rsp_if.ready}};
|
||||
|
||||
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_mask);
|
||||
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_mask);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
|
||||
wire [$clog2(`NUM_THREADS+1)+1-1:0] perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_pending_reads <= 0;
|
||||
end else begin
|
||||
perf_pending_reads <= perf_pending_reads + `PERF_CTR_BITS'($signed(perf_pending_reads_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_latency;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_latency <= 0;
|
||||
end else begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(perf_mem_req_per_cycle);
|
||||
perf_mem_latency <= perf_mem_latency + `PERF_CTR_BITS'(perf_pending_reads);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_tex_if.mem_reads = perf_mem_reads;
|
||||
assign perf_tex_if.mem_latency = perf_mem_latency;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_TEX
|
||||
always @(posedge clk) begin
|
||||
if (tex_csr_if.write_enable) begin
|
||||
dpi_trace("%d: core%0d-tex-csr: unit=%0d, state=", $time, CORE_ID, csr_tex_unit);
|
||||
trace_tex_state(tex_csr_if.write_addr);
|
||||
dpi_trace(", data=%0h (#%0d)\n", tex_csr_if.write_data, tex_csr_if.write_uuid);
|
||||
end
|
||||
if (tex_req_if.valid && tex_req_if.ready) begin
|
||||
for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin
|
||||
if (csrs_dirty[i]) begin
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]);
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]);
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]);
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]);
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]);
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]);
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]);
|
||||
dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]);
|
||||
end
|
||||
end
|
||||
|
||||
dpi_trace("%d: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=",
|
||||
$time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod);
|
||||
$time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod);
|
||||
`TRACE_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS);
|
||||
dpi_trace(", v=");
|
||||
`TRACE_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", tex_req_if.uuid);
|
||||
end
|
||||
if (tex_rsp_if.valid && tex_rsp_if.ready) begin
|
||||
dpi_trace("%d: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
|
||||
$time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask);
|
||||
`TRACE_ARRAY1D(tex_rsp_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", tex_rsp_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -4,19 +4,19 @@ module VX_tex_wrap #(
|
|||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire [`TEX_WRAP_BITS-1:0] wrap_i,
|
||||
input wire [31:0] coord_i,
|
||||
output wire [`FIXED_FRAC-1:0] coord_o
|
||||
input wire [`TEX_FXD_BITS-1:0] coord_i,
|
||||
output wire [`TEX_FXD_FRAC-1:0] coord_o
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
reg [`FIXED_FRAC-1:0] coord_r;
|
||||
reg [`TEX_FXD_FRAC-1:0] coord_r;
|
||||
|
||||
wire [`FIXED_FRAC-1:0] clamp;
|
||||
wire [`TEX_FXD_FRAC-1:0] clamp;
|
||||
|
||||
VX_tex_sat #(
|
||||
.IN_W (32),
|
||||
.OUT_W (`FIXED_FRAC)
|
||||
.IN_W (`TEX_FXD_BITS),
|
||||
.OUT_W (`TEX_FXD_FRAC)
|
||||
) sat_fx (
|
||||
.data_in (coord_i),
|
||||
.data_out (clamp)
|
||||
|
@ -27,9 +27,9 @@ module VX_tex_wrap #(
|
|||
`TEX_WRAP_CLAMP:
|
||||
coord_r = clamp;
|
||||
`TEX_WRAP_MIRROR:
|
||||
coord_r = coord_i[`FIXED_FRAC-1:0] ^ {`FIXED_FRAC{coord_i[`FIXED_FRAC]}};
|
||||
coord_r = coord_i[`TEX_FXD_FRAC-1:0] ^ {`TEX_FXD_FRAC{coord_i[`TEX_FXD_FRAC]}};
|
||||
default: //`TEX_WRAP_REPEAT
|
||||
coord_r = coord_i[`FIXED_FRAC-1:0];
|
||||
coord_r = coord_i[`TEX_FXD_FRAC-1:0];
|
||||
endcase
|
||||
end
|
||||
|
||||
|
|
|
@ -123,9 +123,9 @@
|
|||
"!cci_pending_writes_full": 1,
|
||||
"?afu_mem_req_fire": 1,
|
||||
"afu_mem_req_addr": 26,
|
||||
"afu_mem_req_tag": 27,
|
||||
"afu_mem_req_tag": "`VX_MEM_TAG_WIDTH+1",
|
||||
"?afu_mem_rsp_fire": 1,
|
||||
"afu_mem_rsp_tag": 27
|
||||
"afu_mem_rsp_tag": "`VX_MEM_TAG_WIDTH+1"
|
||||
},
|
||||
"afu/vortex": {
|
||||
"!reset": 1,
|
||||
|
@ -140,49 +140,29 @@
|
|||
"mem_rsp_tag":"`VX_MEM_TAG_WIDTH",
|
||||
"busy": 1
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/fetch/icache_stage": {
|
||||
"?icache_req_fire": 1,
|
||||
"icache_req_wid":"`NW_BITS",
|
||||
"icache_req_addr": 32,
|
||||
"icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS",
|
||||
"?icache_rsp_fire": 1,
|
||||
"icache_rsp_data": 32,
|
||||
"icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS"
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/fetch/warp_sched": {
|
||||
"?wsched_scheduled": 1,
|
||||
"wsched_schedule_uuid": "`UUID_BITS",
|
||||
"wsched_active_warps": "`NUM_WARPS",
|
||||
"wsched_stalled_warps": "`NUM_WARPS",
|
||||
"wsched_schedule_tmask": "`NUM_THREADS",
|
||||
"wsched_schedule_wid": "`NW_BITS",
|
||||
"wsched_schedule_pc": "32"
|
||||
"wsched_schedule_pc": 32
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/gpu_unit": {
|
||||
"?gpu_rsp_valid": 1,
|
||||
"gpu_rsp_wid": "`NW_BITS",
|
||||
"gpu_rsp_tmc": 1,
|
||||
"gpu_rsp_wspawn": 1,
|
||||
"gpu_rsp_split": 1,
|
||||
"gpu_rsp_barrier": 1
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/lsu_unit": {
|
||||
"?dcache_req_fire":"`NUM_THREADS",
|
||||
"dcache_req_wid":"`NW_BITS",
|
||||
"dcache_req_pc": 32,
|
||||
"dcache_req_addr":"`NUM_THREADS * 32",
|
||||
"dcache_req_rw": 1,
|
||||
"dcache_req_byteen":"`NUM_THREADS * 4",
|
||||
"dcache_req_data": "`NUM_THREADS * 32",
|
||||
"dcache_req_tag":"`LSUQ_ADDR_BITS",
|
||||
"?dcache_rsp_fire":"`NUM_THREADS",
|
||||
"dcache_rsp_data":"`NUM_THREADS * 32",
|
||||
"dcache_rsp_tag":"`LSUQ_ADDR_BITS"
|
||||
"afu/vortex/cluster/core/pipeline/fetch/icache_stage": {
|
||||
"?icache_req_fire": 1,
|
||||
"icache_req_uuid": "`UUID_BITS",
|
||||
"icache_req_addr": 32,
|
||||
"icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS",
|
||||
"?icache_rsp_fire": 1,
|
||||
"icache_rsp_uuid": "`UUID_BITS",
|
||||
"icache_rsp_data": 32,
|
||||
"icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS"
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/issue": {
|
||||
"?issue_fire": 1,
|
||||
"issue_wid":"`NW_BITS",
|
||||
"issue_tmask":"`NUM_THREADS",
|
||||
"issue_pc": 32,
|
||||
"issue_uuid": "`UUID_BITS",
|
||||
"issue_tmask":"`NUM_THREADS",
|
||||
"issue_ex_type":"`EX_BITS",
|
||||
"issue_op_type":"`INST_OP_BITS",
|
||||
"issue_op_mod":"`INST_MOD_BITS",
|
||||
|
@ -198,15 +178,35 @@
|
|||
"gpr_rs2":"`NUM_THREADS * 32",
|
||||
"gpr_rs3":"`NUM_THREADS * 32",
|
||||
"?writeback_valid": 1,
|
||||
"writeback_wid":"`NW_BITS",
|
||||
"writeback_pc": 32,
|
||||
"writeback_uuid": "`UUID_BITS",
|
||||
"writeback_tmask":"`NUM_THREADS",
|
||||
"writeback_rd":"`NR_BITS",
|
||||
"writeback_data":"`NUM_THREADS * 32",
|
||||
"writeback_eop": 1,
|
||||
"!scoreboard_delay": 1,
|
||||
"!dispatch_delay": 1
|
||||
},
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/lsu_unit": {
|
||||
"?dcache_req_fire":"`NUM_THREADS",
|
||||
"dcache_req_uuid": "`UUID_BITS",
|
||||
"dcache_req_addr":"`NUM_THREADS * 32",
|
||||
"dcache_req_rw": 1,
|
||||
"dcache_req_byteen":"`NUM_THREADS * 4",
|
||||
"dcache_req_data":"`NUM_THREADS * 32",
|
||||
"dcache_req_tag":"`LSUQ_ADDR_BITS",
|
||||
"?dcache_rsp_fire":"`NUM_THREADS",
|
||||
"dcache_rsp_uuid": "`UUID_BITS",
|
||||
"dcache_rsp_data":"`NUM_THREADS * 32",
|
||||
"dcache_rsp_tag":"`LSUQ_ADDR_BITS"
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/gpu_unit": {
|
||||
"?gpu_rsp_valid": 1,
|
||||
"gpu_rsp_uuid": "`UUID_BITS",
|
||||
"gpu_rsp_tmc": 1,
|
||||
"gpu_rsp_wspawn": 1,
|
||||
"gpu_rsp_split": 1,
|
||||
"gpu_rsp_barrier": 1
|
||||
},
|
||||
"afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": {
|
||||
"?valid_st0": 1,
|
||||
"?valid_st1": 1,
|
||||
|
|
|
@ -23,7 +23,6 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
|
|||
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_TRACE_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
||||
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
BUILD_DIR ?= build
|
||||
|
||||
.PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64
|
||||
.PHONY: dogfood unittest pipeline smem cache fpu_core core vortex top1 top2 top4 top8 top16 top32 top64 texunit
|
||||
|
||||
dogfood:
|
||||
mkdir -p dogfood/$(BUILD_DIR)
|
||||
|
@ -75,4 +75,9 @@ top32:
|
|||
top64:
|
||||
mkdir -p top64/$(BUILD_DIR)
|
||||
cp top64/Makefile top64/$(BUILD_DIR)
|
||||
$(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 &
|
||||
$(MAKE) -C top64/$(BUILD_DIR) clean && $(MAKE) -C top64/$(BUILD_DIR) > top64/$(BUILD_DIR)/build.log 2>&1 &
|
||||
|
||||
texunit:
|
||||
mkdir -p texunit/$(BUILD_DIR)
|
||||
cp texunit/Makefile texunit/$(BUILD_DIR)
|
||||
$(MAKE) -C texunit/$(BUILD_DIR) clean && $(MAKE) -C texunit/$(BUILD_DIR) > texunit/$(BUILD_DIR)/build.log 2>&1 &
|
|
@ -2,6 +2,7 @@ PROJECT = Core
|
|||
TOP_LEVEL_ENTITY = VX_core
|
||||
SRC_FILE = VX_core.v
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = VX_fpu_fpga
|
|||
TOP_LEVEL_ENTITY = VX_fpu_fpga
|
||||
SRC_FILE = VX_fpu_fpga.v
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = VX_pipeline
|
|||
TOP_LEVEL_ENTITY = VX_pipeline
|
||||
SRC_FILE = VX_pipeline.v
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
81
hw/syn/quartus/texunit/Makefile
Normal file
81
hw/syn/quartus/texunit/Makefile
Normal file
|
@ -0,0 +1,81 @@
|
|||
PROJECT = Core
|
||||
TOP_LEVEL_ENTITY = VX_core
|
||||
SRC_FILE = VX_core.v
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
||||
|
||||
#FAMILY = "Stratix 10"
|
||||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Executable Configuration
|
||||
SYN_ARGS = --parallel --read_settings_files=on
|
||||
FIT_ARGS = --parallel --part=$(DEVICE) --read_settings_files=on
|
||||
ASM_ARGS =
|
||||
STA_ARGS = --parallel --do_report_timing
|
||||
|
||||
# Build targets
|
||||
all: $(PROJECT).sta.rpt
|
||||
|
||||
syn: $(PROJECT).syn.rpt
|
||||
|
||||
fit: $(PROJECT).fit.rpt
|
||||
|
||||
asm: $(PROJECT).asm.rpt
|
||||
|
||||
sta: $(PROJECT).sta.rpt
|
||||
|
||||
smart: smart.log
|
||||
|
||||
# Target implementations
|
||||
STAMP = echo done >
|
||||
|
||||
$(PROJECT).syn.rpt: smart.log syn.chg $(SOURCE_FILES)
|
||||
quartus_syn $(PROJECT) $(SYN_ARGS)
|
||||
$(STAMP) fit.chg
|
||||
|
||||
$(PROJECT).fit.rpt: smart.log fit.chg $(PROJECT).syn.rpt
|
||||
quartus_fit $(PROJECT) $(FIT_ARGS)
|
||||
$(STAMP) asm.chg
|
||||
$(STAMP) sta.chg
|
||||
|
||||
$(PROJECT).asm.rpt: smart.log asm.chg $(PROJECT).fit.rpt
|
||||
quartus_asm $(PROJECT) $(ASM_ARGS)
|
||||
|
||||
$(PROJECT).sta.rpt: smart.log sta.chg $(PROJECT).fit.rpt
|
||||
quartus_sta $(PROJECT) $(STA_ARGS)
|
||||
|
||||
smart.log: $(PROJECT_FILES)
|
||||
quartus_sh --determine_smart_action $(PROJECT) > smart.log
|
||||
|
||||
# Project initialization
|
||||
$(PROJECT_FILES):
|
||||
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "EXT_TEX_ENABLE=1"
|
||||
|
||||
syn.chg:
|
||||
$(STAMP) syn.chg
|
||||
|
||||
fit.chg:
|
||||
$(STAMP) fit.chg
|
||||
|
||||
sta.chg:
|
||||
$(STAMP) sta.chg
|
||||
|
||||
asm.chg:
|
||||
$(STAMP) asm.chg
|
||||
|
||||
program: $(PROJECT).sof
|
||||
quartus_pgm --no_banner --mode=jtag -o "$(PROJECT).sof"
|
||||
|
||||
clean:
|
||||
rm -rf bin *.rpt *.chg *.qsf *.qpf *.qws *.log *.htm *.eqn *.pin *.sof *.pof qdb incremental_db tmp-clearbox
|
|
@ -2,6 +2,7 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
PROJECT = vortex_afu
|
||||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR=../../../../rtl
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
#FAMILY = "Arria 10"
|
||||
#DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FAMILY = "Stratix 10"
|
|||
DEVICE = 1SX280HN2F43E2VG
|
||||
FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = vortex_afu
|
|||
TOP_LEVEL_ENTITY = vortex_afu
|
||||
SRC_FILE = vortex_afu.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = Unittest
|
|||
TOP_LEVEL_ENTITY = VX_core_req_bank_sel
|
||||
SRC_FILE = VX_core_req_bank_sel.v
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ PROJECT = Vortex
|
|||
TOP_LEVEL_ENTITY = Vortex
|
||||
SRC_FILE = Vortex.sv
|
||||
RTL_DIR = ../../../../rtl
|
||||
THIRD_PARTY_DIR = ../../../../../third_party
|
||||
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
|
@ -11,7 +12,7 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#DEVICE = 1SX280HN2F43E2VG
|
||||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(THIRD_PARTY_DIR)/fpnew/src;$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include;$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
|
|
46
miscs/patch/ramulator.patch
Normal file
46
miscs/patch/ramulator.patch
Normal file
|
@ -0,0 +1,46 @@
|
|||
diff --git a/Makefile b/Makefile
|
||||
index ea340c8..d2aac5b 100644
|
||||
--- a/Makefile
|
||||
+++ b/Makefile
|
||||
@@ -7,16 +7,16 @@ OBJS := $(patsubst $(SRCDIR)/%.cpp, $(OBJDIR)/%.o, $(SRCS))
|
||||
|
||||
# Ramulator currently supports g++ 5.1+ or clang++ 3.4+. It will NOT work with
|
||||
# g++ 4.x due to an internal compiler error when processing lambda functions.
|
||||
-CXX := clang++
|
||||
+#CXX := clang++
|
||||
# CXX := g++-5
|
||||
-CXXFLAGS := -O3 -std=c++11 -g -Wall
|
||||
+CXXFLAGS := -std=c++11 -O3 -g -Wall -fPIC
|
||||
|
||||
.PHONY: all clean depend
|
||||
|
||||
all: depend ramulator
|
||||
|
||||
clean:
|
||||
- rm -f ramulator
|
||||
+ rm -f ramulator libramulator.a
|
||||
rm -rf $(OBJDIR)
|
||||
|
||||
depend: $(OBJDIR)/.depend
|
||||
@@ -36,7 +36,7 @@ ramulator: $(MAIN) $(OBJS) $(SRCDIR)/*.h | depend
|
||||
$(CXX) $(CXXFLAGS) -DRAMULATOR -o $@ $(MAIN) $(OBJS)
|
||||
|
||||
libramulator.a: $(OBJS) $(OBJDIR)/Gem5Wrapper.o
|
||||
- libtool -static -o $@ $(OBJS) $(OBJDIR)/Gem5Wrapper.o
|
||||
+ $(AR) rcs $@ $^
|
||||
|
||||
$(OBJS): | $(OBJDIR)
|
||||
|
||||
diff --git a/src/Request.h b/src/Request.h
|
||||
index 57abd0d..a5ce061 100644
|
||||
--- a/src/Request.h
|
||||
+++ b/src/Request.h
|
||||
@@ -36,7 +36,7 @@ public:
|
||||
|
||||
Request(long addr, Type type, int coreid = 0)
|
||||
: is_first_command(true), addr(addr), coreid(coreid), type(type),
|
||||
- callback([](Request& req){}) {}
|
||||
+ callback([](Request&){}) {}
|
||||
|
||||
Request(long addr, Type type, function<void(Request&)> callback, int coreid = 0)
|
||||
: is_first_command(true), addr(addr), coreid(coreid), type(type), callback(callback) {}
|
|
@ -36,6 +36,6 @@ ELF:
|
|||
$(COMP) $(CC_FLAGS) $(VX_STR) $(VX_VEC) $(VX_FIO) $(NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_MAIN).c $(LIBS) -Iinclude -o $(VX_MAIN).elf
|
||||
|
||||
run:
|
||||
../../simX/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex -s -b 1> emulator.debug
|
||||
../../simx/obj_dir/Vcache_simX -E -a rv32i --core vx_vector_main.hex -s -b 1> emulator.debug
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ CFLAGS += -I./include -I../hw
|
|||
|
||||
PROJECT = libvortexrt
|
||||
|
||||
SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c
|
||||
SRCS = ./src/vx_start.S ./src/vx_syscalls.c ./src/vx_print.S ./src/tinyprintf.c ./src/vx_print.c ./src/vx_spawn.c ./src/vx_spawn.S ./src/vx_perf.c
|
||||
|
||||
OBJS := $(addsuffix .o, $(notdir $(SRCS)))
|
||||
|
||||
|
@ -26,7 +26,7 @@ $(PROJECT).dump: $(PROJECT).a
|
|||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
$(PROJECT).a: $(OBJS)
|
||||
$(AR) rcs $(PROJECT).a $^
|
||||
$(AR) rcs $@ $^
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CC) $(CFLAGS) -MM $^ > .depend;
|
||||
|
|
|
@ -5,115 +5,85 @@
|
|||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
#else
|
||||
#define __ASM_STR(x) #x
|
||||
#endif
|
||||
|
||||
#define vx_csr_swap(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_read(csr) ({ \
|
||||
register unsigned __v; \
|
||||
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_write(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
#define vx_csr_read_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
#define vx_csr_read_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
// Texture load
|
||||
#define vx_tex(unit, u, v, l) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __u = u; \
|
||||
unsigned __v = v; \
|
||||
unsigned __l = l; \
|
||||
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
|
||||
#define csr_read(csr) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ ("csrr %0, %1" : "=r" (__r) : "i" (csr)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
#else
|
||||
#define __ASM_STR(x) #x
|
||||
#endif
|
||||
|
||||
#define vx_csr_swap(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
#define csr_write(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrw %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
#define vx_csr_read(csr) ({ \
|
||||
register unsigned __v; \
|
||||
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
|
||||
__v; \
|
||||
#define csr_swap(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrw %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define vx_csr_write(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
#define csr_read_set(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrs %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define vx_csr_read_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
#define csr_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrs %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
#define vx_csr_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
#define csr_read_clear(csr, val) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrrc %0, %1, %2" : "=r" (__r) : "i" (csr), "r" (__v)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#define vx_csr_read_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
#define csr_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned)(val); \
|
||||
if (__builtin_constant_p(val) && __v < 32) \
|
||||
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "i" (__v)); \
|
||||
else \
|
||||
__asm__ __volatile__ ("csrc %0, %1" :: "i" (csr), "r" (__v)); \
|
||||
})
|
||||
|
||||
// Texture load
|
||||
#define vx_tex(unit, u, v, l) ({ \
|
||||
#define vx_tex(unit, u, v, lod) ({ \
|
||||
unsigned __r; \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 0, %1, %0, %2, %3, %4" : "=r"(__r) : "i"(unit), "r"(u), "r"(v), "r"(lod)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Conditional move
|
||||
#define vx_cmov(c, t, f) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __u = u; \
|
||||
unsigned __v = v; \
|
||||
unsigned __l = l; \
|
||||
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
|
||||
__asm__ __volatile__ (".insn r4 0x5b, 1, 0, %0, %1, %2, %3" : "=r"(__r : "r"(c), "r"(t), "r"(f)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
|
@ -151,7 +121,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
|||
|
||||
// Prefetch
|
||||
inline void vx_prefetch(unsigned addr) {
|
||||
asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) );
|
||||
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
|
|
890
runtime/src/tinyprintf.c
Normal file
890
runtime/src/tinyprintf.c
Normal file
|
@ -0,0 +1,890 @@
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
// \author (c) Marco Paland (info@paland.com)
|
||||
// 2014-2019, PALANDesign Hannover, Germany
|
||||
//
|
||||
// \license The MIT License (MIT)
|
||||
//
|
||||
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
// of this software and associated documentation files (the "Software"), to deal
|
||||
// in the Software without restriction, including without limitation the rights
|
||||
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
// copies of the Software, and to permit persons to whom the Software is
|
||||
// furnished to do so, subject to the following conditions:
|
||||
//
|
||||
// The above copyright notice and this permission notice shall be included in
|
||||
// all copies or substantial portions of the Software.
|
||||
//
|
||||
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
// THE SOFTWARE.
|
||||
//
|
||||
// \brief Tiny printf, sprintf and (v)snprintf implementation, optimized for speed on
|
||||
// embedded systems with a very limited resources. These routines are thread
|
||||
// safe and reentrant!
|
||||
// Use this instead of the bloated standard/newlib printf cause these use
|
||||
// malloc for printf (and may not be thread safe).
|
||||
//
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include "tinyprintf.h"
|
||||
#include "vx_print.h"
|
||||
|
||||
|
||||
// define this globally (e.g. gcc -DPRINTF_INCLUDE_CONFIG_H ...) to include the
|
||||
// printf_config.h header file
|
||||
// default: undefined
|
||||
#ifdef PRINTF_INCLUDE_CONFIG_H
|
||||
#include "printf_config.h"
|
||||
#endif
|
||||
|
||||
|
||||
// 'ntoa' conversion buffer size, this must be big enough to hold one converted
|
||||
// numeric number including padded zeros (dynamically created on stack)
|
||||
// default: 32 byte
|
||||
#ifndef PRINTF_NTOA_BUFFER_SIZE
|
||||
#define PRINTF_NTOA_BUFFER_SIZE 32U
|
||||
#endif
|
||||
|
||||
// 'ftoa' conversion buffer size, this must be big enough to hold one converted
|
||||
// float number including padded zeros (dynamically created on stack)
|
||||
// default: 32 byte
|
||||
#ifndef PRINTF_FTOA_BUFFER_SIZE
|
||||
#define PRINTF_FTOA_BUFFER_SIZE 32U
|
||||
#endif
|
||||
|
||||
// support for the floating point type (%f)
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_FLOAT
|
||||
#define PRINTF_SUPPORT_FLOAT
|
||||
#endif
|
||||
|
||||
// support for exponential floating point notation (%e/%g)
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_EXPONENTIAL
|
||||
#define PRINTF_SUPPORT_EXPONENTIAL
|
||||
#endif
|
||||
|
||||
// define the default floating point precision
|
||||
// default: 6 digits
|
||||
#ifndef PRINTF_DEFAULT_FLOAT_PRECISION
|
||||
#define PRINTF_DEFAULT_FLOAT_PRECISION 6U
|
||||
#endif
|
||||
|
||||
// define the largest float suitable to print with %f
|
||||
// default: 1e9
|
||||
#ifndef PRINTF_MAX_FLOAT
|
||||
#define PRINTF_MAX_FLOAT 1e9
|
||||
#endif
|
||||
|
||||
// support for the long long types (%llu or %p)
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_LONG_LONG
|
||||
#define PRINTF_SUPPORT_LONG_LONG
|
||||
#endif
|
||||
|
||||
// support for the ptrdiff_t type (%t)
|
||||
// ptrdiff_t is normally defined in <stddef.h> as long or long long type
|
||||
// default: activated
|
||||
#ifndef PRINTF_DISABLE_SUPPORT_PTRDIFF_T
|
||||
#define PRINTF_SUPPORT_PTRDIFF_T
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// internal flag definitions
|
||||
#define FLAGS_ZEROPAD (1U << 0U)
|
||||
#define FLAGS_LEFT (1U << 1U)
|
||||
#define FLAGS_PLUS (1U << 2U)
|
||||
#define FLAGS_SPACE (1U << 3U)
|
||||
#define FLAGS_HASH (1U << 4U)
|
||||
#define FLAGS_UPPERCASE (1U << 5U)
|
||||
#define FLAGS_CHAR (1U << 6U)
|
||||
#define FLAGS_SHORT (1U << 7U)
|
||||
#define FLAGS_LONG (1U << 8U)
|
||||
#define FLAGS_LONG_LONG (1U << 9U)
|
||||
#define FLAGS_PRECISION (1U << 10U)
|
||||
#define FLAGS_ADAPT_EXP (1U << 11U)
|
||||
|
||||
|
||||
// import float.h for DBL_MAX
|
||||
#if defined(PRINTF_SUPPORT_FLOAT)
|
||||
#include <float.h>
|
||||
#endif
|
||||
|
||||
|
||||
// output function type
|
||||
typedef void (*out_fct_type)(char character, void* buffer, size_t idx, size_t maxlen);
|
||||
|
||||
|
||||
// wrapper (used as buffer) for output function type
|
||||
typedef struct {
|
||||
void (*fct)(char character, void* arg);
|
||||
void* arg;
|
||||
} out_fct_wrap_type;
|
||||
|
||||
|
||||
// internal buffer output
|
||||
static inline void _out_buffer(char character, void* buffer, size_t idx, size_t maxlen)
|
||||
{
|
||||
if (idx < maxlen) {
|
||||
((char*)buffer)[idx] = character;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// internal null output
|
||||
static inline void _out_null(char character, void* buffer, size_t idx, size_t maxlen)
|
||||
{
|
||||
(void)character; (void)buffer; (void)idx; (void)maxlen;
|
||||
}
|
||||
|
||||
|
||||
// internal _putchar wrapper
|
||||
static inline void _out_char(char character, void* buffer, size_t idx, size_t maxlen)
|
||||
{
|
||||
(void)buffer; (void)idx; (void)maxlen;
|
||||
if (character) {
|
||||
vx_putchar(character);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// internal output function wrapper
|
||||
static inline void _out_fct(char character, void* buffer, size_t idx, size_t maxlen)
|
||||
{
|
||||
(void)idx; (void)maxlen;
|
||||
if (character) {
|
||||
// buffer is the output fct pointer
|
||||
((out_fct_wrap_type*)buffer)->fct(character, ((out_fct_wrap_type*)buffer)->arg);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// internal secure strlen
|
||||
// \return The length of the string (excluding the terminating 0) limited by 'maxsize'
|
||||
static inline unsigned int _strnlen_s(const char* str, size_t maxsize)
|
||||
{
|
||||
const char* s;
|
||||
for (s = str; *s && maxsize--; ++s);
|
||||
return (unsigned int)(s - str);
|
||||
}
|
||||
|
||||
|
||||
// internal test if char is a digit (0-9)
|
||||
// \return true if char is a digit
|
||||
static inline bool _is_digit(char ch)
|
||||
{
|
||||
return (ch >= '0') && (ch <= '9');
|
||||
}
|
||||
|
||||
|
||||
// internal ASCII string to unsigned int conversion
|
||||
static unsigned int _atoi(const char** str)
|
||||
{
|
||||
unsigned int i = 0U;
|
||||
while (_is_digit(**str)) {
|
||||
i = i * 10U + (unsigned int)(*((*str)++) - '0');
|
||||
}
|
||||
return i;
|
||||
}
|
||||
|
||||
|
||||
// output the specified string in reverse, taking care of any zero-padding
|
||||
static size_t _out_rev(out_fct_type out, char* buffer, size_t idx, size_t maxlen, const char* buf, size_t len, unsigned int width, unsigned int flags)
|
||||
{
|
||||
const size_t start_idx = idx;
|
||||
|
||||
// pad spaces up to given width
|
||||
if (!(flags & FLAGS_LEFT) && !(flags & FLAGS_ZEROPAD)) {
|
||||
for (size_t i = len; i < width; i++) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
|
||||
// reverse string
|
||||
while (len) {
|
||||
out(buf[--len], buffer, idx++, maxlen);
|
||||
}
|
||||
|
||||
// append pad spaces up to given width
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (idx - start_idx < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
|
||||
return idx;
|
||||
}
|
||||
|
||||
|
||||
// internal itoa format
|
||||
static size_t _ntoa_format(out_fct_type out, char* buffer, size_t idx, size_t maxlen, char* buf, size_t len, bool negative, unsigned int base, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
// pad leading zeros
|
||||
if (!(flags & FLAGS_LEFT)) {
|
||||
if (width && (flags & FLAGS_ZEROPAD) && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
|
||||
width--;
|
||||
}
|
||||
while ((len < prec) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
while ((flags & FLAGS_ZEROPAD) && (len < width) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
}
|
||||
|
||||
// handle hash
|
||||
if (flags & FLAGS_HASH) {
|
||||
if (!(flags & FLAGS_PRECISION) && len && ((len == prec) || (len == width))) {
|
||||
len--;
|
||||
if (len && (base == 16U)) {
|
||||
len--;
|
||||
}
|
||||
}
|
||||
if ((base == 16U) && !(flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = 'x';
|
||||
}
|
||||
else if ((base == 16U) && (flags & FLAGS_UPPERCASE) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = 'X';
|
||||
}
|
||||
else if ((base == 2U) && (len < PRINTF_NTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = 'b';
|
||||
}
|
||||
if (len < PRINTF_NTOA_BUFFER_SIZE) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
}
|
||||
|
||||
if (len < PRINTF_NTOA_BUFFER_SIZE) {
|
||||
if (negative) {
|
||||
buf[len++] = '-';
|
||||
}
|
||||
else if (flags & FLAGS_PLUS) {
|
||||
buf[len++] = '+'; // ignore the space if the '+' exists
|
||||
}
|
||||
else if (flags & FLAGS_SPACE) {
|
||||
buf[len++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
|
||||
}
|
||||
|
||||
|
||||
// internal itoa for 'long' type
|
||||
static size_t _ntoa_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long value, bool negative, unsigned long base, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
char buf[PRINTF_NTOA_BUFFER_SIZE];
|
||||
size_t len = 0U;
|
||||
|
||||
// no hash for 0 values
|
||||
if (!value) {
|
||||
flags &= ~FLAGS_HASH;
|
||||
}
|
||||
|
||||
// write if precision != 0 and value is != 0
|
||||
if (!(flags & FLAGS_PRECISION) || value) {
|
||||
do {
|
||||
const char digit = (char)(value % base);
|
||||
buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;
|
||||
value /= base;
|
||||
} while (value && (len < PRINTF_NTOA_BUFFER_SIZE));
|
||||
}
|
||||
|
||||
return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
|
||||
}
|
||||
|
||||
|
||||
// internal itoa for 'long long' type
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
static size_t _ntoa_long_long(out_fct_type out, char* buffer, size_t idx, size_t maxlen, unsigned long long value, bool negative, unsigned long long base, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
char buf[PRINTF_NTOA_BUFFER_SIZE];
|
||||
size_t len = 0U;
|
||||
|
||||
// no hash for 0 values
|
||||
if (!value) {
|
||||
flags &= ~FLAGS_HASH;
|
||||
}
|
||||
|
||||
// write if precision != 0 and value is != 0
|
||||
if (!(flags & FLAGS_PRECISION) || value) {
|
||||
do {
|
||||
const char digit = (char)(value % base);
|
||||
buf[len++] = digit < 10 ? '0' + digit : (flags & FLAGS_UPPERCASE ? 'A' : 'a') + digit - 10;
|
||||
value /= base;
|
||||
} while (value && (len < PRINTF_NTOA_BUFFER_SIZE));
|
||||
}
|
||||
|
||||
return _ntoa_format(out, buffer, idx, maxlen, buf, len, negative, (unsigned int)base, prec, width, flags);
|
||||
}
|
||||
#endif // PRINTF_SUPPORT_LONG_LONG
|
||||
|
||||
|
||||
#if defined(PRINTF_SUPPORT_FLOAT)
|
||||
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
// forward declaration so that _ftoa can switch to exp notation for values > PRINTF_MAX_FLOAT
|
||||
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags);
|
||||
#endif
|
||||
|
||||
|
||||
// internal ftoa for fixed decimal floating point
|
||||
static size_t _ftoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
char buf[PRINTF_FTOA_BUFFER_SIZE];
|
||||
size_t len = 0U;
|
||||
double diff = 0.0;
|
||||
|
||||
// powers of 10
|
||||
static const double pow10[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 };
|
||||
|
||||
// test for special values
|
||||
if (value != value)
|
||||
return _out_rev(out, buffer, idx, maxlen, "nan", 3, width, flags);
|
||||
if (value < -DBL_MAX)
|
||||
return _out_rev(out, buffer, idx, maxlen, "fni-", 4, width, flags);
|
||||
if (value > DBL_MAX)
|
||||
return _out_rev(out, buffer, idx, maxlen, (flags & FLAGS_PLUS) ? "fni+" : "fni", (flags & FLAGS_PLUS) ? 4U : 3U, width, flags);
|
||||
|
||||
// test for very large values
|
||||
// standard printf behavior is to print EVERY whole number digit -- which could be 100s of characters overflowing your buffers == bad
|
||||
if ((value > PRINTF_MAX_FLOAT) || (value < -PRINTF_MAX_FLOAT)) {
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
return _etoa(out, buffer, idx, maxlen, value, prec, width, flags);
|
||||
#else
|
||||
return 0U;
|
||||
#endif
|
||||
}
|
||||
|
||||
// test for negative
|
||||
bool negative = false;
|
||||
if (value < 0) {
|
||||
negative = true;
|
||||
value = 0 - value;
|
||||
}
|
||||
|
||||
// set default precision, if not set explicitly
|
||||
if (!(flags & FLAGS_PRECISION)) {
|
||||
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
|
||||
}
|
||||
// limit precision to 9, cause a prec >= 10 can lead to overflow errors
|
||||
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (prec > 9U)) {
|
||||
buf[len++] = '0';
|
||||
prec--;
|
||||
}
|
||||
|
||||
int whole = (int)value;
|
||||
double tmp = (value - whole) * pow10[prec];
|
||||
unsigned long frac = (unsigned long)tmp;
|
||||
diff = tmp - frac;
|
||||
|
||||
if (diff > 0.5) {
|
||||
++frac;
|
||||
// handle rollover, e.g. case 0.99 with prec 1 is 1.0
|
||||
if (frac >= pow10[prec]) {
|
||||
frac = 0;
|
||||
++whole;
|
||||
}
|
||||
}
|
||||
else if (diff < 0.5) {
|
||||
}
|
||||
else if ((frac == 0U) || (frac & 1U)) {
|
||||
// if halfway, round up if odd OR if last digit is 0
|
||||
++frac;
|
||||
}
|
||||
|
||||
if (prec == 0U) {
|
||||
diff = value - (double)whole;
|
||||
if ((!(diff < 0.5) || (diff > 0.5)) && (whole & 1)) {
|
||||
// exactly 0.5 and ODD, then round up
|
||||
// 1.5 -> 2, but 2.5 -> 2
|
||||
++whole;
|
||||
}
|
||||
}
|
||||
else {
|
||||
unsigned int count = prec;
|
||||
// now do fractional part, as an unsigned number
|
||||
while (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
--count;
|
||||
buf[len++] = (char)(48U + (frac % 10U));
|
||||
if (!(frac /= 10U)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// add extra 0s
|
||||
while ((len < PRINTF_FTOA_BUFFER_SIZE) && (count-- > 0U)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
if (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
// add decimal
|
||||
buf[len++] = '.';
|
||||
}
|
||||
}
|
||||
|
||||
// do whole part, number is reversed
|
||||
while (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
buf[len++] = (char)(48 + (whole % 10));
|
||||
if (!(whole /= 10)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// pad leading zeros
|
||||
if (!(flags & FLAGS_LEFT) && (flags & FLAGS_ZEROPAD)) {
|
||||
if (width && (negative || (flags & (FLAGS_PLUS | FLAGS_SPACE)))) {
|
||||
width--;
|
||||
}
|
||||
while ((len < width) && (len < PRINTF_FTOA_BUFFER_SIZE)) {
|
||||
buf[len++] = '0';
|
||||
}
|
||||
}
|
||||
|
||||
if (len < PRINTF_FTOA_BUFFER_SIZE) {
|
||||
if (negative) {
|
||||
buf[len++] = '-';
|
||||
}
|
||||
else if (flags & FLAGS_PLUS) {
|
||||
buf[len++] = '+'; // ignore the space if the '+' exists
|
||||
}
|
||||
else if (flags & FLAGS_SPACE) {
|
||||
buf[len++] = ' ';
|
||||
}
|
||||
}
|
||||
|
||||
return _out_rev(out, buffer, idx, maxlen, buf, len, width, flags);
|
||||
}
|
||||
|
||||
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
// internal ftoa variant for exponential floating-point type, contributed by Martijn Jasperse <m.jasperse@gmail.com>
|
||||
static size_t _etoa(out_fct_type out, char* buffer, size_t idx, size_t maxlen, double value, unsigned int prec, unsigned int width, unsigned int flags)
|
||||
{
|
||||
// check for NaN and special values
|
||||
if ((value != value) || (value > DBL_MAX) || (value < -DBL_MAX)) {
|
||||
return _ftoa(out, buffer, idx, maxlen, value, prec, width, flags);
|
||||
}
|
||||
|
||||
// determine the sign
|
||||
const bool negative = value < 0;
|
||||
if (negative) {
|
||||
value = -value;
|
||||
}
|
||||
|
||||
// default precision
|
||||
if (!(flags & FLAGS_PRECISION)) {
|
||||
prec = PRINTF_DEFAULT_FLOAT_PRECISION;
|
||||
}
|
||||
|
||||
// determine the decimal exponent
|
||||
// based on the algorithm by David Gay (https://www.ampl.com/netlib/fp/dtoa.c)
|
||||
union {
|
||||
uint64_t U;
|
||||
double F;
|
||||
} conv;
|
||||
|
||||
conv.F = value;
|
||||
int exp2 = (int)((conv.U >> 52U) & 0x07FFU) - 1023; // effectively log2
|
||||
conv.U = (conv.U & ((1ULL << 52U) - 1U)) | (1023ULL << 52U); // drop the exponent so conv.F is now in [1,2)
|
||||
// now approximate log10 from the log2 integer part and an expansion of ln around 1.5
|
||||
int expval = (int)(0.1760912590558 + exp2 * 0.301029995663981 + (conv.F - 1.5) * 0.289529654602168);
|
||||
// now we want to compute 10^expval but we want to be sure it won't overflow
|
||||
exp2 = (int)(expval * 3.321928094887362 + 0.5);
|
||||
const double z = expval * 2.302585092994046 - exp2 * 0.6931471805599453;
|
||||
const double z2 = z * z;
|
||||
conv.U = (uint64_t)(exp2 + 1023) << 52U;
|
||||
// compute exp(z) using continued fractions, see https://en.wikipedia.org/wiki/Exponential_function#Continued_fractions_for_ex
|
||||
conv.F *= 1 + 2 * z / (2 - z + (z2 / (6 + (z2 / (10 + z2 / 14)))));
|
||||
// correct for rounding errors
|
||||
if (value < conv.F) {
|
||||
expval--;
|
||||
conv.F /= 10;
|
||||
}
|
||||
|
||||
// the exponent format is "%+03d" and largest value is "307", so set aside 4-5 characters
|
||||
unsigned int minwidth = ((expval < 100) && (expval > -100)) ? 4U : 5U;
|
||||
|
||||
// in "%g" mode, "prec" is the number of *significant figures* not decimals
|
||||
if (flags & FLAGS_ADAPT_EXP) {
|
||||
// do we want to fall-back to "%f" mode?
|
||||
if ((value >= 1e-4) && (value < 1e6)) {
|
||||
if ((int)prec > expval) {
|
||||
prec = (unsigned)((int)prec - expval - 1);
|
||||
}
|
||||
else {
|
||||
prec = 0;
|
||||
}
|
||||
flags |= FLAGS_PRECISION; // make sure _ftoa respects precision
|
||||
// no characters in exponent
|
||||
minwidth = 0U;
|
||||
expval = 0;
|
||||
}
|
||||
else {
|
||||
// we use one sigfig for the whole part
|
||||
if ((prec > 0) && (flags & FLAGS_PRECISION)) {
|
||||
--prec;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// will everything fit?
|
||||
unsigned int fwidth = width;
|
||||
if (width > minwidth) {
|
||||
// we didn't fall-back so subtract the characters required for the exponent
|
||||
fwidth -= minwidth;
|
||||
} else {
|
||||
// not enough characters, so go back to default sizing
|
||||
fwidth = 0U;
|
||||
}
|
||||
if ((flags & FLAGS_LEFT) && minwidth) {
|
||||
// if we're padding on the right, DON'T pad the floating part
|
||||
fwidth = 0U;
|
||||
}
|
||||
|
||||
// rescale the float value
|
||||
if (expval) {
|
||||
value /= conv.F;
|
||||
}
|
||||
|
||||
// output the floating part
|
||||
const size_t start_idx = idx;
|
||||
idx = _ftoa(out, buffer, idx, maxlen, negative ? -value : value, prec, fwidth, flags & ~FLAGS_ADAPT_EXP);
|
||||
|
||||
// output the exponent part
|
||||
if (minwidth) {
|
||||
// output the exponential symbol
|
||||
out((flags & FLAGS_UPPERCASE) ? 'E' : 'e', buffer, idx++, maxlen);
|
||||
// output the exponent value
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (expval < 0) ? -expval : expval, expval < 0, 10, 0, minwidth-1, FLAGS_ZEROPAD | FLAGS_PLUS);
|
||||
// might need to right-pad spaces
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (idx - start_idx < width) out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
return idx;
|
||||
}
|
||||
#endif // PRINTF_SUPPORT_EXPONENTIAL
|
||||
#endif // PRINTF_SUPPORT_FLOAT
|
||||
|
||||
|
||||
// internal vsnprintf
|
||||
static int _vsnprintf(out_fct_type out, char* buffer, const size_t maxlen, const char* format, va_list va) {
|
||||
unsigned int flags, width, precision, n;
|
||||
size_t idx = 0U;
|
||||
|
||||
if (!buffer) {
|
||||
// use null output function
|
||||
out = _out_null;
|
||||
}
|
||||
|
||||
while (*format)
|
||||
{
|
||||
// format specifier? %[flags][width][.precision][length]
|
||||
if (*format != '%') {
|
||||
// no
|
||||
out(*format, buffer, idx++, maxlen);
|
||||
format++;
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
// yes, evaluate it
|
||||
format++;
|
||||
}
|
||||
|
||||
// evaluate flags
|
||||
flags = 0U;
|
||||
do {
|
||||
switch (*format) {
|
||||
case '0': flags |= FLAGS_ZEROPAD; format++; n = 1U; break;
|
||||
case '-': flags |= FLAGS_LEFT; format++; n = 1U; break;
|
||||
case '+': flags |= FLAGS_PLUS; format++; n = 1U; break;
|
||||
case ' ': flags |= FLAGS_SPACE; format++; n = 1U; break;
|
||||
case '#': flags |= FLAGS_HASH; format++; n = 1U; break;
|
||||
default : n = 0U; break;
|
||||
}
|
||||
} while (n);
|
||||
|
||||
// evaluate width field
|
||||
width = 0U;
|
||||
if (_is_digit(*format)) {
|
||||
width = _atoi(&format);
|
||||
}
|
||||
else if (*format == '*') {
|
||||
const int w = va_arg(va, int);
|
||||
if (w < 0) {
|
||||
flags |= FLAGS_LEFT; // reverse padding
|
||||
width = (unsigned int)-w;
|
||||
}
|
||||
else {
|
||||
width = (unsigned int)w;
|
||||
}
|
||||
format++;
|
||||
}
|
||||
|
||||
// evaluate precision field
|
||||
precision = 0U;
|
||||
if (*format == '.') {
|
||||
flags |= FLAGS_PRECISION;
|
||||
format++;
|
||||
if (_is_digit(*format)) {
|
||||
precision = _atoi(&format);
|
||||
}
|
||||
else if (*format == '*') {
|
||||
const int prec = (int)va_arg(va, int);
|
||||
precision = prec > 0 ? (unsigned int)prec : 0U;
|
||||
format++;
|
||||
}
|
||||
}
|
||||
|
||||
// evaluate length field
|
||||
switch (*format) {
|
||||
case 'l' :
|
||||
flags |= FLAGS_LONG;
|
||||
format++;
|
||||
if (*format == 'l') {
|
||||
flags |= FLAGS_LONG_LONG;
|
||||
format++;
|
||||
}
|
||||
break;
|
||||
case 'h' :
|
||||
flags |= FLAGS_SHORT;
|
||||
format++;
|
||||
if (*format == 'h') {
|
||||
flags |= FLAGS_CHAR;
|
||||
format++;
|
||||
}
|
||||
break;
|
||||
#if defined(PRINTF_SUPPORT_PTRDIFF_T)
|
||||
case 't' :
|
||||
flags |= (sizeof(ptrdiff_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
|
||||
format++;
|
||||
break;
|
||||
#endif
|
||||
case 'j' :
|
||||
flags |= (sizeof(intmax_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
|
||||
format++;
|
||||
break;
|
||||
case 'z' :
|
||||
flags |= (sizeof(size_t) == sizeof(long) ? FLAGS_LONG : FLAGS_LONG_LONG);
|
||||
format++;
|
||||
break;
|
||||
default :
|
||||
break;
|
||||
}
|
||||
|
||||
// evaluate specifier
|
||||
switch (*format) {
|
||||
case 'd' :
|
||||
case 'i' :
|
||||
case 'u' :
|
||||
case 'x' :
|
||||
case 'X' :
|
||||
case 'o' :
|
||||
case 'b' : {
|
||||
// set the base
|
||||
unsigned int base;
|
||||
if (*format == 'x' || *format == 'X') {
|
||||
base = 16U;
|
||||
}
|
||||
else if (*format == 'o') {
|
||||
base = 8U;
|
||||
}
|
||||
else if (*format == 'b') {
|
||||
base = 2U;
|
||||
}
|
||||
else {
|
||||
base = 10U;
|
||||
flags &= ~FLAGS_HASH; // no hash for dec format
|
||||
}
|
||||
// uppercase
|
||||
if (*format == 'X') {
|
||||
flags |= FLAGS_UPPERCASE;
|
||||
}
|
||||
|
||||
// no plus or space flag for u, x, X, o, b
|
||||
if ((*format != 'i') && (*format != 'd')) {
|
||||
flags &= ~(FLAGS_PLUS | FLAGS_SPACE);
|
||||
}
|
||||
|
||||
// ignore '0' flag when precision is given
|
||||
if (flags & FLAGS_PRECISION) {
|
||||
flags &= ~FLAGS_ZEROPAD;
|
||||
}
|
||||
|
||||
// convert the integer
|
||||
if ((*format == 'i') || (*format == 'd')) {
|
||||
// signed
|
||||
if (flags & FLAGS_LONG_LONG) {
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
const long long value = va_arg(va, long long);
|
||||
idx = _ntoa_long_long(out, buffer, idx, maxlen, (unsigned long long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
|
||||
#endif
|
||||
}
|
||||
else if (flags & FLAGS_LONG) {
|
||||
const long value = va_arg(va, long);
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
|
||||
}
|
||||
else {
|
||||
const int value = (flags & FLAGS_CHAR) ? (char)va_arg(va, int) : (flags & FLAGS_SHORT) ? (short int)va_arg(va, int) : va_arg(va, int);
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned int)(value > 0 ? value : 0 - value), value < 0, base, precision, width, flags);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// unsigned
|
||||
if (flags & FLAGS_LONG_LONG) {
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
idx = _ntoa_long_long(out, buffer, idx, maxlen, va_arg(va, unsigned long long), false, base, precision, width, flags);
|
||||
#endif
|
||||
}
|
||||
else if (flags & FLAGS_LONG) {
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, va_arg(va, unsigned long), false, base, precision, width, flags);
|
||||
}
|
||||
else {
|
||||
const unsigned int value = (flags & FLAGS_CHAR) ? (unsigned char)va_arg(va, unsigned int) : (flags & FLAGS_SHORT) ? (unsigned short int)va_arg(va, unsigned int) : va_arg(va, unsigned int);
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, value, false, base, precision, width, flags);
|
||||
}
|
||||
}
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
#if defined(PRINTF_SUPPORT_FLOAT)
|
||||
case 'f' :
|
||||
case 'F' :
|
||||
if (*format == 'F') flags |= FLAGS_UPPERCASE;
|
||||
idx = _ftoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
|
||||
format++;
|
||||
break;
|
||||
#if defined(PRINTF_SUPPORT_EXPONENTIAL)
|
||||
case 'e':
|
||||
case 'E':
|
||||
case 'g':
|
||||
case 'G':
|
||||
if ((*format == 'g')||(*format == 'G')) flags |= FLAGS_ADAPT_EXP;
|
||||
if ((*format == 'E')||(*format == 'G')) flags |= FLAGS_UPPERCASE;
|
||||
idx = _etoa(out, buffer, idx, maxlen, va_arg(va, double), precision, width, flags);
|
||||
format++;
|
||||
break;
|
||||
#endif // PRINTF_SUPPORT_EXPONENTIAL
|
||||
#endif // PRINTF_SUPPORT_FLOAT
|
||||
case 'c' : {
|
||||
unsigned int l = 1U;
|
||||
// pre padding
|
||||
if (!(flags & FLAGS_LEFT)) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
// char output
|
||||
out((char)va_arg(va, int), buffer, idx++, maxlen);
|
||||
// post padding
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
|
||||
case 's' : {
|
||||
const char* p = va_arg(va, char*);
|
||||
unsigned int l = _strnlen_s(p, precision ? precision : (size_t)-1);
|
||||
// pre padding
|
||||
if (flags & FLAGS_PRECISION) {
|
||||
l = (l < precision ? l : precision);
|
||||
}
|
||||
if (!(flags & FLAGS_LEFT)) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
// string output
|
||||
while ((*p != 0) && (!(flags & FLAGS_PRECISION) || precision--)) {
|
||||
out(*(p++), buffer, idx++, maxlen);
|
||||
}
|
||||
// post padding
|
||||
if (flags & FLAGS_LEFT) {
|
||||
while (l++ < width) {
|
||||
out(' ', buffer, idx++, maxlen);
|
||||
}
|
||||
}
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
|
||||
case 'p' : {
|
||||
width = sizeof(void*) * 2U;
|
||||
flags |= FLAGS_ZEROPAD | FLAGS_UPPERCASE;
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
const bool is_ll = sizeof(uintptr_t) == sizeof(long long);
|
||||
if (is_ll) {
|
||||
idx = _ntoa_long_long(out, buffer, idx, maxlen, (uintptr_t)va_arg(va, void*), false, 16U, precision, width, flags);
|
||||
}
|
||||
else {
|
||||
#endif
|
||||
idx = _ntoa_long(out, buffer, idx, maxlen, (unsigned long)((uintptr_t)va_arg(va, void*)), false, 16U, precision, width, flags);
|
||||
#if defined(PRINTF_SUPPORT_LONG_LONG)
|
||||
}
|
||||
#endif
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
|
||||
case '%' :
|
||||
out('%', buffer, idx++, maxlen);
|
||||
format++;
|
||||
break;
|
||||
|
||||
default :
|
||||
out(*format, buffer, idx++, maxlen);
|
||||
format++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// termination
|
||||
out((char)0, buffer, idx < maxlen ? idx : maxlen - 1U, maxlen);
|
||||
|
||||
// return written chars without terminating \0
|
||||
return (int)idx;
|
||||
}
|
||||
|
||||
int tiny_printf(const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
char buffer[1];
|
||||
const int ret = _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
|
||||
va_end(va);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tiny_sprintf(char* buffer, const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
const int ret = _vsnprintf(_out_buffer, buffer, (size_t)-1, format, va);
|
||||
va_end(va);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tiny_snprintf(char* buffer, size_t count, const char* format, ...) {
|
||||
va_list va;
|
||||
va_start(va, format);
|
||||
const int ret = _vsnprintf(_out_buffer, buffer, count, format, va);
|
||||
va_end(va);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int tiny_vprintf(const char* format, va_list va) {
|
||||
char buffer[1];
|
||||
return _vsnprintf(_out_char, buffer, (size_t)-1, format, va);
|
||||
}
|
||||
|
||||
int tiny_vsnprintf(char* buffer, size_t count, const char* format, va_list va) {
|
||||
return _vsnprintf(_out_buffer, buffer, count, format, va);
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue