merge from master branch

This commit is contained in:
Jaewon Lee 2024-09-12 10:32:02 -04:00
commit e91eb4aed4
124 changed files with 1933 additions and 1718 deletions

View file

@ -219,7 +219,9 @@ jobs:
runs-on: ubuntu-20.04
needs: build_vm
strategy:
fail-fast: false
matrix:
name: [regression, opencl, cache, config1, config2, debug, stress, vm]
xlen: [32, 64]
steps:
@ -267,4 +269,4 @@ jobs:
steps:
- name: Check Completion
run: echo "All matrix jobs passed"
run: echo "All matrix jobs passed"

View file

@ -44,10 +44,10 @@ clean: clean-build
$(MAKE) -C $(VORTEX_HOME)/third_party clean
# Install setup
KERNEL_INC_DST = $(PREFIX)/kernel/include
KERNEL_LIB_DST = $(PREFIX)/kernel/lib$(XLEN)
RUNTIME_INC_DST = $(PREFIX)/runtime/include
RUNTIME_LIB_DST = $(PREFIX)/runtime/lib
KERNEL_INC_DST = $(INSTALLDIR)/kernel/include
KERNEL_LIB_DST = $(INSTALLDIR)/kernel/lib$(XLEN)
RUNTIME_INC_DST = $(INSTALLDIR)/runtime/include
RUNTIME_LIB_DST = $(INSTALLDIR)/runtime/lib
KERNEL_HEADERS = $(wildcard $(VORTEX_HOME)/kernel/include/*.h)
KERNEL_LIBS = $(wildcard kernel/*.a)

View file

@ -1,5 +1,3 @@
[![Build Status](https://travis-ci.com/vortexgpgpu/vortex.svg?branch=master)](https://travis-ci.com/vortexgpgpu/vortex)
# Vortex GPGPU
Vortex is a full-stack open-source RISC-V GPGPU.
@ -47,20 +45,20 @@ More detailed build instructions can be found [here](docs/install_vortex.md).
- [Yosys](https://github.com/YosysHQ/yosys)
- [Sv2v](https://github.com/zachjs/sv2v)
### Install development tools
```
sudo apt-get install build-essential
sudo apt-get install binutils
sudo apt-get install python
sudo apt-get install uuid-dev
sudo apt-get install git
```sh
sudo apt-get install build-essential
sudo apt-get install binutils
sudo apt-get install python
sudo apt-get install uuid-dev
sudo apt-get install git
```
### Install Vortex codebase
```sh
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
```
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git -b vortex_vm
cd vortex
```
### Configure your build folder
```sh
#
# By default, the toolchain default install location is the /opt folder and can be overridden by setting --tooldir.
# This is the example for volvo server
@ -72,38 +70,45 @@ More detailed build instructions can be found [here](docs/install_vortex.md).
../configure --xlen=32 --tooldir=/software/vortex-toolchain-2024-2024-08-09 --prefix=$OUT_DIR
# Run the following instead to enable virtual memory feature in compilation
../configure --xlen=32 --tooldir=/software/vortex-toolchain-2024-2024-08-09 --prefix=$OUT_DIR --vm_enable=1
```
### Install prebuilt toolchain
We will use the precompiled tools in the volvo toolchain directory.
### set environment variables
```sh
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
```
### Building Vortex
make -s
```sh
make -s
```
### Quick demo running vecadd OpenCL kernel on 2 cores
$ ./ci/blackbox.sh --cores=2 --app=vecadd
```sh
./ci/blackbox.sh --cores=2 --app=vecadd
```
### Common Developer Tips
- Installing Vortex kernel and runtime libraries to use with external tools requires passing --prefix=<install-path> to the configure script.
```sh
$ ../configure --xlen=32 --tooldir=$HOME/tools --prefix=<install-path>
$ make -s
$ make install
``````
```sh
../configure --xlen=32 --tooldir=$HOME/tools --prefix=<install-path>
make -s
make install
```
- Building Vortex 64-bit simply requires using --xlen=64 configure option.
```sh
$ ../configure --xlen=32 --tooldir=$HOME/tools
```
```sh
../configure --xlen=64 --tooldir=$HOME/tools
```
- Sourcing "./ci/toolchain_env.sh" is required every time you start a new terminal. We recommend adding "source <build-path>/ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login.
```sh
$ echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
```sh
echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder.
```sh
$ ../configure
```
```sh
../configure
```
- To debug the GPU, you can generate a "run.log" trace. See /docs/debugging.md for more information.
```sh
$ ./ci/blackbox.sh --app=demo --debug=3
```
```sh
./ci/blackbox.sh --app=demo --debug=3
```
- For additional information, check out the /docs.

View file

@ -23,6 +23,8 @@ rm -f blackbox.*.cache
XLEN=${XLEN:=@XLEN@}
XSIZE=$((XLEN / 8))
echo "Vortex Regression Test: XLEN=$XLEN"
unittest()
@ -99,11 +101,11 @@ regression()
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar"
echo "regression tests done!"
}
@ -148,32 +150,54 @@ vm(){
echo "vm tests done!"
}
test_csv_trace()
cache()
{
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
diff trace_rtlsim.csv trace_simx.csv
# clean build
make -C sim/simx clean
make -C sim/rtlsim clean
}
echo "begin cache tests..."
debug()
{
echo "begin debugging tests..."
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1
test_csv_trace
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
echo "debugging tests done!"
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test writeback
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
# cache clustering
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2
# L2/L3
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
# closing banner for the cache suite (the opening banner is printed at the top of cache())
echo "cache tests done!"
}
config1()
@ -189,10 +213,12 @@ config1()
./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
# issue width
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
@ -212,22 +238,19 @@ config1()
CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# FPU's PE scaling
CONFIGS="-DFMA_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfmadd"
CONFIGS="-DFCVT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tftoi"
CONFIGS="-DFDIV_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfdiv"
CONFIGS="-DFSQRT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfsqrt"
CONFIGS="-DFNCP_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfclamp"
# LSU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
# multiple L1 caches per socket
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=8 --warps=1 --threads=2
echo "configuration-1 tests done!"
}
@ -262,55 +285,63 @@ config2()
# disabling ZICOND extension
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=demo
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=demo
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
# test XLEN-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress
# test memory coalescing
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=demo
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=demo
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress
echo "configuration-2 tests done!"
}
# Cross-checks the SimX and RTLSIM instruction traces: rebuilds both simulators
# with DEBUG=3, runs the 32im RISC-V ISA tests on each, converts the two logs
# to CSV with ./ci/trace_csv.py and diffs the results.
test_csv_trace()
{
# test CSV trace generation
# rebuild both simulators from scratch with DEBUG=3 (rtlsim additionally gets -DGPR_RESET)
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
# run the same ISA test target on each simulator, capturing stdout as the trace log
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
# convert each log to CSV (-t selects the log type, -o the output file)
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
# diff exits non-zero when the two CSV traces differ
diff trace_rtlsim.csv trace_simx.csv
# clean build
make -C sim/simx clean
make -C sim/rtlsim clean
}
# Debugging test suite: runs the CSV trace comparison, then the demo app with
# debug/perf options on the opae and simx drivers, and finally an opae run
# with the --scope option.
debug()
{
echo "begin debugging tests..."
test_csv_trace
# 2 cores x 2 clusters with L2 cache, SOCKET_SIZE=1, debug level 1 and perf enabled
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
# single-core opae run with --scope
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
echo "debugging tests done!"
}
stress()
{
echo "begin stress tests..."
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
echo "stress tests done!"
@ -329,19 +360,14 @@ synthesis()
show_usage()
{
echo "Vortex Regression Test"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
# keep this option list in sync with the case statement in the argument parser
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--vm] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
}
start=$SECONDS
declare -a tests=()
clean=0
while [ "$1" != "" ]; do
case $1 in
--vm )
tests+=("vm")
;;
--clean )
clean=1
;;
@ -360,6 +386,12 @@ while [ "$1" != "" ]; do
--opencl )
tests+=("opencl")
;;
--cache )
tests+=("cache")
;;
--vm )
tests+=("vm")
;;
--config1 )
tests+=("config1")
;;
@ -382,6 +414,7 @@ while [ "$1" != "" ]; do
tests+=("kernel")
tests+=("regression")
tests+=("opencl")
tests+=("cache")
tests+=("config1")
tests+=("config2")
tests+=("debug")
@ -405,6 +438,8 @@ then
make -s
fi
start=$SECONDS
for test in "${tests[@]}"; do
$test
done

View file

@ -19,6 +19,8 @@ import csv
import re
import inspect
configs = None
def parse_args():
parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.')
parser.add_argument('-t', '--type', default='simx', help='log type (rtlsim or simx)')
@ -26,6 +28,24 @@ def parse_args():
parser.add_argument('log', help='Input log file')
return parser.parse_args()
def load_config(filename):
    """Find the first ``CONFIGS:`` banner in a trace log and parse it.

    Args:
        filename: Path of the log file to scan, opened in text mode.

    Returns:
        A dict with integer values for ``num_threads``, ``num_warps``,
        ``num_cores``, ``num_clusters``, ``socket_size``, ``local_mem_base``
        and ``num_barriers``, taken from the first matching line, or ``None``
        when no line of the file contains a ``CONFIGS:`` banner.
    """
    # Compile once up front; the pattern is the same for every line of the log.
    config_re = re.compile(
        r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=0x([0-9a-fA-F]+), num_barriers=(\d+)"
    )
    with open(filename, 'r') as file:
        for line in file:
            config_match = config_re.search(line)
            if config_match is None:
                continue
            return {
                'num_threads': int(config_match.group(1)),
                'num_warps': int(config_match.group(2)),
                'num_cores': int(config_match.group(3)),
                'num_clusters': int(config_match.group(4)),
                'socket_size': int(config_match.group(5)),
                # local_mem_base is logged in hex after a "0x" prefix
                'local_mem_base': int(config_match.group(6), 16),
                'num_barriers': int(config_match.group(7)),
            }
    return None
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"Instr (0x[0-9a-fA-F]+):"
@ -46,10 +66,10 @@ def parse_simx(log_lines):
instr_data = {}
instr_data["lineno"] = lineno
instr_data["PC"] = re.search(pc_pattern, line).group(1)
instr_data["core_id"] = re.search(core_id_pattern, line).group(1)
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1)
instr_data["core_id"] = int(re.search(core_id_pattern, line).group(1))
instr_data["warp_id"] = int(re.search(warp_id_pattern, line).group(1))
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
instr_data["uuid"] = re.search(uuid_pattern, line).group(1)
instr_data["uuid"] = int(re.search(uuid_pattern, line).group(1))
elif line.startswith("DEBUG Instr"):
instr_data["instr"] = re.search(instr_pattern, line).group(1)
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
@ -60,6 +80,7 @@ def parse_simx(log_lines):
instr_data["destination"] = re.search(destination_pattern, line).group(1)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
instr_data = None
if instr_data:
entries.append(instr_data)
return entries
@ -95,7 +116,7 @@ def append_value(text, reg, value, tmask_arr, sep):
return text, sep
def parse_rtlsim(log_lines):
config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=(\d+), num_barriers=(\d+)"
global configs
line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)"
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"instr=(0x[0-9a-fA-F]+)"
@ -117,36 +138,20 @@ def parse_rtlsim(log_lines):
uuid_pattern = r"#(\d+)"
entries = []
instr_data = {}
num_threads = 0
num_warps = 0
num_cores = 0
num_clusters = 0
socket_size = 0
local_mem_base = 0
num_barriers = 0
num_sockets = 0
num_cores = configs['num_cores']
socket_size = configs['socket_size']
num_sockets = (num_cores + socket_size - 1) // socket_size
for lineno, line in enumerate(log_lines, start=1):
try:
config_match = re.search(config_pattern, line)
if config_match:
num_threads = int(config_match.group(1))
num_warps = int(config_match.group(2))
num_cores = int(config_match.group(3))
num_clusters = int(config_match.group(4))
socket_size = int(config_match.group(5))
local_mem_base = int(config_match.group(6))
num_barriers = int(config_match.group(7))
num_sockets = (num_cores + socket_size - 1) // socket_size
continue
line_match = re.search(line_pattern, line)
if line_match:
PC = re.search(pc_pattern, line).group(1)
warp_id = re.search(warp_id_pattern, line).group(1)
warp_id = int(re.search(warp_id_pattern, line).group(1))
tmask = re.search(tmask_pattern, line).group(1)
uuid = re.search(uuid_pattern, line).group(1)
cluster_id = line_match.group(1)
socket_id = line_match.group(2)
core_id = line_match.group(3)
uuid = int(re.search(uuid_pattern, line).group(1))
cluster_id = int(line_match.group(1))
socket_id = int(line_match.group(2))
core_id = int(line_match.group(3))
stage = line_match.group(4)
if stage == "decode":
trace = {}
@ -273,7 +278,9 @@ def split_log_file(log_filename):
return sublogs
def main():
global configs
args = parse_args()
configs = load_config(args.log)
sublogs = split_log_file(args.log)
write_csv(sublogs, args.csv, args.type)

View file

@ -19,7 +19,7 @@ TOOLDIR ?= @TOOLDIR@
OSVERSION ?= @OSVERSION@
PREFIX ?= @PREFIX@
INSTALLDIR ?= @INSTALLDIR@
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex

4
configure vendored
View file

@ -63,7 +63,7 @@ copy_files() {
filename_no_ext="${filename%.in}"
dest_file="$dest_dir/$filename_no_ext"
mkdir -p "$dest_dir"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@PREFIX@|$PREFIX|g; s|@VM_ENABLE@|$VM_ENABLE|g" "$file" > "$dest_file"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@VM_ENABLE@|$VM_ENABLE|g" "$file" > "$dest_file"
# apply permissions to bash scripts
read -r firstline < "$dest_file"
if [[ "$firstline" =~ ^#!.*bash ]]; then
@ -178,4 +178,4 @@ THIRD_PARTY_DIR=$SCRIPT_DIR/third_party
copy_files "$SCRIPT_DIR" "$CURRENT_DIR"
echo "VM Enable: "$VM_ENABLE
echo "VM Enable: "$VM_ENABLE

View file

@ -53,9 +53,9 @@ A waveform trace `trace.vcd` will be generated in the current directory during t
## Analyzing Vortex trace log
When debugging Vortex RTL or SimX Simulator, reading the trace run.log file can be overwhelming when the trace gets really large.
We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands. To increase compatibility between traces you will need to initialize RTLSIM's GPRs to zero by defining GPR_RESET.
We provide a trace sanitizer tool under ./ci/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands.
$ CONFIGS="-DGPR_RESET" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log
$ ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log

View file

@ -96,10 +96,11 @@ module VX_cluster import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE),
.MREQ_SIZE (`L2_WRITEBACK ? `L2_MSHR_SIZE : `L2_MREQ_SIZE),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),

View file

@ -217,7 +217,7 @@
`ifndef IO_COUT_ADDR
`define IO_COUT_ADDR `IO_BASE_ADDR
`endif
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
`define IO_COUT_SIZE 64
`ifndef IO_MPM_ADDR
`define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
@ -685,7 +685,7 @@
// Number of Banks
`ifndef L3_NUM_BANKS
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS)
`define L3_NUM_BANKS `MIN(8, `NUM_CLUSTERS)
`endif
// Core Response Queue Size
@ -718,6 +718,15 @@
`define L3_WRITEBACK 0
`endif
`ifndef MEMORY_BANKS
`define MEMORY_BANKS 8
`endif
// Number of Memory Ports from LLC
`ifndef NUM_MEM_PORTS
`define NUM_MEM_PORTS `MIN(`MEMORY_BANKS, `L3_NUM_BANKS)
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE

View file

@ -238,11 +238,11 @@
`define RESET_RELAY(dst, src) \
`RESET_RELAY_EX (dst, src, 1, 0)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2
`define TO_OUT_BUF_SIZE(out_reg) `MIN(out_reg, 2)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2, 5 -> 2
`define TO_OUT_BUF_SIZE(s) `MIN(s, 2)
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2
`define TO_OUT_BUF_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1))
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 -> 3
`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2))
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)

View file

@ -145,11 +145,12 @@ module VX_socket import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
.MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_WRITEBACK),
.NC_ENABLE (1),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)
@ -178,8 +179,6 @@ module VX_socket import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
@ -190,7 +189,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.RSP_OUT_BUF (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);

View file

@ -166,6 +166,10 @@
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
`define VX_CSR_MPM_MEM_BANK_CNTR 12'hB1E // memory bank requests
`define VX_CSR_MPM_MEM_BANK_CNTR_H 12'hB9E
`define VX_CSR_MPM_MEM_BANK_TICK 12'hB1F // memory ticks
`define VX_CSR_MPM_MEM_BANK_TICK_H 12'hB9F
// PERF: lmem
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B

View file

@ -80,10 +80,11 @@ module Vortex import VX_gpu_pkg::*; (
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.MREQ_SIZE (`L3_WRITEBACK ? `L3_MSHR_SIZE : `L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
@ -192,12 +193,12 @@ module Vortex import VX_gpu_pkg::*; (
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_req_rw)
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data));
`TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data));
end
end
`endif

View file

@ -240,13 +240,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
MMIO_CMD_ARG0: begin
cmd_args[0] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
MMIO_CMD_ARG1: begin
cmd_args[1] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
MMIO_CMD_ARG2: begin
@ -263,13 +263,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
`ifdef SCOPE
MMIO_SCOPE_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%0h\n", $time, cmd_scope_wdata));
`TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%h\n", $time, cmd_scope_wdata));
`endif
end
`endif
default: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)));
`TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)));
`endif
end
endcase
@ -305,14 +305,14 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%0h\n", $time, cmd_scope_rdata));
`TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata));
`endif
end
`endif
MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%0h\n", $time, dev_caps));
`TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps));
`endif
end
MMIO_ISA_CAPS: begin
@ -580,8 +580,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.TAG_WIDTH (AVS_REQ_TAGW+1)
) mem_bus_if[1]();
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (LMEM_DATA_SIZE),
@ -592,7 +590,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.RSP_OUT_BUF (0)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.bus_in_if (cci_vx_mem_bus_if),
.bus_out_if (mem_bus_if)
);
@ -760,7 +758,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data));
`TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data));
`endif
end
@ -778,14 +776,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
end
end
`RESET_RELAY (cci_rdq_reset, reset);
VX_fifo_queue #(
.DATAW (CCI_RD_QUEUE_DATAW),
.DEPTH (CCI_RD_QUEUE_SIZE)
) cci_rd_req_queue (
.clk (clk),
.reset (cci_rdq_reset),
.reset (reset),
.push (cci_rdq_push),
.pop (cci_rdq_pop),
.data_in (cci_rdq_din),
@ -906,7 +902,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_wr_req_done <= 1;
end
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data));
`TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data));
`endif
end
@ -1093,13 +1089,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
always @(posedge clk) begin
for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin
if (avs_write[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i]));
`TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i]));
end
if (avs_read[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i]));
end
if (avs_readdatavalid[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h\n", $time, i, avs_readdata[i]));
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h\n", $time, i, avs_readdata[i]));
end
end
end

View file

@ -377,13 +377,13 @@ module VX_afu_wrap #(
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i]));
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]));
end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
end
end
end

View file

@ -14,6 +14,7 @@
`include "VX_cache_define.vh"
module VX_bank_flush #(
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
@ -27,33 +28,36 @@ module VX_bank_flush #(
) (
input wire clk,
input wire reset,
input wire flush_in_valid,
output wire flush_in_ready,
output wire flush_out_init,
output wire flush_out_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_out_line,
output wire [NUM_WAYS-1:0] flush_out_way,
input wire flush_out_ready,
input wire mshr_empty
input wire flush_begin,
output wire flush_end,
output wire flush_init,
output wire flush_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
output wire [NUM_WAYS-1:0] flush_way,
input wire flush_ready,
input wire mshr_empty,
input wire bank_empty
);
parameter CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);
// ways iteration is only needed when eviction is enabled
localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);
parameter STATE_IDLE = 2'd0;
parameter STATE_INIT = 2'd1;
parameter STATE_FLUSH = 2'd2;
localparam STATE_IDLE = 0;
localparam STATE_INIT = 1;
localparam STATE_WAIT1 = 2;
localparam STATE_FLUSH = 3;
localparam STATE_WAIT2 = 4;
localparam STATE_DONE = 5;
reg [2:0] state_r, state_n;
reg [CTR_WIDTH-1:0] counter_r;
reg [1:0] state_r, state_n;
reg flush_in_ready_r, flush_in_ready_n;
always @(*) begin
state_n = state_r;
flush_in_ready_n = 0;
case (state_r)
// STATE_IDLE
default: begin
if (flush_in_valid && mshr_empty) begin
state_n = STATE_FLUSH;
STATE_IDLE: begin
if (flush_begin) begin
state_n = STATE_WAIT1;
end
end
STATE_INIT: begin
@ -61,25 +65,41 @@ module VX_bank_flush #(
state_n = STATE_IDLE;
end
end
STATE_FLUSH: begin
if (counter_r == ((2 ** CTR_WIDTH)-1)) begin
state_n = STATE_IDLE;
flush_in_ready_n = 1;
STATE_WAIT1: begin
// wait for pending requests to complete
if (mshr_empty) begin
state_n = STATE_FLUSH;
end
end
STATE_FLUSH: begin
if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
end
end
STATE_WAIT2: begin
// ensure the bank is empty before notifying the cache flush unit,
// because the flush request to lower caches only goes through bank0
// and it is important that this request gets sent out last.
if (bank_empty) begin
state_n = STATE_DONE;
end
end
STATE_DONE: begin
// generate a completion pulse
state_n = STATE_IDLE;
end
endcase
end
always @(posedge clk) begin
if (reset) begin
state_r <= STATE_INIT;
state_r <= STATE_INIT;
counter_r <= '0;
flush_in_ready_r <= '0;
end else begin
state_r <= state_n;
flush_in_ready_r <= flush_in_ready_n;
if (state_r != STATE_IDLE) begin
if ((state_r == STATE_INIT) || flush_out_ready) begin
if ((state_r == STATE_INIT)
|| ((state_r == STATE_FLUSH) && flush_ready)) begin
counter_r <= counter_r + CTR_WIDTH'(1);
end
end else begin
@ -88,22 +108,20 @@ module VX_bank_flush #(
end
end
assign flush_in_ready = flush_in_ready_r;
assign flush_out_init = (state_r == STATE_INIT);
assign flush_out_valid = (state_r == STATE_FLUSH);
assign flush_out_line = counter_r[`CS_LINE_SEL_BITS-1:0];
assign flush_end = (state_r == STATE_DONE);
assign flush_init = (state_r == STATE_INIT);
assign flush_valid = (state_r == STATE_FLUSH);
assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0];
if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
reg [NUM_WAYS-1:0] flush_out_way_r;
reg [NUM_WAYS-1:0] flush_way_r;
always @(*) begin
flush_out_way_r = '0;
flush_out_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
flush_way_r = '0;
flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
end
assign flush_out_way = flush_out_way_r;
assign flush_way = flush_way_r;
end else begin
assign flush_out_way = {NUM_WAYS{1'b1}};
assign flush_way = {NUM_WAYS{1'b1}};
end
endmodule

View file

@ -45,6 +45,9 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -69,8 +72,13 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter"))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
// In writeback mode, memory fill response may issue a new memory request to handle evicted blocks.
// We need to ensure that the memory request queue never fills up to avoid deadlock.
`STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
@ -101,26 +109,23 @@ module VX_cache import VX_gpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH)
) core_bus2_if[NUM_REQS]();
wire [NUM_BANKS-1:0] per_bank_flush_valid;
wire [NUM_BANKS-1:0] per_bank_flush_ready;
wire [NUM_BANKS-1:0] per_bank_flush_begin;
wire [NUM_BANKS-1:0] per_bank_flush_end;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
// this reset relay is required to sync with bank initialization
`RESET_RELAY (flush_reset, reset);
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (flush_reset),
.reset (reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_valid (per_bank_flush_valid),
.flush_ready (per_bank_flush_ready)
.flush_begin (per_bank_flush_begin),
.flush_end (per_bank_flush_end)
);
///////////////////////////////////////////////////////////////////////////
@ -131,9 +136,9 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_REQS; ++i) begin
`RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT);
`RESET_RELAY (core_rsp_reset, reset);
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
@ -141,7 +146,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset),
.reset (core_rsp_reset[i]),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
@ -165,15 +170,13 @@ module VX_cache import VX_gpu_pkg::*; #(
wire mem_bus_if_flush;
`RESET_RELAY (mem_req_reset, reset);
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (mem_req_reset),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
@ -192,15 +195,13 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
`RESET_RELAY (mem_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (mem_rsp_reset),
.reset (reset),
.valid_in (mem_bus_if.rsp_valid),
.ready_in (mem_bus_if.rsp_ready),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
@ -316,6 +317,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("F"),
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
@ -373,6 +375,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.DIRTY_BYTES (DIRTY_BYTES),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
@ -423,8 +426,8 @@ module VX_cache import VX_gpu_pkg::*; #(
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
.flush_valid (per_bank_flush_valid[bank_id]),
.flush_ready (per_bank_flush_ready[bank_id])
.flush_begin (per_bank_flush_begin[bank_id]),
.flush_end (per_bank_flush_end[bank_id])
);
if (NUM_BANKS == 1) begin
@ -448,7 +451,8 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW)
.DATAW (CORE_RSP_DATAW),
.ARBITER ("F")
) rsp_xbar (
.clk (clk),
.reset (rsp_xbar_reset),
@ -494,15 +498,13 @@ module VX_cache import VX_gpu_pkg::*; #(
};
end
`RESET_RELAY (mem_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
.ARBITER ("F")
) mem_req_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.valid_in (per_bank_mem_req_valid),
.ready_in (per_bank_mem_req_ready),
.data_in (data_in),

View file

@ -44,6 +44,9 @@ module VX_cache_bank #(
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -105,8 +108,8 @@ module VX_cache_bank #(
output wire mem_rsp_ready,
// flush
input wire flush_valid,
output wire flush_ready
input wire flush_begin,
output wire flush_end
);
localparam PIPELINE_STAGES = 2;
@ -117,6 +120,7 @@ module VX_cache_bank #(
wire crsp_queue_stall;
wire mshr_alm_full;
wire mreq_queue_empty;
wire mreq_queue_alm_full;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
@ -132,11 +136,12 @@ module VX_cache_bank #(
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire is_init_st0;
wire is_init_st0, is_init_st1;
wire is_flush_st0, is_flush_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1;
wire rw_sel, rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
@ -149,7 +154,8 @@ module VX_cache_bank #(
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1;
wire creq_flush_st0, creq_flush_st1;
wire creq_flush_sel, creq_flush_st0, creq_flush_st1;
wire evict_dirty_st0, evict_dirty_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
@ -157,73 +163,82 @@ module VX_cache_bank #(
wire mshr_pending_st0, mshr_pending_st1;
wire mshr_empty;
wire line_flush_valid;
wire line_flush_init;
wire [`CS_LINE_SEL_BITS-1:0] line_flush_sel;
wire [NUM_WAYS-1:0] line_flush_way;
wire line_flush_ready;
wire flush_valid;
wire init_valid;
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
wire [NUM_WAYS-1:0] flush_way;
wire flush_ready;
// ensure we have no pending memory request in the bank
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
// flush unit
VX_bank_flush #(
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WRITEBACK (WRITEBACK)
) flush_unit (
.clk (clk),
.reset (reset),
.flush_in_valid (flush_valid),
.flush_in_ready (flush_ready),
.flush_out_init (line_flush_init),
.flush_out_valid (line_flush_valid),
.flush_out_line (line_flush_sel),
.flush_out_way (line_flush_way),
.flush_out_ready (line_flush_ready),
.mshr_empty (mshr_empty)
.clk (clk),
.reset (reset),
.flush_begin (flush_begin),
.flush_end (flush_end),
.flush_init (init_valid),
.flush_valid (flush_valid),
.flush_line (flush_sel),
.flush_way (flush_way),
.flush_ready (flush_ready),
.mshr_empty (mshr_empty),
.bank_empty (no_pending_req)
);
wire rdw_hazard_st0;
reg rdw_hazard_st1;
wire rdw_hazard1_sel;
wire rdw_hazard2_sel;
reg rdw_hazard3_st1;
wire pipe_stall = crsp_queue_stall || rdw_hazard_st1;
wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1;
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss.
// flush has precedence over core requests to ensure that the cache is in a consistent state.
wire replay_grant = ~line_flush_init;
wire replay_grant = ~init_valid;
wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~line_flush_init && ~replay_enable;
wire fill_grant = ~init_valid && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid;
wire flush_grant = ~line_flush_init && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && line_flush_valid;
wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && flush_valid;
wire creq_grant = ~line_flush_init && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
&& ~rdw_hazard_st0
&& ~rdw_hazard1_sel
&& ~pipe_stall;
assign mem_rsp_ready = fill_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign line_flush_ready = flush_grant
&& ~mreq_queue_alm_full
&& ~pipe_stall;
assign flush_ready = flush_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = line_flush_init;
wire init_fire = init_valid;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = line_flush_valid && line_flush_ready;
wire flush_fire = flush_valid && flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
@ -232,8 +247,9 @@ module VX_cache_bank #(
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
assign creq_flush_sel = core_req_valid && core_req_flush;
assign addr_sel = (line_flush_init | line_flush_valid) ? `CS_LINE_ADDR_WIDTH'(line_flush_sel) :
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
if (WRITE_ENABLE) begin
@ -260,8 +276,8 @@ module VX_cache_bank #(
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, core_req_flush, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
@ -273,18 +289,20 @@ module VX_cache_bank #(
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0);
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0;
wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0;
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire [NUM_WAYS-1:0] repl_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] repl_tag_st0;
assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
`RESET_RELAY (tag_reset, reset);
wire [NUM_WAYS-1:0] evict_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
VX_cache_tags #(
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
@ -294,42 +312,51 @@ module VX_cache_bank #(
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH)
) cache_tags (
.clk (clk),
.reset (tag_reset),
.reset (reset),
.req_uuid (req_uuid_st0),
.stall (pipe_stall),
// init/fill/lookup/flush
.init (do_init_st0 || do_flush_st0),
// init/flush/fill/write/lookup
.init (do_init_st0),
.flush (do_flush_st0),
.fill (do_fill_st0),
.write (do_cache_wr_st0),
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.way_sel (flush_way_st0),
.tag_matches(tag_matches_st0),
// replacement
.repl_way (repl_way_st0),
.repl_tag (repl_tag_st0)
.evict_dirty(evict_dirty_st0),
.evict_way (evict_way_st0),
.evict_tag (evict_tag_st0)
);
wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0;
wire is_flush2_st0 = WRITEBACK && is_flush_st0;
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = is_fill_st0 ? repl_way_st0 : (is_flush_st0 ? flush_way_st0 : tag_matches_st0);
assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_r_st0 = (is_fill_st0 || is_flush_st0) ? {repl_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0;
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_flush_st0, is_replay_st0, is_fill_st0, is_creq_st0, creq_flush_st0, rw_st0, addr_r_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_flush_st1, is_replay_st1, is_fill_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, mshr_pending_st1})
.data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
);
// we have a tag hit
@ -343,35 +370,40 @@ module VX_cache_bank #(
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_init_st1 = valid_st1 && is_init_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1;
wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1;
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0];
`UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always gets a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay"));
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay"));
// detect BRAM's read-during-write hazard
assign rdw_hazard_st0 = do_fill_st0; // stall cycle after a fill
wire rdw_case1 = do_cache_rd_st0 && do_cache_wr_st1 && (addr_st0 == addr_st1); // standard cache access
wire rdw_case2 = WRITEBACK && (do_flush_st0 || do_fill_st0) && do_cache_wr_st1; // a writeback can evict preceeding write
always @(posedge clk) begin // after a write to same address
rdw_hazard_st1 <= (rdw_case1 || rdw_case2)
&& ~rdw_hazard_st1; // invalidate if pipeline stalled to avoid repeats
// both tag and data stores use BRAM with no read-during-write protection.
// we need to stall the pipeline to prevent read-after-write hazards.
assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill
assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceding write
always @(posedge clk) begin
// stall reads following writes to same line address
rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1)
&& ~rdw_hazard3_st1; // release pipeline stall
end
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
@ -380,7 +412,6 @@ module VX_cache_bank #(
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
wire dirty_valid_st1;
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] write_byteen_r;
@ -393,8 +424,6 @@ module VX_cache_bank #(
assign write_byteen_st1 = byteen_st1;
end
`RESET_RELAY (data_reset, reset);
VX_cache_data #(
.INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
.BANK_ID (BANK_ID),
@ -405,17 +434,19 @@ module VX_cache_bank #(
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH)
) cache_data (
.clk (clk),
.reset (data_reset),
.reset (reset),
.req_uuid (req_uuid_st1),
.stall (pipe_stall),
.init (do_init_st1),
.read (do_cache_rd_st1),
.fill (do_fill_st1 && ~rdw_hazard_st1),
.fill (do_fill_st1),
.flush (do_flush_st1),
.write (do_cache_wr_st1),
.way_sel (way_sel_st1),
@ -425,7 +456,6 @@ module VX_cache_bank #(
.write_data (write_data_st1),
.write_byteen(write_byteen_st1),
.read_data (read_data_st1),
.dirty_valid(dirty_valid_st1),
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
);
@ -461,8 +491,6 @@ module VX_cache_bank #(
`UNUSED_PIN (size)
);
`RESET_RELAY (mshr_reset, reset);
VX_cache_mshr #(
.INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
.BANK_ID (BANK_ID),
@ -473,7 +501,7 @@ module VX_cache_bank #(
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
) cache_mshr (
.clk (clk),
.reset (mshr_reset),
.reset (reset),
.deq_req_uuid (req_uuid_sel),
.lkp_req_uuid (req_uuid_st0),
@ -536,16 +564,14 @@ module VX_cache_bank #(
assign crsp_queue_data = read_data_st1;
assign crsp_queue_tag = tag_st1;
`RESET_RELAY (crsp_queue_reset, reset);
VX_elastic_buffer #(
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
.SIZE (CRSQ_SIZE),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_queue (
.clk (clk),
.reset (crsp_queue_reset),
.valid_in (crsp_queue_valid && ~rdw_hazard_st1),
.reset (reset),
.valid_in (crsp_queue_valid && ~rdw_hazard3_st1),
.ready_in (crsp_queue_ready),
.data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
@ -557,7 +583,7 @@ module VX_cache_bank #(
// schedule memory request
wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty;
wire mreq_queue_push, mreq_queue_pop;
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
@ -565,30 +591,42 @@ module VX_cache_bank #(
wire mreq_queue_rw;
wire mreq_queue_flush;
wire is_evict_st1 = (is_fill_st1 || is_flush_st1) && dirty_valid_st1;
wire do_writeback_st1 = valid_st1 && is_evict_st1;
`UNUSED_VAR (do_writeback_st1)
wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1;
wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1;
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| dirty_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID)));
end
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~rdw_hazard_st1;
&& ~rdw_hazard3_st1;
end else begin
`UNUSED_VAR (dirty_valid_st1)
`UNUSED_VAR (do_writeback_st1)
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1)
&& ~rdw_hazard_st1;
&& ~rdw_hazard3_st1;
end
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_rw = WRITE_ENABLE && (WRITEBACK ? is_evict_st1 : rw_st1);
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_data = is_write_st1 ? write_data_st1 : dirty_data_st1;
assign mreq_queue_byteen = is_write_st1 ? write_byteen_st1 : dirty_byteen_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_flush = creq_flush_st1;
`RESET_RELAY (mreq_queue_reset, reset);
if (WRITE_ENABLE) begin
assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1;
assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1;
assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1;
end else begin
assign mreq_queue_rw = 0;
assign mreq_queue_data = 0;
assign mreq_queue_byteen = 0;
`UNUSED_VAR (dirty_data_st1)
`UNUSED_VAR (dirty_byteen_st1)
end
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
@ -597,7 +635,7 @@ module VX_cache_bank #(
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_queue (
.clk (clk),
.reset (mreq_queue_reset),
.reset (reset),
.push (mreq_queue_push),
.pop (mreq_queue_pop),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
@ -621,32 +659,32 @@ module VX_cache_bank #(
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_valid);
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
always @(posedge clk) begin
if (pipeline_stall) begin
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw_st0=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard_st0));
if (input_stall || pipe_stall) begin
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1));
end
if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
end
if (replay_fire) begin
`TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
end
if (core_req_fire) begin
if (core_req_rw)
`TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
`TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else
`TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
end
if (crsp_queue_fire) begin
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
end
if (mreq_queue_push) begin
if (do_creq_wr_st1 && !WRITEBACK)
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else if (do_writeback_st1)
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%b, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
else
`TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
end

View file

@ -49,6 +49,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -99,6 +102,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
`RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -114,8 +119,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
`RESET_RELAY (arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_CACHES),
@ -127,7 +130,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb (
.clk (clk),
.reset (arb_reset),
.reset (cache_arb_reset[i]),
.bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if)
);
@ -155,6 +158,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),

View file

@ -30,6 +30,8 @@ module VX_cache_data #(
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -42,6 +44,7 @@ module VX_cache_data #(
input wire stall,
input wire init,
input wire read,
input wire fill,
input wire flush,
@ -53,89 +56,88 @@ module VX_cache_data #(
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire dirty_valid,
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (init)
`UNUSED_VAR (read)
`UNUSED_VAR (flush)
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
if (WRITEBACK) begin
reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0][LINE_SIZE-1:0] dirty_bytes_r;
reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0] dirty_blocks_r;
if (DIRTY_BYTES) begin
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata;
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata;
wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr;
if (NUM_WAYS > 1) begin
assign way_addr = {line_sel, way_idx};
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]);
assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]);
end
VX_sp_ram #(
.DATAW (LINE_SIZE * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK)
) byteen_store (
.clk (clk),
.reset (reset),
.read (write || fill || flush),
.write (init || write || fill || flush),
.wren (1'b1),
.addr (line_sel),
.wdata (bs_wdata),
.rdata (bs_rdata)
);
assign dirty_byteen = bs_rdata[way_idx];
end else begin
assign way_addr = line_sel;
assign dirty_byteen = {LINE_SIZE{1'b1}};
end
always @(posedge clk) begin
if (fill) begin
dirty_bytes_r[way_addr] <= '0;
end else if (write) begin
dirty_bytes_r[way_addr] <= dirty_bytes_r[way_addr] | write_byteen;
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign flipped_rdata[j][i] = line_rdata[i][j];
end
end
always @(posedge clk) begin
if (reset) begin
for (integer i = 0; i < `CS_LINES_PER_BANK * NUM_WAYS; ++i) begin
dirty_blocks_r[i] <= 0;
end
end else begin
if (fill) begin
dirty_blocks_r[way_addr] <= 0;
end else if (write) begin
dirty_blocks_r[way_addr] <= 1;
end
end
end
assign dirty_byteen = dirty_bytes_r[way_addr];
assign dirty_valid = dirty_blocks_r[way_addr];
assign dirty_data = flipped_rdata[way_idx];
end else begin
assign dirty_byteen = '0;
assign dirty_valid = 0;
assign dirty_data = '0;
end
// order the data layout to perform ways multiplexing last.
// this allows converting way index to binary in parallel with BRAM read.
// this allows converting way index to binary in parallel with BRAM read access and way selection.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata;
wire [BYTEENW-1:0] wren;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [BYTEENW-1:0] line_wren;
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = (fill || !WRITE_ENABLE) ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{write_data[i]}};
end
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i];
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end
end
assign wren = wren_w;
assign line_wren = wren_w;
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data)
assign wdata = fill_data;
assign wren = fill;
assign line_wdata = fill_data;
assign line_wren = fill;
end
VX_onehot_encoder #(
@ -146,53 +148,50 @@ module VX_cache_data #(
`UNUSED_PIN (valid_out)
);
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata;
wire line_read = (read && ~stall)
|| (WRITEBACK && (fill || flush));
wire line_write = write || fill;
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK),
.WRENW (BYTEENW),
.NO_RWCHECK (1)
.NO_RWCHECK (1),
.RW_ASSERT (1)
) data_store (
.clk (clk),
.read (1'b1),
.write (write || fill),
.wren (wren),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),
.addr (line_sel),
.wdata (wdata),
.rdata (rdata)
.wdata (line_wdata),
.rdata (line_rdata)
);
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = rdata[wsel];
assign per_way_rdata = line_rdata[wsel];
end else begin
`UNUSED_VAR (wsel)
assign per_way_rdata = rdata;
assign per_way_rdata = line_rdata;
end
assign read_data = per_way_rdata[way_idx];
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign dirty_data_w[j][i] = rdata[i][j];
end
end
assign dirty_data = dirty_data_w[way_idx];
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b, byteen=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_valid, dirty_byteen));
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data));
end
if (read && ~stall) begin
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
end
if (write && ~stall) begin
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
end
end
`endif

View file

@ -26,13 +26,16 @@ module VX_cache_flush #(
VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
input wire [NUM_BANKS-1:0] bank_req_fire,
output wire [NUM_BANKS-1:0] flush_valid,
input wire [NUM_BANKS-1:0] flush_ready
output wire [NUM_BANKS-1:0] flush_begin,
input wire [NUM_BANKS-1:0] flush_end
);
localparam STATE_IDLE = 0;
localparam STATE_WAIT = 1;
localparam STATE_WAIT1 = 1;
localparam STATE_FLUSH = 2;
localparam STATE_DONE = 3;
localparam STATE_WAIT2 = 3;
localparam STATE_DONE = 4;
reg [2:0] state, state_n;
// track in-flight core requests
@ -76,8 +79,6 @@ module VX_cache_flush #(
`UNUSED_VAR (bank_req_fire)
end
reg [1:0] state, state_n;
reg [NUM_BANKS-1:0] flush_done, flush_done_n;
wire [NUM_REQS-1:0] flush_req_mask;
@ -113,22 +114,32 @@ module VX_cache_flush #(
case (state)
STATE_IDLE: begin
if (flush_req_enable) begin
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT : STATE_FLUSH;
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH;
end
end
STATE_WAIT: begin
STATE_WAIT1: begin
if (no_inflight_reqs) begin
state_n = STATE_FLUSH;
end
end
STATE_FLUSH: begin
flush_done_n = flush_done | flush_ready;
if (flush_done_n == 0) begin
// generate a flush request pulse
state_n = STATE_WAIT2;
end
STATE_WAIT2: begin
// wait for all banks to finish flushing
flush_done_n = flush_done | flush_end;
if (flush_done_n == {NUM_BANKS{1'b1}}) begin
state_n = STATE_DONE;
flush_done_n = '0;
// only release current flush requests
// and keep normal requests locked
lock_released_n = flush_req_mask;
end
end
STATE_DONE: begin
// wait until released flush requests are issued
// when returning to IDLE state other requests will unlock
lock_released_n = lock_released & ~core_bus_out_ready;
if (lock_released_n == 0) begin
state_n = STATE_IDLE;
@ -149,6 +160,6 @@ module VX_cache_flush #(
end
end
assign flush_valid = {NUM_BANKS{state == STATE_FLUSH}};
assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}};
endmodule

View file

@ -1,52 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// cache line initialization sequencer
//
// After reset is released, addr_out steps through every line-select index
// (0 .. 2^CS_LINE_SEL_BITS - 1), one per clock, while valid_out is held high.
// Once the last index has been presented, valid_out drops and stays low
// until the next reset.
module VX_cache_init #(
    // Size of cache in bytes
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE  = 16,
    // Number of banks
    parameter NUM_BANKS  = 1,
    // Number of associative ways
    parameter NUM_WAYS   = 1
    // NOTE(review): these parameters are not referenced directly below;
    // presumably `CS_LINE_SEL_BITS (VX_cache_define.vh) derives from them - confirm.
) (
    input wire clk,
    input wire reset,

    // index of the line currently being visited
    output wire [`CS_LINE_SEL_BITS-1:0] addr_out,
    // high while the sweep is still in progress
    output wire valid_out
);
    // sweep-in-progress flag and current line index
    reg busy_r;
    reg [`CS_LINE_SEL_BITS-1:0] index_r;

    // true on the cycle where the final line index is being presented
    wire at_last_line = (index_r == ((2 ** `CS_LINE_SEL_BITS)-1));

    always @(posedge clk) begin
        if (reset) begin
            // (re)start the sweep from line 0
            busy_r  <= 1;
            index_r <= '0;
        end else if (busy_r) begin
            // advance every cycle while active; the counter wraps back to
            // zero on the same edge that ends the sweep
            index_r <= index_r + `CS_LINE_SEL_BITS'(1);
            if (at_last_line) begin
                busy_r <= 0;
            end
        end
    end

    assign valid_out = busy_r;
    assign addr_out  = index_r;

endmodule

View file

@ -232,9 +232,10 @@ module VX_cache_mshr #(
.LUTRAM (1)
) entries (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (allocate_valid),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (allocate_id_r),
.wdata (allocate_data),
.raddr (dequeue_id_r),

View file

@ -26,6 +26,8 @@ module VX_cache_tags #(
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -40,74 +42,100 @@ module VX_cache_tags #(
// init/fill/lookup
input wire init,
input wire flush,
input wire fill,
input wire write,
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches,
// replacement
output wire [NUM_WAYS-1:0] repl_way,
output wire [`CS_TAG_SEL_BITS-1:0] repl_tag
// eviction
output wire evict_dirty,
output wire [NUM_WAYS-1:0] evict_way,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
// valid, tag
localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS;
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
wire [NUM_WAYS-1:0] read_dirty;
if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] repl_way_r;
reg [NUM_WAYS-1:0] evict_way_r;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
repl_way_r <= 1;
end else if (~stall) begin // hold the value on stalls prevent filling different slots twice
repl_way_r <= {repl_way_r[NUM_WAYS-2:0], repl_way_r[NUM_WAYS-1]};
evict_way_r <= 1;
end else if (~stall) begin // holding the value on stalls prevents filling different slots twice
evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]};
end
end
assign repl_way = repl_way_r;
assign evict_way = fill ? evict_way_r : way_sel;
VX_onehot_mux #(
.DATAW (`CS_TAG_SEL_BITS),
.N (NUM_WAYS)
) repl_tag_sel (
) evict_tag_sel (
.data_in (read_tag),
.sel_in (repl_way_r),
.data_out (repl_tag)
.sel_in (evict_way),
.data_out (evict_tag)
);
end else begin
`UNUSED_VAR (stall)
assign repl_way = 1'b1;
assign repl_tag = read_tag;
assign evict_way = 1'b1;
assign evict_tag = read_tag;
end
// fill and flush need to also read in writeback mode
wire fill_s = fill && (!WRITEBACK || ~stall);
wire flush_s = flush && (!WRITEBACK || ~stall);
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire do_fill = fill && repl_way[i];
wire do_write = init || do_fill;
wire line_valid = ~init;
wire do_fill = fill_s && evict_way[i];
wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i];
wire line_read = (WRITEBACK && (fill_s || flush_s));
wire line_write = init || do_fill || do_flush || do_write;
wire line_valid = ~(init || flush);
wire [TAG_WIDTH-1:0] line_wdata;
wire [TAG_WIDTH-1:0] line_rdata;
if (WRITEBACK) begin
assign line_wdata = {line_valid, write, line_tag};
assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata;
end else begin
assign line_wdata = {line_valid, line_tag};
assign {read_valid[i], read_tag[i]} = line_rdata;
assign read_dirty[i] = 1'b0;
end
VX_sp_ram #(
.DATAW (TAG_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.NO_RWCHECK (1)
.NO_RWCHECK (1),
.RW_ASSERT (1)
) tag_store (
.clk (clk),
.read (1'b1),
.write (do_write),
`UNUSED_PIN (wren),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (1'b1),
.addr (line_sel),
.wdata ({line_valid, line_tag}),
.rdata ({read_valid[i], read_tag[i]})
.wdata (line_wdata),
.rdata (line_rdata)
);
end
@ -115,19 +143,31 @@ module VX_cache_tags #(
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
assign evict_dirty = | (read_dirty & evict_way);
`ifdef DBG_TRACE_CACHE
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel};
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), repl_way, line_sel, line_tag));
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID)));
end
if (init) begin
`TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty));
end
if (lookup && ~stall) begin
if (tag_matches != 0) begin
`TRACE(3, ("%d: %s hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
if (write)
`TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
end else begin
`TRACE(3, ("%d: %s miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
if (write)
`TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
end
end
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -20,20 +20,20 @@ module VX_cache_top import VX_gpu_pkg::*; #(
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -42,6 +42,12 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -55,7 +61,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
parameter MEM_OUT_BUF = 2,
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
) (
) (
input wire clk,
input wire reset,
@ -82,17 +88,17 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
VX_mem_bus_if #(
@ -127,18 +133,18 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Memory request
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
VX_cache #(
@ -156,6 +162,8 @@ module VX_cache_top import VX_gpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.CORE_OUT_BUF (CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache (

View file

@ -48,6 +48,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -187,6 +190,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
@ -223,12 +227,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (core_req_fire) begin
if (core_bus_if[i].req_data.rw)
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
else
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
end
if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end
end
end
@ -250,14 +254,14 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
end
end

View file

@ -83,7 +83,7 @@ module VX_alu_muldiv #(
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.clk (clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}),
@ -324,6 +324,7 @@ module VX_alu_muldiv #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.ARBITER ("F"),
.OUT_BUF (1)
) rsp_buf (
.clk (clk),

View file

@ -57,7 +57,7 @@ module VX_alu_unit #(
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`RESET_RELAY (block_reset, reset);
`RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1));
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
@ -72,15 +72,13 @@ module VX_alu_unit #(
assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = per_block_execute_if[block_idx].data;
`RESET_RELAY (int_reset, block_reset);
VX_alu_int #(
.INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
.BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES)
) alu_int (
.clk (clk),
.reset (int_reset),
.reset (block_reset),
.execute_if (int_execute_if),
.branch_ctl_if (branch_ctl_if[block_idx]),
.commit_if (int_commit_if)
@ -99,14 +97,12 @@ module VX_alu_unit #(
assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
assign muldiv_execute_if.data = per_block_execute_if[block_idx].data;
`RESET_RELAY (muldiv_reset, block_reset);
VX_alu_muldiv #(
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
.NUM_LANES (NUM_LANES)
) muldiv_unit (
.clk (clk),
.reset (muldiv_reset),
.reset (block_reset),
.execute_if (muldiv_execute_if),
.commit_if (muldiv_commit_if)
);
@ -121,15 +117,14 @@ module VX_alu_unit #(
// send response
`RESET_RELAY (arb_reset, block_reset);
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (PARTIAL_BW ? 1 : 3)
.OUT_BUF (PARTIAL_BW ? 1 : 3),
.ARBITER ("F")
) rsp_arb (
.clk (clk),
.reset (arb_reset),
.reset (block_reset),
.valid_in ({
`ifdef EXT_M_ENABLE
muldiv_commit_if.valid,

View file

@ -313,6 +313,7 @@ module VX_core import VX_gpu_pkg::*; #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) lsu_adapter (

View file

@ -52,7 +52,7 @@ module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
if (dcr_bus_if.write_valid) begin
`TRACE(1, ("%d: base-dcr: state=", $time));
trace_base_dcr(1, dcr_bus_if.write_addr);
`TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data));
`TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data));
end
end
`endif

View file

@ -40,7 +40,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam FANOUT_ENABLE= (`NUM_THREADS > MAX_FANOUT);
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2));
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
localparam DATA_REGS_OFF = 0;
@ -85,6 +85,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin
@ -100,7 +102,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire fire_eop = fire_p && is_last_p;
always @(posedge clk) begin
if (reset) begin
if (block_reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
@ -215,8 +217,6 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign isw = block_idx;
end
`RESET_RELAY(buf_out_reset, reset);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
VX_elastic_buffer #(
@ -225,7 +225,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out (
.clk (clk),
.reset (buf_out_reset),
.reset (block_reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({

View file

@ -56,9 +56,10 @@ module VX_fetch import VX_gpu_pkg::*; #(
.LUTRAM (1)
) tag_store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (icache_req_fire),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.raddr (rsp_tag),

View file

@ -57,7 +57,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
`UNUSED_VAR (per_block_execute_if[block_idx].data.tid)
`UNUSED_VAR (per_block_execute_if[block_idx].data.wb)
`RESET_RELAY (block_reset, reset);
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
// Store request info
wire fpu_req_valid, fpu_req_ready;
@ -84,14 +84,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready;
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
`RESET_RELAY (ibuf_reset, block_reset);
VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (ibuf_reset),
.reset (block_reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
@ -113,8 +111,6 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
assign fpu_req_valid = per_block_execute_if[block_idx].valid && ~mdata_full;
assign per_block_execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
`RESET_RELAY (fpu_reset, block_reset);
`ifdef FPU_DPI
VX_fpu_dpi #(
@ -123,7 +119,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dpi (
.clk (clk),
.reset (fpu_reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -152,7 +148,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_fpnew (
.clk (clk),
.reset (fpu_reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -181,7 +177,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dsp (
.clk (clk),
.reset (fpu_reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -228,14 +224,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
// send response
`RESET_RELAY (rsp_reset, block_reset);
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (0)
) rsp_buf (
.clk (clk),
.reset (rsp_reset),
.reset (block_reset),
.valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),

View file

@ -79,15 +79,13 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) commit_tmp_if();
`RESET_RELAY(commit_out_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (commit_out_reset),
.reset (reset),
.valid_in (commit_out_valid[i]),
.ready_in (commit_out_ready[i]),
.data_in (commit_out_data[i]),

View file

@ -72,9 +72,10 @@ module VX_ipdom_stack #(
.LUTRAM (OUT_REG ? 0 : 1)
) store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (wr_ptr),
.wdata ({q1, q0}),
.raddr (rd_ptr),

View file

@ -39,6 +39,8 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_switch_if[`NUM_LSU_BLOCKS]();
`RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
@ -52,15 +54,13 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
wire req_global_ready;
wire req_local_ready;
`RESET_RELAY (switch_reset, reset);
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (2),
.OUT_REG (1)
) req_global_buf (
.clk (clk),
.reset (switch_reset),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_global),
.data_in ({
lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
@ -91,7 +91,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
.OUT_REG (0)
) req_local_buf (
.clk (clk),
.reset (switch_reset),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_local),
.data_in ({
lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
@ -126,7 +126,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
.OUT_BUF (1)
) rsp_arb (
.clk (clk),
.reset (switch_reset),
.reset (block_reset[i]),
.valid_in ({
lsu_switch_if[i].rsp_valid,
lsu_mem_out_if[i].rsp_valid
@ -157,18 +157,17 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
`RESET_RELAY (adapter_reset, reset);
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (adapter_reset),
.reset (block_reset[i]),
.lsu_mem_if (lsu_switch_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);

View file

@ -490,6 +490,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("P"), // prioritize commit_rsp_if
.OUT_BUF (3)
) rsp_arb (
.clk (clk),

View file

@ -13,6 +13,13 @@
`include "VX_define.vh"
// reset all GPRs in debug mode
`ifdef SIMULATION
`ifndef NDEBUG
`define GPR_RESET
`endif
`endif
module VX_operands import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_BANKS = 4,
@ -36,8 +43,9 @@ module VX_operands import VX_gpu_pkg::*; #(
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam METADATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS;
localparam DATAW = `UUID_WIDTH + METADATAW + 3 * `NUM_THREADS * `XLEN;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
@ -46,30 +54,28 @@ module VX_operands import VX_gpu_pkg::*; #(
`UNUSED_VAR (writeback_if.data.sop)
wire [NUM_SRC_REGS-1:0] src_valid;
wire [NUM_SRC_REGS-1:0] req_in_valid;
wire [NUM_SRC_REGS-1:0] req_in_ready;
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid_n, gpr_rd_ready;
reg [NUM_BANKS-1:0] gpr_rd_valid;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr_n;
reg [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx_n;
reg [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
wire pipe_in_ready;
reg pipe_out_valid;
wire pipe_out_ready;
reg [`UUID_WIDTH-1:0] pipe_out_uuid;
reg [METADATAW-1:0] pipe_out_data;
wire pipe_valid_st1, pipe_ready_st1;
wire pipe_valid_st2, pipe_ready_st2;
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n;
reg [NUM_SRC_REGS-1:0] data_fetched;
reg has_collision, has_collision_n;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
wire stg_in_valid, stg_in_ready;
reg [NUM_SRC_REGS-1:0] data_fetched_n;
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
reg has_collision_n;
wire has_collision_st1;
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
@ -89,7 +95,7 @@ module VX_operands import VX_gpu_pkg::*; #(
end
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched[i];
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
end
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
@ -109,13 +115,20 @@ module VX_operands import VX_gpu_pkg::*; #(
.data_in (req_in_data),
.sel_in (req_bank_idx),
.ready_in (req_in_ready),
.valid_out (gpr_rd_valid_n),
.data_out (gpr_rd_addr_n),
.sel_out (gpr_rd_req_idx_n),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out (gpr_rd_ready)
);
assign gpr_rd_ready = {NUM_BANKS{stg_in_ready}};
wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1;
assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}};
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
always @(*) begin
has_collision_n = 0;
@ -129,83 +142,82 @@ module VX_operands import VX_gpu_pkg::*; #(
end
always @(*) begin
src_data_n = src_data;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid[b]) begin
src_data_n[gpr_rd_req_idx[b]] = gpr_rd_data[b];
end
end
end
wire pipe_stall = pipe_out_valid && ~pipe_out_ready;
assign pipe_in_ready = ~pipe_stall;
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire stg_in_fire = stg_in_valid && stg_in_ready;
always @(posedge clk) begin
if (reset) begin
pipe_out_valid <= 0;
gpr_rd_valid <= '0;
data_fetched <= '0;
src_data <= '0;
data_fetched_n = data_fetched_st1;
if (scoreboard_if.ready) begin
data_fetched_n = '0;
end else begin
if (~pipe_stall) begin
pipe_out_valid <= scoreboard_if.valid;
gpr_rd_valid <= gpr_rd_valid_n;
if (scoreboard_if.ready) begin
data_fetched <= '0;
end else begin
data_fetched <= data_fetched | req_in_ready;
end
if (stg_in_fire) begin
src_data <= '0;
end else begin
src_data <= src_data_n;
end
end
end
if (~pipe_stall) begin
pipe_out_uuid <= scoreboard_if.data.uuid;
pipe_out_data <= {
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd
};
has_collision <= has_collision_n;
gpr_rd_addr <= gpr_rd_addr_n;
gpr_rd_req_idx <= gpr_rd_req_idx_n;
data_fetched_n = data_fetched_st1 | req_in_ready;
end
end
assign pipe_out_ready = stg_in_ready;
assign stg_in_valid = pipe_out_valid && ~has_collision;
assign pipe_data = {
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd,
scoreboard_if.data.uuid
};
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_REGS)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (pipe_in_ready),
.data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1})
);
assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2;
assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n;
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
) pipe_reg2 (
.clk (clk),
.reset (pipe2_reset),
.enable (pipe_ready_st1),
.data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
.data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2})
);
always @(*) begin
src_data_n = src_data_st2;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid_st2[b]) begin
src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
end
end
end
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (1)
) out_buffer (
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (stg_in_valid),
.ready_in (stg_in_ready),
.valid_in (pipe_valid_st2),
.ready_in (pipe_ready_st2),
.data_in ({
pipe_out_uuid,
pipe_out_data,
pipe_data_st2,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.data_out ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
@ -214,6 +226,7 @@ module VX_operands import VX_gpu_pkg::*; #(
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.uuid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
@ -262,27 +275,24 @@ module VX_operands import VX_gpu_pkg::*; #(
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
`ifdef GPR_RESET
VX_dp_ram_rst #(
`else
VX_dp_ram #(
`endif
.DATAW (`XLEN * `NUM_THREADS),
.DATAW (REGS_DATAW),
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.WRENW (BYTEENW),
`ifdef GPR_RESET
.RESET_RAM (1),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
`ifdef GPR_RESET
.reset (reset),
`endif
.read (1'b1),
.read (pipe_fire_st1),
.wren (wren),
.write (gpr_wr_enabled),
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr[b]),
.rdata (gpr_rd_data[b])
.raddr (gpr_rd_addr_st1[b]),
.rdata (gpr_rd_data_st1[b])
);
end

View file

@ -383,16 +383,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire [`NUM_WARPS-1:0] pending_warp_empty;
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT);
`RESET_RELAY (pending_instr_reset, reset);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #(
.SIZE (4096),
.ALM_EMPTY (1)
) counter (
.clk (clk),
.reset (pending_instr_reset),
.reset (pending_instr_reset[i]),
.incr (per_warp_incr[i]),
.decr (commit_sched_if.committed_warps[i]),
.empty (pending_warp_empty[i]),

View file

@ -179,7 +179,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (1)
.OUT_BUF (3)
) gather_unit (
.clk (clk),
.reset (reset),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
output wire ready_in,
input wire valid_in,
@ -36,7 +36,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire is_signed,
input wire [NUM_LANES-1:0][31:0] dataa,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -45,25 +45,26 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
);
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][31:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FCVT),
.DATA_IN_WIDTH(32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -94,7 +95,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.enable (pe_enable),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.is_signed (is_signed),
.dataa (pe_data_in[i][0 +: 32]),
.result (pe_data_out[i][0 +: 32]),
.fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS])

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
input wire valid_in,
output wire ready_in,
@ -31,10 +31,10 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
input wire [TAG_WIDTH-1:0] tag_in,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [NUM_LANES-1:0][31:0] dataa,
input wire [NUM_LANES-1:0][31:0] datab,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -47,27 +47,28 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FDIV),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -92,7 +93,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
fflags_t [NUM_LANES-1:0] per_lane_fflags;
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
acl_fdiv fdiv (
.clk (clk),
@ -103,8 +104,8 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
.q (pe_data_out[i][0 +: 32])
);
assign pe_data_out[i][32 +: `FP_FLAGS_BITS] = 'x;
end
end
assign has_fflags = 0;
assign per_lane_fflags = 'x;
`UNUSED_VAR (fflags_out)
@ -131,21 +132,21 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
assign has_fflags = 1;
assign per_lane_fflags = fflags_out;
`else
`else
for (genvar i = 0; i < NUM_PES; ++i) begin
reg [63:0] r;
`UNUSED_VAR (r)
`UNUSED_VAR (r)
fflags_t f;
always @(*) begin
always @(*) begin
dpi_fdiv (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
frm,
r,
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
frm,
r,
f
);
end

View file

@ -98,7 +98,8 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
.DATA_IN_WIDTH(3*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0)
.PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0), // must be registered for DSPs
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -35,7 +35,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0][31:0] dataa,
input wire [NUM_LANES-1:0][31:0] datab,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -44,15 +44,15 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
);
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
@ -60,15 +60,16 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FNCP),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -97,8 +98,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (pe_enable),
.frm (frm),
.op_type (op_type),
.frm (frm),
.op_type (op_type),
.dataa (pe_data_in[i][0 +: 32]),
.datab (pe_data_in[i][32 +: 32]),
.result (pe_data_out[i][0 +: 32]),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -18,10 +18,10 @@
module VX_fpu_sqrt import VX_fpu_pkg::*; #(
parameter NUM_LANES = 1,
parameter NUM_PES = `UP(NUM_LANES /`FSQRT_PE_RATIO),
parameter TAG_WIDTH = 1
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
output wire ready_in,
input wire valid_in,
@ -29,11 +29,11 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0] mask_in,
input wire [TAG_WIDTH-1:0] tag_in,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [NUM_LANES-1:0][31:0] dataa,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -46,22 +46,23 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire pe_enable;
wire [NUM_PES-1:0][31:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FSQRT),
.DATA_IN_WIDTH(32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -83,10 +84,10 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
fflags_t [NUM_LANES-1:0] per_lane_fflags;
fflags_t [NUM_LANES-1:0] per_lane_fflags;
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
acl_fsqrt fsqrt (
.clk (clk),
@ -105,7 +106,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
`elsif VIVADO
for (genvar i = 0; i < NUM_PES; ++i) begin
wire tuser;
wire tuser;
xil_fsqrt fsqrt (
.aclk (clk),
@ -130,17 +131,17 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
`UNUSED_VAR (r)
fflags_t f;
always @(*) begin
always @(*) begin
dpi_fsqrt (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i]},
frm,
r,
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i]},
frm,
r,
f
);
end
VX_shift_register #(
.DATAW (32 + $bits(fflags_t)),
.DEPTH (`LATENCY_FSQRT)

View file

@ -81,12 +81,15 @@ module VX_avs_adapter #(
assign req_queue_push[i] = mem_req_valid && ~mem_req_rw && bank_req_ready[i] && (req_bank_sel == i);
end
`RESET_RELAY_EX (bank_reset, reset, NUM_BANKS, 1);
for (genvar i = 0; i < NUM_BANKS; ++i) begin
VX_pending_size #(
.SIZE (RD_QUEUE_SIZE)
) pending_size (
.clk (clk),
.reset (reset),
.reset (bank_reset[i]),
.incr (req_queue_push[i]),
.decr (req_queue_pop[i]),
`UNUSED_PIN (empty),
@ -102,7 +105,7 @@ module VX_avs_adapter #(
.DEPTH (RD_QUEUE_SIZE)
) rd_req_queue (
.clk (clk),
.reset (reset),
.reset (bank_reset[i]),
.push (req_queue_push[i]),
.pop (req_queue_pop[i]),
.data_in (mem_req_tag),
@ -132,7 +135,7 @@ module VX_avs_adapter #(
.OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF))
) req_out_buf (
.clk (clk),
.reset (reset),
.reset (bank_reset[i]),
.valid_in (valid_out_w),
.ready_in (ready_out_w),
.data_in ({mem_req_rw, mem_req_byteen, req_bank_off, mem_req_data}),
@ -168,12 +171,13 @@ module VX_avs_adapter #(
wire [NUM_BANKS-1:0] rsp_queue_empty;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
VX_fifo_queue #(
.DATAW (DATA_WIDTH),
.DEPTH (RD_QUEUE_SIZE)
) rd_rsp_queue (
.clk (clk),
.reset (reset),
.reset (bank_reset[i]),
.push (avs_readdatavalid[i]),
.pop (req_queue_pop[i]),
.data_in (avs_readdata[i]),
@ -195,7 +199,7 @@ module VX_avs_adapter #(
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (DATA_WIDTH + TAG_WIDTH),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (RSP_OUT_BUF)
) rsp_arb (
.clk (clk),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,10 +15,10 @@
`TRACING_OFF
module VX_axi_adapter #(
parameter DATA_WIDTH = 512,
parameter DATA_WIDTH = 512,
parameter ADDR_WIDTH = 32,
parameter TAG_WIDTH = 8,
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 1,
parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)),
parameter RSP_OUT_BUF = 0
) (
@ -34,13 +34,13 @@ module VX_axi_adapter #(
input wire [TAG_WIDTH-1:0] mem_req_tag,
output wire mem_req_ready,
// Vortex response
output wire mem_rsp_valid,
// Vortex response
output wire mem_rsp_valid,
output wire [DATA_WIDTH-1:0] mem_rsp_data,
output wire [TAG_WIDTH-1:0] mem_rsp_tag,
input wire mem_rsp_ready,
// AXI write request address channel
// AXI write request address channel
output wire m_axi_awvalid [NUM_BANKS],
input wire m_axi_awready [NUM_BANKS],
output wire [ADDR_WIDTH-1:0] m_axi_awaddr [NUM_BANKS],
@ -54,7 +54,7 @@ module VX_axi_adapter #(
output wire [3:0] m_axi_awqos [NUM_BANKS],
output wire [3:0] m_axi_awregion [NUM_BANKS],
// AXI write request data channel
// AXI write request data channel
output wire m_axi_wvalid [NUM_BANKS],
input wire m_axi_wready [NUM_BANKS],
output wire [DATA_WIDTH-1:0] m_axi_wdata [NUM_BANKS],
@ -66,7 +66,7 @@ module VX_axi_adapter #(
output wire m_axi_bready [NUM_BANKS],
input wire [TAG_WIDTH-1:0] m_axi_bid [NUM_BANKS],
input wire [1:0] m_axi_bresp [NUM_BANKS],
// AXI read address channel
output wire m_axi_arvalid [NUM_BANKS],
input wire m_axi_arready [NUM_BANKS],
@ -74,13 +74,13 @@ module VX_axi_adapter #(
output wire [TAG_WIDTH-1:0] m_axi_arid [NUM_BANKS],
output wire [7:0] m_axi_arlen [NUM_BANKS],
output wire [2:0] m_axi_arsize [NUM_BANKS],
output wire [1:0] m_axi_arburst [NUM_BANKS],
output wire [1:0] m_axi_arburst [NUM_BANKS],
output wire [1:0] m_axi_arlock [NUM_BANKS],
output wire [3:0] m_axi_arcache [NUM_BANKS],
output wire [2:0] m_axi_arprot [NUM_BANKS],
output wire [3:0] m_axi_arqos [NUM_BANKS],
output wire [3:0] m_axi_arregion [NUM_BANKS],
// AXI read response channel
input wire m_axi_rvalid [NUM_BANKS],
output wire m_axi_rready [NUM_BANKS],
@ -88,15 +88,15 @@ module VX_axi_adapter #(
input wire m_axi_rlast [NUM_BANKS],
input wire [TAG_WIDTH-1:0] m_axi_rid [NUM_BANKS],
input wire [1:0] m_axi_rresp [NUM_BANKS]
);
);
localparam AXSIZE = `CLOG2(DATA_WIDTH/8);
localparam BANK_ADDRW = `LOG2UP(NUM_BANKS);
localparam BANK_ADDRW = `LOG2UP(NUM_BANKS);
localparam LOG2_NUM_BANKS = `CLOG2(NUM_BANKS);
wire [BANK_ADDRW-1:0] req_bank_sel;
if (NUM_BANKS > 1) begin
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
end else begin
assign req_bank_sel = '0;
end
@ -108,12 +108,12 @@ module VX_axi_adapter #(
for (genvar i = 0; i < NUM_BANKS; ++i) begin
wire m_axi_aw_fire = m_axi_awvalid[i] && m_axi_awready[i];
wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i];
wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i];
always @(posedge clk) begin
if (reset) begin
m_axi_aw_ack[i] <= 0;
m_axi_w_ack[i] <= 0;
end else begin
end else begin
if (mem_req_fire && (req_bank_sel == i)) begin
m_axi_aw_ack[i] <= 0;
m_axi_w_ack[i] <= 0;
@ -127,10 +127,10 @@ module VX_axi_adapter #(
end
end
wire axi_write_ready [NUM_BANKS];
wire axi_write_ready [NUM_BANKS];
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i])
assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i])
&& (m_axi_wready[i] || m_axi_w_ack[i]);
end
@ -141,17 +141,17 @@ module VX_axi_adapter #(
assign mem_req_ready = mem_req_rw ? axi_write_ready[0] : m_axi_arready[0];
end
// AXI write request address channel
// AXI write request address channel
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign m_axi_awvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_aw_ack[i];
assign m_axi_awaddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE;
assign m_axi_awid[i] = mem_req_tag;
assign m_axi_awlen[i] = 8'b00000000;
assign m_axi_awlen[i] = 8'b00000000;
assign m_axi_awsize[i] = 3'(AXSIZE);
assign m_axi_awburst[i] = 2'b00;
assign m_axi_awlock[i] = 2'b00;
assign m_axi_awburst[i] = 2'b00;
assign m_axi_awlock[i] = 2'b00;
assign m_axi_awcache[i] = 4'b0000;
assign m_axi_awprot[i] = 3'b000;
assign m_axi_awprot[i] = 3'b000;
assign m_axi_awqos[i] = 4'b0000;
assign m_axi_awregion[i]= 4'b0000;
end
@ -170,31 +170,31 @@ module VX_axi_adapter #(
`UNUSED_VAR (m_axi_bid[i])
`UNUSED_VAR (m_axi_bresp[i])
assign m_axi_bready[i] = 1'b1;
`RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time));
`RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time));
end
// AXI read request channel
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i);
assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i);
assign m_axi_araddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE;
assign m_axi_arid[i] = mem_req_tag;
assign m_axi_arlen[i] = 8'b00000000;
assign m_axi_arsize[i] = 3'(AXSIZE);
assign m_axi_arburst[i] = 2'b00;
assign m_axi_arlock[i] = 2'b00;
assign m_axi_arburst[i] = 2'b00;
assign m_axi_arlock[i] = 2'b00;
assign m_axi_arcache[i] = 4'b0000;
assign m_axi_arprot[i] = 3'b000;
assign m_axi_arqos[i] = 4'b0000;
assign m_axi_arregion[i]= 4'b0000;
end
// AXI read response channel
// AXI read response channel
wire [NUM_BANKS-1:0] rsp_arb_valid_in;
wire [NUM_BANKS-1:0][DATA_WIDTH+TAG_WIDTH-1:0] rsp_arb_data_in;
wire [NUM_BANKS-1:0] rsp_arb_ready_in;
`UNUSED_VAR (m_axi_rlast)
`UNUSED_VAR (m_axi_rlast)
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign rsp_arb_valid_in[i] = m_axi_rvalid[i];
@ -203,11 +203,11 @@ module VX_axi_adapter #(
`RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rlast[i] == 1, ("%t: *** AXI response error", $time));
`RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rresp[i] == 0, ("%t: *** AXI response error", $time));
end
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (DATA_WIDTH + TAG_WIDTH),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (RSP_OUT_BUF)
) rsp_arb (
.clk (clk),

View file

@ -22,12 +22,16 @@ module VX_dp_ram #(
parameter OUT_REG = 0,
parameter NO_RWCHECK = 0,
parameter LUTRAM = 0,
parameter RW_ASSERT = 0,
parameter RESET_RAM = 0,
parameter READ_ENABLE = 0,
parameter INIT_ENABLE = 0,
parameter INIT_FILE = "",
parameter [DATAW-1:0] INIT_VALUE = 0,
parameter ADDRW = `LOG2UP(SIZE)
) (
input wire clk,
input wire reset,
input wire read,
input wire write,
input wire [WRENW-1:0] wren,
@ -50,44 +54,44 @@ module VX_dp_ram #(
end \
end
`UNUSED_PARAM (RW_ASSERT)
`UNUSED_VAR (read)
if (WRENW > 1) begin
`RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask"));
end
wire [DATAW-1:0] rdata_w;
`ifdef SYNTHESIS
if (WRENW > 1) begin
`ifdef QUARTUS
if (LUTRAM != 0) begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
assign rdata = rdata_r;
end else begin
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata_w = ram[raddr];
end else begin
reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
@ -97,37 +101,8 @@ module VX_dp_ram #(
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end else begin
reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end
end
`else
@ -135,35 +110,18 @@ module VX_dp_ram #(
if (LUTRAM != 0) begin
`USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
assign rdata = rdata_r;
end else begin
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
reg [DATAW-1:0] rdata_r;
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -172,37 +130,20 @@ module VX_dp_ram #(
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
assign rdata_w = ram[raddr];
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
assign rdata = ram[raddr];
end else begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
for (integer i = 0; i < WRENW; ++i) begin
if (wren[i])
ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW];
end
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end
end
`endif
@ -211,64 +152,36 @@ module VX_dp_ram #(
if (LUTRAM != 0) begin
`USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
if (read) begin
rdata_r <= ram[raddr];
end
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
assign rdata = rdata_r;
end else begin
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
reg [DATAW-1:0] rdata_r;
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
assign rdata_w = ram[raddr];
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
assign rdata = ram[raddr];
end else begin
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
ram[waddr] <= wdata;
end
end
assign rdata = ram[raddr];
end
assign rdata_w = ram[raddr];
end
end
end
`else
// RAM emulation
// simulation
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
@ -277,39 +190,57 @@ module VX_dp_ram #(
assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW];
end
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
reg [DATAW-1:0] prev_data;
reg [ADDRW-1:0] prev_waddr;
reg prev_write;
always @(posedge clk) begin
if (RESET_RAM && reset) begin
for (integer i = 0; i < SIZE; ++i) begin
ram[i] <= DATAW'(INIT_VALUE);
end
end else begin
if (write) begin
ram[waddr] <= ram_n;
end
if (read) begin
rdata_r <= ram[raddr];
end
end
assign rdata = rdata_r;
end else begin
reg [DATAW-1:0] prev_data;
reg [ADDRW-1:0] prev_waddr;
reg prev_write;
always @(posedge clk) begin
if (write) begin
ram[waddr] <= ram_n;
end
prev_write <= (| wren);
if (reset) begin
prev_write <= 0;
prev_data <= '0;
prev_waddr <= '0;
end else begin
prev_write <= write;
prev_data <= ram[waddr];
prev_waddr <= waddr;
end
if (LUTRAM || !NO_RWCHECK) begin
`UNUSED_VAR (prev_write)
`UNUSED_VAR (prev_data)
`UNUSED_VAR (prev_waddr)
assign rdata = ram[raddr];
end else begin
assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr];
end
if (LUTRAM || !NO_RWCHECK) begin
`UNUSED_VAR (prev_write)
`UNUSED_VAR (prev_data)
`UNUSED_VAR (prev_waddr)
assign rdata_w = ram[raddr];
end else begin
assign rdata_w = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr];
if (RW_ASSERT) begin
`RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("read after write hazard"));
end
end
`endif
// Optional output stage shared by all RAM variants above:
// - OUT_REG != 0: rdata is driven from a register that samples rdata_w.
// - OUT_REG == 0: rdata follows rdata_w combinationally.
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
always @(posedge clk) begin
// READ_ENABLE != 0: reset clears the register and it only captures while 'read' is asserted.
// READ_ENABLE == 0: reset is ignored here and the register captures rdata_w every cycle.
if (READ_ENABLE && reset) begin
rdata_r <= '0;
end else if (!READ_ENABLE || read) begin
rdata_r <= rdata_w;
end
end
assign rdata = rdata_r;
end else begin
assign rdata = rdata_w;
end
endmodule
`TRACING_ON

View file

@ -1,115 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`TRACING_OFF
// Dual-port RAM model (one write port, one read port) whose entire contents
// are re-initialized to INIT_VALUE while 'reset' is asserted.
//
// Parameters:
//   DATAW       - width of one RAM word in bits.
//   SIZE        - upper bound of the RAM index range; the array is [ADDR_MIN:SIZE-1].
//   ADDR_MIN    - lowest RAM index.
//   WRENW       - number of write-enable lanes; each lane covers DATAW/WRENW bits.
//   OUT_REG     - non-zero: rdata is registered and only updates when 'read' is high.
//   NO_RWCHECK  - with OUT_REG == 0 and LUTRAM == 0, a read of the address written
//                 in the previous cycle returns the pre-write data (prev_data).
//   LUTRAM      - non-zero disables the bypass above (reads always return ram[raddr]).
//   INIT_ENABLE - non-zero enables the 'initial' block below (INIT_FILE or INIT_VALUE).
//   INIT_FILE   - hex file loaded with $readmemh when INIT_ENABLE is set and non-empty.
//   INIT_VALUE  - fill value used for initialization and for the reset sweep.
//   ADDRW       - address width, derived from SIZE.
module VX_dp_ram_rst #(
    parameter DATAW       = 1,
    parameter SIZE        = 1,
    parameter ADDR_MIN    = 0,
    parameter WRENW       = 1,
    parameter OUT_REG     = 0,
    parameter NO_RWCHECK  = 0,
    parameter LUTRAM      = 0,
    parameter INIT_ENABLE = 0,
    parameter INIT_FILE   = "",
    parameter [DATAW-1:0] INIT_VALUE = 0,
    parameter ADDRW       = `LOG2UP(SIZE)
) (
    input wire               clk,
    input wire               reset,
    input wire               read,
    input wire               write,
    input wire [WRENW-1:0]   wren,
    input wire [ADDRW-1:0]   waddr,
    input wire [DATAW-1:0]   wdata,
    input wire [ADDRW-1:0]   raddr,
    output wire [DATAW-1:0]  rdata
);
    // Width of the slice controlled by one write-enable bit; must divide DATAW evenly.
    localparam WSELW = DATAW / WRENW;
    `STATIC_ASSERT((WRENW * WSELW == DATAW), ("invalid parameter"))

    // Power-up initialization of 'ram', either from a hex file or with INIT_VALUE.
`define RAM_INITIALIZATION \
    if (INIT_ENABLE != 0) begin \
        if (INIT_FILE != "") begin \
            initial $readmemh(INIT_FILE, ram); \
        end else begin \
            initial \
                for (integer i = 0; i < SIZE; ++i) \
                    ram[i] = INIT_VALUE; \
        end \
    end

    // 'read' is only consumed by the OUT_REG != 0 variant below.
    `UNUSED_VAR (read)

    // RAM emulation
    reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
    `RAM_INITIALIZATION

    // Next value of the addressed word: lanes whose enable is set take wdata,
    // the others keep their current content. With a single lane (WRENW == 1)
    // 'wren' is ignored and the whole word is replaced.
    wire [DATAW-1:0] ram_n;
    for (genvar i = 0; i < WRENW; ++i) begin
        assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW];
    end

    if (OUT_REG != 0) begin
        // Registered read: rdata updates on the clock edge when 'read' is asserted.
        reg [DATAW-1:0] rdata_r;
        always @(posedge clk) begin
            if (reset) begin
                for (integer i = 0; i < SIZE; ++i) begin
                    ram[i] <= DATAW'(INIT_VALUE);
                end
                rdata_r <= '0;
            end else begin
                if (write) begin
                    ram[waddr] <= ram_n;
                end
                if (read) begin
                    rdata_r <= ram[raddr];
                end
            end
        end
        assign rdata = rdata_r;
    end else begin
        // Combinational read, with optional emulation of "read returns old data"
        // for an address that was written in the previous cycle.
        reg [DATAW-1:0] prev_data;
        reg [ADDRW-1:0] prev_waddr;
        reg prev_write;
        always @(posedge clk) begin
            if (reset) begin
                for (integer i = 0; i < SIZE; ++i) begin
                    ram[i] <= DATAW'(INIT_VALUE);
                end
                prev_write <= 0;
                prev_data  <= '0;
                prev_waddr <= '0;
            end else begin
                if (write) begin
                    ram[waddr] <= ram_n;
                end
                // Track the same condition that gates the RAM update above.
                // Using (| wren) here would miss writes issued with WRENW == 1
                // and wren low (ram_n ignores wren in that case) and would
                // propagate X into rdata when wren is left undriven.
                prev_write <= write;
                prev_data  <= ram[waddr];
                prev_waddr <= waddr;
            end
        end
        if (LUTRAM || !NO_RWCHECK) begin
            `UNUSED_VAR (prev_write)
            `UNUSED_VAR (prev_data)
            `UNUSED_VAR (prev_waddr)
            assign rdata = ram[raddr];
        end else begin
            assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr];
        end
    end

endmodule
`TRACING_ON

View file

@ -18,7 +18,8 @@ module VX_elastic_buffer #(
parameter DATAW = 1,
parameter SIZE = 1,
parameter OUT_REG = 0,
parameter LUTRAM = 0
parameter LUTRAM = 0,
parameter MAX_FANOUT = 0
) (
input wire clk,
input wire reset,
@ -40,6 +41,43 @@ module VX_elastic_buffer #(
assign data_out = data_in;
assign ready_in = ready_out;
end else if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin
localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT);
localparam N_DATAW = DATAW / NUM_SLICES;
// Split the DATAW-bit payload across NUM_SLICES narrower VX_elastic_buffer instances.
// Every slice sees the same valid_in and ready_out, so the slices are presumably
// expected to advance in lockstep (NOTE(review): confirm no slice can diverge).
for (genvar i = 0; i < NUM_SLICES; ++i) begin
// Slices are N_DATAW bits wide; the last one takes whatever bits remain.
localparam S_DATAW = (i == NUM_SLICES-1) ? (DATAW - i * N_DATAW) : N_DATAW;
wire valid_out_t, ready_in_t;
// Only slice 0's handshake outputs are consumed (see below); the others are left unused.
`UNUSED_VAR (valid_out_t)
`UNUSED_VAR (ready_in_t)
// Per-slice reset signal derived from 'reset' (RESET_RELAY is defined outside this view).
`RESET_RELAY (slice_reset, reset);
// MAX_FANOUT is not forwarded, so the child keeps its default and does not slice again.
VX_elastic_buffer #(
.DATAW (S_DATAW),
.SIZE (SIZE),
.OUT_REG (OUT_REG),
.LUTRAM (LUTRAM)
) buffer_slice (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in),
.data_in (data_in[i * N_DATAW +: S_DATAW]),
.ready_in (ready_in_t),
.valid_out (valid_out_t),
.data_out (data_out[i * N_DATAW +: S_DATAW]),
.ready_out (ready_out)
);
// The module-level handshake outputs are taken from slice 0 only.
if (i == 0) begin
assign ready_in = ready_in_t;
assign valid_out = valid_out_t;
end
end
end else if (SIZE == 1) begin
VX_pipe_buffer #(
@ -103,9 +141,9 @@ module VX_elastic_buffer #(
assign ready_in = ~full;
VX_elastic_buffer #(
VX_pipe_buffer #(
.DATAW (DATAW),
.SIZE ((OUT_REG == 2) ? 1 : 0)
.DEPTH ((OUT_REG > 0) ? (OUT_REG-1) : 0)
) out_buf (
.clk (clk),
.reset (reset),

View file

@ -38,17 +38,16 @@ module VX_fair_arbiter #(
end else begin
reg [NUM_REQS-1:0] grant_mask;
reg [NUM_REQS-1:0] requests_r;
wire [NUM_REQS-1:0] requests_rem = requests & ~grant_mask;
wire rem_valid = (| requests_rem);
wire [NUM_REQS-1:0] requests_qual = rem_valid ? requests_rem : requests;
wire [NUM_REQS-1:0] requests_sel = requests_r & requests;
wire [NUM_REQS-1:0] requests_qual = (| requests_sel) ? requests_sel : requests;
always @(posedge clk) begin
if (reset) begin
grant_mask <= '0;
requests_r <= '0;
end else if (grant_ready) begin
grant_mask <= rem_valid ? (grant_mask | grant_onehot) : grant_onehot;
requests_r <= requests_qual & ~grant_onehot;
end
end

View file

@ -177,10 +177,11 @@ module VX_fifo_queue #(
.SIZE (DEPTH),
.LUTRAM (LUTRAM)
) dp_ram (
.clk(clk),
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (wr_ptr_r),
.wdata (data_in),
.raddr (rd_ptr_r),
@ -226,9 +227,10 @@ module VX_fifo_queue #(
.LUTRAM (LUTRAM)
) dp_ram (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (wr_ptr_r),
.wdata (data_in),
.raddr (rd_ptr_n_r),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,10 +17,10 @@
module VX_find_first #(
parameter N = 1,
parameter DATAW = 1,
parameter REVERSE = 0
parameter REVERSE = 0
) (
input wire [N-1:0][DATAW-1:0] data_in,
input wire [N-1:0] valid_in,
input wire [N-1:0] valid_in,
output wire [DATAW-1:0] data_out,
output wire valid_out
);
@ -37,10 +37,12 @@ module VX_find_first #(
assign s_n[TL+i] = REVERSE ? valid_in[N-1-i] : valid_in[i];
assign d_n[TL+i] = REVERSE ? data_in[N-1-i] : data_in[i];
end
for (genvar i = TL+N; i < TN; ++i) begin
assign s_n[i] = 0;
assign d_n[i] = '0;
if (TL < (TN-N)) begin
for (genvar i = TL+N; i < TN; ++i) begin
assign s_n[i] = 0;
assign d_n[i] = '0;
end
end
for (genvar j = 0; j < LOGN; ++j) begin
@ -48,10 +50,10 @@ module VX_find_first #(
assign s_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] | s_n[2**(j+1)-1+i*2+1];
assign d_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] ? d_n[2**(j+1)-1+i*2] : d_n[2**(j+1)-1+i*2+1];
end
end
end
assign valid_out = s_n[0];
assign data_out = d_n[0];
endmodule
`TRACING_ON

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,17 +24,17 @@ module VX_index_buffer #(
input wire reset,
output wire [ADDRW-1:0] write_addr,
input wire [DATAW-1:0] write_data,
input wire [DATAW-1:0] write_data,
input wire acquire_en,
input wire [ADDRW-1:0] read_addr,
output wire [DATAW-1:0] read_data,
input wire release_en,
output wire empty,
output wire full
output wire full
);
VX_allocator #(
.SIZE (SIZE)
) allocator (
@ -43,9 +43,9 @@ module VX_index_buffer #(
.acquire_en (acquire_en),
.acquire_addr (write_addr),
.release_en (release_en),
.release_addr (read_addr),
.release_addr (read_addr),
.empty (empty),
.full (full)
.full (full)
);
VX_dp_ram #(
@ -54,14 +54,15 @@ module VX_index_buffer #(
.LUTRAM (LUTRAM)
) data_table (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (acquire_en),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (write_addr),
.wdata (write_data),
.raddr (read_addr),
.rdata (read_data)
);
endmodule
`TRACING_ON

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,10 +15,10 @@
`TRACING_OFF
module VX_mem_adapter #(
parameter SRC_DATA_WIDTH = 1,
parameter SRC_ADDR_WIDTH = 1,
parameter DST_DATA_WIDTH = 1,
parameter DST_ADDR_WIDTH = 1,
parameter SRC_DATA_WIDTH = 1,
parameter SRC_ADDR_WIDTH = 1,
parameter DST_DATA_WIDTH = 1,
parameter DST_ADDR_WIDTH = 1,
parameter SRC_TAG_WIDTH = 1,
parameter DST_TAG_WIDTH = 1,
parameter REQ_OUT_BUF = 0,
@ -35,9 +35,9 @@ module VX_mem_adapter #(
input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
output wire mem_rsp_valid_in,
output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in,
output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_valid_in,
output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in,
output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in,
input wire mem_rsp_ready_in,
output wire mem_req_valid_out,
@ -48,12 +48,12 @@ module VX_mem_adapter #(
output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
input wire mem_rsp_valid_out,
input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out,
input wire mem_rsp_valid_out,
input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_out,
input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_out,
output wire mem_rsp_ready_out
);
`STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))
);
`STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))
localparam DST_DATA_SIZE = (DST_DATA_WIDTH / 8);
localparam DST_LDATAW = `CLOG2(DST_DATA_WIDTH);
@ -69,7 +69,7 @@ module VX_mem_adapter #(
wire [DST_TAG_WIDTH-1:0] mem_req_tag_out_w;
wire mem_req_ready_out_w;
wire mem_rsp_valid_in_w;
wire mem_rsp_valid_in_w;
wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w;
wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in_w;
wire mem_rsp_ready_in_w;
@ -80,7 +80,7 @@ module VX_mem_adapter #(
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
wire [D-1:0] req_idx = mem_req_addr_in[D-1:0];
wire [D-1:0] rsp_idx = mem_rsp_tag_out[D-1:0];
@ -99,31 +99,31 @@ module VX_mem_adapter #(
assign mem_req_valid_out_w = mem_req_valid_in;
assign mem_req_rw_out_w = mem_req_rw_in;
assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
assign mem_req_data_out_w = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW);
assign mem_req_tag_out_w = DST_TAG_WIDTH'({mem_req_tag_in, req_idx});
assign mem_req_ready_in = mem_req_ready_out_w;
assign mem_rsp_valid_in_w = mem_rsp_valid_out;
assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx];
assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx];
assign mem_rsp_tag_in_w = SRC_TAG_WIDTH'(mem_rsp_tag_out[SRC_TAG_WIDTH+D-1:D]);
assign mem_rsp_ready_out = mem_rsp_ready_in_w;
end else if (DST_LDATAW < SRC_LDATAW) begin
reg [D-1:0] req_ctr, rsp_ctr;
reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n;
wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out;
wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out;
wire mem_rsp_in_fire = mem_rsp_valid_out && mem_rsp_ready_out;
wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in;
wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in;
always @(*) begin
mem_rsp_data_out_n = mem_rsp_data_out_r;
if (mem_rsp_in_fire) begin
if (mem_rsp_in_fire) begin
mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_out;
end
end
@ -139,24 +139,24 @@ module VX_mem_adapter #(
if (mem_rsp_in_fire) begin
rsp_ctr <= rsp_ctr + 1;
end
end
end
mem_rsp_data_out_r <= mem_rsp_data_out_n;
end
reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r;
wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_x;
always @(posedge clk) begin
if (mem_rsp_in_fire) begin
mem_rsp_tag_in_r <= mem_rsp_tag_out;
end
end
end
assign mem_rsp_tag_in_x = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_out;
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out),
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out),
("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_x, mem_rsp_tag_out))
wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin
`UNUSED_VAR (mem_req_addr_in_qual)
assign mem_req_addr_out_w = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
@ -181,8 +181,8 @@ module VX_mem_adapter #(
end else begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (reset)
if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin
`UNUSED_VAR (mem_req_addr_in)
assign mem_req_addr_out_w = mem_req_addr_in[DST_ADDR_WIDTH-1:0];

View file

@ -87,16 +87,16 @@ module VX_mem_coalescer #(
localparam STATE_SETUP = 0;
localparam STATE_SEND = 1;
reg state_r, state_n;
logic state_r, state_n;
reg out_req_valid_r, out_req_valid_n;
reg out_req_rw_r, out_req_rw_n;
reg [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
reg [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
reg [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n;
reg [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;
logic out_req_valid_r, out_req_valid_n;
logic out_req_rw_r, out_req_rw_n;
logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n;
logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n;
logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;
reg in_req_ready_n;
@ -135,7 +135,11 @@ module VX_mem_coalescer #(
`UNUSED_PIN (onehot),
.valid_out (batch_valid_n[i])
);
assign seed_idx[i] = NUM_REQS_W'(i * DATA_RATIO) + NUM_REQS_W'(batch_idx);
if (OUT_REQS > 1) begin
assign seed_idx[i] = {(NUM_REQS_W-DATA_RATIO_W)'(i), batch_idx};
end else begin
assign seed_idx[i] = batch_idx;
end
end
for (genvar i = 0; i < OUT_REQS; ++i) begin
@ -149,29 +153,6 @@ module VX_mem_coalescer #(
end
end
always @(posedge clk) begin
if (reset) begin
state_r <= STATE_SETUP;
processed_mask_r <= '0;
out_req_valid_r <= 0;
end else begin
state_r <= state_n;
batch_valid_r <= batch_valid_n;
seed_addr_r <= seed_addr_n;
seed_atype_r <= seed_atype_n;
addr_matches_r <= addr_matches_n;
out_req_valid_r <= out_req_valid_n;
out_req_mask_r <= out_req_mask_n;
out_req_rw_r <= out_req_rw_n;
out_req_addr_r <= out_req_addr_n;
out_req_atype_r <= out_req_atype_n;
out_req_byteen_r <= out_req_byteen_n;
out_req_data_r <= out_req_data_n;
out_req_tag_r <= out_req_tag_n;
processed_mask_r <= processed_mask_n;
end
end
wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches_r;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] req_byteen_merged;
@ -248,6 +229,17 @@ module VX_mem_coalescer #(
endcase
end
// Single register stage holding all *_r state of the coalescer; each *_n value is
// captured into its *_r counterpart every cycle (enable is tied high).
// DATAW is the sum of the widths of the concatenated fields, in the order listed in
// data_in/data_out. RESETW = 1 + NUM_REQS + 1 presumably covers the three leading
// fields (state, processed_mask, out_req_valid) - TODO confirm against
// VX_pipe_register's RESETW semantics and processed_mask's declared width.
VX_pipe_register #(
.DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH),
.RESETW (1 + NUM_REQS + 1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({state_n, processed_mask_n, out_req_valid_n, out_req_rw_n, addr_matches_n, batch_valid_n, out_req_mask_n, seed_addr_n, seed_atype_n, out_req_addr_n, out_req_atype_n, out_req_byteen_n, out_req_data_n, out_req_tag_n}),
.data_out ({state_r, processed_mask_r, out_req_valid_r, out_req_rw_r, addr_matches_r, batch_valid_r, out_req_mask_r, seed_addr_r, seed_atype_r, out_req_addr_r, out_req_atype_r, out_req_byteen_r, out_req_data_r, out_req_tag_r})
);
wire out_rsp_fire = out_rsp_valid && out_rsp_ready;
wire out_rsp_eop;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -23,13 +23,13 @@ module VX_onehot_encoder #(
parameter MODEL = 1,
parameter LN = `LOG2UP(N)
) (
input wire [N-1:0] data_in,
input wire [N-1:0] data_in,
output wire [LN-1:0] data_out,
output wire valid_out
);
);
if (N == 1) begin
assign data_out = data_in;
assign data_out = 0;
assign valid_out = data_in;
end else if (N == 2) begin
@ -37,43 +37,43 @@ module VX_onehot_encoder #(
assign data_out = data_in[!REVERSE];
assign valid_out = (| data_in);
end else if (MODEL == 1) begin
localparam M = 1 << LN;
`IGNORE_UNOPTFLAT_BEGIN
end else if (MODEL == 1) begin
localparam M = 1 << LN;
`IGNORE_UNOPTFLAT_BEGIN
wire [LN-1:0][M-1:0] addr;
wire [LN:0][M-1:0] v;
`IGNORE_UNOPTFLAT_END
// base case, also handle padding for non-power of two inputs
assign v[0] = REVERSE ? (M'(data_in) << (M - N)) : M'(data_in);
for (genvar lvl = 1; lvl < (LN+1); ++lvl) begin
localparam SN = 1 << (LN - lvl);
localparam SI = M / SN;
localparam SW = lvl;
for (genvar s = 0; s < SN; ++s) begin
`IGNORE_UNOPTFLAT_BEGIN
wire [1:0] vs = {v[lvl-1][s*SI+(SI>>1)], v[lvl-1][s*SI]};
`IGNORE_UNOPTFLAT_END
assign v[lvl][s*SI] = (| vs);
if (lvl == 1) begin
assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE];
assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE];
end else begin
assign addr[lvl-1][s*SI +: SW] = {
assign addr[lvl-1][s*SI +: SW] = {
vs[!REVERSE],
addr[lvl-2][s*SI +: SW-1] | addr[lvl-2][s*SI+(SI>>1) +: SW-1]
};
end
end
end
end
end
end
assign data_out = addr[LN-1][LN-1:0];
assign valid_out = v[LN][0];
end else if (MODEL == 2 && REVERSE == 0) begin
end else if (MODEL == 2 && REVERSE == 0) begin
for (genvar j = 0; j < LN; ++j) begin
wire [N-1:0] mask;
@ -90,19 +90,19 @@ module VX_onehot_encoder #(
reg [LN-1:0] index_r;
if (REVERSE != 0) begin
always @(*) begin
index_r = 'x;
always @(*) begin
index_r = 'x;
for (integer i = N-1; i >= 0; --i) begin
if (data_in[i]) begin
if (data_in[i]) begin
index_r = LN'(N-1-i);
end
end
end
end else begin
always @(*) begin
index_r = 'x;
always @(*) begin
index_r = 'x;
for (integer i = 0; i < N; ++i) begin
if (data_in[i]) begin
if (data_in[i]) begin
index_r = LN'(i);
end
end

View file

@ -17,7 +17,8 @@
module VX_onehot_mux #(
parameter DATAW = 1,
parameter N = 1,
parameter MODEL = 1
parameter MODEL = 1,
parameter LUT_OPT = 0
) (
input wire [N-1:0][DATAW-1:0] data_in,
input wire [N-1:0] sel_in,
@ -26,6 +27,90 @@ module VX_onehot_mux #(
if (N == 1) begin
`UNUSED_VAR (sel_in)
assign data_out = data_in;
end else if (LUT_OPT && N == 2) begin
`UNUSED_VAR (sel_in)
assign data_out = sel_in[0] ? data_in[0] : data_in[1];
end else if (LUT_OPT && N == 3) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
3'b001: data_out_r = data_in[0];
3'b010: data_out_r = data_in[1];
3'b100: data_out_r = data_in[2];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 4) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
4'b0001: data_out_r = data_in[0];
4'b0010: data_out_r = data_in[1];
4'b0100: data_out_r = data_in[2];
4'b1000: data_out_r = data_in[3];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 5) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
5'b00001: data_out_r = data_in[0];
5'b00010: data_out_r = data_in[1];
5'b00100: data_out_r = data_in[2];
5'b01000: data_out_r = data_in[3];
5'b10000: data_out_r = data_in[4];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 6) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
6'b000001: data_out_r = data_in[0];
6'b000010: data_out_r = data_in[1];
6'b000100: data_out_r = data_in[2];
6'b001000: data_out_r = data_in[3];
6'b010000: data_out_r = data_in[4];
6'b100000: data_out_r = data_in[5];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 7) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
7'b0000001: data_out_r = data_in[0];
7'b0000010: data_out_r = data_in[1];
7'b0000100: data_out_r = data_in[2];
7'b0001000: data_out_r = data_in[3];
7'b0010000: data_out_r = data_in[4];
7'b0100000: data_out_r = data_in[5];
7'b1000000: data_out_r = data_in[6];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (LUT_OPT && N == 8) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
8'b00000001: data_out_r = data_in[0];
8'b00000010: data_out_r = data_in[1];
8'b00000100: data_out_r = data_in[2];
8'b00001000: data_out_r = data_in[3];
8'b00010000: data_out_r = data_in[4];
8'b00100000: data_out_r = data_in[5];
8'b01000000: data_out_r = data_in[6];
8'b10000000: data_out_r = data_in[7];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (MODEL == 1) begin
wire [N-1:0][DATAW-1:0] mask;
for (genvar i = 0; i < N; ++i) begin

View file

@ -21,7 +21,8 @@ module VX_pe_serializer #(
parameter DATA_IN_WIDTH = 1,
parameter DATA_OUT_WIDTH = 1,
parameter TAG_WIDTH = 0,
parameter PE_REG = 0
parameter PE_REG = 0,
parameter OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -43,6 +44,11 @@ module VX_pe_serializer #(
output wire [TAG_WIDTH-1:0] tag_out,
input wire ready_out
);
wire valid_out_u;
wire [NUM_LANES-1:0][DATA_OUT_WIDTH-1:0] data_out_u;
wire [TAG_WIDTH-1:0] tag_out_u;
wire ready_out_u;
wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in_s;
wire valid_out_s;
wire [TAG_WIDTH-1:0] tag_out_s;
@ -105,7 +111,7 @@ module VX_pe_serializer #(
reg [TAG_WIDTH-1:0] tag_out_r;
wire valid_out_b = valid_out_s && batch_out_done;
wire ready_out_b = ready_out || ~valid_out;
wire ready_out_b = ready_out_u || ~valid_out_u;
always @(posedge clk) begin
if (reset) begin
@ -119,29 +125,42 @@ module VX_pe_serializer #(
end
end
assign enable = ready_out_b || ~valid_out_b;
assign ready_in = enable && batch_in_done;
assign enable = ready_out_b || ~valid_out_b;
assign ready_in = enable && batch_in_done;
assign pe_enable = enable;
assign pe_enable = enable;
assign valid_out = valid_out_r;
assign data_out = data_out_r;
assign tag_out = tag_out_r;
assign valid_out_u = valid_out_r;
assign data_out_u = data_out_r;
assign tag_out_u = tag_out_r;
end else begin
assign pe_data_in_s = data_in;
assign enable = ready_out || ~valid_out;
assign ready_in = enable;
assign enable = ready_out_u || ~valid_out_u;
assign ready_in = enable;
assign pe_enable = enable;
assign pe_enable = enable;
assign valid_out = valid_out_s;
assign data_out = pe_data_out;
assign tag_out = tag_out_s;
assign valid_out_u = valid_out_s;
assign data_out_u = pe_data_out;
assign tag_out_u = tag_out_s;
end
VX_elastic_buffer #(
.DATAW (NUM_LANES * DATA_OUT_WIDTH + TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (valid_out_u),
.ready_in (ready_out_u),
.data_in ({data_out_u, tag_out_u}),
.data_out ({data_out, tag_out}),
.valid_out (valid_out),
.ready_out (ready_out)
);
endmodule
`TRACING_ON

View file

@ -1,11 +1,11 @@
// Copyright 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,39 +24,53 @@
`TRACING_OFF
module VX_pipe_buffer #(
parameter DATAW = 1,
parameter PASSTHRU = 0
) (
parameter DATAW = 1,
parameter DEPTH = 1
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
output wire ready_in,
input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] data_out,
input wire ready_out,
output wire valid_out
);
if (PASSTHRU != 0) begin
);
if (DEPTH == 0) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
assign ready_in = ready_out;
assign valid_out = valid_in;
assign valid_out = valid_in;
assign data_out = data_in;
end else begin
wire stall = valid_out && ~ready_out;
wire [DEPTH:0] valid;
`IGNORE_UNOPTFLAT_BEGIN
wire [DEPTH:0] ready;
`IGNORE_UNOPTFLAT_END
wire [DEPTH:0][DATAW-1:0] data;
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_register (
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, data_in}),
.data_out ({valid_out, data_out})
);
assign valid[0] = valid_in;
assign data[0] = data_in;
assign ready_in = ready[0];
// Chain of DEPTH register stages. Stage i forwards {valid[i], data[i]} to
// {valid[i+1], data[i+1]}; index 0 is the module input and index DEPTH the output.
for (genvar i = 0; i < DEPTH; ++i) begin
// Stage i may load when the next position can accept or currently holds no valid entry.
assign ready[i] = (ready[i+1] || ~valid[i+1]);
// The valid bit is placed in the top position of the payload; RESETW = 1 is
// presumably what clears it on reset (VX_pipe_register is defined outside this span).
VX_pipe_register #(
.DATAW (1 + DATAW),
.RESETW (1)
) pipe_register (
.clk (clk),
.reset (reset),
.enable (ready[i]),
.data_in ({valid[i], data[i]}),
.data_out ({valid[i+1], data[i+1]})
);
end
assign valid_out = valid[DEPTH];
assign data_out = data[DEPTH];
assign ready[DEPTH] = ready_out;
assign ready_in = ~stall;
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,10 +14,11 @@
`include "VX_platform.vh"
`TRACING_OFF
module VX_pipe_register #(
parameter DATAW = 1,
parameter RESETW = 0,
parameter DEPTH = 1
module VX_pipe_register #(
parameter DATAW = 1,
parameter RESETW = 0,
parameter DEPTH = 1,
parameter MAX_FANOUT = 0
) (
input wire clk,
input wire reset,
@ -25,54 +26,76 @@ module VX_pipe_register #(
input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] data_out
);
if (DEPTH == 0) begin
if (DEPTH == 0) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (enable)
assign data_out = data_in;
end else if (DEPTH == 1) begin
if (RESETW == 0) begin
`UNUSED_VAR (reset)
reg [DATAW-1:0] value;
always @(posedge clk) begin
if (enable) begin
value <= data_in;
end
assign data_out = data_in;
end else if (DEPTH == 1) begin
if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin
localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT);
localparam N_DATAW = DATAW / NUM_SLICES;
// Split the DATAW-bit register into NUM_SLICES narrower registers that all
// share the same clk/reset/enable, each covering a contiguous bit range of
// data_in/data_out.
for (genvar i = 0; i < NUM_SLICES; ++i) begin
    // Index of the first bit covered by this slice.
    localparam SLICE_START = i * N_DATAW;
    // Slice width: every slice is N_DATAW bits wide except the last one,
    // which also absorbs the remainder left by DATAW / NUM_SLICES.
    localparam S_DATAW = (i == NUM_SLICES-1) ? (DATAW - SLICE_START) : N_DATAW;
    // Index of the last bit covered by this slice. Declared after S_DATAW
    // because a parameter must be declared before it is referenced.
    localparam SLICE_END = SLICE_START + S_DATAW - 1;
    // Number of resettable bits that fall inside this slice. The resettable
    // bits of the full word are its top RESETW bits, i.e. [DATAW-1 : DATAW-RESETW]:
    //  - slice entirely inside that range  -> all S_DATAW bits are resettable
    //  - slice straddles the lower boundary -> only its upper bits are resettable
    //  - slice entirely below the range     -> no resettable bits
    localparam S_RESETW = (SLICE_END >= (DATAW - RESETW)) ?
        ((SLICE_START >= (DATAW - RESETW)) ? S_DATAW : (SLICE_END - (DATAW - RESETW) + 1)) : 0;
    VX_pipe_register #(
        .DATAW  (S_DATAW),
        .RESETW (S_RESETW)
    ) pipe_register_slice (
        .clk      (clk),
        .reset    (reset),
        .enable   (enable),
        .data_in  (data_in[i * N_DATAW +: S_DATAW]),
        .data_out (data_out[i * N_DATAW +: S_DATAW])
    );
end
assign data_out = value;
end else if (RESETW == DATAW) begin
reg [DATAW-1:0] value;
always @(posedge clk) begin
if (reset) begin
value <= RESETW'(0);
end else if (enable) begin
value <= data_in;
end
end
assign data_out = value;
end else begin
reg [DATAW-RESETW-1:0] value_d;
reg [RESETW-1:0] value_r;
if (RESETW == 0) begin
`UNUSED_VAR (reset)
reg [DATAW-1:0] value;
always @(posedge clk) begin
if (reset) begin
value_r <= RESETW'(0);
end else if (enable) begin
value_r <= data_in[DATAW-1:DATAW-RESETW];
always @(posedge clk) begin
if (enable) begin
value <= data_in;
end
end
assign data_out = value;
end else if (RESETW == DATAW) begin
reg [DATAW-1:0] value;
always @(posedge clk) begin
if (reset) begin
value <= RESETW'(0);
end else if (enable) begin
value <= data_in;
end
end
assign data_out = value;
end else begin
reg [DATAW-RESETW-1:0] value_d;
reg [RESETW-1:0] value_r;
always @(posedge clk) begin
if (reset) begin
value_r <= RESETW'(0);
end else if (enable) begin
value_r <= data_in[DATAW-1:DATAW-RESETW];
end
end
always @(posedge clk) begin
if (enable) begin
value_d <= data_in[DATAW-RESETW-1:0];
end
end
assign data_out = {value_r, value_d};
end
always @(posedge clk) begin
if (enable) begin
value_d <= data_in[DATAW-RESETW-1:0];
end
end
assign data_out = {value_r, value_d};
end
end else begin
wire [DEPTH:0][DATAW-1:0] data_delayed;
wire [DEPTH:0][DATAW-1:0] data_delayed;
assign data_delayed[0] = data_in;
for (genvar i = 1; i <= DEPTH; ++i) begin
VX_pipe_register #(

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,8 +21,8 @@ module VX_reset_relay #(
input wire clk,
input wire reset,
output wire [N-1:0] reset_o
);
if (MAX_FANOUT >= 0 && N > MAX_FANOUT) begin
);
if (MAX_FANOUT >= 0 && N > (MAX_FANOUT + MAX_FANOUT/2)) begin
localparam F = `UP(MAX_FANOUT);
localparam R = N / F;
`PRESERVE_NET reg [R-1:0] reset_r;
@ -38,6 +38,6 @@ module VX_reset_relay #(
`UNUSED_VAR (clk)
assign reset_o = {N{reset}};
end
endmodule
`TRACING_ON

View file

@ -15,9 +15,10 @@
`TRACING_OFF
module VX_rr_arbiter #(
parameter NUM_REQS = 1,
parameter MODEL = 1,
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
parameter NUM_REQS = 1,
parameter MODEL = 1,
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS),
parameter LUT_OPT = 0
) (
input wire clk,
input wire reset,
@ -37,7 +38,7 @@ module VX_rr_arbiter #(
assign grant_onehot = requests;
assign grant_valid = requests[0];
end else if (NUM_REQS == 2) begin
end else if (LUT_OPT && NUM_REQS == 2) begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;
@ -63,7 +64,7 @@ module VX_rr_arbiter #(
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
end /*else if (NUM_REQS == 3) begin
end else if (LUT_OPT && NUM_REQS == 3) begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;
@ -93,7 +94,7 @@ module VX_rr_arbiter #(
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
end */else if (NUM_REQS == 4) begin
end else if (LUT_OPT && NUM_REQS == 4) begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;
@ -129,7 +130,7 @@ module VX_rr_arbiter #(
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
end /*else if (NUM_REQS == 5) begin
end else if (LUT_OPT && NUM_REQS == 5) begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;
@ -173,7 +174,7 @@ module VX_rr_arbiter #(
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
end else if (NUM_REQS == 6) begin
end else if (LUT_OPT && NUM_REQS == 6) begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;
@ -227,7 +228,7 @@ module VX_rr_arbiter #(
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
end else if (NUM_REQS == 7) begin
end else if (LUT_OPT && NUM_REQS == 7) begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;
@ -293,7 +294,7 @@ module VX_rr_arbiter #(
assign grant_onehot = grant_onehot_r;
assign grant_valid = (| requests);
end */else if (NUM_REQS == 8) begin
end else if (LUT_OPT && NUM_REQS == 8) begin
reg [LOG_NUM_REQS-1:0] grant_index_r;
reg [NUM_REQS-1:0] grant_onehot_r;

View file

@ -21,13 +21,16 @@ module VX_sp_ram #(
parameter WRENW = 1,
parameter OUT_REG = 0,
parameter NO_RWCHECK = 0,
parameter RW_ASSERT = 0,
parameter LUTRAM = 0,
parameter RESET_RAM = 0,
parameter INIT_ENABLE = 0,
parameter INIT_FILE = "",
parameter [DATAW-1:0] INIT_VALUE = 0,
parameter ADDRW = `LOG2UP(SIZE)
) (
input wire clk,
input wire reset,
input wire read,
input wire write,
input wire [WRENW-1:0] wren,
@ -42,13 +45,16 @@ module VX_sp_ram #(
.WRENW (WRENW),
.OUT_REG (OUT_REG),
.NO_RWCHECK (NO_RWCHECK),
.RW_ASSERT (RW_ASSERT),
.LUTRAM (LUTRAM),
.RESET_RAM (RESET_RAM),
.INIT_ENABLE (INIT_ENABLE),
.INIT_FILE (INIT_FILE),
.INIT_VALUE (INIT_VALUE),
.ADDRW (ADDRW)
) dp_ram (
.clk (clk),
.reset (reset),
.read (read),
.write (write),
.wren (wren),

View file

@ -18,7 +18,7 @@ module VX_stream_arb #(
parameter NUM_INPUTS = 1,
parameter NUM_OUTPUTS = 1,
parameter DATAW = 1,
parameter `STRING ARBITER = "P",
parameter `STRING ARBITER = "R",
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_BUF = 0,
parameter LUTRAM = 0,
@ -46,14 +46,14 @@ module VX_stream_arb #(
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
localparam BATCH_BEGIN = i * NUM_REQS;
localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_INPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * NUM_REQS;
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_INPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (BATCH_SIZE),
.NUM_INPUTS (SLICE_SIZE),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
@ -63,9 +63,9 @@ module VX_stream_arb #(
) arb_slice (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]),
.ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]),
.data_in (data_in[BATCH_END-1: BATCH_BEGIN]),
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
.data_out (data_out[i]),
.sel_out (sel_out[i]),
.valid_out (valid_out[i]),
@ -73,32 +73,32 @@ module VX_stream_arb #(
);
end
end else if (MAX_FANOUT != 0 && (NUM_INPUTS > MAX_FANOUT)) begin
end else if (MAX_FANOUT != 0 && (NUM_INPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin
// (#inputs > max_fanout) and (#outputs == 1)
localparam NUM_BATCHES = `CDIV(NUM_INPUTS, MAX_FANOUT);
localparam NUM_SLICES = `CDIV(NUM_INPUTS, MAX_FANOUT);
localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT);
localparam LOG_NUM_REQS3 = `CLOG2(NUM_BATCHES);
localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES);
wire [NUM_BATCHES-1:0] valid_tmp;
wire [NUM_BATCHES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
wire [NUM_BATCHES-1:0] ready_tmp;
wire [NUM_SLICES-1:0] valid_tmp;
wire [NUM_SLICES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp;
wire [NUM_SLICES-1:0] ready_tmp;
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
for (genvar i = 0; i < NUM_SLICES; ++i) begin
localparam BATCH_BEGIN = i * MAX_FANOUT;
localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_INPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * MAX_FANOUT;
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_INPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
wire [DATAW-1:0] data_tmp_u;
wire [`LOG2UP(BATCH_SIZE)-1:0] sel_tmp_u;
wire [`LOG2UP(SLICE_SIZE)-1:0] sel_tmp_u;
`RESET_RELAY (slice_reset, reset);
if (MAX_FANOUT != 1) begin
VX_stream_arb #(
.NUM_INPUTS (BATCH_SIZE),
.NUM_INPUTS (SLICE_SIZE),
.NUM_OUTPUTS (1),
.DATAW (DATAW),
.ARBITER (ARBITER),
@ -108,9 +108,9 @@ module VX_stream_arb #(
) fanout_slice_arb (
.clk (clk),
.reset (slice_reset),
.valid_in (valid_in[BATCH_END-1: BATCH_BEGIN]),
.data_in (data_in[BATCH_END-1: BATCH_BEGIN]),
.ready_in (ready_in[BATCH_END-1: BATCH_BEGIN]),
.valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]),
.data_in (data_in[SLICE_END-1: SLICE_BEGIN]),
.ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_tmp[i]),
.data_out (data_tmp_u),
.sel_out (sel_tmp_u),
@ -125,7 +125,7 @@ module VX_stream_arb #(
wire [LOG_NUM_REQS3-1:0] sel_out_u;
VX_stream_arb #(
.NUM_INPUTS (NUM_BATCHES),
.NUM_INPUTS (NUM_SLICES),
.NUM_OUTPUTS (1),
.DATAW (DATAW + LOG_NUM_REQS2),
.ARBITER (ARBITER),
@ -174,17 +174,9 @@ module VX_stream_arb #(
);
assign valid_in_r = arb_valid;
assign data_in_r = data_in[arb_index];
assign arb_ready = ready_in_r;
VX_onehot_mux #(
.DATAW (DATAW),
.N (NUM_REQS)
) onehot_mux (
.data_in (data_in),
.sel_in (arb_onehot),
.data_out (data_in_r)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign ready_in[i] = ready_in_r && arb_onehot[i];
end
@ -214,15 +206,15 @@ module VX_stream_arb #(
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
localparam BATCH_BEGIN = i * NUM_REQS;
localparam BATCH_END = `MIN(BATCH_BEGIN + NUM_REQS, NUM_OUTPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * NUM_REQS;
localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_OUTPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (BATCH_SIZE),
.NUM_OUTPUTS (SLICE_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -234,30 +226,30 @@ module VX_stream_arb #(
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),
.data_out (data_out[BATCH_END-1: BATCH_BEGIN]),
.valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
.ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
`UNUSED_PIN (sel_out)
);
for (genvar j = BATCH_BEGIN; j < BATCH_END; ++j) begin
for (genvar j = SLICE_BEGIN; j < SLICE_END; ++j) begin
assign sel_out[j] = i;
end
end
end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > MAX_FANOUT)) begin
end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin
// (#inputs == 1) and (#outputs > max_fanout)
localparam NUM_BATCHES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);
localparam NUM_SLICES = `CDIV(NUM_OUTPUTS, MAX_FANOUT);
wire [NUM_BATCHES-1:0] valid_tmp;
wire [NUM_BATCHES-1:0][DATAW-1:0] data_tmp;
wire [NUM_BATCHES-1:0] ready_tmp;
wire [NUM_SLICES-1:0] valid_tmp;
wire [NUM_SLICES-1:0][DATAW-1:0] data_tmp;
wire [NUM_SLICES-1:0] ready_tmp;
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (NUM_BATCHES),
.NUM_OUTPUTS (NUM_SLICES),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -275,17 +267,17 @@ module VX_stream_arb #(
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
for (genvar i = 0; i < NUM_SLICES; ++i) begin
localparam BATCH_BEGIN = i * MAX_FANOUT;
localparam BATCH_END = `MIN(BATCH_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
localparam BATCH_SIZE = BATCH_END - BATCH_BEGIN;
localparam SLICE_BEGIN = i * MAX_FANOUT;
localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_OUTPUTS);
localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN;
`RESET_RELAY (slice_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (1),
.NUM_OUTPUTS (BATCH_SIZE),
.NUM_OUTPUTS (SLICE_SIZE),
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
@ -297,9 +289,9 @@ module VX_stream_arb #(
.valid_in (valid_tmp[i]),
.ready_in (ready_tmp[i]),
.data_in (data_tmp[i]),
.data_out (data_out[BATCH_END-1: BATCH_BEGIN]),
.valid_out (valid_out[BATCH_END-1: BATCH_BEGIN]),
.ready_out (ready_out[BATCH_END-1: BATCH_BEGIN]),
.data_out (data_out[SLICE_END-1: SLICE_BEGIN]),
.valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]),
.ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]),
`UNUSED_PIN (sel_out)
);
end
@ -357,9 +349,9 @@ module VX_stream_arb #(
// #Inputs == #Outputs
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
`RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT);
`RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1));
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
VX_elastic_buffer #(
.DATAW (DATAW),
@ -368,7 +360,7 @@ module VX_stream_arb #(
.LUTRAM (LUTRAM)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.reset (out_buf_reset[i]),
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),

View file

@ -39,8 +39,9 @@ module VX_stream_pack #(
input wire ready_out
);
if (NUM_REQS > 1) begin
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS);
wire [NUM_REQS-1:0] grant_onehot;
wire [LOG_NUM_REQS-1:0] grant_index;
wire grant_valid;
wire grant_ready;
@ -52,21 +53,12 @@ module VX_stream_pack #(
.reset (reset),
.requests (valid_in),
.grant_valid (grant_valid),
`UNUSED_PIN (grant_index),
.grant_onehot(grant_onehot),
.grant_index (grant_index),
`UNUSED_PIN (grant_onehot),
.grant_ready (grant_ready)
);
wire [TAG_WIDTH-1:0] tag_sel;
VX_onehot_mux #(
.DATAW (TAG_WIDTH),
.N (NUM_REQS)
) onehot_mux (
.data_in (tag_in),
.sel_in (grant_onehot),
.data_out (tag_sel)
);
wire [TAG_WIDTH-1:0] tag_sel = tag_in[grant_index];
wire [NUM_REQS-1:0] tag_matches;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -33,7 +33,7 @@ module VX_stream_switch #(
output wire [NUM_INPUTS-1:0] ready_in,
output wire [NUM_OUTPUTS-1:0] valid_out,
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
input wire [NUM_OUTPUTS-1:0] ready_out
);
if (NUM_INPUTS > NUM_OUTPUTS) begin
@ -52,7 +52,7 @@ module VX_stream_switch #(
assign data_in_r[i][j] = '0;
end
end
end
end
wire [NUM_OUTPUTS-1:0] valid_out_r;
wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r;
@ -65,25 +65,24 @@ module VX_stream_switch #(
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
for (genvar j = 0; j < NUM_REQS; ++j) begin
localparam ii = i * NUM_REQS + j;
if (ii < NUM_INPUTS) begin
localparam ii = i * NUM_REQS + j;
if (ii < NUM_INPUTS) begin
assign ready_in[ii] = ready_out_r[i] & (sel_in[i] == LOG_NUM_REQS'(j));
end
end
end
`RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
`RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1));
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_out_r[i]),
.reset (out_buf_reset[i]),
.valid_in (valid_out_r[i]),
.ready_in (ready_out_r[i]),
.data_in (data_out_r[i]),
.data_out (data_out[i]),
@ -93,7 +92,7 @@ module VX_stream_switch #(
end
end else if (NUM_OUTPUTS > NUM_INPUTS) begin
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_r;
wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_r;
@ -104,51 +103,50 @@ module VX_stream_switch #(
assign ready_in[i] = ready_out_r[i][sel_in[i]];
end
`RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar j = 0; j < NUM_REQS; ++j) begin
localparam ii = i * NUM_REQS + j;
if (ii < NUM_OUTPUTS) begin
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.reset (out_buf_reset[ii]),
.valid_in (valid_out_r[i][j]),
.ready_in (ready_out_r[i][j]),
.data_in (data_in[i]),
.data_in (data_in[i]),
.data_out (data_out[ii]),
.valid_out (valid_out[ii]),
.ready_out (ready_out[ii])
);
end else begin
`UNUSED_VAR (out_buf_reset[ii])
`UNUSED_VAR (valid_out_r[i][j])
assign ready_out_r[i][j] = '0;
end
end
end
end
end else begin
// #Inputs == #Outputs
`UNUSED_VAR (sel_in)
`RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
`RESET_RELAY_EN (out_buf_reset, reset, (NUM_OUTPUTS > 1));
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.reset (out_buf_reset[i]),
.valid_in (valid_in[i]),
.ready_in (ready_in[i]),
.data_in (data_in[i]),
@ -159,6 +157,6 @@ module VX_stream_switch #(
end
end
endmodule
`TRACING_ON

View file

@ -20,7 +20,7 @@ module VX_stream_xbar #(
parameter DATAW = 4,
parameter IN_WIDTH = `LOG2UP(NUM_INPUTS),
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
parameter ARBITER = "P",
parameter ARBITER = "R",
parameter OUT_BUF = 0,
parameter LUTRAM = 0,
parameter MAX_FANOUT = `MAX_FANOUT,
@ -126,10 +126,9 @@ module VX_stream_xbar #(
assign data_out_r = {NUM_OUTPUTS{data_in}};
assign ready_in = ready_out_r[sel_in];
`RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
@ -137,7 +136,7 @@ module VX_stream_xbar #(
.LUTRAM (LUTRAM)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.reset (out_buf_reset[i]),
.valid_in (valid_out_r[i]),
.ready_in (ready_out_r[i]),
.data_in (data_out_r[i]),

View file

@ -94,7 +94,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_req_idx;
wire [NUM_BANKS-1:0] per_bank_req_ready;
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_all;
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_aos;
wire [NUM_REQS-1:0] req_valid_in;
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
@ -111,7 +111,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
req_bank_addr[i],
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.data,
mem_bus_if[i].req_data.tag};
mem_bus_if[i].req_data.tag
};
assign mem_bus_if[i].req_ready = req_ready_in[i];
end
@ -120,6 +121,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("F"),
.OUT_BUF (3) // output should be registered for the data_store addressing
) req_xbar (
.clk (clk),
@ -134,7 +136,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.sel_in (req_bank_idx),
.ready_in (req_ready_in),
.valid_out (per_bank_req_valid),
.data_out (per_bank_req_data_all),
.data_out (per_bank_req_data_aos),
.sel_out (per_bank_req_idx),
.ready_out (per_bank_req_ready)
);
@ -145,7 +147,8 @@ module VX_local_mem import VX_gpu_pkg::*; #(
per_bank_req_addr[i],
per_bank_req_byteen[i],
per_bank_req_data[i],
per_bank_req_tag[i]} = per_bank_req_data_all[i];
per_bank_req_tag[i]
} = per_bank_req_data_aos[i];
end
// banks access
@ -156,38 +159,55 @@ module VX_local_mem import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_rsp_tag;
wire [NUM_BANKS-1:0] per_bank_rsp_ready;
`RESET_RELAY (bank_reset, reset);
for (genvar i = 0; i < NUM_BANKS; ++i) begin
wire bank_rsp_valid, bank_rsp_ready;
wire [WORD_WIDTH-1:0] bank_rsp_data;
`RESET_RELAY_EN (bram_reset, reset, (NUM_BANKS > 1));
VX_sp_ram #(
.DATAW (WORD_WIDTH),
.SIZE (WORDS_PER_BANK),
.WRENW (WORD_SIZE)
.WRENW (WORD_SIZE),
.NO_RWCHECK (1)
) data_store (
.clk (clk),
.read (1'b1),
.reset (bram_reset),
.read (per_bank_req_valid[i] && per_bank_req_ready[i] && ~per_bank_req_rw[i]),
.write (per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]),
.wren (per_bank_req_byteen[i]),
.addr (per_bank_req_addr[i]),
.wdata (per_bank_req_data[i]),
.rdata (per_bank_rsp_data[i])
.rdata (bank_rsp_data)
);
// drop write response
wire per_bank_req_valid_w, per_bank_req_ready_w;
assign per_bank_req_valid_w = per_bank_req_valid[i] && ~per_bank_req_rw[i];
assign per_bank_req_ready[i] = per_bank_req_ready_w || per_bank_req_rw[i];
// read-during-write hazard detection
reg [BANK_ADDR_WIDTH-1:0] last_wr_addr;
reg last_wr_valid;
always @(posedge clk) begin
if (bram_reset) begin
last_wr_valid <= 0;
end else begin
last_wr_valid <= per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i];
end
last_wr_addr <= per_bank_req_addr[i];
end
wire is_rdw_hazard = last_wr_valid && ~per_bank_req_rw[i] && (per_bank_req_addr[i] == last_wr_addr);
VX_elastic_buffer #(
.DATAW (REQ_SEL_WIDTH + TAG_WIDTH),
.SIZE (0)
) bank_buf (
// drop write response and stall on read-during-write hazard
assign bank_rsp_valid = per_bank_req_valid[i] && ~per_bank_req_rw[i] && ~is_rdw_hazard;
assign per_bank_req_ready[i] = (bank_rsp_ready || per_bank_req_rw[i]) && ~is_rdw_hazard;
// register BRAM output
VX_pipe_buffer #(
.DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH)
) bram_buf (
.clk (clk),
.reset (bank_reset),
.valid_in (per_bank_req_valid_w),
.ready_in (per_bank_req_ready_w),
.data_in ({per_bank_req_idx[i], per_bank_req_tag[i]}),
.data_out ({per_bank_rsp_idx[i], per_bank_rsp_tag[i]}),
.reset (bram_reset),
.valid_in (bank_rsp_valid),
.ready_in (bank_rsp_ready),
.data_in ({per_bank_req_idx[i], bank_rsp_data, per_bank_req_tag[i]}),
.data_out ({per_bank_rsp_idx[i], per_bank_rsp_data[i], per_bank_rsp_tag[i]}),
.valid_out (per_bank_rsp_valid[i]),
.ready_out (per_bank_rsp_ready[i])
);
@ -195,10 +215,10 @@ module VX_local_mem import VX_gpu_pkg::*; #(
// bank responses gather
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_all;
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_aos;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign per_bank_rsp_data_all[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
assign per_bank_rsp_data_aos[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
end
wire [NUM_REQS-1:0] rsp_valid_out;
@ -209,6 +229,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (RSP_DATAW),
.ARBITER ("P"), // this priority arbiter has negligible impact on performance
.OUT_BUF (OUT_BUF)
) rsp_xbar (
.clk (clk),
@ -216,7 +237,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
`UNUSED_PIN (collisions),
.sel_in (per_bank_rsp_idx),
.valid_in (per_bank_rsp_valid),
.data_in (per_bank_rsp_data_all),
.data_in (per_bank_rsp_data_aos),
.ready_in (per_bank_rsp_ready),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out),
@ -310,7 +331,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i]));
end else begin
`TRACE(1, ("%d: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
@ -318,7 +339,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], rsp_uuid[i]));
end
end
@ -328,7 +349,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
if (per_bank_req_rw[i]) begin
`TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
`TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_uuid[i]));
end else begin
`TRACE(2, ("%d: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
@ -336,7 +357,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
end
end
if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin
`TRACE(2, ("%d: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n",
`TRACE(2, ("%d: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, per_bank_rsp_tag[i], per_bank_rsp_data[i], per_bank_rsp_uuid[i]));
end
end

View file

@ -73,12 +73,12 @@ ifneq ($(TARGET), fpga)
CFLAGS += -DSIMULATION
endif
# Debugigng
# Debugging
ifdef DEBUG
ifneq ($(TARGET), fpga)
CFLAGS += -DNDEBUG
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
else
CFLAGS += $(DBG_TRACE_FLAGS)
CFLAGS += -DNDEBUG
endif
else
CFLAGS += -DNDEBUG

View file

@ -1 +1 @@
create_clock -name {clk} -period "220 MHz" -waveform { 0.000 1.0 } [get_ports {clk}]
create_clock -name {clk} -period "200 MHz" -waveform { 0.000 1.0 } [get_ports {clk}]

View file

@ -45,6 +45,7 @@ FPGA_BIN_DIR=<bin_dir> XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xr
# build report logs
<build_dir>/bin/vortex_afu.xclbin.info
<build_dir>/_x/logs/link/vivado.log # search for keyword "Very high fanout"
<build_dir>/_x/reports/link/link/imp/impl_1_full_util_routed.rpt
<build_dir>/_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt # search for keyword "VIOLATED"
<build_dir>/_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log

View file

@ -111,14 +111,14 @@ ifeq ($(TARGET), hw_emu)
CFLAGS += -DSIMULATION
endif
# Debugigng
# Debugging
ifdef DEBUG
VPP_FLAGS += -g --debug.protocol all
ifneq ($(TARGET), hw)
CFLAGS += -DNDEBUG
else
VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all
CFLAGS += $(DBG_TRACE_FLAGS)
CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS)
else
CFLAGS += -DNDEBUG
endif
else
VPP_FLAGS += --optimize 3

View file

@ -49,7 +49,7 @@ endif
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache
RTL_INCLUDE += $(FPU_INCLUDE)
# Debugigng
# Debugging
ifdef DEBUG
CFLAGS += $(DBG_TRACE_FLAGS)
else

View file

@ -29,7 +29,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count()
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)

View file

@ -13,6 +13,7 @@
#include <VX_config.h>
#include <VX_types.h>
#include <newlib.h>
#include "common.h"
.section .init, "ax"
@ -51,12 +52,10 @@ _start:
# la t0, trap_entry
# csrw mtvec, t0
# register global termination functions
la a0, __libc_fini_array
call atexit
#ifdef HAVE_INITFINI_ARRAY
# run global initialization functions
call __libc_init_array
#endif
# call main program routine
call main

View file

@ -119,70 +119,13 @@ void __libc_fini_array (void) {
}
#endif
/*
#define MAX_CORES 64
volatile int g_cxa_locks[MAX_CORES] = {0};
*/
void __cxa_lock() {
/*int core_id = vx_core_id();
g_cxa_locks[core_id] = 1;
vx_fence();
for (int i = 1; i < MAX_CORES; ++i) {
int other = (core_id + i) % MAX_CORES;
while (g_cxa_locks[other]) {
vx_fence(); // cache coherence not supported, so we need to flush the caches
}
}*/
}
void __cxa_unlock() {
/*vx_fence();
int core_id = vx_core_id();
g_cxa_locks[core_id] = 0;*/
}
#define MAX_FEXITS 64
typedef struct {
void (*f[MAX_FEXITS])(void*);
void *a[MAX_FEXITS];
} fexit_list_t;
static fexit_list_t g_fexit_list;
static int g_num_fexits = 0;
void __funcs_on_exit() {
void (*func)(void *), *arg;
fexit_list_t* fexit_list = &g_fexit_list;
for (int i = 0; i < g_num_fexits; ++i) {
func = fexit_list->f[i];
arg = fexit_list->a[i];
func(arg);
}
}
void __cxa_finalize(void *dso) {}
int __cxa_atexit(void (*func)(void *), void *arg, void *dso) {
__cxa_lock();
int num_fexits = g_num_fexits;
if (num_fexits >= MAX_FEXITS)
return -1;
fexit_list_t* fexit_list = &g_fexit_list;
fexit_list->f[num_fexits] = func;
fexit_list->a[num_fexits] = arg;
g_num_fexits = num_fexits + 1;
__cxa_unlock();
return 0;
}
static void call(void *p) {
((void (*)(void))(uintptr_t)p)();
}
int atexit(void (*func)(void)) {
return __cxa_atexit(call, (void*)(uintptr_t)func, 0);
// This function will be called by LIBC at program exit.
// Since this platform only supports statically linked programs,
// registering exit functions with LIBC via atexit() does not need to be supported.
void __funcs_on_exit (void) {
// __libc_fini_array() is only invoked when HAVE_INITFINI_ARRAY is defined;
// otherwise this function does nothing.
#ifdef HAVE_INITFINI_ARRAY
__libc_fini_array();
#endif
}
#ifdef __cplusplus

View file

@ -21,6 +21,7 @@
#include <cstdint>
#include <unordered_map>
#include <array>
#define CACHE_BLOCK_SIZE 64

View file

@ -34,6 +34,7 @@ typedef void* vx_buffer_h;
#define VX_CAPS_GLOBAL_MEM_SIZE 0x5
#define VX_CAPS_LOCAL_MEM_SIZE 0x6
#define VX_CAPS_ISA_FLAGS 0x7
#define VX_CAPS_NUM_MEM_BANKS 0x8
// device isa flags
#define VX_ISA_STD_A (1ull << ISA_STD_A)

View file

@ -30,7 +30,7 @@ else
CXXFLAGS += -I$(SYN_DIR)
endif
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -232,6 +232,9 @@ public:
case VX_CAPS_ISA_FLAGS:
_value = isa_caps_;
break;
case VX_CAPS_NUM_MEM_BANKS:
_value = MEMORY_BANKS;
break;
default:
fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
std::abort();

View file

@ -19,7 +19,7 @@ LDFLAGS += -L$(DESTDIR) -lrtlsim
SRCS := $(SRC_DIR)/vortex.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -77,6 +77,9 @@ public:
case VX_CAPS_ISA_FLAGS:
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break;
case VX_CAPS_NUM_MEM_BANKS:
_value = MEMORY_BANKS;
break;
default:
std::cout << "invalid caps id: " << caps_id << std::endl;
std::abort();

View file

@ -19,7 +19,7 @@ LDFLAGS += -L$(DESTDIR) -lsimx
SRCS := $(SRC_DIR)/vortex.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -105,6 +105,9 @@ public:
case VX_CAPS_ISA_FLAGS:
_value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD;
break;
case VX_CAPS_NUM_MEM_BANKS:
_value = MEMORY_BANKS;
break;
default:
std::cout << "invalid caps id: " << caps_id << std::endl;
std::abort();

View file

@ -12,7 +12,7 @@ LDFLAGS += -shared -pthread -ldl
SRCS := $(SRC_DIR)/vortex.cpp $(SRC_DIR)/utils.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -211,6 +211,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
uint64_t mem_reads = 0;
uint64_t mem_writes = 0;
uint64_t mem_lat = 0;
uint64_t mem_req_counter = 0;
uint64_t mem_ticks = 0;
uint64_t num_cores;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), {
@ -221,6 +223,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_ISA_FLAGS, &isa_flags), {
return err;
});
uint64_t num_mem_bank_ports;
CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), {
return err;
});
bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE;
bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE;
@ -314,7 +321,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
if (num_cores > 1) {
uint64_t scrb_total = scrb_alu_per_core + scrb_fpu_per_core + scrb_lsu_per_core + scrb_csrs_per_core + scrb_wctl_per_core;
int scrb_percent_per_core = calcAvgPercent(scrb_stalls_per_core, cycles_per_core);
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
fprintf(stream, "PERF: core%d: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
, core_id
, scrb_stalls_per_core
, scrb_percent_per_core
@ -533,6 +540,12 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), {
return err;
});
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_CNTR, core_id, &mem_req_counter), {
return err;
});
CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_TICK, core_id, &mem_ticks), {
return err;
});
}
} break;
default:
@ -559,7 +572,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
fprintf(stream, "PERF: scheduler idle=%ld (%d%%)\n", sched_idles, sched_idles_percent);
fprintf(stream, "PERF: scheduler stalls=%ld (%d%%)\n", sched_stalls, sched_stalls_percent);
fprintf(stream, "PERF: ibuffer stalls=%ld (%d%%)\n", ibuffer_stalls, ibuffer_percent);
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, scrs=%d%%, wctl=%d%%)\n"
fprintf(stream, "PERF: scoreboard stalls=%ld (%d%%) (alu=%d%%, fpu=%d%%, lsu=%d%%, csrs=%d%%, wctl=%d%%)\n"
, scrb_stalls
, scrb_percent
, calcAvgPercent(scrb_alu, scrb_total)
@ -599,7 +612,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
int read_hit_ratio = calcRatio(l3cache_read_misses, l3cache_reads);
int write_hit_ratio = calcRatio(l3cache_write_misses, l3cache_writes);
int bank_utilization = calcAvgPercent(l3cache_reads + l3cache_writes, l3cache_reads + l3cache_writes + l3cache_bank_stalls);
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
int mshr_utilization = calcAvgPercent(l3cache_read_misses + l3cache_write_misses, l3cache_read_misses + l3cache_write_misses + l3cache_mshr_stalls);
fprintf(stream, "PERF: l3cache reads=%ld\n", l3cache_reads);
fprintf(stream, "PERF: l3cache writes=%ld\n", l3cache_writes);
fprintf(stream, "PERF: l3cache read misses=%ld (hit ratio=%d%%)\n", l3cache_read_misses, read_hit_ratio);
@ -609,8 +622,10 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) {
}
int mem_avg_lat = caclAverage(mem_lat, mem_reads);
int memory_bank_port_utilization = calcAvgPercent(mem_req_counter, (mem_ticks * num_mem_bank_ports));
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat);
fprintf(stream, "PERF: memory bank port utilization=%d%%\n", memory_bank_port_utilization);
} break;
default:
break;

View file

@ -26,7 +26,7 @@ endif
PROJECT := libvortex-xrt.so
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else

View file

@ -404,6 +404,9 @@ public:
case VX_CAPS_ISA_FLAGS:
_value = isa_caps_;
break;
case VX_CAPS_NUM_MEM_BANKS:
_value = MEMORY_BANKS;
break;
default:
fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
std::abort();

View file

@ -41,11 +41,11 @@ public:
dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2";
dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb";
dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192;
dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = 8;
dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps";
dram_config["MemorySystem"]["Controller"]["impl"] = "Generic";
dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS";
dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank";
dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank";
dram_config["MemorySystem"]["Controller"]["RowPolicy"]["impl"] = "OpenRowPolicy";
{
YAML::Node draw_plugin;
@ -66,7 +66,7 @@ public:
auto original_buf = std::cout.rdbuf();
std::cout.rdbuf(nullstream.rdbuf());
ramulator_frontend_->finalize();
ramulator_memorysystem_->finalize();
ramulator_memorysystem_->finalize();
std::cout.rdbuf(original_buf);
}

View file

@ -59,7 +59,7 @@ void RamMemDevice::read(void* data, uint64_t addr, uint64_t size) {
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
|| (addr_end <= contents_.size())) {
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n";
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << std::dec << " failed.\n";
throw BadAddress();
}
@ -74,7 +74,7 @@ void RamMemDevice::write(const void* data, uint64_t addr, uint64_t size) {
if ((addr & (wordSize_-1))
|| (addr_end & (wordSize_-1))
|| (addr_end <= contents_.size())) {
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << " failed.\n";
std::cout << "lookup of 0x" << std::hex << (addr_end-1) << std::dec << " failed.\n";
throw BadAddress();
}
@ -115,8 +115,7 @@ void MemoryUnit::ADecoder::map(uint64_t start, uint64_t end, MemDevice &md) {
void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) {
mem_accessor_t ma;
if (!this->lookup(addr, size, &ma)) {
assert(0);
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
std::cout << "lookup of 0x" << std::hex << addr << std::dec << " failed.\n";
throw BadAddress();
}
ma.md->read(data, ma.addr, size);
@ -125,8 +124,7 @@ void MemoryUnit::ADecoder::read(void* data, uint64_t addr, uint64_t size) {
void MemoryUnit::ADecoder::write(const void* data, uint64_t addr, uint64_t size) {
mem_accessor_t ma;
if (!this->lookup(addr, size, &ma)) {
assert(0);
std::cout << "lookup of 0x" << std::hex << addr << " failed.\n";
std::cout << "lookup of 0x" << std::hex << addr << std::dec << " failed.\n";
throw BadAddress();
}
ma.md->write(data, ma.addr, size);
@ -408,7 +406,7 @@ bool ACLManager::check(uint64_t addr, uint64_t size, int flags) const {
while (it != acl_map_.end() && it->first < end) {
if (it->second.end > addr) {
if ((it->second.flags & flags) != flags) {
std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", curent flags=" << it->second.flags << ", access flags=" << flags << std::endl;
std::cout << "Memory access violation from 0x" << std::hex << addr << " to 0x" << end << ", curent flags=" << it->second.flags << ", access flags=" << flags << std::dec << std::endl;
return false; // Overlapping entry is missing at least one required flag bit
}
addr = it->second.end; // Move to the end of the current matching range
@ -759,4 +757,4 @@ std::pair<uint64_t, uint8_t> MemoryUnit::page_table_walk(uint64_t vAddr_bits, AC
return std::make_pair(cur_base_ppn, flags);
}
#endif
#endif

View file

@ -168,23 +168,23 @@ public:
{}
void* operator new(size_t /*size*/) {
return allocator().allocate();
return allocator_.allocate();
}
void operator delete(void* ptr) {
allocator().deallocate(ptr);
allocator_.deallocate(ptr);
}
protected:
Func func_;
Pkt pkt_;
static MemoryPool<SimCallEvent<Pkt>>& allocator() {
static MemoryPool<SimCallEvent<Pkt>> instance(64);
return instance;
}
static MemoryPool<SimCallEvent<Pkt>> allocator_;
};
template <typename Pkt>
MemoryPool<SimCallEvent<Pkt>> SimCallEvent<Pkt>::allocator_(64);
///////////////////////////////////////////////////////////////////////////////
template <typename Pkt>
@ -201,23 +201,23 @@ public:
{}
void* operator new(size_t /*size*/) {
return allocator().allocate();
return allocator_.allocate();
}
void operator delete(void* ptr) {
allocator().deallocate(ptr);
allocator_.deallocate(ptr);
}
protected:
const SimPort<Pkt>* port_;
Pkt pkt_;
static MemoryPool<SimPortEvent<Pkt>>& allocator() {
static MemoryPool<SimPortEvent<Pkt>> instance(64);
return instance;
}
static MemoryPool<SimPortEvent<Pkt>> allocator_;
};
template <typename Pkt>
MemoryPool<SimPortEvent<Pkt>> SimPortEvent<Pkt>::allocator_(64);
///////////////////////////////////////////////////////////////////////////////
class SimContext;

View file

@ -70,4 +70,28 @@ const char* fileExtension(const char* filepath);
#endif
void *aligned_malloc(size_t size, size_t alignment);
void aligned_free(void *ptr);
void aligned_free(void *ptr);
namespace vortex {

// Verilator data type casting.
//
// VDataCast<R, W>::get(obj) yields a value of type R that refers to the
// storage of `obj`. Which specialization is used is decided at compile time
// from the width parameter W.
// NOTE(review): call sites pass MEM_BLOCK_SIZE as W, so W is presumably a
// size in bytes - confirm.
template <typename R, size_t W, typename Selector = void>
class VDataCast;

// Narrow case (W <= 8): the object itself is the storage, so its address is
// reinterpreted as R.
template <typename R, size_t W>
class VDataCast<R, W, typename std::enable_if<(W <= 8)>::type> {
public:
  template <typename Signal>
  static R get(Signal& signal) {
    return reinterpret_cast<R>(&signal);
  }
};

// Wide case (W > 8): the object is expected to expose a data() accessor,
// whose result is reinterpreted as R.
template <typename R, size_t W>
class VDataCast<R, W, typename std::enable_if<(W > 8)>::type> {
public:
  template <typename Signal>
  static R get(Signal& signal) {
    return reinterpret_cast<R>(signal.data());
  }
};

} // namespace vortex

View file

@ -83,13 +83,13 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count()
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)
else
VL_FLAGS += -DNDEBUG
CXXFLAGS += -O3 -DNDEBUG
CXXFLAGS += -O2 -DNDEBUG
endif
# Enable scope analyzer
@ -123,7 +123,7 @@ $(DESTDIR)/vortex_afu.h : $(AFU_DIR)/vortex_afu.vh
$(SCRIPT_DIR)/gen_config.py -i $^ -o $@
$(DESTDIR)/$(PROJECT): $(SRCS) $(DESTDIR)/vortex_afu.h $(SCOPE_JSON)
verilator --build --exe -O3 $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@
verilator --build --exe $(VL_FLAGS) --cc $(TOP) --top-module $(TOP) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' --Mdir $@.obj_dir -o $@
clean:
rm -rf $(DESTDIR)/$(PROJECT).obj_dir

View file

@ -35,13 +35,13 @@
#include <unordered_map>
#include <util.h>
#ifndef MEMORY_BANKS
//#ifndef MEMORY_BANKS
#ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
#define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS
#else
#define MEMORY_BANKS 2
#endif
#endif
//#endif
#ifndef MEM_CLOCK_RATIO
#define MEM_CLOCK_RATIO 1
@ -380,7 +380,7 @@ private:
device_->vcp2af_sRxPort_c0_hdr_resp_type = 0;
memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE);
device_->vcp2af_sRxPort_c0_hdr_mdata = cci_rd_it->mdata;
/*printf("%0ld: [sim] CCI Rd Rsp: addr=%ld, mdata=%d, data=", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
/*printf("%0ld: [sim] CCI Rd Rsp: addr=0x%lx, mdata=0x%x, data=0x", timestamp, cci_rd_it->addr, cci_rd_it->mdata);
for (int i = 0; i < CACHE_BLOCK_SIZE; ++i)
printf("%02x", cci_rd_it->data[CACHE_BLOCK_SIZE-1-i]);
printf("\n");*/
@ -398,7 +398,7 @@ private:
cci_req.mdata = device_->af2cp_sTxPort_c0_hdr_mdata;
auto host_ptr = (uint64_t*)(device_->af2cp_sTxPort_c0_hdr_address * CACHE_BLOCK_SIZE);
memcpy(cci_req.data.data(), host_ptr, CACHE_BLOCK_SIZE);
//printf("%0ld: [sim] CCI Rd Req: addr=%ld, mdata=%d\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
//printf("%0ld: [sim] CCI Rd Req: addr=0x%lx, mdata=0x%x\n", timestamp, device_->af2cp_sTxPort_c0_hdr_address, cci_req.mdata);
cci_reads_.emplace_back(cci_req);
}
@ -453,7 +453,7 @@ private:
}
}
/*printf("%0ld: [sim] MEM Wr Req: bank=%d, addr=%x, data=", timestamp, b, byte_addr);
/*printf("%0ld: [sim] MEM Wr Req: bank=%d, 0x%x, data=0x", timestamp, b, byte_addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
}

View file

@ -65,7 +65,7 @@ THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count()
VL_FLAGS += -j $(THREADS)
#VL_FLAGS += --threads $(THREADS)
# Debugigng
# Debugging
ifdef DEBUG
VL_FLAGS += --trace --trace-structs $(DBG_FLAGS)
CXXFLAGS += -g -O0 $(DBG_FLAGS)

View file

@ -39,6 +39,7 @@ typedef VVortex Device;
#include <unordered_map>
#include <dram_sim.h>
#include <util.h>
#ifndef MEMORY_BANKS
#ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
@ -316,11 +317,11 @@ private:
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Rd Rsp: addr=%0lx, data=", timestamp, mem_rsp->addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
printf("%0ld: [sim] MEM Rd Rsp: addr=0x%0lx, data=0x", timestamp, mem_rsp->addr);
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
printf("%02x", mem_rsp->block[i]);
}
printf("\n");
*/
device_->m_axi_rvalid[0] = 1;
device_->m_axi_rid[0] = mem_rsp->tag;
@ -347,7 +348,7 @@ private:
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Wr Rsp: addr=%0lx\n", timestamp, mem_rsp->addr);
printf("%0ld: [sim] MEM Wr Rsp: addr=0x%0lx\n", timestamp, mem_rsp->addr);
*/
device_->m_axi_bvalid[0] = 1;
device_->m_axi_bid[0] = mem_rsp->tag;
@ -387,11 +388,15 @@ private:
} else {
// process writes
/*
printf("%0ld: [sim] MEM Wr: addr=%0x, byteen=%0lx, data=", timestamp, base_addr, byteen);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
printf("%0ld: [sim] MEM Wr: addr=0x%0lx, byteen=0x", timestamp, base_addr);
for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) {
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
}
printf(", data=0x");
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
printf("%02x", data[i]);
}
printf("\n");
*/
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
@ -459,13 +464,13 @@ private:
auto mem_rsp_it = pending_mem_reqs_.begin();
auto mem_rsp = *mem_rsp_it;
/*
printf("%0ld: [sim] MEM Rd: tag=%0lx, addr=%0lx, data=", timestamp, mem_rsp->tag, mem_rsp->addr);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", mem_rsp->block[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr);
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
printf("%02x", mem_rsp->block[i]);
}
printf("\n");
*/
memcpy(device_->mem_rsp_data.data(), mem_rsp->block.data(), MEM_BLOCK_SIZE);
memcpy(VDataCast<void*, MEM_BLOCK_SIZE>::get(device_->mem_rsp_data), mem_rsp->block.data(), MEM_BLOCK_SIZE);
device_->mem_rsp_tag = mem_rsp->tag;
pending_mem_reqs_.erase(mem_rsp_it);
mem_rd_rsp_active_ = true;
@ -480,7 +485,7 @@ private:
uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE);
if (device_->mem_req_rw) {
auto byteen = device_->mem_req_byteen;
auto data = (uint8_t*)(device_->mem_req_data.data());
auto data = VDataCast<uint8_t*, MEM_BLOCK_SIZE>::get(device_->mem_req_data);
if (byte_addr >= uint64_t(IO_COUT_ADDR)
&& byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) {
@ -499,11 +504,15 @@ private:
} else {
// process writes
/*
printf("%0ld: [sim] MEM Wr: tag=%0lx, addr=%0x, byteen=%0lx, data=", timestamp, device_->mem_req_tag, byte_addr, byteen);
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]);
}
printf("\n");
printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr);
for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) {
printf("%x", (int)((byteen >> (4 * i)) & 0xf));
}
printf(", data=0x");
for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) {
printf("%d=%02x,", i, data[i]);
}
printf("\n");
*/
for (int i = 0; i < MEM_BLOCK_SIZE; i++) {
if ((byteen >> i) & 0x1) {
@ -530,7 +539,7 @@ private:
ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE);
pending_mem_reqs_.emplace_back(mem_req);
//printf("%0ld: [sim] MEM Rd Req: addr=%0x, tag=%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
//printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag);
// send dram request
dram_queue_.push(mem_req);

View file

@ -24,7 +24,7 @@ LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulato
SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp
SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp
# Debugigng
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG)
#CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) -fsanitize=address -fno-omit-frame-pointer

View file

@ -77,8 +77,8 @@ public:
caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i));
}
caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i));
cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort);
caches_.at(i)->MemReqPorts.at(0).bind(&cache_arb->ReqIn.at(i));
cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPorts.at(0));
}
cache_arb->ReqOut.at(0).bind(&this->MemReqPort);

Some files were not shown because too many files have changed in this diff Show more