Compare commits

..

No commits in common. "master" and "v2.2" have entirely different histories.
master ... v2.2

398 changed files with 29997 additions and 54343 deletions

View file

@ -1,8 +0,0 @@
Language: Cpp
BasedOnStyle: LLVM
IndentWidth: 2
TabWidth: 2
ColumnLimit: 0
UseTab: Never
BreakBeforeBraces: Attach
AlwaysBreakTemplateDeclarations: true

View file

@ -17,17 +17,17 @@ on: [push, pull_request]
jobs:
setup:
runs-on: ubuntu-22.04
runs-on: ubuntu-20.04
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v2
with:
submodules: recursive
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
@ -36,7 +36,7 @@ jobs:
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
@ -46,7 +46,7 @@ jobs:
- name: Install Dependencies
if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
sudo bash ./ci/install_dependencies.sh
sudo bash ./ci/system_updates.sh
- name: Setup Toolchain
if: steps.cache-toolchain.outputs.cache-hit != 'true'
@ -63,7 +63,7 @@ jobs:
make -C third_party > /dev/null
build:
runs-on: ubuntu-22.04
runs-on: ubuntu-20.04
needs: setup
strategy:
matrix:
@ -71,15 +71,15 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v2
- name: Install Dependencies
run: |
sudo bash ./ci/install_dependencies.sh
sudo bash ./ci/system_updates.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
@ -88,7 +88,7 @@ jobs:
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
@ -106,31 +106,31 @@ jobs:
make tests -s > /dev/null
- name: Upload Build Artifact
uses: actions/upload-artifact@v4
uses: actions/upload-artifact@v2
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
tests:
runs-on: ubuntu-22.04
runs-on: ubuntu-20.04
needs: build
strategy:
fail-fast: false
matrix:
name: [regression, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, vector]
name: [regression, opencl, cache, config1, config2, debug, stress]
xlen: [32, 64]
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@v2
- name: Install Dependencies
run: |
sudo bash ./ci/install_dependencies.sh
sudo bash ./ci/system_updates.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
@ -139,7 +139,7 @@ jobs:
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v4
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
@ -147,7 +147,7 @@ jobs:
${{ runner.os }}-thirdparty-
- name: Download Build Artifact
uses: actions/download-artifact@v4
uses: actions/download-artifact@v2
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
@ -161,15 +161,16 @@ jobs:
./ci/regression.sh --unittest
./ci/regression.sh --isa
./ci/regression.sh --kernel
./ci/regression.sh --synthesis
./ci/regression.sh --regression
else
./ci/regression.sh --${{ matrix.name }}
fi
complete:
runs-on: ubuntu-22.04
runs-on: ubuntu-20.04
needs: tests
steps:
- name: Check Completion
run: echo "All matrix jobs passed"
run: echo "All matrix jobs passed"

3
.gitignore vendored
View file

@ -1,4 +1,3 @@
/build*
/.vscode
*.cache
*.code-workspace
*.cache

6
.gitmodules vendored
View file

@ -1,9 +1,9 @@
[submodule "third_party/fpnew"]
path = third_party/fpnew
url = https://github.com/pulp-platform/fpnew.git
[submodule "third_party/softfloat"]
path = third_party/softfloat
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
[submodule "third_party/ramulator"]
path = third_party/ramulator
url = https://github.com/CMU-SAFARI/ramulator2.git
[submodule "third_party/cvfpu"]
path = third_party/cvfpu
url = https://github.com/openhwgroup/cvfpu.git

View file

@ -1,20 +0,0 @@
FROM ubuntu:20.04
LABEL "Udit Subramanya"="usubramanya3@gatech.edu"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y build-essential valgrind git wget libpng-dev libboost-all-dev uuid-dev ccache cmake
# Third-party repository (ubuntu-toolchain-r PPA) used to install gcc-11/g++-11 on Ubuntu 20.04
RUN apt-get install -y manpages-dev software-properties-common
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
RUN apt-get install -y gcc-11 g++-11
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11
# create a directory for mounting the volume
WORKDIR /root/vortex

View file

@ -1,35 +1,10 @@
# Vortex GPGPU
Vortex is a full-stack open-source RISC-V GPGPU. Vortex supports multiple **backend drivers**, including our C++ simulator (simx), an RTL simulator, and physical Xilinx and Altera FPGAs, all controlled by a single driver script. The chosen driver determines the corresponding code invoked to run Vortex. Generally, developers will prototype their intended design in simx before going forward with an RTL implementation. Alternatively, you can get up and running by selecting a driver of your choice and running a demo program.
## Website
Vortex news can be found on its [website](https://vortex.cc.gatech.edu/)
## Citation
```
@inproceedings{10.1145/3466752.3480128,
author = {Tine, Blaise and Yalamarthy, Krishna Praveen and Elsabbagh, Fares and Hyesoon, Kim},
title = {Vortex: Extending the RISC-V ISA for GPGPU and 3D-Graphics},
year = {2021},
isbn = {9781450385572},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3466752.3480128},
doi = {10.1145/3466752.3480128},
abstract = {The importance of open-source hardware and software has been increasing. However, despite GPUs being one of the more popular accelerators across various applications, there is very little open-source GPU infrastructure in the public domain. We argue that one of the reasons for the lack of open-source infrastructure for GPUs is rooted in the complexity of their ISA and software stacks. In this work, we first propose an ISA extension to RISC-V that supports GPGPUs and graphics. The main goal of the ISA extension proposal is to minimize the ISA changes so that the corresponding changes to the open-source ecosystem are also minimal, which makes for a sustainable development ecosystem. To demonstrate the feasibility of the minimally extended RISC-V ISA, we implemented the complete software and hardware stacks of Vortex on FPGA. Vortex is a PCIe-based soft GPU that supports OpenCL and OpenGL. Vortex can be used in a variety of applications, including machine learning, graph analytics, and graphics rendering. Vortex can scale up to 32 cores on an Altera Stratix 10 FPGA, delivering a peak performance of 25.6 GFlops at 200 Mhz.},
booktitle = {MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {754--766},
numpages = {13},
keywords = {reconfigurable computing, memory systems., computer graphics},
location = {Virtual Event, Greece},
series = {MICRO '21}
}
```
Vortex is a full-stack open-source RISC-V GPGPU.
## Specifications
- Support RISC-V RV32IMAF and RV64IMAFD
- Microarchitecture:
- configurable number of cores, warps, and threads.
- configurable number of ALU, FPU, LSU, and SFU units per core.
@ -54,50 +29,48 @@ Vortex news can be found on its [website](https://vortex.cc.gatech.edu/)
- `ci`: Continuous integration scripts.
- `miscs`: Miscellaneous resources.
## Quick Start
If you are interested in a stable release of Vortex, you can download the latest release [here](https://github.com/vortexgpgpu/vortex/releases/latest). Otherwise, you can pull the most recent, but (potentially) unstable version as shown below. The following steps demonstrate how to build and run Vortex with the default driver: SimX. If you are interested in a different backend, look [here](docs/simulation.md).
## Build Instructions
More detailed build instructions can be found [here](docs/install_vortex.md).
### Supported OS Platforms
- Ubuntu 18.04, 20.04, 22.04, 24.04
- Ubuntu 18.04, 20.04
- Centos 7
### Toolchain Dependencies
The following dependencies will be fetched prebuilt by `toolchain_install.sh`.
- [POCL](http://portablecl.org/)
- [LLVM](https://llvm.org/)
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
- [Verilator](https://www.veripool.org/verilator)
- [cvfpu](https://github.com/openhwgroup/cvfpu.git)
- [FpNew](https://github.com/pulp-platform/fpnew.git)
- [SoftFloat](https://github.com/ucb-bar/berkeley-softfloat-3.git)
- [Ramulator](https://github.com/CMU-SAFARI/ramulator.git)
- [Yosys](https://github.com/YosysHQ/yosys)
- [Sv2v](https://github.com/zachjs/sv2v)
### Install development tools
```sh
sudo apt-get install build-essential
sudo apt-get install binutils
sudo apt-get install python
sudo apt-get install uuid-dev
sudo apt-get install git
```
### Install Vortex codebase
```sh
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
```
### Install system dependencies
```sh
# ensure dependent libraries are present
sudo ./ci/install_dependencies.sh
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
```
### Configure your build folder
```sh
mkdir build
cd build
# for 32bit
../configure --xlen=32 --tooldir=$HOME/tools
# for 64bit
../configure --xlen=64 --tooldir=$HOME/tools
mkdir build
cd build
../configure --xlen=32 --tooldir=$HOME/tools
```
### Install prebuilt toolchain
```sh
./ci/toolchain_install.sh --all
./ci/toolchain_install.sh --all
```
### set environment variables
### Set environment variables
```sh
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
```
### Building Vortex
```sh
@ -115,20 +88,20 @@ make -s
make -s
make install
```
- Building Vortex 64-bit requires setting --xlen=64 configure option.
- Building Vortex 64-bit simply requires using --xlen=64 configure option.
```sh
../configure --xlen=64 --tooldir=$HOME/tools
../configure --xlen=32 --tooldir=$HOME/tools
```
- Sourcing "./ci/toolchain_env.sh" is required every time you start a new terminal. We recommend adding "source <build-path>/ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login.
```sh
echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again without any options to get changes propagated to your build folder.
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder.
```sh
../configure
```
- To debug the GPU, the simulation can generate a runtime trace for analysis. See /docs/debugging.md for more information.
- To debug the GPU, you can generate a "run.log" trace. See /docs/debugging.md for more information.
```sh
./ci/blackbox.sh --app=demo --debug=3
```
- For additional information, check out the [documentation](docs/index.md)
- For additional information, check out the /docs.

View file

@ -13,9 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
SCRIPT_DIR=$(dirname "$0")
ROOT_DIR=$SCRIPT_DIR/..
show_usage()
{
echo "Vortex BlackBox Test Driver v1.0"
@ -32,174 +29,302 @@ show_help()
echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
}
# Append one option to a space-separated option string.
#   $1 - the option string built so far (may be empty)
#   $2 - the option to append
# Prints the combined string on stdout. When $1 is empty only $2 is printed,
# so the result never starts with a stray space.
add_option() {
  # printf is used instead of echo: a value such as "-n" or "-e" would be
  # consumed by echo as one of its own flags instead of being printed.
  if [ -n "$1" ]; then
    printf '%s\n' "$1 $2"
  else
    printf '%s\n' "$2"
  fi
}
SCRIPT_DIR=$(dirname "$0")
ROOT_DIR=$SCRIPT_DIR/..
# Reset every blackbox setting to its default value.
# CONFIGS is the one exception: it keeps whatever value the caller already
# provided (and becomes an empty, defined variable when it was unset).
DEFAULTS() {
  # what to run
  DRIVER=simx
  APP=sgemm
  HAS_ARGS=0

  # build configuration
  CONFIGS="${CONFIGS}"
  REBUILD=2
  TEMPBUILD=0

  # diagnostics
  DEBUG=0
  DEBUG_LEVEL=0
  SCOPE=0
  PERF_CLASS=0
  LOGFILE=run.log
}
DRIVER=simx
APP=sgemm
CLUSTERS=1
CORES=1
WARPS=4
THREADS=4
L2=
L3=
DEBUG=0
DEBUG_LEVEL=0
SCOPE=0
HAS_ARGS=0
PERF_CLASS=0
REBUILD=2
TEMPBUILD=0
LOGFILE=run.log
parse_args() {
DEFAULTS
for i in "$@"; do
case $i in
--driver=*) DRIVER=${i#*=} ;;
--app=*) APP=${i#*=} ;;
--clusters=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CLUSTERS=${i#*=}") ;;
--cores=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CORES=${i#*=}") ;;
--warps=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_WARPS=${i#*=}") ;;
--threads=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_THREADS=${i#*=}") ;;
--l2cache) CONFIGS=$(add_option "$CONFIGS" "-DL2_ENABLE") ;;
--l3cache) CONFIGS=$(add_option "$CONFIGS" "-DL3_ENABLE") ;;
--perf=*) CONFIGS=$(add_option "$CONFIGS" "-DPERF_ENABLE"); PERF_CLASS=${i#*=} ;;
--debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;;
--scope) SCOPE=1; ;;
--args=*) HAS_ARGS=1; ARGS=${i#*=} ;;
--rebuild=*) REBUILD=${i#*=} ;;
--log=*) LOGFILE=${i#*=} ;;
--help) show_help; exit 0 ;;
*) show_usage; exit 1 ;;
esac
done
for i in "$@"
do
case $i in
--driver=*)
DRIVER=${i#*=}
shift
;;
--app=*)
APP=${i#*=}
shift
;;
--clusters=*)
CLUSTERS=${i#*=}
shift
;;
--cores=*)
CORES=${i#*=}
shift
;;
--warps=*)
WARPS=${i#*=}
shift
;;
--threads=*)
THREADS=${i#*=}
shift
;;
--l2cache)
L2=-DL2_ENABLE
shift
;;
--l3cache)
L3=-DL3_ENABLE
shift
;;
--debug=*)
DEBUG_LEVEL=${i#*=}
DEBUG=1
shift
;;
--scope)
SCOPE=1
CORES=1
shift
;;
--perf=*)
PERF_FLAG=-DPERF_ENABLE
PERF_CLASS=${i#*=}
shift
;;
--args=*)
ARGS=${i#*=}
HAS_ARGS=1
shift
;;
--rebuild=*)
REBUILD=${i#*=}
shift
;;
--log=*)
LOGFILE=${i#*=}
shift
;;
--help)
show_help
exit 0
;;
*)
show_usage
exit -1
;;
esac
done
if [ $REBUILD -eq 3 ];
if [ $REBUILD -eq 3 ];
then
REBUILD=1
TEMPBUILD=1
fi
case $DRIVER in
gpu)
DRIVER_PATH=
;;
simx)
DRIVER_PATH=$ROOT_DIR/runtime/simx
;;
rtlsim)
DRIVER_PATH=$ROOT_DIR/runtime/rtlsim
;;
opae)
DRIVER_PATH=$ROOT_DIR/runtime/opae
;;
xrt)
DRIVER_PATH=$ROOT_DIR/runtime/xrt
;;
*)
echo "invalid driver: $DRIVER"
exit -1
;;
esac
if [ -d "$ROOT_DIR/tests/opencl/$APP" ];
then
APP_PATH=$ROOT_DIR/tests/opencl/$APP
elif [ -d "$ROOT_DIR/tests/regression/$APP" ];
then
APP_PATH=$ROOT_DIR/tests/regression/$APP
else
echo "Application folder not found: $APP"
exit -1
fi
if [ "$DRIVER" = "gpu" ];
then
# running application
if [ $HAS_ARGS -eq 1 ]
then
REBUILD=1
TEMPBUILD=1
fi
}
# Derive DRIVER_PATH from the selected DRIVER.
#   simx, rtlsim, opae, xrt -> $ROOT_DIR/runtime/<driver>
#   gpu                     -> empty path
# Any other driver name prints an error and terminates the script with status 1.
set_driver_path() {
  case "$DRIVER" in
    simx|rtlsim|opae|xrt)
      DRIVER_PATH="$ROOT_DIR/runtime/$DRIVER"
      ;;
    gpu)
      DRIVER_PATH=""
      ;;
    *)
      echo "Invalid driver: $DRIVER"
      exit 1
      ;;
  esac
}
set_app_path() {
if [ -d "$ROOT_DIR/tests/opencl/$APP" ]; then
APP_PATH="$ROOT_DIR/tests/opencl/$APP"
elif [ -d "$ROOT_DIR/tests/regression/$APP" ]; then
APP_PATH="$ROOT_DIR/tests/regression/$APP"
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "Application folder not found: $APP"
exit 1
fi
}
# Build the selected driver by running make in $DRIVER_PATH.
# Active settings are collected into cmd_opts as NAME=value assignments that
# are placed in front of the make command.
# Reads: DEBUG, DEBUG_LEVEL, SCOPE, TEMPBUILD, TEMPDIR, CONFIGS, DRIVER_PATH.
# make's stdout is sent to /dev/null; stderr is not redirected.
build_driver() {
local cmd_opts=""
# Each assignment is appended only when the corresponding setting is active.
[ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=$DEBUG_LEVEL")
[ $SCOPE -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "SCOPE=1")
# DESTDIR and CONFIGS values carry embedded double quotes so that values
# containing spaces survive the eval below as a single assignment.
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DESTDIR=\"$TEMPDIR\"")
[ -n "$CONFIGS" ] && cmd_opts=$(add_option "$cmd_opts" "CONFIGS=\"$CONFIGS\"")
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $DRIVER_PATH > /dev/null"
# eval re-parses the string so the NAME="value" prefixes are treated as
# assignments for the make invocation rather than as a command name.
eval "$cmd_opts make -C $DRIVER_PATH > /dev/null"
else
# No settings to forward: invoke make directly.
echo "Running: make -C $DRIVER_PATH > /dev/null"
make -C $DRIVER_PATH > /dev/null
fi
}
# Run the application's "run-$DRIVER" make target inside $APP_PATH.
# Reads: DEBUG, TEMPBUILD, TEMPDIR, HAS_ARGS, ARGS, APP_PATH, DRIVER, LOGFILE.
# Sets the global "status" and returns it.
run_app() {
local cmd_opts=""
# Optional NAME=value prefixes for the make command; values with embedded
# quotes are re-parsed by the eval calls below.
# Note: DEBUG=1 is forwarded only when DEBUG equals 1, while the log
# redirection below applies to any non-zero DEBUG.
[ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1")
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"")
[ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"")
if [ $DEBUG -ne 0 ]; then
# Debug run: stdout and stderr of the application go to $LOGFILE.
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
eval "$cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
else
echo "Running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
fi
else
# Normal run: output is left on the terminal.
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER"
eval "$cmd_opts make -C $APP_PATH run-$DRIVER"
else
echo "Running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
fi
fi
# $? here is the status of the if construct above, i.e. of the last command
# it executed (the make / eval invocation that was selected).
status=$?
return $status
}
main() {
parse_args "$@"
set_driver_path
set_app_path
# execute on default installed GPU
if [ "$DRIVER" = "gpu" ]; then
run_app
exit $?
fi
if [ -n "$CONFIGS" ]; then
echo "CONFIGS=$CONFIGS"
fi
if [ $REBUILD -ne 0 ]; then
BLACKBOX_CACHE=blackbox.$DRIVER.cache
LAST_CONFIGS=$(cat "$BLACKBOX_CACHE" 2>/dev/null || echo "")
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; then
make -C $DRIVER_PATH clean-driver > /dev/null
echo "$CONFIGS+$DEBUG+$SCOPE" > "$BLACKBOX_CACHE"
fi
fi
export VORTEX_PROFILING=$PERF_CLASS
make -C "$ROOT_DIR/hw" config > /dev/null
make -C "$ROOT_DIR/runtime/stub" > /dev/null
if [ $TEMPBUILD -eq 1 ]; then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR"
# build stub driver
echo "running: DESTDIR=$TEMPDIR make -C $ROOT_DIR/runtime/stub"
DESTDIR="$TEMPDIR" make -C $ROOT_DIR/runtime/stub > /dev/null
# register tempdir cleanup on exit
trap "rm -rf $TEMPDIR" EXIT
fi
build_driver
run_app
status=$?
if [ $DEBUG -eq 1 ] && [ -f "$APP_PATH/trace.vcd" ]; then
mv -f $APP_PATH/trace.vcd .
fi
if [ $SCOPE -eq 1 ] && [ -f "$APP_PATH/scope.vcd" ]; then
mv -f $APP_PATH/scope.vcd .
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
fi
exit $status
}
fi
main "$@"
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS"
if [ $REBUILD -ne 0 ]
then
BLACKBOX_CACHE=blackbox.$DRIVER.cache
if [ -f "$BLACKBOX_CACHE" ]
then
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
fi
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
then
make -C $DRIVER_PATH clean-driver > /dev/null
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
fi
fi
# export performance monitor class identifier
export VORTEX_PROFILING=$PERF_CLASS
status=0
# ensure config update
make -C $ROOT_DIR/hw config > /dev/null
# ensure the stub driver is present
make -C $ROOT_DIR/runtime/stub > /dev/null
if [ $DEBUG -ne 0 ]
then
# running application
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
fi
if [ -f "$APP_PATH/trace.vcd" ]
then
mv -f $APP_PATH/trace.vcd .
fi
else
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH"
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
fi
fi
fi
exit $status

View file

@ -1,46 +0,0 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Function to check if GCC version is less than 11
# Report whether the installed gcc is older than version 11.
# Returns 0 when `gcc -dumpversion` compares lower than 11 according to
# `dpkg --compare-versions`, and 1 in every other case.
check_gcc_version() {
  local installed
  installed=$(gcc -dumpversion)
  # guard-style: succeed as soon as the "older than 11" comparison holds
  dpkg --compare-versions "$installed" lt 11 && return 0
  return 1
}
# Update package list
apt-get update -y
# install system dependencies
apt-get install -y build-essential valgrind libstdc++6 binutils python3 uuid-dev ccache cmake libffi7
# Check and install GCC 11 if necessary
if check_gcc_version; then
echo "GCC version is less than 11. Installing GCC 11..."
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update
apt-get install -y g++-11 gcc-11
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100
else
echo "GCC version is 11 or greater. No need to install GCC 11."
fi

View file

@ -19,8 +19,6 @@ set -e
# clear blackbox cache
rm -f blackbox.*.cache
# HW: add a test "VM Test" to make sure VM feature is enabled
XLEN=${XLEN:=@XLEN@}
XSIZE=$((XLEN / 8))
@ -43,23 +41,31 @@ isa()
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64fx
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64fx
fi
# clean build
@ -94,18 +100,10 @@ regression()
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tgbar" --cores=2
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tbar"
# test temp driver mode
./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3
# test for matmul
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"
echo "regression tests done!"
}
@ -126,22 +124,6 @@ opencl()
echo "opencl tests done!"
}
# Virtual-memory test pass.
# For each VM configuration, sim/simx and runtime/simx are cleaned and rebuilt
# with that CONFIGS value, then the run-simx target of the opencl and
# regression test trees is executed.
vm(){
  local vm_configs component suite
  echo "begin vm tests..."
  for vm_configs in "-DVM_ENABLE" "-DVM_ENABLE -DVM_ADDR_MODE=BARE"; do
    # rebuild simulator and runtime with the current VM configuration
    for component in sim/simx runtime/simx; do
      make -C $component clean && CONFIGS="$vm_configs" make -C $component
    done
    # run both test trees against the rebuilt simx
    for suite in tests/opencl tests/regression; do
      make -C $suite run-simx
    done
  done
  echo "vm tests done!"
}
cache()
{
echo "begin cache tests..."
@ -158,33 +140,27 @@ cache()
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DDISABLE_L1" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=1 -DDCACHE_NUM_WAYS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=simx --app=sgemmx --threads=8
# replacement policy
CONFIGS="-DDCACHE_REPL_POLICY=0" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_REPL_POLICY=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_REPL_POLICY=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test writeback
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=0 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
# cache clustering
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2
@ -259,39 +235,33 @@ config2()
# test opaesim
./ci/blackbox.sh --driver=opae --app=printf
./ci/blackbox.sh --driver=opae --app=diverge
./ci/blackbox.sh --driver=xrt --app=diverge
# disable DPI
if [ "$XLEN" == "64" ]; then
# need to disable trig on 64-bit due to a bug inside fpnew's sqrt core.
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-xtrig -xbar -xgbar"
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-xtrig -xbar -xgbar"
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-xtrig -xbar -xgbar"
else
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood
fi
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# custom program startup address
make -C tests/regression/dogfood clean-kernel
STARTUP_ADDR=0x80000000 make -C tests/regression/dogfood
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
./ci/blackbox.sh --driver=simx --app=dogfood
./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-kernel
# disabling M & F extensions
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean
# disabling ZICOND extension
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
# test 128-bit memory block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=xrt --app=mstress
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress
# test XLEN-bit memory block
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
# test XLEN-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress
@ -299,35 +269,11 @@ config2()
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
# test single-bank memory
if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
else
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
fi
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
# test larger memory address
if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=xrt --app=mstress
else
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=xrt --app=mstress
fi
# test memory banks interleaving
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
# test memory ports
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress
echo "configuration-2 tests done!"
}
@ -353,32 +299,20 @@ debug()
test_csv_trace
CONFIGS="-O0" ./ci/blackbox.sh --driver=opae --app=demo --args="-n1"
CONFIGS="-O0" ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=xrt --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
echo "debugging tests done!"
}
# Scope tests: run the demo app (args "-n1") through blackbox.sh with the
# --scope option, once on the opae driver and once on the xrt driver.
# SCOPE_DEPTH=128 is passed in the environment of each invocation.
scope()
{
echo "begin scope tests..."
SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=opae --app=demo --args="-n1" --scope
SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1" --scope
echo "debugging scope done!"
}
# Stress tests: every blackbox.sh run below passes -DVERILATOR_RESET_VALUE=1
# through CONFIGS.
# NOTE(review): this listing comes from a rendered diff without +/- markers,
# so the run lines below may belong to different revisions -- confirm against
# the actual script before relying on the exact set of runs.
stress()
{
echo "begin stress tests..."
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=xrt --app=sgemmx --args="-n128" --l2cache
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
echo "stress tests done!"
}
@ -388,25 +322,15 @@ synthesis()
echo "begin synthesis tests..."
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" make -C hw/syn/yosys synthesis
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis
echo "synthesis tests done!"
}
# Vector tests: rebuild simx with the vector extension enabled, then run the
# riscv-vector-tests driver script.
vector()
{
echo "begin vector tests..."
# clean simx, then build it with CONFIGS="-DEXT_V_ENABLE"
make -C sim/simx clean && CONFIGS="-DEXT_V_ENABLE" make -C sim/simx
# @TOOLDIR@ and @XLEN@ are template placeholders -- presumably substituted by
# the configure script when this file is generated (TODO confirm).
TOOLDIR=@TOOLDIR@ XLEN=@XLEN@ VLEN=256 REG_TESTS=1 ./tests/riscv/riscv-vector-tests/run-test.sh
echo "vector tests done!"
}
# Print the script banner and the list of supported command-line options.
# $0 expands to the name the script was invoked with.
# NOTE(review): two "Usage:" lines appear here because this listing is a
# rendered diff showing both revisions; the real script presumably has only
# one of them -- confirm.
show_usage()
{
echo "Vortex Regression Test"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [--h|--help]"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
}
declare -a tests=()
@ -435,9 +359,6 @@ while [ "$1" != "" ]; do
--cache )
tests+=("cache")
;;
--vm )
tests+=("vm")
;;
--config1 )
tests+=("config1")
;;
@ -447,18 +368,12 @@ while [ "$1" != "" ]; do
--debug )
tests+=("debug")
;;
--scope )
tests+=("scope")
;;
--stress )
tests+=("stress")
;;
--synthesis )
tests+=("synthesis")
;;
--vector )
tests+=("vector")
;;
--all )
tests=()
tests+=("unittest")
@ -467,14 +382,11 @@ while [ "$1" != "" ]; do
tests+=("regression")
tests+=("opencl")
tests+=("cache")
tests+=("vm")
tests+=("config1")
tests+=("config2")
tests+=("debug")
tests+=("scope")
tests+=("stress")
tests+=("synthesis")
tests+=("vector")
;;
-h | --help )
show_usage

27
ci/system_updates.sh Executable file
View file

@ -0,0 +1,27 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort on the first command that fails.
set -e

# Refresh the package index.
apt-get update -y

# Add the ubuntu-toolchain-r/test PPA and refresh the index again so its
# packages become visible to apt.
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update

# Install GCC/G++ 11 and register them as alternatives for the generic
# gcc/g++ commands with priority 100.
apt-get install -y g++-11 gcc-11
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100

# Install the remaining build and test dependencies.
apt-get install -y build-essential valgrind libstdc++6 binutils python uuid-dev ccache

View file

@ -1,13 +1,13 @@
#!/bin/sh
# Copyright 2019-2023
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,6 +15,7 @@
# limitations under the License.
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
export PATH=$TOOLDIR/verilator/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v

View file

@ -23,9 +23,9 @@ OSVERSION=${OSVERSION:=@OSVERSION@}
riscv32()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..l}) ;;
"ubuntu/bionic") parts=$(eval echo {a..j}) ;;
*) parts=$(eval echo {a..k}) ;;
"centos/7") parts=$(eval echo {a..h}) ;;
"ubuntu/focal") parts=$(eval echo {a..k}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv32-gnu-toolchain.tar.bz2.parta*
for x in $parts
@ -41,7 +41,7 @@ riscv32()
riscv64()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..l}) ;;
"centos/7") parts=$(eval echo {a..h}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv64-gnu-toolchain.tar.bz2.parta*

View file

@ -44,8 +44,7 @@ def load_config(filename):
'num_barriers': int(config_match.group(7)),
}
return config
print("Error: missing CONFIGS: header")
sys.exit(1)
return None
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
@ -275,8 +274,6 @@ def split_log_file(log_filename):
if current_sublog is not None:
sublogs.append(current_sublog)
else:
sublogs.append(log_lines)
return sublogs

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python3
#!/usr/bin/env python
# Copyright 2019-2023
#

View file

@ -31,4 +31,7 @@ RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv$(XLEN)-gnu-toolchain
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party

10
configure vendored
View file

@ -26,8 +26,6 @@ detect_osversion() {
case "$VERSION_CODENAME" in
bionic) osversion="ubuntu/bionic";;
focal) osversion="ubuntu/focal";;
jammy) osversion="ubuntu/focal";;
noble) osversion="ubuntu/focal";;
# Add new versions as needed
esac
;;
@ -65,7 +63,7 @@ copy_files() {
filename_no_ext="${filename%.in}"
dest_file="$dest_dir/$filename_no_ext"
mkdir -p "$dest_dir"
sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g" "$file" > "$dest_file"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g" "$file" > "$dest_file"
# apply permissions to bash scripts
read -r firstline < "$dest_file"
if [[ "$firstline" =~ ^#!.*bash ]]; then
@ -169,8 +167,8 @@ fi
SUBDIRS=("." "!ci" "!perf" "hw*" "kernel*" "runtime*" "sim*" "tests*")
# Get the directory of the script
SOURCE_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
THIRD_PARTY_DIR=$SOURCE_DIR/third_party
THIRD_PARTY_DIR=$SCRIPT_DIR/third_party
copy_files "$SOURCE_DIR" "$CURRENT_DIR"
copy_files "$SCRIPT_DIR" "$CURRENT_DIR"

79
docs/altera_fpga_guide.md Normal file
View file

@ -0,0 +1,79 @@
# FPGA Startup and Configuration Guide
OPAE Environment Setup
----------------------
$ source /opt/inteldevstack/init_env_user.sh
$ export OPAE_HOME=/opt/opae/1.1.2
$ export PATH=$OPAE_HOME/bin:$PATH
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
OPAE Build
------------------
The FPGA has the following configuration options:
- DEVICE_FAMILY=arria10 | stratix10
- NUM_CORES=#n
Command line:
$ cd hw/syn/altera/opae
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
Setting TARGET=ase will build the project for simulation using Intel ASE.
OPAE Build Configuration
------------------------
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per cores
- `NUM_THREADS`: Number of threads per warps
- `PERF_ENABLE`: enable the use of all profile counters
You configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
OPAE Build Progress
-------------------
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 <build_dir>/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean
The bitstream file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
Signing the bitstream and Programming the FPGA
----------------------------------------------
$ cd <build_dir>
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
Sample FPGA Run Test
--------------------
Ensure you have the correct opae runtime for the FPGA target
$ make -C runtime/opae clean
$ TARGET=FPGA make -C runtime/opae
Run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"

View file

@ -1,37 +1,18 @@
# Contributing to Vortex
# Contributing to Vortex on Github
## Github
Vortex uses Github to host its git repositories.
There are a lot of ways to use the features on Github for collaboration.
Therefore, this documentation details the standard procedure for contributing to Vortex.
Development of Vortex is consolidated to this repo, `vortex` and any associated forks.
Previously, there was active work done on a private repo named `vortex-dev`.
`vortex-dev` has officially been deprecated and fully merged into this public repo, `vortex`.
If you are returning to this project and have legacy versions of Vortex, you can use the releases branches to access older versions.
## Github Details
- There are two main repos, `vortex` (public, this one) and `vortex-dev` (private)
- todo: Most current development is on `vortex`
- If you have a legacy version of `vortex`, you can use the releases branch or tags to access the repo at that point in time
## Contribution Process
In an effort to keep `vortex` organized, permissions to directly create branches and push code has been limited to admins.
However, contributions are strongly encouraged and keep the project moving forward! Here is the procedure for contributing:
- You should create a new branch from develop that is clearly named with the feature that you want to add
- Avoid pushing directly to the `master` branch instead you will need to make a Pull Request (PR)
- There should be protections in place that prevent pushing directly to the main branch, but don't rely on it
- When you make a PR it will be tested against the continuous integration (ci) pipeline (see `continuous_integration.md`)
- It is not sufficient to just write some tests, they need to be incorporated into the ci pipeline to make sure they are run
- During a PR, you might receive feedback regarding your changes and you might need to make further commits to your branch
1. Create a fork of `vortex`
2. In your fork, create a branch from `master` that briefly explains the work you are adding (ie: `develop-documentation`)
3. Make your changes on the new branch in your fork. You may create as many commits as you need, which might be common if you are making multiple iterations
4. Since you are the owner of your fork, you have full permissions to push commits to your fork
4. When you are satisfied with the changes on your fork, you can open a PR from your fork using the online interface
5. If you recently made a push, you will automatically get a prompt on Github online to create a PR, which you can press
6. Otherwise, you can go to your fork on Github online and manually create a PR (todo)
(todo): how to name and format your PR, what information you should add to the PR, does not need to be too strict if you are attending the weekly meetings*
7. Github uses the following semantics: `base repository` gets the changes from your `head repository`
8. Therefore, you should set the `base repository` to `vortexgpgpu/vortex` and the `base` branch to `master` since the master branch is protected by reviewed PRs.
9. And you should assign the `head repository` to `<your-github-username>/vortex` (which represents your fork of vortex) and the `base` branch to the one created in step 2
10. Now that your intended PR has been specified, you should review the status. Check for merge conflicts, if all your commits are present, and all the modified files make sense
11. You can still make a PR if there are issues in step 10, just make sure the structure is correct according to steps 7-9
12. Once the PR is made, the CI pipeline will run automatically, testing your changes
13. Remember, a PR is flexible if you need to make changes to the code you can go back to your branch of the fork to commit and push any updates
14. As long as the `head repository`'s `base` branch is the one you edited, the PR will automatically get the most recent changes
15. When all merge conflicts are resolved, changes are made, and tests pass you can have an admin merge your PR
## What Makes a Good Contribution?
- If you are contributing code changes, then review [testing.md](./testing.md) to ensure your tests are integrated into the [CI pipeline](continuous_integration.md)
- During a PR, you should consider the advice you are provided by your reviewers. Remember you keep adding commits to an open PR!
- If your change aims to fix an issue opened on Github, please tag that issue in the PR itself
## Creating and Adding Tests
see `testing.md`

View file

@ -33,13 +33,7 @@ The recommended method to enable debugging is to pass the `--debug` flag to `bla
// Running demo program on rtlsim in debug mode
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution.
By default all library modules under the /libs/ folder are excluded from the trace to reduce the waveform file size. You can change that behavior by either explicitly commenting out `TRACING_OFF`/`TRACING_ON` inside a lib module source (e.g. VX_stream_buffer.sv) or simply enabling a full trace using the following command.
// Debugging the demo program with rtlsim in full tracing mode
$ CONFIGS="-DTRACING_ALL" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc.). [GTKwave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc.). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc.). [GTKwave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
## FPGA Debugging

View file

@ -1,19 +1,16 @@
# Environment Setup
These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
## Set Up on Your Own System
## Set Up on Your Own System
The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md).
## Servers for Georgia Tech Students and Collaborators
### Volvo
Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up.
Setup on Volvo:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh volvo.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
@ -22,11 +19,9 @@ Setup on Volvo:
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
### Nio
Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio.
Setup on Nio:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh nio.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
@ -34,12 +29,11 @@ Setup on Nio:
5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
## Docker (Experimental)
## Docker (Experimental)
Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported.
### Setup with Docker
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo.
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .`

View file

@ -1,217 +0,0 @@
# FPGA Startup and Configuration Guide
## Gaining Access to FPGA's with CRNCH
If you are associated with Georgia Tech (or related workshops) you can use CRNCH's server to gain remote access to FPGA's. Otherwise, you can skip to the Xilinx or Intel (Altera) synthesis steps below.
## What is CRNCH?
**C**enter for **R**esearch into **N**ovel **C**omputing **H**ierarchies
## What does CRNCH Offer?
**The Rogues Gallery (RG)**: new concept focused on developing our understanding of next-generation hardware with a focus on unorthodox and uncommon technologies. **RG** will acquire new and unique hardware (ie, the aforementioned “*rogues*”) from vendors, research labs, and startups and make this hardware available to students, faculty, and industry collaborators within a managed data center environment
## Why are the Rogues Important?
By exposing students and researchers to this set of unique hardware, we hope to foster cross-cutting discussions about hardware designs that will drive future *performance improvements in computing long after the Moore's Law era of “cheap transistors” ends*. Specifically, the Rogues Gallery contains FPGAs which can be synthesized into Vortex hardware.
## How is the Rogues Gallery Funded?
Rogues Gallery testbed is primarily supported by the National Science Foundation (NSF) under NSF Award Number [#2016701](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2016701&HistoricalAwards=false)
## Rogues Gallery Documentation
You can read about RG in more detail on its official documentation [page](https://gt-crnch-rg.readthedocs.io/en/main/index.html#).
You can listen to a talk about RG [here](https://mediaspace.gatech.edu/media/Jeff%20Young%20-%20Rogues%20Gallery%20-%20CRNCH%20Summit%202021/1_lqlgr0jj)
[CRNCH Summit 2023](https://github.com/gt-crnch/crnch-summit-2023/tree/main)
## Request Access for Rogues Gallery
You should use [this form](https://crnch-rg.cc.gatech.edu/request-rogues-gallery-access/) to request access to RG's reconfigurable computing (vortex fpga) resources. You should receive an email with your ticket item being created. Once it gets processed, you should get an email confirming that your access has been granted. It might take some time to get processed.
## How to Access Rogues Gallery?
There are two methods of accessing CRNCH's Rogues Gallery
1) Web-based GUI: [rg-ood.crnch.gatech.edu](http://rg-ood.crnch.gatech.edu/)
2) SSH: `ssh <your-gt-username>@rg-login.crnch.gatech.edu`
## Where should I keep my files?
The CRNCH servers have a folder called `USERSCRATCH` which can be found in your home directory: `echo $HOME`. You should keep all your files in this folder since it is available across all the Rogues Gallery Nodes.
## **What Machines are Available in the Rogues Gallery?**
Complete list of machines can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/rg-hardware.html). Furthermore, you can find detailed information about the FPGA hardware [here](https://gt-crnch-rg.readthedocs.io/en/main/reconfig/xilinx/xilinx-getting-started.html).
## Allocate an FPGA Node
Once you've connected to the CRNCH login node, you can use the Slurm scheduler to request an interactive job using `salloc`. This [page](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm.html) explains why we use Slurm to request resources. Documentation for `salloc` can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm-examples.html). And here.
To request 16 cores and 64GB of RAM for 6 hours on flubber9, a fpga dev node:
```bash
salloc -p rg-fpga --nodes=1 --ntasks-per-node=16 --mem=64G --nodelist flubber1 --time=06:00:00
```
Synthesis for Xilinx Boards
----------------------
Once you are logged in, you will need to complete some first time configurations. If you are interested in the Intel (Altera) synthesis steps, scroll down below.
### Source Configuration Scripts
```
# From any directory
$ source /opt/xilinx/xrt/setup.sh
$ source /tools/reconfig/xilinx/Vitis/2023.1/settings64.sh
```
### Check Installed FPGA Platforms
`platforminfo -l` which tells us the correct name of the platform installed on the current fpga node. It should be used for the `PLATFORM` variable below. Otherwise, if there is an error then there was an issue with the previous two commands.
### Install Vortex Toolchain
The Xilinx synthesis process requires verilator to generate the bitstream. Eventually, you will need the whole toolchain to run the bitstream on the FPGA. Therefore, the Vortex toolchain can be installed as follows. If you complete these steps properly, you should only need to complete them once and you can skip to `Activate Vortex Toolchain`
```
# Make a build directory from root and configure scripts for your environment
mkdir build && cd build && ../configure --tooldir=$HOME/tools
# Install the whole prebuilt toolchain
./ci/toolchain_install.sh --all
# Add environment variables to bashrc
echo "source <full-path-to-vortex-root>/vortex/build/ci/toolchain_env.sh" >> ~/.bashrc
```
### Activate Vortex Toolchain
```
# From any directory
source ~/.bashrc
# Check environment setup
verilator --version
```
### Build the FPGA Bitstream
The root directory contains the path `hw/syn/xilinx/xrt` which has the makefile used to generate the Vortex bitstream.
```
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=1 make > build_u250_hw_1c.log 2>&1 &
```
Will run the synthesis under new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
For long-running jobs, invocation of this makefile can be made of the following form:
`[CONFIGS=<vortex macros>] [PREFIX=<prefix directory name>] [NUM_CORES=<#>] TARGET=hw|hw_emu PLATFORM=<platform baseName> nohup make > <log filename> 2>&1 &`
For example:
```bash
CONFIGS="-DL2_ENABLE -DDCACHE_SIZE=8192" PREFIX=build_4c_u280 NUM_CORES=4 TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202310_1 nohup make > build_u250_hw_4c.log 2>&1 &
```
The build is complete when the bitstream file `vortex_afu.xclbin` exists in `<prefix directory name><platform baseName>hw|hw_emu/bin`.
### Running a Program on Xilinx FPGA
The [blackbox.sh](./simulation.md) script within the build directory can be used to run a test with Vortex's xrt driver using the following command:
`FPGA_BIN_DIR=<path to bitstream directory> TARGET=hw|hw_emu PLATFORM=<platform baseName> ./ci/blackbox.sh --driver=xrt --app=<test name>`
For example:
```FPGA_BIN_DIR=<realpath> hw/syn/xilinx/xrt/build_4c_u280_xilinx_u280_gen3x16_xdma_1_202211_1_hw/bin TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202211_1 ./ci/blackbox.sh --driver=xrt --app=demo```
Synthesis for Intel (Altera) Boards
----------------------
### OPAE Environment Setup
$ source /opt/inteldevstack/init_env_user.sh
$ export OPAE_HOME=/opt/opae/1.1.2
$ export PATH=$OPAE_HOME/bin:$PATH
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
### OPAE Build
The FPGA has the following configuration options:
- DEVICE_FAMILY=arria10 | stratix10
- NUM_CORES=#n
Command line:
$ cd hw/syn/altera/opae
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
Setting TARGET=ase will build the project for simulation using Intel ASE.
### OPAE Build Configuration
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per cores
- `NUM_THREADS`: Number of threads per warps
- `PERF_ENABLE`: enable the use of all profile counters
You configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
### OPAE Build Progress
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 <build_dir>/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean
The file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
### Signing the bitstream and Programming the FPGA
$ cd <build_dir>
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
### Sample FPGA Run Test
Ensure you have the correct opae runtime for the FPGA target
```
$ TARGET=FPGA make -C runtime/opae
```
Run the [blackbox.sh](./simulation.md) from your Vortex build directory
```
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
```
### FPGA sample test running OpenCL sgemm kernel
You can use the `blackbox.sh` script to run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
### Testing Vortex using OPAE with Intel ASE Simulation
Building ASE synthesis
```$ TARGET=asesim make -C runtime/opae```
Building ASE runtime
```$ TARGET=asesim make -C runtime/opae```
Running ASE simulation
```$ ASE_LOG=0 ASE_WORKDIR=<build_dir>/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n16"```

View file

@ -2,8 +2,32 @@
## Table of Contents
- [Codebase Layout](codebase.md): Summary of repo file tree
- [Microarchitecture](microarchitecture.md): Vortex Pipeline and cache microarchitectural details and reconfigurability
- [Simulation](simulation.md): Details for building and running each simulation driver
- [Contributing](contributing.md): Process for contributing your own features including repo semantics and testing
- [Debugging](debugging.md): Debugging configurations for each Vortex driver
- [Codebase Layout](codebase.md)
- [Microarchitecture](microarchitecture.md)
- [Cache Subsystem](cache_subsystem.md)
- [Software](software.md)
- [Simulation](simulation.md)
- [Altera FPGA Setup Guide](altera_fpga_guide.md)
- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md)
- [Debugging](debugging.md)
- [Useful Links](references.md)
## Installation
- For the different environments Vortex supports, [read this document](environment_setup.md).
- To install on your own system, [follow this document](install_vortex.md).
## Quick Start Scenarios
Running Vortex simulators with different configurations:
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
- Run demo driver test with opae driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood

View file

@ -77,7 +77,4 @@ Vortex has a 6-stage pipeline:
- Sockets
- Grouping multiple cores sharing L1 cache
- Clusters
- Grouping of sockets sharing L2 cache
### Vortex Cache Subsystem
More details about the cache subsystem are provided [here](./cache_subsystem.md).
- Grouping of sockets sharing L2 cache

View file

@ -6,16 +6,13 @@
### Cycle-Approximate Simulation
SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simx` folder. The [readme](README.md) has the most detailed instructions for building and running SimX.
- To install on your own system, [follow this document](install_vortex.md).
- For the different Georgia Tech environments Vortex supports, [read this document](environment_setup.md).
SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simX` folder.
### FPGA Simulation
The guide to build the fpga with specific configurations is located [here.](fpga_setup.md) You can find instructions for both Xilinx and Altera based FPGAs.
The current target FPGA for simulation is the Arria10 Intel Accelerator Card v1.0. The guide to build the fpga with specific configurations is located [here.](fpga_setup.md)
### How to Test (using `blackbox.sh`)
### How to Test
Running tests under specific drivers (rtlsim, simx, fpga) is done using the script named `blackbox.sh` located in the `ci` folder. Running the command `./ci/blackbox.sh --help` from the Vortex root directory will display the following command line arguments for `blackbox.sh`:
@ -50,20 +47,4 @@ PERF: core1: instrs=90693, cycles=53108, IPC=1.707709
PERF: core2: instrs=90849, cycles=53107, IPC=1.710678
PERF: core3: instrs=90836, cycles=50347, IPC=1.804199
PERF: instrs=363180, cycles=53108, IPC=6.838518
```
## Additional Quick Start Scenarios
Running Vortex simulators with different configurations and drivers is supported. For example:
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
- Run demo driver test with opae driver and Vortex config of 1 cluster, 4 cores, 4 warps, 2 threads
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 clusters, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
```

View file

@ -2,7 +2,7 @@
## Running a Vortex application
The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree. It gets copied into the `build` directory with all the environment variables resolved, so you should run it from the `build` directory as follows:
The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree.
You can query the command-line options of the tool using:
$ ./ci/blackbox.sh --help
@ -49,4 +49,4 @@ Compile your test: `$ make -C tests/regression/<test-name>`
Run your test: `$ ./ci/blackbox.sh --driver=simx --app=<test-name> --debug`
## Adding Your Tests to the CI Pipeline
If you are a contributor, then you will need to add tests that integrate into the continuous integration pipeline. Remember, Pull Requests cannot be merged unless new code has tests and existing tests do not regress. Furthermore, if you are contributing a new feature, it is recommended that you add the ability to enable / disable the new feature that you are adding. See more at [contributing.md](contributing.md) and [continuous_integration.md](continuous_integration.md).
See `continuous_integration.md`

36
docs/xilinx_fpga_guide.md Normal file
View file

@ -0,0 +1,36 @@
# FPGA Startup and Configuration Guide
XRT Environment Setup
----------------------
$ source /opt/xilinx/Vitis/2023.1/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
Check Installed FPGA Platforms
------------------------------
$ platforminfo -l
Build FPGA image
----------------
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make
This runs the synthesis under a new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
Sample FPGA Run Test
--------------------
Ensure you have the correct xrt runtime for the FPGA target
$ make -C runtime/xrt clean
$ TARGET=hw make -C runtime/xrt
Run the following from your Vortex build directory
$ TARGET=hw FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128"

View file

@ -47,6 +47,8 @@ extern "C" {
void dpi_trace(int level, const char* format, ...);
void dpi_trace_start();
void dpi_trace_stop();
uint64_t dpi_uuid_gen(bool reset, int wid);
}
bool sim_trace_enabled();
@ -202,3 +204,17 @@ void dpi_trace_start() {
void dpi_trace_stop() {
sim_trace_enable(false);
}
///////////////////////////////////////////////////////////////////////////////
// Per-warp sequence counters used by dpi_uuid_gen(), keyed by warp id.
std::unordered_map<uint32_t, uint32_t> g_uuid_gens;

// Produces a 64-bit identifier for the given warp id.
//   reset : when true, every per-warp counter is discarded and 0 is returned.
//   wid   : warp id; placed in the upper 32 bits of the result.
// The lower 32 bits hold the warp's current sequence number, which is
// post-incremented on every non-reset call (a warp's first id has sequence 0).
uint64_t dpi_uuid_gen(bool reset, int wid) {
  if (!reset) {
    // operator[] value-initializes the counter to 0 on first use of this warp id
    uint32_t& counter = g_uuid_gens[wid];
    const uint64_t sequence = counter;
    ++counter;
    return (static_cast<uint64_t>(wid) << 32) | sequence;
  }
  g_uuid_gens.clear();
  return 0;
}

View file

@ -30,4 +30,6 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve
import "DPI-C" function void dpi_trace_start();
import "DPI-C" function void dpi_trace_stop();
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid);
`endif

View file

@ -24,14 +24,14 @@ module VX_cluster import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
input sysmem_perf_t sysmem_perf,
VX_mem_perf_if.slave mem_perf_if,
`endif
// DCRs
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master mem_bus_if [`L2_MEM_PORTS],
VX_mem_bus_if.master mem_bus_if,
// Status
output wire busy
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
`endif
`ifdef PERF_ENABLE
cache_perf_t l2_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.l2cache = l2_perf;
end
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`ifdef GBAR_ENABLE
@ -56,21 +56,23 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
VX_gbar_bus_if gbar_bus_if();
`RESET_RELAY (gbar_reset, reset);
VX_gbar_arb #(
.NUM_REQS (`NUM_SOCKETS),
.OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
) gbar_arb (
.clk (clk),
.reset (reset),
.reset (gbar_reset),
.bus_in_if (per_socket_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
VX_gbar_unit #(
.INSTANCE_ID (`SFORMATF(("gbar%0d", CLUSTER_ID)))
.INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID))
) gbar_unit (
.clk (clk),
.reset (reset),
.reset (gbar_reset),
.gbar_bus_if (gbar_bus_if)
);
@ -79,19 +81,18 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) per_socket_mem_bus_if[`NUM_SOCKETS * `L1_MEM_PORTS]();
) per_socket_mem_bus_if[`NUM_SOCKETS]();
`RESET_RELAY (l2_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID (`SFORMATF(("%s-l2cache", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)),
.CACHE_SIZE (`L2_CACHE_SIZE),
.LINE_SIZE (`L2_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS),
.NUM_WAYS (`L2_NUM_WAYS),
.WORD_SIZE (L2_WORD_SIZE),
.NUM_REQS (L2_NUM_REQS),
.MEM_PORTS (`L2_MEM_PORTS),
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
@ -99,19 +100,17 @@ module VX_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_DIRTYBYTES),
.REPL_POLICY (`L2_REPL_POLICY),
.DIRTY_BYTES (`L2_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (3),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.NC_ENABLE (1),
.PASSTHRU (!`L2_ENABLED)
) l2cache (
.clk (clk),
.reset (l2_reset),
`ifdef PERF_ENABLE
.cache_perf (l2_perf),
.cache_perf (mem_perf_tmp_if.l2cache),
`endif
.core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if)
@ -119,20 +118,24 @@ module VX_cluster import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
VX_dcr_bus_if socket_dcr_bus_tmp_if();
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
assign socket_dcr_bus_tmp_if.write_data = dcr_bus_if.write_data;
wire [`NUM_SOCKETS-1:0] per_socket_busy;
VX_dcr_bus_if socket_dcr_bus_if();
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
// Generate all sockets
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : g_sockets
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : sockets
`RESET_RELAY (socket_reset, reset);
VX_dcr_bus_if socket_dcr_bus_if();
wire is_base_dcr_addr = (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, dcr_bus_if, is_base_dcr_addr, (`NUM_SOCKETS > 1))
VX_socket #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
.INSTANCE_ID (`SFORMATF(("%s-socket%0d", INSTANCE_ID, socket_id)))
.INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id))
) socket (
`SCOPE_IO_BIND (scope_socket+socket_id)
@ -140,12 +143,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (socket_reset),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf_tmp),
.mem_perf_if (mem_perf_tmp_if),
`endif
.dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[socket_id * `L1_MEM_PORTS +: `L1_MEM_PORTS]),
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
@ -155,6 +158,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
);
end
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, 1, (`NUM_SOCKETS > 1));
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
endmodule

View file

@ -31,6 +31,7 @@
`endif
///////////////////////////////////////////////////////////////////////////////
`ifndef EXT_M_DISABLE
`define EXT_M_ENABLE
`endif
@ -85,10 +86,6 @@
`endif
`endif
`ifndef VLEN
`define VLEN 256
`endif
`ifndef NUM_CLUSTERS
`define NUM_CLUSTERS 1
`endif
@ -113,24 +110,6 @@
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif
// Size of Tensor Core
`ifndef TC_SIZE
`define TC_SIZE 8
`endif
// Number of TCs per Warp
`ifndef TC_NUM
`define TC_NUM 4
`endif
`ifndef NUM_TCU_LANES
`define NUM_TCU_LANES `TC_NUM
`endif
`ifndef NUM_TCU_BLOCKS
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
`endif
`ifdef L2_ENABLE
`define L2_ENABLED 1
`else
@ -172,28 +151,6 @@
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`endif
// Platform memory parameters
`ifndef PLATFORM_MEMORY_NUM_BANKS
`define PLATFORM_MEMORY_NUM_BANKS 2
`endif
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
`ifdef XLEN_64
`define PLATFORM_MEMORY_ADDR_WIDTH 48
`else
`define PLATFORM_MEMORY_ADDR_WIDTH 32
`endif
`endif
`ifndef PLATFORM_MEMORY_DATA_SIZE
`define PLATFORM_MEMORY_DATA_SIZE 64
`endif
`ifndef PLATFORM_MEMORY_INTERLEAVE
`define PLATFORM_MEMORY_INTERLEAVE 1
`endif
`ifdef XLEN_64
`ifndef STACK_BASE_ADDR
@ -212,14 +169,7 @@
`define IO_BASE_ADDR 64'h000000040
`endif
`ifdef VM_ENABLE
`ifndef PAGE_TABLE_BASE_ADDR
`define PAGE_TABLE_BASE_ADDR 64'h0F0000000
`endif
`endif
`else // XLEN_32
`else
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 32'hFFFF0000
@ -237,13 +187,6 @@
`define IO_BASE_ADDR 32'h00000040
`endif
`ifdef VM_ENABLE
`ifndef PAGE_TABLE_BASE_ADDR
`define PAGE_TABLE_BASE_ADDR 32'hF0000000
`endif
`endif
`endif
`define IO_END_ADDR `USER_BASE_ADDR
@ -271,17 +214,15 @@
`endif
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
`define RESET_DELAY 8
`define RESET_DELAY 8
`ifndef STALL_TIMEOUT
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif
`ifndef SV_DPI
`ifndef DPI_DISABLE
`define DPI_DISABLE
`endif
`endif
`ifndef FPU_FPNEW
`ifndef FPU_DSP
@ -310,59 +251,6 @@
`define DEBUG_LEVEL 3
`endif
`ifndef MEM_PAGE_SIZE
`define MEM_PAGE_SIZE (4096)
`endif
`ifndef MEM_PAGE_LOG2_SIZE
`define MEM_PAGE_LOG2_SIZE (12)
`endif
// Virtual Memory Configuration ///////////////////////////////////////////////////////
`ifdef VM_ENABLE
`ifdef XLEN_32
`ifndef VM_ADDR_MODE
`define VM_ADDR_MODE SV32 //or BARE
`endif
`ifndef PT_LEVEL
`define PT_LEVEL (2)
`endif
`ifndef PTE_SIZE
`define PTE_SIZE (4)
`endif
`ifndef NUM_PTE_ENTRY
`define NUM_PTE_ENTRY (1024)
`endif
`ifndef PT_SIZE_LIMIT
`define PT_SIZE_LIMIT (1<<23)
`endif
`else
`ifndef VM_ADDR_MODE
`define VM_ADDR_MODE SV39 //or BARE
`endif
`ifndef PT_LEVEL
`define PT_LEVEL (3)
`endif
`ifndef PTE_SIZE
`define PTE_SIZE (8)
`endif
`ifndef NUM_PTE_ENTRY
`define NUM_PTE_ENTRY (512)
`endif
`ifndef PT_SIZE_LIMIT
`define PT_SIZE_LIMIT (1<<25)
`endif
`endif
`ifndef PT_SIZE
`define PT_SIZE MEM_PAGE_SIZE
`endif
`ifndef TLB_SIZE
`define TLB_SIZE (32)
`endif
`endif
// Pipeline Configuration /////////////////////////////////////////////////////
// Issue width
@ -590,16 +478,7 @@
// Number of Associative Ways
`ifndef ICACHE_NUM_WAYS
`define ICACHE_NUM_WAYS 4
`endif
// Replacement Policy
`ifndef ICACHE_REPL_POLICY
`define ICACHE_REPL_POLICY 1
`endif
`ifndef ICACHE_MEM_PORTS
`define ICACHE_MEM_PORTS 1
`define ICACHE_NUM_WAYS 1
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
@ -628,7 +507,7 @@
// Number of Banks
`ifndef DCACHE_NUM_BANKS
`define DCACHE_NUM_BANKS `MIN(DCACHE_NUM_REQS, 16)
`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
`endif
// Core Response Queue Size
@ -648,12 +527,12 @@
// Memory Response Queue Size
`ifndef DCACHE_MRSQ_SIZE
`define DCACHE_MRSQ_SIZE 4
`define DCACHE_MRSQ_SIZE 0
`endif
// Number of Associative Ways
`ifndef DCACHE_NUM_WAYS
`define DCACHE_NUM_WAYS 4
`define DCACHE_NUM_WAYS 1
`endif
// Enable Cache Writeback
@ -661,25 +540,6 @@
`define DCACHE_WRITEBACK 0
`endif
// Enable Cache Dirty bytes
`ifndef DCACHE_DIRTYBYTES
`define DCACHE_DIRTYBYTES `DCACHE_WRITEBACK
`endif
// Replacement Policy
`ifndef DCACHE_REPL_POLICY
`define DCACHE_REPL_POLICY 1
`endif
// Number of Memory Ports
`ifndef L1_MEM_PORTS
`ifdef L1_DISABLE
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
@ -702,12 +562,16 @@
// Cache Size
`ifndef L2_CACHE_SIZE
`ifdef ALTERA_S10
`define L2_CACHE_SIZE 2097152
`else
`define L2_CACHE_SIZE 1048576
`endif
`endif
// Number of Banks
`ifndef L2_NUM_BANKS
`define L2_NUM_BANKS `MIN(L2_NUM_REQS, 16)
`define L2_NUM_BANKS `MIN(4, `NUM_SOCKETS)
`endif
// Core Response Queue Size
@ -727,12 +591,12 @@
// Memory Response Queue Size
`ifndef L2_MRSQ_SIZE
`define L2_MRSQ_SIZE 4
`define L2_MRSQ_SIZE 0
`endif
// Number of Associative Ways
`ifndef L2_NUM_WAYS
`define L2_NUM_WAYS 8
`define L2_NUM_WAYS 2
`endif
// Enable Cache Writeback
@ -740,35 +604,20 @@
`define L2_WRITEBACK 0
`endif
// Enable Cache Dirty bytes
`ifndef L2_DIRTYBYTES
`define L2_DIRTYBYTES `L2_WRITEBACK
`endif
// Replacement Policy
`ifndef L2_REPL_POLICY
`define L2_REPL_POLICY 1
`endif
// Number of Memory Ports
`ifndef L2_MEM_PORTS
`ifdef L2_ENABLE
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
`ifndef L3_CACHE_SIZE
`ifdef ALTERA_S10
`define L3_CACHE_SIZE 2097152
`else
`define L3_CACHE_SIZE 1048576
`endif
`endif
// Number of Banks
`ifndef L3_NUM_BANKS
`define L3_NUM_BANKS `MIN(L3_NUM_REQS, 16)
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS)
`endif
// Core Response Queue Size
@ -788,12 +637,12 @@
// Memory Response Queue Size
`ifndef L3_MRSQ_SIZE
`define L3_MRSQ_SIZE 4
`define L3_MRSQ_SIZE 0
`endif
// Number of Associative Ways
`ifndef L3_NUM_WAYS
`define L3_NUM_WAYS 8
`define L3_NUM_WAYS 4
`endif
// Enable Cache Writeback
@ -801,25 +650,6 @@
`define L3_WRITEBACK 0
`endif
// Enable Cache Dirty bytes
`ifndef L3_DIRTYBYTES
`define L3_DIRTYBYTES `L3_WRITEBACK
`endif
// Replacement Policy
`ifndef L3_REPL_POLICY
`define L3_REPL_POLICY 1
`endif
// Number of Memory Ports
`ifndef L3_MEM_PORTS
`ifdef L3_ENABLE
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE
@ -852,12 +682,6 @@
`define EXT_M_ENABLED 0
`endif
`ifdef EXT_V_ENABLE
`define EXT_V_ENABLED 1
`else
`define EXT_V_ENABLED 0
`endif
`ifdef EXT_ZICOND_ENABLE
`define EXT_ZICOND_ENABLED 1
`else
@ -874,7 +698,7 @@
`define ISA_STD_N 13
`define ISA_STD_Q 16
`define ISA_STD_S 18
`define ISA_STD_V 21
`define ISA_STD_U 20
`define ISA_EXT_ICACHE 0
`define ISA_EXT_DCACHE 1
@ -911,7 +735,7 @@
| (0 << 18) /* S - Supervisor mode implemented */ \
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
| (1 << 20) /* U - User mode implemented */ \
| (`EXT_V_ENABLED << 21) /* V - Tentatively reserved for Vector extension */ \
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
| (0 << 22) /* W - Reserved */ \
| (1 << 23) /* X - Non-standard extensions present */ \
| (0 << 24) /* Y - Reserved */ \

View file

@ -50,16 +50,10 @@
`define PERF_CTR_BITS 44
`ifndef NDEBUG
`define UUID_ENABLE
`define UUID_WIDTH 44
`else
`ifdef SCOPE
`define UUID_ENABLE
`define UUID_WIDTH 44
`else
`define UUID_WIDTH 1
`endif
`endif
`define PC_BITS (`XLEN-1)
`define OFFSET_BITS 12
@ -233,19 +227,22 @@
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define INST_FPU_ADD 4'b0000 // SUB=fmt[1]
`define INST_FPU_MUL 4'b0001
`define INST_FPU_MADD 4'b0010 // SUB=fmt[1]
`define INST_FPU_NMADD 4'b0011 // SUB=fmt[1]
`define INST_FPU_DIV 4'b0100
`define INST_FPU_SQRT 4'b0101
`define INST_FPU_F2I 4'b1000 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_F2U 4'b1001 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_I2F 4'b1010 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_U2F 4'b1011 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_CMP 4'b1100 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b1101 // fmt[0]: F32=0, F64=1
`define INST_FPU_MISC 4'b1110 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_BITS 4
`define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3)
`define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4)
@ -270,14 +267,14 @@
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width) \
(uuid_width + `CLOG2(mshr_size) + `CLOG2(`CDIV(num_banks, mem_ports)))
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width) \
(`CLOG2(`CDIV(num_reqs, mem_ports)) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, uuid_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width)) + 1)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
///////////////////////////////////////////////////////////////////////////////
@ -287,14 +284,14 @@
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), num_caches)
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
///////////////////////////////////////////////////////////////////////////////
@ -306,12 +303,11 @@
`define L1_ENABLE
`endif
`define MEM_REQ_FLAG_FLUSH 0
`define MEM_REQ_FLAG_IO 1
`define MEM_REQ_FLAG_LOCAL 2 // shoud be last since optional
`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED)
`define ADDR_TYPE_FLUSH 0
`define ADDR_TYPE_IO 1
`define ADDR_TYPE_LOCAL 2 // shoud be last since optional
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
`define VX_MEM_PORTS `L3_MEM_PORTS
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
@ -324,23 +320,12 @@
///////////////////////////////////////////////////////////////////////////////
// NEG_EDGE(dst, src): instantiates a VX_edge_trigger with POS=0 and INIT=0,
// feeding `src` into data_in and driving `dst` from data_out.
// NOTE(review): POS=0 presumably selects falling-edge detection - confirm against VX_edge_trigger.
// The instance's reset is tied to 1'b0, and a `clk` signal must be visible at the
// point of use. `__LINE__ is appended to the instance name so that several uses
// in one scope get distinct names.
`define NEG_EDGE(dst, src) \
VX_edge_trigger #( \
.POS (0), \
.INIT (0) \
) __neg_edge`__LINE__ ( \
.clk (clk), \
.reset (1'b0), \
.data_in (src), \
.data_out (dst) \
)
`define BUFFER_EX(dst, src, ena, resetw, latency) \
`define BUFFER_EX(dst, src, ena, latency) \
VX_pipe_register #( \
.DATAW ($bits(dst)), \
.RESETW (resetw), \
.RESETW ($bits(dst)), \
.DEPTH (latency) \
) __buffer_ex`__LINE__ ( \
) __``dst``__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -348,13 +333,13 @@
.data_out (dst) \
)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __pop_count_ex`__LINE__ ( \
) __``out``__ ( \
.data_in (in), \
.data_out (out) \
)
@ -374,114 +359,50 @@
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
// ASSIGN_VX_MEM_BUS_RO_IF(dst, src): connects memory bus `src` to `dst` as a
// read-only requester.
// Request path (src -> dst): req_valid, addr, flags and tag are forwarded;
// rw is forced to 0, data to '0 and byteen to '1 regardless of `src`.
// Response path (dst -> src): rsp_valid, rsp_data.data and rsp_data.tag are returned.
// The req_ready / rsp_ready handshake signals are passed straight through.
`define ASSIGN_VX_MEM_BUS_RO_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = 0; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.data = '0; \
assign dst.req_data.byteen = '1; \
assign dst.req_data.flags = src.req_data.flags; \
assign dst.req_data.tag = src.req_data.tag; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_MEM_BUS_IF_EX(dst, src, TD, TS, UUID) \
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.data = src.req_data.data; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.flags = src.req_data.flags; \
/* verilator lint_off GENUNNAMED */ \
if (TD != TS) begin \
if (UUID != 0) begin \
if (TD > TS) begin \
assign dst.req_data.tag = {src.req_data.tag.uuid, {(TD-TS){1'b0}}, src.req_data.tag.value}; \
end else begin \
assign dst.req_data.tag = {src.req_data.tag.uuid, src.req_data.tag.value[TD-UUID-1:0]}; \
end \
end else begin \
if (TD > TS) begin \
assign dst.req_data.tag = {{(TD-TS){1'b0}}, src.req_data.tag}; \
end else begin \
assign dst.req_data.tag = src.req_data.tag[TD-1:0]; \
end \
end \
end else begin \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.atype = src.req_data.atype; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
else \
assign dst.req_data.tag = src.req_data.tag; \
end \
/* verilator lint_on GENUNNAMED */ \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
/* verilator lint_off GENUNNAMED */ \
if (TD != TS) begin \
if (UUID != 0) begin \
if (TD > TS) begin \
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, dst.rsp_data.tag.value[TS-UUID-1:0]}; \
end else begin \
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, {(TS-TD){1'b0}}, dst.rsp_data.tag.value}; \
end \
end else begin \
if (TD > TS) begin \
assign src.rsp_data.tag = dst.rsp_data.tag[TS-1:0]; \
end else begin \
assign src.rsp_data.tag = {{(TS-TD){1'b0}}, dst.rsp_data.tag}; \
end \
end \
end else begin \
assign src.rsp_data.tag = dst.rsp_data.tag; \
end \
/* verilator lint_on GENUNNAMED */ \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready
// INIT_VX_MEM_BUS_IF(itf): ties off the requester side of memory bus `itf`.
// Drives req_valid = 0, req_data = '0 and rsp_ready = 0, and wraps the signals
// coming back (req_ready, rsp_valid, rsp_data) in `UNUSED_VAR.
// Note: unlike the neighbouring ASSIGN_* macros, the expansion already ends with ';'.
`define INIT_VX_MEM_BUS_IF(itf) \
assign itf.req_valid = 0; \
assign itf.req_data = '0; \
`UNUSED_VAR (itf.req_ready) \
`UNUSED_VAR (itf.rsp_valid) \
`UNUSED_VAR (itf.rsp_data) \
assign itf.rsp_ready = 0;
// ASSIGN_VX_LSU_MEM_IF(dst, src): plain pass-through between two interfaces.
// Request valid/data flow from `src` to `dst`, response valid/data flow from
// `dst` back to `src`, and each ready signal travels in the opposite direction
// of its payload. req_data and rsp_data are copied whole, so both sides must
// use compatible payload types.
`define ASSIGN_VX_LSU_MEM_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
// UNUSED_VX_MEM_BUS_IF(itf): ties off the responder side of memory bus `itf`.
// Drives req_ready = 0, rsp_valid = 0 and rsp_data = '0, and wraps the incoming
// request signals (req_valid, req_data) and rsp_ready in `UNUSED_VAR.
`define UNUSED_VX_MEM_BUS_IF(itf) \
`UNUSED_VAR (itf.req_valid) \
`UNUSED_VAR (itf.req_data) \
assign itf.req_ready = 0; \
assign itf.rsp_valid = 0; \
assign itf.rsp_data = '0; \
`UNUSED_VAR (itf.rsp_ready)
`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \
/* verilator lint_off GENUNNAMED */ \
if (latency != 0) begin \
VX_pipe_register #( \
.DATAW (1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH), \
.DEPTH (latency) \
) pipe_reg ( \
.clk (clk), \
.reset (1'b0), \
.enable (1'b1), \
.data_in ({src.write_valid && ena, src.write_addr, src.write_data}), \
.data_out ({dst.write_valid, dst.write_addr, dst.write_data}) \
); \
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
if (enable) begin \
reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \
always @(posedge clk) begin \
__dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \
end else begin \
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid && ena, src.write_addr, src.write_data}; \
end \
/* verilator lint_on GENUNNAMED */
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \
end
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
/* verilator lint_off GENUNNAMED */ \
if (count > 1) begin \
wire [count-1:0][width-1:0] __reduce_add_i_field; \
wire [width-1:0] __reduce_add_o_field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \
@ -500,11 +421,9 @@
end \
end else begin \
assign dst.``field = src[0].``field; \
end \
/* verilator lint_on GENUNNAMED */
end
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
/* verilator lint_off GENUNNAMED */ \
if (block_size != 1) begin \
if (block_size != `NUM_WARPS) begin \
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
@ -513,7 +432,6 @@
end \
end else begin \
assign dst = src; \
end \
/* verilator lint_on GENUNNAMED */
end
`endif // VX_DEFINE_VH

View file

@ -73,17 +73,6 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} cache_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
logic [`PERF_CTR_BITS-1:0] bank_stalls;
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} lmem_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] misses;
} coalescer_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -103,26 +92,6 @@ package VX_gpu_pkg;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
typedef struct packed {
cache_perf_t icache;
cache_perf_t dcache;
cache_perf_t l2cache;
cache_perf_t l3cache;
lmem_perf_t lmem;
coalescer_perf_t coalescer;
mem_perf_t mem;
} sysmem_perf_t;
typedef struct packed {
sched_perf_t sched;
issue_perf_t issue;
logic [`PERF_CTR_BITS-1:0] ifetches;
logic [`PERF_CTR_BITS-1:0] loads;
logic [`PERF_CTR_BITS-1:0] stores;
logic [`PERF_CTR_BITS-1:0] ifetch_latency;
logic [`PERF_CTR_BITS-1:0] load_latency;
} pipeline_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
@ -176,7 +145,6 @@ package VX_gpu_pkg;
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
////////////////////////// Icache Parameters //////////////////////////////
@ -198,9 +166,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, 1, `NUM_ICACHES, `UUID_WIDTH);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, 1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
@ -212,7 +180,7 @@ package VX_gpu_pkg;
// Block size in bytes
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size (using coalesced memory blocks)
// Input request size
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
@ -229,27 +197,26 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
// arbitrate between icache and dcache
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
// Word size in bytes
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
// Input request size
localparam L2_NUM_REQS = `NUM_SOCKETS * `L1_MEM_PORTS;
localparam L2_NUM_REQS = `NUM_SOCKETS;
// Core request tag bits
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
@ -259,9 +226,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
/////////////////////////////// L3 Parameters /////////////////////////////
@ -270,7 +237,7 @@ package VX_gpu_pkg;
localparam L3_WORD_SIZE = `L2_LINE_SIZE;
// Input request size
localparam L3_NUM_REQS = `NUM_CLUSTERS * `L2_MEM_PORTS;
localparam L3_NUM_REQS = `NUM_CLUSTERS;
// Core request tag bits
localparam L3_TAG_WIDTH = L2_MEM_TAG_WIDTH;
@ -280,9 +247,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/////////////////////////////// Issue parameters //////////////////////////
@ -341,430 +308,6 @@ package VX_gpu_pkg;
`IGNORE_UNUSED_END
////////////////////////////////// Tracing ////////////////////////////////////
`ifdef SIMULATION
`ifdef SV_DPI
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
`endif
// Emits the short name of an execution-unit type through `TRACE.
//   level   - trace verbosity level handed to `TRACE
//   ex_type - execution-unit selector (`EX_ALU, `EX_LSU, `EX_SFU and, when
//             EXT_F_ENABLE is defined, `EX_FPU); any other value prints "?"
task trace_ex_type(
    input int level,
    input [`EX_BITS-1:0] ex_type
);
    case (ex_type)
        `EX_ALU: begin
            `TRACE(level, ("ALU"))
        end
        `EX_LSU: begin
            `TRACE(level, ("LSU"))
        end
        `EX_SFU: begin
            `TRACE(level, ("SFU"))
        end
    `ifdef EXT_F_ENABLE
        `EX_FPU: begin
            `TRACE(level, ("FPU"))
        end
    `endif
        default: begin
            `TRACE(level, ("?"))
        end
    endcase
endtask
// Emits the assembly mnemonic of a decoded operation through `TRACE.
//   level   - trace verbosity level handed to `TRACE
//   ex_type - execution-unit selector (`EX_ALU / `EX_LSU / `EX_SFU / `EX_FPU)
//   op_type - unit-specific opcode; it is cast to the width of the selected
//             unit's opcode field before being matched
//   op_args - decoded operation arguments; only the member matching ex_type
//             (alu / lsu / wctl / csr / fpu) is read
// Every combination that is not recognised prints "?".
task trace_ex_op(input int level,
                 input [`EX_BITS-1:0] ex_type,
                 input [`INST_OP_BITS-1:0] op_type,
                 input VX_gpu_pkg::op_args_t op_args
);
    case (ex_type)
    `EX_ALU: begin
        case (op_args.alu.xtype)
        `ALU_TYPE_ARITH: begin
            // is_w selects the "W" mnemonics, use_imm the immediate forms
            if (op_args.alu.is_w) begin
                if (op_args.alu.use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADDIW"))
                    `INST_ALU_SLL: `TRACE(level, ("SLLIW"))
                    `INST_ALU_SRL: `TRACE(level, ("SRLIW"))
                    `INST_ALU_SRA: `TRACE(level, ("SRAIW"))
                    default: `TRACE(level, ("?"))
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADDW"))
                    `INST_ALU_SUB: `TRACE(level, ("SUBW"))
                    `INST_ALU_SLL: `TRACE(level, ("SLLW"))
                    `INST_ALU_SRL: `TRACE(level, ("SRLW"))
                    `INST_ALU_SRA: `TRACE(level, ("SRAW"))
                    default: `TRACE(level, ("?"))
                    endcase
                end
            end else begin
                if (op_args.alu.use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADDI"))
                    `INST_ALU_SLL: `TRACE(level, ("SLLI"))
                    `INST_ALU_SRL: `TRACE(level, ("SRLI"))
                    `INST_ALU_SRA: `TRACE(level, ("SRAI"))
                    `INST_ALU_SLT: `TRACE(level, ("SLTI"))
                    `INST_ALU_SLTU: `TRACE(level, ("SLTIU"))
                    `INST_ALU_XOR: `TRACE(level, ("XORI"))
                    `INST_ALU_OR: `TRACE(level, ("ORI"))
                    `INST_ALU_AND: `TRACE(level, ("ANDI"))
                    `INST_ALU_LUI: `TRACE(level, ("LUI"))
                    `INST_ALU_AUIPC: `TRACE(level, ("AUIPC"))
                    default: `TRACE(level, ("?"))
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADD"))
                    `INST_ALU_SUB: `TRACE(level, ("SUB"))
                    `INST_ALU_SLL: `TRACE(level, ("SLL"))
                    `INST_ALU_SRL: `TRACE(level, ("SRL"))
                    `INST_ALU_SRA: `TRACE(level, ("SRA"))
                    `INST_ALU_SLT: `TRACE(level, ("SLT"))
                    `INST_ALU_SLTU: `TRACE(level, ("SLTU"))
                    `INST_ALU_XOR: `TRACE(level, ("XOR"))
                    `INST_ALU_OR: `TRACE(level, ("OR"))
                    `INST_ALU_AND: `TRACE(level, ("AND"))
                    `INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"))
                    `INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"))
                    default: `TRACE(level, ("?"))
                    endcase
                end
            end
        end
        `ALU_TYPE_BRANCH: begin
            case (`INST_BR_BITS'(op_type))
            `INST_BR_EQ: `TRACE(level, ("BEQ"))
            `INST_BR_NE: `TRACE(level, ("BNE"))
            `INST_BR_LT: `TRACE(level, ("BLT"))
            `INST_BR_GE: `TRACE(level, ("BGE"))
            `INST_BR_LTU: `TRACE(level, ("BLTU"))
            `INST_BR_GEU: `TRACE(level, ("BGEU"))
            `INST_BR_JAL: `TRACE(level, ("JAL"))
            `INST_BR_JALR: `TRACE(level, ("JALR"))
            `INST_BR_ECALL: `TRACE(level, ("ECALL"))
            `INST_BR_EBREAK:`TRACE(level, ("EBREAK"))
            `INST_BR_URET: `TRACE(level, ("URET"))
            `INST_BR_SRET: `TRACE(level, ("SRET"))
            `INST_BR_MRET: `TRACE(level, ("MRET"))
            default: `TRACE(level, ("?"))
            endcase
        end
        `ALU_TYPE_MULDIV: begin
            if (op_args.alu.is_w) begin
                case (`INST_M_BITS'(op_type))
                `INST_M_MUL: `TRACE(level, ("MULW"))
                `INST_M_DIV: `TRACE(level, ("DIVW"))
                `INST_M_DIVU: `TRACE(level, ("DIVUW"))
                `INST_M_REM: `TRACE(level, ("REMW"))
                `INST_M_REMU: `TRACE(level, ("REMUW"))
                default: `TRACE(level, ("?"))
                endcase
            end else begin
                case (`INST_M_BITS'(op_type))
                `INST_M_MUL: `TRACE(level, ("MUL"))
                `INST_M_MULH: `TRACE(level, ("MULH"))
                `INST_M_MULHSU:`TRACE(level, ("MULHSU"))
                `INST_M_MULHU: `TRACE(level, ("MULHU"))
                `INST_M_DIV: `TRACE(level, ("DIV"))
                `INST_M_DIVU: `TRACE(level, ("DIVU"))
                `INST_M_REM: `TRACE(level, ("REM"))
                `INST_M_REMU: `TRACE(level, ("REMU"))
                default: `TRACE(level, ("?"))
                endcase
            end
        end
        default: `TRACE(level, ("?"))
        endcase
    end
    `EX_LSU: begin
        // is_float selects the floating-point load/store mnemonics
        if (op_args.lsu.is_float) begin
            case (`INST_LSU_BITS'(op_type))
            `INST_LSU_LW: `TRACE(level, ("FLW"))
            `INST_LSU_LD: `TRACE(level, ("FLD"))
            `INST_LSU_SW: `TRACE(level, ("FSW"))
            `INST_LSU_SD: `TRACE(level, ("FSD"))
            default: `TRACE(level, ("?"))
            endcase
        end else begin
            case (`INST_LSU_BITS'(op_type))
            `INST_LSU_LB: `TRACE(level, ("LB"))
            `INST_LSU_LH: `TRACE(level, ("LH"))
            `INST_LSU_LW: `TRACE(level, ("LW"))
            `INST_LSU_LD: `TRACE(level, ("LD"))
            `INST_LSU_LBU:`TRACE(level, ("LBU"))
            `INST_LSU_LHU:`TRACE(level, ("LHU"))
            `INST_LSU_LWU:`TRACE(level, ("LWU"))
            `INST_LSU_SB: `TRACE(level, ("SB"))
            `INST_LSU_SH: `TRACE(level, ("SH"))
            `INST_LSU_SW: `TRACE(level, ("SW"))
            `INST_LSU_SD: `TRACE(level, ("SD"))
            `INST_LSU_FENCE:`TRACE(level,("FENCE"))
            default: `TRACE(level, ("?"))
            endcase
        end
    end
    `EX_SFU: begin
        case (`INST_SFU_BITS'(op_type))
        `INST_SFU_TMC: `TRACE(level, ("TMC"))
        `INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"))
        `INST_SFU_SPLIT: begin
            if (op_args.wctl.is_neg) begin
                `TRACE(level, ("SPLIT.N"))
            end else begin
                `TRACE(level, ("SPLIT"))
            end
        end
        `INST_SFU_JOIN: `TRACE(level, ("JOIN"))
        `INST_SFU_BAR: `TRACE(level, ("BAR"))
        `INST_SFU_PRED: begin
            if (op_args.wctl.is_neg) begin
                `TRACE(level, ("PRED.N"))
            end else begin
                `TRACE(level, ("PRED"))
            end
        end
        `INST_SFU_CSRRW: begin
            if (op_args.csr.use_imm) begin
                `TRACE(level, ("CSRRWI"))
            end else begin
                `TRACE(level, ("CSRRW"))
            end
        end
        `INST_SFU_CSRRS: begin
            if (op_args.csr.use_imm) begin
                `TRACE(level, ("CSRRSI"))
            end else begin
                `TRACE(level, ("CSRRS"))
            end
        end
        `INST_SFU_CSRRC: begin
            if (op_args.csr.use_imm) begin
                `TRACE(level, ("CSRRCI"))
            end else begin
                `TRACE(level, ("CSRRC"))
            end
        end
        default: `TRACE(level, ("?"))
        endcase
    end
`ifdef EXT_F_ENABLE
    `EX_FPU: begin
        case (`INST_FPU_BITS'(op_type))
        // ADD / MADD / NMADD: fmt[1] selects the subtracting mnemonic,
        // fmt[0] selects the ".D" suffix over ".S"
        `INST_FPU_ADD: begin
            if (op_args.fpu.fmt[1]) begin
                if (op_args.fpu.fmt[0]) begin
                    `TRACE(level, ("FSUB.D"))
                end else begin
                    `TRACE(level, ("FSUB.S"))
                end
            end else begin
                if (op_args.fpu.fmt[0]) begin
                    `TRACE(level, ("FADD.D"))
                end else begin
                    `TRACE(level, ("FADD.S"))
                end
            end
        end
        `INST_FPU_MADD: begin
            if (op_args.fpu.fmt[1]) begin
                if (op_args.fpu.fmt[0]) begin
                    `TRACE(level, ("FMSUB.D"))
                end else begin
                    `TRACE(level, ("FMSUB.S"))
                end
            end else begin
                if (op_args.fpu.fmt[0]) begin
                    `TRACE(level, ("FMADD.D"))
                end else begin
                    `TRACE(level, ("FMADD.S"))
                end
            end
        end
        `INST_FPU_NMADD: begin
            if (op_args.fpu.fmt[1]) begin
                if (op_args.fpu.fmt[0]) begin
                    `TRACE(level, ("FNMSUB.D"))
                end else begin
                    `TRACE(level, ("FNMSUB.S"))
                end
            end else begin
                if (op_args.fpu.fmt[0]) begin
                    `TRACE(level, ("FNMADD.D"))
                end else begin
                    `TRACE(level, ("FNMADD.S"))
                end
            end
        end
        `INST_FPU_MUL: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FMUL.D"))
            end else begin
                `TRACE(level, ("FMUL.S"))
            end
        end
        `INST_FPU_DIV: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FDIV.D"))
            end else begin
                `TRACE(level, ("FDIV.S"))
            end
        end
        `INST_FPU_SQRT: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FSQRT.D"))
            end else begin
                `TRACE(level, ("FSQRT.S"))
            end
        end
        // CMP: the low two bits of frm pick the comparison
        `INST_FPU_CMP: begin
            if (op_args.fpu.fmt[0]) begin
                case (op_args.fpu.frm[1:0])
                0: `TRACE(level, ("FLE.D"))
                1: `TRACE(level, ("FLT.D"))
                2: `TRACE(level, ("FEQ.D"))
                default: `TRACE(level, ("?"))
                endcase
            end else begin
                case (op_args.fpu.frm[1:0])
                0: `TRACE(level, ("FLE.S"))
                1: `TRACE(level, ("FLT.S"))
                2: `TRACE(level, ("FEQ.S"))
                default: `TRACE(level, ("?"))
                endcase
            end
        end
        `INST_FPU_F2F: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FCVT.D.S"))
            end else begin
                `TRACE(level, ("FCVT.S.D"))
            end
        end
        // Conversions: fmt[0] selects the ".D"/".S" side, fmt[1] selects the
        // "L"/"LU" integer mnemonics over "W"/"WU"
        `INST_FPU_F2I: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.L.D"))
                end else begin
                    `TRACE(level, ("FCVT.W.D"))
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.L.S"))
                end else begin
                    `TRACE(level, ("FCVT.W.S"))
                end
            end
        end
        `INST_FPU_F2U: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.LU.D"))
                end else begin
                    `TRACE(level, ("FCVT.WU.D"))
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.LU.S"))
                end else begin
                    `TRACE(level, ("FCVT.WU.S"))
                end
            end
        end
        `INST_FPU_I2F: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.D.L"))
                end else begin
                    `TRACE(level, ("FCVT.D.W"))
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.S.L"))
                end else begin
                    `TRACE(level, ("FCVT.S.W"))
                end
            end
        end
        `INST_FPU_U2F: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.D.LU"))
                end else begin
                    `TRACE(level, ("FCVT.D.WU"))
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.S.LU"))
                end else begin
                    `TRACE(level, ("FCVT.S.WU"))
                end
            end
        end
        // MISC: frm selects the operation. The default arm keeps these two
        // case statements consistent with every other one in this task, so a
        // value that matches no item (e.g. X/Z bits in simulation) still
        // prints "?" instead of silently printing nothing.
        `INST_FPU_MISC: begin
            if (op_args.fpu.fmt[0]) begin
                case (op_args.fpu.frm)
                0: `TRACE(level, ("FSGNJ.D"))
                1: `TRACE(level, ("FSGNJN.D"))
                2: `TRACE(level, ("FSGNJX.D"))
                3: `TRACE(level, ("FCLASS.D"))
                4: `TRACE(level, ("FMV.X.D"))
                5: `TRACE(level, ("FMV.D.X"))
                6: `TRACE(level, ("FMIN.D"))
                7: `TRACE(level, ("FMAX.D"))
                default: `TRACE(level, ("?"))
                endcase
            end else begin
                case (op_args.fpu.frm)
                0: `TRACE(level, ("FSGNJ.S"))
                1: `TRACE(level, ("FSGNJN.S"))
                2: `TRACE(level, ("FSGNJX.S"))
                3: `TRACE(level, ("FCLASS.S"))
                4: `TRACE(level, ("FMV.X.S"))
                5: `TRACE(level, ("FMV.S.X"))
                6: `TRACE(level, ("FMIN.S"))
                7: `TRACE(level, ("FMAX.S"))
                default: `TRACE(level, ("?"))
                endcase
            end
        end
        default: `TRACE(level, ("?"))
        endcase
    end
`endif
    default: `TRACE(level, ("?"))
    endcase
endtask
// Emits the argument fields that accompany a decoded operation through
// `TRACE, formatted as a ", name=value" list.
//   level   - trace verbosity level handed to `TRACE
//   ex_type - execution-unit selector; decides which op_args member is read
//   op_type - unit-specific opcode; only consulted for `EX_SFU, where the
//             arguments are printed when `INST_SFU_IS_CSR(op_type) holds
//   op_args - decoded operation arguments
// Nothing is printed for unit types that are not listed.
task trace_op_args(
    input int level,
    input [`EX_BITS-1:0] ex_type,
    input [`INST_OP_BITS-1:0] op_type,
    input VX_gpu_pkg::op_args_t op_args
);
    case (ex_type)
        `EX_ALU:
            `TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm))
        `EX_LSU:
            `TRACE(level, (", offset=0x%0h", op_args.lsu.offset))
        `EX_SFU: begin
            if (`INST_SFU_IS_CSR(op_type)) begin
                `TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm))
            end
        end
    `ifdef EXT_F_ENABLE
        `EX_FPU:
            `TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm))
    `endif
        default: begin
            // no arguments to report
        end
    endcase
endtask
// Emits the symbolic name of a base DCR address through `TRACE.
//   level - trace verbosity level handed to `TRACE
//   addr  - DCR address to name; addresses that match none of the
//           `VX_DCR_BASE_* items print "?"
task trace_base_dcr(
    input int level,
    input [`VX_DCR_ADDR_WIDTH-1:0] addr
);
    case (addr)
        `VX_DCR_BASE_STARTUP_ADDR0: begin
            `TRACE(level, ("STARTUP_ADDR0"))
        end
        `VX_DCR_BASE_STARTUP_ADDR1: begin
            `TRACE(level, ("STARTUP_ADDR1"))
        end
        `VX_DCR_BASE_STARTUP_ARG0: begin
            `TRACE(level, ("STARTUP_ARG0"))
        end
        `VX_DCR_BASE_STARTUP_ARG1: begin
            `TRACE(level, ("STARTUP_ARG1"))
        end
        `VX_DCR_BASE_MPM_CLASS: begin
            `TRACE(level, ("MPM_CLASS"))
        end
        default: begin
            `TRACE(level, ("?"))
        end
    endcase
endtask
`endif
endpackage
`endif // VX_GPU_PKG_VH

View file

@ -22,34 +22,36 @@
///////////////////////////////////////////////////////////////////////////////
`ifdef SIMULATION
`define STATIC_ASSERT(cond, msg) \
/* verilator lint_off GENUNNAMED */ \
if (!(cond)) $error msg; \
/* verilator lint_on GENUNNAMED */ \
`define ERROR(msg) \
$error msg
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
if (!reset) begin \
`ASSERT(cond, msg); \
end \
end
`ifndef TRACING_ALL
`define TRACING_ON /* verilator tracing_on */
`define TRACING_OFF /* verilator tracing_off */
`ifdef VIVADO
`define STRING
`else
`define TRACING_ON
`define TRACING_OFF
`define STRING string
`endif
`ifdef SYNTHESIS
`define TRACING_ON
`define TRACING_OFF
`ifndef NDEBUG
`define DEBUG_BLOCK(x) x
`else
`define DEBUG_BLOCK(x)
`endif
`define IGNORE_UNOPTFLAT_BEGIN
`define IGNORE_UNOPTFLAT_END
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_SPARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`else
`ifdef VERILATOR
`define TRACING_ON /* verilator tracing_on */
`define TRACING_OFF /* verilator tracing_off */
`ifndef NDEBUG
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
x \
@ -98,99 +100,74 @@
localparam `STRING __``x = x; \
/* verilator lint_on UNUSED */
`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \
if (1) begin \
`define UNUSED_VAR(x) if (1) begin \
/* verilator lint_off UNUSED */ \
wire [$bits(x)-1:0] __unused = x; \
wire [$bits(x)-1:0] __x = x; \
/* verilator lint_on UNUSED */ \
end \
/* verilator lint_on GENUNNAMED */
end
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
. x () \
/* verilator lint_on PINCONNECTEMPTY */
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
x \
/* verilator lint_on UNUSED */
`ifdef SV_DPI
`define TRACE(level, args) dpi_trace(level, $sformatf args);
`else
`define TRACE(level, args) \
if (level <= `DEBUG_LEVEL) begin \
$write args; \
end
`endif
`define SFORMATF(x) $sformatf x
`ifdef SV_DPI
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`else
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`endif
`else // SYNTHESIS
`endif
`define STATIC_ASSERT(cond, msg)
`define ERROR(msg) //
`define ASSERT(cond, msg) //
`define RUNTIME_ASSERT(cond, msg)
`ifdef SIMULATION
`define STATIC_ASSERT(cond, msg) \
generate \
if (!(cond)) $error msg; \
endgenerate
`define DEBUG_BLOCK(x)
`define TRACE(level, args)
`define SFORMATF(x) ""
`define ERROR(msg) \
$error msg
`define TRACING_ON
`define TRACING_OFF
`define IGNORE_UNOPTFLAT_BEGIN
`define IGNORE_UNOPTFLAT_END
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_SPARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
assert(cond) else $error msg; \
end
`else
`define STATIC_ASSERT(cond, msg)
`define ERROR(msg) //
`define ASSERT(cond, msg) //
`define RUNTIME_ASSERT(cond, msg)
`endif
///////////////////////////////////////////////////////////////////////////////
`ifdef QUARTUS
`define MAX_FANOUT 8
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
`define USE_BLOCK_BRAM (* ramstyle = "block" *)
`define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
`define RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams on" *)
`define DISABLE_BRAM (* ramstyle = "logic" *)
`define PRESERVE_NET (* preserve *)
`define BLACKBOX_CELL (* black_box *)
`define STRING string
`elsif VIVADO
`define MAX_FANOUT 8
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
`define USE_BLOCK_BRAM (* ram_style = "block" *)
`define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ram_style = "distributed" *)
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
`define RW_RAM_CHECK (* rw_addr_collision = "yes" *)
`define DISABLE_BRAM (* ram_style = "registers" *)
`define PRESERVE_NET (* keep = "true" *)
`define BLACKBOX_CELL (* black_box *)
`define STRING
`ifndef SIMULATION
`define ASYNC_BRAM_PATCH
`endif
`else
`define MAX_FANOUT 8
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
`define USE_BLOCK_BRAM
`define IF_DATA_SIZE(x) x.DATA_WIDTH
`define USE_FAST_BRAM
`define NO_RW_RAM_CHECK
`define RW_RAM_CHECK
`define DISABLE_BRAM
`define PRESERVE_NET
`define BLACKBOX_CELL
`define STRING string
`endif
///////////////////////////////////////////////////////////////////////////////
@ -215,7 +192,7 @@
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
`define UP(x) (((x) > 0) ? (x) : 1)
`define UP(x) (((x) != 0) ? (x) : 1)
// Ceiling division: smallest integer >= n/d for non-negative n and positive d.
// Both arguments are parenthesized on every use so that argument expressions
// containing low-precedence operators (?:, |, &, shifts, ...) are evaluated
// as a unit before the addition and division.
`define CDIV(n,d) (((n) + (d) - 1) / (d))
@ -227,23 +204,23 @@
`define SEXT(len, x) {{(len-$bits(x)+1){x[$bits(x)-1]}}, x[$bits(x)-2:0]}
`define TRACE_ARRAY1D(lvl, fmt, arr, n) \
`TRACE(lvl, ("{")) \
`TRACE(lvl, ("{")); \
for (integer __i = (n-1); __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")) \
`TRACE(lvl, (fmt, arr[__i])) \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, (fmt, arr[__i])); \
end \
`TRACE(lvl, ("}"))
`TRACE(lvl, ("}"));
`define TRACE_ARRAY2D(lvl, fmt, arr, m, n) \
`TRACE(lvl, ("{")) \
`TRACE(lvl, ("{")); \
for (integer __i = n-1; __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")) \
`TRACE(lvl, ("{")) \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("{")); \
for (integer __j = (m-1); __j >= 0; --__j) begin \
if (__j != (m-1)) `TRACE(lvl, (", "))\
`TRACE(lvl, (fmt, arr[__i][__j])) \
if (__j != (m-1)) `TRACE(lvl, (", "));\
`TRACE(lvl, (fmt, arr[__i][__j])); \
end \
`TRACE(lvl, ("}")) \
`TRACE(lvl, ("}")); \
end \
`TRACE(lvl, ("}"))
@ -262,13 +239,10 @@
`RESET_RELAY_EX (dst, src, 1, 0)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 2, 5 -> 2
`define TO_OUT_BUF_SIZE(s) `MIN(s & 7, 2)
`define TO_OUT_BUF_SIZE(s) `MIN(s, 2)
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 -> 3
`define TO_OUT_BUF_REG(s) (((s & 7) < 2) ? (s & 7) : ((s & 7) - 2))
// lut(x): (x & 8) != 0
`define TO_OUT_BUF_LUTRAM(s) ((s & 8) != 0)
`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2))
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,66 +21,47 @@
input wire scope_bus_in, \
output wire scope_bus_out,
`define SCOPE_IO_SWITCH(__count) \
wire scope_bus_in_w [__count]; \
wire scope_bus_out_w [__count]; \
`RESET_RELAY_EX(scope_reset_w, scope_reset, __count, 4); \
VX_scope_switch #( \
.N (__count) \
) scope_switch ( \
.clk (clk), \
.reset (scope_reset), \
.req_in (scope_bus_in), \
.rsp_out (scope_bus_out), \
.req_out (scope_bus_in_w), \
.rsp_in (scope_bus_out_w) \
);
`define SCOPE_IO_BIND(__i) \
.scope_reset (scope_reset_w[__i]), \
.scope_bus_in (scope_bus_in_w[__i]), \
.scope_bus_out (scope_bus_out_w[__i]),
`define SCOPE_IO_UNUSED(__i) \
`define SCOPE_IO_UNUSED() \
`UNUSED_VAR (scope_reset); \
`UNUSED_VAR (scope_bus_in); \
assign scope_bus_out = 0;
`define SCOPE_IO_UNUSED_W(__i) \
`UNUSED_VAR (scope_reset_w[__i]); \
`UNUSED_VAR (scope_bus_in_w[__i]); \
assign scope_bus_out_w[__i] = 0;
`define SCOPE_IO_SWITCH(__count) \
wire [__count-1:0] scope_bus_in_w; \
wire [__count-1:0] scope_bus_out_w; \
wire [__count-1:0] scope_reset_w = {__count{scope_reset}}; \
VX_scope_switch #( \
.N (__count) \
) scope_switch ( \
.clk (clk), \
.reset (scope_reset), \
.req_in (scope_bus_in), \
.rsp_out (scope_bus_out), \
.req_out (scope_bus_in_w), \
.rsp_in (scope_bus_out_w) \
)
// Instantiates one VX_scope_tap and wires it to slot __idx of the
// scope_reset_w / scope_bus_in_w / scope_bus_out_w signals.
//   __idx         - tap slot; also pasted into the instance name, so it must
//                   be a single token (a literal or an identifier)
//   __id          - value for the SCOPE_ID parameter
//   __xtriggers_w - value for XTRIGGERW
//   __htriggers_w - value for HTRIGGERW
//   __probes_w    - value for PROBEW
//   __xtriggers, __htriggers, __probes - signals bound to the matching ports
//   __start, __stop - signals bound to the start / stop ports
//   __depth       - value for the DEPTH parameter
// The instance name is scope_tap_<__idx>. The formal's full name (__idx) has
// to follow the `` paste operator; pasting plain "idx" is not substituted and
// would name every tap "scope_tap_idx", which collides as soon as two taps
// share a scope.
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __htriggers_w, __probes_w, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
    VX_scope_tap #( \
        .SCOPE_ID (__id), \
        .XTRIGGERW(__xtriggers_w), \
        .HTRIGGERW(__htriggers_w), \
        .PROBEW (__probes_w), \
        .DEPTH (__depth) \
    ) scope_tap_``__idx ( \
        .clk (clk), \
        .reset (scope_reset_w[__idx]), \
        .start (__start), \
        .stop (__stop), \
        .xtriggers(__xtriggers), \
        .htriggers(__htriggers), \
        .probes (__probes), \
        .bus_in (scope_bus_in_w[__idx]), \
        .bus_out(scope_bus_out_w[__idx]) \
    )
// Convenience form of SCOPE_TAP_EX: the three width arguments are derived
// with $bits() from the __xtriggers, __htriggers and __probes signals, and
// all other arguments are forwarded unchanged.
`define SCOPE_TAP(__idx, __id, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
`SCOPE_TAP_EX(__idx, __id, $bits(__xtriggers), $bits(__htriggers), $bits(__probes), __xtriggers, __htriggers, __probes, __start, __stop, __depth)
`else
`define SCOPE_IO_DECL
`define SCOPE_IO_BIND(__i)
`define SCOPE_IO_UNUSED(__i)
`define SCOPE_IO_SWITCH(__count)
`define SCOPE_TAP(__idx, __id, __xtriggers, __probes, __depth)
`define SCOPE_IO_BIND(__i)
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __probes_w, __xtriggers, __probes, __depth)
`define SCOPE_IO_UNUSED_W(__i)
`define SCOPE_IO_UNUSED(__i)
`endif

View file

@ -24,14 +24,14 @@ module VX_socket import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
input sysmem_perf_t sysmem_perf,
VX_mem_perf_if.slave mem_perf_if,
`endif
// DCRs
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master mem_bus_if [`L1_MEM_PORTS],
VX_mem_bus_if.master mem_bus_if,
`ifdef GBAR_ENABLE
// Barrier
@ -49,12 +49,14 @@ module VX_socket import VX_gpu_pkg::*; #(
`ifdef GBAR_ENABLE
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
`RESET_RELAY (gbar_arb_reset, reset);
VX_gbar_arb #(
.NUM_REQS (`SOCKET_SIZE),
.OUT_BUF ((`SOCKET_SIZE > 1) ? 2 : 0)
) gbar_arb (
.clk (clk),
.reset (reset),
.reset (gbar_arb_reset),
.bus_in_if (per_core_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
@ -63,13 +65,11 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
cache_perf_t icache_perf, dcache_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.icache = icache_perf;
sysmem_perf_tmp.dcache = dcache_perf;
end
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
///////////////////////////////////////////////////////////////////////////
@ -82,12 +82,12 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if[1]();
) icache_mem_bus_if();
`RESET_RELAY (icache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID (`SFORMATF(("%s-icache", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)),
.NUM_UNITS (`NUM_ICACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -97,22 +97,19 @@ module VX_socket import VX_gpu_pkg::*; #(
.NUM_WAYS (`ICACHE_NUM_WAYS),
.WORD_SIZE (ICACHE_WORD_SIZE),
.NUM_REQS (1),
.MEM_PORTS (1),
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
.TAG_WIDTH (ICACHE_TAG_WIDTH),
.FLAGS_WIDTH (0),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (0),
.REPL_POLICY (`ICACHE_REPL_POLICY),
.NC_ENABLE (0),
.CORE_OUT_BUF (3),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)
) icache (
`ifdef PERF_ENABLE
.cache_perf (icache_perf),
.cache_perf (mem_perf_tmp_if.icache),
`endif
.clk (clk),
.reset (icache_reset),
@ -130,12 +127,12 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if[`L1_MEM_PORTS]();
) dcache_mem_bus_if();
`RESET_RELAY (dcache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID (`SFORMATF(("%s-dcache", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)),
.NUM_UNITS (`NUM_DCACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -145,24 +142,21 @@ module VX_socket import VX_gpu_pkg::*; #(
.NUM_WAYS (`DCACHE_NUM_WAYS),
.WORD_SIZE (DCACHE_WORD_SIZE),
.NUM_REQS (DCACHE_NUM_REQS),
.MEM_PORTS (`L1_MEM_PORTS),
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_DIRTYBYTES),
.REPL_POLICY (`DCACHE_REPL_POLICY),
.DIRTY_BYTES (`DCACHE_WRITEBACK),
.NC_ENABLE (1),
.CORE_OUT_BUF (3),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)
) dcache (
`ifdef PERF_ENABLE
.cache_perf (dcache_perf),
.cache_perf (mem_perf_tmp_if.dcache),
`endif
.clk (clk),
.reset (dcache_reset),
@ -172,64 +166,51 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
for (genvar i = 0; i < `L1_MEM_PORTS; ++i) begin : g_mem_bus_if
if (i == 0) begin : g_i0
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
VX_mem_arb #(
.NUM_INPUTS (2),
.NUM_OUTPUTS(1),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("P"), // prioritize the icache
.REQ_OUT_BUF(3),
.RSP_OUT_BUF(3)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("R"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[0], l1_mem_arb_bus_if[0]);
end else begin : g_i
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if();
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], l1_mem_arb_bus_if);
end
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_busy;
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
// Generate all cores
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : g_cores
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : cores
`RESET_RELAY (core_reset, reset);
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, 1'b1, (`SOCKET_SIZE > 1))
VX_core #(
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id),
.INSTANCE_ID (`SFORMATF(("%s-core%0d", INSTANCE_ID, core_id)))
.INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id))
) core (
`SCOPE_IO_BIND (scope_core + core_id)
@ -237,7 +218,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.reset (core_reset),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf_tmp),
.mem_perf_if (mem_perf_tmp_if),
`endif
.dcr_bus_if (core_dcr_bus_if),
@ -254,6 +235,6 @@ module VX_socket import VX_gpu_pkg::*; #(
);
end
`BUFFER_EX(busy, (| per_core_busy), 1'b1, 1, (`SOCKET_SIZE > 1));
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
endmodule

View file

@ -166,8 +166,6 @@
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
`define VX_CSR_MPM_MEM_BANK_ST 12'hB1E // bank conflicts
`define VX_CSR_MPM_MEM_BANK_ST_H 12'hB9E
// PERF: lmem
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B
@ -175,9 +173,6 @@
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
// PERF: coalescer
`define VX_CSR_MPM_COALESCER_MISS 12'hB1F // coalescer misses
`define VX_CSR_MPM_COALESCER_MISS_H 12'hB9F
// Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F>
@ -189,19 +184,6 @@
`define VX_CSR_MIMPID 12'hF13
`define VX_CSR_MHARTID 12'hF14
// Vector CSRs
`define VX_CSR_VSTART 12'h008
`define VX_CSR_VXSAT 12'h009
`define VX_CSR_VXRM 12'h00A
`define VX_CSR_VCSR 12'h00F
`define VX_CSR_VL 12'hC20
`define VX_CSR_VTYPE 12'hC21
`define VX_CSR_VLENB 12'hC22
`define VX_CSR_VCYCLE 12'hC00
`define VX_CSR_VTIME 12'hC01
`define VX_CSR_VINSTRET 12'hC02
// GPGPU CSRs
`define VX_CSR_THREAD_ID 12'hCC0
@ -215,10 +197,4 @@
`define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size
`define VX_TC_NUM 12'hFC5
`define VX_TC_SIZE 12'hFC6
`endif // VX_TYPES_VH

View file

@ -21,19 +21,19 @@ module Vortex import VX_gpu_pkg::*; (
input wire reset,
// Memory request
output wire mem_req_valid [`VX_MEM_PORTS],
output wire mem_req_rw [`VX_MEM_PORTS],
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS],
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS],
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS],
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS],
input wire mem_req_ready [`VX_MEM_PORTS],
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid [`VX_MEM_PORTS],
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS],
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS],
output wire mem_rsp_ready [`VX_MEM_PORTS],
input wire mem_rsp_valid,
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// DCR write request
input wire dcr_wr_valid,
@ -50,25 +50,22 @@ module Vortex import VX_gpu_pkg::*; (
`endif
`ifdef PERF_ENABLE
cache_perf_t l3_perf;
mem_perf_t mem_perf;
sysmem_perf_t sysmem_perf;
always @(*) begin
sysmem_perf = '0;
sysmem_perf.l3cache = l3_perf;
sysmem_perf.mem = mem_perf;
end
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.lmem = 'x;
`endif
VX_mem_bus_if #(
.DATA_SIZE (`L2_LINE_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
) per_cluster_mem_bus_if[`NUM_CLUSTERS * `L2_MEM_PORTS]();
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
VX_mem_bus_if #(
.DATA_SIZE (`L3_LINE_SIZE),
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
) mem_bus_if[`L3_MEM_PORTS]();
) mem_bus_if();
`RESET_RELAY (l3_reset, reset);
@ -80,7 +77,6 @@ module Vortex import VX_gpu_pkg::*; (
.NUM_WAYS (`L3_NUM_WAYS),
.WORD_SIZE (L3_WORD_SIZE),
.NUM_REQS (L3_NUM_REQS),
.MEM_PORTS (`L3_MEM_PORTS),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
@ -88,12 +84,10 @@ module Vortex import VX_gpu_pkg::*; (
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_DIRTYBYTES),
.REPL_POLICY (`L3_REPL_POLICY),
.DIRTY_BYTES (`L3_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (3),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.NC_ENABLE (1),
.PASSTHRU (!`L3_ENABLED)
) l3cache (
@ -101,28 +95,31 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset),
`ifdef PERF_ENABLE
.cache_perf (l3_perf),
.cache_perf (mem_perf_if.l3cache),
`endif
.core_bus_if (per_cluster_mem_bus_if),
.mem_bus_if (mem_bus_if)
);
for (genvar i = 0; i < `L3_MEM_PORTS; ++i) begin : g_mem_bus_if
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen;
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
`UNUSED_VAR (mem_bus_if[i].req_data.flags)
assign mem_bus_if[i].req_ready = mem_req_ready[i];
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen= mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
end
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_wr_valid;
@ -132,16 +129,16 @@ module Vortex import VX_gpu_pkg::*; (
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
// Generate all clusters
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : g_clusters
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : clusters
`RESET_RELAY (cluster_reset, reset);
VX_dcr_bus_if cluster_dcr_bus_if();
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, 1'b1, (`NUM_CLUSTERS > 1))
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
VX_cluster #(
.CLUSTER_ID (cluster_id),
.INSTANCE_ID (`SFORMATF(("cluster%0d", cluster_id)))
.INSTANCE_ID ($sformatf("cluster%0d", cluster_id))
) cluster (
`SCOPE_IO_BIND (scope_cluster + cluster_id)
@ -149,83 +146,59 @@ module Vortex import VX_gpu_pkg::*; (
.reset (cluster_reset),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf),
.mem_perf_if (mem_perf_if),
`endif
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id * `L2_MEM_PORTS +: `L2_MEM_PORTS]),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
.busy (per_cluster_busy[cluster_id])
);
end
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, 1, (`NUM_CLUSTERS > 1));
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
`ifdef PERF_ENABLE
localparam MEM_PORTS_CTR_W = `CLOG2(`VX_MEM_PORTS+1);
wire [`VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire;
wire [`VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire;
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_perf_ctrs
assign mem_req_fire[i] = mem_req_valid[i] & mem_req_ready[i];
assign mem_rsp_fire[i] = mem_rsp_valid[i] & mem_rsp_ready[i];
assign mem_rd_req_fire[i] = mem_req_fire[i] & ~mem_req_rw[i];
assign mem_wr_req_fire[i] = mem_req_fire[i] & mem_req_rw[i];
end
wire [MEM_PORTS_CTR_W-1:0] perf_mem_reads_per_cycle;
wire [MEM_PORTS_CTR_W-1:0] perf_mem_writes_per_cycle;
wire [MEM_PORTS_CTR_W-1:0] perf_mem_rsps_per_cycle;
`POP_COUNT(perf_mem_reads_per_cycle, mem_rd_req_fire);
`POP_COUNT(perf_mem_writes_per_cycle, mem_wr_req_fire);
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin
if (reset) begin
perf_mem_pending_reads <= '0;
end else begin
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
`PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle)));
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
end
end
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin
if (reset) begin
mem_perf <= '0;
end else begin
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(perf_mem_reads_per_cycle);
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(perf_mem_writes_per_cycle);
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end
end
assign mem_perf_if.mem = mem_perf;
`endif
// dump device configuration
initial begin
`TRACE(0, ("CONFIGS: num_threads=%0d, num_warps=%0d, num_cores=%0d, num_clusters=%0d, socket_size=%0d, local_mem_base=0x%0h, num_barriers=%0d\n",
`NUM_THREADS, `NUM_WARPS, `NUM_CORES, `NUM_CLUSTERS, `SOCKET_SIZE, `LMEM_BASE_ADDR, `NUM_BARRIERS))
end
`ifdef DBG_TRACE_MEM
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_trace
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: MEM Wr Req[%0d]: addr=0x%0h, byteen=0x%h data=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: MEM Rd Req[%0d]: addr=0x%0h, byteen=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: MEM Rd Rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", $time, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_req_rw)
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data));
end
end
`endif

View file

@ -82,26 +82,112 @@ module Vortex_axi import VX_gpu_pkg::*; #(
// Status
output wire busy
);
localparam DST_LDATAW = `CLOG2(AXI_DATA_WIDTH);
localparam SRC_LDATAW = `CLOG2(`VX_MEM_DATA_WIDTH);
localparam SUB_LDATAW = DST_LDATAW - SRC_LDATAW;
localparam VX_MEM_TAG_A_WIDTH = `VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0);
localparam VX_MEM_ADDR_A_WIDTH = `VX_MEM_ADDR_WIDTH - SUB_LDATAW;
// Parameter sanity checks for the AXI wrapper.
// The AXI data bus width must equal the Vortex memory data width.
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
// The AXI address bus must be at least `MEM_ADDR_WIDTH bits wide.
// The message reports the same macro that the condition checks, so the
// "expected" value printed on failure matches the actual requirement.
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `MEM_ADDR_WIDTH), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `MEM_ADDR_WIDTH))
// Tag-width check is intentionally left disabled in the original source.
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
wire mem_req_valid [`VX_MEM_PORTS];
wire mem_req_rw [`VX_MEM_PORTS];
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS];
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS];
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS];
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS];
wire mem_req_ready [`VX_MEM_PORTS];
wire mem_req_valid;
wire mem_req_rw;
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen;
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid [`VX_MEM_PORTS];
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS];
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS];
wire mem_rsp_ready [`VX_MEM_PORTS];
wire mem_rsp_valid;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready;
`SCOPE_IO_SWITCH (1);
wire [`MEM_ADDR_WIDTH-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
assign m_axi_awaddr[i] = `MEM_ADDR_WIDTH'(m_axi_awaddr_unqual[i]);
assign m_axi_araddr[i] = `MEM_ADDR_WIDTH'(m_axi_araddr_unqual[i]);
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
assign m_axi_rid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_rid[i]);
assign m_axi_bid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_bid[i]);
end
VX_axi_adapter #(
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.ADDR_WIDTH (`MEM_ADDR_WIDTH),
.TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.NUM_BANKS (AXI_NUM_BANKS),
.RSP_OUT_BUF((AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr_unqual),
.m_axi_awid (m_axi_awid_unqual),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid_unqual),
.m_axi_bresp (m_axi_bresp),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr_unqual),
.m_axi_arid (m_axi_arid_unqual),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arregion (m_axi_arregion),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast) ,
.m_axi_rid (m_axi_rid_unqual),
.m_axi_rresp (m_axi_rresp)
);
`SCOPE_IO_SWITCH (1)
Vortex vortex (
`SCOPE_IO_BIND (0)
@ -129,133 +215,4 @@ module Vortex_axi import VX_gpu_pkg::*; #(
.busy (busy)
);
wire mem_req_valid_a [`VX_MEM_PORTS];
wire mem_req_rw_a [`VX_MEM_PORTS];
wire [(AXI_DATA_WIDTH/8)-1:0] mem_req_byteen_a [`VX_MEM_PORTS];
wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a [`VX_MEM_PORTS];
wire [AXI_DATA_WIDTH-1:0] mem_req_data_a [`VX_MEM_PORTS];
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a [`VX_MEM_PORTS];
wire mem_req_ready_a [`VX_MEM_PORTS];
wire mem_rsp_valid_a [`VX_MEM_PORTS];
wire [AXI_DATA_WIDTH-1:0] mem_rsp_data_a [`VX_MEM_PORTS];
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a [`VX_MEM_PORTS];
wire mem_rsp_ready_a [`VX_MEM_PORTS];
// Adjust memory data width to match AXI interface
for (genvar i = 0; i < `VX_MEM_PORTS; i++) begin : g_mem_adapter
VX_mem_data_adapter #(
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.DST_DATA_WIDTH (AXI_DATA_WIDTH),
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.DST_ADDR_WIDTH (VX_MEM_ADDR_A_WIDTH),
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.DST_TAG_WIDTH (VX_MEM_TAG_A_WIDTH),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) mem_data_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid_in (mem_req_valid[i]),
.mem_req_addr_in (mem_req_addr[i]),
.mem_req_rw_in (mem_req_rw[i]),
.mem_req_byteen_in (mem_req_byteen[i]),
.mem_req_data_in (mem_req_data[i]),
.mem_req_tag_in (mem_req_tag[i]),
.mem_req_ready_in (mem_req_ready[i]),
.mem_rsp_valid_in (mem_rsp_valid[i]),
.mem_rsp_data_in (mem_rsp_data[i]),
.mem_rsp_tag_in (mem_rsp_tag[i]),
.mem_rsp_ready_in (mem_rsp_ready[i]),
.mem_req_valid_out (mem_req_valid_a[i]),
.mem_req_addr_out (mem_req_addr_a[i]),
.mem_req_rw_out (mem_req_rw_a[i]),
.mem_req_byteen_out (mem_req_byteen_a[i]),
.mem_req_data_out (mem_req_data_a[i]),
.mem_req_tag_out (mem_req_tag_a[i]),
.mem_req_ready_out (mem_req_ready_a[i]),
.mem_rsp_valid_out (mem_rsp_valid_a[i]),
.mem_rsp_data_out (mem_rsp_data_a[i]),
.mem_rsp_tag_out (mem_rsp_tag_a[i]),
.mem_rsp_ready_out (mem_rsp_ready_a[i])
);
end
VX_axi_adapter #(
.DATA_WIDTH (AXI_DATA_WIDTH),
.ADDR_WIDTH_IN (VX_MEM_ADDR_A_WIDTH),
.ADDR_WIDTH_OUT (AXI_ADDR_WIDTH),
.TAG_WIDTH_IN (VX_MEM_TAG_A_WIDTH),
.TAG_WIDTH_OUT (AXI_TID_WIDTH),
.NUM_PORTS_IN (`VX_MEM_PORTS),
.NUM_BANKS_OUT (AXI_NUM_BANKS),
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid_a),
.mem_req_rw (mem_req_rw_a),
.mem_req_byteen (mem_req_byteen_a),
.mem_req_addr (mem_req_addr_a),
.mem_req_data (mem_req_data_a),
.mem_req_tag (mem_req_tag_a),
.mem_req_ready (mem_req_ready_a),
.mem_rsp_valid (mem_rsp_valid_a),
.mem_rsp_data (mem_rsp_data_a),
.mem_rsp_tag (mem_rsp_tag_a),
.mem_rsp_ready (mem_rsp_ready_a),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr),
.m_axi_awid (m_axi_awid),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid),
.m_axi_bresp (m_axi_bresp),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr),
.m_axi_arid (m_axi_arid),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arregion (m_axi_arregion),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast),
.m_axi_rid (m_axi_rid),
.m_axi_rresp (m_axi_rresp)
);
endmodule

View file

@ -28,19 +28,9 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
`include "VX_define.vh"
//`include "platform_afu_top_config.vh"
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH ((`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS)) - $clog2(`PLATFORM_MEMORY_DATA_SIZE))
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH (`PLATFORM_MEMORY_DATA_SIZE * 8)
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
`endif
`ifdef PLATFORM_PROVIDES_LOCAL_MEMORY
package local_mem_cfg_pkg;
@ -67,3 +57,5 @@ package local_mem_cfg_pkg;
typedef logic [LOCAL_MEM_DATA_N_BYTES-1:0] t_local_mem_byte_mask;
endpackage // local_mem_cfg_pkg
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,9 +17,9 @@
`define AFU_ACCEL_NAME "vortex_afu"
`define AFU_ACCEL_UUID 128'h35F9452B_25C2_434C_93D5_6F8C60DB361C
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_WRITE 2
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_CMD_DCR_WRITE 4
`define AFU_IMAGE_CMD_MAX_VALUE 4

View file

@ -14,20 +14,22 @@
`include "vortex_afu.vh"
module VX_afu_ctrl #(
parameter S_AXI_ADDR_WIDTH = 8,
parameter S_AXI_DATA_WIDTH = 32
parameter AXI_ADDR_WIDTH = 8,
parameter AXI_DATA_WIDTH = 32,
parameter AXI_NUM_BANKS = 1
) (
// axi4 lite slave signals
input wire clk,
input wire reset,
input wire clk_en,
input wire s_axi_awvalid,
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
output wire s_axi_awready,
input wire s_axi_wvalid,
input wire [S_AXI_DATA_WIDTH-1:0] s_axi_wdata,
input wire [S_AXI_DATA_WIDTH/8-1:0]s_axi_wstrb,
input wire [AXI_DATA_WIDTH-1:0] s_axi_wdata,
input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
output wire s_axi_wready,
output wire s_axi_bvalid,
@ -35,11 +37,11 @@ module VX_afu_ctrl #(
input wire s_axi_bready,
input wire s_axi_arvalid,
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_araddr,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
output wire s_axi_arready,
output wire s_axi_rvalid,
output wire [S_AXI_DATA_WIDTH-1:0] s_axi_rdata,
output wire [AXI_DATA_WIDTH-1:0] s_axi_rdata,
output wire [1:0] s_axi_rresp,
input wire s_axi_rready,
@ -50,13 +52,13 @@ module VX_afu_ctrl #(
input wire ap_idle,
output wire interrupt,
output wire ap_ctrl_read,
`ifdef SCOPE
input wire scope_bus_in,
output wire scope_bus_out,
`endif
output wire [63:0] mem_base [AXI_NUM_BANKS],
output wire dcr_wr_valid,
output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
output wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
@ -108,38 +110,39 @@ module VX_afu_ctrl #(
ADDR_DEV_0 = 8'h10,
ADDR_DEV_1 = 8'h14,
//ADDR_DEV_CTRL = 8'h18,
ADDR_ISA_0 = 8'h18,
ADDR_ISA_1 = 8'h1C,
ADDR_ISA_0 = 8'h1C,
ADDR_ISA_1 = 8'h20,
//ADDR_ISA_CTRL = 8'h24,
ADDR_DCR_0 = 8'h20,
ADDR_DCR_1 = 8'h24,
ADDR_DCR_0 = 8'h28,
ADDR_DCR_1 = 8'h2C,
//ADDR_DCR_CTRL = 8'h30,
`ifdef SCOPE
ADDR_SCP_0 = 8'h28,
ADDR_SCP_1 = 8'h2C,
ADDR_SCP_0 = 8'h34,
ADDR_SCP_1 = 8'h38,
//ADDR_SCP_CTRL = 8'h3C,
`endif
ADDR_MEM_0 = 8'h40,
ADDR_MEM_1 = 8'h44,
//ADDR_MEM_CTRL = 8'h48,
ADDR_BITS = 8;
localparam
WSTATE_ADDR = 2'd0,
WSTATE_IDLE = 2'd0,
WSTATE_DATA = 2'd1,
WSTATE_RESP = 2'd2,
WSTATE_WIDTH = 2;
WSTATE_RESP = 2'd2;
localparam
RSTATE_ADDR = 2'd0,
RSTATE_DATA = 2'd1,
RSTATE_RESP = 2'd2,
RSTATE_WIDTH = 2;
localparam MEMORY_BANK_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - `CLOG2(`PLATFORM_MEMORY_NUM_BANKS);
RSTATE_IDLE = 2'd0,
RSTATE_DATA = 2'd1;
// device caps
wire [63:0] dev_caps = {8'b0,
5'(MEMORY_BANK_ADDR_WIDTH-20),
3'(`CLOG2(`PLATFORM_MEMORY_NUM_BANKS)),
wire [63:0] dev_caps = {16'b0,
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
@ -150,18 +153,16 @@ module VX_afu_ctrl #(
2'(`CLOG2(`XLEN)-4),
30'(`MISA_STD)};
reg [WSTATE_WIDTH-1:0] wstate;
reg [1:0] wstate;
reg [ADDR_BITS-1:0] waddr;
wire [31:0] wmask;
wire s_axi_aw_fire;
wire s_axi_w_fire;
wire s_axi_b_fire;
logic [RSTATE_WIDTH-1:0] rstate;
reg [1:0] rstate;
reg [31:0] rdata;
reg [ADDR_BITS-1:0] raddr;
wire [ADDR_BITS-1:0] raddr;
wire s_axi_ar_fire;
wire s_axi_r_fire;
reg ap_reset_r;
reg ap_start_r;
@ -169,23 +170,20 @@ module VX_afu_ctrl #(
reg gie_r;
reg [1:0] ier_r;
reg [1:0] isr_r;
reg [63:0] mem_r [AXI_NUM_BANKS];
reg [31:0] dcra_r;
reg [31:0] dcrv_r;
reg dcr_wr_valid_r;
logic wready_stall;
logic rvalid_stall;
`ifdef SCOPE
reg [63:0] scope_bus_wdata, scope_bus_rdata;
reg [63:0] scope_bus_wdata;
reg [63:0] scope_bus_rdata;
reg [5:0] scope_bus_ctr;
reg cmd_scope_writing, cmd_scope_reading;
reg cmd_scope_reading;
reg cmd_scope_writing;
reg scope_bus_out_r;
reg scope_rdata_valid;
reg is_scope_waddr, is_scope_raddr;
always @(posedge clk) begin
if (reset) begin
@ -193,33 +191,18 @@ module VX_afu_ctrl #(
cmd_scope_writing <= 0;
scope_bus_ctr <= '0;
scope_bus_out_r <= 0;
is_scope_waddr <= 0;
is_scope_raddr <= 0;
scope_bus_rdata <= '0;
scope_rdata_valid <= 0;
end else begin
scope_bus_out_r <= 0;
if (s_axi_aw_fire) begin
is_scope_waddr <= (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|| (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_1);
end
if (s_axi_ar_fire) begin
is_scope_raddr <= (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|| (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_1);
end
end else if (clk_en) begin
if (s_axi_w_fire && waddr == ADDR_SCP_0) begin
scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask);
end
if (s_axi_w_fire && waddr == ADDR_SCP_1) begin
scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
cmd_scope_writing <= 1;
scope_rdata_valid <= 0;
scope_bus_out_r <= 1;
scope_bus_ctr <= 63;
end
if (scope_bus_in) begin
cmd_scope_reading <= 1;
scope_bus_rdata <= '0;
scope_bus_ctr <= 63;
end
if (cmd_scope_reading) begin
@ -227,16 +210,13 @@ module VX_afu_ctrl #(
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_reading <= 0;
scope_rdata_valid <= 1;
scope_bus_ctr <= 0;
end
end
if (cmd_scope_writing) begin
scope_bus_out_r <= scope_bus_wdata[scope_bus_ctr];
scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr);
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_writing <= 0;
scope_bus_ctr <= 0;
end
end
end
@ -244,50 +224,41 @@ module VX_afu_ctrl #(
assign scope_bus_out = scope_bus_out_r;
assign wready_stall = is_scope_waddr && cmd_scope_writing;
assign rvalid_stall = is_scope_raddr && ~scope_rdata_valid;
`else
assign wready_stall = 0;
assign rvalid_stall = 0;
`endif
// AXI Write Request
assign s_axi_awready = (wstate == WSTATE_ADDR);
assign s_axi_wready = (wstate == WSTATE_DATA) && ~wready_stall;
// AXI Write
// AXI Write Response
assign s_axi_awready = (wstate == WSTATE_IDLE);
assign s_axi_wready = (wstate == WSTATE_DATA);
assign s_axi_bvalid = (wstate == WSTATE_RESP);
assign s_axi_bresp = 2'b00; // OKAY
for (genvar i = 0; i < 4; ++i) begin : g_wmask
assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
end
assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
assign s_axi_w_fire = s_axi_wvalid && s_axi_wready;
assign s_axi_b_fire = s_axi_bvalid && s_axi_bready;
for (genvar i = 0; i < 4; ++i) begin
assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
end
// wstate
always @(posedge clk) begin
if (reset) begin
wstate <= WSTATE_ADDR;
end else begin
wstate <= WSTATE_IDLE;
end else if (clk_en) begin
case (wstate)
WSTATE_ADDR: wstate <= s_axi_aw_fire ? WSTATE_DATA : WSTATE_ADDR;
WSTATE_DATA: wstate <= s_axi_w_fire ? WSTATE_RESP : WSTATE_DATA;
WSTATE_RESP: wstate <= s_axi_b_fire ? WSTATE_ADDR : WSTATE_RESP;
default: wstate <= WSTATE_ADDR;
WSTATE_IDLE: wstate <= s_axi_awvalid ? WSTATE_DATA : WSTATE_IDLE;
WSTATE_DATA: wstate <= s_axi_wvalid ? WSTATE_RESP : WSTATE_DATA;
WSTATE_RESP: wstate <= s_axi_bready ? WSTATE_IDLE : WSTATE_RESP;
default: wstate <= WSTATE_IDLE;
endcase
end
end
// waddr
always @(posedge clk) begin
if (s_axi_aw_fire) begin
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
if (clk_en) begin
if (s_axi_aw_fire)
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
end
end
@ -305,13 +276,16 @@ module VX_afu_ctrl #(
dcra_r <= '0;
dcrv_r <= '0;
dcr_wr_valid_r <= 0;
end else begin
dcr_wr_valid_r <= 0;
ap_reset_r <= 0;
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
mem_r[i] <= '0;
end
end else if (clk_en) begin
if (ap_ready)
ap_start_r <= auto_restart_r;
dcr_wr_valid_r <= 0;
if (s_axi_w_fire) begin
case (waddr)
ADDR_AP_CTRL: begin
@ -343,7 +317,16 @@ module VX_afu_ctrl #(
dcrv_r <= (s_axi_wdata & wmask) | (dcrv_r & ~wmask);
dcr_wr_valid_r <= 1;
end
default:;
default: begin
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
if (waddr == (ADDR_MEM_0 + 8'(i) * 8'd12)) begin
mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
end
if (waddr == (ADDR_MEM_1 + 8'(i) * 8'd12)) begin
mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
end
end
end
endcase
if (ier_r[0] & ap_done)
@ -354,87 +337,82 @@ module VX_afu_ctrl #(
end
end
// AXI Read Request
assign s_axi_arready = (rstate == RSTATE_ADDR);
// AXI Read
// AXI Read Response
assign s_axi_rvalid = (rstate == RSTATE_RESP);
assign s_axi_arready = (rstate == RSTATE_IDLE);
assign s_axi_rvalid = (rstate == RSTATE_DATA);
assign s_axi_rdata = rdata;
assign s_axi_rresp = 2'b00; // OKAY
assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready;
assign s_axi_r_fire = s_axi_rvalid && s_axi_rready;
assign raddr = s_axi_araddr[ADDR_BITS-1:0];
// rstate
always @(posedge clk) begin
if (reset) begin
rstate <= RSTATE_ADDR;
end else begin
rstate <= RSTATE_IDLE;
end else if (clk_en) begin
case (rstate)
RSTATE_ADDR: rstate <= s_axi_ar_fire ? RSTATE_DATA : RSTATE_ADDR;
RSTATE_DATA: rstate <= rvalid_stall ? RSTATE_DATA : RSTATE_RESP;
RSTATE_RESP: rstate <= s_axi_r_fire ? RSTATE_ADDR : RSTATE_RESP;
default: rstate <= RSTATE_ADDR;
RSTATE_IDLE: rstate <= s_axi_arvalid ? RSTATE_DATA : RSTATE_IDLE;
RSTATE_DATA: rstate <= (s_axi_rready & s_axi_rvalid) ? RSTATE_IDLE : RSTATE_DATA;
default: rstate <= RSTATE_IDLE;
endcase
end
end
// raddr
always @(posedge clk) begin
if (s_axi_ar_fire) begin
raddr <= s_axi_araddr[ADDR_BITS-1:0];
end
end
// rdata
always @(posedge clk) begin
rdata <= '0;
case (raddr)
ADDR_AP_CTRL: begin
rdata[0] <= ap_start_r;
rdata[1] <= ap_done;
rdata[2] <= ap_idle;
rdata[3] <= ap_ready;
rdata[7] <= auto_restart_r;
if (clk_en) begin
if (s_axi_ar_fire) begin
rdata <= '0;
case (raddr)
ADDR_AP_CTRL: begin
rdata[0] <= ap_start_r;
rdata[1] <= ap_done;
rdata[2] <= ap_idle;
rdata[3] <= ap_ready;
rdata[7] <= auto_restart_r;
end
ADDR_GIE: begin
rdata <= 32'(gie_r);
end
ADDR_IER: begin
rdata <= 32'(ier_r);
end
ADDR_ISR: begin
rdata <= 32'(isr_r);
end
ADDR_DEV_0: begin
rdata <= dev_caps[31:0];
end
ADDR_DEV_1: begin
rdata <= dev_caps[63:32];
end
ADDR_ISA_0: begin
rdata <= isa_caps[31:0];
end
ADDR_ISA_1: begin
rdata <= isa_caps[63:32];
end
`ifdef SCOPE
ADDR_SCP_0: begin
rdata <= scope_bus_rdata[31:0];
end
ADDR_SCP_1: begin
rdata <= scope_bus_rdata[63:32];
end
`endif
default:;
endcase
end
ADDR_GIE: begin
rdata <= 32'(gie_r);
end
ADDR_IER: begin
rdata <= 32'(ier_r);
end
ADDR_ISR: begin
rdata <= 32'(isr_r);
end
ADDR_DEV_0: begin
rdata <= dev_caps[31:0];
end
ADDR_DEV_1: begin
rdata <= dev_caps[63:32];
end
ADDR_ISA_0: begin
rdata <= isa_caps[31:0];
end
ADDR_ISA_1: begin
rdata <= isa_caps[63:32];
end
`ifdef SCOPE
ADDR_SCP_0: begin
rdata <= scope_bus_rdata[31:0];
end
ADDR_SCP_1: begin
rdata <= scope_bus_rdata[63:32];
end
`endif
default:;
endcase
end
end
assign ap_reset = ap_reset_r;
assign ap_start = ap_start_r;
assign interrupt = gie_r & (| isr_r);
assign ap_ctrl_read = s_axi_r_fire && (raddr == ADDR_AP_CTRL);
assign mem_base = mem_r;
assign dcr_wr_valid = dcr_wr_valid_r;
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);

View file

@ -10,93 +10,68 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Reference: https://www.xilinx.com/developer/articles/porting-rtl-designs-to-vitis-rtl-kernels.html
`include "vortex_afu.vh"
module VX_afu_wrap #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_SIZE * 8,
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
parameter C_M_AXI_MEM_NUM_BANKS = 1
`else
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
`endif
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
) (
// System signals
input wire clk,
input wire reset,
input wire ap_clk,
input wire ap_rst_n,
// AXI4 master interface
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
input wire s_axi_ctrl_wvalid,
output wire s_axi_ctrl_wready,
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
input wire s_axi_ctrl_arvalid,
output wire s_axi_ctrl_arready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
output wire s_axi_ctrl_rvalid,
input wire s_axi_ctrl_rready,
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
output wire [1:0] s_axi_ctrl_rresp,
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
output wire interrupt
);
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS;
typedef enum logic [1:0] {
STATE_IDLE = 0,
STATE_INIT = 1,
STATE_RUN = 2,
STATE_DONE = 3
} state_e;
localparam PENDING_WR_SIZEW = 12; // max outstanding requests size
localparam NUM_MEM_BANKS_SIZEW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
localparam STATE_IDLE = 0;
localparam STATE_RUN = 1;
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS];
@ -105,31 +80,30 @@ module VX_afu_wrap #(
wire [1:0] m_axi_mem_rresp_a [C_M_AXI_MEM_NUM_BANKS];
// convert memory interface to array
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`endif
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
wire reset = ~ap_rst_n;
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [PENDING_WR_SIZEW-1:0] vx_pending_writes;
reg vx_reset = 1; // asserted at initialization
reg [15:0] vx_pending_writes;
reg vx_busy_wait;
reg vx_running;
wire vx_busy;
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
wire dcr_wr_valid;
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
state_e state;
reg state;
wire ap_reset;
wire ap_start;
wire ap_ctrl_read;
wire ap_idle = (state == STATE_IDLE);
wire ap_done = (state == STATE_DONE) && (vx_pending_writes == '0);
wire ap_ready = ap_done;
wire ap_done_ack = ap_done && ap_ctrl_read;
wire ap_idle = ~vx_running;
wire ap_done = ~(state == STATE_RUN || vx_pending_writes != 0);
wire ap_ready = 1'b1;
`ifdef SCOPE
wire scope_bus_in;
@ -137,129 +111,108 @@ module VX_afu_wrap #(
wire scope_reset = reset;
`endif
always @(posedge clk) begin
always @(posedge ap_clk) begin
if (reset || ap_reset) begin
state <= STATE_IDLE;
vx_reset <= 1;
state <= STATE_IDLE;
vx_busy_wait <= 0;
vx_running <= 0;
end else begin
case (state)
STATE_IDLE: begin
if (ap_start) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Begin initialization\n", $time))
`TRACE(2, ("%d: STATE RUN\n", $time));
`endif
state <= STATE_INIT;
vx_reset_ctr <= (`RESET_DELAY-1);
vx_reset <= 1;
end
end
STATE_INIT: begin
if (vx_reset) begin
// wait for reset to complete
if (vx_reset_ctr == 0) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Initialization completed\n", $time))
`endif
vx_reset <= 0;
end
end else begin
// wait until processor goes busy
if (vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
`endif
state <= STATE_RUN;
end
state <= STATE_RUN;
vx_running <= 0;
end
end
STATE_RUN: begin
// wait until the processor is not busy
if (~vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Execution completed\n", $time))
`endif
state <= STATE_DONE;
end
end
STATE_DONE: begin
// wait for host's done acknowledgement
if (ap_done_ack) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Processor idle\n", $time))
`endif
state <= STATE_IDLE;
if (vx_running) begin
if (vx_busy_wait) begin
// wait until processor goes busy
if (vx_busy) begin
vx_busy_wait <= 0;
end
end else begin
// wait until the processor is not busy
if (~vx_busy) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: End execution\n", $time));
`TRACE(2, ("%d: STATE IDLE\n", $time));
`endif
end
end
end else begin
// wait until the reset sequence is complete
if (vx_reset_ctr == (`RESET_DELAY-1)) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
`endif
vx_running <= 1;
vx_busy_wait <= 1;
end
end
end
endcase
// ensure reset network initialization
if (vx_reset_ctr != '0) begin
vx_reset_ctr <= vx_reset_ctr - 1;
end
end
end
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
wire [NUM_MEM_BANKS_SIZEW-1:0] cur_wr_reqs, cur_wr_rsps;
reg m_axi_mem_wfire;
reg m_axi_mem_bfire;
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
VX_axi_write_ack axi_write_ack (
.clk (clk),
.reset (reset),
.awvalid(m_axi_mem_awvalid_a[i]),
.awready(m_axi_mem_awready_a[i]),
.wvalid (m_axi_mem_wvalid_a[i]),
.wready (m_axi_mem_wready_a[i]),
.tx_ack (m_axi_wr_req_fire[i]),
`UNUSED_PIN (aw_ack),
`UNUSED_PIN (w_ack),
`UNUSED_PIN (tx_rdy)
);
// Combinational reduction over all memory banks: m_axi_mem_wfire goes high
// when any bank completes a write-data handshake (wvalid && wready) in the
// current cycle, and m_axi_mem_bfire goes high when any bank completes a
// write-response handshake (bvalid && bready).
// NOTE(review): both flags are OR-reduced, so handshakes on several banks in
// the same cycle are reported as a single event - confirm that whatever
// counts these events tolerates that when more than one bank is configured.
always @(*) begin
// default both flags low before accumulating the per-bank handshakes
m_axi_mem_wfire = 0;
m_axi_mem_bfire = 0;
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
m_axi_mem_wfire |= m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i];
m_axi_mem_bfire |= m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
end
end
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_rsp_fire
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
end
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
`POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire);
wire signed [NUM_MEM_BANKS_SIZEW:0] reqs_sub = (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_reqs) -
(NUM_MEM_BANKS_SIZEW+1)'(cur_wr_rsps);
always @(posedge clk) begin
if (reset) begin
always @(posedge ap_clk) begin
if (reset || ap_reset) begin
vx_pending_writes <= '0;
end else begin
vx_pending_writes <= vx_pending_writes + PENDING_WR_SIZEW'(reqs_sub);
if (m_axi_mem_wfire && ~m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes + 1;
if (~m_axi_mem_wfire && m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes - 1;
end
end
// Cycle counter tied to the run state: vx_reset_ctr increments on every
// ap_clk edge while state == STATE_RUN and is cleared to zero in any other
// state.
always @(posedge ap_clk) begin
if (state == STATE_RUN) begin
vx_reset_ctr <= vx_reset_ctr + 1;
end else begin
vx_reset_ctr <= '0;
end
end
VX_afu_ctrl #(
.S_AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
.S_AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH)
.AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
.AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) afu_ctrl (
.clk (clk),
.reset (reset),
.clk (ap_clk),
.reset (reset || ap_reset),
.clk_en (1'b1),
.s_axi_awvalid (s_axi_ctrl_awvalid),
.s_axi_awready (s_axi_ctrl_awready),
.s_axi_awaddr (s_axi_ctrl_awaddr),
.s_axi_wvalid (s_axi_ctrl_wvalid),
.s_axi_wready (s_axi_ctrl_wready),
.s_axi_wdata (s_axi_ctrl_wdata),
.s_axi_wstrb (s_axi_ctrl_wstrb),
.s_axi_arvalid (s_axi_ctrl_arvalid),
.s_axi_arready (s_axi_ctrl_arready),
.s_axi_araddr (s_axi_ctrl_araddr),
.s_axi_rvalid (s_axi_ctrl_rvalid),
.s_axi_rready (s_axi_ctrl_rready),
.s_axi_rdata (s_axi_ctrl_rdata),
.s_axi_rresp (s_axi_ctrl_rresp),
.s_axi_bvalid (s_axi_ctrl_bvalid),
.s_axi_bready (s_axi_ctrl_bready),
.s_axi_bresp (s_axi_ctrl_bresp),
@ -271,42 +224,42 @@ module VX_afu_wrap #(
.ap_idle (ap_idle),
.interrupt (interrupt),
.ap_ctrl_read (ap_ctrl_read),
`ifdef SCOPE
.scope_bus_in (scope_bus_out),
.scope_bus_out (scope_bus_in),
`endif
.mem_base (mem_base),
.dcr_wr_valid (dcr_wr_valid),
.dcr_wr_addr (dcr_wr_addr),
.dcr_wr_data (dcr_wr_data)
);
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS];
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
end
`SCOPE_IO_SWITCH (2);
`SCOPE_IO_SWITCH (2)
Vortex_axi #(
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.AXI_ADDR_WIDTH (M_AXI_MEM_ADDR_WIDTH),
.AXI_ADDR_WIDTH (`MEM_ADDR_WIDTH),
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) vortex_axi (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (vx_reset),
.clk (ap_clk),
.reset (reset || ap_reset || ~vx_running),
.m_axi_awvalid (m_axi_mem_awvalid_a),
.m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_u),
.m_axi_awaddr (m_axi_mem_awaddr_w),
.m_axi_awid (m_axi_mem_awid_a),
.m_axi_awlen (m_axi_mem_awlen_a),
`UNUSED_PIN (m_axi_awsize),
@ -330,7 +283,7 @@ module VX_afu_wrap #(
.m_axi_arvalid (m_axi_mem_arvalid_a),
.m_axi_arready (m_axi_mem_arready_a),
.m_axi_araddr (m_axi_mem_araddr_u),
.m_axi_araddr (m_axi_mem_araddr_w),
.m_axi_arid (m_axi_mem_arid_a),
.m_axi_arlen (m_axi_mem_arlen_a),
`UNUSED_PIN (m_axi_arsize),
@ -357,79 +310,38 @@ module VX_afu_wrap #(
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef SCOPE
`ifdef DBG_SCOPE_AFU
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
ap_reset,
ap_start,
ap_done,
ap_idle,
interrupt,
vx_reset,
vx_busy,
state,
m_axi_mem_awvalid_a[0],
m_axi_mem_awready_a[0],
m_axi_mem_wvalid_a[0],
m_axi_mem_wready_a[0],
m_axi_mem_bvalid_a[0],
m_axi_mem_bready_a[0],
m_axi_mem_arvalid_a[0],
m_axi_mem_arready_a[0],
m_axi_mem_rvalid_a[0],
m_axi_mem_rready_a[0]
}, {
dcr_wr_valid,
m_axi_mem_awfire_0,
m_axi_mem_arfire_0,
m_axi_mem_wfire_0,
m_axi_mem_bfire_0
}, {
dcr_wr_addr,
dcr_wr_data,
vx_pending_writes,
m_axi_mem_awaddr_u[0],
m_axi_mem_awid_a[0],
m_axi_mem_bid_a[0],
m_axi_mem_araddr_u[0],
m_axi_mem_arid_a[0],
m_axi_mem_rid_a[0]
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED(0)
`endif
`endif
`define TRIGGERS { \
reset, \
ap_start, \
ap_done, \
ap_idle, \
interrupt, \
vx_busy_wait, \
vx_busy, \
vx_running \
}
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_AFU
ila_afu ila_afu_inst (
.clk (clk),
.probe0 ({
ap_reset,
ap_start,
ap_done,
ap_idle,
state,
interrupt
}),
.probe1 ({
vx_pending_writes,
vx_busy,
vx_reset,
dcr_wr_valid,
dcr_wr_addr,
dcr_wr_data
})
`define PROBES { \
vx_pending_writes \
}
VX_scope_tap #(
.SCOPE_ID (0),
.TRIGGERW ($bits(`TRIGGERS)),
.PROBEW ($bits(`PROBES))
) scope_tap (
.clk (clk),
.reset (scope_reset_w[0]),
.start (1'b0),
.stop (1'b0),
.triggers (`TRIGGERS),
.probes (`PROBES),
.bus_in (scope_bus_in_w[0]),
.bus_out (scope_bus_out_w[0])
);
`endif
`else
`SCOPE_IO_UNUSED_W(0)
`endif
`ifdef SIMULATION
@ -440,7 +352,7 @@ module VX_afu_wrap #(
initial begin
$assertoff(0, vortex_axi);
end
always @(posedge clk) begin
always @(posedge ap_clk) begin
if (reset) begin
assert_delay_ctr <= '0;
assert_enabled <= 0;
@ -459,22 +371,19 @@ module VX_afu_wrap #(
`endif
`ifdef DBG_TRACE_AFU
always @(posedge clk) begin
always @(posedge ap_clk) begin
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%t: AXI Wr Req [%0d]: strb=0x%h, data=0x%h\n", $time, i, m_axi_mem_wstrb_a[i], m_axi_mem_wdata_a[i]))
end
if (m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]) begin
`TRACE(2, ("%t: AXI Wr Rsp [%0d]: id=0x%0h\n", $time, i, m_axi_mem_bid_a[i]))
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]));
end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, id=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
end
end
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,50 +16,37 @@
module vortex_afu #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
parameter C_M_AXI_MEM_NUM_BANKS = 1
`else
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
`endif
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
) (
// System signals
input wire ap_clk,
input wire ap_rst_n,
// AXI4 master interface
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
input wire s_axi_ctrl_wvalid,
output wire s_axi_ctrl_wready,
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
input wire s_axi_ctrl_arvalid,
input wire s_axi_ctrl_arvalid,
output wire s_axi_ctrl_arready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
output wire s_axi_ctrl_rvalid,
input wire s_axi_ctrl_rready,
input wire s_axi_ctrl_rready,
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
output wire [1:0] s_axi_ctrl_rresp,
output wire s_axi_ctrl_bvalid,
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
output wire interrupt
output wire interrupt
);
VX_afu_wrap #(
@ -67,39 +54,32 @@ module vortex_afu #(
.C_S_AXI_CTRL_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
.C_M_AXI_MEM_ID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.C_M_AXI_MEM_ADDR_WIDTH (C_M_AXI_MEM_ADDR_WIDTH),
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.C_M_AXI_MEM_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH)
) afu_wrap (
.clk (ap_clk),
.reset (~ap_rst_n),
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
`endif
.ap_clk (ap_clk),
.ap_rst_n (ap_rst_n),
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready),
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
.s_axi_ctrl_wvalid (s_axi_ctrl_wvalid),
.s_axi_ctrl_wready (s_axi_ctrl_wready),
.s_axi_ctrl_wdata (s_axi_ctrl_wdata),
.s_axi_ctrl_wstrb (s_axi_ctrl_wstrb),
.s_axi_ctrl_arvalid (s_axi_ctrl_arvalid),
.s_axi_ctrl_arready (s_axi_ctrl_arready),
.s_axi_ctrl_araddr (s_axi_ctrl_araddr),
.s_axi_ctrl_rvalid (s_axi_ctrl_rvalid),
.s_axi_ctrl_rready (s_axi_ctrl_rready),
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
.s_axi_ctrl_bready (s_axi_ctrl_bready),
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),
.interrupt (interrupt)
);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,12 +14,12 @@
`ifndef VORTEX_AFU_VH
`define VORTEX_AFU_VH
`ifndef PLATFORM_MEMORY_OFFSET
`define PLATFORM_MEMORY_OFFSET 0
`ifndef M_AXI_MEM_NUM_BANKS
`define M_AXI_MEM_NUM_BANKS 1
`endif
`ifndef PLATFORM_MEMORY_ID_WIDTH
`define PLATFORM_MEMORY_ID_WIDTH 32
`ifndef M_AXI_MEM_ID_WIDTH
`define M_AXI_MEM_ID_WIDTH 32
`endif
`define GEN_AXI_MEM(i) \

View file

@ -33,7 +33,7 @@ module VX_bank_flush #(
output wire flush_init,
output wire flush_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
output wire [`CS_WAY_SEL_WIDTH-1:0] flush_way,
output wire [NUM_WAYS-1:0] flush_way,
input wire flush_ready,
input wire mshr_empty,
input wire bank_empty
@ -48,21 +48,20 @@ module VX_bank_flush #(
localparam STATE_WAIT2 = 4;
localparam STATE_DONE = 5;
reg [2:0] state, state_n;
reg [2:0] state_r, state_n;
reg [CTR_WIDTH-1:0] counter;
reg [CTR_WIDTH-1:0] counter_r;
always @(*) begin
state_n = state;
case (state)
//STATE_IDLE:
default : begin
state_n = state_r;
case (state_r)
STATE_IDLE: begin
if (flush_begin) begin
state_n = STATE_WAIT1;
end
end
STATE_INIT: begin
if (counter == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
state_n = STATE_IDLE;
end
end
@ -73,7 +72,7 @@ module VX_bank_flush #(
end
end
STATE_FLUSH: begin
if (counter == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
end
end
@ -94,30 +93,35 @@ module VX_bank_flush #(
always @(posedge clk) begin
if (reset) begin
state <= STATE_INIT;
counter <= '0;
state_r <= STATE_INIT;
counter_r <= '0;
end else begin
state <= state_n;
if (state != STATE_IDLE) begin
if ((state == STATE_INIT)
|| ((state == STATE_FLUSH) && flush_ready)) begin
counter <= counter + CTR_WIDTH'(1);
state_r <= state_n;
if (state_r != STATE_IDLE) begin
if ((state_r == STATE_INIT)
|| ((state_r == STATE_FLUSH) && flush_ready)) begin
counter_r <= counter_r + CTR_WIDTH'(1);
end
end else begin
counter <= '0;
counter_r <= '0;
end
end
end
assign flush_end = (state == STATE_DONE);
assign flush_init = (state == STATE_INIT);
assign flush_valid = (state == STATE_FLUSH);
assign flush_line = counter[`CS_LINE_SEL_BITS-1:0];
// Flush status outputs, each decoded directly from the registered state:
// flush_end in STATE_DONE, flush_init in STATE_INIT, flush_valid in STATE_FLUSH.
assign flush_end = (state_r == STATE_DONE);
assign flush_init = (state_r == STATE_INIT);
assign flush_valid = (state_r == STATE_FLUSH);
// the low line-select bits of the counter give the line index being flushed
assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0];
if (WRITEBACK && (NUM_WAYS > 1)) begin : g_flush_way
assign flush_way = counter[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS];
end else begin : g_flush_way_all
assign flush_way = '0;
// Way selection for the flush request.
// With writeback enabled and at least one way-select bit, the counter bits
// located just above the line index are decoded into a one-hot mask over
// NUM_WAYS; otherwise every way is selected at once (all bits set).
if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
reg [NUM_WAYS-1:0] flush_way_r;
always @(*) begin
// clear the mask, then set only the bit indexed by the counter's way field
flush_way_r = '0;
flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
end
assign flush_way = flush_way_r;
end else begin
assign flush_way = {NUM_WAYS{1'b1}};
end
endmodule

View file

@ -19,26 +19,23 @@ module VX_cache import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 32768,
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 4,
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 16,
parameter WORD_SIZE = `XLEN/8,
// Core Response Queue Size
parameter CRSQ_SIZE = 4,
parameter CRSQ_SIZE = 2,
// Miss Reservation Queue Knob
parameter MSHR_SIZE = 16,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 4,
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
@ -51,23 +48,17 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// core request flags
parameter FLAGS_WIDTH = 0,
// Core response output register
parameter CORE_OUT_BUF = 3,
parameter CORE_OUT_BUF = 0,
// Memory request output register
parameter MEM_OUT_BUF = 3
parameter MEM_OUT_BUF = 0
) (
// PERF
`ifdef PERF_ENABLE
@ -78,37 +69,34 @@ module VX_cache import VX_gpu_pkg::*; #(
input wire reset,
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
`STATIC_ASSERT(NUM_BANKS >= MEM_PORTS, ("invalid parameter: number of banks must be greater or equal to number of memory ports"))
// In writeback mode, memory fill response may issue a new memory request to handle evicted blocks.
// We need to ensure that the memory request queue never fills up to avoid deadlock.
`STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WORD_WIDTH = WORD_SIZE * 8;
localparam WORD_SEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH;
localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH));
localparam MEM_RSP_DATAW = `CS_LINE_WIDTH + MEM_TAG_WIDTH;
localparam MEM_PORTS_SEL_BITS = `CLOG2(MEM_PORTS);
localparam MEM_PORTS_SEL_WIDTH = `UP(MEM_PORTS_SEL_BITS);
localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS));
localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS);
localparam REQ_XBAR_BUF = (NUM_REQS > 2) ? 2 : 0;
localparam CORE_RSP_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
@ -122,7 +110,6 @@ module VX_cache import VX_gpu_pkg::*; #(
) core_bus2_if[NUM_REQS]();
wire [NUM_BANKS-1:0] per_bank_flush_begin;
wire [`UP(UUID_WIDTH)-1:0] flush_uuid;
wire [NUM_BANKS-1:0] per_bank_flush_end;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
@ -130,9 +117,7 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.UUID_WIDTH(UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (reset),
@ -140,101 +125,92 @@ module VX_cache import VX_gpu_pkg::*; #(
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_begin (per_bank_flush_begin),
.flush_uuid (flush_uuid),
.flush_end (per_bank_flush_end)
);
// Memory response gather /////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if[MEM_PORTS]();
// Core response buffering
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
wire [MEM_PORTS-1:0] mem_rsp_queue_valid;
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-1:0] mem_rsp_queue_data;
wire [MEM_PORTS-1:0] mem_rsp_queue_ready;
`RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue
VX_elastic_buffer #(
.DATAW (MEM_RSP_DATAW),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (mem_bus_tmp_if[i].rsp_valid),
.data_in (mem_bus_tmp_if[i].rsp_data),
.ready_in (mem_bus_tmp_if[i].rsp_ready),
.valid_out (mem_rsp_queue_valid[i]),
.data_out (mem_rsp_queue_data[i]),
.ready_out (mem_rsp_queue_ready[i])
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset[i]),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] mem_rsp_queue_data_s;
wire [MEM_PORTS-1:0][BANK_SEL_WIDTH-1:0] mem_rsp_queue_sel;
///////////////////////////////////////////////////////////////////////////
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_data_s
wire [BANK_MEM_TAG_WIDTH-1:0] mem_rsp_tag_s = mem_rsp_queue_data[i][MEM_TAG_WIDTH-1:MEM_ARB_SEL_BITS];
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s = mem_rsp_queue_data[i][MEM_RSP_DATAW-1:MEM_TAG_WIDTH];
assign mem_rsp_queue_data_s[i] = {mem_rsp_data_s, mem_rsp_tag_s};
end
// Memory request buffering
wire mem_req_valid_s;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire mem_req_rw_s;
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_flush_s;
wire mem_req_ready_s;
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_sel
if (NUM_BANKS > 1) begin : g_multibanks
if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel
VX_bits_concat #(
.L (MEM_ARB_SEL_BITS),
.R (MEM_PORTS_SEL_BITS)
) mem_rsp_sel_concat (
.left_in (mem_rsp_queue_data[i][MEM_ARB_SEL_BITS-1:0]),
.right_in (MEM_PORTS_SEL_WIDTH'(i)),
.data_out (mem_rsp_queue_sel[i])
);
end else begin : g_no_arb_sel
assign mem_rsp_queue_sel[i] = MEM_PORTS_SEL_WIDTH'(i);
end
end else begin : g_singlebank
assign mem_rsp_queue_sel[i] = 0;
end
end
wire mem_bus_if_flush;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_valid;
wire [NUM_BANKS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] per_bank_mem_rsp_pdata;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
VX_stream_omega #(
.NUM_INPUTS (MEM_PORTS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (MEM_RSP_DATAW-MEM_ARB_SEL_BITS),
.ARBITER ("R"),
.OUT_BUF (3)
) mem_rsp_xbar (
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_rsp_queue_valid),
.data_in (mem_rsp_queue_data_s),
.sel_in (mem_rsp_queue_sel),
.ready_in (mem_rsp_queue_ready),
.valid_out (per_bank_mem_rsp_valid),
.data_out (per_bank_mem_rsp_pdata),
`UNUSED_PIN (sel_out),
.ready_out (per_bank_mem_rsp_ready),
`UNUSED_PIN (collisions)
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_rsp_data;
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_rsp_tag;
assign mem_bus_if.req_data.atype = mem_bus_if_flush ? `ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_rsp_data
assign {
per_bank_mem_rsp_data[i],
per_bank_mem_rsp_tag[i]
} = per_bank_mem_rsp_pdata[i];
end
///////////////////////////////////////////////////////////////////////////
// Core requests dispatch /////////////////////////////////////////////////
// Memory response buffering
wire mem_rsp_valid_s;
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (mem_bus_if.rsp_valid),
.ready_in (mem_bus_if.rsp_ready),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
.valid_out (mem_rsp_valid_s),
.ready_out (mem_rsp_ready_s)
);
///////////////////////////////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
@ -244,7 +220,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_core_req_flags;
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
@ -254,21 +230,33 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_req_tag;
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
// Ready signal returned to the memory response queue.
// With a single bank, that bank's ready is used directly; with several banks
// the ready bit is picked using a bank id derived from the response tag.
// NOTE(review): assumes CS_MEM_TAG_TO_BANK_ID extracts the bank index from
// the tag - confirm against the macro's definition.
if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s)];
end
// Bank requests dispatch
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags;
wire [NUM_REQS-1:0] core_req_flush;
wire [NUM_REQS-1:0] core_req_ready;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
@ -278,38 +266,35 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus2_if[i].req_valid;
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
assign core_req_data[i] = core_bus2_if[i].req_data.data;
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
assign core_req_flags[i] = `UP(FLAGS_WIDTH)'(core_bus2_if[i].req_data.flags);
assign core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
assign core_bus2_if[i].req_ready = core_req_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_wsel
if (WORDS_PER_LINE > 1) begin : g_wsel
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
end else begin : g_no_wsel
end else begin
assign core_req_wsel[i] = '0;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_line_addr
assign core_req_line_addr[i] = core_req_addr[i][(BANK_SEL_BITS + WORD_SEL_BITS) +: LINE_ADDR_WIDTH];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_bid
if (NUM_BANKS > 1) begin : g_multibanks
if (NUM_BANKS > 1) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_bid[i] = core_req_addr[i][WORD_SEL_BITS +: BANK_SEL_BITS];
end else begin : g_singlebank
assign core_req_bid[i] = '0;
end
end else begin
assign core_req_bid = '0;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_data_in
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_data_in[i] = {
core_req_line_addr[i],
core_req_rw[i],
@ -317,26 +302,26 @@ module VX_cache import VX_gpu_pkg::*; #(
core_req_byteen[i],
core_req_data[i],
core_req_tag[i],
core_req_flags[i]
core_req_flush[i]
};
end
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
`ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_collisions;
`endif
`RESET_RELAY (req_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
.reset (reset),
.reset (req_xbar_reset),
`ifdef PERF_ENABLE
.collisions(perf_collisions),
`else
@ -352,7 +337,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.ready_out (per_bank_core_req_ready)
);
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_req_data_out
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign {
per_bank_core_req_addr[i],
per_bank_core_req_rw[i],
@ -360,42 +345,50 @@ module VX_cache import VX_gpu_pkg::*; #(
per_bank_core_req_byteen[i],
per_bank_core_req_data[i],
per_bank_core_req_tag[i],
per_bank_core_req_flags[i]
per_bank_core_req_flush[i]
} = core_req_data_out[i];
end
// Banks access ///////////////////////////////////////////////////////////
// Banks access
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid;
if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
end else begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == bank_id);
end
`RESET_RELAY (bank_reset, reset);
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : g_banks
VX_cache_bank #(
.BANK_ID (bank_id),
.INSTANCE_ID (`SFORMATF(("%s-bank%0d", INSTANCE_ID, bank_id))),
.INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.DIRTY_BYTES (DIRTY_BYTES),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),
.CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)),
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF))
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
) bank (
.clk (clk),
.reset (reset),
.reset (bank_reset),
`ifdef PERF_ENABLE
.perf_read_miss (perf_read_miss_per_bank[bank_id]),
.perf_write_miss (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stall (perf_mshr_stall_per_bank[bank_id]),
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
`endif
// Core request
@ -407,7 +400,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.core_req_data (per_bank_core_req_data[bank_id]),
.core_req_tag (per_bank_core_req_tag[bank_id]),
.core_req_idx (per_bank_core_req_idx[bank_id]),
.core_req_flags (per_bank_core_req_flags[bank_id]),
.core_req_flush (per_bank_core_req_flush[bank_id]),
.core_req_ready (per_bank_core_req_ready[bank_id]),
// Core response
@ -419,49 +412,50 @@ module VX_cache import VX_gpu_pkg::*; #(
// Memory request
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (per_bank_mem_req_addr[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_data (per_bank_mem_req_data[bank_id]),
.mem_req_tag (per_bank_mem_req_tag[bank_id]),
.mem_req_flags (per_bank_mem_req_flags[bank_id]),
.mem_req_id (per_bank_mem_req_id[bank_id]),
.mem_req_flush (per_bank_mem_req_flush[bank_id]),
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response
.mem_rsp_valid (per_bank_mem_rsp_valid[bank_id]),
.mem_rsp_data (per_bank_mem_rsp_data[bank_id]),
.mem_rsp_tag (per_bank_mem_rsp_tag[bank_id]),
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s),
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// Flush request
.flush_begin (per_bank_flush_begin[bank_id]),
.flush_uuid (flush_uuid),
.flush_end (per_bank_flush_end[bank_id])
);
if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
end else begin
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
end
end
// Core responses gather //////////////////////////////////////////////////
// Bank responses gather
wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out;
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_rsp_data_in
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
end
`RESET_RELAY (rsp_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW),
.ARBITER ("R")
.ARBITER ("F")
) rsp_xbar (
.clk (clk),
.reset (reset),
.reset (rsp_xbar_reset),
`UNUSED_PIN (collisions),
.valid_in (per_bank_core_rsp_valid),
.data_in (core_rsp_data_in),
@ -473,170 +467,113 @@ module VX_cache import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_data_s
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_RSP_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
///////////////////////////////////////////////////////////////////////////
// Memory request arbitration /////////////////////////////////////////////
wire mem_req_valid_p;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
wire mem_req_rw_p;
wire [LINE_SIZE-1:0] mem_req_byteen_p;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_p;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
wire mem_req_flush_p;
wire mem_req_ready_p;
wire [NUM_BANKS-1:0][MEM_REQ_DATAW-1:0] per_bank_mem_req_pdata;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_req_pdata
assign per_bank_mem_req_pdata[i] = {
per_bank_mem_req_rw[i],
// Memory request arbitration
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {
per_bank_mem_req_addr[i],
per_bank_mem_req_data[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_flags[i],
per_bank_mem_req_tag[i]
per_bank_mem_req_data[i],
per_bank_mem_req_id[i],
per_bank_mem_req_flush[i]
};
end
wire [MEM_PORTS-1:0] mem_req_valid;
wire [MEM_PORTS-1:0][MEM_REQ_DATAW-1:0] mem_req_pdata;
wire [MEM_PORTS-1:0] mem_req_ready;
wire [MEM_PORTS-1:0][MEM_ARB_SEL_WIDTH-1:0] mem_req_sel_out;
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS(MEM_PORTS),
.DATAW (MEM_REQ_DATAW),
.ARBITER ("R")
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
.ARBITER ("F")
) mem_req_arb (
.clk (clk),
.reset (reset),
.valid_in (per_bank_mem_req_valid),
.data_in (per_bank_mem_req_pdata),
.ready_in (per_bank_mem_req_ready),
.valid_out (mem_req_valid),
.data_out (mem_req_pdata),
.ready_out (mem_req_ready),
.sel_out (mem_req_sel_out)
.data_in (data_in),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}),
.valid_out (mem_req_valid_p),
.ready_out (mem_req_ready_p),
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_req_buf
wire mem_req_rw;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr;
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
wire [LINE_SIZE-1:0] mem_req_byteen;
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
wire [BANK_MEM_TAG_WIDTH-1:0] mem_req_tag;
if (NUM_BANKS > 1) begin
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr_p);
assign mem_req_tag_p = MEM_TAG_WIDTH'({mem_req_bank_id, mem_req_id_p});
end else begin
assign mem_req_tag_p = MEM_TAG_WIDTH'(mem_req_id_p);
end
assign {
mem_req_rw,
mem_req_addr,
mem_req_data,
mem_req_byteen,
mem_req_flags,
mem_req_tag
} = mem_req_pdata[i];
// Memory request multi-port handling
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_w;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_w;
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags_w;
assign mem_req_valid_s = mem_req_valid_p;
assign mem_req_addr_s = mem_req_addr_p;
assign mem_req_tag_s = mem_req_tag_p;
assign mem_req_flush_s = mem_req_flush_p;
assign mem_req_ready_p = mem_req_ready_s;
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id;
VX_bits_concat #(
.L (MEM_ARB_SEL_BITS),
.R (MEM_PORTS_SEL_BITS)
) bank_id_concat (
.left_in (mem_req_sel_out[i]),
.right_in (MEM_PORTS_SEL_WIDTH'(i)),
.data_out (mem_req_bank_id)
);
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, mem_req_bank_id});
assign mem_req_tag_w = {mem_req_tag, mem_req_sel_out[i]};
end else begin : g_no_arb_sel
`UNUSED_VAR (mem_req_sel_out)
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, MEM_PORTS_SEL_WIDTH'(i)});
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
end
end else begin : g_mem_req_tag
`UNUSED_VAR (mem_req_sel_out)
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'(mem_req_addr);
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
end
if (WRITE_ENABLE != 0) begin
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end else begin
`UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid[i]),
.ready_in (mem_req_ready[i]),
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr_w, mem_req_data, mem_req_tag_w, mem_req_flags}),
.data_out ({mem_bus_tmp_if[i].req_data.rw, mem_bus_tmp_if[i].req_data.byteen, mem_bus_tmp_if[i].req_data.addr, mem_bus_tmp_if[i].req_data.data, mem_bus_tmp_if[i].req_data.tag, mem_req_flags_w}),
.valid_out (mem_bus_tmp_if[i].req_valid),
.ready_out (mem_bus_tmp_if[i].req_ready)
);
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
assign mem_bus_tmp_if[i].req_data.flags = mem_req_flags_w;
end else begin : g_no_mem_req_flags
assign mem_bus_tmp_if[i].req_data.flags = '0;
`UNUSED_VAR (mem_req_flags_w)
end
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end
assign mem_req_rw_s = 0;
assign mem_req_byteen_s = {LINE_SIZE{1'b1}};
assign mem_req_data_s = '0;
end
`ifdef PERF_ENABLE
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
// per cycle: core_reads, core_writes
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
wire perf_mem_stall_per_cycle = mem_bus_if.req_valid && ~mem_bus_if.req_ready;
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;

View file

@ -47,26 +47,19 @@ module VX_cache_bank #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// core request flags
parameter FLAGS_WIDTH = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
// Core response output register
parameter CORE_OUT_REG = 0,
// Memory request output register
parameter MEM_OUT_REG = 0,
// Memory request output buffer
parameter MEM_OUT_BUF = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
parameter MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH,
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
parameter WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS)
) (
@ -74,9 +67,9 @@ module VX_cache_bank #(
input wire reset,
`ifdef PERF_ENABLE
output wire perf_read_miss,
output wire perf_write_miss,
output wire perf_mshr_stall,
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
`endif
// Core Request
@ -88,7 +81,7 @@ module VX_cache_bank #(
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
input wire [`UP(FLAGS_WIDTH)-1:0] core_req_flags,
input wire core_req_flush, // flush enable
output wire core_req_ready,
// Core Response
@ -104,19 +97,18 @@ module VX_cache_bank #(
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
output wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr
output wire mem_req_flush,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
output wire mem_rsp_ready,
// flush
input wire flush_begin,
input wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
output wire flush_end
);
@ -144,45 +136,43 @@ module VX_cache_bank #(
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_init_st0, is_init_st1;
wire is_flush_st0, is_flush_st1;
wire [`CS_WAY_SEL_WIDTH-1:0] flush_way_st0, evict_way_st0;
wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_st0, way_idx_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire [`CS_LINE_SEL_BITS-1:0] line_idx_st0, line_idx_st1;
wire [`CS_TAG_SEL_BITS-1:0] line_tag_st0, line_tag_st1;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0, evict_tag_st1;
wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1;
wire rw_sel, rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] word_idx_sel, word_idx_st0, word_idx_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] write_word_st0, write_word_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0;
wire is_dirty_st0, is_dirty_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1;
wire is_hit_st0, is_hit_st1;
wire [`UP(FLAGS_WIDTH)-1:0] flags_sel, flags_st0, flags_st1;
wire creq_flush_sel, creq_flush_st0, creq_flush_st1;
wire evict_dirty_st0, evict_dirty_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
wire mshr_pending_st0, mshr_pending_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1;
wire mshr_empty;
wire flush_valid;
wire init_valid;
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
wire [`CS_WAY_SEL_WIDTH-1:0] flush_way;
wire [NUM_WAYS-1:0] flush_way;
wire flush_ready;
// ensure we have no pending memory request in the bank
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
// flush unit
VX_bank_flush #(
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
@ -204,7 +194,11 @@ module VX_cache_bank #(
.bank_empty (no_pending_req)
);
wire pipe_stall = crsp_queue_stall;
wire rdw_hazard1_sel;
wire rdw_hazard2_sel;
reg rdw_hazard3_st1;
wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1;
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
@ -223,217 +217,216 @@ module VX_cache_bank #(
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
&& ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) // needed for writethrough
&& ~rdw_hazard1_sel
&& ~pipe_stall;
assign mem_rsp_ready = fill_grant
&& ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign flush_ready = flush_grant
&& ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full // needed for fill requests
&& ~mshr_alm_full // needed for mshr allocation
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = init_valid;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = flush_valid && flush_ready;
wire flush_fire = flush_valid && flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0];
wire [TAG_WIDTH-1:0] mem_rsp_tag_s;
if (TAG_WIDTH > MEM_TAG_WIDTH) begin : g_mem_rsp_tag_s_pad
assign mem_rsp_tag_s = {mem_rsp_tag, (TAG_WIDTH-MEM_TAG_WIDTH)'(1'b0)};
end else begin : g_mem_rsp_tag_s_cut
assign mem_rsp_tag_s = mem_rsp_tag[MEM_TAG_WIDTH-1 -: TAG_WIDTH];
`UNUSED_VAR (mem_rsp_tag)
end
wire [TAG_WIDTH-1:0] flush_tag;
if (UUID_WIDTH != 0) begin : g_flush_tag_uuid
assign flush_tag = {flush_uuid, (TAG_WIDTH-UUID_WIDTH)'(1'b0)};
end else begin : g_flush_tag_0
`UNUSED_VAR (flush_uuid)
assign flush_tag = '0;
end
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
assign creq_flush_sel = core_req_valid && core_req_flush;
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign word_idx_sel= replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) :
(replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : core_req_tag));
assign flags_sel = core_req_valid ? core_req_flags : '0;
if (WRITE_ENABLE) begin : g_data_sel
for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i
if (i < `CS_WORD_WIDTH) begin : g_lo
assign data_sel[i] = replay_valid ? replay_data[i] : (mem_rsp_valid ? mem_rsp_data[i] : core_req_data[i]);
end else begin : g_hi
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
end
end else begin : g_data_sel_ro
assign data_sel = mem_rsp_data;
if (WRITE_ENABLE) begin
assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data);
end else begin
assign data_sel[`CS_WORD_WIDTH-1:0] = mem_rsp_data[`CS_WORD_WIDTH-1:0];
`UNUSED_VAR (core_req_data)
`UNUSED_VAR (replay_data)
end
if (UUID_WIDTH != 0) begin : g_req_uuid_sel
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_req_uuid_sel_0
assign req_uuid_sel = '0;
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
wire is_init_sel = init_valid;
wire is_creq_sel = creq_enable || replay_enable;
wire is_fill_sel = fill_enable;
wire is_flush_sel = flush_enable;
wire is_replay_sel = replay_enable;
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, flags_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, flags_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin : g_req_uuid_st0
if (UUID_WIDTH != 0) begin
assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_req_uuid_st0_0
assign req_uuid_st0 = '0;
end else begin
assign req_uuid_st0 = 0;
end
wire is_read_st0 = is_creq_st0 && ~rw_st0;
wire is_write_st0 = is_creq_st0 && rw_st0;
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0;
wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0;
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_read_st0 = valid_st0 && is_read_st0;
wire do_write_st0 = valid_st0 && is_write_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
wire do_read_st1 = valid_st1 && is_read_st1;
wire do_write_st1 = valid_st1 && is_write_st1;
assign line_idx_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
assign line_tag_st0 = `CS_LINE_ADDR_TAG(addr_st0);
assign write_word_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire do_lookup_st0 = do_read_st0 || do_write_st0;
wire do_lookup_st1 = do_read_st1 || do_write_st1;
wire [`CS_WAY_SEL_WIDTH-1:0] victim_way_st0;
wire [NUM_WAYS-1:0] tag_matches_st0;
VX_cache_repl #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.REPL_POLICY (REPL_POLICY)
) cache_repl (
.clk (clk),
.reset (reset),
.stall (pipe_stall),
.init (do_init_st0),
.lookup_valid(do_lookup_st1 && ~pipe_stall),
.lookup_hit (is_hit_st1),
.lookup_line(line_idx_st1),
.lookup_way (way_idx_st1),
.repl_valid (do_fill_st0 && ~pipe_stall),
.repl_line (line_idx_st0),
.repl_way (victim_way_st0)
);
assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0;
wire [NUM_WAYS-1:0] evict_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
VX_cache_tags #(
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITEBACK (WRITEBACK)
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH)
) cache_tags (
.clk (clk),
.reset (reset),
// inputs
.req_uuid (req_uuid_st0),
.stall (pipe_stall),
// init/flush/fill/write/lookup
.init (do_init_st0),
.flush (do_flush_st0 && ~pipe_stall),
.fill (do_fill_st0 && ~pipe_stall),
.read (do_read_st0 && ~pipe_stall),
.write (do_write_st0 && ~pipe_stall),
.line_idx (line_idx_st0),
.line_tag (line_tag_st0),
.evict_way (evict_way_st0),
// outputs
.flush (do_flush_st0),
.fill (do_fill_st0),
.write (do_cache_wr_st0),
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.way_sel (flush_way_st0),
.tag_matches(tag_matches_st0),
.evict_dirty(is_dirty_st0),
// replacement
.evict_dirty(evict_dirty_st0),
.evict_way (evict_way_st0),
.evict_tag (evict_tag_st0)
);
wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0;
VX_onehot_encoder #(
.N (NUM_WAYS)
) way_idx_enc (
.data_in (tag_matches_st0),
.data_out (hit_idx_st0),
`UNUSED_PIN (valid_out)
);
wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0;
assign way_idx_st0 = is_creq_st0 ? hit_idx_st0 : evict_way_st0;
assign is_hit_st0 = (| tag_matches_st0);
wire is_flush2_st0 = WRITEBACK && is_flush_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0;
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0;
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, flags_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, data_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, flags_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, data_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1})
.data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
);
if (UUID_WIDTH != 0) begin : g_req_uuid_st1
// we have a tag hit
wire is_hit_st1 = (| way_sel_st1);
if (UUID_WIDTH != 0) begin
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_req_uuid_st1_0
assign req_uuid_st1 = '0;
end else begin
assign req_uuid_st1 = 0;
end
assign addr_st1 = {line_tag_st1, line_idx_st1};
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_init_st1 = valid_st1 && is_init_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1;
wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1;
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0];
`UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always gets a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1 && ~is_hit_st1), ("%t: missed mshr replay", $time))
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay"));
assign write_word_st1 = data_st1[`CS_WORD_WIDTH-1:0];
`UNUSED_VAR (data_st1)
// both tag and data stores use BRAM with no read-during-write protection.
// we need to stall the pipeline to prevent read-after-write hazards.
assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill
assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceding write
always @(posedge clk) begin
// stall reads following writes to same line address
rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1)
&& ~rdw_hazard3_st1; // release pipeline stall
end
wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_st1;
wire [LINE_SIZE-1:0] evict_byteen_st1;
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
wire [LINE_SIZE-1:0] write_byteen_st1;
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] write_byteen_r;
always @(*) begin
write_byteen_r = '0;
write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1;
end
assign write_byteen_st1 = write_byteen_r;
end else begin
assign write_byteen_st1 = byteen_st1;
end
VX_cache_data #(
.INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
@ -441,57 +434,56 @@ module VX_cache_bank #(
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES)
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH)
) cache_data (
.clk (clk),
.reset (reset),
// inputs
.init (do_init_st0),
.fill (do_fill_st0 && ~pipe_stall),
.flush (do_flush_st0 && ~pipe_stall),
.read (do_read_st0 && ~pipe_stall),
.write (do_write_st0 && ~pipe_stall),
.evict_way (evict_way_st0),
.tag_matches(tag_matches_st0),
.line_idx (line_idx_st0),
.fill_data (data_st0),
.write_word (write_word_st0),
.word_idx (word_idx_st0),
.write_byteen(byteen_st0),
.way_idx_r (way_idx_st1),
// outputs
.req_uuid (req_uuid_st1),
.stall (pipe_stall),
.init (do_init_st1),
.read (do_cache_rd_st1),
.fill (do_fill_st1),
.flush (do_flush_st1),
.write (do_cache_wr_st1),
.way_sel (way_sel_st1),
.line_addr (addr_st1),
.wsel (wsel_st1),
.fill_data (fill_data_st1),
.write_data (write_data_st1),
.write_byteen(write_byteen_st1),
.read_data (read_data_st1),
.evict_byteen(evict_byteen_st1)
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
);
// only allocate MSHR entries for non-replay core requests
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~is_replay_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~is_replay_st1;
wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0;
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
// release allocated mshr entry if we had a hit
wire mshr_release_st1;
if (WRITEBACK) begin : g_mshr_release
if (WRITEBACK) begin
assign mshr_release_st1 = is_hit_st1;
end else begin : g_mshr_release_ro
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address.
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content.
// this can happen when writes are sent to memory late, when a related fill was already in flight.
end else begin
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
// this can happen when writes are sent late, when the fill was already in flight.
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
end
wire mshr_release_fire = mshr_finalize_st1 && mshr_release_st1 && ~pipe_stall;
wire [1:0] mshr_dequeue;
`POP_COUNT(mshr_dequeue, {replay_fire, mshr_release_fire});
VX_pending_size #(
.SIZE (MSHR_SIZE),
.DECRW (2)
.SIZE (MSHR_SIZE)
) mshr_pending_size (
.clk (clk),
.reset (reset),
.incr (core_req_fire),
.decr (mshr_dequeue),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
.empty (mshr_empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full),
@ -500,12 +492,11 @@ module VX_cache_bank #(
);
VX_cache_mshr #(
.INSTANCE_ID (`SFORMATF(("%s-mshr", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.MSHR_SIZE (MSHR_SIZE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
) cache_mshr (
@ -513,7 +504,7 @@ module VX_cache_bank #(
.reset (reset),
.deq_req_uuid (req_uuid_sel),
.alc_req_uuid (req_uuid_st0),
.lkp_req_uuid (req_uuid_st0),
.fin_req_uuid (req_uuid_st1),
// memory fill
@ -530,23 +521,37 @@ module VX_cache_bank #(
.dequeue_ready (replay_ready),
// allocate
.allocate_valid (mshr_allocate_st0 && ~pipe_stall),
.allocate_valid (mshr_allocate_st0),
.allocate_addr (addr_st0),
.allocate_rw (rw_st0),
.allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0}),
.allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}),
.allocate_id (mshr_alloc_id_st0),
.allocate_pending(mshr_pending_st0),
.allocate_previd(mshr_previd_st0),
.allocate_prev (mshr_prev_st0),
`UNUSED_PIN (allocate_ready),
// lookup
.lookup_valid (mshr_lookup_st0),
.lookup_addr (addr_st0),
.lookup_pending (mshr_lookup_pending_st0),
.lookup_rw (mshr_lookup_rw_st0),
// finalize
.finalize_valid (mshr_finalize_st1 && ~pipe_stall),
.finalize_is_release(mshr_release_st1),
.finalize_is_pending(mshr_pending_st1),
.finalize_valid (mshr_finalize_st1),
.finalize_release(mshr_release_st1),
.finalize_pending(mshr_pending_st1),
.finalize_id (mshr_id_st1),
.finalize_previd(mshr_previd_st1)
.finalize_prev (mshr_prev_st1)
);
// check if there are pending requests to same line in the MSHR
wire [MSHR_SIZE-1:0] lookup_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = mshr_lookup_pending_st0[i]
&& (i != mshr_alloc_id_st0) // exclude current mshr id
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
end
assign mshr_pending_st0 = (| lookup_matches);
// schedule core response
wire crsp_queue_valid, crsp_queue_ready;
@ -554,19 +559,19 @@ module VX_cache_bank #(
wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
wire [TAG_WIDTH-1:0] crsp_queue_tag;
assign crsp_queue_valid = do_read_st1 && is_hit_st1;
assign crsp_queue_valid = do_cache_rd_st1;
assign crsp_queue_idx = req_idx_st1;
assign crsp_queue_data = read_data_st1[word_idx_st1];
assign crsp_queue_data = read_data_st1;
assign crsp_queue_tag = tag_st1;
VX_elastic_buffer #(
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
.SIZE (CRSQ_SIZE),
.OUT_REG (CORE_OUT_REG)
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (crsp_queue_valid),
.valid_in (crsp_queue_valid && ~rdw_hazard3_st1),
.ready_in (crsp_queue_ready),
.data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
@ -582,93 +587,59 @@ module VX_cache_bank #(
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag;
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
wire mreq_queue_rw;
wire [`UP(FLAGS_WIDTH)-1:0] mreq_queue_flags;
wire mreq_queue_flush;
wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK);
wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1;
wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
wire do_writeback_st1 = do_fill_or_flush_st1 && is_dirty_st1;
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_st1 = {evict_tag_st1, line_idx_st1};
wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1;
if (WRITE_ENABLE) begin : g_mreq_queue
if (WRITEBACK) begin : g_wb
if (DIRTY_BYTES) begin : g_dirty_bytes
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| evict_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (is_dirty_st1 == has_dirty_bytes), ("%t: missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", $time, is_dirty_st1, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID)))
end
// issue a fill request on a read/write miss
// issue a writeback on a dirty line eviction
assign mreq_queue_push = ((do_lookup_st1 && ~is_hit_st1 && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~pipe_stall;
assign mreq_queue_addr = is_fill_or_flush_st1 ? evict_addr_st1 : addr_st1;
assign mreq_queue_rw = is_fill_or_flush_st1;
assign mreq_queue_data = read_data_st1;
assign mreq_queue_byteen = is_fill_or_flush_st1 ? evict_byteen_st1 : '1;
`UNUSED_VAR (write_word_st1)
`UNUSED_VAR (byteen_st1)
end else begin : g_wt
wire [LINE_SIZE-1:0] line_byteen;
VX_demux #(
.DATAW (WORD_SIZE),
.N (`CS_WORDS_PER_LINE)
) byteen_demux (
.sel_in (word_idx_st1),
.data_in (byteen_st1),
.data_out (line_byteen)
);
// issue a fill request on a read miss
// issue a memory write on a write request
assign mreq_queue_push = ((do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1)
|| do_write_st1)
&& ~pipe_stall;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_rw = rw_st1;
assign mreq_queue_data = {`CS_WORDS_PER_LINE{write_word_st1}};
assign mreq_queue_byteen = rw_st1 ? line_byteen : '1;
`UNUSED_VAR (is_fill_or_flush_st1)
`UNUSED_VAR (do_writeback_st1)
`UNUSED_VAR (evict_addr_st1)
`UNUSED_VAR (evict_byteen_st1)
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| dirty_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID)));
end
end else begin : g_mreq_queue_ro
// issue a fill request on a read miss
assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1)
&& ~pipe_stall;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_rw = 0;
assign mreq_queue_data = '0;
assign mreq_queue_byteen = '1;
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~rdw_hazard3_st1;
end else begin
`UNUSED_VAR (do_writeback_st1)
`UNUSED_VAR (evict_addr_st1)
`UNUSED_VAR (evict_byteen_st1)
`UNUSED_VAR (write_word_st1)
`UNUSED_VAR (byteen_st1)
end
if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid
assign mreq_queue_tag = {req_uuid_st1, mshr_id_st1};
end else begin : g_mreq_queue_tag
assign mreq_queue_tag = mshr_id_st1;
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1)
&& ~rdw_hazard3_st1;
end
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_flags = flags_st1;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_flush = creq_flush_st1;
if (WRITE_ENABLE) begin
assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1;
assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1;
assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1;
end else begin
assign mreq_queue_rw = 0;
assign mreq_queue_data = 0;
assign mreq_queue_byteen = 0;
`UNUSED_VAR (dirty_data_st1)
`UNUSED_VAR (dirty_byteen_st1)
end
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
.DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE - PIPELINE_STAGES),
.OUT_REG (MEM_OUT_REG)
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_queue (
.clk (clk),
.reset (reset),
.push (mreq_queue_push),
.pop (mreq_queue_pop),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_flags}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_flags}),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}),
.empty (mreq_queue_empty),
.alm_full (mreq_queue_alm_full),
`UNUSED_PIN (full),
@ -678,101 +649,44 @@ module VX_cache_bank #(
assign mem_req_valid = ~mreq_queue_empty;
`UNUSED_VAR (do_lookup_st0)
///////////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
assign perf_read_miss = do_read_st1 && ~is_hit_st1;
assign perf_write_miss = do_write_st1 && ~is_hit_st1;
assign perf_mshr_stall = mshr_alm_full;
assign perf_read_misses = do_read_miss_st1;
assign perf_write_misses = do_write_miss_st1;
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
wire [`XLEN-1:0] mem_rsp_full_addr = `CS_BANK_TO_FULL_ADDR(mem_rsp_addr, BANK_ID);
wire [`XLEN-1:0] replay_full_addr = `CS_BANK_TO_FULL_ADDR(replay_addr, BANK_ID);
wire [`XLEN-1:0] core_req_full_addr = `CS_BANK_TO_FULL_ADDR(core_req_addr, BANK_ID);
wire [`XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(addr_st0, BANK_ID);
wire [`XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID);
wire [`XLEN-1:0] mreq_queue_full_addr = `CS_BANK_TO_FULL_ADDR(mreq_queue_addr, BANK_ID);
always @(posedge clk) begin
if (input_stall || pipe_stall) begin
`TRACE(4, ("%t: *** %s stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID,
crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full))
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1));
end
if (mem_rsp_fire) begin
`TRACE(2, ("%t: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
mem_rsp_full_addr, mem_rsp_id, mem_rsp_data, req_uuid_sel))
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
end
if (replay_fire) begin
`TRACE(2, ("%t: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID,
replay_full_addr, replay_tag, replay_idx, req_uuid_sel))
`TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
end
if (core_req_fire) begin
if (core_req_rw) begin
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
core_req_full_addr, core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel))
end else begin
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID,
core_req_full_addr, core_req_tag, core_req_idx, req_uuid_sel))
end
end
if (do_init_st0) begin
`TRACE(3, ("%t: %s tags-init: addr=0x%0h, line=%0d\n", $time, INSTANCE_ID, full_addr_st0, line_idx_st0))
end
if (do_fill_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s tags-fill: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
end
if (do_flush_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s tags-flush: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
end
if (do_lookup_st0 && ~pipe_stall) begin
if (is_hit_st0) begin
`TRACE(3, ("%t: %s tags-hit: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
end else begin
`TRACE(3, ("%t: %s tags-miss: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
end
end
if (do_fill_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-fill: addr=0x%0h, way=%0d, line=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, way_idx_st0, line_idx_st0, data_st0, req_uuid_st0))
end
if (do_flush_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-flush: addr=0x%0h, way=%0d, line=%0d (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, way_idx_st0, line_idx_st0, req_uuid_st0))
end
if (do_read_st1 && is_hit_st1 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, crsp_queue_data, req_uuid_st1))
end
if (do_write_st1 && is_hit_st1 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-write: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, byteen_st1, write_word_st1, req_uuid_st1))
if (core_req_rw)
`TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else
`TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
end
if (crsp_queue_fire) begin
`TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st1, crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1))
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
end
if (mreq_queue_push) begin
if (!WRITEBACK && do_write_st1) begin
`TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
end else if (WRITEBACK && do_writeback_st1) begin
`TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
end else begin
`TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID,
mreq_queue_full_addr, mshr_id_st1, req_uuid_st1))
end
if (do_creq_wr_st1 && !WRITEBACK)
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else if (do_writeback_st1)
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
else
`TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
end
end
`endif

View file

@ -15,10 +15,10 @@
module VX_cache_bypass #(
parameter NUM_REQS = 1,
parameter MEM_PORTS = 1,
parameter TAG_SEL_IDX = 0,
parameter CACHE_ENABLE = 0,
parameter PASSTHRU = 0,
parameter NC_ENABLE = 0,
parameter WORD_SIZE = 1,
parameter LINE_SIZE = 1,
@ -29,11 +29,14 @@ module VX_cache_bypass #(
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
parameter UUID_WIDTH = 0,
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0
parameter MEM_OUT_BUF = 0,
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
) (
input wire clk,
input wire reset,
@ -45,222 +48,304 @@ module VX_cache_bypass #(
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
// Memory request in
VX_mem_bus_if.slave mem_bus_in_if [MEM_PORTS],
VX_mem_bus_if.slave mem_bus_in_if,
// Memory request out
VX_mem_bus_if.master mem_bus_out_if [MEM_PORTS]
VX_mem_bus_if.master mem_bus_out_if
);
localparam DIRECT_PASSTHRU = !CACHE_ENABLE && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == MEM_PORTS);
localparam CORE_DATA_WIDTH = WORD_SIZE * 8;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam CORE_TAG_ID_WIDTH = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_WIDTH = `CLOG2(`CDIV(NUM_REQS, MEM_PORTS)) + CORE_TAG_ID_WIDTH;
localparam MEM_TAG_NC1_WIDTH = UUID_WIDTH + MEM_TAG_ID_WIDTH;
localparam MEM_TAG_NC2_WIDTH = MEM_TAG_NC1_WIDTH + WSEL_BITS;
localparam MEM_TAG_OUT_WIDTH = CACHE_ENABLE ? `MAX(MEM_TAG_IN_WIDTH, MEM_TAG_NC2_WIDTH) : MEM_TAG_NC2_WIDTH;
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
// handle non-cacheable core request switch ///////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH)
) core_bus_nc_switch_if[(CACHE_ENABLE ? 2 : 1) * NUM_REQS]();
// handle core requests ///////////////////////////////////////////////////
wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_valids;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_is_nc
if (CACHE_ENABLE) begin : g_cache
assign core_req_nc_sel[i] = ~core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
end else begin : g_no_cache
assign core_req_nc_sel[i] = 1'b0;
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU != 0) begin
assign core_req_nc_idxs[i] = 1'b1;
end else if (NC_ENABLE) begin
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO];
end else begin
assign core_req_nc_idxs[i] = 1'b0;
end
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
end
VX_mem_switch #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS ((CACHE_ENABLE ? 2 : 1) * NUM_REQS),
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH),
.ARBITER ("R"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF))
) core_bus_nc_switch (
.clk (clk),
.reset (reset),
.bus_sel (core_req_nc_sel),
.bus_in_if (core_bus_in_if),
.bus_out_if(core_bus_nc_switch_if)
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P")
) core_req_nc_arb (
.clk (clk),
.reset (reset),
.requests (core_req_nc_valids),
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid),
.grant_ready (core_req_nc_ready)
);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH)
) core_bus_in_nc_if[NUM_REQS]();
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_nc_switch_if
assign core_bus_in_nc_if[i].req_valid = core_bus_nc_switch_if[0 * NUM_REQS + i].req_valid;
assign core_bus_in_nc_if[i].req_data = core_bus_nc_switch_if[0 * NUM_REQS + i].req_data;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].req_ready = core_bus_in_nc_if[i].req_ready;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_valid = core_bus_in_nc_if[i].rsp_valid;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_data = core_bus_in_nc_if[i].rsp_data;
assign core_bus_in_nc_if[i].rsp_ready = core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_ready;
if (CACHE_ENABLE) begin : g_cache
assign core_bus_out_if[i].req_valid = core_bus_nc_switch_if[1 * NUM_REQS + i].req_valid;
assign core_bus_out_if[i].req_data = core_bus_nc_switch_if[1 * NUM_REQS + i].req_data;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].req_ready = core_bus_out_if[i].req_ready;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_valid = core_bus_out_if[i].rsp_valid;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_data = core_bus_out_if[i].rsp_data;
assign core_bus_out_if[i].rsp_ready = core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_ready;
end else begin : g_no_cache
`INIT_VX_MEM_BUS_IF (core_bus_out_if[i])
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
: core_bus_out_if[i].req_ready;
end
// handle memory requests /////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (MEM_TAG_NC1_WIDTH)
) core_bus_nc_arb_if[MEM_PORTS]();
wire mem_req_out_valid;
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
VX_mem_arb #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS(MEM_PORTS),
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH),
.TAG_SEL_IDX(TAG_SEL_IDX),
.ARBITER (CACHE_ENABLE ? "P" : "R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(0)
) core_bus_nc_arb (
.clk (clk),
.reset (reset),
.bus_in_if (core_bus_in_nc_if),
.bus_out_if (core_bus_nc_arb_if)
);
wire core_req_nc_sel_rw;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_NC2_WIDTH)
) mem_bus_out_nc_if[MEM_PORTS]();
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.atype,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.tag
};
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_nc
wire core_req_nc_arb_rw;
wire [WORD_SIZE-1:0] core_req_nc_arb_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_arb_addr;
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_arb_flags;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_arb_data;
wire [MEM_TAG_NC1_WIDTH-1:0] core_req_nc_arb_tag;
assign {
core_req_nc_sel_rw,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_atype,
core_req_nc_sel_data,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
assign {
core_req_nc_arb_rw,
core_req_nc_arb_addr,
core_req_nc_arb_data,
core_req_nc_arb_byteen,
core_req_nc_arb_flags,
core_req_nc_arb_tag
} = core_bus_nc_arb_if[i].req_data;
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
logic [MEM_ADDR_WIDTH-1:0] core_req_nc_arb_addr_w;
logic [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] core_req_nc_arb_byteen_w;
logic [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] core_req_nc_arb_data_w;
logic [CORE_DATA_WIDTH-1:0] core_rsp_nc_arb_data_w;
wire [MEM_TAG_NC2_WIDTH-1:0] core_req_nc_arb_tag_w;
wire [MEM_TAG_NC1_WIDTH-1:0] core_rsp_nc_arb_tag_w;
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype;
if (WORDS_PER_LINE > 1) begin : g_multi_word_line
wire [WSEL_BITS-1:0] rsp_wsel;
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_arb_addr[WSEL_BITS-1:0];
always @(*) begin
core_req_nc_arb_byteen_w = '0;
core_req_nc_arb_byteen_w[req_wsel] = core_req_nc_arb_byteen;
core_req_nc_arb_data_w = 'x;
core_req_nc_arb_data_w[req_wsel] = core_req_nc_arb_data;
end
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
if (WORDS_PER_LINE > 1) begin
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
always @(*) begin
mem_req_byteen_in_r = '0;
mem_req_byteen_in_r[req_wsel] = core_req_nc_sel_byteen;
mem_req_data_in_r = 'x;
mem_req_data_in_r[req_wsel] = core_req_nc_sel_data;
end
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
end
end else begin
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
end
end
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
end else begin
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end
if (PASSTHRU != 0) begin
assign mem_req_out_tag = mem_req_tag_bypass;
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
end else begin
if (NC_ENABLE) begin
VX_bits_insert #(
.N (MEM_TAG_NC1_WIDTH),
.S (WSEL_BITS),
.N (MEM_TAG_OUT_WIDTH-1),
.S (1),
.POS (TAG_SEL_IDX)
) wsel_insert (
.data_in (core_req_nc_arb_tag),
.ins_in (req_wsel),
.data_out (core_req_nc_arb_tag_w)
) mem_req_tag_in_nc_insert (
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
.ins_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_out_tag)
);
VX_bits_remove #(
.N (MEM_TAG_NC2_WIDTH),
.S (WSEL_BITS),
.POS (TAG_SEL_IDX)
) wsel_remove (
.data_in (mem_bus_out_nc_if[i].rsp_data.tag),
.sel_out (rsp_wsel),
.data_out (core_rsp_nc_arb_tag_w)
);
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end else begin : g_single_word_line
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr;
assign core_req_nc_arb_byteen_w = core_req_nc_arb_byteen;
assign core_req_nc_arb_data_w = core_req_nc_arb_data;
assign core_req_nc_arb_tag_w = MEM_TAG_NC2_WIDTH'(core_req_nc_arb_tag);
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data;
assign core_rsp_nc_arb_tag_w = MEM_TAG_NC1_WIDTH'(mem_bus_out_nc_if[i].rsp_data.tag);
end
assign mem_bus_out_nc_if[i].req_valid = core_bus_nc_arb_if[i].req_valid;
assign mem_bus_out_nc_if[i].req_data = {
core_req_nc_arb_rw,
core_req_nc_arb_addr_w,
core_req_nc_arb_data_w,
core_req_nc_arb_byteen_w,
core_req_nc_arb_flags,
core_req_nc_arb_tag_w
};
assign core_bus_nc_arb_if[i].req_ready = mem_bus_out_nc_if[i].req_ready;
assign core_bus_nc_arb_if[i].rsp_valid = mem_bus_out_nc_if[i].rsp_valid;
assign core_bus_nc_arb_if[i].rsp_data = {
core_rsp_nc_arb_data_w,
core_rsp_nc_arb_tag_w
};
assign mem_bus_out_nc_if[i].rsp_ready = core_bus_nc_arb_if[i].rsp_ready;
end
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_OUT_WIDTH)
) mem_bus_out_src_if[(CACHE_ENABLE ? 2 : 1) * MEM_PORTS]();
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_src
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[0 * MEM_PORTS + i], mem_bus_out_nc_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_NC2_WIDTH, UUID_WIDTH);
if (CACHE_ENABLE) begin : g_cache
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[1 * MEM_PORTS + i], mem_bus_in_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_IN_WIDTH, UUID_WIDTH);
end else begin : g_no_cache
`UNUSED_VX_MEM_BUS_IF(mem_bus_in_if[i])
end else begin
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
end
end
VX_mem_arb #(
.NUM_INPUTS ((CACHE_ENABLE ? 2 : 1) * MEM_PORTS),
.NUM_OUTPUTS(MEM_PORTS),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_OUT_WIDTH),
.ARBITER ("R"),
.REQ_OUT_BUF(DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.RSP_OUT_BUF(0)
) mem_bus_out_arb (
.clk (clk),
.reset (reset),
.bus_in_if (mem_bus_out_src_if),
.bus_out_if (mem_bus_out_if)
assign mem_bus_in_if.req_ready = mem_req_out_ready;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.ready_out (mem_bus_out_if.req_ready)
);
// handle core responses //////////////////////////////////////////////////
wire [NUM_REQS-1:0] core_rsp_in_valid;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
wire [NUM_REQS-1:0] core_rsp_in_ready;
// is_mem_rsp_nc flags a memory response that is handled by the bypass path
// (it drives rsp_nc_valid_r and the rsp_ready selection further below).
wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin
// pass-through mode: every valid memory response is treated as a bypass response
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
end else begin
if (NC_ENABLE) begin
// bypass enabled: only responses whose tag has bit TAG_SEL_IDX set are bypass responses
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin
// bypass disabled: no response is ever flagged
assign is_mem_rsp_nc = 1'b0;
end
end
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (TAG_SEL_IDX)
) mem_rsp_tag_in_nc_remove (
.data_in (mem_bus_out_if.rsp_data.tag),
.data_out (mem_rsp_tag_id_nc)
);
// rsp_idx: index of the core request port that a bypass response targets.
// It is read from the REQ_SEL_BITS-wide field located above the
// CORE_TAG_ID_BITS and WSEL_BITS fields of mem_rsp_tag_id_nc.
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
if (NUM_REQS > 1) begin
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin
// single requester: the index is constant zero
assign rsp_idx = 1'b0;
end
// rsp_nc_valid_r: per-port valid vector for bypass responses.
// All bits default to zero; only the bit selected by rsp_idx takes the value of is_mem_rsp_nc.
reg [NUM_REQS-1:0] rsp_nc_valid_r;
always @(*) begin
rsp_nc_valid_r = '0;
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || rsp_nc_valid_r[i];
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
end
if (WORDS_PER_LINE > 1) begin
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
end
end
// mem_rsp_tag_in_nc2: tag returned to the core for a bypass response.
// It keeps the top UUID_WIDTH bits and the low CORE_TAG_ID_BITS bits of
// mem_rsp_tag_id_nc and drops the bits in between (the fields sliced out above
// as the word select and rsp_idx).
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
if (UUID_WIDTH != 0) begin
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
end else begin
// no UUID field: only the low CORE_TAG_ID_BITS bits are kept
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
end
// Per-port response tag selection.
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU) begin
// pass-through mode: the tag always comes from the memory response
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
end else if (NC_ENABLE) begin
// a valid response on core_bus_out_if[i] takes precedence over the bypass tag
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
end else begin
// bypass disabled: the tag always comes from core_bus_out_if[i]
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_in_valid[i]),
.ready_in (core_rsp_in_ready[i]),
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
.valid_out (core_bus_in_if[i].rsp_valid),
.ready_out (core_bus_in_if[i].rsp_ready)
);
end
// handle memory responses ////////////////////////////////////////////////
// Drive the response channel of mem_bus_in_if from mem_bus_out_if.
if (PASSTHRU != 0) begin
// pass-through mode: mem_bus_in_if never receives a response; tie the channel off
assign mem_bus_in_if.rsp_valid = 1'b0;
assign mem_bus_in_if.rsp_data.data = '0;
assign mem_bus_in_if.rsp_data.tag = '0;
end else if (NC_ENABLE) begin
// forward only responses whose tag bit TAG_SEL_IDX is clear (the complement of is_mem_rsp_nc)
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
// the forwarded tag is the low MEM_TAG_IN_WIDTH bits of mem_rsp_tag_id_nc
// NOTE(review): mem_rsp_tag_id_nc is the VX_bits_remove output, presumably the tag without the select bit - confirm
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
end else begin
// bypass disabled: every response is forwarded unchanged
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
end
// Collect the per-port rsp_valid bits of core_bus_out_if into a vector so it can be indexed by rsp_idx.
wire [NUM_REQS-1:0] core_rsp_out_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
end
// Back-pressure towards memory:
// - bypass response: accepted only when the targeted port has no valid response on
//   core_bus_out_if and that port's core_rsp_in_ready is asserted;
// - any other response: ready follows mem_bus_in_if.rsp_ready.
assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? (~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready;
endmodule

View file

@ -23,26 +23,23 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Number of requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 32768,
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 16,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 4,
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 4,
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
@ -55,26 +52,20 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// core request flags
parameter FLAGS_WIDTH = 0,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 3,
parameter CORE_OUT_BUF = 0,
// Memory request output buffer
parameter MEM_OUT_BUF = 3
parameter MEM_OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -85,16 +76,14 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`endif
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
VX_mem_bus_if.master mem_bus_if
);
localparam NUM_CACHES = `UP(NUM_UNITS);
localparam PASSTHRU = (NUM_UNITS == 0);
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH);
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
@ -106,14 +95,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) cache_mem_bus_if[NUM_CACHES * MEM_PORTS]();
) cache_mem_bus_if[NUM_CACHES]();
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_arb
`RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
@ -124,7 +115,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_tmp_if[NUM_CACHES]();
for (genvar j = 0; j < NUM_INPUTS; ++j) begin : g_core_bus_tmp_if
for (genvar j = 0; j < NUM_INPUTS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
@ -136,40 +127,40 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? CORE_OUT_BUF : 0)
) core_arb (
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb (
.clk (clk),
.reset (reset),
.reset (cache_arb_reset[i]),
.bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if)
);
for (genvar k = 0; k < NUM_CACHES; ++k) begin : g_arb_core_bus_if
for (genvar k = 0; k < NUM_CACHES; ++k) begin
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_if[k * NUM_REQS + i], arb_core_bus_tmp_if[k]);
end
end
for (genvar i = 0; i < NUM_CACHES; ++i) begin : g_cache_wrap
for (genvar i = 0; i < NUM_CACHES; ++i) begin : caches
`RESET_RELAY (cache_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, i))),
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
@ -180,48 +171,32 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.cache_perf (perf_cache_unit[i]),
`endif
.clk (clk),
.reset (reset),
.reset (cache_reset),
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
.mem_bus_if (cache_mem_bus_if[i * MEM_PORTS +: MEM_PORTS])
.mem_bus_if (cache_mem_bus_if[i])
);
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) arb_core_bus_tmp_if[NUM_CACHES]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);
for (genvar j = 0; j < NUM_CACHES; ++j) begin : g_arb_core_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_tmp_if[j], cache_mem_bus_if[j * MEM_PORTS + i]);
end
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.NUM_OUTPUTS (1),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (arb_core_bus_tmp_if),
.bus_out_if (mem_bus_tmp_if)
);
if (WRITE_ENABLE) begin : g_we
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
end else begin : g_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
end
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
endmodule

View file

@ -14,6 +14,8 @@
`include "VX_cache_define.vh"
module VX_cache_data #(
parameter `STRING INSTANCE_ID= "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
@ -29,116 +31,169 @@ module VX_cache_data #(
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
input wire clk,
input wire reset,
// inputs
`IGNORE_UNUSED_BEGIN
input wire[`UP(UUID_WIDTH)-1:0] req_uuid,
`IGNORE_UNUSED_END
input wire stall,
input wire init,
input wire read,
input wire fill,
input wire flush,
input wire read,
input wire write,
input wire [`CS_LINE_SEL_BITS-1:0] line_idx,
input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way,
input wire [NUM_WAYS-1:0] tag_matches,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
input wire [`CS_WORD_WIDTH-1:0] write_word,
input wire [WORD_SIZE-1:0] write_byteen,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] word_idx,
input wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_r,
// outputs
output wire [`CS_LINE_WIDTH-1:0] read_data,
output wire [LINE_SIZE-1:0] evict_byteen
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (init)
`UNUSED_VAR (read)
`UNUSED_VAR (flush)
wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_write_mask
wire word_en = (`CS_WORDS_PER_LINE == 1) || (word_idx == i);
assign write_mask[i] = write_byteen & {WORD_SIZE{word_en}};
end
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
if (DIRTY_BYTES != 0) begin : g_dirty_bytes
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] byteen_rdata;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_byteen_store
wire [LINE_SIZE-1:0] byteen_wdata = {LINE_SIZE{write}}; // only asserted on writes
wire [LINE_SIZE-1:0] byteen_wren = {LINE_SIZE{init || fill || flush}} | write_mask;
wire byteen_write = ((fill || flush) && ((NUM_WAYS == 1) || (evict_way == i)))
|| (write && tag_matches[i])
|| init;
wire byteen_read = fill || flush;
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata;
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata;
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]);
assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]);
end
VX_sp_ram #(
.DATAW (LINE_SIZE),
.WRENW (LINE_SIZE),
.SIZE (`CS_LINES_PER_BANK),
.OUT_REG (1),
.RDW_MODE ("R")
.DATAW (LINE_SIZE * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK)
) byteen_store (
.clk (clk),
.reset (reset),
.read (byteen_read),
.write (byteen_write),
.wren (byteen_wren),
.addr (line_idx),
.wdata (byteen_wdata),
.rdata (byteen_rdata[i])
.read (write || fill || flush),
.write (init || write || fill || flush),
.wren (1'b1),
.addr (line_sel),
.wdata (bs_wdata),
.rdata (bs_rdata)
);
assign dirty_byteen = bs_rdata[way_idx];
end else begin
assign dirty_byteen = {LINE_SIZE{1'b1}};
end
assign evict_byteen = byteen_rdata[way_idx_r];
end else begin : g_no_dirty_bytes
`UNUSED_VAR (init)
`UNUSED_VAR (flush)
assign evict_byteen = '1; // update whole line
end
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_data_store
localparam WRENW = WRITE_ENABLE ? LINE_SIZE : 1;
wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [WRENW-1:0] line_wren;
if (WRITE_ENABLE) begin : g_wren
assign line_wdata = fill ? fill_data : {`CS_WORDS_PER_LINE{write_word}};
assign line_wren = {LINE_SIZE{fill}} | write_mask;
end else begin : g_no_wren
`UNUSED_VAR (write_word)
`UNUSED_VAR (write_mask)
assign line_wdata = fill_data;
assign line_wren = 1'b1;
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign flipped_rdata[j][i] = line_rdata[i][j];
end
end
wire line_write = (fill && ((NUM_WAYS == 1) || (evict_way == i)))
|| (write && tag_matches[i] && WRITE_ENABLE);
wire line_read = read || ((fill || flush) && WRITEBACK);
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.WRENW (WRENW),
.OUT_REG (1),
.RDW_MODE ("R")
) data_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),
.addr (line_idx),
.wdata (line_wdata),
.rdata (line_rdata[i])
);
assign dirty_data = flipped_rdata[way_idx];
end else begin
assign dirty_byteen = '0;
assign dirty_data = '0;
end
assign read_data = line_rdata[way_idx_r];
// order the data layout to perform ways multiplexing last.
// this allows converting the way index to binary in parallel with BRAM read access and way selection.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [BYTEENW-1:0] line_wren;
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i];
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end
end
assign line_wren = wren_w;
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data)
assign line_wdata = fill_data;
assign line_wren = fill;
end
VX_onehot_encoder #(
.N (NUM_WAYS)
) way_enc (
.data_in (way_sel),
.data_out (way_idx),
`UNUSED_PIN (valid_out)
);
wire line_read = (read && ~stall)
|| (WRITEBACK && (fill || flush));
wire line_write = write || fill;
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK),
.WRENW (BYTEENW),
.NO_RWCHECK (1),
.RW_ASSERT (1)
) data_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),
.addr (line_sel),
.wdata (line_wdata),
.rdata (line_rdata)
);
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = line_rdata[wsel];
end else begin
`UNUSED_VAR (wsel)
assign per_way_rdata = line_rdata;
end
assign read_data = per_way_rdata[way_idx];
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data));
end
if (read && ~stall) begin
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
end
if (write && ~stall) begin
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
end
end
`endif
endmodule

View file

@ -22,7 +22,6 @@
`define CS_LINE_WIDTH (8 * LINE_SIZE)
`define CS_BANK_SIZE (CACHE_SIZE / NUM_BANKS)
`define CS_WAY_SEL_BITS `CLOG2(NUM_WAYS)
`define CS_WAY_SEL_WIDTH `UP(`CS_WAY_SEL_BITS)
`define CS_LINES_PER_BANK (`CS_BANK_SIZE / (LINE_SIZE * NUM_WAYS))
`define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE)
@ -55,7 +54,12 @@
///////////////////////////////////////////////////////////////////////////////
`define CS_BANK_TO_FULL_ADDR(x, b) {x, (`XLEN-$bits(x))'(b << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)}
`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS]
`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
`define CS_MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `CS_BANK_SEL_BITS]
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
@ -70,10 +74,4 @@
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
///////////////////////////////////////////////////////////////////////////////
`define CS_REPL_RANDOM 0
`define CS_REPL_FIFO 1
`define CS_REPL_PLRU 2
`endif // VX_CACHE_DEFINE_VH

View file

@ -18,10 +18,6 @@ module VX_cache_flush #(
parameter NUM_REQS = 4,
// Number of banks
parameter NUM_BANKS = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Bank select latency
parameter BANK_SEL_LATENCY = 1
) (
@ -31,11 +27,8 @@ module VX_cache_flush #(
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
input wire [NUM_BANKS-1:0] bank_req_fire,
output wire [NUM_BANKS-1:0] flush_begin,
output wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
input wire [NUM_BANKS-1:0] flush_end
);
`UNUSED_PARAM (TAG_WIDTH)
localparam STATE_IDLE = 0;
localparam STATE_WAIT1 = 1;
localparam STATE_FLUSH = 2;
@ -48,13 +41,13 @@ module VX_cache_flush #(
wire no_inflight_reqs;
if (BANK_SEL_LATENCY != 0) begin : g_bank_sel_latency
if (BANK_SEL_LATENCY != 0) begin
localparam NUM_REQS_W = `CLOG2(NUM_REQS+1);
localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);
wire [NUM_REQS-1:0] core_bus_out_fire;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_fire
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready;
end
@ -81,7 +74,7 @@ module VX_cache_flush #(
`UNUSED_PIN (size)
);
end else begin : g_no_bank_sel_latency
end else begin
assign no_inflight_reqs = 0;
`UNUSED_VAR (bank_req_fire)
end
@ -89,38 +82,28 @@ module VX_cache_flush #(
reg [NUM_BANKS-1:0] flush_done, flush_done_n;
wire [NUM_REQS-1:0] flush_req_mask;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_flush_req_mask
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
end
wire flush_req_enable = (| flush_req_mask);
reg [NUM_REQS-1:0] lock_released, lock_released_n;
reg [`UP(UUID_WIDTH)-1:0] flush_uuid_r, flush_uuid_n;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_req
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire input_enable = ~flush_req_enable || lock_released[i];
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable;
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_bus_out_if[i].req_ready && input_enable;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_rsp
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_in_if[i].rsp_valid = core_bus_out_if[i].rsp_valid;
assign core_bus_in_if[i].rsp_data = core_bus_out_if[i].rsp_data;
assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready;
end
reg [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] core_bus_out_uuid;
wire [NUM_REQS-1:0] core_bus_out_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_uuid
if (UUID_WIDTH != 0) begin : g_uuid
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag.uuid;
end else begin : g_no_uuid
assign core_bus_out_uuid[i] = 0;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_ready
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready;
end
@ -128,17 +111,10 @@ module VX_cache_flush #(
state_n = state;
flush_done_n = flush_done;
lock_released_n = lock_released;
flush_uuid_n = flush_uuid_r;
case (state)
//STATE_IDLE:
default: begin
STATE_IDLE: begin
if (flush_req_enable) begin
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (flush_req_mask[i]) begin
flush_uuid_n = core_bus_out_uuid[i];
end
end
end
end
STATE_WAIT1: begin
@ -182,10 +158,8 @@ module VX_cache_flush #(
flush_done <= flush_done_n;
lock_released <= lock_released_n;
end
flush_uuid_r <= flush_uuid_n;
end
assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}};
assign flush_uuid = flush_uuid_r;
endmodule

View file

@ -24,23 +24,36 @@
// arrival and are dequeued in the same order.
// Each entry has a next pointer to the next entry pending for the same cache line.
//
// During the fill request, the MSHR will dequeue the MSHR entry at the fill_id location
// During the fill operation, the MSHR will release the MSHR entry at fill_id
// which represents the first request in the pending list that initiated the memory fill.
//
// The dequeue response directly follows the fill request and will release
// The dequeue operation directly follows the fill operation and will release
// all the subsequent entries linked to fill_id (pending the same cache line).
//
// During the allocation request, the MSHR will allocate the next free slot
// During the allocation operation, the MSHR will allocate the next free slot
// for the incoming core request. We return the allocated slot id as well as
// the slot id of the previous entry for the same cache line. This is used to
// link the new entry to the pending list.
// link the new entry to the pending list during finalization.
//
// The finalize request is used to persist or release the currently allocated MSHR entry
// if we had a cache miss or a hit, respectively.
// The lookup operation is used to find all pending entries for a given cache line.
// This is used by the cache bank to determine if a cache miss is already pending
// and therefore avoid issuing a memory fill request.
//
// The finalize operation is used to release the allocated MSHR entry if we had a hit.
// If we had a miss and finalize_pending is true, we link the allocated entry to
// its corresponding pending list (via finalize_prev).
//
// Warning: This MSHR implementation is strongly coupled with the bank pipeline
// and as such changes to either module requires careful evaluation.
//
// This architecture implements three pipeline stages:
// - Arbitration: cache bank arbitration before entering pipeline.
// fill and dequeue operations are executed at this stage.
// - stage 0: cache bank tag access stage.
// allocate and lookup operations are executed at this stage.
// - stage 1: cache bank data access stage.
// finalize operation is executed at this stage.
//
module VX_cache_mshr #(
parameter `STRING INSTANCE_ID= "",
@ -55,9 +68,6 @@ module VX_cache_mshr #(
parameter UUID_WIDTH = 0,
// MSHR parameters
parameter DATA_WIDTH = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE)
) (
input wire clk,
@ -65,7 +75,7 @@ module VX_cache_mshr #(
`IGNORE_UNUSED_BEGIN
input wire[`UP(UUID_WIDTH)-1:0] deq_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] alc_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] lkp_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid,
`IGNORE_UNUSED_END
@ -88,21 +98,26 @@ module VX_cache_mshr #(
input wire allocate_rw,
input wire [DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire allocate_pending,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_previd,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_prev,
output wire allocate_ready,
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_pending,
output wire [MSHR_SIZE-1:0] lookup_rw,
// finalize
input wire finalize_valid,
input wire finalize_is_release,
input wire finalize_is_pending,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_previd,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id
input wire finalize_release,
input wire finalize_pending,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_prev
);
`UNUSED_PARAM (BANK_ID)
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [0:MSHR_SIZE-1];
reg [MSHR_ADDR_WIDTH-1:0] next_index [0:MSHR_SIZE-1];
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0];
reg [MSHR_ADDR_WIDTH-1:0] next_index [MSHR_SIZE-1:0];
reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
reg [MSHR_SIZE-1:0] next_table, next_table_x, next_table_n;
@ -120,8 +135,8 @@ module VX_cache_mshr #(
wire dequeue_fire = dequeue_valid && dequeue_ready;
wire [MSHR_SIZE-1:0] addr_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_addr_matches
assign addr_matches[i] = valid_table[i] && (addr_table[i] == allocate_addr);
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign addr_matches[i] = valid_table[i] && (addr_table[i] == lookup_addr);
end
VX_lzc #(
@ -133,13 +148,11 @@ module VX_cache_mshr #(
.valid_out (allocate_rdy_n)
);
// find matching tail-entry
VX_priority_encoder #(
VX_onehot_encoder #(
.N (MSHR_SIZE)
) prev_sel (
.data_in (addr_matches & ~next_table_x),
.index_out (prev_idx),
`UNUSED_PIN (onehot_out),
.data_out (prev_idx),
`UNUSED_PIN (valid_out)
);
@ -158,22 +171,17 @@ module VX_cache_mshr #(
valid_table_n[dequeue_id] = 0;
if (next_table[dequeue_id]) begin
dequeue_id_n = next_index[dequeue_id];
end else if (finalize_valid && finalize_is_pending && (finalize_previd == dequeue_id)) begin
dequeue_id_n = finalize_id;
end else begin
dequeue_val_n = 0;
end
end
if (finalize_valid) begin
if (finalize_is_release) begin
if (finalize_release) begin
valid_table_n[finalize_id] = 0;
end
// warning: This code allows 'finalize_is_pending' to be asserted regardless of hit/miss
// to reduce its propagation delay into the MSHR. This is safe because wrong updates
// to 'next_table_n' will be cleared during 'allocate_fire' below.
if (finalize_is_pending) begin
next_table_x[finalize_previd] = 1;
if (finalize_pending) begin
next_table_x[finalize_prev] = 1;
end
end
@ -196,12 +204,12 @@ module VX_cache_mshr #(
end
if (allocate_fire) begin
addr_table[allocate_id] <= allocate_addr;
addr_table[allocate_id] <= allocate_addr;
write_table[allocate_id] <= allocate_rw;
end
if (finalize_valid && finalize_is_pending) begin
next_index[finalize_previd] <= finalize_id;
if (finalize_valid && finalize_pending) begin
next_index[finalize_prev] <= finalize_id;
end
dequeue_id_r <= dequeue_id_n;
@ -209,21 +217,20 @@ module VX_cache_mshr #(
next_table <= next_table_n;
end
`RUNTIME_ASSERT(~(allocate_fire && valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, alc_req_uuid))
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
`RUNTIME_ASSERT(~(finalize_valid && ~valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT(~(fill_valid && ~valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
VX_dp_ram #(
.DATAW (DATA_WIDTH),
.SIZE (MSHR_SIZE),
.RDW_MODE ("R"),
.RADDR_REG (1)
) mshr_store (
.DATAW (DATA_WIDTH),
.SIZE (MSHR_SIZE),
.LUTRAM (1)
) entries (
.clk (clk),
.reset (reset),
.read (1'b1),
@ -238,20 +245,19 @@ module VX_cache_mshr #(
assign fill_addr = addr_table[fill_id];
assign allocate_ready = allocate_rdy;
assign allocate_id = allocate_id_r;
assign allocate_previd = prev_idx;
assign allocate_id = allocate_id_r;
assign allocate_prev = prev_idx;
if (WRITEBACK) begin : g_pending_wb
assign allocate_pending = |addr_matches;
end else begin : g_pending_wt
// exclude write requests if writethrough
assign allocate_pending = |(addr_matches & ~write_table);
end
assign dequeue_valid = dequeue_val;
assign dequeue_addr = addr_table[dequeue_id_r];
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
assign dequeue_valid = dequeue_val;
assign dequeue_addr = addr_table[dequeue_id_r];
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
// return pending entries for the given cache line
assign lookup_pending = addr_matches;
assign lookup_rw = write_table;
`UNUSED_VAR (lookup_valid)
`ifdef DBG_TRACE_CACHE
reg show_table;
@ -259,42 +265,37 @@ module VX_cache_mshr #(
if (reset) begin
show_table <= 0;
end else begin
show_table <= allocate_fire || finalize_valid || fill_valid || dequeue_fire;
end
if (allocate_fire) begin
`TRACE(3, ("%t: %s allocate: addr=0x%0h, id=%0d, pending=%b, prev=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id, allocate_pending, prev_idx, alc_req_uuid))
end
if (finalize_valid && finalize_is_release) begin
`TRACE(3, ("%t: %s release: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid))
end
if (finalize_valid && finalize_is_pending) begin
`TRACE(3, ("%t: %s finalize: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid))
end
if (fill_valid) begin
`TRACE(3, ("%t: %s fill: addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id))
end
if (dequeue_fire) begin
`TRACE(3, ("%t: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid))
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
end
if (allocate_fire)
`TRACE(3, ("%d: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid));
if (lookup_valid)
`TRACE(3, ("%d: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid));
if (finalize_valid)
`TRACE(3, ("%d: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid));
if (fill_valid)
`TRACE(3, ("%d: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
if (dequeue_fire)
`TRACE(3, ("%d: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
if (show_table) begin
`TRACE(3, ("%t: %s table", $time, INSTANCE_ID))
`TRACE(3, ("%d: %s table", $time, INSTANCE_ID));
for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin
`TRACE(3, (" %0d=0x%0h", i, `CS_BANK_TO_FULL_ADDR(addr_table[i], BANK_ID)))
if (write_table[i]) begin
`TRACE(3, ("(w)"))
end else begin
`TRACE(3, ("(r)"))
end
if (next_table[i]) begin
`TRACE(3, ("->%0d", next_index[i]))
end
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));
if (write_table[i])
`TRACE(3, ("(w)"));
else
`TRACE(3, ("(r)"));
if (next_table[i])
`TRACE(3, ("->%0d", next_index[i]));
end
end
`TRACE(3, ("\n"))
`TRACE(3, ("\n"));
end
end
`endif

View file

@ -1,210 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Fast PLRU encoder and decoder utility
// Adapted from BaseJump STL: http://bjump.org/data_out.html
// Tree-PLRU update decoder.
//
// For a given way index, produces one data bit and one mask bit per tree node
// (NUM_WAYS-1 nodes, node 0 being the root, node i having children 2i+1 and 2i+2):
//   - lru_data[i] is the inverse of the way_idx bit examined at node i.
//   - lru_mask[i] is set only for nodes that lie on the path selected by way_idx,
//     i.e. the nodes whose stored bit should be overwritten with lru_data[i].
// When NUM_WAYS == 1 there is no tree and both outputs are tied to zero.
module plru_decoder #(
    parameter NUM_WAYS      = 1,
    parameter WAY_IDX_BITS  = $clog2(NUM_WAYS),
    parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
) (
    input wire [WAY_IDX_WIDTH-1:0]    way_idx,
    output wire [`UP(NUM_WAYS-1)-1:0] lru_data,
    output wire [`UP(NUM_WAYS-1)-1:0] lru_mask
);
    if (NUM_WAYS > 1) begin : g_dec
        wire [`UP(NUM_WAYS-1)-1:0] node_data;
        // node_mask bits feed other node_mask bits (child depends on parent)
    `IGNORE_UNOPTFLAT_BEGIN
        wire [`UP(NUM_WAYS-1)-1:0] node_mask;
    `IGNORE_UNOPTFLAT_END

        for (genvar i = 0; i < NUM_WAYS-1; ++i) begin : g_i
            // way_idx bit examined at node i; the parent node examines bit PATH_BIT+1
            localparam PATH_BIT = WAY_IDX_BITS - $clog2(i+2);

            // new value for this node: point away from the given way
            assign node_data[i] = ~way_idx[PATH_BIT];

            if (i == 0) begin : g_i_0
                // the root is on every path
                assign node_mask[i] = 1'b1;
            end else if (i % 2 == 1) begin : g_i_odd
                // odd node: on the path when its parent is and the parent's bit is 0
                assign node_mask[i] = node_mask[(i-1)/2] & ~way_idx[PATH_BIT+1];
            end else begin : g_i_even
                // even node: on the path when its parent is and the parent's bit is 1
                assign node_mask[i] = node_mask[(i-2)/2] & way_idx[PATH_BIT+1];
            end
        end

        assign lru_data = node_data;
        assign lru_mask = node_mask;
    end else begin : g_no_dec
        `UNUSED_VAR (way_idx)
        assign lru_data = '0;
        assign lru_mask = '0;
    end
endmodule
// Tree-PLRU victim encoder.
//
// Walks the stored tree bits (lru_in, NUM_WAYS-1 nodes, root at bit 0) from the
// root downwards and emits the way index they point to, most-significant bit first:
// the root bit becomes the top bit of way_idx, and each following bit is the tree
// bit of the node selected by the bits already resolved.
// When NUM_WAYS == 1 the output is tied to zero.
module plru_encoder #(
    parameter NUM_WAYS      = 1,
    parameter WAY_IDX_BITS  = $clog2(NUM_WAYS),
    parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
) (
    input wire [`UP(NUM_WAYS-1)-1:0] lru_in,
    output wire [WAY_IDX_WIDTH-1:0]  way_idx
);
    if (NUM_WAYS > 1) begin : g_enc
        wire [WAY_IDX_BITS-1:0] victim;

        for (genvar i = 0; i < WAY_IDX_BITS; ++i) begin : g_i
            if (i != 0) begin : g_i_n
                // level i of the tree holds LEVEL_NODES nodes starting at index LEVEL_NODES-1
                localparam LEVEL_NODES = 2**i;
                VX_mux #(
                    .N (LEVEL_NODES)
                ) mux (
                    .data_in  (lru_in[(LEVEL_NODES-1) +: LEVEL_NODES]),
                    .sel_in   (victim[WAY_IDX_BITS-1 -: i]), // bits resolved so far
                    .data_out (victim[WAY_IDX_BITS-1-i])
                );
            end else begin : g_i_0
                // root node decides the top bit
                // (WAY_IDX_WIDTH equals WAY_IDX_BITS here with default parameters, since NUM_WAYS > 1)
                assign victim[WAY_IDX_WIDTH-1] = lru_in[0];
            end
        end

        assign way_idx = victim;
    end else begin : g_no_enc
        `UNUSED_VAR (lru_in)
        assign way_idx = '0;
    end
endmodule
// Cache replacement-way selector.
//
// Produces the way to replace (repl_way) for the line addressed by repl_line,
// according to REPL_POLICY:
//   - `CS_REPL_PLRU : per-line tree bits, updated on lookup hits, decoded on replacement.
//   - `CS_REPL_FIFO : per-line counter, advanced each time a replacement is requested.
//   - otherwise     : a single counter shared by all lines, advanced on every non-stalled cycle.
// With a single way (NUM_WAYS == 1) no state is kept and repl_way is constant zero.
module VX_cache_repl #(
    parameter CACHE_SIZE  = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE   = 64,
    // Number of banks
    parameter NUM_BANKS   = 1,
    // Number of associative ways
    parameter NUM_WAYS    = 1,
    // replacement policy
    parameter REPL_POLICY = `CS_REPL_FIFO
) (
    input wire clk,
    input wire reset,
    input wire stall,
    input wire init,
    input wire lookup_valid,
    input wire lookup_hit,
    input wire [`CS_LINE_SEL_BITS-1:0] lookup_line,
    input wire [`CS_WAY_SEL_WIDTH-1:0] lookup_way,
    input wire repl_valid,
    input wire [`CS_LINE_SEL_BITS-1:0] repl_line,
    output wire [`CS_WAY_SEL_WIDTH-1:0] repl_way
);
    localparam WAY_SEL_WIDTH = `CS_WAY_SEL_WIDTH;

    // not every policy consumes these inputs
    `UNUSED_VAR (reset)
    `UNUSED_VAR (init)
    `UNUSED_VAR (stall)

    if (NUM_WAYS > 1) begin : g_enable
        if (REPL_POLICY == `CS_REPL_PLRU) begin : g_plru
            // Pseudo Least Recently Used replacement policy
            localparam LRU_WIDTH = `UP(NUM_WAYS-1);

            wire [LRU_WIDTH-1:0] plru_rdata;
            wire [LRU_WIDTH-1:0] plru_wdata;
            wire [LRU_WIDTH-1:0] plru_wmask;

            // init overwrites every tree bit with zero; a hit updates only the masked bits
            wire                 plru_write    = init || (lookup_valid && lookup_hit);
            wire [LRU_WIDTH-1:0] plru_wren_sel = init ? '1 : plru_wmask;
            wire [LRU_WIDTH-1:0] plru_wdata_sel = init ? '0 : plru_wdata;

            // tree bits / write mask for the way that was just hit
            plru_decoder #(
                .NUM_WAYS (NUM_WAYS)
            ) plru_dec (
                .way_idx  (lookup_way),
                .lru_data (plru_wdata),
                .lru_mask (plru_wmask)
            );

            // per-line tree state; written at lookup_line, read at repl_line
            // NOTE(review): during init the write address is lookup_line - presumably
            // the caller drives it with the line being initialized; confirm.
            VX_dp_ram #(
                .DATAW     (LRU_WIDTH),
                .SIZE      (`CS_LINES_PER_BANK),
                .WRENW     (LRU_WIDTH),
                .RDW_MODE  ("R"),
                .RADDR_REG (1)
            ) plru_store (
                .clk   (clk),
                .reset (1'b0),
                .read  (repl_valid),
                .write (plru_write),
                .wren  (plru_wren_sel),
                .waddr (lookup_line),
                .raddr (repl_line),
                .wdata (plru_wdata_sel),
                .rdata (plru_rdata)
            );

            // victim way the stored tree bits point to
            plru_encoder #(
                .NUM_WAYS (NUM_WAYS)
            ) plru_enc (
                .lru_in  (plru_rdata),
                .way_idx (repl_way)
            );

        end else if (REPL_POLICY == `CS_REPL_FIFO) begin : g_fifo
            // FIFO replacement policy
            `UNUSED_VAR (lookup_valid)
            `UNUSED_VAR (lookup_hit)
            `UNUSED_VAR (lookup_line)
            `UNUSED_VAR (lookup_way)

            wire [WAY_SEL_WIDTH-1:0] fifo_rdata;
            // next victim for this line: current one plus one (wraps at WAY_SEL_WIDTH bits)
            wire [WAY_SEL_WIDTH-1:0] fifo_wdata = fifo_rdata + 1;

            // init writes zero; a replacement request writes back the advanced counter
            wire                     fifo_write     = init || repl_valid;
            wire [WAY_SEL_WIDTH-1:0] fifo_wdata_sel = init ? '0 : fifo_wdata;

            // per-line counter, read and written at repl_line
            VX_sp_ram #(
                .DATAW     (WAY_SEL_WIDTH),
                .SIZE      (`CS_LINES_PER_BANK),
                .RDW_MODE  ("R"),
                .RADDR_REG (1)
            ) fifo_store (
                .clk   (clk),
                .reset (1'b0),
                .read  (repl_valid),
                .write (fifo_write),
                .wren  (1'b1),
                .addr  (repl_line),
                .wdata (fifo_wdata_sel),
                .rdata (fifo_rdata)
            );

            assign repl_way = fifo_rdata;

        end else begin : g_random
            // Random replacement policy:
            // a single counter, shared by all lines, that advances on every cycle
            // in which stall is low.
            `UNUSED_VAR (lookup_valid)
            `UNUSED_VAR (lookup_hit)
            `UNUSED_VAR (lookup_line)
            `UNUSED_VAR (lookup_way)
            `UNUSED_VAR (repl_valid)
            `UNUSED_VAR (repl_line)

            reg [WAY_SEL_WIDTH-1:0] way_counter;

            always @(posedge clk) begin
                if (reset) begin
                    way_counter <= '0;
                end else if (~stall) begin
                    way_counter <= way_counter + WAY_SEL_WIDTH'(1);
                end
            end

            assign repl_way = way_counter;
        end
    end else begin : g_disable
        // single way: nothing to choose from
        `UNUSED_VAR (clk)
        `UNUSED_VAR (lookup_valid)
        `UNUSED_VAR (lookup_hit)
        `UNUSED_VAR (lookup_line)
        `UNUSED_VAR (lookup_way)
        `UNUSED_VAR (repl_valid)
        `UNUSED_VAR (repl_line)
        assign repl_way = '0;
    end
endmodule

View file

@ -14,6 +14,8 @@
`include "VX_cache_define.vh"
module VX_cache_tags #(
parameter `STRING INSTANCE_ID = "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
@ -25,61 +27,96 @@ module VX_cache_tags #(
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeback
parameter WRITEBACK = 0
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
input wire clk,
input wire reset,
// inputs
`IGNORE_UNUSED_BEGIN
input wire [`UP(UUID_WIDTH)-1:0] req_uuid,
`IGNORE_UNUSED_END
input wire stall,
// init/fill/lookup
input wire init,
input wire flush,
input wire fill,
input wire read,
input wire write,
input wire [`CS_LINE_SEL_BITS-1:0] line_idx,
input wire [`CS_TAG_SEL_BITS-1:0] line_tag,
input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way,
// outputs
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches,
// eviction
output wire evict_dirty,
output wire [NUM_WAYS-1:0] evict_way,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
);
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (lookup)
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
wire [NUM_WAYS-1:0] read_dirty;
`UNUSED_VAR (read)
if (WRITEBACK) begin : g_evict_tag_wb
assign evict_dirty = read_dirty[evict_way];
assign evict_tag = read_tag[evict_way];
end else begin : g_evict_tag_wt
`UNUSED_VAR (read_dirty)
assign evict_dirty = 1'b0;
assign evict_tag = '0;
if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] evict_way_r;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
evict_way_r <= 1;
end else if (~stall) begin // holding the value on stalls prevents filling different slots twice
evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]};
end
end
assign evict_way = fill ? evict_way_r : way_sel;
VX_onehot_mux #(
.DATAW (`CS_TAG_SEL_BITS),
.N (NUM_WAYS)
) evict_tag_sel (
.data_in (read_tag),
.sel_in (evict_way),
.data_out (evict_tag)
);
end else begin
`UNUSED_VAR (stall)
assign evict_way = 1'b1;
assign evict_tag = read_tag;
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_store
wire way_en = (NUM_WAYS == 1) || (evict_way == i);
wire do_init = init; // init all ways
wire do_fill = fill && way_en;
wire do_flush = flush && (!WRITEBACK || way_en); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i]; // only write on tag hit
// fill and flush need to also read in writeback mode
wire fill_s = fill && (!WRITEBACK || ~stall);
wire flush_s = flush && (!WRITEBACK || ~stall);
wire line_read = read || write || (WRITEBACK && (fill || flush));
wire line_write = do_init || do_fill || do_flush || do_write;
wire line_valid = fill || write;
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire do_fill = fill_s && evict_way[i];
wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i];
wire line_read = (WRITEBACK && (fill_s || flush_s));
wire line_write = init || do_fill || do_flush || do_write;
wire line_valid = ~(init || flush);
wire [TAG_WIDTH-1:0] line_wdata;
wire [TAG_WIDTH-1:0] line_rdata;
if (WRITEBACK) begin : g_wdata
if (WRITEBACK) begin
assign line_wdata = {line_valid, write, line_tag};
assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata;
end else begin : g_wdata
end else begin
assign line_wdata = {line_valid, line_tag};
assign {read_valid[i], read_tag[i]} = line_rdata;
assign read_dirty[i] = 1'b0;
@ -88,22 +125,52 @@ module VX_cache_tags #(
VX_sp_ram #(
.DATAW (TAG_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.RDW_MODE ("W"),
.RADDR_REG (1)
.NO_RWCHECK (1),
.RW_ASSERT (1)
) tag_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (1'b1),
.addr (line_idx),
.addr (line_sel),
.wdata (line_wdata),
.rdata (line_rdata)
);
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_matches
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
assign evict_dirty = | (read_dirty & evict_way);
`ifdef DBG_TRACE_CACHE
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel};
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID)));
end
if (init) begin
`TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty));
end
if (lookup && ~stall) begin
if (tag_matches != 0) begin
if (write)
`TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
end else begin
if (write)
`TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
end
end
end
`endif
endmodule

View file

@ -19,11 +19,8 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 65536,
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
@ -31,39 +28,39 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 16,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 8,
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 8,
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 8,
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 1,
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 1,
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = 32,
parameter TAG_WIDTH = 16,
// Core response output buffer
parameter CORE_OUT_BUF = 3,
parameter CORE_OUT_BUF = 2,
// Memory request output buffer
parameter MEM_OUT_BUF = 3,
parameter MEM_OUT_BUF = 2,
parameter MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH)
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
) (
input wire clk,
input wire reset,
@ -74,35 +71,35 @@ module VX_cache_top import VX_gpu_pkg::*; #(
`endif
// Core request
input wire core_req_valid [NUM_REQS],
input wire core_req_rw [NUM_REQS],
input wire[WORD_SIZE-1:0] core_req_byteen [NUM_REQS],
input wire[`CS_WORD_ADDR_WIDTH-1:0] core_req_addr [NUM_REQS],
input wire[`MEM_REQ_FLAGS_WIDTH-1:0] core_req_flags [NUM_REQS],
input wire[`CS_WORD_WIDTH-1:0] core_req_data [NUM_REQS],
input wire[TAG_WIDTH-1:0] core_req_tag [NUM_REQS],
output wire core_req_ready [NUM_REQS],
input wire [NUM_REQS-1:0] core_req_valid,
input wire [NUM_REQS-1:0] core_req_rw,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype,
input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_REQS-1:0] core_req_ready,
// Core response
output wire core_rsp_valid [NUM_REQS],
output wire[`CS_WORD_WIDTH-1:0] core_rsp_data [NUM_REQS],
output wire[TAG_WIDTH-1:0] core_rsp_tag [NUM_REQS],
input wire core_rsp_ready [NUM_REQS],
output wire [NUM_REQS-1:0] core_rsp_valid,
output wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
input wire [NUM_REQS-1:0] core_rsp_ready,
// Memory request
output wire mem_req_valid [MEM_PORTS],
output wire mem_req_rw [MEM_PORTS],
output wire [LINE_SIZE-1:0] mem_req_byteen [MEM_PORTS],
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr [MEM_PORTS],
output wire [`CS_LINE_WIDTH-1:0] mem_req_data [MEM_PORTS],
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag [MEM_PORTS],
input wire mem_req_ready [MEM_PORTS],
output wire mem_req_valid,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid [MEM_PORTS],
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data [MEM_PORTS],
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag [MEM_PORTS],
output wire mem_rsp_ready [MEM_PORTS]
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -112,7 +109,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_if[MEM_PORTS]();
) mem_bus_if();
// Core request
for (genvar i = 0; i < NUM_REQS; ++i) begin
@ -120,7 +117,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
assign core_bus_if[i].req_data.rw = core_req_rw[i];
assign core_bus_if[i].req_data.byteen = core_req_byteen[i];
assign core_bus_if[i].req_data.addr = core_req_addr[i];
assign core_bus_if[i].req_data.flags = core_req_flags[i];
assign core_bus_if[i].req_data.atype = core_req_atype[i];
assign core_bus_if[i].req_data.data = core_req_data[i];
assign core_bus_if[i].req_data.tag = core_req_tag[i];
assign core_req_ready[i] = core_bus_if[i].req_ready;
@ -128,32 +125,29 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Core response
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_valid[i]= core_bus_if[i].rsp_valid;
assign core_rsp_valid[i] = core_bus_if[i].rsp_valid;
assign core_rsp_data[i] = core_bus_if[i].rsp_data.data;
assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag;
assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag;
assign core_bus_if[i].rsp_ready = core_rsp_ready[i];
end
// Memory request
for (genvar i = 0; i < MEM_PORTS; ++i) begin
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
assign mem_req_byteen[i]= mem_bus_if[i].req_data.byteen;
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
assign mem_bus_if[i].req_ready = mem_req_ready[i];
end
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
// Memory response
for (genvar i = 0; i < MEM_PORTS; ++i) begin
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
end
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
VX_cache_wrap #(
VX_cache #(
.INSTANCE_ID (INSTANCE_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -161,7 +155,6 @@ module VX_cache_top import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),

View file

@ -21,26 +21,24 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 4,
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 16,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 4,
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 4,
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
@ -53,18 +51,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// core request flags
parameter FLAGS_WIDTH = 0,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0,
@ -72,10 +64,10 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter PASSTHRU = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 3,
parameter CORE_OUT_BUF = 0,
// Memory request output buffer
parameter MEM_OUT_BUF = 3
parameter MEM_OUT_BUF = 0
) (
input wire clk,
@ -87,16 +79,19 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
`endif
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, TAG_WIDTH);
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
localparam BYPASS_ENABLE = (NC_ENABLE || PASSTHRU);
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -106,21 +101,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
) mem_bus_cache_if[MEM_PORTS]();
) mem_bus_cache_if();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if[MEM_PORTS]();
if (NC_OR_BYPASS) begin
if (BYPASS_ENABLE) begin : g_bypass
`RESET_RELAY (nc_bypass_reset, reset);
VX_cache_bypass #(
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.TAG_SEL_IDX (TAG_SEL_IDX),
.CACHE_ENABLE (!PASSTHRU),
.PASSTHRU (PASSTHRU),
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -130,6 +122,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
@ -137,35 +130,51 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache_bypass (
.clk (clk),
.reset (reset),
.reset (nc_bypass_reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if(core_bus_cache_if),
.mem_bus_in_if (mem_bus_cache_if),
.mem_bus_out_if (mem_bus_tmp_if)
.mem_bus_out_if (mem_bus_if)
);
end else begin : g_no_bypass
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
for (genvar i = 0; i < NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if[i], mem_bus_cache_if[i]);
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
if (WRITE_ENABLE) begin : g_we
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end else begin : g_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end
end
if (PASSTHRU != 0) begin
if (PASSTHRU == 0) begin : g_cache
for (genvar i = 0; i < NUM_REQS; ++i) begin
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
`UNUSED_VAR (core_bus_cache_if[i].req_data)
assign core_bus_cache_if[i].req_ready = 0;
assign core_bus_cache_if[i].rsp_valid = 0;
assign core_bus_cache_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
end
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
`UNUSED_VAR (mem_bus_cache_if.req_ready)
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
assign mem_bus_cache_if.rsp_ready = 0;
`ifdef PERF_ENABLE
assign cache_perf = '0;
`endif
end else begin
`RESET_RELAY (cache_reset, reset);
VX_cache #(
.INSTANCE_ID (INSTANCE_ID),
@ -175,23 +184,20 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),
.CORE_OUT_BUF (BYPASS_ENABLE ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (BYPASS_ENABLE ? 1 : MEM_OUT_BUF)
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
) cache (
.clk (clk),
.reset (reset),
.reset (cache_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
@ -199,105 +205,64 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.mem_bus_if (mem_bus_cache_if)
);
end else begin : g_passthru
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
`UNUSED_VX_MEM_BUS_IF (core_bus_cache_if[i])
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_cache_if
`INIT_VX_MEM_BUS_IF (mem_bus_cache_if[i])
end
`ifdef PERF_ENABLE
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
perf_core_reads <= '0;
perf_core_writes <= '0;
perf_mem_stalls <= '0;
perf_crsp_stalls <= '0;
end else begin
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end
end
assign cache_perf.reads = perf_core_reads;
assign cache_perf.writes = perf_core_writes;
assign cache_perf.read_misses = '0;
assign cache_perf.write_misses = '0;
assign cache_perf.bank_stalls = '0;
assign cache_perf.mshr_stalls = '0;
assign cache_perf.mem_stalls = perf_mem_stalls;
assign cache_perf.crsp_stalls = perf_crsp_stalls;
`endif
end
`ifdef DBG_TRACE_CACHE
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace_core
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid;
if (UUID_WIDTH != 0) begin
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign core_req_uuid = 0;
assign core_rsp_uuid = 0;
end
wire core_req_fire = core_bus_if[i].req_valid && core_bus_if[i].req_ready;
wire core_rsp_fire = core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready;
always @(posedge clk) begin
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
if (core_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
end
if (core_req_fire) begin
if (core_bus_if[i].req_data.rw)
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
else
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
end
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end
end
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_trace_mem
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s mem-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s mem-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s mem-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign mem_req_uuid = 0;
assign mem_rsp_uuid = 0;
end
wire mem_req_fire = mem_bus_if.req_valid && mem_bus_if.req_ready;
wire mem_rsp_fire = mem_bus_if.rsp_valid && mem_bus_if.rsp_ready;
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
end
end
`endif

View file

@ -71,19 +71,19 @@ module VX_alu_int #(
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_add_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
assign add_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] + alu_in2_imm[i][31:0]));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_sub_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] sub_in1 = {is_signed & alu_in1[i][`XLEN-1], alu_in1[i]};
wire [`XLEN:0] sub_in2 = {is_signed & alu_in2_br[i][`XLEN-1], alu_in2_br[i]};
assign sub_result[i] = sub_in1 - sub_in2;
assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0]));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_shr_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
always @(*) begin
case (alu_op[1:0])
@ -102,7 +102,7 @@ module VX_alu_int #(
assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_msc_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
case (alu_op[1:0])
2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND
@ -114,7 +114,7 @@ module VX_alu_int #(
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW
end
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_alu_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
always @(*) begin
@ -141,9 +141,9 @@ module VX_alu_int #(
assign cbr_dest = add_result[0][1 +: `PC_BITS];
if (LANE_BITS != 0) begin : g_tid
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin : g_tid_0
end else begin
assign tid = 0;
end
@ -185,7 +185,7 @@ module VX_alu_int #(
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i];
end
@ -194,8 +194,8 @@ module VX_alu_int #(
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (br_enable) begin
`TRACE(2, ("%t: %s branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid))
`TRACE(1, ("%d: %s-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid));
end
end
`endif

View file

@ -68,7 +68,7 @@ module VX_alu_muldiv #(
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_tmp
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
@ -103,7 +103,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN:0] mul_in1;
wire [NUM_LANES-1:0][`XLEN:0] mul_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_in
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
end
@ -149,7 +149,7 @@ module VX_alu_muldiv #(
`else
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_multiplier
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
@ -184,7 +184,7 @@ module VX_alu_muldiv #(
`endif
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_out
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
@ -219,7 +219,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_in
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
@ -234,7 +234,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_in
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
@ -306,7 +306,7 @@ module VX_alu_muldiv #(
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_out
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
@ -324,8 +324,8 @@ module VX_alu_muldiv #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.ARBITER ("P"),
.OUT_BUF (2)
.ARBITER ("F"),
.OUT_BUF (1)
) rsp_buf (
.clk (clk),
.reset (reset),

View file

@ -30,24 +30,20 @@ module VX_alu_unit #(
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
localparam NUM_LANES = `NUM_ALU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
localparam PE_COUNT = 1 + `EXT_M_ENABLED;
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
localparam PE_IDX_INT = 0;
localparam PE_IDX_MDV = PE_IDX_INT + `EXT_M_ENABLED;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (PARTIAL_BW ? 3 : 0)
.OUT_BUF (PARTIAL_BW ? 1 : 0)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -55,62 +51,103 @@ module VX_alu_unit #(
.execute_if (per_block_execute_if)
);
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_alus
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1));
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) pe_execute_if[PE_COUNT]();
) int_execute_if();
VX_commit_if#(
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) pe_commit_if[PE_COUNT]();
) int_commit_if();
reg [`UP(PE_SEL_BITS)-1:0] pe_select;
always @(*) begin
pe_select = PE_IDX_INT;
if (`EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV))
pe_select = PE_IDX_MDV;
end
VX_pe_switch #(
.PE_COUNT (PE_COUNT),
.NUM_LANES (NUM_LANES),
.ARBITER ("R"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (PARTIAL_BW ? 1 : 3)
) pe_switch (
.clk (clk),
.reset (reset),
.pe_sel (pe_select),
.execute_in_if (per_block_execute_if[block_idx]),
.commit_out_if (per_block_commit_if[block_idx]),
.execute_out_if (pe_execute_if),
.commit_in_if (pe_commit_if)
);
assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = per_block_execute_if[block_idx].data;
VX_alu_int #(
.INSTANCE_ID (`SFORMATF(("%s-int%0d", INSTANCE_ID, block_idx))),
.INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
.BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES)
) alu_int (
.clk (clk),
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_INT]),
.reset (block_reset),
.execute_if (int_execute_if),
.branch_ctl_if (branch_ctl_if[block_idx]),
.commit_if (pe_commit_if[PE_IDX_INT])
.commit_if (int_commit_if)
);
`ifdef EXT_M_ENABLE
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) muldiv_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) muldiv_commit_if();
assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
assign muldiv_execute_if.data = per_block_execute_if[block_idx].data;
VX_alu_muldiv #(
.INSTANCE_ID (`SFORMATF(("%s-muldiv%0d", INSTANCE_ID, block_idx))),
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
.NUM_LANES (NUM_LANES)
) muldiv_unit (
.clk (clk),
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_MDV]),
.commit_if (pe_commit_if[PE_IDX_MDV])
.reset (block_reset),
.execute_if (muldiv_execute_if),
.commit_if (muldiv_commit_if)
);
`endif
// Back-pressure to the dispatcher: report the ready signal of the unit that
// the current instruction is steered to (mul/div when is_muldiv_op is set and
// EXT_M_ENABLE is defined, otherwise the integer ALU).
assign per_block_execute_if[block_idx].ready =
`ifdef EXT_M_ENABLE
is_muldiv_op ? muldiv_execute_if.ready :
`endif
int_execute_if.ready;
// send response
// Merge the commit streams of the integer ALU and (when EXT_M_ENABLE is
// defined) the mul/div unit into this block's single commit interface.
// In each concatenation the integer ALU entry is last, i.e. it occupies the
// least-significant slot (input 0); the mul/div entry, when present, is input 1.
// NOTE(review): RSP_ARB_SIZE is declared as 1 + `EXT_M_ENABLED, so it is
// expected to match the number of concatenated inputs - confirm that
// `EXT_M_ENABLED tracks the EXT_M_ENABLE define.
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (PARTIAL_BW ? 1 : 3),
.ARBITER ("F")
) rsp_arb (
.clk (clk),
.reset (block_reset),
.valid_in ({
`ifdef EXT_M_ENABLE
muldiv_commit_if.valid,
`endif
int_commit_if.valid
}),
.ready_in ({
`ifdef EXT_M_ENABLE
muldiv_commit_if.ready,
`endif
int_commit_if.ready
}),
.data_in ({
`ifdef EXT_M_ENABLE
muldiv_commit_if.data,
`endif
int_commit_if.data
}),
.data_out (per_block_commit_if[block_idx].data),
.valid_out (per_block_commit_if[block_idx].valid),
.ready_out (per_block_commit_if[block_idx].ready),
// the arbiter's selected-input index is not used here
`UNUSED_PIN (sel_out)
);
end
VX_gather_unit #(

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_commit import VX_gpu_pkg::*; #(
module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
@ -41,26 +41,28 @@ module VX_commit import VX_gpu_pkg::*; #(
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_arbs
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [`NUM_EX_UNITS-1:0] valid_in;
wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
wire [`NUM_EX_UNITS-1:0] ready_in;
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_data_in
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid;
assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data;
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (`NUM_EX_UNITS),
.DATAW (DATAW),
.ARBITER ("P"),
.ARBITER ("R"),
.OUT_BUF (1)
) commit_arb (
.clk (clk),
.reset (reset),
.reset (arb_reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
@ -84,7 +86,7 @@ module VX_commit import VX_gpu_pkg::*; #(
assign commit_fire_any = (| per_issue_commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_size
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, per_issue_commit_tmask[i]);
assign commit_size[i] = count;
@ -101,7 +103,7 @@ module VX_commit import VX_gpu_pkg::*; #(
.data_out ({commit_fire_any_r, commit_size_r})
);
VX_reduce_tree #(
VX_reduce #(
.DATAW_IN (COMMIT_SIZEW),
.DATAW_OUT (COMMIT_ALL_SIZEW),
.N (`ISSUE_WIDTH),
@ -160,7 +162,7 @@ module VX_commit import VX_gpu_pkg::*; #(
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = commit_arb_if[i].valid && commit_arb_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid);
@ -174,15 +176,15 @@ module VX_commit import VX_gpu_pkg::*; #(
end
`ifdef DBG_TRACE_PIPELINE
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_trace
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_j
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
always @(posedge clk) begin
if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}))
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
trace_ex_type(1, j);
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop))
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS)
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid))
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop));
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid));
end
end
end

View file

@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
input sysmem_perf_t sysmem_perf,
VX_mem_perf_if.slave mem_perf_if,
`endif
VX_dcr_bus_if.slave dcr_bus_if,
@ -65,37 +65,44 @@ module VX_core import VX_gpu_pkg::*; #(
) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
lmem_perf_t lmem_perf;
coalescer_perf_t coalescer_perf;
pipeline_perf_t pipeline_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.lmem = lmem_perf;
sysmem_perf_tmp.coalescer = coalescer_perf;
end
// Core-local perf interfaces handed to VX_execute.
// mem_perf_tmp_if mirrors the incoming mem_perf_if for the icache, dcache,
// l2cache, l3cache and mem fields. The lmem field is deliberately not copied
// here: under LMEM_ENABLE it is connected to VX_lmem_unit's cache_perf port
// further down in this module.
VX_mem_perf_if mem_perf_tmp_if();
// Collects the scheduler/issue counters and the fetch/load/store statistics
// computed in this module.
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`RESET_RELAY (dcr_data_reset, reset);
`RESET_RELAY (schedule_reset, reset);
`RESET_RELAY (fetch_reset, reset);
`RESET_RELAY (decode_reset, reset);
`RESET_RELAY (issue_reset, reset);
`RESET_RELAY (execute_reset, reset);
`RESET_RELAY (commit_reset, reset);
base_dcrs_t base_dcrs;
VX_dcr_data dcr_data (
.clk (clk),
.reset (reset),
.reset (dcr_data_reset),
.dcr_bus_if (dcr_bus_if),
.base_dcrs (base_dcrs)
);
`SCOPE_IO_SWITCH (3);
`SCOPE_IO_SWITCH (3)
VX_schedule #(
.INSTANCE_ID (`SFORMATF(("%s-schedule", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (reset),
.reset (schedule_reset),
`ifdef PERF_ENABLE
.sched_perf (pipeline_perf.sched),
.sched_perf (pipeline_perf_if.sched),
`endif
.base_dcrs (base_dcrs),
@ -116,36 +123,36 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_fetch #(
.INSTANCE_ID (`SFORMATF(("%s-fetch", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID))
) fetch (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (reset),
.reset (fetch_reset),
.icache_bus_if (icache_bus_if),
.schedule_if (schedule_if),
.fetch_if (fetch_if)
);
VX_decode #(
.INSTANCE_ID (`SFORMATF(("%s-decode", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID))
) decode (
.clk (clk),
.reset (reset),
.reset (decode_reset),
.fetch_if (fetch_if),
.decode_if (decode_if),
.decode_sched_if(decode_sched_if)
);
VX_issue #(
.INSTANCE_ID (`SFORMATF(("%s-issue", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID))
) issue (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (reset),
.reset (issue_reset),
`ifdef PERF_ENABLE
.issue_perf (pipeline_perf.issue),
.issue_perf (pipeline_perf_if.issue),
`endif
.decode_if (decode_if),
@ -154,17 +161,17 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_execute #(
.INSTANCE_ID (`SFORMATF(("%s-execute", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) execute (
`SCOPE_IO_BIND (2)
.clk (clk),
.reset (reset),
.reset (execute_reset),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf_tmp),
.pipeline_perf (pipeline_perf),
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
.base_dcrs (base_dcrs),
@ -182,10 +189,10 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_commit #(
.INSTANCE_ID (`SFORMATF(("%s-commit", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID))
) commit (
.clk (clk),
.reset (reset),
.reset (commit_reset),
.commit_if (commit_if),
@ -195,19 +202,134 @@ module VX_core import VX_gpu_pkg::*; #(
.commit_sched_if(commit_sched_if)
);
VX_mem_unit #(
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
`ifdef LMEM_ENABLE
`RESET_RELAY (lmem_unit_reset, reset);
VX_lmem_unit #(
.INSTANCE_ID (INSTANCE_ID)
) mem_unit (
.clk (clk),
.reset (reset),
) lmem_unit (
.clk (clk),
.reset (lmem_unit_reset),
`ifdef PERF_ENABLE
.lmem_perf (lmem_perf),
.coalescer_perf(coalescer_perf),
.cache_perf (mem_perf_tmp_if.lmem),
`endif
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_if)
.lsu_mem_in_if (lsu_mem_if),
.lsu_mem_out_if (lsu_dcache_if)
);
`else
// Build without LMEM_ENABLE: there is no local-memory unit, so each LSU block's
// interface is tied to the matching lsu_dcache_if entry.
// NOTE(review): the direction of `ASSIGN_VX_LSU_MEM_IF (first vs. second
// argument) is defined outside this view - confirm against the macro.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
// Per-LSU-block D-cache path. For each block i:
//   1. lsu_dcache_if[i] is optionally passed through a VX_mem_coalescer when
//      the LSU word size differs from the D-cache word size;
//   2. the result (dcache_coalesced_if) goes through a VX_lsu_adapter that
//      exposes DCACHE_CHANNELS individual VX_mem_bus_if ports;
//   3. those ports are mapped onto the flat dcache_bus_if array.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
// Interface feeding the adapter below, sized in D-cache terms.
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if();
// Word sizes differ: insert a coalescer between the LSU-side and the
// D-cache-side interfaces.
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
`RESET_RELAY (mem_coalescer_reset, reset);
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) mem_coalescer (
.clk (clk),
.reset (mem_coalescer_reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_atype (lsu_dcache_if[i].req_data.atype),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if.req_valid),
.out_req_mask (dcache_coalesced_if.req_data.mask),
.out_req_rw (dcache_coalesced_if.req_data.rw),
.out_req_byteen (dcache_coalesced_if.req_data.byteen),
.out_req_addr (dcache_coalesced_if.req_data.addr),
.out_req_atype (dcache_coalesced_if.req_data.atype),
.out_req_data (dcache_coalesced_if.req_data.data),
.out_req_tag (dcache_coalesced_if.req_data.tag),
.out_req_ready (dcache_coalesced_if.req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if.rsp_valid),
.out_rsp_mask (dcache_coalesced_if.rsp_data.mask),
.out_rsp_data (dcache_coalesced_if.rsp_data.data),
.out_rsp_tag (dcache_coalesced_if.rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if.rsp_ready)
);
end else begin
// Word sizes match: no coalescer, tie the two interfaces together directly
// (macro body is not visible in this view).
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if, lsu_dcache_if[i]);
end
// Per-channel bus interfaces produced by the adapter for this block.
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
`RESET_RELAY (lsu_adapter_reset, reset);
// Converts the multi-lane LSU-style interface into DCACHE_CHANNELS separate
// memory bus interfaces; no output buffering is requested on either path.
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (lsu_adapter_reset),
.lsu_mem_if (dcache_coalesced_if),
.mem_bus_if (dcache_bus_tmp_if)
);
// Flatten: channel j of block i lands at index i * DCACHE_CHANNELS + j of the
// core-level dcache_bus_if array.
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
`ifdef PERF_ENABLE
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
@ -231,8 +353,8 @@ module VX_core import VX_gpu_pkg::*; #(
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_perf_dcache
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_j
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;
@ -278,11 +400,12 @@ module VX_core import VX_gpu_pkg::*; #(
end
end
assign pipeline_perf.ifetches = perf_ifetches;
assign pipeline_perf.loads = perf_loads;
assign pipeline_perf.stores = perf_stores;
assign pipeline_perf.ifetch_latency = perf_icache_lat;
assign pipeline_perf.load_latency = perf_dcache_lat;
// Export the fetch/load/store counters and latency accumulators computed above
// through pipeline_perf_if. Each field is driven by exactly one continuous
// assignment; load_latency was previously assigned twice with the same source,
// which created a redundant second driver on the signal.
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
`endif

View file

@ -32,7 +32,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] dcache_req_flags,
output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
@ -96,7 +96,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
assign dcache_req_flags[i] = dcache_bus_if[i].req_data.flags;
assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype;
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
@ -119,7 +119,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_req_data = icache_bus_if.req_data.data;
assign icache_req_tag = icache_bus_if.req_data.tag;
assign icache_bus_if.req_ready = icache_req_ready;
`UNUSED_VAR (icache_bus_if.req_data.flags)
`UNUSED_VAR (icache_bus_if.req_data.atype)
assign icache_bus_if.rsp_valid = icache_rsp_valid;
assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE
sysmem_perf_t mem_perf;
assign mem_perf.icache = '0;
assign mem_perf.dcache = '0;
assign mem_perf.l2cache = '0;
assign mem_perf.l3cache = '0;
assign mem_perf.lmem = '0;
assign mem_perf.mem = '0;
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.lmem = '0;
assign mem_perf_if.mem = '0;
`endif
`ifdef SCOPE
@ -144,7 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
`endif
VX_core #(
.INSTANCE_ID (`SFORMATF(("core"))),
.INSTANCE_ID ($sformatf("core")),
.CORE_ID (CORE_ID)
) core (
`SCOPE_IO_BIND (0)
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf),
.mem_perf_if (mem_perf_if),
`endif
.dcr_bus_if (dcr_bus_if),

View file

@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
VX_commit_csr_if.slave commit_csr_if,
@ -83,7 +83,7 @@ import VX_fpu_pkg::*;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_write
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_write_enable[i] = fpu_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
@ -107,7 +107,7 @@ import VX_fpu_pkg::*;
end
end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_csr_read_frm
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end
@ -155,41 +155,41 @@ import VX_fpu_pkg::*;
// CSRs read //////////////////////////////////////////////////////////////
reg [`XLEN-1:0] read_data_ro_w;
reg [`XLEN-1:0] read_data_rw_w;
reg read_addr_valid_w;
reg [`XLEN-1:0] read_data_ro_r;
reg [`XLEN-1:0] read_data_rw_r;
reg read_addr_valid_r;
always @(*) begin
read_data_ro_w = '0;
read_data_rw_w = '0;
read_addr_valid_w = 1;
read_data_ro_r = '0;
read_data_rw_r = '0;
read_addr_valid_r = 1;
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_w = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_w = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_w = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_w = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_w = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_w = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_w = `XLEN'(fcsr[read_wid]);
`VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`endif
`VX_CSR_MSCRATCH : read_data_rw_w = mscratch;
`VX_CSR_MSCRATCH : read_data_rw_r = mscratch;
`VX_CSR_WARP_ID : read_data_ro_w = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_w = `XLEN'(CORE_ID);
`VX_CSR_ACTIVE_THREADS: read_data_ro_w = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_w = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_w = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_w = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_w = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_w = `XLEN'(`LMEM_BASE_ADDR);
`VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
`VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_w, cycles);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
`VX_CSR_MPM_RESERVED : read_data_ro_w = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_w = 'x;
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_w, commit_csr_if.instret);
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
@ -200,79 +200,77 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_w = `XLEN'(0);
`VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0);
default: begin
read_addr_valid_w = 0;
read_addr_valid_r = 0;
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
read_addr_valid_w = 1;
read_addr_valid_r = 1;
`ifdef PERF_ENABLE
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`else
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0));
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls);
// PERF: dcache
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
// PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
// PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
// PERF: l3cache
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
// PERF: coalescer
`CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency);
default:;
endcase
end
@ -284,16 +282,16 @@ import VX_fpu_pkg::*;
endcase
end
assign read_data_ro = read_data_ro_w;
assign read_data_rw = read_data_rw_w;
assign read_data_ro = read_data_ro_r;
assign read_data_rw = read_data_rw_r;
`UNUSED_VAR (base_dcrs)
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
`UNUSED_VAR (sysmem_perf.icache);
`UNUSED_VAR (sysmem_perf.lmem);
`UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.lmem);
`endif
endmodule

View file

@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
`ifdef EXT_F_ENABLE
@ -66,7 +66,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
`UNUSED_VAR (rs1_data)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_rs1_data
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign rs1_data[i] = execute_if.data.rs1_data[i];
end
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
.commit_csr_if (commit_csr_if),
@ -113,15 +113,12 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
// Per-lane thread identifiers, one XLEN-wide value per lane.
//   wtid[i]: the lane index i, offset by (execute_if.data.pid * NUM_LANES) when PID_BITS != 0.
//   gtid[i]: CORE_ID shifted left by (NW_BITS + NT_BITS), plus execute_if.data.wid shifted
//            left by NT_BITS, plus wtid[i].
// NOTE(review): this span looks like a diff hunk rendered without +/- markers. The labelled
// headers (": g_wtid", ": g_pid", ": g_no_pid", ": g_gtid") and the unlabelled headers next to
// them are presumably two revisions of the same lines, not nested loops - confirm against the
// real file before editing.
wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_wtid
if (PID_BITS != 0) begin : g_pid
for (genvar i = 0; i < NUM_LANES; ++i) begin
if (PID_BITS != 0) begin
// packet id selects which group of NUM_LANES threads this lane belongs to
assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i);
end else begin : g_no_pid
end else begin
// no packet id bits: the thread id is simply the lane index
assign wtid[i] = `XLEN'(i);
end
end
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_gtid
assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
end

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_dcr_data import VX_gpu_pkg::*; (
module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
input wire clk,
input wire reset,
@ -50,9 +50,9 @@ module VX_dcr_data import VX_gpu_pkg::*; (
// Debug-only tracing of DCR writes; compiled in only when DBG_TRACE_PIPELINE is defined.
// NOTE(review): this span looks like a diff hunk rendered without +/- markers. Each adjacent
// pair of near-identical TRACE lines (level 2 / "%t" / no trailing ';' versus level 1 / "%d" /
// trailing ';') is presumably two revisions of one statement - confirm which one is live.
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
// log only on cycles where the DCR bus presents a write
if (dcr_bus_if.write_valid) begin
`TRACE(2, ("%t: base-dcr: state=", $time))
`TRACE(1, ("%d: base-dcr: state=", $time));
// helper defined elsewhere; presumably prints a name for the written DCR address - TODO confirm
trace_base_dcr(1, dcr_bus_if.write_addr);
`TRACE(2, (", data=0x%h\n", dcr_bus_if.write_data))
`TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data));
end
end
`endif

View file

@ -15,19 +15,19 @@
// Operand-marking helper macros. For an operand name x, each macro assigns a register
// index variable derived from x (x_v / x_r) and sets use_x to 1.
// NOTE(review): every macro body below carries both an x_v and an x_r assignment. This looks
// like a diff hunk rendered without +/- markers, so the two lines are presumably alternate
// revisions of a single assignment (a rename of the _v suffix to _r or vice versa) - confirm.
`ifdef EXT_F_ENABLE
// With EXT_F_ENABLE the index is widened by one leading bit: 1'b0 in USED_IREG.
// Presumably the extra bit distinguishes integer from floating-point registers - TODO confirm.
`define USED_IREG(x) \
x``_v = {1'b0, ``x}; \
x``_r = {1'b0, ``x}; \
use_``x = 1
// Same as USED_IREG but the leading bit is 1'b1.
`define USED_FREG(x) \
x``_v = {1'b1, ``x}; \
x``_r = {1'b1, ``x}; \
use_``x = 1
`else
// Without EXT_F_ENABLE the index is x unchanged, and no USED_FREG macro is defined here.
`define USED_IREG(x) \
x``_v = ``x; \
x``_r = ``x; \
use_``x = 1
`endif
module VX_decode import VX_gpu_pkg::*; #(
module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
@ -50,7 +50,7 @@ module VX_decode import VX_gpu_pkg::*; #(
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
op_args_t op_args;
reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg use_rd, use_rs1, use_rs2, use_rs3;
reg is_wstall;
@ -152,13 +152,13 @@ module VX_decode import VX_gpu_pkg::*; #(
always @(*) begin
ex_type = 'x;
ex_type = '0;
op_type = 'x;
op_args = 'x;
rd_v = '0;
rs1_v = '0;
rs2_v = '0;
rs3_v = '0;
rd_r = '0;
rs1_r = '0;
rs2_r = '0;
rs3_r = '0;
use_rd = 0;
use_rs1 = 0;
use_rs2 = 0;
@ -376,16 +376,14 @@ module VX_decode import VX_gpu_pkg::*; #(
`USED_IREG (rs2);
end
`ifdef EXT_F_ENABLE
`INST_FMADD, // 7'b1000011
`INST_FMSUB, // 7'b1000111
`INST_FNMSUB, // 7'b1001011
`INST_FNMADD: // 7'b1001111
begin
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b00, 1'b1, opcode[3]});
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_args.fpu.frm = func3;
op_args.fpu.fmt[0] = func2[0]; // float / double
op_args.fpu.fmt[1] = opcode[3] ^ opcode[2]; // SUB
use_rd = 1;
`USED_FREG (rd);
`USED_FREG (rs1);
@ -401,10 +399,9 @@ module VX_decode import VX_gpu_pkg::*; #(
case (func5)
5'b00000, // FADD
5'b00001, // FSUB
5'b00010: // FMUL
begin
op_type = `INST_OP_BITS'({2'b00, 1'b0, func5[1]});
op_args.fpu.fmt[1] = func5[0]; // SUB
5'b00010, // FMUL
5'b00011: begin // FDIV
op_type = `INST_OP_BITS'(func5[1:0]);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
@ -433,13 +430,6 @@ module VX_decode import VX_gpu_pkg::*; #(
`USED_FREG (rs1);
end
`endif
5'b00011: begin
// FDIV
op_type = `INST_OP_BITS'(`INST_FPU_DIV);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b01011: begin
// FSQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
@ -537,7 +527,7 @@ module VX_decode import VX_gpu_pkg::*; #(
end
// disable write to integer register r0
wire wb = use_rd && (rd_v != 0);
wire wb = use_rd && (rd_r != 0);
VX_elastic_buffer #(
.DATAW (DATAW),
@ -547,7 +537,7 @@ module VX_decode import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v}),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
@ -557,10 +547,9 @@ module VX_decode import VX_gpu_pkg::*; #(
// Notification from decode to the scheduler, raised on the cycle a fetched instruction is
// accepted (fetch_if.valid && fetch_if.ready).
// NOTE(review): decode_sched_if.valid and .wid are each assigned twice with identical
// right-hand sides, and the third field appears as both ".unlock = ~is_wstall" and
// ".is_wstall = is_wstall". This looks like a diff hunk rendered without +/- markers, so the
// first three assigns and the last three are presumably two revisions of the same group -
// confirm which interface field name the target revision uses.
wire fetch_fire = fetch_if.valid && fetch_if.ready;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.unlock = ~is_wstall;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
// when L1_ENABLE is not defined, the ibuf_pop signal from decode_if is forwarded to fetch_if
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif
@ -568,14 +557,14 @@ module VX_decode import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr))
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op="))
`TRACE(1, (", op="));
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b",
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3))
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3));
trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid))
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid));
end
end
`endif

View file

@ -33,7 +33,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
// Constant table of thread ids: tids[i] holds the value i, NT_WIDTH bits wide.
// NOTE(review): two `for` headers (with and without the ": g_tids" label) but only one `end`
// follow. This looks like a diff hunk rendered without +/- markers, so the two headers are
// presumably alternate revisions of one loop header - confirm.
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_tids
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign tids[i] = `NT_WIDTH'(i);
end
@ -50,19 +50,23 @@ module VX_dispatch import VX_gpu_pkg::*; #(
`UNUSED_PIN (valid_out)
);
wire [`NUM_EX_UNITS-1:0] operands_ready_in;
assign operands_if.ready = operands_ready_in[operands_if.data.ex_type];
wire [`NUM_EX_UNITS-1:0] operands_reset;
assign operands_if.ready = operands_reset[operands_if.data.ex_type];
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`RESET_RELAY (buffer_reset, reset);
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_buffers
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (1)
.OUT_REG (2), // 2-cycle EB for area reduction
.LUTRAM (1)
) buffer (
.clk (clk),
.reset (reset),
.reset (buffer_reset),
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
.ready_in (operands_ready_in[i]),
.ready_in (operands_reset[i]),
.data_in ({
operands_if.data.uuid,
operands_if.data.wis,
@ -88,7 +92,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_stalls
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;

View file

@ -49,12 +49,13 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_dispatch_data
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
@ -65,53 +66,30 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire batch_done = (& block_done);
// batch select logic
logic [BATCH_COUNT_W-1:0] batch_idx;
if (BATCH_COUNT != 1) begin : g_batch_idx
wire [BATCH_COUNT_W-1:0] batch_idx_n;
wire [BATCH_COUNT-1:0] valid_batches;
for (genvar i = 0; i < BATCH_COUNT; ++i) begin : g_valid_batches
assign valid_batches[i] = | dispatch_valid[i * BLOCK_SIZE +: BLOCK_SIZE];
end
VX_generic_arbiter #(
.NUM_REQS (BATCH_COUNT),
.TYPE ("P")
) batch_sel (
.clk (clk),
.reset (reset),
.requests (valid_batches),
.grant_index (batch_idx_n),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (grant_valid),
.grant_ready (batch_done)
);
if (BATCH_COUNT != 1) begin
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else if (batch_done) begin
batch_idx <= batch_idx_n;
end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end
end
end else begin : g_batch_idx_0
end else begin
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_issue_indices
assign issue_indices[block_idx] = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
end
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_blocks
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
wire [ISSUE_W-1:0] issue_idx = issue_indices[block_idx];
wire valid_p, ready_p;
if (`NUM_THREADS > NUM_LANES) begin : g_partial_threads
if (`NUM_THREADS != NUM_LANES) begin
reg [NUM_PACKETS-1:0] sent_mask_p;
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
wire dispatch_valid_r;
@ -124,7 +102,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire fire_eop = fire_p && is_last_p;
always @(posedge clk) begin
if (reset) begin
if (block_reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
@ -146,8 +124,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_per_packet_data
for (genvar j = 0; j < NUM_LANES; ++j) begin : g_j
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar j = 0; j < NUM_LANES; ++j) begin
localparam k = i * NUM_LANES + j;
assign per_packet_tmask[i][j] = dispatch_tmask[k];
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
@ -157,12 +135,10 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
end
wire [NUM_PACKETS-1:0] packet_valids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_valids
assign packet_valids[i] = (| per_packet_tmask[i]);
end
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_ids
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
assign packet_valids[i] = (| per_packet_tmask[i]);
assign packet_ids[i] = PID_WIDTH'(i);
end
@ -211,13 +187,13 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_pid[block_idx] = start_p;
assign block_sop[block_idx] = is_first_p;
assign block_eop[block_idx] = is_last_p;
if (FANOUT_ENABLE) begin : g_block_ready_fanout
if (FANOUT_ENABLE) begin
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
end else begin : g_block_ready
end else begin
assign block_ready[block_idx] = ready_p && block_enable;
end
assign block_done[block_idx] = fire_eop || ~dispatch_valid[issue_idx];
end else begin : g_full_threads
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
end else begin
assign valid_p = dispatch_valid[issue_idx];
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
@ -227,31 +203,29 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_sop[block_idx] = 1'b1;
assign block_eop[block_idx] = 1'b1;
assign block_ready[block_idx] = ready_p;
assign block_done[block_idx] = ready_p || ~valid_p;
assign block_done[block_idx] = ~valid_p || ready_p;
end
wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin : g_isw_batch
if (BLOCK_SIZE != 1) begin : g_block
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin : g_no_block
end else begin
assign isw = batch_idx;
end
end else begin : g_isw
end else begin
assign isw = block_idx;
end
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
logic [OUT_DATAW-1:0] execute_data, execute_data_w;
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out (
.clk (clk),
.reset (reset),
.reset (block_reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
@ -265,27 +239,17 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
.data_out (execute_data),
.data_out (execute_if[block_idx].data),
.valid_out (execute_if[block_idx].valid),
.ready_out (execute_if[block_idx].ready)
);
if (`NUM_THREADS != NUM_LANES) begin : g_execute_data_w_partial
assign execute_data_w = execute_data;
end else begin : g_execute_data_w_full
always @(*) begin
execute_data_w = execute_data;
execute_data_w[2:0] = {1'b0, 1'b1, 1'b1}; // default pid, sop, and eop
end
end
assign execute_if[block_idx].data = execute_data_w;
end
// Builds the ready vector handed back to the dispatch inputs (dispatch_ready).
// Every bit defaults to 0; for each block, the bit selected by that block's issue index is
// set to (block_ready && block_eop) for that block.
// NOTE(review): the loop header and its assignment each appear twice (loop variable
// block_idx versus i) and there is one `end` fewer than the `begin`s would need if both were
// live. This looks like a diff hunk rendered without +/- markers, so the two pairs are
// presumably alternate revisions of the same loop - confirm.
reg [`ISSUE_WIDTH-1:0] ready_in;
always @(*) begin
// default: no issue slot is ready
ready_in = 0;
for (integer block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
ready_in[issue_indices[block_idx]] = block_ready[block_idx] && block_eop[block_idx];
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
end
end
assign dispatch_ready = ready_in;

View file

@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
input base_dcrs_t base_dcrs,
@ -51,35 +51,41 @@ module VX_execute import VX_gpu_pkg::*; #(
VX_fpu_csr_if fpu_csr_if[`NUM_FPU_BLOCKS]();
`endif
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);
VX_alu_unit #(
.INSTANCE_ID (`SFORMATF(("%s-alu", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID))
) alu_unit (
.clk (clk),
.reset (reset),
.reset (alu_reset),
.dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.branch_ctl_if (branch_ctl_if)
);
`SCOPE_IO_SWITCH (1);
`SCOPE_IO_SWITCH (1)
VX_lsu_unit #(
.INSTANCE_ID (`SFORMATF(("%s-lsu", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID))
) lsu_unit (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (reset),
.reset (lsu_reset),
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.lsu_mem_if (lsu_mem_if)
);
`ifdef EXT_F_ENABLE
`RESET_RELAY (fpu_reset, reset);
VX_fpu_unit #(
.INSTANCE_ID (`SFORMATF(("%s-fpu", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID))
) fpu_unit (
.clk (clk),
.reset (reset),
.reset (fpu_reset),
.dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.fpu_csr_if (fpu_csr_if)
@ -87,14 +93,14 @@ module VX_execute import VX_gpu_pkg::*; #(
`endif
VX_sfu_unit #(
.INSTANCE_ID (`SFORMATF(("%s-sfu", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) sfu_unit (
.clk (clk),
.reset (reset),
.reset (sfu_reset),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
.mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if),
`endif
.base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),

View file

@ -51,9 +51,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire [`NUM_THREADS-1:0] rsp_tmask;
VX_dp_ram #(
.DATAW (`PC_BITS + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.RDW_MODE ("R"),
.DATAW (`PC_BITS + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.LUTRAM (1)
) tag_store (
.clk (clk),
@ -72,7 +71,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`NUM_WARPS-1:0] pending_ibuf_full;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_reads
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #(
.SIZE (`IBUF_SIZE)
) pending_reads (
@ -117,9 +116,9 @@ module VX_fetch import VX_gpu_pkg::*; #(
.ready_out (icache_bus_if.req_ready)
);
assign icache_bus_if.req_data.flags = '0;
assign icache_bus_if.req_data.atype = '0;
assign icache_bus_if.req_data.rw = 0;
assign icache_bus_if.req_data.byteen = '1;
assign icache_bus_if.req_data.byteen = 4'b1111;
assign icache_bus_if.req_data.data = '0;
// Icache Response
@ -132,59 +131,47 @@ module VX_fetch import VX_gpu_pkg::*; #(
assign fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = fetch_if.ready;
`ifdef SCOPE
`ifdef DBG_SCOPE_FETCH
`SCOPE_IO_SWITCH (1);
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 1, 6, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
`UUID_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
`UUID_WIDTH + (ICACHE_WORD_SIZE * 8)
), {
schedule_if.valid,
schedule_if.ready,
icache_bus_if.req_valid,
icache_bus_if.req_ready,
icache_bus_if.rsp_valid,
icache_bus_if.rsp_ready
}, {
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
ICACHE_TAG_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
(ICACHE_WORD_SIZE*8) + ICACHE_TAG_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
schedule_fire,
icache_bus_req_fire,
icache_bus_rsp_fire
},{
icache_req_fire,
icache_rsp_fire
}),
.probes ({
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_if.req_data.tag.uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.tag.uuid, icache_bus_if.rsp_data.data
},
reset_negedge, 1'b0, 4096
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`else
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_FETCH
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({schedule_if.valid, schedule_if.data, schedule_if.ready}),
.probe1 ({icache_bus_if.req_valid, icache_bus_if.req_data, icache_bus_if.req_ready}),
.probe2 ({icache_bus_if.rsp_valid, icache_bus_if.rsp_data, icache_bus_if.rsp_ready})
);
`endif
`SCOPE_IO_UNUSED()
`endif
// Debug-only tracing of fetch requests and responses; compiled in only when DBG_TRACE_MEM
// is defined. A "req" line is printed when schedule_if handshakes and an "rsp" line when
// fetch_if handshakes, each on the rising clock edge.
// NOTE(review): each `if` header and each TRACE line appears twice (inline valid && ready
// versus the *_fire wire; "%t" without ';' versus "%d" with ';') and there are fewer `end`s
// than `begin`s. This looks like a diff hunk rendered without +/- markers, so each pair is
// presumably two revisions of one statement - confirm which is live.
`ifdef DBG_TRACE_MEM
// handshake strobes; they equal the inline conditions used by the alternate `if` headers below
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready;
always @(posedge clk) begin
if (schedule_if.valid && schedule_if.ready) begin
`TRACE(1, ("%t: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid))
if (schedule_fire) begin
`TRACE(1, ("%d: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
end
if (fetch_if.valid && fetch_if.ready) begin
`TRACE(1, ("%t: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid))
if (fetch_fire) begin
`TRACE(1, ("%d: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
end
end
`endif

View file

@ -41,7 +41,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (PARTIAL_BW ? 3 : 0)
.OUT_BUF (PARTIAL_BW ? 1 : 0)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -53,10 +53,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_fpus
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`UNUSED_VAR (per_block_execute_if[block_idx].data.tid)
`UNUSED_VAR (per_block_execute_if[block_idx].data.wb)
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
// Store request info
wire fpu_req_valid, fpu_req_ready;
wire fpu_rsp_valid, fpu_rsp_ready;
@ -69,9 +71,9 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] fpu_rsp_tmask;
wire [`PC_BITS-1:0] fpu_rsp_PC;
wire [`NR_BITS-1:0] fpu_rsp_rd;
wire [PID_WIDTH-1:0] fpu_rsp_pid, fpu_rsp_pid_u;
wire fpu_rsp_sop, fpu_rsp_sop_u;
wire fpu_rsp_eop, fpu_rsp_eop_u;
wire [PID_WIDTH-1:0] fpu_rsp_pid;
wire fpu_rsp_sop;
wire fpu_rsp_eop;
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
wire mdata_full;
@ -87,30 +89,17 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (reset),
.reset (block_reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid_u, fpu_rsp_sop_u, fpu_rsp_eop_u}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.read_addr (fpu_rsp_tag),
.release_en (fpu_rsp_fire),
.full (mdata_full),
`UNUSED_PIN (empty)
);
if (PID_BITS != 0) begin : g_fpu_rsp_pid
assign fpu_rsp_pid = fpu_rsp_pid_u;
assign fpu_rsp_sop = fpu_rsp_sop_u;
assign fpu_rsp_eop = fpu_rsp_eop_u;
end else begin : g_no_fpu_rsp_pid
`UNUSED_VAR (fpu_rsp_pid_u)
`UNUSED_VAR (fpu_rsp_sop_u)
`UNUSED_VAR (fpu_rsp_eop_u)
assign fpu_rsp_pid = 0;
assign fpu_rsp_sop = 1;
assign fpu_rsp_eop = 1;
end
// resolve dynamic FRM from CSR
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
@ -130,7 +119,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dpi (
.clk (clk),
.reset (reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -159,7 +148,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_fpnew (
.clk (clk),
.reset (reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -188,7 +177,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dsp (
.clk (clk),
.reset (reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -211,38 +200,27 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
`endif
// Accumulates FPU exception flags for a response and reports them to the FPU CSR
// interface of this block when the end-of-packet response fires.
// NOTE(review): this span looks like a diff hunk rendered without +/- markers. The paired
// headers (labelled versus unlabelled `begin`, `reset` versus `block_reset`) and the two
// ways of driving fpu_csr_if[block_idx] (through fpu_csr_tmp_if + VX_pipe_register versus
// direct assigns) are presumably alternate revisions, not code that coexists - confirm.
// handle CSR update
// handle FPU response
fflags_t fpu_rsp_fflags_q;
if (PID_BITS != 0) begin : g_pid
if (PID_BITS != 0) begin
// running OR of the flags seen on earlier responses; cleared when an eop response fires
fflags_t fpu_rsp_fflags_r;
always @(posedge clk) begin
if (reset) begin
if (block_reset) begin
fpu_rsp_fflags_r <= '0;
end else if (fpu_rsp_fire) begin
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
end
end
// flags reported = accumulated flags OR the flags of the current response
assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
end else begin : g_no_pid
end else begin
// PID_BITS == 0: no accumulation, the current response flags are used directly
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
end
// variant A: build the CSR write on a temporary interface, then pass it through a pipe register
VX_fpu_csr_if fpu_csr_tmp_if();
assign fpu_csr_tmp_if.write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_csr_tmp_if.write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_csr_tmp_if.write_fflags = fpu_rsp_fflags_q;
VX_pipe_register #(
.DATAW (1 + `NW_WIDTH + $bits(fflags_t)),
.RESETW (1)
) fpu_csr_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({fpu_csr_tmp_if.write_enable, fpu_csr_tmp_if.write_wid, fpu_csr_tmp_if.write_fflags}),
.data_out ({fpu_csr_if[block_idx].write_enable, fpu_csr_if[block_idx].write_wid, fpu_csr_if[block_idx].write_fflags})
);
// variant B: drive the CSR interface directly with the same enable condition and flags
assign fpu_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
// send response
@ -251,7 +229,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.SIZE (0)
) rsp_buf (
.clk (clk),
.reset (reset),
.reset (block_reset),
.valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),

View file

@ -41,17 +41,17 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
wire [BLOCK_SIZE-1:0] commit_in_ready;
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_valid[i] = commit_in_if[i].valid;
assign commit_in_data[i] = commit_in_if[i].data;
assign commit_in_if[i].ready = commit_in_ready[i];
if (BLOCK_SIZE != `ISSUE_WIDTH) begin : g_commit_in_isw_partial
if (BLOCK_SIZE != 1) begin : g_block
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
if (BLOCK_SIZE != 1) begin
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
end else begin : g_no_block
end else begin
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
end
end else begin : g_commit_in_isw_full
end else begin
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
end
end
@ -70,12 +70,11 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
end
end
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in_ready
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin: g_out_bufs
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_tmp_if();
@ -95,31 +94,31 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
.ready_out (commit_tmp_if.ready)
);
logic [`NUM_THREADS-1:0] commit_tmask_w;
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_w;
if (PID_BITS != 0) begin : g_commit_data_with_pid
logic [`NUM_THREADS-1:0] commit_tmask_r;
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_r;
if (PID_BITS != 0) begin
always @(*) begin
commit_tmask_w = '0;
commit_data_w = 'x;
commit_tmask_r = '0;
commit_data_r = 'x;
for (integer j = 0; j < NUM_LANES; ++j) begin
commit_tmask_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
commit_data_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
commit_tmask_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
commit_data_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
end
end
end else begin : g_commit_data_no_pid
assign commit_tmask_w = commit_tmp_if.data.tmask;
assign commit_data_w = commit_tmp_if.data.data;
end else begin
assign commit_tmask_r = commit_tmp_if.data.tmask;
assign commit_data_r = commit_tmp_if.data.data;
end
assign commit_out_if[i].valid = commit_tmp_if.valid;
assign commit_out_if[i].data = {
commit_tmp_if.data.uuid,
commit_tmp_if.data.wid,
commit_tmask_w,
commit_tmask_r,
commit_tmp_if.data.PC,
commit_tmp_if.data.wb,
commit_tmp_if.data.rd,
commit_data_w,
commit_data_r,
1'b0, // PID
commit_tmp_if.data.sop,
commit_tmp_if.data.eop

View file

@ -35,11 +35,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_instr_bufs
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (1)
.OUT_REG (2) // 2-cycle EB for area reduction
) instr_buf (
.clk (clk),
.reset (reset),

View file

@ -16,6 +16,7 @@
module VX_ipdom_stack #(
parameter WIDTH = 1,
parameter DEPTH = 1,
parameter OUT_REG = 0,
parameter ADDRW = `LOG2UP(DEPTH)
) (
input wire clk,
@ -30,63 +31,76 @@ module VX_ipdom_stack #(
output wire empty,
output wire full
);
reg [ADDRW-1:0] rd_ptr, rd_ptr_n, wr_ptr;
reg slot_set [DEPTH-1:0];
reg [ADDRW-1:0] rd_ptr, wr_ptr;
reg empty_r, full_r;
wire [WIDTH-1:0] d0, d1;
wire d_set_r;
always @(*) begin
rd_ptr_n = rd_ptr;
if (push) begin
rd_ptr_n = wr_ptr;
end else if (pop) begin
rd_ptr_n = rd_ptr - ADDRW'(d_set_r);
end
end
wire d_set_n = slot_set[rd_ptr];
always @(posedge clk) begin
if (reset) begin
rd_ptr <= '0;
wr_ptr <= '0;
empty_r <= 1;
full_r <= 0;
rd_ptr <= '0;
end else begin
`ASSERT(~push || ~full, ("%t: runtime error: writing to a full stack!", $time));
`ASSERT(~pop || ~empty, ("%t: runtime error: reading an empty stack!", $time));
`ASSERT(~push || ~pop, ("%t: runtime error: push and pop in same cycle not supported!", $time));
`ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
`ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
`ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!"));
if (push) begin
rd_ptr <= wr_ptr;
wr_ptr <= wr_ptr + ADDRW'(1);
empty_r <= 0;
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
end else if (pop) begin
wr_ptr <= wr_ptr - ADDRW'(d_set_r);
empty_r <= (rd_ptr == 0) && d_set_r;
wr_ptr <= wr_ptr - ADDRW'(d_set_n);
rd_ptr <= rd_ptr - ADDRW'(d_set_n);
empty_r <= (rd_ptr == 0) && (d_set_n == 1);
full_r <= 0;
end
rd_ptr <= rd_ptr_n;
end
end
wire [WIDTH * 2:0] qout = push ? {1'b0, q1, q0} : {1'b1, d1, d0};
VX_dp_ram #(
.DATAW (1 + WIDTH * 2),
.SIZE (DEPTH),
.OUT_REG (1),
.RDW_MODE ("R")
) ipdom_store (
.DATAW (WIDTH * 2),
.SIZE (DEPTH),
.OUT_REG (OUT_REG ? 1 : 0),
.LUTRAM (OUT_REG ? 0 : 1)
) store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push || pop),
.write (push),
.wren (1'b1),
.waddr (push ? wr_ptr : rd_ptr),
.wdata (qout),
.raddr (rd_ptr_n),
.rdata ({d_set_r, d1, d0})
.waddr (wr_ptr),
.wdata ({q1, q0}),
.raddr (rd_ptr),
.rdata ({d1, d0})
);
always @(posedge clk) begin
if (push) begin
slot_set[wr_ptr] <= 0;
end else if (pop) begin
slot_set[rd_ptr] <= 1;
end
end
wire d_set_r;
VX_pipe_register #(
.DATAW (1),
.DEPTH (OUT_REG)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in (d_set_n),
.data_out (d_set_r)
);
assign d = d_set_r ? d0 : d1;

View file

@ -29,17 +29,16 @@ module VX_issue import VX_gpu_pkg::*; #(
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
);
`STATIC_ASSERT ((`ISSUE_WIDTH <= `NUM_WARPS), ("invalid parameter"))
`ifdef PERF_ENABLE
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_issue_perf_units_uses
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_issue_perf_sfu_uses
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
`endif
@ -50,9 +49,9 @@ module VX_issue import VX_gpu_pkg::*; #(
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
assign decode_if.ready = decode_ready_in[decode_isw];
`SCOPE_IO_SWITCH (`ISSUE_WIDTH);
`SCOPE_IO_SWITCH (`ISSUE_WIDTH)
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : g_slices
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : issue_slices
VX_decode_if #(
.NUM_WARPS (PER_ISSUE_WARPS)
) per_issue_decode_if();
@ -77,13 +76,15 @@ module VX_issue import VX_gpu_pkg::*; #(
assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop;
`endif
`RESET_RELAY (slice_reset, reset);
VX_issue_slice #(
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, issue_id))),
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
.ISSUE_ID (issue_id)
) issue_slice (
`SCOPE_IO_BIND(issue_id)
.clk (clk),
.reset (reset),
.reset (slice_reset),
`ifdef PERF_ENABLE
.issue_perf (per_issue_perf[issue_id]),
`endif
@ -93,7 +94,7 @@ module VX_issue import VX_gpu_pkg::*; #(
);
// Assign transposed dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin : g_dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
end
end

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_issue_slice import VX_gpu_pkg::*; #(
module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter ISSUE_ID = 0
) (
@ -36,11 +36,16 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
VX_scoreboard_if scoreboard_if();
VX_operands_if operands_if();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.INSTANCE_ID (`SFORMATF(("%s-ibuffer", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
) ibuffer (
.clk (clk),
.reset (reset),
.reset (ibuf_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.ibf_stalls),
`endif
@ -49,10 +54,10 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
);
VX_scoreboard #(
.INSTANCE_ID (`SFORMATF(("%s-scoreboard", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
) scoreboard (
.clk (clk),
.reset (reset),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
@ -64,10 +69,10 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
);
VX_operands #(
.INSTANCE_ID (`SFORMATF(("%s-operands", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
) operands (
.clk (clk),
.reset (reset),
.reset (operands_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.opd_stalls),
`endif
@ -77,10 +82,10 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
);
VX_dispatch #(
.INSTANCE_ID (`SFORMATF(("%s-dispatch", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
) dispatch (
.clk (clk),
.reset (reset),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
@ -88,90 +93,65 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
.dispatch_if (dispatch_if)
);
`ifdef SCOPE
`ifdef DBG_SCOPE_ISSUE
`SCOPE_IO_SWITCH (1);
wire decode_fire = decode_if.valid && decode_if.ready;
wire operands_fire = operands_if.valid && operands_if.ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 2, 4, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS + (3 * `XLEN) +
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * `XLEN) + 1
), {
decode_if.valid,
decode_if.ready,
operands_if.valid,
operands_if.ready
}, {
decode_fire,
operands_fire,
writeback_if.valid // ack-free
}, {
decode_if.data.uuid,
decode_if.data.wid,
decode_if.data.tmask,
decode_if.data.PC,
decode_if.data.ex_type,
decode_if.data.op_type,
decode_if.data.wb,
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3,
wire operands_if_fire = operands_if.valid && operands_if.ready;
wire operands_if_not_ready = ~operands_if.ready;
wire writeback_if_valid = writeback_if.valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.wb,
operands_if.data.rd,
operands_if.data.rs1_data[0],
operands_if.data.rs2_data[0],
operands_if.data.rs3_data[0],
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data,
writeback_if.data.uuid,
writeback_if.data.wis,
writeback_if.data.tmask,
writeback_if.data.rd,
writeback_if.data.data,
writeback_if.data.eop
},
reset_negedge, 1'b0, 4096
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`else
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_ISSUE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({decode_if.valid, decode_if.data, decode_if.ready}),
.probe1 ({scoreboard_if.valid, scoreboard_if.data, scoreboard_if.ready}),
.probe2 ({operands_if.valid, operands_if.data, operands_if.ready}),
.probe3 ({writeback_if.valid, writeback_if.data})
);
`endif
`SCOPE_IO_UNUSED()
`endif
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (operands_if.valid && operands_if.ready) begin
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}))
`TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}));
trace_ex_type(1, operands_if.data.ex_type);
`TRACE(1, (", op="))
`TRACE(1, (", op="));
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS)
`TRACE(1, (", rs2_data="))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS)
`TRACE(1, (", rs3_data="))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS)
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid))
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid));
end
end
`endif

View file

@ -80,7 +80,7 @@ module VX_issue_top import VX_gpu_pkg::*; #(
assign decode_if.data.rs3 = decode_rs3;
assign decode_ready = decode_if.ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback_if
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = writeback_valid[i];
assign writeback_if[i].data.uuid = writeback_uuid[i];
assign writeback_if[i].data.wis = writeback_wis[i];
@ -92,7 +92,7 @@ module VX_issue_top import VX_gpu_pkg::*; #(
assign writeback_if[i].data.eop = writeback_eop[i];
end
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin : g_dispatch_if
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_uuid[i] = dispatch_if[i].data.uuid;
assign dispatch_wis[i] = dispatch_if[i].data.wis;
@ -113,13 +113,6 @@ module VX_issue_top import VX_gpu_pkg::*; #(
issue_perf_t issue_perf = '0;
`endif
`ifdef SCOPE
wire [0:0] scope_reset_w = 1'b0;
wire [0:0] scope_bus_in_w = 1'b0;
wire [0:0] scope_bus_out_w;
`UNUSED_VAR (scope_bus_out_w)
`endif
VX_issue #(
.INSTANCE_ID (INSTANCE_ID)
) issue (

201
hw/rtl/core/VX_lmem_unit.sv Normal file
View file

@ -0,0 +1,201 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_lmem_unit
//
// For each LSU block, splits an incoming LSU memory request by lane into:
//   - a "global" part (lanes whose atype does not have ADDR_TYPE_LOCAL set),
//     forwarded through req_global_buf to lsu_mem_out_if[i];
//   - a "local" part (lanes whose atype has ADDR_TYPE_LOCAL set), forwarded
//     through req_local_buf / lsu_adapter to the shared VX_local_mem instance.
// Responses from both paths are merged back onto lsu_mem_in_if[i] by rsp_arb.
//
// Parameters:
//   INSTANCE_ID - string used to derive the local memory instance name.
// Ports:
//   clk, reset     - clock and reset (reset is relayed per block and for lmem).
//   cache_perf     - local memory perf counters (only when PERF_ENABLE).
//   lsu_mem_in_if  - per-block LSU request/response interface (this unit is slave).
//   lsu_mem_out_if - per-block interface for the non-local traffic (this unit is master).
module VX_lmem_unit import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = ""
) (
    input wire clk,
    input wire reset,

`ifdef PERF_ENABLE
    output cache_perf_t cache_perf,
`endif

    VX_lsu_mem_if.slave  lsu_mem_in_if [`NUM_LSU_BLOCKS],
    VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
);
    // local memory size must be a multiple of the memory block size,
    // and its base address must be aligned to its size
    `STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
    `STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))

    // packed widths of the request {mask, rw, byteen, addr, atype, data, tag}
    // and response {mask, data, tag} payloads carried through the buffers/arbiter
    localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `ADDR_TYPE_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
    localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
    // word-address width of the local memory
    localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);

    // per-block local-memory side of the request/response switch
    VX_lsu_mem_if #(
        .NUM_LANES (`NUM_LSU_LANES),
        .DATA_SIZE (LSU_WORD_SIZE),
        .TAG_WIDTH (LSU_TAG_WIDTH)
    ) lsu_switch_if[`NUM_LSU_BLOCKS]();

    `RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1);

    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin

        // lane j targets local memory when its ADDR_TYPE_LOCAL atype bit is set
        wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
        for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
            assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
        end

        // the request has at least one active global / local lane
        wire is_addr_global = | (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask);
        wire is_addr_local  = | (lsu_mem_in_if[i].req_data.mask & is_addr_local_mask);

        wire req_global_ready;
        wire req_local_ready;

        // A request may carry both local and global lanes. It must only be
        // consumed when every path it needs can accept it in the same cycle;
        // otherwise the lanes destined to the stalled path would be dropped.
        // A path that is not needed by the current request never blocks it.
        wire req_global_go = req_global_ready || ~is_addr_global;
        wire req_local_go  = req_local_ready  || ~is_addr_local;

        // global path: only the non-local lanes stay enabled in the mask
        VX_elastic_buffer #(
            .DATAW   (REQ_DATAW),
            .SIZE    (2),
            .OUT_REG (1)
        ) req_global_buf (
            .clk       (clk),
            .reset     (block_reset[i]),
            .valid_in  (lsu_mem_in_if[i].req_valid && is_addr_global && req_local_go),
            .data_in   ({
                lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
                lsu_mem_in_if[i].req_data.rw,
                lsu_mem_in_if[i].req_data.byteen,
                lsu_mem_in_if[i].req_data.addr,
                lsu_mem_in_if[i].req_data.atype,
                lsu_mem_in_if[i].req_data.data,
                lsu_mem_in_if[i].req_data.tag
            }),
            .ready_in  (req_global_ready),
            .valid_out (lsu_mem_out_if[i].req_valid),
            .data_out  ({
                lsu_mem_out_if[i].req_data.mask,
                lsu_mem_out_if[i].req_data.rw,
                lsu_mem_out_if[i].req_data.byteen,
                lsu_mem_out_if[i].req_data.addr,
                lsu_mem_out_if[i].req_data.atype,
                lsu_mem_out_if[i].req_data.data,
                lsu_mem_out_if[i].req_data.tag
            }),
            .ready_out (lsu_mem_out_if[i].req_ready)
        );

        // local path: only the local lanes stay enabled in the mask
        VX_elastic_buffer #(
            .DATAW   (REQ_DATAW),
            .SIZE    (0),
            .OUT_REG (0)
        ) req_local_buf (
            .clk       (clk),
            .reset     (block_reset[i]),
            .valid_in  (lsu_mem_in_if[i].req_valid && is_addr_local && req_global_go),
            .data_in   ({
                lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
                lsu_mem_in_if[i].req_data.rw,
                lsu_mem_in_if[i].req_data.byteen,
                lsu_mem_in_if[i].req_data.addr,
                lsu_mem_in_if[i].req_data.atype,
                lsu_mem_in_if[i].req_data.data,
                lsu_mem_in_if[i].req_data.tag
            }),
            .ready_in  (req_local_ready),
            .valid_out (lsu_switch_if[i].req_valid),
            .data_out  ({
                lsu_switch_if[i].req_data.mask,
                lsu_switch_if[i].req_data.rw,
                lsu_switch_if[i].req_data.byteen,
                lsu_switch_if[i].req_data.addr,
                lsu_switch_if[i].req_data.atype,
                lsu_switch_if[i].req_data.data,
                lsu_switch_if[i].req_data.tag
            }),
            .ready_out (lsu_switch_if[i].req_ready)
        );

        // Accept the request only when all targeted paths accept it.
        // For single-destination requests this equals the ready of that path;
        // a request with no active lane is never acknowledged (as before).
        assign lsu_mem_in_if[i].req_ready = (is_addr_global || is_addr_local)
                                         && req_global_go
                                         && req_local_go;

        // merge local-memory and global responses back to the LSU
        VX_stream_arb #(
            .NUM_INPUTS (2),
            .DATAW      (RSP_DATAW),
            .ARBITER    ("R"),
            .OUT_BUF    (1)
        ) rsp_arb (
            .clk       (clk),
            .reset     (block_reset[i]),
            .valid_in  ({
                lsu_switch_if[i].rsp_valid,
                lsu_mem_out_if[i].rsp_valid
            }),
            .ready_in  ({
                lsu_switch_if[i].rsp_ready,
                lsu_mem_out_if[i].rsp_ready
            }),
            .data_in   ({
                lsu_switch_if[i].rsp_data,
                lsu_mem_out_if[i].rsp_data
            }),
            .data_out  (lsu_mem_in_if[i].rsp_data),
            .valid_out (lsu_mem_in_if[i].rsp_valid),
            .ready_out (lsu_mem_in_if[i].rsp_ready),
            `UNUSED_PIN (sel_out)
        );
    end

    // flattened per-lane buses of all blocks feeding the local memory
    VX_mem_bus_if #(
        .DATA_SIZE (LSU_WORD_SIZE),
        .TAG_WIDTH (LSU_TAG_WIDTH)
    ) lmem_bus_if[LSU_NUM_REQS]();

    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin

        VX_mem_bus_if #(
            .DATA_SIZE (LSU_WORD_SIZE),
            .TAG_WIDTH (LSU_TAG_WIDTH)
        ) lmem_bus_tmp_if[`NUM_LSU_LANES]();

        // converts the block's multi-lane LSU interface into per-lane buses
        VX_lsu_adapter #(
            .NUM_LANES    (`NUM_LSU_LANES),
            .DATA_SIZE    (LSU_WORD_SIZE),
            .TAG_WIDTH    (LSU_TAG_WIDTH),
            .TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
            .ARBITER      ("P"),
            .REQ_OUT_BUF  (3),
            .RSP_OUT_BUF  (0)
        ) lsu_adapter (
            .clk        (clk),
            .reset      (block_reset[i]),
            .lsu_mem_if (lsu_switch_if[i]),
            .mem_bus_if (lmem_bus_tmp_if)
        );

        // block i, lane j maps to local memory request port (i * NUM_LSU_LANES + j)
        for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
            `ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
        end
    end

    `RESET_RELAY (lmem_reset, reset);

    VX_local_mem #(
        .INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)),
        .SIZE       (1 << `LMEM_LOG_SIZE),
        .NUM_REQS   (LSU_NUM_REQS),
        .NUM_BANKS  (`LMEM_NUM_BANKS),
        .WORD_SIZE  (LSU_WORD_SIZE),
        .ADDR_WIDTH (LMEM_ADDR_WIDTH),
        .UUID_WIDTH (`UUID_WIDTH),
        .TAG_WIDTH  (LSU_TAG_WIDTH),
        .OUT_BUF    (3)
    ) local_mem (
        .clk        (clk),
        .reset      (lmem_reset),
    `ifdef PERF_ENABLE
        .cache_perf (cache_perf),
    `endif
        .mem_bus_if (lmem_bus_if)
    );

endmodule

View file

@ -29,7 +29,7 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if [NUM_LANES]
);
localparam REQ_ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE);
localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + DATA_SIZE * 8;
localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `ADDR_TYPE_WIDTH + DATA_SIZE * 8;
localparam RSP_DATA_WIDTH = DATA_SIZE * 8;
// handle request unpacking
@ -41,16 +41,29 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][TAG_WIDTH-1:0] req_tag_out;
wire [NUM_LANES-1:0] req_ready_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_req_data_in
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_data_in[i] = {
lsu_mem_if.req_data.rw,
lsu_mem_if.req_data.addr[i],
lsu_mem_if.req_data.data[i],
lsu_mem_if.req_data.byteen[i],
lsu_mem_if.req_data.flags[i]
lsu_mem_if.req_data.addr[i],
lsu_mem_if.req_data.atype[i],
lsu_mem_if.req_data.data[i]
};
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign mem_bus_if[i].req_valid = req_valid_out[i];
assign {
mem_bus_if[i].req_data.rw,
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.addr,
mem_bus_if[i].req_data.atype,
mem_bus_if[i].req_data.data
} = req_data_out[i];
assign mem_bus_if[i].req_data.tag = req_tag_out[i];
assign req_ready_out[i] = mem_bus_if[i].req_ready;
end
VX_stream_unpack #(
.NUM_REQS (NUM_LANES),
.DATA_WIDTH (REQ_DATA_WIDTH),
@ -70,19 +83,6 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
.ready_out (req_ready_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_req
assign mem_bus_if[i].req_valid = req_valid_out[i];
assign {
mem_bus_if[i].req_data.rw,
mem_bus_if[i].req_data.addr,
mem_bus_if[i].req_data.data,
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.flags
} = req_data_out[i];
assign mem_bus_if[i].req_data.tag = req_tag_out[i];
assign req_ready_out[i] = mem_bus_if[i].req_ready;
end
// handle response packing
wire [NUM_LANES-1:0] rsp_valid_out;
@ -90,10 +90,10 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][TAG_WIDTH-1:0] rsp_tag_out;
wire [NUM_LANES-1:0] rsp_ready_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_rsp
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign rsp_valid_out[i] = mem_bus_if[i].rsp_valid;
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
assign mem_bus_if[i].rsp_ready = rsp_ready_out[i];
end

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_lsu_slice import VX_gpu_pkg::*; #(
module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -59,25 +59,25 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire req_is_fence, rsp_is_fence;
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_full_addr
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset);
end
// address type calculation
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_flags
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
assign mem_req_flags[i][`MEM_REQ_FLAG_FLUSH] = req_is_fence;
assign mem_req_flags[i][`MEM_REQ_FLAG_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
`ifdef LMEM_ENABLE
// is local memory address
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_flags[i][`MEM_REQ_FLAG_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
@ -102,6 +102,8 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
wire no_rsp_buf_valid, no_rsp_buf_ready;
@ -149,49 +151,49 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_addr
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_byteen_w
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_w;
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_r;
always @(*) begin
mem_req_byteen_w = '0;
mem_req_byteen_r = '0;
case (`INST_LSU_WSIZE(execute_if.data.op_type))
0: begin // 8-bit
mem_req_byteen_w[req_align[i]] = 1'b1;
mem_req_byteen_r[req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
// 3: 64 bit
default : mem_req_byteen_w = {LSU_WORD_SIZE{1'b1}};
default : mem_req_byteen_r = {LSU_WORD_SIZE{1'b1}};
endcase
end
assign mem_req_byteen[i] = mem_req_byteen_w;
assign mem_req_byteen[i] = mem_req_byteen_r;
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_missalign
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire lsu_req_fire = execute_if.valid && execute_if.ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
("%t: misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
$time, execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid))
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_data
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_data[i] = execute_if.data.rs2_data[i];
case (req_align[i])
@ -213,7 +215,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
if (PID_BITS != 0) begin : g_pids
if (PID_BITS != 0) begin
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
@ -269,10 +271,10 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("%t: allocator full!", $time))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("%t: oops! broken sop request!", $time))
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`UNUSED_VAR (mem_rsp_sop)
end else begin : g_no_pids
end else begin
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
@ -298,7 +300,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0] lsu_mem_req_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_mem_req_flags;
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
wire lsu_mem_req_ready;
@ -309,14 +311,16 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
wire lsu_mem_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID (`SFORMATF(("%s-memsched", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)),
.CORE_REQS (NUM_LANES),
.MEM_CHANNELS(NUM_LANES),
.WORD_SIZE (LSU_WORD_SIZE),
.LINE_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
@ -326,7 +330,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.CORE_OUT_BUF(0)
) mem_scheduler (
.clk (clk),
.reset (reset),
.reset (mem_scheduler_reset),
// Input request
.core_req_valid (mem_req_valid),
@ -334,12 +338,12 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_flags (mem_req_flags),
.core_req_atype (mem_req_atype),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
`UNUSED_PIN (core_req_empty),
`UNUSED_PIN (core_req_wr_notify),
`UNUSED_PIN (core_req_sent),
// Output response
.core_rsp_valid (mem_rsp_valid),
@ -356,7 +360,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.mem_req_mask (lsu_mem_req_mask),
.mem_req_byteen (lsu_mem_req_byteen),
.mem_req_addr (lsu_mem_req_addr),
.mem_req_flags (lsu_mem_req_flags),
.mem_req_atype (lsu_mem_req_atype),
.mem_req_data (lsu_mem_req_data),
.mem_req_tag (lsu_mem_req_tag),
.mem_req_ready (lsu_mem_req_ready),
@ -374,7 +378,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
assign lsu_mem_if.req_data.flags = lsu_mem_req_flags;
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
@ -422,7 +426,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin : g_rsp_data
for (genvar i = 0; i < NUM_LANES; i++) begin
`ifdef XLEN_64
wire [63:0] rsp_data64 = mem_rsp_data[i];
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
@ -479,7 +483,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.valid_out (commit_no_rsp_if.valid),
.ready_out (commit_no_rsp_if.ready)
);
assign commit_no_rsp_if.data.rd = '0;
assign commit_no_rsp_if.data.wb = 1'b0;
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
@ -504,74 +507,51 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (execute_if.valid && fence_lock) begin
`TRACE(2, ("%t: *** %s fence wait\n", $time, INSTANCE_ID))
`TRACE(1, ("%d: *** %s fence wait\n", $time, INSTANCE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(2, ("%t: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
`TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES)
`TRACE(2, (", flags="))
`TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES)
`TRACE(2, (", byteen=0x%0h, data=", mem_req_byteen))
`TRACE_ARRAY1D(2, "0x%0h", mem_req_data, NUM_LANES)
`TRACE(2, (", sop=%b, eop=%b, tag=0x%0h (#%0d)\n", execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
`TRACE(1, ("%d: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
end else begin
`TRACE(2, ("%t: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
`TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES)
`TRACE(2, (", flags="))
`TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES)
`TRACE(2, (", byteen=0x%0h, rd=%0d, sop=%b, eop=%b, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
`TRACE(1, ("%d: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(2, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop))
`TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data, NUM_LANES)
`TRACE(2, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid))
`TRACE(1, ("%d: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
end
end
`endif
`ifdef SCOPE
`ifdef DBG_SCOPE_LSU
`SCOPE_IO_SWITCH (1);
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 3, 4, 2, (
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH
), {
mem_req_valid,
mem_req_ready,
mem_rsp_valid,
mem_rsp_ready
}, {
mem_req_fire,
mem_rsp_fire
}, {
mem_req_rw,
full_addr,
mem_req_byteen,
mem_req_data,
execute_if.data.uuid,
rsp_data,
rsp_uuid
},
reset_negedge, 1'b0, 4096
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (1 + NUM_LANES*(`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE*8) + `UUID_WIDTH + NUM_LANES*LSU_WORD_SIZE*8 + `UUID_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes ({mem_req_rw, full_addr, mem_req_byteen, mem_req_data, execute_if.data.uuid, rsp_data, rsp_uuid}),
.bus_in (scope_bus_in),
.bus_out(scope_bus_out)
);
`else
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_LSU
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({execute_if.valid, execute_if.data, execute_if.ready}),
.probe1 ({lsu_mem_if.req_valid, lsu_mem_if.req_data, lsu_mem_if.req_ready}),
.probe2 ({lsu_mem_if.rsp_valid, lsu_mem_if.rsp_data, lsu_mem_if.rsp_ready})
);
`endif
`SCOPE_IO_UNUSED()
`endif
endmodule

View file

@ -31,7 +31,9 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES;
`ifdef SCOPE
`SCOPE_IO_SWITCH (BLOCK_SIZE);
`endif
VX_execute_if #(
.NUM_LANES (NUM_LANES)
@ -40,7 +42,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (3)
.OUT_BUF (1)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -52,13 +54,16 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_slices
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : lsu_slices
`RESET_RELAY (slice_reset, reset);
VX_lsu_slice #(
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, block_idx)))
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx))
) lsu_slice(
`SCOPE_IO_BIND (block_idx)
.clk (clk),
.reset (reset),
.reset (slice_reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.lsu_mem_if (lsu_mem_if[block_idx])

View file

@ -1,260 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_mem_unit: connects the per-block LSU memory interfaces to the data-cache
// request buses. The path for each LSU block is:
//   lsu_mem_if -> [VX_lmem_switch, only with LMEM_ENABLE] -> lsu_dcache_if
//              -> [VX_mem_coalescer or pass-through]      -> dcache_coalesced_if
//              -> VX_lsu_adapter                           -> dcache_bus_if
// With LMEM_ENABLE, the local_out_if side of every switch is merged by one
// VX_lsu_mem_arb, adapted to per-lane buses and served by one VX_local_mem.
//
// Parameters:
//   INSTANCE_ID - string prefix used to build the sub-instance IDs
//                 ("<id>-lmem", "<id>-coalescer<i>").
// Ports:
//   clk, reset     - forwarded unchanged to every sub-module instantiated here.
//   lmem_perf      - (PERF_ENABLE only) driven by VX_local_mem, or tied to '0
//                    when LMEM_ENABLE is not defined.
//   coalescer_perf - (PERF_ENABLE only) .misses is the sum of the per-block
//                    coalescer miss counters.
//   lsu_mem_if     - one slave interface per LSU block (`NUM_LSU_BLOCKS).
//   dcache_bus_if  - DCACHE_NUM_REQS master buses; block i / channel j maps to
//                    index i * DCACHE_CHANNELS + j.
module VX_mem_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output lmem_perf_t lmem_perf,
output coalescer_perf_t coalescer_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS]
);
// Per-block traffic that continues on towards the data cache.
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
`ifdef LMEM_ENABLE
// The local memory size (1 << LMEM_LOG_SIZE) must be divisible by MEM_BLOCK_SIZE,
// and LMEM_BASE_ADDR must be a multiple of the local memory size.
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
// Local memory address width with the LSU_WORD_SIZE offset bits removed
// (presumably LMEM_LOG_SIZE is log2 of the size in bytes - TODO confirm).
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
// Per-block traffic that is handed to the local memory path.
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_lmem_if[`NUM_LSU_BLOCKS]();
// One switch per LSU block: lsu_in_if fans out to global_out_if (dcache path)
// and local_out_if (local memory path). The selection rule lives inside
// VX_lmem_switch and is not visible here.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_switches
VX_lmem_switch #(
.REQ0_OUT_BUF (1),
.REQ1_OUT_BUF (0),
.RSP_OUT_BUF (1),
.ARBITER ("P")
) lmem_switch (
.clk (clk),
.reset (reset),
.lsu_in_if (lsu_mem_if[i]),
.global_out_if(lsu_dcache_if[i]),
.local_out_if (lsu_lmem_if[i])
);
end
// Single arbitrated local-memory interface; note that it uses LMEM_TAG_WIDTH
// while the arbiter inputs use LSU_TAG_WIDTH.
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_arb_if[1]();
// Merge the NUM_LSU_BLOCKS local-memory streams into one output.
VX_lsu_mem_arb #(
.NUM_INPUTS (`NUM_LSU_BLOCKS),
.NUM_OUTPUTS(1),
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(2)
) lmem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (lsu_lmem_if),
.bus_out_if (lmem_arb_if)
);
// One memory bus per LSU lane, feeding the local memory.
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_adapt_if[`NUM_LSU_LANES]();
// Convert the multi-lane LSU interface into NUM_LSU_LANES separate buses.
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lmem_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (lmem_arb_if[0]),
.mem_bus_if (lmem_adapt_if)
);
// Local memory instance: (1 << LMEM_LOG_SIZE) in size, one request port per lane.
VX_local_mem #(
.INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (`NUM_LSU_LANES),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.lmem_perf (lmem_perf),
`endif
.mem_bus_if (lmem_adapt_if)
);
`else
// No local memory: its perf counters read as zero and every LSU interface
// is assigned straight through to the dcache path.
`ifdef PERF_ENABLE
assign lmem_perf = '0;
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
// Per-block interface after (optional) coalescing, sized for the data cache.
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
// Sum the per-block coalescer miss counters (OP "+") and publish the total
// through the `BUFFER macro as coalescer_perf.misses.
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
VX_reduce_tree #(
.DATAW_IN (`PERF_CTR_BITS),
.DATAW_OUT (`PERF_CTR_BITS),
.N (`NUM_LSU_BLOCKS),
.OP ("+")
) coalescer_reduce (
.data_in (per_block_coalescer_misses),
.data_out (coalescer_misses)
);
`BUFFER(coalescer_perf.misses, coalescer_misses);
`endif
// A coalescer is instantiated only when there is more than one LSU lane and the
// LSU word size differs from the dcache word size; otherwise pass through.
if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
VX_mem_coalescer #(
.INSTANCE_ID (`SFORMATF(("%s-coalescer%0d", INSTANCE_ID, i))),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE),
.PERF_CTR_BITS (`PERF_CTR_BITS)
) mem_coalescer (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.misses (per_block_coalescer_misses[i]),
`else
`UNUSED_PIN (misses),
`endif
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_flags (lsu_dcache_if[i].req_data.flags),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if[i].req_valid),
.out_req_mask (dcache_coalesced_if[i].req_data.mask),
.out_req_rw (dcache_coalesced_if[i].req_data.rw),
.out_req_byteen (dcache_coalesced_if[i].req_data.byteen),
.out_req_addr (dcache_coalesced_if[i].req_data.addr),
.out_req_flags (dcache_coalesced_if[i].req_data.flags),
.out_req_data (dcache_coalesced_if[i].req_data.data),
.out_req_tag (dcache_coalesced_if[i].req_data.tag),
.out_req_ready (dcache_coalesced_if[i].req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if[i].rsp_valid),
.out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask),
.out_rsp_data (dcache_coalesced_if[i].rsp_data.data),
.out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if[i].rsp_ready)
);
end
end else begin : g_passthru
// Pass-through: interfaces are assigned directly and the miss counters are zero.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
`ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
`ifdef PERF_ENABLE
assign per_block_coalescer_misses[i] = '0;
`endif
end
end
// Per block: split the multi-channel interface into DCACHE_CHANNELS buses and
// map them onto the flat dcache_bus_if array.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_adapters
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) dcache_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (dcache_coalesced_if[i]),
.mem_bus_if (dcache_bus_tmp_if)
);
// Block i, channel j -> flat index i * DCACHE_CHANNELS + j.
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin : g_dcache_bus_if
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
endmodule

View file

@ -1,127 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_mem_unit_top: flat-port wrapper around VX_mem_unit.
//
// VX_mem_unit exposes SystemVerilog interface arrays (VX_lsu_mem_if /
// VX_mem_bus_if). This wrapper unpacks them into plain packed-vector ports so
// the unit can be instantiated where interface ports are not usable.
//
// Parameters:
//   INSTANCE_ID    - forwarded unchanged to VX_mem_unit.
//   LSU_WORD_WIDTH - LSU data width in bits (LSU_WORD_SIZE * 8).
//
// Ports:
//   lsu_req_* / lsu_rsp_* - one entry per LSU block (`NUM_LSU_BLOCKS), with a
//                           per-lane dimension (`NUM_LSU_LANES) where applicable.
//   mem_req_* / mem_rsp_* - one entry per dcache request port (DCACHE_NUM_REQS).
module VX_mem_unit_top import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = "",
    parameter LSU_WORD_WIDTH = LSU_WORD_SIZE * 8
) (
    // Clock
    input wire clk,
    input wire reset,

    // LSU memory request
    input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_valid,
    input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_rw,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_req_mask,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_req_byteen,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_req_addr,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_req_flags,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_req_data,
    input wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_req_tag,
    output wire [`NUM_LSU_BLOCKS-1:0] lsu_req_ready,

    // LSU memory response
    output wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_valid,
    output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_rsp_mask,
    output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_rsp_data,
    output wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_rsp_tag,
    input wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_ready,

    // Memory request
    output wire [DCACHE_NUM_REQS-1:0] mem_req_valid,
    output wire [DCACHE_NUM_REQS-1:0] mem_req_rw,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] mem_req_addr,
    output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_req_data,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_req_tag,
    input wire [DCACHE_NUM_REQS-1:0] mem_req_ready,

    // Memory response
    input wire [DCACHE_NUM_REQS-1:0] mem_rsp_valid,
    input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_rsp_data,
    input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_rsp_tag,
    output wire [DCACHE_NUM_REQS-1:0] mem_rsp_ready
);
    // Interface array handed to VX_mem_unit as its LSU-side (slave) port.
    VX_lsu_mem_if #(
        .NUM_LANES (`NUM_LSU_LANES),
        .DATA_SIZE (LSU_WORD_SIZE),
        .TAG_WIDTH (LSU_TAG_WIDTH)
    ) lsu_mem_if[`NUM_LSU_BLOCKS]();

    // LSU memory request: flat input ports -> interface fields
    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_mem_req
        assign lsu_mem_if[i].req_valid = lsu_req_valid[i];
        assign lsu_mem_if[i].req_data.rw = lsu_req_rw[i];
        assign lsu_mem_if[i].req_data.mask = lsu_req_mask[i];
        assign lsu_mem_if[i].req_data.byteen = lsu_req_byteen[i];
        assign lsu_mem_if[i].req_data.addr = lsu_req_addr[i];
        assign lsu_mem_if[i].req_data.flags = lsu_req_flags[i];
        assign lsu_mem_if[i].req_data.data = lsu_req_data[i];
        assign lsu_mem_if[i].req_data.tag = lsu_req_tag[i];
        assign lsu_req_ready[i] = lsu_mem_if[i].req_ready;
    end

    // LSU memory response: interface fields -> flat output ports
    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_rsp
        assign lsu_rsp_valid[i] = lsu_mem_if[i].rsp_valid;
        assign lsu_rsp_mask[i] = lsu_mem_if[i].rsp_data.mask;
        assign lsu_rsp_data[i] = lsu_mem_if[i].rsp_data.data;
        assign lsu_rsp_tag[i] = lsu_mem_if[i].rsp_data.tag;
        assign lsu_mem_if[i].rsp_ready = lsu_rsp_ready[i];
    end

    // Interface array handed to VX_mem_unit as its dcache-side (master) port.
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_TAG_WIDTH)
    ) mem_bus_if[DCACHE_NUM_REQS]();

    // memory request: interface fields -> flat output ports
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_req
        assign mem_req_valid[i] = mem_bus_if[i].req_valid;
        assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
        assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen;
        assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
        assign mem_req_flags[i] = mem_bus_if[i].req_data.flags;
        assign mem_req_data[i] = mem_bus_if[i].req_data.data;
        assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
        assign mem_bus_if[i].req_ready = mem_req_ready[i];
    end

    // memory response: flat input ports -> interface fields
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_bus_rsp
        assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
        assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
        assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
        assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
    end

`ifdef PERF_ENABLE
    // Sinks for VX_mem_unit's perf outputs. The types mirror the port
    // declarations of VX_mem_unit (lmem_perf_t / coalescer_perf_t). There is no
    // declaration initializer because both signals are driven by the instance
    // below. The wrapper has no perf ports, so the values are marked unused.
    lmem_perf_t      lmem_perf;
    coalescer_perf_t coalescer_perf;
    `UNUSED_VAR (lmem_perf)
    `UNUSED_VAR (coalescer_perf)
`endif

    VX_mem_unit #(
        .INSTANCE_ID (INSTANCE_ID)
    ) mem_unit (
        .clk (clk),
        .reset (reset),
    `ifdef PERF_ENABLE
        .lmem_perf (lmem_perf),
        .coalescer_perf (coalescer_perf),
    `endif
        .lsu_mem_if (lsu_mem_if),
        .dcache_bus_if (mem_bus_if)
    );
endmodule

View file

@ -23,7 +23,7 @@
module VX_operands import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_BANKS = 4,
parameter OUT_BUF = 3
parameter OUT_BUF = 4 // using 2-cycle EB for area reduction
) (
input wire clk,
input wire reset,
@ -37,15 +37,15 @@ module VX_operands import VX_gpu_pkg::*; #(
VX_operands_if.master operands_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_OPDS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS);
localparam NUM_SRC_REGS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW;
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
@ -53,80 +53,87 @@ module VX_operands import VX_gpu_pkg::*; #(
`UNUSED_VAR (writeback_if.data.sop)
wire [NUM_SRC_OPDS-1:0] src_valid;
wire [NUM_SRC_OPDS-1:0] req_valid_in, req_ready_in;
wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_data_in;
wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_SRC_REGS-1:0] src_valid;
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st2;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
wire pipe_ready_in;
wire pipe_valid_st1, pipe_ready_st1;
wire pipe_valid_st2, pipe_ready_st2;
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [NUM_SRC_OPDS-1:0][(`NUM_THREADS * `XLEN)-1:0] src_data_st2, src_data_m_st2;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
reg [NUM_SRC_OPDS-1:0] data_fetched_st1;
reg [NUM_SRC_REGS-1:0] data_fetched_n;
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
reg has_collision_n;
wire has_collision_st1;
wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds;
assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, scoreboard_if.data.rs1};
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
scoreboard_if.data.rs1};
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_data_in
if (ISSUE_WIS != 0) begin : g_wis
assign req_data_in[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin : g_no_wis
assign req_data_in[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS];
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
if (ISSUE_WIS != 0) begin
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
end
end
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_bank_idx
if (NUM_BANKS != 1) begin : g_multibanks
assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0];
end else begin : g_singlebank
if (NUM_BANKS != 1) begin
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
end else begin
assign req_bank_idx[i] = '0;
end
end
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_src_valid
assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i];
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
end
assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_OPDS),
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
.reset (reset),
`UNUSED_PIN(collisions),
.valid_in (req_valid_in),
.data_in (req_data_in),
.valid_in (req_in_valid),
.data_in (req_in_data),
.sel_in (req_bank_idx),
.ready_in (req_ready_in),
.ready_in (req_in_ready),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out (gpr_rd_ready)
);
assign gpr_rd_ready = {NUM_BANKS{pipe_ready_in}};
wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1;
assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}};
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
always @(*) begin
has_collision_n = 0;
for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin
for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin
for (integer i = 0; i < NUM_SRC_REGS; ++i) begin
for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin
has_collision_n |= src_valid[i]
&& src_valid[j+i]
&& (req_bank_idx[i] == req_bank_idx[j+i]);
@ -134,7 +141,14 @@ module VX_operands import VX_gpu_pkg::*; #(
end
end
wire [NUM_SRC_OPDS-1:0] req_fire_in = req_valid_in & req_ready_in;
always @(*) begin
data_fetched_n = data_fetched_st1;
if (scoreboard_if.ready) begin
data_fetched_n = '0;
end else begin
data_fetched_n = data_fetched_st1 | req_in_ready;
end
end
assign pipe_data = {
scoreboard_if.data.wis,
@ -148,74 +162,61 @@ module VX_operands import VX_gpu_pkg::*; #(
scoreboard_if.data.uuid
};
assign scoreboard_if.ready = pipe_ready_in && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
VX_pipe_buffer #(
.DATAW (NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH))
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_REGS)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.valid_in (scoreboard_if.valid),
.ready_in (pipe_ready_in),
.data_in ({gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}),
.valid_out(pipe_valid_st1),
.ready_out(pipe_ready_st1)
.enable (pipe_in_ready),
.data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1})
);
always @(posedge clk) begin
if (reset || scoreboard_if.ready) begin
data_fetched_st1 <= 0;
end else begin
data_fetched_st1 <= data_fetched_st1 | req_fire_in;
end
end
assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2;
assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n;
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
VX_pipe_buffer #(
.DATAW (NUM_BANKS * (1 + REQ_SEL_WIDTH) + META_DATAW)
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
) pipe_reg2 (
.clk (clk),
.reset (reset),
.valid_in (pipe_valid2_st1),
.ready_in (pipe_ready_st1),
.data_in ({gpr_rd_valid_st1, gpr_rd_req_idx_st1, pipe_data_st1}),
.data_out ({gpr_rd_valid_st2, gpr_rd_req_idx_st2, pipe_data_st2}),
.valid_out(pipe_valid_st2),
.ready_out(pipe_ready_st2)
.reset (pipe2_reset),
.enable (pipe_ready_st1),
.data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
.data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2})
);
always @(*) begin
src_data_m_st2 = src_data_st2;
src_data_n = src_data_st2;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid_st2[b]) begin
src_data_m_st2[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
end
end
end
always @(posedge clk) begin
if (reset || pipe_fire_st2) begin
src_data_st2 <= 0;
end else begin
src_data_st2 <= src_data_m_st2;
end
end
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (1)
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (pipe_valid_st2),
.ready_in (pipe_ready_st2),
.data_in ({pipe_data_st2, src_data_m_st2}),
.data_in ({
pipe_data_st2,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.data_out ({
operands_if.data.wis,
operands_if.data.tmask,
@ -226,39 +227,51 @@ module VX_operands import VX_gpu_pkg::*; #(
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.uuid,
operands_if.data.rs3_data,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs1_data
operands_if.data.rs3_data
}),
.valid_out (operands_if.valid),
.ready_out (operands_if.ready)
);
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin : g_gpr_wr_addr
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
end else begin : g_gpr_wr_addr_no_wis
end else begin
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
end
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
if (NUM_BANKS != 1) begin : g_gpr_wr_bank_idx
if (NUM_BANKS != 1) begin
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
end else begin : g_gpr_wr_bank_idx_0
end else begin
assign gpr_wr_bank_idx = '0;
end
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rams
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
for (genvar b = 0; b < NUM_BANKS; ++b) begin
wire gpr_wr_enabled;
if (BANK_SEL_BITS != 0) begin : g_gpr_wr_enabled_multibanks
assign gpr_wr_enabled = writeback_if.valid
if (BANK_SEL_BITS != 0) begin
assign gpr_wr_enabled = wr_enabled
&& writeback_if.valid
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
end else begin : g_gpr_wr_enabled
assign gpr_wr_enabled = writeback_if.valid;
end else begin
assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
end
wire [BYTEENW-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_wren
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
@ -269,8 +282,7 @@ module VX_operands import VX_gpu_pkg::*; #(
`ifdef GPR_RESET
.RESET_RAM (1),
`endif
.OUT_REG (1),
.RDW_MODE ("R")
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.reset (reset),
@ -280,7 +292,7 @@ module VX_operands import VX_gpu_pkg::*; #(
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr_st1[b]),
.rdata (gpr_rd_data_st2[b])
.rdata (gpr_rd_data_st1[b])
);
end
@ -290,7 +302,7 @@ module VX_operands import VX_gpu_pkg::*; #(
if (reset) begin
collisions_r <= '0;
end else begin
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_ready_in && has_collision_n);
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_in_ready && has_collision_n);
end
end
assign perf_stalls = collisions_r;

View file

@ -1,93 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_pe_switch: fans one execute request stream out to PE_COUNT processing
// elements and merges their commit responses back into a single stream.
//
// Parameters:
//   PE_COUNT    - number of processing elements (size of the *_out_if / *_in_if arrays).
//   NUM_LANES   - lanes per request; used in the data widths and PID_BITS.
//   REQ_OUT_BUF - OUT_BUF setting passed to the request VX_stream_switch.
//   RSP_OUT_BUF - OUT_BUF setting passed to the response VX_stream_arb.
//   ARBITER     - arbiter type string passed to the response VX_stream_arb.
//   PE_SEL_BITS - width of the selector, CLOG2(PE_COUNT).
//   NOTE(review): PE_COUNT and NUM_LANES default to 0, presumably placeholders
//   that every instantiation must override - confirm.
//
// Ports:
//   pe_sel         - selects which PE output receives the incoming request.
//   execute_in_if  - incoming request stream.
//   commit_out_if  - merged response stream.
//   execute_out_if - per-PE request streams.
//   commit_in_if   - per-PE response streams.
module VX_pe_switch import VX_gpu_pkg::*; #(
parameter PE_COUNT = 0,
parameter NUM_LANES = 0,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter `STRING ARBITER = "R",
parameter PE_SEL_BITS = `CLOG2(PE_COUNT)
) (
input wire clk,
input wire reset,
input wire [`UP(PE_SEL_BITS)-1:0] pe_sel,
VX_execute_if.slave execute_in_if,
VX_commit_if.master commit_out_if,
VX_execute_if.master execute_out_if[PE_COUNT],
VX_commit_if .slave commit_in_if[PE_COUNT]
);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// Flattened payload widths, written as a sum of field widths. Presumably these
// must equal the packed widths of execute_if.data and commit_if.data, since
// those structs are assigned to and from vectors of this size below - TODO confirm.
localparam REQ_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `INST_ALU_BITS + $bits(op_args_t) + 1 + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam RSP_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
// Request path: one input, PE_COUNT outputs, selected by pe_sel.
wire [PE_COUNT-1:0] pe_req_valid;
wire [PE_COUNT-1:0][REQ_DATAW-1:0] pe_req_data;
wire [PE_COUNT-1:0] pe_req_ready;
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_INPUTS (1),
.NUM_OUTPUTS (PE_COUNT),
.OUT_BUF (REQ_OUT_BUF)
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (pe_sel),
.valid_in (execute_in_if.valid),
.ready_in (execute_in_if.ready),
.data_in (execute_in_if.data),
.data_out (pe_req_data),
.valid_out (pe_req_valid),
.ready_out (pe_req_ready)
);
// Unpack the switch's vector outputs onto the per-PE execute interfaces.
for (genvar i = 0; i < PE_COUNT; ++i) begin : g_execute_out_if
assign execute_out_if[i].valid = pe_req_valid[i];
assign execute_out_if[i].data = pe_req_data[i];
assign pe_req_ready[i] = execute_out_if[i].ready;
end
///////////////////////////////////////////////////////////////////////////
// Response path: PE_COUNT commit inputs arbitrated into commit_out_if.
wire [PE_COUNT-1:0] pe_rsp_valid;
wire [PE_COUNT-1:0][RSP_DATAW-1:0] pe_rsp_data;
wire [PE_COUNT-1:0] pe_rsp_ready;
// Pack the per-PE commit interfaces into vectors for the arbiter.
for (genvar i = 0; i < PE_COUNT; ++i) begin : g_commit_in_if
assign pe_rsp_valid[i] = commit_in_if[i].valid;
assign pe_rsp_data[i] = commit_in_if[i].data;
assign commit_in_if[i].ready = pe_rsp_ready[i];
end
VX_stream_arb #(
.NUM_INPUTS (PE_COUNT),
.DATAW (RSP_DATAW),
.ARBITER (ARBITER),
.OUT_BUF (RSP_OUT_BUF)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (pe_rsp_valid),
.ready_in (pe_rsp_ready),
.data_in (pe_rsp_data),
.data_out (commit_out_if.data),
.valid_out (commit_out_if.valid),
.ready_out (commit_out_if.ready),
// The index of the granted input is not needed here.
`UNUSED_PIN (sel_out)
);
endmodule

View file

@ -68,6 +68,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] cycles;
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
wire schedule_fire = schedule_valid && schedule_ready;
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
@ -76,7 +78,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
wire [`NUM_ALU_BLOCKS-1:0][`PC_BITS-1:0] branch_dest;
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin : g_branch_init
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
assign branch_valid[i] = branch_ctl_if[i].valid;
assign branch_wid[i] = branch_ctl_if[i].wid;
assign branch_taken[i] = branch_ctl_if[i].taken;
@ -111,16 +113,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
barrier_stalls_n= barrier_stalls;
warp_pcs_n = warp_pcs;
// decode unlock
if (decode_sched_if.valid && decode_sched_if.unlock) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// wspawn handling
if (wspawn.valid && is_single_warp) begin
active_warps_n |= wspawn.wmask;
@ -178,11 +170,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
end
`ifdef GBAR_ENABLE
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
barrier_masks_n[gbar_bus_if.rsp_data.id] = '0; // reset barrier mask
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
stalled_warps_n = '0; // unlock all warps
end
`endif
@ -197,6 +188,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
// decode unlock
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// stall the warp until decode stage
if (schedule_fire) begin
stalled_warps_n[schedule_wid] = 1;
@ -222,6 +223,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
active_warps <= '0;
thread_masks <= '0;
barrier_stalls <= '0;
issued_instrs <= '0;
cycles <= '0;
wspawn.valid <= 0;
@ -266,6 +268,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
`endif
if (schedule_if_fire) begin
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
end
if (busy) begin
cycles <= cycles + 1;
end
@ -275,19 +281,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
// barrier handling
`ifdef GBAR_ENABLE
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_data.id = gbar_req_id;
assign gbar_bus_if.req_data.size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_data.core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_id = gbar_req_id;
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif
// split/join handling
`RESET_RELAY (split_join_reset, reset);
VX_split_join #(
.INSTANCE_ID (`SFORMATF(("%s-splitjoin", INSTANCE_ID)))
.INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID))
) split_join (
.clk (clk),
.reset (reset),
.reset (split_join_reset),
.valid (warp_ctl_if.valid),
.wid (warp_ctl_if.wid),
.split (warp_ctl_if.split),
@ -316,7 +324,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
);
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `PC_BITS)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_schedule_data
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
end
@ -325,50 +333,67 @@ module VX_schedule import VX_gpu_pkg::*; #(
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-5:0]
};
wire [`UUID_WIDTH-1:0] instr_uuid;
`ifdef UUID_ENABLE
VX_uuid_gen #(
.CORE_ID (CORE_ID),
.UUID_WIDTH (`UUID_WIDTH)
) uuid_gen (
.clk (clk),
.reset (reset),
.incr (schedule_fire),
.wid (schedule_wid),
.uuid (instr_uuid)
);
`ifndef NDEBUG
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
`ifdef SV_DPI
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 32'd0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid)));
end
end
`else
assign instr_uuid = '0;
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
always @(*) begin
instr_uuid = `UUID_WIDTH'(w_uuid);
end
`endif
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
`endif
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH + `UUID_WIDTH),
.SIZE (2), // need to buffer out ready_in
.OUT_REG (1) // should be registered for BRAM access in fetch unit
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH)
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (schedule_valid),
.ready_in (schedule_ready),
.data_in ({schedule_tmask, schedule_pc, schedule_wid, instr_uuid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.uuid}),
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
.valid_out (schedule_if.valid),
.ready_out (schedule_if.ready)
);
assign schedule_if.data.uuid = instr_uuid;
// Track pending instructions per warp
reg [`NUM_WARPS-1:0] per_warp_incr;
always @(*) begin
per_warp_incr = 0;
if (schedule_if_fire) begin
per_warp_incr[schedule_if.data.wid] = 1;
end
end
wire [`NUM_WARPS-1:0] pending_warp_empty;
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_sizes
`RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #(
.SIZE (4096),
.ALM_EMPTY (1)
) counter (
.clk (clk),
.reset (reset),
.incr (schedule_if_fire && (schedule_if.data.wid == `NW_WIDTH'(i))),
.reset (pending_instr_reset[i]),
.incr (per_warp_incr[i]),
.decr (commit_sched_if.committed_warps[i]),
.empty (pending_warp_empty[i]),
.alm_empty (pending_warp_alm_empty[i]),
@ -382,7 +407,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire no_pending_instr = (& pending_warp_empty);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1, 1);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
// export CSRs
assign sched_csr_if.cycles = cycles;
@ -397,7 +422,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
timeout_ctr <= '0;
timeout_enable <= 0;
end else begin
if (decode_sched_if.valid && decode_sched_if.unlock) begin
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
timeout_enable <= 1;
end
if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin

View file

@ -30,8 +30,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_OPDS = 3;
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
@ -44,7 +42,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
VX_reduce_tree #(
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
@ -53,7 +51,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_units_per_cycle)
);
VX_reduce_tree #(
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
@ -62,17 +60,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_sfu_per_cycle)
);
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stg_valid_in
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign stg_valid_in[w] = staging_if[w].valid;
end
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
always @(posedge clk) begin : g_perf_stalls
always @(posedge clk) begin
if (reset) begin
perf_stalls <= '0;
end else begin
@ -80,7 +78,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_units_uses
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_units_uses[i] <= '0;
@ -90,7 +88,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_perf_sfu_uses
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_sfu_uses[i] <= '0;
@ -101,9 +99,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stanging_bufs
VX_pipe_buffer #(
.DATAW (DATAW)
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
) stanging_buf (
.clk (clk),
.reset (reset),
@ -116,10 +115,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
);
end
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
reg [`NUM_REGS-1:0] inuse_regs;
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
reg [3:0] operands_busy, operands_busy_n;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
@ -129,10 +128,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
@ -140,36 +135,86 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
always @(*) begin
perf_inuse_units_per_cycle[w] = '0;
perf_inuse_sfu_per_cycle[w] = '0;
for (integer i = 0; i < NUM_OPDS; ++i) begin
if (staging_if[w].valid && operands_busy[i]) begin
perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1;
if (inuse_units[stg_opds[i]] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1;
if (staging_if[w].valid) begin
if (operands_busy[0]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
end
end
if (operands_busy[1]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
end
end
if (operands_busy[2]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
end
end
if (operands_busy[3]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
end
end
end
end
`endif
for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n
always @(*) begin
operands_busy_n[i] = operands_busy[i];
always @(*) begin
operands_busy_n = operands_busy;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[w].data.rs3],
inuse_regs[ibuffer_if[w].data.rs2],
inuse_regs[ibuffer_if[w].data.rs1],
inuse_regs[ibuffer_if[w].data.rd]
};
end
if (writeback_fire) begin
if (ibuffer_fire) begin
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 1;
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if.data.rd == staging_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 0;
end
end else begin
if (writeback_if.data.rd == stg_opds[i]) begin
operands_busy_n[i] = 0;
end
end
end
if (staging_fire && staging_if[w].data.wb) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 1;
end
end
end
@ -185,10 +230,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs[staging_if[w].data.rd] <= 1;
end
end
operands_busy <= operands_busy_n;
operands_ready[w] <= ~(| operands_busy_n);
`ifdef PERF_ENABLE
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
@ -208,9 +251,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end else begin
if (staging_if[w].valid && ~staging_if[w].ready) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(4, ("%t: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid))
operands_busy, staging_if[w].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_fire) begin
@ -222,11 +265,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid))
operands_busy, staging_if[w].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid))
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
`endif
end
@ -235,20 +278,23 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_arb_data_in
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
assign arb_data_in[w] = staging_if[w].data;
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("C"),
.OUT_BUF (3)
.ARBITER ("F"),
.LUTRAM (1),
.OUT_BUF (4) // using 2-cycle EB for area reduction
) out_arb (
.clk (clk),
.reset (reset),
.reset (arb_reset),
.valid_in (arb_valid_in),
.ready_in (arb_ready_in),
.data_in (arb_data_in),

View file

@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
input base_dcrs_t base_dcrs,
@ -41,25 +41,24 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_warp_ctl_if.master warp_ctl_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PE_COUNT = 2;
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
localparam PE_IDX_WCTL = 0;
localparam PE_IDX_CSRS = 1;
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `PC_BITS + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSRS = 1;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (3)
.OUT_BUF (1)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -67,62 +66,65 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.execute_if (per_block_execute_if)
);
wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in;
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
// Warp control block
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) pe_execute_if[PE_COUNT]();
) wctl_execute_if();
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) pe_commit_if[PE_COUNT]();
) wctl_commit_if();
reg [PE_SEL_BITS-1:0] pe_select;
always @(*) begin
pe_select = PE_IDX_WCTL;
if (`INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type))
pe_select = PE_IDX_CSRS;
end
assign wctl_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_WCTL(per_block_execute_if[0].data.op_type);
assign wctl_execute_if.data = per_block_execute_if[0].data;
VX_pe_switch #(
.PE_COUNT (PE_COUNT),
.NUM_LANES (NUM_LANES),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(3)
) pe_switch (
.clk (clk),
.reset (reset),
.pe_sel (pe_select),
.execute_in_if (per_block_execute_if[0]),
.commit_out_if (per_block_commit_if[0]),
.execute_out_if (pe_execute_if),
.commit_in_if (pe_commit_if)
);
`RESET_RELAY (wctl_reset, reset);
VX_wctl_unit #(
.INSTANCE_ID (`SFORMATF(("%s-wctl", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)),
.NUM_LANES (NUM_LANES)
) wctl_unit (
.clk (clk),
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_WCTL]),
.reset (wctl_reset),
.execute_if (wctl_execute_if),
.warp_ctl_if(warp_ctl_if),
.commit_if (pe_commit_if[PE_IDX_WCTL])
.commit_if (wctl_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.data;
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
// CSR unit
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) csr_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) csr_commit_if();
assign csr_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type);
assign csr_execute_if.data = per_block_execute_if[0].data;
`RESET_RELAY (csr_reset, reset);
VX_csr_unit #(
.INSTANCE_ID (`SFORMATF(("%s-csr", INSTANCE_ID))),
.INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)),
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) csr_unit (
.clk (clk),
.reset (reset),
.reset (csr_reset),
.base_dcrs (base_dcrs),
.execute_if (pe_execute_if[PE_IDX_CSRS]),
.execute_if (csr_execute_if),
`ifdef PERF_ENABLE
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
`ifdef EXT_F_ENABLE
@ -131,7 +133,47 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.sched_csr_if (sched_csr_if),
.commit_csr_if (commit_csr_if),
.commit_if (pe_commit_if[PE_IDX_CSRS])
.commit_if (csr_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
// can accept new request?
reg sfu_req_ready;
always @(*) begin
case (per_block_execute_if[0].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready;
endcase
end
assign per_block_execute_if[0].ready = sfu_req_ready;
// response arbitration
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) arb_commit_if[BLOCK_SIZE]();
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("R"),
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),
.data_out (arb_commit_if[0].data),
.valid_out (arb_commit_if[0].valid),
.ready_out (arb_commit_if[0].ready),
`UNUSED_PIN (sel_out)
);
VX_gather_unit #(
@ -139,9 +181,9 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES),
.OUT_BUF (3)
) gather_unit (
.clk (clk),
.reset (reset),
.commit_in_if (per_block_commit_if),
.clk (clk),
.reset (reset),
.commit_in_if (arb_commit_if),
.commit_out_if (commit_if)
);

View file

@ -45,13 +45,16 @@ module VX_split_join import VX_gpu_pkg::*; #(
wire ipdom_push = valid && split.valid && split.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin_is_dvg;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_ipdom_stacks
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (ipdom_reset, reset);
VX_ipdom_stack #(
.WIDTH (`NUM_THREADS+`PC_BITS),
.DEPTH (`DV_STACK_SIZE)
) ipdom_stack (
.clk (clk),
.reset (reset),
.reset (ipdom_reset),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),

399
hw/rtl/core/VX_trace_pkg.sv Normal file
View file

@ -0,0 +1,399 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TRACE_PKG_VH
`define VX_TRACE_PKG_VH
`include "VX_define.vh"
package VX_trace_pkg;
`ifdef SIMULATION
`ifdef SV_DPI
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
`endif
import VX_gpu_pkg::*;
// Emits, through `TRACE at the given level, the short name of the execution
// unit class encoded in ex_type: "ALU", "LSU", "FPU" or "SFU".
// Any other value is reported as "?".
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
    if (ex_type == `EX_ALU) begin
        `TRACE(level, ("ALU"));
    end else if (ex_type == `EX_LSU) begin
        `TRACE(level, ("LSU"));
    end else if (ex_type == `EX_FPU) begin
        `TRACE(level, ("FPU"));
    end else if (ex_type == `EX_SFU) begin
        `TRACE(level, ("SFU"));
    end else begin
        `TRACE(level, ("?"));
    end
endtask
// Emits, through `TRACE at the given level, the mnemonic of the operation
// described by (ex_type, op_type, op_args).
//   level   - trace level forwarded unchanged to every `TRACE call.
//   ex_type - selects which decode table below is used (ALU/LSU/FPU/SFU).
//   op_type - operation code; it is cast to the width of the selected
//             table (`INST_ALU_BITS, `INST_BR_BITS, ...) before matching.
//   op_args - per-unit argument struct; only the member matching ex_type
//             (alu / lsu / fpu / wctl / csr) is read.
// Encodings that match no table entry are reported as "?". The two
// `INST_FPU_MISC tables list frm values 0..7 and have no default branch.
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
// ---- ALU: sub-decoded by op_args.alu.xtype (arith / branch / muldiv) ----
`EX_ALU: begin
case (op_args.alu.xtype)
`ALU_TYPE_ARITH: begin
// is_w selects the "W"-suffixed names, use_imm selects the immediate forms
if (op_args.alu.is_w) begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDIW"));
`INST_ALU_SLL: `TRACE(level, ("SLLIW"));
`INST_ALU_SRL: `TRACE(level, ("SRLIW"));
`INST_ALU_SRA: `TRACE(level, ("SRAIW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDW"));
`INST_ALU_SUB: `TRACE(level, ("SUBW"));
`INST_ALU_SLL: `TRACE(level, ("SLLW"));
`INST_ALU_SRL: `TRACE(level, ("SRLW"));
`INST_ALU_SRA: `TRACE(level, ("SRAW"));
default: `TRACE(level, ("?"));
endcase
end
end else begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDI"));
`INST_ALU_SLL: `TRACE(level, ("SLLI"));
`INST_ALU_SRL: `TRACE(level, ("SRLI"));
`INST_ALU_SRA: `TRACE(level, ("SRAI"));
`INST_ALU_SLT: `TRACE(level, ("SLTI"));
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"));
`INST_ALU_XOR: `TRACE(level, ("XORI"));
`INST_ALU_OR: `TRACE(level, ("ORI"));
`INST_ALU_AND: `TRACE(level, ("ANDI"));
`INST_ALU_LUI: `TRACE(level, ("LUI"));
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADD"));
`INST_ALU_SUB: `TRACE(level, ("SUB"));
`INST_ALU_SLL: `TRACE(level, ("SLL"));
`INST_ALU_SRL: `TRACE(level, ("SRL"));
`INST_ALU_SRA: `TRACE(level, ("SRA"));
`INST_ALU_SLT: `TRACE(level, ("SLT"));
`INST_ALU_SLTU: `TRACE(level, ("SLTU"));
`INST_ALU_XOR: `TRACE(level, ("XOR"));
`INST_ALU_OR: `TRACE(level, ("OR"));
`INST_ALU_AND: `TRACE(level, ("AND"));
`INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"));
`INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"));
default: `TRACE(level, ("?"));
endcase
end
end
end
`ALU_TYPE_BRANCH: begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: `TRACE(level, ("BEQ"));
`INST_BR_NE: `TRACE(level, ("BNE"));
`INST_BR_LT: `TRACE(level, ("BLT"));
`INST_BR_GE: `TRACE(level, ("BGE"));
`INST_BR_LTU: `TRACE(level, ("BLTU"));
`INST_BR_GEU: `TRACE(level, ("BGEU"));
`INST_BR_JAL: `TRACE(level, ("JAL"));
`INST_BR_JALR: `TRACE(level, ("JALR"));
`INST_BR_ECALL: `TRACE(level, ("ECALL"));
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"));
`INST_BR_URET: `TRACE(level, ("URET"));
`INST_BR_SRET: `TRACE(level, ("SRET"));
`INST_BR_MRET: `TRACE(level, ("MRET"));
default: `TRACE(level, ("?"));
endcase
end
`ALU_TYPE_MULDIV: begin
// the is_w table lists fewer operations than the non-W table (no MULH* entries)
if (op_args.alu.is_w) begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MULW"));
`INST_M_DIV: `TRACE(level, ("DIVW"));
`INST_M_DIVU: `TRACE(level, ("DIVUW"));
`INST_M_REM: `TRACE(level, ("REMW"));
`INST_M_REMU: `TRACE(level, ("REMUW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MUL"));
`INST_M_MULH: `TRACE(level, ("MULH"));
`INST_M_MULHSU:`TRACE(level, ("MULHSU"));
`INST_M_MULHU: `TRACE(level, ("MULHU"));
`INST_M_DIV: `TRACE(level, ("DIV"));
`INST_M_DIVU: `TRACE(level, ("DIVU"));
`INST_M_REM: `TRACE(level, ("REM"));
`INST_M_REMU: `TRACE(level, ("REMU"));
default: `TRACE(level, ("?"));
endcase
end
end
default: `TRACE(level, ("?"));
endcase
end
// ---- LSU: op_args.lsu.is_float selects the F-prefixed load/store names ----
`EX_LSU: begin
if (op_args.lsu.is_float) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LW: `TRACE(level, ("FLW"));
`INST_LSU_LD: `TRACE(level, ("FLD"));
`INST_LSU_SW: `TRACE(level, ("FSW"));
`INST_LSU_SD: `TRACE(level, ("FSD"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LB: `TRACE(level, ("LB"));
`INST_LSU_LH: `TRACE(level, ("LH"));
`INST_LSU_LW: `TRACE(level, ("LW"));
`INST_LSU_LD: `TRACE(level, ("LD"));
`INST_LSU_LBU:`TRACE(level, ("LBU"));
`INST_LSU_LHU:`TRACE(level, ("LHU"));
`INST_LSU_LWU:`TRACE(level, ("LWU"));
`INST_LSU_SB: `TRACE(level, ("SB"));
`INST_LSU_SH: `TRACE(level, ("SH"));
`INST_LSU_SW: `TRACE(level, ("SW"));
`INST_LSU_SD: `TRACE(level, ("SD"));
`INST_LSU_FENCE:`TRACE(level,("FENCE"));
default: `TRACE(level, ("?"));
endcase
end
end
// ---- FPU: fmt[0] selects the ".D" (set) or ".S" (clear) names; for the
// integer conversions fmt[1] selects the L/LU (set) or W/WU (clear) names ----
`EX_FPU: begin
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FADD.D"));
else
`TRACE(level, ("FADD.S"));
end
`INST_FPU_SUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FSUB.D"));
else
`TRACE(level, ("FSUB.S"));
end
`INST_FPU_MUL: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMUL.D"));
else
`TRACE(level, ("FMUL.S"));
end
`INST_FPU_DIV: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FDIV.D"));
else
`TRACE(level, ("FDIV.S"));
end
`INST_FPU_SQRT: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FSQRT.D"));
else
`TRACE(level, ("FSQRT.S"));
end
`INST_FPU_MADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMADD.D"));
else
`TRACE(level, ("FMADD.S"));
end
`INST_FPU_MSUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMSUB.D"));
else
`TRACE(level, ("FMSUB.S"));
end
`INST_FPU_NMADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FNMADD.D"));
else
`TRACE(level, ("FNMADD.S"));
end
`INST_FPU_NMSUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FNMSUB.D"));
else
`TRACE(level, ("FNMSUB.S"));
end
`INST_FPU_CMP: begin
// the comparison kind is taken from the low two bits of frm
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.D"));
1: `TRACE(level, ("FLT.D"));
2: `TRACE(level, ("FEQ.D"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.S"));
1: `TRACE(level, ("FLT.S"));
2: `TRACE(level, ("FEQ.S"));
default: `TRACE(level, ("?"));
endcase
end
end
`INST_FPU_F2F: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FCVT.D.S"));
end else begin
`TRACE(level, ("FCVT.S.D"));
end
end
`INST_FPU_F2I: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.D"));
end else begin
`TRACE(level, ("FCVT.W.D"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.S"));
end else begin
`TRACE(level, ("FCVT.W.S"));
end
end
end
`INST_FPU_F2U: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.D"));
end else begin
`TRACE(level, ("FCVT.WU.D"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.S"));
end else begin
`TRACE(level, ("FCVT.WU.S"));
end
end
end
`INST_FPU_I2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.L"));
end else begin
`TRACE(level, ("FCVT.D.W"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.L"));
end else begin
`TRACE(level, ("FCVT.S.W"));
end
end
end
`INST_FPU_U2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.LU"));
end else begin
`TRACE(level, ("FCVT.D.WU"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.LU"));
end else begin
`TRACE(level, ("FCVT.S.WU"));
end
end
end
`INST_FPU_MISC: begin
// here the full frm field selects the operation; values 0..7 are listed
// and there is no default branch
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.D"));
1: `TRACE(level, ("FSGNJN.D"));
2: `TRACE(level, ("FSGNJX.D"));
3: `TRACE(level, ("FCLASS.D"));
4: `TRACE(level, ("FMV.X.D"));
5: `TRACE(level, ("FMV.D.X"));
6: `TRACE(level, ("FMIN.D"));
7: `TRACE(level, ("FMAX.D"));
endcase
end else begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.S"));
1: `TRACE(level, ("FSGNJN.S"));
2: `TRACE(level, ("FSGNJX.S"));
3: `TRACE(level, ("FCLASS.S"));
4: `TRACE(level, ("FMV.X.S"));
5: `TRACE(level, ("FMV.S.X"));
6: `TRACE(level, ("FMIN.S"));
7: `TRACE(level, ("FMAX.S"));
endcase
end
end
default: `TRACE(level, ("?"));
endcase
end
// ---- SFU: warp-control and CSR operations; wctl.is_neg selects the ".N"
// names, csr.use_imm selects the "I"-suffixed CSR names ----
`EX_SFU: begin
case (`INST_SFU_BITS'(op_type))
`INST_SFU_TMC: `TRACE(level, ("TMC"));
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
`INST_SFU_SPLIT: begin if (op_args.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
`INST_SFU_BAR: `TRACE(level, ("BAR"));
`INST_SFU_PRED: begin if (op_args.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
`INST_SFU_CSRRW: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
`INST_SFU_CSRRS: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
`INST_SFU_CSRRC: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
default: `TRACE(level, ("?"));
endcase
end
default: `TRACE(level, ("?"));
endcase
endtask
// Emits, through `TRACE at the given level, the op_args fields that belong
// to the execution unit selected by ex_type:
//   ALU - use_PC, use_imm, imm
//   LSU - offset
//   FPU - fmt, frm
//   SFU - addr, use_imm, imm, only when `INST_SFU_IS_CSR(op_type) holds
// Nothing is emitted for any other ex_type value.
task trace_op_args(input int level,
                   input [`EX_BITS-1:0] ex_type,
                   input [`INST_OP_BITS-1:0] op_type,
                   input VX_gpu_pkg::op_args_t op_args
);
    if (ex_type == `EX_ALU) begin
        `TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm));
    end else if (ex_type == `EX_LSU) begin
        `TRACE(level, (", offset=0x%0h", op_args.lsu.offset));
    end else if (ex_type == `EX_FPU) begin
        `TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm));
    end else if (ex_type == `EX_SFU) begin
        // only the CSR operations carry arguments worth printing
        if (`INST_SFU_IS_CSR(op_type)) begin
            `TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm));
        end
    end
endtask
// Emits, through `TRACE at the given level, the symbolic name of the base
// DCR located at addr. Addresses that match none of the
// `VX_DCR_BASE_* constants tested below are reported as "?".
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
    if (addr == `VX_DCR_BASE_STARTUP_ADDR0) begin
        `TRACE(level, ("STARTUP_ADDR0"));
    end else if (addr == `VX_DCR_BASE_STARTUP_ADDR1) begin
        `TRACE(level, ("STARTUP_ADDR1"));
    end else if (addr == `VX_DCR_BASE_STARTUP_ARG0) begin
        `TRACE(level, ("STARTUP_ARG0"));
    end else if (addr == `VX_DCR_BASE_STARTUP_ARG1) begin
        `TRACE(level, ("STARTUP_ARG1"));
    end else if (addr == `VX_DCR_BASE_MPM_CLASS) begin
        `TRACE(level, ("MPM_CLASS"));
    end else begin
        `TRACE(level, ("?"));
    end
endtask
`endif
endpackage
`endif // VX_TRACE_PKG_VH

View file

@ -1,44 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Per-warp UUID generator.
//
// The output UUID is the concatenation {g_wid, counter}:
//   - g_wid   : (CORE_ID << `NW_BITS) + wid, UUID_WIDTH-32 bits wide
//   - counter : 32-bit count kept separately for each warp
//
// Parameters:
//   CORE_ID    - core index folded into the upper UUID bits
//   UUID_WIDTH - total UUID width; the low 32 bits hold the counter
// Ports:
//   clk, reset - clock and synchronous reset
//   incr       - advance the counter of warp `wid` on this clock edge
//   wid        - warp whose UUID is read (and advanced when incr is set)
//   uuid       - current UUID of warp `wid` (combinational)
module VX_uuid_gen import VX_gpu_pkg::*; #(
    parameter CORE_ID    = 0,
    parameter UUID_WIDTH = 48
) (
    input  wire                   clk,
    input  wire                   reset,
    input  wire                   incr,
    input  wire [`NW_WIDTH-1:0]   wid,
    output wire [UUID_WIDTH-1:0]  uuid
);
    localparam GNW_WIDTH = UUID_WIDTH - 32;

    // Counter storage is not cleared by reset; the per-warp flag below
    // records whether an entry has been written since the last reset.
    reg [31:0]            uuid_cntrs [0:`NUM_WARPS-1];
    reg [`NUM_WARPS-1:0]  has_uuid_cntrs;

    // State of the currently addressed warp
    wire        wid_seen = has_uuid_cntrs[wid];
    wire [31:0] wid_cntr = uuid_cntrs[wid];

    // Initialization flags: cleared on reset, set on a warp's first increment
    always @(posedge clk) begin
        if (reset) begin
            has_uuid_cntrs <= '0;
        end else if (incr) begin
            has_uuid_cntrs[wid] <= 1'b1;
        end
    end

    // Counters: the first increment after reset loads 1, later ones add 1.
    // This write is intentionally not gated by reset.
    always @(posedge clk) begin
        if (incr) begin
            uuid_cntrs[wid] <= wid_seen ? (wid_cntr + 32'd1) : 32'd1;
        end
    end

    // Global warp id: core index in the upper bits, local warp id below
    wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid);

    // A warp that has not incremented since reset reads a zero counter
    assign uuid = {g_wid, (wid_seen ? wid_cntr : 32'd0)};

endmodule

View file

@ -50,9 +50,9 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin : g_tid
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin : g_no_tid
end else begin
assign tid = 0;
end
@ -63,7 +63,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire not_pred = execute_if.data.op_args.wctl.is_neg;
wire [NUM_LANES-1:0] taken;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_taken
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign taken[i] = (execute_if.data.rs1_data[i][0] ^ not_pred);
end
@ -131,7 +131,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
// wspawn
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_wspawn_wmask
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
end
assign wspawn.valid = is_wspawn;
@ -162,7 +162,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign warp_ctl_if.sjoin = sjoin_r;
assign warp_ctl_if.barrier = barrier_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit_if
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = `XLEN'(dvstack_ptr);
end

View file

@ -1,17 +1,17 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Modified port of cast module from fpnew Library
// Modified port of cast module from fpnew Library
// reference: https://github.com/pulp-platform/fpnew
`include "VX_fpu_define.vh"
@ -22,8 +22,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
parameter LATENCY = 1,
parameter INT_WIDTH = 32,
parameter MAN_BITS = 23,
parameter EXP_BITS = 8,
parameter OUT_REG = 0
parameter EXP_BITS = 8
) (
input wire clk,
input wire reset,
@ -36,10 +35,10 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
input wire is_signed,
input wire [31:0] dataa,
output wire [31:0] result,
output wire [31:0] result,
output wire [`FP_FLAGS_BITS-1:0] fflags
);
);
// Constants
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
@ -56,11 +55,11 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
localparam FMT_SHIFT_COMPENSATION = S_MAN_WIDTH - 1 - MAN_BITS;
localparam NUM_FP_STICKY = 2 * S_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
localparam NUM_INT_STICKY = 2 * S_MAN_WIDTH - INT_WIDTH; // removed int and R
// Input processing
fclass_t fclass;
VX_fp_classifier #(
fclass_t fclass;
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_classifier (
@ -70,9 +69,9 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
);
wire [S_MAN_WIDTH-1:0] input_mant;
wire [S_EXP_WIDTH-1:0] input_exp;
wire [S_EXP_WIDTH-1:0] input_exp;
wire input_sign;
wire i2f_sign = dataa[INT_WIDTH-1];
wire f2i_sign = dataa[INT_WIDTH-1] && is_signed;
wire [S_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa) : dataa;
@ -82,7 +81,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
assign input_sign = is_itof ? f2i_sign : i2f_sign;
// Pipeline stage0
wire is_itof_s0;
wire is_signed_s0;
wire [2:0] rnd_mode_s0;
@ -93,7 +92,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH),
.DEPTH (LATENCY > 1)
.DEPTH (LATENCY > 2)
) pipe_reg0 (
.clk (clk),
.reset (reset),
@ -101,7 +100,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_in ({is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
);
// Normalization
wire [LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
@ -114,12 +113,12 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_out (renorm_shamt_s0),
.valid_out (mant_is_nonzero_s0)
);
wire mant_is_zero_s0 = ~mant_is_nonzero_s0;
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
wire [S_EXP_WIDTH-1:0] input_exp_n_s0; // unbiased true exponent
// Realign input mantissa, append zeroes if destination is wider
assign input_mant_n_s0 = encoded_mant_s0 << renorm_shamt_s0;
@ -141,7 +140,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH),
.DEPTH (LATENCY > 2)
.DEPTH (LATENCY > 1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
@ -170,30 +169,30 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
wire of_before_round_s1 = overflow;
// Pipeline stage2
wire is_itof_s2;
wire is_signed_s2;
wire [2:0] rnd_mode_s2;
fclass_t fclass_s2;
fclass_t fclass_s2;
wire mant_is_zero_s2;
wire input_sign_s2;
wire [2*S_MAN_WIDTH:0] destination_mant_s2;
wire [EXP_BITS-1:0] final_exp_s2;
wire of_before_round_s2;
VX_pipe_register #(
.DATAW (1 + 1 + `INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1),
.DEPTH (LATENCY > 0)
.DEPTH (LATENCY > 3)
) pipe_reg2 (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
);
);
// Rounding and classification
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
@ -238,20 +237,20 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
wire is_itof_s3;
wire is_signed_s3;
fclass_t fclass_s3;
fclass_t fclass_s3;
wire mant_is_zero_s3;
wire input_sign_s3;
wire rounded_sign_s3;
wire [INT_WIDTH-1:0] rounded_abs_s3;
wire of_before_round_s3;
wire of_before_round_s3;
wire f2i_round_has_sticky_s3;
wire i2f_round_has_sticky_s3;
`UNUSED_VAR (fclass_s3)
`UNUSED_VAR (fclass_s3)
VX_pipe_register #(
.DATAW (1 + 1 + $bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1),
.DEPTH (LATENCY > 3)
.DEPTH (LATENCY > 4)
) pipe_reg3 (
.clk (clk),
.reset (reset),
@ -259,7 +258,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_in ({is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
.data_out ({is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
);
// Assemble regular result, nan box short ones. Int zeroes need to be detected
wire [INT_WIDTH-1:0] fmt_result_s3 = mant_is_zero_s3 ? 0 : {rounded_sign_s3, rounded_abs_s3[EXP_BITS+MAN_BITS-1:0]};
@ -279,18 +278,18 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
f2i_special_result_s3[INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
f2i_special_result_s3[INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
end
end
end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
wire f2i_result_is_special_s3 = fclass_s3.is_nan
wire f2i_result_is_special_s3 = fclass_s3.is_nan
| fclass_s3.is_inf
| of_before_round_s3
| (input_sign_s3 & ~is_signed_s3 & ~rounded_int_res_zero_s3);
fflags_t f2i_special_status_s3;
fflags_t i2f_status_s3, f2i_status_s3;
fflags_t tmp_fflags_s3;
// All integer special cases are invalid
assign f2i_special_status_s3 = {1'b1, 4'h0};
@ -307,7 +306,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (32 + `FP_FLAGS_BITS),
.DEPTH (OUT_REG)
.DEPTH (LATENCY > 0)
) pipe_reg4 (
.clk (clk),
.reset (reset),

View file

@ -1,17 +1,17 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Modified port of noncomp module from fpnew Library
// Modified port of noncomp module from fpnew Library
// reference: https://github.com/pulp-platform/fpnew
`include "VX_fpu_define.vh"
@ -19,10 +19,9 @@
`ifdef FPU_DSP
module VX_fncp_unit import VX_fpu_pkg::*; #(
parameter LATENCY = 1,
parameter LATENCY = 2,
parameter EXP_BITS = 8,
parameter MAN_BITS = 23,
parameter OUT_REG = 0
parameter MAN_BITS = 23
) (
input wire clk,
input wire reset,
@ -34,10 +33,10 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
input wire [31:0] dataa,
input wire [31:0] datab,
output wire [31:0] result,
output wire [31:0] result,
output wire [`FP_FLAGS_BITS-1:0] fflags
);
);
localparam NEG_INF = 32'h00000001,
NEG_NORM = 32'h00000002,
NEG_SUBNORM = 32'h00000004,
@ -56,15 +55,15 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
wire a_smaller, ab_equal;
// Setup
assign a_sign = dataa[31];
assign a_sign = dataa[31];
assign a_exponent = dataa[30:23];
assign a_mantissa = dataa[22:0];
assign b_sign = datab[31];
assign b_sign = datab[31];
assign b_exponent = datab[30:23];
assign b_mantissa = datab[22:0];
VX_fp_classifier #(
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_a (
@ -73,7 +72,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
.clss_o (a_fclass)
);
VX_fp_classifier #(
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_b (
@ -83,7 +82,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
);
assign a_smaller = (dataa < datab) ^ (a_sign || b_sign);
assign ab_equal = (dataa == datab)
assign ab_equal = (dataa == datab)
|| (a_fclass.is_zero && b_fclass.is_zero); // +0 == -0
// Pipeline stage0
@ -102,54 +101,54 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (4 + 2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1),
.DEPTH (LATENCY > 0)
.DEPTH (LATENCY > 1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({op_mod, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}),
.data_out ({op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
);
);
// FCLASS
reg [31:0] fclass_mask_s0; // generate a 10-bit mask for integer reg
always @(*) begin
always @(*) begin
if (a_fclass_s0.is_normal) begin
fclass_mask_s0 = a_sign_s0 ? NEG_NORM : POS_NORM;
end
end
else if (a_fclass_s0.is_inf) begin
fclass_mask_s0 = a_sign_s0 ? NEG_INF : POS_INF;
end
end
else if (a_fclass_s0.is_zero) begin
fclass_mask_s0 = a_sign_s0 ? NEG_ZERO : POS_ZERO;
end
end
else if (a_fclass_s0.is_subnormal) begin
fclass_mask_s0 = a_sign_s0 ? NEG_SUBNORM : POS_SUBNORM;
end
end
else if (a_fclass_s0.is_nan) begin
fclass_mask_s0 = {22'h0, a_fclass_s0.is_quiet, a_fclass_s0.is_signaling, 8'h0};
end
else begin
end
else begin
fclass_mask_s0 = QUT_NAN;
end
end
// Min/Max
// Min/Max
reg [31:0] fminmax_res_s0;
always @(*) begin
if (a_fclass_s0.is_nan && b_fclass_s0.is_nan)
fminmax_res_s0 = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
else if (a_fclass_s0.is_nan)
else if (a_fclass_s0.is_nan)
fminmax_res_s0 = datab_s0;
else if (b_fclass_s0.is_nan)
else if (b_fclass_s0.is_nan)
fminmax_res_s0 = dataa_s0;
else begin
else begin
// FMIN, FMAX
fminmax_res_s0 = (op_mod_s0[0] ^ a_smaller_s0) ? dataa_s0 : datab_s0;
end
end
// Sign injection
// Sign injection
reg [31:0] fsgnj_res_s0; // result of sign injection
always @(*) begin
case (op_mod_s0[1:0])
@ -159,12 +158,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
endcase
end
// Comparison
// Comparison
reg fcmp_res_s0; // result of comparison
reg fcmp_fflags_NV_s0; // comparison fflags
always @(*) begin
case (op_mod_s0[1:0])
0: begin // LE
0: begin // LE
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
fcmp_res_s0 = 0;
fcmp_fflags_NV_s0 = 1;
@ -180,12 +179,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
end else begin
fcmp_res_s0 = (a_smaller_s0 & ~ab_equal_s0);
fcmp_fflags_NV_s0 = 0;
end
end
end
2: begin // EQ
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
fcmp_res_s0 = 0;
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
end else begin
fcmp_res_s0 = ab_equal_s0;
fcmp_fflags_NV_s0 = 0;
@ -193,7 +192,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
end
default: begin
fcmp_res_s0 = 'x;
fcmp_fflags_NV_s0 = 'x;
fcmp_fflags_NV_s0 = 'x;
end
endcase
end
@ -217,7 +216,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
// FMV
result_s0 = dataa_s0;
fflags_NV_s0 = 0;
end
end
6,7: begin
// MIN/MAX
result_s0 = fminmax_res_s0;
@ -230,7 +229,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (32 + 1),
.DEPTH (OUT_REG)
.DEPTH (LATENCY > 0)
) pipe_reg1 (
.clk (clk),
.reset (reset),

View file

@ -46,68 +46,56 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
localparam DATAW = 32 + `INST_FRM_BITS + 1 + 1;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][31:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: `INST_FRM_BITS] = frm;
assign data_in[i][32 + `INST_FRM_BITS +: 1] = is_itof;
assign data_in[i][32 + `INST_FRM_BITS + 1 +: 1] = is_signed;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FCVT),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH(32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0),
.OUT_BUF (2)
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.data_in (data_in),
.data_in (dataa),
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fcvt_units
for (genvar i = 0; i < NUM_PES; ++i) begin
VX_fcvt_unit #(
.LATENCY (`LATENCY_FCVT),
.OUT_REG (1)
.LATENCY (`LATENCY_FCVT)
) fcvt_unit (
.clk (clk),
.reset (reset),
.enable (pe_enable),
.frm (pe_data_in[0][32 +: `INST_FRM_BITS]),
.is_itof (pe_data_in[0][32 + `INST_FRM_BITS +: 1]),
.is_signed (pe_data_in[0][32 + `INST_FRM_BITS + 1 +: 1]),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (pe_data_in[i][0 +: 32]),
.result (pe_data_out[i][0 +: 32]),
.fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS])

View file

@ -44,33 +44,31 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
output wire valid_out,
input wire ready_out
);
localparam DATAW = 2 * 32 + `INST_FRM_BITS;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
`UNUSED_VAR (frm)
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
assign data_in[i][64 +: `INST_FRM_BITS] = frm;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FDIV),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0),
.OUT_BUF (2)
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -79,17 +77,15 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
@ -98,7 +94,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
for (genvar i = 0; i < NUM_PES; ++i) begin
acl_fdiv fdiv (
.clk (clk),
.areset (1'b0),
@ -116,7 +112,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`elsif VIVADO
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
for (genvar i = 0; i < NUM_PES; ++i) begin
wire [3:0] tuser;
xil_fdiv fdiv (
.aclk (clk),
@ -138,7 +134,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`else
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
for (genvar i = 0; i < NUM_PES; ++i) begin
reg [63:0] r;
`UNUSED_VAR (r)
fflags_t f;
@ -147,9 +143,9 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
dpi_fdiv (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
pe_data_in[0][64 +: `INST_FRM_BITS], // frm
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
frm,
r,
f
);

View file

@ -76,6 +76,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;
reg dst_fmt, int_fmt;
reg [NUM_LANES-1:0][63:0] operands [3];
@ -87,8 +88,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
end
wire f_fmt = fmt[0];
wire i_fmt = fmt[1];
`UNUSED_VAR (fmt)
always @(*) begin
is_fadd = 0;
@ -106,11 +106,25 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
is_ftou = 0;
is_f2f = 0;
dst_fmt = 0;
int_fmt = 0;
`ifdef FLEN_64
dst_fmt = fmt[0];
`endif
`ifdef XLEN_64
int_fmt = fmt[1];
`endif
case (op_type)
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = ~i_fmt; is_fsub = i_fmt; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = ~i_fmt; is_fmsub = i_fmt; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = ~i_fmt; is_fnmsub = i_fmt; end
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end
`INST_FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end
`INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
@ -124,7 +138,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
generate
begin : g_fma
begin : fma
reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
reg [NUM_LANES-1:0][63:0] result_fadd;
@ -150,13 +164,13 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
dpi_fsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
dpi_fmul (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
dpi_fmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
dpi_fmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
dpi_fnmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
dpi_fnmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
dpi_fadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
dpi_fsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
dpi_fmul (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
dpi_fmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
dpi_fmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
result_fma[i] = is_fadd ? result_fadd[i][`XLEN-1:0] :
is_fsub ? result_fsub[i][`XLEN-1:0] :
@ -200,7 +214,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : g_fdiv
begin : fdiv
reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
reg [NUM_LANES-1:0][63:0] result_fdiv;
@ -212,7 +226,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fdiv (fdiv_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0];
end
end
@ -239,7 +253,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : g_fsqrt
begin : fsqrt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
reg [NUM_LANES-1:0][63:0] result_fsqrt;
@ -251,7 +265,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fsqrt (fsqrt_fire, int'(f_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0];
end
end
@ -278,7 +292,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : g_fcvt
begin : fcvt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
reg [NUM_LANES-1:0][63:0] result_itof;
@ -299,11 +313,11 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_itof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
dpi_utof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
dpi_ftoi (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
dpi_ftou (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
dpi_f2f (fcvt_fire, int'(f_fmt), operands[0][i], result_f2f[i]);
dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
dpi_f2f (fcvt_fire, int'(dst_fmt), operands[0][i], result_f2f[i]);
result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] :
is_utof ? result_utof[i][`XLEN-1:0] :
@ -342,7 +356,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : g_fncp
begin : fncp
reg [NUM_LANES-1:0][`XLEN-1:0] result_fncp;
reg [NUM_LANES-1:0][63:0] result_fclss;
@ -370,17 +384,17 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fclss (fncp_fire, int'(f_fmt), operands[0][i], result_fclss[i]);
dpi_fle (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
dpi_flt (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
dpi_feq (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
dpi_fmin (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
dpi_fmax (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
dpi_fsgnj (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
dpi_fsgnjn (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
dpi_fsgnjx (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
result_fmvx[i] = f_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
result_fmvf[i] = f_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
dpi_fclss (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]);
dpi_fle (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
dpi_flt (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
dpi_feq (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
dpi_fmin (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
dpi_fmax (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
dpi_fsgnj (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
result_fmvf[i] = dst_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
end
end
@ -430,7 +444,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("P"),
.ARBITER ("R"),
.OUT_BUF (0)
) div_sqrt_arb (
.clk (clk),
@ -449,14 +463,14 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out;
for (genvar i = 0; i < NUM_FPC; ++i) begin : g_per_core_data_out
for (genvar i = 0; i < NUM_FPC; ++i) begin
assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
end
VX_stream_arb #(
.NUM_INPUTS (NUM_FPC),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (OUT_BUF)
) rsp_arb (
.clk (clk),

View file

@ -51,39 +51,68 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
localparam FPU_DIVSQRT = 1;
localparam FPU_CVT = 2;
localparam FPU_NCP = 3;
localparam NUM_FPCORES = 4;
localparam FPCORES_BITS = `LOG2UP(NUM_FPCORES);
localparam NUM_FPC = 4;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + 3 * (NUM_LANES * 32);
localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH;
`UNUSED_VAR (fmt)
wire [NUM_FPCORES-1:0] per_core_valid_in;
wire [NUM_FPCORES-1:0][REQ_DATAW-1:0] per_core_data_in;
wire [NUM_FPCORES-1:0] per_core_ready_in;
wire [NUM_FPC-1:0] per_core_ready_in;
wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
wire [NUM_FPC-1:0][TAG_WIDTH-1:0] per_core_tag_out;
wire [NUM_FPC-1:0] per_core_ready_out;
wire [NUM_FPC-1:0] per_core_valid_out;
wire [NUM_FPC-1:0] per_core_has_fflags;
fflags_t [NUM_FPC-1:0] per_core_fflags;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0] per_core_mask_in;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_in;
wire [NUM_FPCORES-1:0][`INST_FPU_BITS-1:0] per_core_op_type;
wire [NUM_FPCORES-1:0][`INST_FMT_BITS-1:0] per_core_fmt;
wire [NUM_FPCORES-1:0][`INST_FRM_BITS-1:0] per_core_frm;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_dataa;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datab;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datac;
wire div_ready_in, sqrt_ready_in;
wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
wire [TAG_WIDTH-1:0] div_tag_out, sqrt_tag_out;
wire div_ready_out, sqrt_ready_out;
wire div_valid_out, sqrt_valid_out;
wire div_has_fflags, sqrt_has_fflags;
fflags_t div_fflags, sqrt_fflags;
wire [NUM_FPCORES-1:0] per_core_valid_out;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_result;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_out;
wire [NUM_FPCORES-1:0] per_core_has_fflags;
fflags_t [NUM_FPCORES-1:0] per_core_fflags;
wire [NUM_FPCORES-1:0] per_core_ready_out;
reg [FPC_BITS-1:0] core_select;
reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed;
always @(*) begin
is_madd = 0;
is_sub = 0;
is_neg = 0;
is_div = 0;
is_itof = 0;
is_signed = 0;
case (op_type)
`INST_FPU_ADD: begin core_select = FPU_FMA; end
`INST_FPU_SUB: begin core_select = FPU_FMA; is_sub = 1; end
`INST_FPU_MUL: begin core_select = FPU_FMA; is_neg = 1; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_madd = 1; end
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_madd = 1; is_neg = 1; end
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; is_neg = 1; end
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
`INST_FPU_F2I: begin core_select = FPU_CVT; is_signed = 1; end
`INST_FPU_F2U: begin core_select = FPU_CVT; end
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
`INST_FPU_U2F: begin core_select = FPU_CVT; is_itof = 1; end
default: begin core_select = FPU_NCP; end
endcase
end
`RESET_RELAY (fma_reset, reset);
`RESET_RELAY (div_reset, reset);
`RESET_RELAY (sqrt_reset, reset);
`RESET_RELAY (cvt_reset, reset);
`RESET_RELAY (ncp_reset, reset);
wire [NUM_LANES-1:0][31:0] dataa_s;
wire [NUM_LANES-1:0][31:0] datab_s;
wire [NUM_LANES-1:0][31:0] datac_s;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign dataa_s[i] = dataa[i][31:0];
assign datab_s[i] = datab[i][31:0];
assign datac_s[i] = datac[i][31:0];
@ -93,60 +122,23 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_VAR (datab)
`UNUSED_VAR (datac)
// Decode fpu core type
// The target core index is taken directly from bits [3:2] of op_type.
wire [FPCORES_BITS-1:0] core_select = op_type[3:2];
// Request switch: takes the single incoming request stream and exposes
// NUM_FPCORES per-core valid/ready/data streams, with core_select as the
// selector. All request fields travel as one packed word; op_type is placed
// last in the concatenation, i.e. in the least-significant bits.
// NOTE(review): VX_stream_switch is defined elsewhere; presumably only the
// output indexed by sel_in sees valid asserted -- confirm.
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_OUTPUTS (NUM_FPCORES)
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (core_select),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in ({mask_in, tag_in, fmt, frm, dataa_s, datab_s, datac_s, op_type}),
.data_out (per_core_data_in),
.valid_out (per_core_valid_in),
.ready_out (per_core_ready_in)
);
// Unpack each core's packed request word back into its individual fields.
// The field order here must stay identical to the concatenation fed to
// req_switch.data_in (mask, tag, fmt, frm, dataa, datab, datac, op_type),
// otherwise the fields would be sliced at the wrong bit positions.
for (genvar i = 0; i < NUM_FPCORES; ++i) begin : g_per_core_data_in
assign {
per_core_mask_in[i],
per_core_tag_in[i],
per_core_fmt[i],
per_core_frm[i],
per_core_dataa[i],
per_core_datab[i],
per_core_datac[i],
per_core_op_type[i]
} = per_core_data_in[i];
end
// FMA core ///////////////////////////////////////////////////////////////
wire is_madd = per_core_op_type[FPU_FMA][1];
wire is_neg = per_core_op_type[FPU_FMA][0];
wire is_sub = per_core_fmt[FPU_FMA][1];
VX_fpu_fma #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH)
) fpu_fma (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_FMA]),
.reset (fma_reset),
.valid_in (valid_in && (core_select == FPU_FMA)),
.ready_in (per_core_ready_in[FPU_FMA]),
.mask_in (per_core_mask_in[FPU_FMA]),
.tag_in (per_core_tag_in[FPU_FMA]),
.frm (per_core_frm[FPU_FMA]),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.is_madd (is_madd),
.is_sub (is_sub),
.is_neg (is_neg),
.dataa (per_core_dataa[FPU_FMA]),
.datab (per_core_datab[FPU_FMA]),
.datac (per_core_datac[FPU_FMA]),
.dataa (dataa_s),
.datab (datab_s),
.datac (datac_s),
.has_fflags (per_core_has_fflags[FPU_FMA]),
.fflags (per_core_fflags[FPU_FMA]),
.result (per_core_result[FPU_FMA]),
@ -155,99 +147,25 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.valid_out (per_core_valid_out[FPU_FMA])
);
// Div/Sqrt cores /////////////////////////////////////////////////////////
wire [1:0] div_sqrt_valid_in;
wire [1:0][REQ_DATAW-1:0] div_sqrt_data_in;
wire [1:0] div_sqrt_ready_in;
wire [1:0][NUM_LANES-1:0] div_sqrt_mask_in;
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_in;
wire [1:0][`INST_FPU_BITS-1:0] div_sqrt_op_type;
wire [1:0][`INST_FMT_BITS-1:0] div_sqrt_fmt;
wire [1:0][`INST_FRM_BITS-1:0] div_sqrt_frm;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_dataa;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datab;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datac;
wire [1:0] div_sqrt_valid_out;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_result;
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_out;
wire [1:0] div_sqrt_has_fflags;
fflags_t [1:0] div_sqrt_fflags;
wire [1:0] div_sqrt_ready_out;
wire div_sqrt_valid_tmp_in;
wire [REQ_DATAW-1:0] div_sqrt_data_tmp_in;
wire div_sqrt_ready_tmp_in;
// Buffer stage on the DIVSQRT request stream: it sits between the
// FPU_DIVSQRT output of the request switch and the div/sqrt request switch
// that consumes div_sqrt_*_tmp_in.
// NOTE(review): VX_elastic_buffer is defined elsewhere; presumably it
// decouples the valid/ready handshake of the two switches -- confirm.
VX_elastic_buffer #(
.DATAW (REQ_DATAW)
) div_sqrt_req_buffer (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_DIVSQRT]),
.ready_in (per_core_ready_in[FPU_DIVSQRT]),
.data_in (per_core_data_in[FPU_DIVSQRT]),
.data_out (div_sqrt_data_tmp_in),
.valid_out (div_sqrt_valid_tmp_in),
.ready_out (div_sqrt_ready_tmp_in)
);
// Bit 0 of the buffered request word is op_type[0], because op_type occupies
// the least-significant bits of the packed request. It selects between the
// two units: output 0 feeds the divider, output 1 feeds the square-root unit.
wire is_sqrt = div_sqrt_data_tmp_in[0]; // op_type[0]
// Two-way request switch splitting the buffered DIVSQRT stream into the
// divider stream (index 0) and the square-root stream (index 1).
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_OUTPUTS (2)
) div_sqrt_req_switch (
.clk (clk),
.reset (reset),
.sel_in (is_sqrt),
.valid_in (div_sqrt_valid_tmp_in),
.ready_in (div_sqrt_ready_tmp_in),
.data_in (div_sqrt_data_tmp_in),
.data_out (div_sqrt_data_in),
.valid_out (div_sqrt_valid_in),
.ready_out (div_sqrt_ready_in)
);
// Unpack the divider (i = 0) and square-root (i = 1) request words into
// individual fields, using the same field order as the packed request word.
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_data_in
assign {
div_sqrt_mask_in[i],
div_sqrt_tag_in[i],
div_sqrt_fmt[i],
div_sqrt_frm[i],
div_sqrt_dataa[i],
div_sqrt_datab[i],
div_sqrt_datac[i],
div_sqrt_op_type[i]
} = div_sqrt_data_in[i];
end
// Fields that are unpacked above but not (fully) consumed by the two units.
// div_sqrt_datab is only partially unused: slice [0] is wired to the divider,
// slice [1] has no consumer because the square-root unit takes one operand.
`UNUSED_VAR (div_sqrt_op_type)
`UNUSED_VAR (div_sqrt_fmt)
`UNUSED_VAR (div_sqrt_datab)
`UNUSED_VAR (div_sqrt_datac)
VX_fpu_div #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH)
) fpu_div (
.clk (clk),
.reset (reset),
.valid_in (div_sqrt_valid_in[0]),
.ready_in (div_sqrt_ready_in[0]),
.mask_in (div_sqrt_mask_in[0]),
.tag_in (div_sqrt_tag_in[0]),
.frm (div_sqrt_frm[0]),
.dataa (div_sqrt_dataa[0]),
.datab (div_sqrt_datab[0]),
.has_fflags (div_sqrt_has_fflags[0]),
.fflags (div_sqrt_fflags[0]),
.result (div_sqrt_result[0]),
.tag_out (div_sqrt_tag_out[0]),
.valid_out (div_sqrt_valid_out[0]),
.ready_out (div_sqrt_ready_out[0])
.reset (div_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && is_div),
.ready_in (div_ready_in),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.has_fflags (div_has_fflags),
.fflags (div_fflags),
.result (div_result),
.tag_out (div_tag_out),
.valid_out (div_valid_out),
.ready_out (div_ready_out)
);
VX_fpu_sqrt #(
@ -255,42 +173,92 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH)
) fpu_sqrt (
.clk (clk),
.reset (reset),
.valid_in (div_sqrt_valid_in[1]),
.ready_in (div_sqrt_ready_in[1]),
.mask_in (div_sqrt_mask_in[1]),
.tag_in (div_sqrt_tag_in[1]),
.frm (div_sqrt_frm[1]),
.dataa (div_sqrt_dataa[1]),
.has_fflags (div_sqrt_has_fflags[1]),
.fflags (div_sqrt_fflags[1]),
.result (div_sqrt_result[1]),
.tag_out (div_sqrt_tag_out[1]),
.valid_out (div_sqrt_valid_out[1]),
.ready_out (div_sqrt_ready_out[1])
.reset (sqrt_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
.ready_in (sqrt_ready_in),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.has_fflags (sqrt_has_fflags),
.fflags (sqrt_fflags),
.result (sqrt_result),
.tag_out (sqrt_tag_out),
.valid_out (sqrt_valid_out),
.ready_out (sqrt_ready_out)
);
// Packed response words for the divider (index 0) and square-root unit
// (index 1), in the form expected by the div/sqrt response arbiter:
// {result, has_fflags, fflags, tag_out}.
// NOTE(review): the unpacking side of the arbiter is only partly visible
// here; keep this order in sync with its data_out concatenation.
wire [1:0][RSP_DATAW-1:0] div_sqrt_arb_data_in;
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_arb_data_in
assign div_sqrt_arb_data_in[i] = {
div_sqrt_result[i],
div_sqrt_has_fflags[i],
div_sqrt_fflags[i],
div_sqrt_tag_out[i]
};
end
// Conversion core. A one-bit side flag (cvt_ret_int_in) is set whenever the
// conversion is NOT integer-to-float. It is carried alongside the request by
// widening the core's tag by one bit (TAG_WIDTH+1) and is recovered from the
// top bit of tag_out as cvt_ret_int_out.
// NOTE(review): presumably the flag marks results that are integers rather
// than floats -- confirm against where cvt_ret_int_out is consumed.
wire cvt_ret_int_in = ~is_itof;
wire cvt_ret_int_out;
VX_fpu_cvt #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+1)
) fpu_cvt (
.clk (clk),
.reset (cvt_reset),
// Only accept the request when the decoder routed it to this core.
.valid_in (valid_in && (core_select == FPU_CVT)),
.ready_in (per_core_ready_in[FPU_CVT]),
.mask_in (mask_in),
.tag_in ({cvt_ret_int_in, tag_in}),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (dataa_s),
.has_fflags (per_core_has_fflags[FPU_CVT]),
.fflags (per_core_fflags[FPU_CVT]),
.result (per_core_result[FPU_CVT]),
.tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
.valid_out (per_core_valid_out[FPU_CVT]),
.ready_out (per_core_ready_out[FPU_CVT])
);
// NCP core. Two side flags are computed from the request and carried through
// the core by widening its tag by two bits (TAG_WIDTH+2):
//   ncp_ret_int_in  - set for INST_FPU_CMP and for requests matching the
//                     INST_FPU_IS_CLASS or INST_FPU_IS_MVXW predicates;
//   ncp_ret_sext_in - set only for requests matching INST_FPU_IS_MVXW.
// Both are recovered from the two top bits of tag_out.
// NOTE(review): the predicate macros are defined elsewhere; presumably the
// flags mark integer-valued results and results needing sign extension --
// confirm against their consumer.
wire ncp_ret_int_in = (op_type == `INST_FPU_CMP)
|| `INST_FPU_IS_CLASS(op_type, frm)
|| `INST_FPU_IS_MVXW(op_type, frm);
wire ncp_ret_int_out;
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
wire ncp_ret_sext_out;
VX_fpu_ncp #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+2)
) fpu_ncp (
.clk (clk),
.reset (ncp_reset),
// Only accept the request when the decoder routed it to this core.
.valid_in (valid_in && (core_select == FPU_NCP)),
.ready_in (per_core_ready_in[FPU_NCP]),
.mask_in (mask_in),
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, tag_in}),
.op_type (op_type),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.result (per_core_result[FPU_NCP]),
.has_fflags (per_core_has_fflags[FPU_NCP]),
.fflags (per_core_fflags[FPU_NCP]),
.tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
.valid_out (per_core_valid_out[FPU_NCP]),
.ready_out (per_core_ready_out[FPU_NCP])
);
///////////////////////////////////////////////////////////////////////////
assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("P"),
.ARBITER ("R"),
.OUT_BUF (0)
) div_sqrt_rsp_arb (
) div_sqrt_arb (
.clk (clk),
.reset (reset),
.valid_in (div_sqrt_valid_out),
.ready_in (div_sqrt_ready_out),
.data_in (div_sqrt_arb_data_in),
.valid_in ({sqrt_valid_out, div_valid_out}),
.ready_in ({sqrt_ready_out, div_ready_out}),
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
{div_result, div_has_fflags, div_fflags, div_tag_out}}),
.data_out ({
per_core_result[FPU_DIVSQRT],
per_core_has_fflags[FPU_DIVSQRT],
@ -302,73 +270,12 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
// CVT core ///////////////////////////////////////////////////////////////
// Conversion controls are read straight from the op_type routed to the CVT
// core: bit 1 selects integer-to-float, and bit 0 (inverted) selects a signed
// conversion.
wire is_itof = per_core_op_type[FPU_CVT][1];
wire is_signed = ~per_core_op_type[FPU_CVT][0];
// Side flag set for every conversion that is NOT integer-to-float. It rides
// through the core in one extra tag bit (1+TAG_WIDTH) and comes back as
// cvt_ret_int_out.
// NOTE(review): presumably marks integer-valued results -- confirm against
// where cvt_ret_int_out is consumed.
wire cvt_ret_int_in = ~is_itof;
wire cvt_ret_int_out;
VX_fpu_cvt #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (1+TAG_WIDTH)
) fpu_cvt (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_CVT]),
.ready_in (per_core_ready_in[FPU_CVT]),
.mask_in (per_core_mask_in[FPU_CVT]),
.tag_in ({cvt_ret_int_in, per_core_tag_in[FPU_CVT]}),
.frm (per_core_frm[FPU_CVT]),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (per_core_dataa[FPU_CVT]),
.has_fflags (per_core_has_fflags[FPU_CVT]),
.fflags (per_core_fflags[FPU_CVT]),
.result (per_core_result[FPU_CVT]),
.tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
.valid_out (per_core_valid_out[FPU_CVT]),
.ready_out (per_core_ready_out[FPU_CVT])
);
// NCP core ///////////////////////////////////////////////////////////////
// Side flags for the NCP core, computed from the op_type / frm that the
// request switch routed to it:
//   ncp_ret_int_in  - set for INST_FPU_CMP and for requests matching the
//                     INST_FPU_IS_CLASS or INST_FPU_IS_MVXW predicates;
//   ncp_ret_sext_in - set only for requests matching INST_FPU_IS_MVXW.
// Both ride through the core in two extra tag bits (TAG_WIDTH+2) and are
// recovered from the two top bits of tag_out.
// NOTE(review): the predicate macros are defined elsewhere; presumably the
// flags mark integer-valued results and results needing sign extension --
// confirm against their consumer.
wire ncp_ret_int_in = (per_core_op_type[FPU_NCP] == `INST_FPU_CMP)
|| `INST_FPU_IS_CLASS(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP])
|| `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
wire ncp_ret_int_out;
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
wire ncp_ret_sext_out;
VX_fpu_ncp #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+2)
) fpu_ncp (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_NCP]),
.ready_in (per_core_ready_in[FPU_NCP]),
.mask_in (per_core_mask_in[FPU_NCP]),
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, per_core_tag_in[FPU_NCP]}),
.op_type (per_core_op_type[FPU_NCP]),
.frm (per_core_frm[FPU_NCP]),
.dataa (per_core_dataa[FPU_NCP]),
.datab (per_core_datab[FPU_NCP]),
.result (per_core_result[FPU_NCP]),
.has_fflags (per_core_has_fflags[FPU_NCP]),
.fflags (per_core_fflags[FPU_NCP]),
.tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
.valid_out (per_core_valid_out[FPU_NCP]),
.ready_out (per_core_ready_out[FPU_NCP])
);
///////////////////////////////////////////////////////////////////////////
reg [NUM_FPCORES-1:0][RSP_DATAW+2-1:0] per_core_data_out;
reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
always @(*) begin
for (integer i = 0; i < NUM_FPCORES; ++i) begin
for (integer i = 0; i < NUM_FPC; ++i) begin
per_core_data_out[i][RSP_DATAW+1:2] = {
per_core_result[i],
per_core_has_fflags[i],
@ -387,9 +294,9 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_VAR (op_ret_int_out)
VX_stream_arb #(
.NUM_INPUTS (NUM_FPCORES),
.NUM_INPUTS (NUM_FPC),
.DATAW (RSP_DATAW + 2),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (OUT_BUF)
) rsp_arb (
.clk (clk),
@ -403,22 +310,25 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef FPU_RV64F
reg [`XLEN-1:0] result_w;
reg [`XLEN-1:0] result_r;
always @(*) begin
case (op_ret_int_out)
2'b11: result_w = `XLEN'($signed(result_s[i]));
2'b01: result_w = {32'h00000000, result_s[i]};
default: result_w = {32'hffffffff, result_s[i]};
2'b11: result_r = `XLEN'($signed(result_s[i]));
2'b01: result_r = {32'h00000000, result_s[i]};
default: result_r = {32'hffffffff, result_s[i]};
endcase
end
assign result[i] = result_w;
assign result[i] = result_r;
`else
assign result[i] = result_s[i];
`endif
end
// can accept new request?
assign ready_in = per_core_ready_in[core_select];
endmodule
`endif

Some files were not shown because too many files have changed in this diff Show more