Merge branch 'develop' into tensor-core

This commit is contained in:
Jaewon Lee 2024-09-30 16:48:47 -04:00 committed by GitHub
commit 4a606061d2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
374 changed files with 16486 additions and 29573 deletions

175
.github/workflows/ci.yml vendored Normal file
View file

@ -0,0 +1,175 @@
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CI
on: [push, pull_request]
jobs:
setup:
runs-on: ubuntu-20.04
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
submodules: recursive
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
- name: Install Dependencies
if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
sudo bash ./ci/install_dependencies.sh
- name: Setup Toolchain
if: steps.cache-toolchain.outputs.cache-hit != 'true'
run: |
TOOLDIR=$PWD/tools
mkdir -p build
cd build
../configure --tooldir=$TOOLDIR
ci/toolchain_install.sh --all
- name: Setup Third Party
if: steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
make -C third_party > /dev/null
build:
runs-on: ubuntu-20.04
needs: setup
strategy:
matrix:
xlen: [32, 64]
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Install Dependencies
run: |
sudo bash ./ci/install_dependencies.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
- name: Run Build
run: |
TOOLDIR=$PWD/tools
mkdir -p build${{ matrix.xlen }}
cd build${{ matrix.xlen }}
../configure --tooldir=$TOOLDIR --xlen=${{ matrix.xlen }}
source ci/toolchain_env.sh
make software -s > /dev/null
make tests -s > /dev/null
- name: Upload Build Artifact
uses: actions/upload-artifact@v3
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
tests:
runs-on: ubuntu-20.04
needs: build
strategy:
fail-fast: false
matrix:
name: [regression, opencl, cache, config1, config2, debug, scope, stress, synthesis]
xlen: [32, 64]
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Install Dependencies
run: |
sudo bash ./ci/install_dependencies.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
- name: Download Build Artifact
uses: actions/download-artifact@v3
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
- name: Run tests
run: |
cd build${{ matrix.xlen }}
source ci/toolchain_env.sh
chmod -R +x . # Ensure all files have executable permissions
if [ "${{ matrix.name }}" == "regression" ]; then
./ci/regression.sh --unittest
./ci/regression.sh --isa
./ci/regression.sh --kernel
./ci/regression.sh --regression
else
./ci/regression.sh --${{ matrix.name }}
fi
complete:
runs-on: ubuntu-20.04
needs: tests
steps:
- name: Check Completion
run: echo "All matrix jobs passed"

3
.gitignore vendored
View file

@ -1,2 +1,3 @@
/build*
/.vscode
/.vscode
*.cache

8
.gitmodules vendored
View file

@ -1,9 +1,9 @@
[submodule "third_party/fpnew"]
path = third_party/fpnew
url = https://github.com/pulp-platform/fpnew.git
[submodule "third_party/softfloat"]
path = third_party/softfloat
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
[submodule "third_party/ramulator"]
path = third_party/ramulator
url = https://github.com/CMU-SAFARI/ramulator.git
url = https://github.com/CMU-SAFARI/ramulator2.git
[submodule "third_party/cvfpu"]
path = third_party/cvfpu
url = https://github.com/openhwgroup/cvfpu.git

View file

@ -1,102 +0,0 @@
language: cpp
dist: focal
os: linux
compiler: gcc
addons:
apt:
packages:
- build-essential
- valgrind
- libstdc++6
- binutils
- python
- uuid-dev
env:
global:
- TOOLDIR=$HOME/tools
cache:
directories:
- $TOOLDIR
- $HOME/third_party
- $HOME/build32
- $HOME/build64
before_install:
- if [ ! -d "$TOOLDIR" ] || [ -z "$(ls -A $TOOLDIR)" ] || [ "$(cat "$TOOLDIR/version.txt")" != "v0.4" ]; then
rm -rf $TOOLDIR;
mkdir -p $TRAVIS_BUILD_DIR/build && cd $TRAVIS_BUILD_DIR/build;
../configure --tooldir=$TOOLDIR;
ci/toolchain_install.sh --all;
echo "v0.3" > "$TOOLDIR/version.txt";
else
echo "using existing tooldir build";
fi
- if [ ! -d "$HOME/third_party" ] || [ -z "$(ls -A $HOME/third_party)" ] || [ "$(cat "$HOME/third_party/version.txt")" != "v0.2" ]; then
cd $TRAVIS_BUILD_DIR;
make -C third_party > /dev/null;
echo "v0.2" > "third_party/version.txt";
cp -rf third_party $HOME;
else
echo "using existing third_party build";
cp -rf $HOME/third_party $TRAVIS_BUILD_DIR;
fi
install:
- if [ ! -d "$HOME/build$XLEN" ] || [ -z "$(ls -A $HOME/build$XLEN)" ] || [ "$(cat "$HOME/build$XLEN/version.txt")" != "$TRAVIS_COMMIT" ]; then
mkdir -p $TRAVIS_BUILD_DIR/build$XLEN && cd $TRAVIS_BUILD_DIR/build$XLEN;
../configure --tooldir=$TOOLDIR --xlen=$XLEN;
source ci/toolchain_env.sh;
make build -s > /dev/null;
echo "$TRAVIS_COMMIT" > version.txt;
cp -rf $TRAVIS_BUILD_DIR/build$XLEN $HOME;
else
echo "using existing build for commit $TRAVIS_COMMIT";
cp -rf $HOME/build$XLEN $TRAVIS_BUILD_DIR;
fi
before_script:
- cd $TRAVIS_BUILD_DIR/build$XLEN
- source ci/toolchain_env.sh
stages:
- test
jobs:
include:
- stage: test
name: regression32
env: XLEN=32
script:
- ./ci/travis_run.py ./ci/regression.sh --unittest
- ./ci/travis_run.py ./ci/regression.sh --isa
- ./ci/travis_run.py ./ci/regression.sh --kernel
- ./ci/travis_run.py ./ci/regression.sh --synthesis
- ./ci/travis_run.py ./ci/regression.sh --regression
- ./ci/travis_run.py ./ci/regression.sh --opencl
- stage: test
name: regression64
env: XLEN=64
script:
- ./ci/travis_run.py ./ci/regression.sh --isa
- ./ci/travis_run.py ./ci/regression.sh --kernel
- ./ci/travis_run.py ./ci/regression.sh --synthesis
- ./ci/travis_run.py ./ci/regression.sh --regression
- ./ci/travis_run.py ./ci/regression.sh --opencl
- stage: test
name: config
env: XLEN=32
script:
- ./ci/travis_run.py ./ci/regression.sh --cluster
- ./ci/travis_run.py ./ci/regression.sh --config
- stage: test
name: debug
env: XLEN=32
script:
- ./ci/travis_run.py ./ci/regression.sh --debug
- ./ci/travis_run.py ./ci/regression.sh --stress

View file

@ -1,5 +1,7 @@
include config.mk
.PHONY: build software tests
all:
$(MAKE) -C $(VORTEX_HOME)/third_party
$(MAKE) -C hw
@ -15,18 +17,29 @@ build:
$(MAKE) -C runtime
$(MAKE) -C tests
clean:
software:
$(MAKE) -C hw
$(MAKE) -C kernel
$(MAKE) -C runtime/stub
tests:
$(MAKE) -C tests
clean-build:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean
clean: clean-build
$(MAKE) -C $(VORTEX_HOME)/third_party clean
# Install setup
KERNEL_INC_DST = $(PREFIX)/kernel/include
KERNEL_LIB_DST = $(PREFIX)/kernel/lib$(XLEN)
RUNTIME_INC_DST = $(PREFIX)/runtime/include
RUNTIME_LIB_DST = $(PREFIX)/runtime/lib
KERNEL_INC_DST = $(INSTALLDIR)/kernel/include
KERNEL_LIB_DST = $(INSTALLDIR)/kernel/lib$(XLEN)
RUNTIME_INC_DST = $(INSTALLDIR)/runtime/include
RUNTIME_LIB_DST = $(INSTALLDIR)/runtime/lib
KERNEL_HEADERS = $(wildcard $(VORTEX_HOME)/kernel/include/*.h)
KERNEL_LIBS = $(wildcard kernel/*.a)

View file

@ -1,5 +1,3 @@
[![Build Status](https://travis-ci.com/vortexgpgpu/vortex.svg?branch=master)](https://travis-ci.com/vortexgpgpu/vortex)
# Vortex GPGPU
Vortex is a full-stack open-source RISC-V GPGPU.
@ -35,63 +33,73 @@ Vortex is a full-stack open-source RISC-V GPGPU.
## Build Instructions
More detailed build instructions can be found [here](docs/install_vortex.md).
### Supported OS Platforms
- Ubuntu 18.04, 20.04
- Ubuntu 18.04, 20.04, 22.04, 24.04
- Centos 7
### Toolchain Dependencies
- [POCL](http://portablecl.org/)
- [LLVM](https://llvm.org/)
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
- [Verilator](https://www.veripool.org/verilator)
- [FpNew](https://github.com/pulp-platform/fpnew.git)
- [cvfpu](https://github.com/openhwgroup/cvfpu.git)
- [SoftFloat](https://github.com/ucb-bar/berkeley-softfloat-3.git)
- [Ramulator](https://github.com/CMU-SAFARI/ramulator.git)
- [Yosys](https://github.com/YosysHQ/yosys)
- [Sv2v](https://github.com/zachjs/sv2v)
### Install development tools
$ sudo apt-get install build-essential
$ sudo apt-get install binutils
$ sudo apt-get install python
$ sudo apt-get install uuid-dev
$ sudo apt-get install git
### Install Vortex codebase
$ git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
$ cd Vortex
```sh
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
```
### Install system dependencies
```sh
# ensure dependent libraries are present
sudo ./ci/install_dependencies.sh
```
### Configure your build folder
# By default, the toolchain default install location is the /opt folder and can be overridden by setting --tooldir.
$ mkdir build
$ cd build
$ ../configure --xlen=32 --tooldir=$HOME/tools
```sh
mkdir build
cd build
../configure --xlen=32 --tooldir=$HOME/tools
```
### Install prebuilt toolchain
$ ./ci/toolchain_install.sh --all
### set environment variables
# should always run before using the toolchain!
$ source ./ci/toolchain_env.sh
```sh
./ci/toolchain_install.sh --all
```
### Set environment variables
```sh
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
```
### Building Vortex
$ make -s
```sh
make -s
```
### Quick demo running vecadd OpenCL kernel on 2 cores
$ ./ci/blackbox.sh --cores=2 --app=vecadd
```sh
./ci/blackbox.sh --cores=2 --app=vecadd
```
### Common Developer Tips
- Installing Vortex kernel and runtime libraries to use with external tools requires passing --prefix=<install-path> to the configure script.
```sh
$ ../configure --xlen=32 --tooldir=$HOME/tools --prefix=<install-path>
$ make -s
$ make install
``````
- Building Vortex 64-bit simply requires using --xlen=64 configure option.
```sh
$ ../configure --xlen=32 --tooldir=$HOME/tools
```
```sh
../configure --xlen=32 --tooldir=$HOME/tools --prefix=<install-path>
make -s
make install
```
- Building Vortex 64-bit requires setting --xlen=64 configure option.
```sh
../configure --xlen=64 --tooldir=$HOME/tools
```
- Sourcing "./ci/toolchain_env.sh" is required everytime you start a new terminal. we recommend adding "source <build-path>/ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login.
```sh
$ echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder.
```sh
$ ../configure
```
- To debug the GPU, you can generate a "run.log" trace. see /docs/debugging.md for more information.
```sh
$ ./ci/blackbox.sh --app=demo --debug=3
```
```sh
echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again without any options to get changes propagated to your build folder.
```sh
../configure
```
- To debug the GPU, the simulation can generate a runtime trace for analysis. See /docs/debugging.md for more information.
```sh
./ci/blackbox.sh --app=demo --debug=3
```
- For additional information, check out the /docs.

View file

@ -13,6 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
SCRIPT_DIR=$(dirname "$0")
ROOT_DIR=$SCRIPT_DIR/..
show_usage()
{
echo "Vortex BlackBox Test Driver v1.0"
@ -29,301 +32,174 @@ show_help()
echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
}
SCRIPT_DIR=$(dirname "$0")
ROOT_DIR=$SCRIPT_DIR/..
DRIVER=simx
APP=sgemm
CLUSTERS=1
CORES=1
WARPS=4
THREADS=4
L2=
L3=
DEBUG=0
DEBUG_LEVEL=0
SCOPE=0
HAS_ARGS=0
PERF_CLASS=0
REBUILD=2
TEMPBUILD=0
LOGFILE=run.log
for i in "$@"
do
case $i in
--driver=*)
DRIVER=${i#*=}
shift
;;
--app=*)
APP=${i#*=}
shift
;;
--clusters=*)
CLUSTERS=${i#*=}
shift
;;
--cores=*)
CORES=${i#*=}
shift
;;
--warps=*)
WARPS=${i#*=}
shift
;;
--threads=*)
THREADS=${i#*=}
shift
;;
--l2cache)
L2=-DL2_ENABLE
shift
;;
--l3cache)
L3=-DL3_ENABLE
shift
;;
--debug=*)
DEBUG_LEVEL=${i#*=}
DEBUG=1
shift
;;
--scope)
SCOPE=1
CORES=1
shift
;;
--perf=*)
PERF_FLAG=-DPERF_ENABLE
PERF_CLASS=${i#*=}
shift
;;
--args=*)
ARGS=${i#*=}
HAS_ARGS=1
shift
;;
--rebuild=*)
REBUILD=${i#*=}
shift
;;
--log=*)
LOGFILE=${i#*=}
shift
;;
--help)
show_help
exit 0
;;
*)
show_usage
exit -1
;;
esac
done
if [ $REBUILD -eq 3 ];
then
REBUILD=1
TEMPBUILD=1
fi
case $DRIVER in
gpu)
DRIVER_PATH=
;;
simx)
DRIVER_PATH=$ROOT_DIR/runtime/simx
;;
rtlsim)
DRIVER_PATH=$ROOT_DIR/runtime/rtlsim
;;
opae)
DRIVER_PATH=$ROOT_DIR/runtime/opae
;;
xrt)
DRIVER_PATH=$ROOT_DIR/runtime/xrt
;;
*)
echo "invalid driver: $DRIVER"
exit -1
;;
esac
if [ -d "$ROOT_DIR/tests/opencl/$APP" ];
then
APP_PATH=$ROOT_DIR/tests/opencl/$APP
elif [ -d "$ROOT_DIR/tests/regression/$APP" ];
then
APP_PATH=$ROOT_DIR/tests/regression/$APP
else
echo "Application folder not found: $APP"
exit -1
fi
if [ "$DRIVER" = "gpu" ];
then
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
add_option() {
if [ -n "$1" ]; then
echo "$1 $2"
else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
echo "$2"
fi
}
DEFAULTS() {
DRIVER=simx
APP=sgemm
DEBUG=0
DEBUG_LEVEL=0
SCOPE=0
HAS_ARGS=0
PERF_CLASS=0
CONFIGS="$CONFIGS"
REBUILD=2
TEMPBUILD=0
LOGFILE=run.log
}
parse_args() {
DEFAULTS
for i in "$@"; do
case $i in
--driver=*) DRIVER=${i#*=} ;;
--app=*) APP=${i#*=} ;;
--clusters=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CLUSTERS=${i#*=}") ;;
--cores=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CORES=${i#*=}") ;;
--warps=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_WARPS=${i#*=}") ;;
--threads=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_THREADS=${i#*=}") ;;
--l2cache) CONFIGS=$(add_option "$CONFIGS" "-DL2_ENABLE") ;;
--l3cache) CONFIGS=$(add_option "$CONFIGS" "-DL3_ENABLE") ;;
--perf=*) CONFIGS=$(add_option "$CONFIGS" "-DPERF_ENABLE"); PERF_CLASS=${i#*=} ;;
--debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;;
--scope) SCOPE=1; ;;
--args=*) HAS_ARGS=1; ARGS=${i#*=} ;;
--rebuild=*) REBUILD=${i#*=} ;;
--log=*) LOGFILE=${i#*=} ;;
--help) show_help; exit 0 ;;
*) show_usage; exit 1 ;;
esac
done
if [ $REBUILD -eq 3 ];
then
REBUILD=1
TEMPBUILD=1
fi
}
set_driver_path() {
case $DRIVER in
gpu) DRIVER_PATH="" ;;
simx|rtlsim|opae|xrt) DRIVER_PATH="$ROOT_DIR/runtime/$DRIVER" ;;
*) echo "Invalid driver: $DRIVER"; exit 1 ;;
esac
}
set_app_path() {
if [ -d "$ROOT_DIR/tests/opencl/$APP" ]; then
APP_PATH="$ROOT_DIR/tests/opencl/$APP"
elif [ -d "$ROOT_DIR/tests/regression/$APP" ]; then
APP_PATH="$ROOT_DIR/tests/regression/$APP"
else
echo "Application folder not found: $APP"
exit 1
fi
}
build_driver() {
local cmd_opts=""
[ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=$DEBUG_LEVEL")
[ $SCOPE -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "SCOPE=1")
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DESTDIR=\"$TEMPDIR\"")
[ -n "$CONFIGS" ] && cmd_opts=$(add_option "$cmd_opts" "CONFIGS=\"$CONFIGS\"")
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $DRIVER_PATH > /dev/null"
eval "$cmd_opts make -C $DRIVER_PATH > /dev/null"
else
echo "Running: make -C $DRIVER_PATH > /dev/null"
make -C $DRIVER_PATH > /dev/null
fi
}
run_app() {
local cmd_opts=""
[ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1")
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"")
[ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"")
if [ $DEBUG -ne 0 ]; then
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
eval "$cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
else
echo "Running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
fi
else
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER"
eval "$cmd_opts make -C $APP_PATH run-$DRIVER"
else
echo "Running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
fi
fi
status=$?
return $status
}
main() {
parse_args "$@"
set_driver_path
set_app_path
# execute on default installed GPU
if [ "$DRIVER" = "gpu" ]; then
run_app
exit $?
fi
if [ -n "$CONFIGS" ]; then
echo "CONFIGS=$CONFIGS"
fi
if [ $REBUILD -ne 0 ]; then
BLACKBOX_CACHE=blackbox.$DRIVER.cache
LAST_CONFIGS=$(cat "$BLACKBOX_CACHE" 2>/dev/null || echo "")
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; then
make -C $DRIVER_PATH clean-driver > /dev/null
echo "$CONFIGS+$DEBUG+$SCOPE" > "$BLACKBOX_CACHE"
fi
fi
export VORTEX_PROFILING=$PERF_CLASS
make -C "$ROOT_DIR/hw" config > /dev/null
make -C "$ROOT_DIR/runtime/stub" > /dev/null
if [ $TEMPBUILD -eq 1 ]; then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR"
# build stub driver
echo "running: DESTDIR=$TEMPDIR make -C $ROOT_DIR/runtime/stub"
DESTDIR="$TEMPDIR" make -C $ROOT_DIR/runtime/stub > /dev/null
# register tempdir cleanup on exit
trap "rm -rf $TEMPDIR" EXIT
fi
build_driver
run_app
status=$?
if [ $DEBUG -eq 1 ] && [ -f "$APP_PATH/trace.vcd" ]; then
mv -f $APP_PATH/trace.vcd .
fi
if [ $SCOPE -eq 1 ] && [ -f "$APP_PATH/scope.vcd" ]; then
mv -f $APP_PATH/scope.vcd .
fi
exit $status
fi
}
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS"
if [ $REBUILD -ne 0 ]
then
BLACKBOX_CACHE=blackbox.$DRIVER.cache
if [ -f "$BLACKBOX_CACHE" ]
then
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
fi
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
then
make -C $DRIVER_PATH clean-driver > /dev/null
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
fi
fi
# export performance monitor class identifier
export VORTEX_PROFILING=$PERF_CLASS
status=0
# ensure config update
make -C $ROOT_DIR/hw config > /dev/null
# ensure the stub driver is present
make -C $ROOT_DIR/runtime/stub > /dev/null
if [ $DEBUG -ne 0 ]
then
# running application
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
fi
if [ -f "$APP_PATH/trace.vcd" ]
then
mv -f $APP_PATH/trace.vcd .
fi
else
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH"
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
fi
fi
fi
exit $status
main "$@"

46
ci/install_dependencies.sh Executable file
View file

@ -0,0 +1,46 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Function to check if GCC version is less than 11
check_gcc_version() {
local gcc_version
gcc_version=$(gcc -dumpversion)
if dpkg --compare-versions "$gcc_version" lt 11; then
return 0 # GCC version is less than 11
else
return 1 # GCC version is 11 or greater
fi
}
# Update package list
apt-get update -y
# install system dependencies
apt-get install -y build-essential valgrind libstdc++6 binutils python3 uuid-dev ccache
# Check and install GCC 11 if necessary
if check_gcc_version; then
echo "GCC version is less than 11. Installing GCC 11..."
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update
apt-get install -y g++-11 gcc-11
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100
else
echo "GCC version is 11 or greater. No need to install GCC 11."
fi

View file

@ -21,39 +21,10 @@ rm -f blackbox.*.cache
XLEN=${XLEN:=@XLEN@}
XSIZE=$((XLEN / 8))
echo "Vortex Regression Test: XLEN=$XLEN"
split_file() {
if [[ $# -ne 2 ]]; then
echo "Usage: $0 <filename> <start_with>"
return 1
fi
input_file="$1"
start_with="$2"
if [[ ! -r "$input_file" ]]; then
echo "Error: File '$input_file' is not readable or does not exist."
return 1
fi
count=0
output_file=""
while IFS= read -r line; do
if [[ $line == $start_with* ]]; then
count=$((count + 1))
output_file="$input_file.part$count"
> "$output_file" # ensure empty
fi
if [[ -n "$output_file" ]]; then
echo "$line" >> "$output_file"
fi
done < "$input_file"
if [[ $count -eq 0 ]]; then
echo "No lines starting with '$start_with' were found in '$input_file'."
fi
}
###############################################################################
unittest()
{
make -C tests/unittest run
@ -64,38 +35,33 @@ isa()
{
echo "begin isa tests..."
make -C sim/simx
make -C sim/rtlsim
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64fx
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64fx
fi
# restore default prebuilt configuration
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
# clean build
make -C sim/rtlsim clean
echo "isa tests done!"
}
@ -104,6 +70,9 @@ kernel()
{
echo "begin kernel tests..."
make -C sim/simx
make -C sim/rtlsim
make -C tests/kernel run-simx
make -C tests/kernel run-rtlsim
@ -114,16 +83,24 @@ regression()
{
echo "begin regression tests..."
make -C runtime/simx
make -C runtime/rtlsim
make -C tests/regression run-simx
make -C tests/regression run-rtlsim
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tgbar" --cores=2
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tbar"
# test temp driver mode for
./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3
# test for matmul
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"
@ -135,6 +112,9 @@ opencl()
{
echo "begin opencl tests..."
make -C runtime/simx
make -C runtime/rtlsim
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
@ -144,78 +124,75 @@ opencl()
echo "opencl tests done!"
}
cluster()
cache()
{
echo "begin clustering tests..."
echo "begin cache tests..."
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test writeback
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
# cache clustering
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
echo "clustering tests done!"
echo "begin cache tests..."
}
test_csv_trace()
config1()
{
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
split_file run_simx.log "Running "
split_file run_rtlsim.log "Running "
for file in ./run_simx.log.part*; do
if [[ -f "$file" ]]; then
file2="${file//simx/rtlsim}"
if [[ -f "$file2" ]]; then
./ci/trace_csv.py -tsimx $file -otrace_simx.csv
./ci/trace_csv.py -trtlsim $file2 -otrace_rtlsim.csv
diff trace_rtlsim.csv trace_simx.csv
else
echo "File $file2 not found."
fi
fi
done
# restore default prebuilt configuration
make -C sim/simx clean && make -C sim/simx > /dev/null
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
}
echo "begin configuration-1 tests..."
debug()
{
echo "begin debugging tests..."
test_csv_trace
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
# warp/threads
./ci/blackbox.sh --driver=rtlsim --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge
echo "debugging tests done!"
}
config()
{
echo "begin configuration tests..."
# warp/threads configurations
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
# issue width
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
@ -235,66 +212,132 @@ config()
CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# FPU's PE scaling
CONFIGS="-DFMA_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfmadd"
CONFIGS="-DFCVT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tftoi"
CONFIGS="-DFDIV_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfdiv"
CONFIGS="-DFSQRT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfsqrt"
CONFIGS="-DFNCP_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfclamp"
# LSU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
echo "configuration-1 tests done!"
}
config2()
{
echo "begin configuration-2 tests..."
# test opaesim
./ci/blackbox.sh --driver=opae --app=printf
./ci/blackbox.sh --driver=opae --app=diverge
./ci/blackbox.sh --driver=xrt --app=diverge
# disable DPI
if [ "$XLEN" == "64" ]; then
# need to disable trig on 64-bit due to a bug inside fpnew's sqrt core.
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-xtrig -xbar -xgbar"
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-xtrig -xbar -xgbar"
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-xtrig -xbar -xgbar"
else
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood
fi
# custom program startup address
make -C tests/regression/dogfood clean-kernel
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
./ci/blackbox.sh --driver=simx --app=dogfood
./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-kernel
# disabling M & F extensions
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean
# disabling ZICOND extension
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=demo --perf=1
# test 128-bit memory block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=xrt --app=mstress
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
# test XLEN-bit memory block
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress
# multiple L1 caches per socket
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=8 --warps=1 --threads=2
# test memory coalescing
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
# test single-bank memory
if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=48" ./ci/blackbox.sh --driver=xrt --app=mstress
else
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32" ./ci/blackbox.sh --driver=xrt --app=mstress
fi
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=4" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=4" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=4 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=4 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test larger memory address
if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=xrt --app=mstress
else
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=xrt --app=mstress
fi
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemmx
# test memory banks interleaving
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
echo "configuration-2 tests done!"
}
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
test_csv_trace()
{
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
diff trace_rtlsim.csv trace_simx.csv
# clean build
make -C sim/simx clean
make -C sim/rtlsim clean
}
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
debug()
{
echo "begin debugging tests..."
echo "configuration tests done!"
test_csv_trace
CONFIGS="-O0" ./ci/blackbox.sh --driver=opae --app=demo --args="-n1"
CONFIGS="-O0" ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=xrt --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
echo "debugging tests done!"
}
scope()
{
echo "begin scope tests..."
SCOPE_DEPTH=256 ./ci/blackbox.sh --driver=opae --app=demo --args="-n1" --scope
SCOPE_DEPTH=256 ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1" --scope
echo "debugging scope done!"
}
stress()
@ -302,10 +345,8 @@ stress()
echo "begin stress tests..."
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=xrt --app=sgemmx --args="-n128" --l2cache
echo "stress tests done!"
}
@ -315,7 +356,7 @@ synthesis()
echo "begin synthesis tests..."
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis
echo "synthesis tests done!"
}
@ -323,11 +364,9 @@ synthesis()
show_usage()
{
echo "Vortex Regression Test"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cluster] [--debug] [--config] [--stress] [--synthesis] [--all] [--h|--help]"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--all] [--h|--help]"
}
start=$SECONDS
declare -a tests=()
clean=0
@ -351,14 +390,20 @@ while [ "$1" != "" ]; do
--opencl )
tests+=("opencl")
;;
--cluster )
tests+=("cluster")
--cache )
tests+=("cache")
;;
--config1 )
tests+=("config1")
;;
--config2 )
tests+=("config2")
;;
--debug )
tests+=("debug")
;;
--config )
tests+=("config")
--scope )
tests+=("scope")
;;
--stress )
tests+=("stress")
@ -373,9 +418,11 @@ while [ "$1" != "" ]; do
tests+=("kernel")
tests+=("regression")
tests+=("opencl")
tests+=("cluster")
tests+=("cache")
tests+=("config1")
tests+=("config2")
tests+=("debug")
tests+=("config")
tests+=("scope")
tests+=("stress")
tests+=("synthesis")
;;
@ -396,6 +443,8 @@ then
make -s
fi
start=$SECONDS
for test in "${tests[@]}"; do
$test
done

View file

@ -1,13 +1,13 @@
#!/bin/sh
# Copyright 2019-2023
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,8 +16,7 @@
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
export VERILATOR_ROOT=$TOOLDIR/verilator
export PATH=$VERILATOR_ROOT/bin:$PATH
export PATH=$TOOLDIR/verilator/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v
export PATH=$SV2V_PATH/bin:$PATH

View file

@ -23,9 +23,9 @@ OSVERSION=${OSVERSION:=@OSVERSION@}
riscv32()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..h}) ;;
"ubuntu/focal") parts=$(eval echo {a..k}) ;;
*) parts=$(eval echo {a..j}) ;;
"centos/7") parts=$(eval echo {a..l}) ;;
"ubuntu/bionic") parts=$(eval echo {a..j}) ;;
*) parts=$(eval echo {a..k}) ;;
esac
rm -f riscv32-gnu-toolchain.tar.bz2.parta*
for x in $parts
@ -41,7 +41,7 @@ riscv32()
riscv64()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..h}) ;;
"centos/7") parts=$(eval echo {a..l}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv64-gnu-toolchain.tar.bz2.parta*

View file

@ -19,6 +19,8 @@ import csv
import re
import inspect
configs = None
def parse_args():
parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.')
parser.add_argument('-t', '--type', default='simx', help='log type (rtlsim or simx)')
@ -26,7 +28,26 @@ def parse_args():
parser.add_argument('log', help='Input log file')
return parser.parse_args()
def parse_simx(log_filename):
def load_config(filename):
config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=0x([0-9a-fA-F]+), num_barriers=(\d+)"
with open(filename, 'r') as file:
for line in file:
config_match = re.search(config_pattern, line)
if config_match:
config = {
'num_threads': int(config_match.group(1)),
'num_warps': int(config_match.group(2)),
'num_cores': int(config_match.group(3)),
'num_clusters': int(config_match.group(4)),
'socket_size': int(config_match.group(5)),
'local_mem_base': int(config_match.group(6), 16),
'num_barriers': int(config_match.group(7)),
}
return config
print("Error: missing CONFIGS: header")
sys.exit(1)
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"Instr (0x[0-9a-fA-F]+):"
opcode_pattern = r"Instr 0x[0-9a-fA-F]+: ([0-9a-zA-Z_\.]+)"
@ -37,32 +58,32 @@ def parse_simx(log_filename):
destination_pattern = r"Dest Reg: (.+)"
uuid_pattern = r"#(\d+)"
entries = []
with open(log_filename, 'r') as log_file:
instr_data = None
for lineno, line in enumerate(log_file, start=1):
try:
if line.startswith("DEBUG Fetch:"):
if instr_data:
entries.append(instr_data)
instr_data = {}
instr_data["lineno"] = lineno
instr_data["PC"] = re.search(pc_pattern, line).group(1)
instr_data["core_id"] = re.search(core_id_pattern, line).group(1)
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1)
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
instr_data["uuid"] = re.search(uuid_pattern, line).group(1)
elif line.startswith("DEBUG Instr"):
instr_data["instr"] = re.search(instr_pattern, line).group(1)
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
elif line.startswith("DEBUG Src"):
src_reg = re.search(operands_pattern, line).group(1)
instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg
elif line.startswith("DEBUG Dest"):
instr_data["destination"] = re.search(destination_pattern, line).group(1)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
if instr_data:
entries.append(instr_data)
instr_data = None
for lineno, line in enumerate(log_lines, start=1):
try:
if line.startswith("DEBUG Fetch:"):
if instr_data:
entries.append(instr_data)
instr_data = {}
instr_data["lineno"] = lineno
instr_data["PC"] = re.search(pc_pattern, line).group(1)
instr_data["core_id"] = int(re.search(core_id_pattern, line).group(1))
instr_data["warp_id"] = int(re.search(warp_id_pattern, line).group(1))
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
instr_data["uuid"] = int(re.search(uuid_pattern, line).group(1))
elif line.startswith("DEBUG Instr"):
instr_data["instr"] = re.search(instr_pattern, line).group(1)
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
elif line.startswith("DEBUG Src"):
src_reg = re.search(operands_pattern, line).group(1)
instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg
elif line.startswith("DEBUG Dest"):
instr_data["destination"] = re.search(destination_pattern, line).group(1)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
instr_data = None
if instr_data:
entries.append(instr_data)
return entries
def reverse_binary(bin_str):
@ -95,8 +116,9 @@ def append_value(text, reg, value, tmask_arr, sep):
text += "}"
return text, sep
def parse_rtlsim(log_filename):
line_pattern = r"\d+: core(\d+)-(decode|issue|commit)"
def parse_rtlsim(log_lines):
global configs
line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)"
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"instr=(0x[0-9a-fA-F]+)"
ex_pattern = r"ex=([a-zA-Z]+)"
@ -116,124 +138,154 @@ def parse_rtlsim(log_filename):
eop_pattern = r"eop=(\d)"
uuid_pattern = r"#(\d+)"
entries = []
with open(log_filename, 'r') as log_file:
instr_data = {}
for lineno, line in enumerate(log_file, start=1):
try:
line_match = re.search(line_pattern, line)
if line_match:
PC = re.search(pc_pattern, line).group(1)
warp_id = re.search(warp_id_pattern, line).group(1)
tmask = re.search(tmask_pattern, line).group(1)
uuid = re.search(uuid_pattern, line).group(1)
core_id = line_match.group(1)
stage = line_match.group(2)
if stage == "decode":
trace = {}
trace["uuid"] = uuid
trace["PC"] = PC
trace["core_id"] = core_id
trace["warp_id"] = warp_id
trace["tmask"] = reverse_binary(tmask)
trace["instr"] = re.search(instr_pattern, line).group(1)
trace["opcode"] = re.search(op_pattern, line).group(1)
trace["opds"] = bin_to_array(re.search(opds_pattern, line).group(1))
trace["rd"] = re.search(rd_pattern, line).group(1)
trace["rs1"] = re.search(rs1_pattern, line).group(1)
trace["rs2"] = re.search(rs2_pattern, line).group(1)
trace["rs3"] = re.search(rs3_pattern, line).group(1)
instr_data = {}
num_cores = configs['num_cores']
socket_size = configs['socket_size']
num_sockets = (num_cores + socket_size - 1) // socket_size
for lineno, line in enumerate(log_lines, start=1):
try:
line_match = re.search(line_pattern, line)
if line_match:
PC = re.search(pc_pattern, line).group(1)
warp_id = int(re.search(warp_id_pattern, line).group(1))
tmask = re.search(tmask_pattern, line).group(1)
uuid = int(re.search(uuid_pattern, line).group(1))
cluster_id = int(line_match.group(1))
socket_id = int(line_match.group(2))
core_id = int(line_match.group(3))
stage = line_match.group(4)
if stage == "decode":
trace = {}
trace["uuid"] = uuid
trace["PC"] = PC
trace["core_id"] = ((((cluster_id * num_sockets) + socket_id) * socket_size) + core_id)
trace["warp_id"] = warp_id
trace["tmask"] = reverse_binary(tmask)
trace["instr"] = re.search(instr_pattern, line).group(1)
trace["opcode"] = re.search(op_pattern, line).group(1)
trace["opds"] = bin_to_array(re.search(opds_pattern, line).group(1))
trace["rd"] = re.search(rd_pattern, line).group(1)
trace["rs1"] = re.search(rs1_pattern, line).group(1)
trace["rs2"] = re.search(rs2_pattern, line).group(1)
trace["rs3"] = re.search(rs3_pattern, line).group(1)
instr_data[uuid] = trace
elif stage == "issue":
if uuid in instr_data:
trace = instr_data[uuid]
trace["lineno"] = lineno
opds = trace["opds"]
if opds[1]:
trace["rs1_data"] = re.search(rs1_data_pattern, line).group(1).split(', ')[::-1]
if opds[2]:
trace["rs2_data"] = re.search(rs2_data_pattern, line).group(1).split(', ')[::-1]
if opds[3]:
trace["rs3_data"] = re.search(rs3_data_pattern, line).group(1).split(', ')[::-1]
trace["issued"] = True
instr_data[uuid] = trace
elif stage == "issue":
if uuid in instr_data:
trace = instr_data[uuid]
trace["lineno"] = lineno
elif stage == "commit":
if uuid in instr_data:
trace = instr_data[uuid]
if "issued" in trace:
opds = trace["opds"]
if opds[1]:
trace["rs1_data"] = re.search(rs1_data_pattern, line).group(1).split(', ')[::-1]
if opds[2]:
trace["rs2_data"] = re.search(rs2_data_pattern, line).group(1).split(', ')[::-1]
if opds[3]:
trace["rs3_data"] = re.search(rs3_data_pattern, line).group(1).split(', ')[::-1]
trace["issued"] = True
dst_tmask_arr = bin_to_array(tmask)[::-1]
wb = re.search(wb_pattern, line).group(1) == "1"
if wb:
rd_data = re.search(rd_data_pattern, line).group(1).split(', ')[::-1]
if 'rd_data' in trace:
merged_rd_data = trace['rd_data']
for i in range(len(dst_tmask_arr)):
if dst_tmask_arr[i] == 1:
merged_rd_data[i] = rd_data[i]
trace['rd_data'] = merged_rd_data
else:
trace['rd_data'] = rd_data
instr_data[uuid] = trace
elif stage == "commit":
if uuid in instr_data:
trace = instr_data[uuid]
if "issued" in trace:
opds = trace["opds"]
dst_tmask_arr = bin_to_array(tmask)[::-1]
wb = re.search(wb_pattern, line).group(1) == "1"
eop = re.search(eop_pattern, line).group(1) == "1"
if eop:
tmask_arr = bin_to_array(trace["tmask"])
destination = ''
if wb:
rd_data = re.search(rd_data_pattern, line).group(1).split(', ')[::-1]
if 'rd_data' in trace:
merged_rd_data = trace['rd_data']
for i in range(len(dst_tmask_arr)):
if dst_tmask_arr[i] == 1:
merged_rd_data[i] = rd_data[i]
trace['rd_data'] = merged_rd_data
else:
trace['rd_data'] = rd_data
instr_data[uuid] = trace
eop = re.search(eop_pattern, line).group(1) == "1"
if eop:
tmask_arr = bin_to_array(trace["tmask"])
destination = ''
if wb:
destination, sep = append_value(destination, trace["rd"], trace['rd_data'], tmask_arr, False)
del trace['rd_data']
trace["destination"] = destination
operands = ''
sep = False
if opds[1]:
operands, sep = append_value(operands, trace["rs1"], trace["rs1_data"], tmask_arr, sep)
del trace["rs1_data"]
if opds[2]:
operands, sep = append_value(operands, trace["rs2"], trace["rs2_data"], tmask_arr, sep)
del trace["rs2_data"]
if opds[3]:
operands, sep = append_value(operands, trace["rs3"], trace["rs3_data"], tmask_arr, sep)
del trace["rs3_data"]
trace["operands"] = operands
del trace["opds"]
del trace["rd"]
del trace["rs1"]
del trace["rs2"]
del trace["rs3"]
del trace["issued"]
del instr_data[uuid]
entries.append(trace)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
destination, sep = append_value(destination, trace["rd"], trace['rd_data'], tmask_arr, False)
del trace['rd_data']
trace["destination"] = destination
operands = ''
sep = False
if opds[1]:
operands, sep = append_value(operands, trace["rs1"], trace["rs1_data"], tmask_arr, sep)
del trace["rs1_data"]
if opds[2]:
operands, sep = append_value(operands, trace["rs2"], trace["rs2_data"], tmask_arr, sep)
del trace["rs2_data"]
if opds[3]:
operands, sep = append_value(operands, trace["rs3"], trace["rs3_data"], tmask_arr, sep)
del trace["rs3_data"]
trace["operands"] = operands
del trace["opds"]
del trace["rd"]
del trace["rs1"]
del trace["rs2"]
del trace["rs3"]
del trace["issued"]
del instr_data[uuid]
entries.append(trace)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
return entries
def write_csv(log_filename, csv_filename, log_type):
entries = None
# parse log file
if log_type == "rtlsim":
entries = parse_rtlsim(log_filename)
elif log_type == "simx":
entries = parse_simx(log_filename)
else:
print('Error: invalid log type')
sys.exit()
# sort entries by uuid
entries.sort(key=lambda x: (int(x['uuid'])))
for entry in entries:
del entry['lineno']
# write to CSV
def write_csv(sublogs, csv_filename, log_type):
with open(csv_filename, 'w', newline='') as csv_file:
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "destination", "operands"]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for entry in entries:
writer.writerow(entry)
for sublog in sublogs:
entries = None
# parse sublog
if log_type == "rtlsim":
entries = parse_rtlsim(sublog)
elif log_type == "simx":
entries = parse_simx(sublog)
else:
print('Error: invalid log type')
sys.exit()
# sort entries by uuid
entries.sort(key=lambda x: (int(x['uuid'])))
for entry in entries:
del entry['lineno']
for entry in entries:
writer.writerow(entry)
def split_log_file(log_filename):
with open(log_filename, 'r') as log_file:
log_lines = log_file.readlines()
sublogs = []
current_sublog = None
for line in log_lines:
if line.startswith("[VXDRV] START"):
if current_sublog is not None:
sublogs.append(current_sublog)
current_sublog = [line]
elif current_sublog is not None:
current_sublog.append(line)
if current_sublog is not None:
sublogs.append(current_sublog)
else:
sublogs.append(log_lines)
return sublogs
def main():
global configs
args = parse_args()
write_csv(args.log, args.csv, args.type)
configs = load_config(args.log)
sublogs = split_log_file(args.log)
write_csv(sublogs, args.csv, args.type)
if __name__ == "__main__":
main()

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# Copyright 2019-2023
#

View file

@ -19,7 +19,7 @@ TOOLDIR ?= @TOOLDIR@
OSVERSION ?= @OSVERSION@
PREFIX ?= @PREFIX@
INSTALLDIR ?= @INSTALLDIR@
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
@ -31,5 +31,4 @@ RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv$(XLEN)-gnu-toolchain
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party

10
configure vendored
View file

@ -26,6 +26,8 @@ detect_osversion() {
case "$VERSION_CODENAME" in
bionic) osversion="ubuntu/bionic";;
focal) osversion="ubuntu/focal";;
jammy) osversion="ubuntu/focal";;
noble) osversion="ubuntu/focal";;
# Add new versions as needed
esac
;;
@ -63,7 +65,7 @@ copy_files() {
filename_no_ext="${filename%.in}"
dest_file="$dest_dir/$filename_no_ext"
mkdir -p "$dest_dir"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@PREFIX@|$PREFIX|g" "$file" > "$dest_file"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@CURRENTDIR@|$CURRENT_DIR|g" "$file" > "$dest_file"
# apply permissions to bash scripts
read -r firstline < "$dest_file"
if [[ "$firstline" =~ ^#!.*bash ]]; then
@ -111,7 +113,7 @@ copy_files() {
# default configuration parameters
default_xlen=32
default_tooldir=/opt
default_tooldir=$HOME/tools
default_osversion=$(detect_osversion)
default_prefix=$CURRENT_DIR
@ -140,8 +142,8 @@ PREFIX=${PREFIX:=$default_prefix}
usage() {
echo "Usage: $0 [--xlen=<value>] [--tooldir=<path>] [--osversion=<version>]"
echo " --xlen=<value> Set the XLEN value (default: 32)"
echo " --tooldir=<path> Set the TOOLDIR path (default: /opt)"
echo " --osversion=<version> Set the OS Version (default: $(detect_os))"
echo " --tooldir=<path> Set the TOOLDIR path (default: $HOME/tools)"
echo " --osversion=<version> Set the OS Version (default: $(detect_osversion))"
echo " --prefix=<path> Set installation directory"
exit 1
}

View file

@ -34,7 +34,7 @@ The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware
- `NUM_THREADS`: Number of threads per warps
- `PERF_ENABLE`: enable the use of all profile counters
You configure the syntesis build from the command line:
You can configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
@ -43,7 +43,7 @@ OPAE Build Progress
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 <build_dir>/build.log
$ tail -n 10 <build_dir>/synth/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
@ -53,7 +53,7 @@ If the build fails and you need to restart it, clean up the build folder using t
$ make clean
The file `vortex_afu.gbs` should exist when the build is done:
The bitstream file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
@ -65,10 +65,28 @@ Signing the bitstream and Programming the FPGA
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
FPGA sample test running OpenCL sgemm kernel
--------------------------------------------
Sample FPGA Run Test
--------------------
Run the following from the Vortex root directory
Ensure you have the correct opae runtime for the FPGA target
$ TARGET=FPGA make -C runtime/opae
Run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
Testing Vortex using OPAE with Intel ASE Simulation
---------------------------------------------------
Building ASE synthesis
$ TARGET=asesim make -C runtime/opae
Building ASE runtime
$ TARGET=asesim make -C runtime/opae
Running ASE simulation
$ ASE_LOG=0 ASE_WORKDIR=<build_dir>/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n16"

View file

@ -53,9 +53,9 @@ A waveform trace `trace.vcd` will be generated in the current directory during t
## Analyzing Vortex trace log
When debugging Vortex RTL or SimX Simulator, reading the trace run.log file can be overwhelming when the trace gets really large.
We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands. To increase compatibility between traces you will need to initialize RTLSIM's GPRs to zero by defining GPR_RESET.
We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands.
$ CONFIGS="-DGPR_RESET" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log
$ ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log

View file

@ -7,7 +7,8 @@
- [Cache Subsystem](cache_subsystem.md)
- [Software](software.md)
- [Simulation](simulation.md)
- [FPGA Setup Guide](fpga_setup.md)
- [Altera FPGA Setup Guide](altera_fpga_guide.md)
- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md)
- [Debugging](debugging.md)
- [Useful Links](references.md)
@ -27,6 +28,6 @@ Running Vortex simulators with different configurations:
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood

52
docs/xilinx_fpga_guide.md Normal file
View file

@ -0,0 +1,52 @@
# FPGA Startup and Configuration Guide
XRT Environment Setup
----------------------
$ source /opt/xilinx/Vitis/2023.1/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
Check Installed FPGA Platforms
------------------------------
$ platforminfo -l
Build FPGA image
----------------
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make
Will run the synthesis under new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
Sample FPGA Run Test
--------------------
Ensure you have the correct opae runtime for the FPGA target
$ make -C runtime/xrt clean
$ TARGET=hw make -C runtime/xrt
Run the following from your Vortex build directory
$ TARGET=hw FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128"
Testing Vortex using XRT Hardware Emulation
-------------------------------------------
Building XRT's hw_emu target
$ cd hw/syn/xilinx/xrt
$ PREFIX=test2 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw_emu make
Building XRT hw_meu runtime
$ TARGET=hw_emu make -C runtime/xrt
Running XRT hw_emu simulation
$ TARGET=hw_emu FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm

View file

@ -9,13 +9,14 @@ all: config
config: VX_config.h VX_types.h
VX_config.h: $(RTL_DIR)/VX_config.vh
VX_config.h: $(RTL_DIR)/VX_config.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_config.vh -o VX_config.h
VX_types.h: $(RTL_DIR)/VX_types.vh
VX_types.h: $(RTL_DIR)/VX_types.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
clean:
$(MAKE) -C unittest clean
rm -f VX_config.h VX_types.h
.PHONY: VX_config.h VX_types.h

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,8 +14,6 @@
`ifndef FLOAT_DPI_VH
`define FLOAT_DPI_VH
`include "VX_config.vh"
import "DPI-C" function void dpi_fadd(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsub(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmul(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);

View file

@ -47,8 +47,6 @@ extern "C" {
void dpi_trace(int level, const char* format, ...);
void dpi_trace_start();
void dpi_trace_stop();
uint64_t dpi_uuid_gen(bool reset, int wid);
}
bool sim_trace_enabled();
@ -204,17 +202,3 @@ void dpi_trace_start() {
void dpi_trace_stop() {
sim_trace_enable(false);
}
///////////////////////////////////////////////////////////////////////////////
std::unordered_map<uint32_t, uint32_t> g_uuid_gens;
uint64_t dpi_uuid_gen(bool reset, int wid) {
if (reset) {
g_uuid_gens.clear();
return 0;
}
uint32_t instr_uuid = g_uuid_gens[wid]++;
uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid;
return uuid;
}

View file

@ -14,8 +14,6 @@
`ifndef UTIL_DPI_VH
`define UTIL_DPI_VH
`include "VX_config.vh"
`ifdef XLEN_64
`define INT_TYPE longint
`else
@ -32,6 +30,4 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve
import "DPI-C" function void dpi_trace_start();
import "DPI-C" function void dpi_trace_stop();
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid);
`endif

View file

@ -14,7 +14,8 @@
`include "VX_define.vh"
module VX_cluster import VX_gpu_pkg::*; #(
parameter CLUSTER_ID = 0
parameter CLUSTER_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -55,14 +56,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
VX_gbar_bus_if gbar_bus_if();
`RESET_RELAY (gbar_reset, reset);
VX_gbar_arb #(
.NUM_REQS (`NUM_SOCKETS),
.OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
) gbar_arb (
.clk (clk),
.reset (gbar_reset),
.reset (reset),
.bus_in_if (per_socket_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
@ -71,7 +70,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID))
) gbar_unit (
.clk (clk),
.reset (gbar_reset),
.reset (reset),
.gbar_bus_if (gbar_bus_if)
);
@ -85,7 +84,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
`RESET_RELAY (l2_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID ("l2cache"),
.INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)),
.CACHE_SIZE (`L2_CACHE_SIZE),
.LINE_SIZE (`L2_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS),
@ -95,12 +94,14 @@ module VX_cluster import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE),
.MREQ_SIZE (`L2_WRITEBACK ? `L2_MSHR_SIZE : `L2_MREQ_SIZE),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (3),
.NC_ENABLE (1),
.PASSTHRU (!`L2_ENABLED)
) l2cache (
@ -115,24 +116,22 @@ module VX_cluster import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
VX_dcr_bus_if socket_dcr_bus_tmp_if();
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
assign socket_dcr_bus_tmp_if.write_data = dcr_bus_if.write_data;
wire [`NUM_SOCKETS-1:0] per_socket_busy;
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
// Generate all sockets
for (genvar i = 0; i < `NUM_SOCKETS; ++i) begin
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : g_sockets
`RESET_RELAY (socket_reset, reset);
VX_dcr_bus_if socket_dcr_bus_if();
wire is_base_dcr_addr = (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, dcr_bus_if, is_base_dcr_addr, (`NUM_SOCKETS > 1))
VX_socket #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
.INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id))
) socket (
`SCOPE_IO_BIND (scope_socket+i)
`SCOPE_IO_BIND (scope_socket+socket_id)
.clk (clk),
.reset (socket_reset),
@ -143,13 +142,13 @@ module VX_cluster import VX_gpu_pkg::*; #(
.dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[i]),
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]),
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
`endif
.busy (per_socket_busy[i])
.busy (per_socket_busy[socket_id])
);
end

View file

@ -109,7 +109,6 @@
`ifndef SOCKET_SIZE
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
// Size of Tensor Core
`ifndef TC_SIZE
@ -221,7 +220,7 @@
`ifndef IO_COUT_ADDR
`define IO_COUT_ADDR `IO_BASE_ADDR
`endif
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
`define IO_COUT_SIZE 64
`ifndef IO_MPM_ADDR
`define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
@ -233,15 +232,17 @@
`endif
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
`define RESET_DELAY 8
`define RESET_DELAY 8
`ifndef STALL_TIMEOUT
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif
`ifndef SV_DPI
`ifndef DPI_DISABLE
`define DPI_DISABLE
`endif
`endif
`ifndef FPU_FPNEW
`ifndef FPU_DSP
@ -303,7 +304,7 @@
// Number of SFU units
`ifndef NUM_SFU_LANES
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
`define NUM_SFU_LANES `NUM_THREADS
`endif
`ifndef NUM_SFU_BLOCKS
`define NUM_SFU_BLOCKS 1
@ -427,22 +428,27 @@
`define LATENCY_FCVT 5
`endif
// FMA Bandwidth ratio
`ifndef FMA_PE_RATIO
`define FMA_PE_RATIO 1
`endif
// FDIV Bandwidth ratio
`ifndef FDIV_PE_RATIO
`define FDIV_PE_RATIO 8
`endif
// FSQRT Bandwidth ratio
`ifndef FSQRT_PE_RATIO
`define FSQRT_PE_RATIO 8
`endif
// FCVT Bandwidth ratio
`ifndef FCVT_PE_RATIO
`define FCVT_PE_RATIO 8
`endif
// FNCP Bandwidth ratio
`ifndef FNCP_PE_RATIO
`define FNCP_PE_RATIO 2
`endif
@ -549,7 +555,12 @@
`define DCACHE_NUM_WAYS 1
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
// Enable Cache Writeback
`ifndef DCACHE_WRITEBACK
`define DCACHE_WRITEBACK 0
`endif
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
`define LMEM_ENABLE
@ -608,6 +619,11 @@
`define L2_NUM_WAYS 2
`endif
// Enable Cache Writeback
`ifndef L2_WRITEBACK
`define L2_WRITEBACK 0
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
@ -621,7 +637,7 @@
// Number of Banks
`ifndef L3_NUM_BANKS
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS)
`define L3_NUM_BANKS `MIN(8, `NUM_CLUSTERS)
`endif
// Core Response Queue Size
@ -649,6 +665,20 @@
`define L3_NUM_WAYS 4
`endif
// Enable Cache Writeback
`ifndef L3_WRITEBACK
`define L3_WRITEBACK 0
`endif
`ifndef MEMORY_BANKS
`define MEMORY_BANKS 2
`endif
// Number of Memory Ports from LLC
`ifndef NUM_MEM_PORTS
`define NUM_MEM_PORTS `MIN(`MEMORY_BANKS, `L3_NUM_BANKS)
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE

View file

@ -52,13 +52,19 @@
`ifndef NDEBUG
`define UUID_WIDTH 44
`else
`ifdef SCOPE
`define UUID_WIDTH 44
`else
`define UUID_WIDTH 1
`endif
`endif
`define PC_BITS (`XLEN-1)
`define OFFSET_BITS 12
`define IMM_BITS `XLEN
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
///////////////////////////////////////////////////////////////////////////////
`define EX_ALU 0
@ -225,22 +231,19 @@
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_ADD 4'b0000 // SUB=fmt[1]
`define INST_FPU_MUL 4'b0001
`define INST_FPU_MADD 4'b0010 // SUB=fmt[1]
`define INST_FPU_NMADD 4'b0011 // SUB=fmt[1]
`define INST_FPU_DIV 4'b0100
`define INST_FPU_SQRT 4'b0101
`define INST_FPU_F2I 4'b1000 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_F2U 4'b1001 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_I2F 4'b1010 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_U2F 4'b1011 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_CMP 4'b1100 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b1101 // fmt[0]: F32=0, F64=1
`define INST_FPU_MISC 4'b1110 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_BITS 4
`define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3)
`define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4)
@ -265,14 +268,14 @@
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width) \
(uuid_width + `CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, uuid_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
///////////////////////////////////////////////////////////////////////////////
@ -282,28 +285,29 @@
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, uuid_width), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
///////////////////////////////////////////////////////////////////////////////
`ifdef ICACHE_ENABLE
`define L1_ENABLE
`endif
`ifdef DCACHE_ENABLE
`define L1_ENABLE
`endif
`define ADDR_TYPE_FLUSH 0
`define ADDR_TYPE_IO 1
`define ADDR_TYPE_LOCAL 2 // shoud be last since optional
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
`define MEM_REQ_FLAG_FLUSH 0
`define MEM_REQ_FLAG_IO 1
`define MEM_REQ_FLAG_LOCAL 2 // shoud be last since optional
`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED)
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
@ -317,12 +321,24 @@
///////////////////////////////////////////////////////////////////////////////
`define NEG_EDGE(dst, src) \
wire dst; \
VX_edge_trigger #( \
.POS (0), \
.INIT (0) \
) __``dst``__ ( \
.clk (clk), \
.reset (1'b0), \
.data_in (src), \
.data_out (dst) \
)
`define BUFFER_EX(dst, src, ena, latency) \
VX_pipe_register #( \
.DATAW ($bits(dst)), \
.RESETW ($bits(dst)), \
.DEPTH (latency) \
) __``dst ( \
) __``dst``__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -336,13 +352,18 @@
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out ( \
) __``out``__ ( \
.data_in (in), \
.data_out (out) \
)
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
`define ASSIGN_VX_IF(dst, src) \
assign dst.valid = src.valid; \
assign dst.data = src.data; \
assign src.ready = dst.ready
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
@ -351,71 +372,90 @@
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_MEM_BUS_RO_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = 0; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.data = '0; \
assign dst.req_data.byteen = '1; \
assign dst.req_data.flags = src.req_data.flags; \
assign dst.req_data.tag = src.req_data.tag; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.atype = src.req_data.atype; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.flags = src.req_data.flags; \
/* verilator lint_off GENUNNAMED */ \
if (TD != TS) begin \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
else \
end else begin \
assign dst.req_data.tag = src.req_data.tag; \
end \
/* verilator lint_on GENUNNAMED */ \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_LSU_MEM_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \
always @(posedge clk) begin \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \
/* verilator lint_off GENUNNAMED */ \
if (latency != 0) begin \
VX_pipe_register #( \
.DATAW (1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH), \
.DEPTH (latency) \
) pipe_reg ( \
.clk (clk), \
.reset (1'b0), \
.enable (1'b1), \
.data_in ({src.write_valid && ena, src.write_addr, src.write_data}), \
.data_out ({dst.write_valid, dst.write_addr, dst.write_data}) \
); \
end else begin \
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid && ena, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
/* verilator lint_on GENUNNAMED */
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
for (genvar __d = 0; __d < dst_count; ++__d) begin \
localparam __count = ((src_count > dst_count) ? `CDIV(src_count, dst_count) : 1); \
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
wire [width-1:0] __reduce_add_o_``dst``field; \
for (genvar __i = 0; __i < __count; ++__i) begin \
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
/* verilator lint_off GENUNNAMED */ \
if (count > 1) begin \
wire [count-1:0][width-1:0] __reduce_add_i_field; \
wire [width-1:0] __reduce_add_o_field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \
__reduce_add_o_``dst``field \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \
if (reg_enable) begin \
reg [width-1:0] __reduce_add_r_``dst``field; \
reg [width-1:0] __reduce_add_r_field; \
always @(posedge clk) begin \
if (reset) begin \
__reduce_add_r_``dst``field <= '0; \
__reduce_add_r_field <= '0; \
end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
__reduce_add_r_field <= __reduce_add_o_field; \
end \
end \
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
assign dst.``field = __reduce_add_r_field; \
end else begin \
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
assign dst.``field = __reduce_add_o_field; \
end \
end
end else begin \
assign dst.``field = src[0].``field; \
end \
/* verilator lint_on GENUNNAMED */
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
/* verilator lint_off GENUNNAMED */ \
if (block_size != 1) begin \
if (block_size != `NUM_WARPS) begin \
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
@ -424,22 +464,7 @@
end \
end else begin \
assign dst = src; \
end
`define TO_DISPATCH_DATA(data, tid) { \
data.uuid, \
data.wis, \
data.tmask, \
data.PC, \
data.op_type, \
data.op_args, \
data.wb, \
data.rd, \
tid, \
data.rs1_data, \
data.rs2_data, \
data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
end \
/* verilator lint_on GENUNNAMED */
`endif // VX_DEFINE_VH

View file

@ -60,6 +60,8 @@ package VX_gpu_pkg;
logic [7:0] mpm_class;
} base_dcrs_t;
//////////////////////////// Perf counter types ///////////////////////////
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -77,48 +79,63 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] latency;
} mem_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] idles;
logic [`PERF_CTR_BITS-1:0] stalls;
} sched_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] ibf_stalls;
logic [`PERF_CTR_BITS-1:0] scb_stalls;
logic [`PERF_CTR_BITS-1:0] opd_stalls;
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
logic use_PC;
logic use_imm;
logic is_w;
logic [`ALU_TYPE_BITS-1:0] xtype;
logic [`IMM_BITS-1:0] imm;
} alu_mod_t;
} alu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [($bits(alu_args_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [`INST_FRM_BITS-1:0] frm;
logic [`INST_FMT_BITS-1:0] fmt;
} fpu_mod_t;
} fpu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic [($bits(alu_args_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic is_store;
logic is_float;
logic [`OFFSET_BITS-1:0] offset;
} lsu_mod_t;
} lsu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic [($bits(alu_args_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic use_imm;
logic [`VX_CSR_ADDR_BITS-1:0] addr;
logic [4:0] imm;
} csr_mod_t;
} csr_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1)-1:0] __padding;
logic [($bits(alu_args_t)-1)-1:0] __padding;
logic is_neg;
} wctl_mod_t;
} wctl_args_t;
typedef union packed {
alu_mod_t alu;
fpu_mod_t fpu;
lsu_mod_t lsu;
csr_mod_t csr;
wctl_mod_t wctl;
alu_args_t alu;
fpu_args_t fpu;
lsu_args_t lsu;
csr_args_t csr;
wctl_args_t wctl;
} op_args_t;
/* verilator lint_off UNUSED */
`IGNORE_UNUSED_BEGIN
///////////////////////// LSU memory Parameters ///////////////////////////
@ -129,6 +146,31 @@ package VX_gpu_pkg;
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
////////////////////////// Icache Parameters //////////////////////////////
// Word size in bytes
localparam ICACHE_WORD_SIZE = 4;
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
// Memory request data bits
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES, `UUID_WIDTH);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
// Word size in bytes
@ -154,36 +196,11 @@ package VX_gpu_pkg;
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
////////////////////////// Icache Parameters //////////////////////////////
// Word size in bytes
localparam ICACHE_WORD_SIZE = 4;
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
// Memory request data bits
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
@ -208,11 +225,11 @@ package VX_gpu_pkg;
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
// Memory request tag bits
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`else
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
`endif
/////////////////////////////// L3 Parameters /////////////////////////////
@ -229,23 +246,20 @@ package VX_gpu_pkg;
localparam L3_MEM_DATA_WIDTH = (`L3_LINE_SIZE * 8);
// Memory request tag bits
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`else
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/* verilator lint_on UNUSED */
`endif
/////////////////////////////// Issue parameters //////////////////////////
localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
localparam PER_ISSUE_WARPS = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS = `CLOG2(PER_ISSUE_WARPS);
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_ISW_W-1:0] isw
@ -278,8 +292,446 @@ package VX_gpu_pkg;
wid_to_wis = 0;
end
endfunction
///////////////////////// Miscaellaneous functions ////////////////////////
function logic [`SFU_WIDTH-1:0] op_to_sfu_type(
input logic [`INST_OP_BITS-1:0] op_type
);
case (op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: op_to_sfu_type = `SFU_CSRS;
default: op_to_sfu_type = `SFU_WCTL;
endcase
endfunction
`IGNORE_UNUSED_END
////////////////////////////////// Tracing ////////////////////////////////////
`ifdef SIMULATION
`ifdef SV_DPI
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
`endif
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
case (ex_type)
`EX_ALU: `TRACE(level, ("ALU"))
`EX_LSU: `TRACE(level, ("LSU"))
`EX_SFU: `TRACE(level, ("SFU"))
`ifdef EXT_F_ENABLE
`EX_FPU: `TRACE(level, ("FPU"))
`endif
default: `TRACE(level, ("?"))
endcase
endtask
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
case (op_args.alu.xtype)
`ALU_TYPE_ARITH: begin
if (op_args.alu.is_w) begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDIW"))
`INST_ALU_SLL: `TRACE(level, ("SLLIW"))
`INST_ALU_SRL: `TRACE(level, ("SRLIW"))
`INST_ALU_SRA: `TRACE(level, ("SRAIW"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDW"))
`INST_ALU_SUB: `TRACE(level, ("SUBW"))
`INST_ALU_SLL: `TRACE(level, ("SLLW"))
`INST_ALU_SRL: `TRACE(level, ("SRLW"))
`INST_ALU_SRA: `TRACE(level, ("SRAW"))
default: `TRACE(level, ("?"))
endcase
end
end else begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDI"))
`INST_ALU_SLL: `TRACE(level, ("SLLI"))
`INST_ALU_SRL: `TRACE(level, ("SRLI"))
`INST_ALU_SRA: `TRACE(level, ("SRAI"))
`INST_ALU_SLT: `TRACE(level, ("SLTI"))
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"))
`INST_ALU_XOR: `TRACE(level, ("XORI"))
`INST_ALU_OR: `TRACE(level, ("ORI"))
`INST_ALU_AND: `TRACE(level, ("ANDI"))
`INST_ALU_LUI: `TRACE(level, ("LUI"))
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADD"))
`INST_ALU_SUB: `TRACE(level, ("SUB"))
`INST_ALU_SLL: `TRACE(level, ("SLL"))
`INST_ALU_SRL: `TRACE(level, ("SRL"))
`INST_ALU_SRA: `TRACE(level, ("SRA"))
`INST_ALU_SLT: `TRACE(level, ("SLT"))
`INST_ALU_SLTU: `TRACE(level, ("SLTU"))
`INST_ALU_XOR: `TRACE(level, ("XOR"))
`INST_ALU_OR: `TRACE(level, ("OR"))
`INST_ALU_AND: `TRACE(level, ("AND"))
`INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"))
`INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"))
default: `TRACE(level, ("?"))
endcase
end
end
end
`ALU_TYPE_BRANCH: begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: `TRACE(level, ("BEQ"))
`INST_BR_NE: `TRACE(level, ("BNE"))
`INST_BR_LT: `TRACE(level, ("BLT"))
`INST_BR_GE: `TRACE(level, ("BGE"))
`INST_BR_LTU: `TRACE(level, ("BLTU"))
`INST_BR_GEU: `TRACE(level, ("BGEU"))
`INST_BR_JAL: `TRACE(level, ("JAL"))
`INST_BR_JALR: `TRACE(level, ("JALR"))
`INST_BR_ECALL: `TRACE(level, ("ECALL"))
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"))
`INST_BR_URET: `TRACE(level, ("URET"))
`INST_BR_SRET: `TRACE(level, ("SRET"))
`INST_BR_MRET: `TRACE(level, ("MRET"))
default: `TRACE(level, ("?"))
endcase
end
`ALU_TYPE_MULDIV: begin
if (op_args.alu.is_w) begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MULW"))
`INST_M_DIV: `TRACE(level, ("DIVW"))
`INST_M_DIVU: `TRACE(level, ("DIVUW"))
`INST_M_REM: `TRACE(level, ("REMW"))
`INST_M_REMU: `TRACE(level, ("REMUW"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MUL"))
`INST_M_MULH: `TRACE(level, ("MULH"))
`INST_M_MULHSU:`TRACE(level, ("MULHSU"))
`INST_M_MULHU: `TRACE(level, ("MULHU"))
`INST_M_DIV: `TRACE(level, ("DIV"))
`INST_M_DIVU: `TRACE(level, ("DIVU"))
`INST_M_REM: `TRACE(level, ("REM"))
`INST_M_REMU: `TRACE(level, ("REMU"))
default: `TRACE(level, ("?"))
endcase
end
end
default: `TRACE(level, ("?"))
endcase
end
`EX_LSU: begin
if (op_args.lsu.is_float) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LW: `TRACE(level, ("FLW"))
`INST_LSU_LD: `TRACE(level, ("FLD"))
`INST_LSU_SW: `TRACE(level, ("FSW"))
`INST_LSU_SD: `TRACE(level, ("FSD"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LB: `TRACE(level, ("LB"))
`INST_LSU_LH: `TRACE(level, ("LH"))
`INST_LSU_LW: `TRACE(level, ("LW"))
`INST_LSU_LD: `TRACE(level, ("LD"))
`INST_LSU_LBU:`TRACE(level, ("LBU"))
`INST_LSU_LHU:`TRACE(level, ("LHU"))
`INST_LSU_LWU:`TRACE(level, ("LWU"))
`INST_LSU_SB: `TRACE(level, ("SB"))
`INST_LSU_SH: `TRACE(level, ("SH"))
`INST_LSU_SW: `TRACE(level, ("SW"))
`INST_LSU_SD: `TRACE(level, ("SD"))
`INST_LSU_FENCE:`TRACE(level,("FENCE"))
default: `TRACE(level, ("?"))
endcase
end
end
`EX_SFU: begin
case (`INST_SFU_BITS'(op_type))
`INST_SFU_TMC: `TRACE(level, ("TMC"))
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"))
`INST_SFU_SPLIT: begin
if (op_args.wctl.is_neg) begin
`TRACE(level, ("SPLIT.N"))
end else begin
`TRACE(level, ("SPLIT"))
end
end
`INST_SFU_JOIN: `TRACE(level, ("JOIN"))
`INST_SFU_BAR: `TRACE(level, ("BAR"))
`INST_SFU_PRED: begin
if (op_args.wctl.is_neg) begin
`TRACE(level, ("PRED.N"))
end else begin
`TRACE(level, ("PRED"))
end
end
`INST_SFU_CSRRW: begin
if (op_args.csr.use_imm) begin
`TRACE(level, ("CSRRWI"))
end else begin
`TRACE(level, ("CSRRW"))
end
end
`INST_SFU_CSRRS: begin
if (op_args.csr.use_imm) begin
`TRACE(level, ("CSRRSI"))
end else begin
`TRACE(level, ("CSRRS"))
end
end
`INST_SFU_CSRRC: begin
if (op_args.csr.use_imm) begin
`TRACE(level, ("CSRRCI"))
end else begin
`TRACE(level, ("CSRRC"))
end
end
default: `TRACE(level, ("?"))
endcase
end
`ifdef EXT_F_ENABLE
`EX_FPU: begin
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: begin
if (op_args.fpu.fmt[1]) begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FSUB.D"))
end else begin
`TRACE(level, ("FSUB.S"))
end
end else begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FADD.D"))
end else begin
`TRACE(level, ("FADD.S"))
end
end
end
`INST_FPU_MADD: begin
if (op_args.fpu.fmt[1]) begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FMSUB.D"))
end else begin
`TRACE(level, ("FMSUB.S"))
end
end else begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FMADD.D"))
end else begin
`TRACE(level, ("FMADD.S"))
end
end
end
`INST_FPU_NMADD: begin
if (op_args.fpu.fmt[1]) begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FNMSUB.D"))
end else begin
`TRACE(level, ("FNMSUB.S"))
end
end else begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FNMADD.D"))
end else begin
`TRACE(level, ("FNMADD.S"))
end
end
end
`INST_FPU_MUL: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FMUL.D"))
end else begin
`TRACE(level, ("FMUL.S"))
end
end
`INST_FPU_DIV: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FDIV.D"))
end else begin
`TRACE(level, ("FDIV.S"))
end
end
`INST_FPU_SQRT: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FSQRT.D"))
end else begin
`TRACE(level, ("FSQRT.S"))
end
end
`INST_FPU_CMP: begin
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.D"))
1: `TRACE(level, ("FLT.D"))
2: `TRACE(level, ("FEQ.D"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.S"))
1: `TRACE(level, ("FLT.S"))
2: `TRACE(level, ("FEQ.S"))
default: `TRACE(level, ("?"))
endcase
end
end
`INST_FPU_F2F: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FCVT.D.S"))
end else begin
`TRACE(level, ("FCVT.S.D"))
end
end
`INST_FPU_F2I: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.D"))
end else begin
`TRACE(level, ("FCVT.W.D"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.S"))
end else begin
`TRACE(level, ("FCVT.W.S"))
end
end
end
`INST_FPU_F2U: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.D"))
end else begin
`TRACE(level, ("FCVT.WU.D"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.S"))
end else begin
`TRACE(level, ("FCVT.WU.S"))
end
end
end
`INST_FPU_I2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.L"))
end else begin
`TRACE(level, ("FCVT.D.W"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.L"))
end else begin
`TRACE(level, ("FCVT.S.W"))
end
end
end
`INST_FPU_U2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.LU"))
end else begin
`TRACE(level, ("FCVT.D.WU"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.LU"))
end else begin
`TRACE(level, ("FCVT.S.WU"))
end
end
end
`INST_FPU_MISC: begin
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.D"))
1: `TRACE(level, ("FSGNJN.D"))
2: `TRACE(level, ("FSGNJX.D"))
3: `TRACE(level, ("FCLASS.D"))
4: `TRACE(level, ("FMV.X.D"))
5: `TRACE(level, ("FMV.D.X"))
6: `TRACE(level, ("FMIN.D"))
7: `TRACE(level, ("FMAX.D"))
endcase
end else begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.S"))
1: `TRACE(level, ("FSGNJN.S"))
2: `TRACE(level, ("FSGNJX.S"))
3: `TRACE(level, ("FCLASS.S"))
4: `TRACE(level, ("FMV.X.S"))
5: `TRACE(level, ("FMV.S.X"))
6: `TRACE(level, ("FMIN.S"))
7: `TRACE(level, ("FMAX.S"))
endcase
end
end
default: `TRACE(level, ("?"))
endcase
end
`endif
default: `TRACE(level, ("?"))
endcase
endtask
task trace_op_args(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm))
end
`EX_LSU: begin
`TRACE(level, (", offset=0x%0h", op_args.lsu.offset))
end
`EX_SFU: begin
if (`INST_SFU_IS_CSR(op_type)) begin
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm))
end
end
`ifdef EXT_F_ENABLE
`EX_FPU: begin
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm))
end
`endif
default:;
endcase
endtask
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
case (addr)
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"))
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"))
`VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0"))
`VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1"))
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"))
default: `TRACE(level, ("?"))
endcase
endtask
`endif
endpackage
`endif // VX_GPU_PKG_VH

View file

@ -22,36 +22,39 @@
///////////////////////////////////////////////////////////////////////////////
`ifdef VIVADO
`define STRING
`else
`define STRING string
`endif
`ifdef SIMULATION
`ifdef SYNTHESIS
`define TRACING_ON
`define TRACING_OFF
`ifndef NDEBUG
`define DEBUG_BLOCK(x) x
`else
`define DEBUG_BLOCK(x)
`endif
`define IGNORE_UNOPTFLAT_BEGIN
`define IGNORE_UNOPTFLAT_END
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_SPARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define TRACE(level, args) $write args
`else
`ifdef VERILATOR
`define STATIC_ASSERT(cond, msg) \
generate \
/* verilator lint_off GENUNNAMED */ \
if (!(cond)) $error msg; \
/* verilator lint_on GENUNNAMED */ \
endgenerate
`define ERROR(msg) \
$error msg
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
assert(cond) else $error msg; \
end
`define __SCOPE
`define __SCOPE_X
`define __SCOPE_ON
`define __SCOPE_OFF
`ifndef TRACING_ALL
`define TRACING_ON /* verilator tracing_on */
`define TRACING_OFF /* verilator tracing_off */
`else
`define TRACING_ON
`define TRACING_OFF
`endif
`ifndef NDEBUG
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
x \
@ -100,43 +103,68 @@
localparam `STRING __``x = x; \
/* verilator lint_on UNUSED */
`define UNUSED_VAR(x) if (1) begin \
`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \
if (1) begin \
/* verilator lint_off UNUSED */ \
wire [$bits(x)-1:0] __x = x; \
/* verilator lint_on UNUSED */ \
end
end \
/* verilator lint_on GENUNNAMED */
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
. x () \
/* verilator lint_on PINCONNECTEMPTY */
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
x \
/* verilator lint_on UNUSED */
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`endif
`endif
`ifdef SIMULATION
`define STATIC_ASSERT(cond, msg) \
generate \
if (!(cond)) $error msg; \
endgenerate
`define ERROR(msg) \
$error msg
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
assert(cond) else $error msg; \
end
`ifdef SV_DPI
`define TRACE(level, args) dpi_trace(level, $sformatf args);
`else
`define STATIC_ASSERT(cond, msg)
`define ERROR(msg) //
`define ASSERT(cond, msg) //
`define RUNTIME_ASSERT(cond, msg)
`define TRACE(level, args) \
if (level <= `DEBUG_LEVEL) begin \
$write args; \
end
`endif
`else // SYNTHESIS
`define STATIC_ASSERT(cond, msg)
`define ERROR(msg) //
`define ASSERT(cond, msg) //
`define RUNTIME_ASSERT(cond, msg)
`define DEBUG_BLOCK(x)
`define TRACE(level, args)
`define TRACING_ON
`define TRACING_OFF
`define IGNORE_UNOPTFLAT_BEGIN
`define IGNORE_UNOPTFLAT_END
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_SPARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define __SCOPE (* mark_debug="true" *)
`define __SCOPE_X
`define __SCOPE_ON \
`undef __SCOPE_X \
`define __SCOPE_X `__SCOPE
`define __SCOPE_OFF \
`undef __SCOPE_X \
`define __SCOPE_X
`endif
///////////////////////////////////////////////////////////////////////////////
@ -148,6 +176,7 @@
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
`define DISABLE_BRAM (* ramstyle = "logic" *)
`define PRESERVE_NET (* preserve *)
`define STRING string
`elsif VIVADO
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) $bits(x.data)
@ -155,6 +184,7 @@
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
`define DISABLE_BRAM (* ram_style = "registers" *)
`define PRESERVE_NET (* keep = "true" *)
`define STRING
`else
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) x.DATA_WIDTH
@ -162,6 +192,7 @@
`define NO_RW_RAM_CHECK
`define DISABLE_BRAM
`define PRESERVE_NET
`define STRING string
`endif
///////////////////////////////////////////////////////////////////////////////
@ -198,23 +229,23 @@
`define SEXT(len, x) {{(len-$bits(x)+1){x[$bits(x)-1]}}, x[$bits(x)-2:0]}
`define TRACE_ARRAY1D(lvl, fmt, arr, n) \
`TRACE(lvl, ("{")); \
`TRACE(lvl, ("{")) \
for (integer __i = (n-1); __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, (fmt, arr[__i])); \
if (__i != (n-1)) `TRACE(lvl, (", ")) \
`TRACE(lvl, (fmt, arr[__i])) \
end \
`TRACE(lvl, ("}"));
`TRACE(lvl, ("}"))
`define TRACE_ARRAY2D(lvl, fmt, arr, m, n) \
`TRACE(lvl, ("{")); \
`TRACE(lvl, ("{")) \
for (integer __i = n-1; __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("{")); \
if (__i != (n-1)) `TRACE(lvl, (", ")) \
`TRACE(lvl, ("{")) \
for (integer __j = (m-1); __j >= 0; --__j) begin \
if (__j != (m-1)) `TRACE(lvl, (", "));\
`TRACE(lvl, (fmt, arr[__i][__j])); \
if (__j != (m-1)) `TRACE(lvl, (", "))\
`TRACE(lvl, (fmt, arr[__i][__j])) \
end \
`TRACE(lvl, ("}")); \
`TRACE(lvl, ("}")) \
end \
`TRACE(lvl, ("}"))
@ -232,11 +263,14 @@
`define RESET_RELAY(dst, src) \
`RESET_RELAY_EX (dst, src, 1, 0)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2
`define TO_OUT_BUF_SIZE(out_reg) `MIN(out_reg, 2)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2, 5 -> 2
`define TO_OUT_BUF_SIZE(s) `MIN(s & 7, 2)
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2
`define TO_OUT_BUF_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1))
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 > 3
`define TO_OUT_BUF_REG(s) (((s & 7) < 2) ? (s & 7) : ((s & 7) - 2))
// lut(x): (x & 8) != 0
`define TO_OUT_BUF_LUTRAM(s) ((s & 8) != 0)
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,48 +21,67 @@
input wire scope_bus_in, \
output wire scope_bus_out,
`define SCOPE_IO_SWITCH(__count) \
wire scope_bus_in_w [__count]; \
wire scope_bus_out_w [__count]; \
`RESET_RELAY_EX(scope_reset_w, scope_reset, __count, 4); \
VX_scope_switch #( \
.N (__count) \
) scope_switch ( \
.clk (clk), \
.reset (scope_reset), \
.req_in (scope_bus_in), \
.rsp_out (scope_bus_out), \
.req_out (scope_bus_in_w), \
.rsp_in (scope_bus_out_w) \
);
`define SCOPE_IO_BIND(__i) \
.scope_reset (scope_reset_w[__i]), \
.scope_bus_in (scope_bus_in_w[__i]), \
.scope_bus_out (scope_bus_out_w[__i]),
`define SCOPE_IO_UNUSED() \
`UNUSED_VAR (scope_reset); \
`UNUSED_VAR (scope_bus_in); \
assign scope_bus_out = 0;
`define SCOPE_IO_UNUSED_W(__i) \
`define SCOPE_IO_UNUSED(__i) \
`UNUSED_VAR (scope_reset_w[__i]); \
`UNUSED_VAR (scope_bus_in_w[__i]); \
assign scope_bus_out_w[__i] = 0;
`define SCOPE_IO_SWITCH(__count) \
wire [__count-1:0] scope_bus_in_w; \
wire [__count-1:0] scope_bus_out_w; \
wire [__count-1:0] scope_reset_w = {__count{scope_reset}}; \
VX_scope_switch #( \
.N (__count) \
) scope_switch ( \
.clk (clk), \
.reset (scope_reset), \
.req_in (scope_bus_in), \
.rsp_out (scope_bus_out), \
.req_out (scope_bus_in_w), \
.rsp_in (scope_bus_out_w) \
)
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __htriggers_w, __probes_w, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
VX_scope_tap #( \
.SCOPE_ID (__id), \
.XTRIGGERW(__xtriggers_w), \
.HTRIGGERW(__htriggers_w), \
.PROBEW (__probes_w), \
.DEPTH (__depth) \
) scope_tap_``idx ( \
.clk (clk), \
.reset (scope_reset_w[__idx]), \
.start (__start), \
.stop (__stop), \
.xtriggers(__xtriggers), \
.htriggers(__htriggers), \
.probes (__probes), \
.bus_in (scope_bus_in_w[__idx]), \
.bus_out(scope_bus_out_w[__idx]) \
)
`define SCOPE_TAP(__idx, __id, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
`SCOPE_TAP_EX(__idx, __id, $bits(__xtriggers), $bits(__htriggers), $bits(__probes), __xtriggers, __htriggers, __probes, __start, __stop, __depth)
`else
`define SCOPE_IO_DECL
`define SCOPE_IO_SWITCH(__count)
`define SCOPE_IO_BIND(__i)
`define SCOPE_IO_UNUSED_W(__i)
`define SCOPE_IO_UNUSED(__i)
`define SCOPE_IO_SWITCH(__count)
`define SCOPE_TAP(__idx, __id, __xtriggers, __probes, __depth)
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __probes_w, __xtriggers, __probes, __depth)
`endif
`endif // VX_SCOPE_VH

View file

@ -14,7 +14,8 @@
`include "VX_define.vh"
module VX_socket import VX_gpu_pkg::*; #(
parameter SOCKET_ID = 0
parameter SOCKET_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -40,17 +41,20 @@ module VX_socket import VX_gpu_pkg::*; #(
output wire busy
);
`ifdef SCOPE
localparam scope_core = 0;
`SCOPE_IO_SWITCH (`SOCKET_SIZE);
`endif
`ifdef GBAR_ENABLE
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
`RESET_RELAY (gbar_arb_reset, reset);
VX_gbar_arb #(
.NUM_REQS (`SOCKET_SIZE),
.OUT_BUF ((`SOCKET_SIZE > 1) ? 2 : 0)
) gbar_arb (
.clk (clk),
.reset (gbar_arb_reset),
.reset (reset),
.bus_in_if (per_core_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
@ -81,7 +85,7 @@ module VX_socket import VX_gpu_pkg::*; #(
`RESET_RELAY (icache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)),
.INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)),
.NUM_UNITS (`NUM_ICACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -99,7 +103,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (0),
.NC_ENABLE (0),
.CORE_OUT_BUF (2),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (2)
) icache (
`ifdef PERF_ENABLE
@ -126,7 +130,7 @@ module VX_socket import VX_gpu_pkg::*; #(
`RESET_RELAY (dcache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
.INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)),
.NUM_UNITS (`NUM_DCACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -139,12 +143,14 @@ module VX_socket import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
.MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_WRITEBACK),
.NC_ENABLE (1),
.CORE_OUT_BUF (`LMEM_ENABLED ? 2 : 1),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (2)
) dcache (
`ifdef PERF_ENABLE
@ -171,19 +177,17 @@ module VX_socket import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("R"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("P"), // prioritize the icache
.REQ_OUT_BUF(3),
.RSP_OUT_BUF(3)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
@ -194,19 +198,19 @@ module VX_socket import VX_gpu_pkg::*; #(
wire [`SOCKET_SIZE-1:0] per_core_busy;
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
`SCOPE_IO_SWITCH (`SOCKET_SIZE)
// Generate all cores
for (genvar i = 0; i < `SOCKET_SIZE; ++i) begin
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : g_cores
`RESET_RELAY (core_reset, reset);
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, 1'b1, (`SOCKET_SIZE > 1))
VX_core #(
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + i)
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id),
.INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id))
) core (
`SCOPE_IO_BIND (i)
`SCOPE_IO_BIND (scope_core + core_id)
.clk (clk),
.reset (core_reset),
@ -217,15 +221,15 @@ module VX_socket import VX_gpu_pkg::*; #(
.dcr_bus_if (core_dcr_bus_if),
.dcache_bus_if (per_core_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.dcache_bus_if (per_core_dcache_bus_if[core_id * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.icache_bus_if (per_core_icache_bus_if[i]),
.icache_bus_if (per_core_icache_bus_if[core_id]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_core_gbar_bus_if[i]),
.gbar_bus_if (per_core_gbar_bus_if[core_id]),
`endif
.busy (per_core_busy[i])
.busy (per_core_busy[core_id])
);
end

View file

@ -85,30 +85,31 @@
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_SCRB_ALU 12'hB07
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
`define VX_CSR_MPM_SCRB_FPU 12'hB08
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
`define VX_CSR_MPM_SCRB_LSU 12'hB09
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
`define VX_CSR_MPM_OPDS_ST 12'hB07
`define VX_CSR_MPM_OPDS_ST_H 12'hB87
`define VX_CSR_MPM_SCRB_ALU 12'hB08
`define VX_CSR_MPM_SCRB_ALU_H 12'hB88
`define VX_CSR_MPM_SCRB_FPU 12'hB09
`define VX_CSR_MPM_SCRB_FPU_H 12'hB89
`define VX_CSR_MPM_SCRB_LSU 12'hB0A
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8A
`define VX_CSR_MPM_SCRB_SFU 12'hB0B
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8B
`define VX_CSR_MPM_SCRB_CSRS 12'hB0C
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0B
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
`define VX_CSR_MPM_LOADS 12'hB0C
`define VX_CSR_MPM_LOADS_H 12'hB8C
`define VX_CSR_MPM_STORES 12'hB0D
`define VX_CSR_MPM_STORES_H 12'hB8D
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
`define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
`define VX_CSR_MPM_LOADS 12'hB0F
`define VX_CSR_MPM_LOADS_H 12'hB8F
`define VX_CSR_MPM_STORES 12'hB10
`define VX_CSR_MPM_STORES_H 12'hB90
`define VX_CSR_MPM_IFETCH_LT 12'hB11
`define VX_CSR_MPM_IFETCH_LT_H 12'hB91
`define VX_CSR_MPM_LOAD_LT 12'hB12
`define VX_CSR_MPM_LOAD_LT_H 12'hB92
// Machine Performance-monitoring memory counters (class 2) ///////////////////
@ -165,6 +166,10 @@
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
`define VX_CSR_MPM_MEM_BANK_CNTR 12'hB1E // memory bank requests
`define VX_CSR_MPM_MEM_BANK_CNTR_H 12'hB9E
`define VX_CSR_MPM_MEM_BANK_TICK 12'hB1F // memory ticks
`define VX_CSR_MPM_MEM_BANK_TICK_H 12'hB9F
// PERF: lmem
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B

View file

@ -44,6 +44,11 @@ module Vortex import VX_gpu_pkg::*; (
output wire busy
);
`ifdef SCOPE
localparam scope_cluster = 0;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS);
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
@ -75,12 +80,14 @@ module Vortex import VX_gpu_pkg::*; (
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.MREQ_SIZE (`L3_WRITEBACK ? `L3_MSHR_SIZE : `L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (3),
.NC_ENABLE (1),
.PASSTHRU (!`L3_ENABLED)
) l3cache (
@ -102,7 +109,7 @@ module Vortex import VX_gpu_pkg::*; (
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
`UNUSED_VAR (mem_bus_if.req_data.flags)
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
@ -121,19 +128,19 @@ module Vortex import VX_gpu_pkg::*; (
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS)
// Generate all clusters
for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : g_clusters
`RESET_RELAY (cluster_reset, reset);
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
VX_dcr_bus_if cluster_dcr_bus_if();
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, 1'b1, (`NUM_CLUSTERS > 1))
VX_cluster #(
.CLUSTER_ID (i)
.CLUSTER_ID (cluster_id),
.INSTANCE_ID ($sformatf("cluster%0d", cluster_id))
) cluster (
`SCOPE_IO_BIND (i)
`SCOPE_IO_BIND (scope_cluster + cluster_id)
.clk (clk),
.reset (cluster_reset),
@ -144,9 +151,9 @@ module Vortex import VX_gpu_pkg::*; (
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[i]),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
.busy (per_cluster_busy[i])
.busy (per_cluster_busy[cluster_id])
);
end
@ -182,16 +189,26 @@ module Vortex import VX_gpu_pkg::*; (
`endif
// dump device configuration
initial begin
`TRACE(0, ("CONFIGS: num_threads=%0d, num_warps=%0d, num_cores=%0d, num_clusters=%0d, socket_size=%0d, local_mem_base=0x%0h, num_barriers=%0d\n",
`NUM_THREADS, `NUM_WARPS, `NUM_CORES, `NUM_CLUSTERS, `SOCKET_SIZE, `LMEM_BASE_ADDR, `NUM_BARRIERS))
end
`ifdef DBG_TRACE_MEM
wire [`UUID_WIDTH-1:0] mem_req_uuid = mem_req_tag[`VX_MEM_TAG_WIDTH-1 -: `UUID_WIDTH];
wire [`UUID_WIDTH-1:0] mem_rsp_uuid = mem_rsp_tag[`VX_MEM_TAG_WIDTH-1 -: `UUID_WIDTH];
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_req_rw)
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
if (mem_req_rw) begin
`TRACE(1, ("%t: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data, mem_req_uuid))
end else begin
`TRACE(1, ("%t: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h (#%0d)\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_uuid))
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data));
`TRACE(1, ("%t: MEM Rd Rsp: tag=0x%0h, data=0x%h (#%0d)\n", $time, mem_rsp_tag, mem_rsp_data, mem_rsp_uuid))
end
end
`endif

View file

@ -15,7 +15,7 @@
module Vortex_axi import VX_gpu_pkg::*; #(
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter AXI_ADDR_WIDTH = `MEM_ADDR_WIDTH + (`VX_MEM_DATA_WIDTH/8),
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
parameter AXI_NUM_BANKS = 1
)(
@ -82,9 +82,11 @@ module Vortex_axi import VX_gpu_pkg::*; #(
// Status
output wire busy
);
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `MEM_ADDR_WIDTH), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH))
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
localparam DST_LDATAW = `CLOG2(AXI_DATA_WIDTH);
localparam SRC_LDATAW = `CLOG2(`VX_MEM_DATA_WIDTH);
localparam SUB_LDATAW = DST_LDATAW - SRC_LDATAW;
localparam VX_MEM_TAG_A_WIDTH = `VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0);
localparam VX_MEM_ADDR_A_WIDTH = `VX_MEM_ADDR_WIDTH - SUB_LDATAW;
wire mem_req_valid;
wire mem_req_rw;
@ -99,95 +101,7 @@ module Vortex_axi import VX_gpu_pkg::*; #(
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready;
wire [`MEM_ADDR_WIDTH-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
assign m_axi_awaddr[i] = `MEM_ADDR_WIDTH'(m_axi_awaddr_unqual[i]);
assign m_axi_araddr[i] = `MEM_ADDR_WIDTH'(m_axi_araddr_unqual[i]);
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
assign m_axi_rid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_rid[i]);
assign m_axi_bid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_bid[i]);
end
VX_axi_adapter #(
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.ADDR_WIDTH (`MEM_ADDR_WIDTH),
.TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.NUM_BANKS (AXI_NUM_BANKS),
.RSP_OUT_BUF((AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr_unqual),
.m_axi_awid (m_axi_awid_unqual),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid_unqual),
.m_axi_bresp (m_axi_bresp),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr_unqual),
.m_axi_arid (m_axi_arid_unqual),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arregion (m_axi_arregion),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast) ,
.m_axi_rid (m_axi_rid_unqual),
.m_axi_rresp (m_axi_rresp)
);
`SCOPE_IO_SWITCH (1)
`SCOPE_IO_SWITCH (1);
Vortex vortex (
`SCOPE_IO_BIND (0)
@ -215,4 +129,128 @@ module Vortex_axi import VX_gpu_pkg::*; #(
.busy (busy)
);
wire mem_req_valid_a;
wire mem_req_rw_a;
wire [(AXI_DATA_WIDTH/8)-1:0] mem_req_byteen_a;
wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a;
wire [AXI_DATA_WIDTH-1:0] mem_req_data_a;
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a;
wire mem_req_ready_a;
wire mem_rsp_valid_a;
wire [AXI_DATA_WIDTH-1:0] mem_rsp_data_a;
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a;
wire mem_rsp_ready_a;
VX_mem_adapter #(
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.DST_DATA_WIDTH (AXI_DATA_WIDTH),
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.DST_ADDR_WIDTH (VX_MEM_ADDR_A_WIDTH),
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.DST_TAG_WIDTH (VX_MEM_TAG_A_WIDTH),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) mem_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid_in (mem_req_valid),
.mem_req_addr_in (mem_req_addr),
.mem_req_rw_in (mem_req_rw),
.mem_req_byteen_in (mem_req_byteen),
.mem_req_data_in (mem_req_data),
.mem_req_tag_in (mem_req_tag),
.mem_req_ready_in (mem_req_ready),
.mem_rsp_valid_in (mem_rsp_valid),
.mem_rsp_data_in (mem_rsp_data),
.mem_rsp_tag_in (mem_rsp_tag),
.mem_rsp_ready_in (mem_rsp_ready),
.mem_req_valid_out (mem_req_valid_a),
.mem_req_addr_out (mem_req_addr_a),
.mem_req_rw_out (mem_req_rw_a),
.mem_req_byteen_out (mem_req_byteen_a),
.mem_req_data_out (mem_req_data_a),
.mem_req_tag_out (mem_req_tag_a),
.mem_req_ready_out (mem_req_ready_a),
.mem_rsp_valid_out (mem_rsp_valid_a),
.mem_rsp_data_out (mem_rsp_data_a),
.mem_rsp_tag_out (mem_rsp_tag_a),
.mem_rsp_ready_out (mem_rsp_ready_a)
);
VX_axi_adapter #(
.DATA_WIDTH (AXI_DATA_WIDTH),
.ADDR_WIDTH_IN (VX_MEM_ADDR_A_WIDTH),
.ADDR_WIDTH_OUT (AXI_ADDR_WIDTH),
.TAG_WIDTH_IN (VX_MEM_TAG_A_WIDTH),
.TAG_WIDTH_OUT (AXI_TID_WIDTH),
.NUM_BANKS (AXI_NUM_BANKS),
.BANK_INTERLEAVE(0),
.RSP_OUT_BUF ((AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid_a),
.mem_req_rw (mem_req_rw_a),
.mem_req_byteen (mem_req_byteen_a),
.mem_req_addr (mem_req_addr_a),
.mem_req_data (mem_req_data_a),
.mem_req_tag (mem_req_tag_a),
.mem_req_ready (mem_req_ready_a),
.mem_rsp_valid (mem_rsp_valid_a),
.mem_rsp_data (mem_rsp_data_a),
.mem_rsp_tag (mem_rsp_tag_a),
.mem_rsp_ready (mem_rsp_ready_a),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr),
.m_axi_awid (m_axi_awid),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid),
.m_axi_bresp (m_axi_bresp),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr),
.m_axi_arid (m_axi_arid),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arregion (m_axi_arregion),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast),
.m_axi_rid (m_axi_rid),
.m_axi_rresp (m_axi_rresp)
);
endmodule

View file

@ -5,6 +5,7 @@
// To be done:
// Check how to run this with OPAE. Looks like setup issue
`ifndef NOPAE
`include "platform_if.vh"
@ -85,7 +86,7 @@ module ccip_std_afu #(
t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS];
t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS];
logic avs_write [NUM_LOCAL_MEM_BANKS];
logic avs_read [NUM_LOCAL_MEM_BANKS];
logic avs_read [NUM_LOCAL_MEM_BANKS];
for (genvar b = 0; b < NUM_LOCAL_MEM_BANKS; b++) begin
assign local_mem[b].burstcount = avs_burstcount[b];
@ -94,7 +95,7 @@ module ccip_std_afu #(
assign local_mem[b].byteenable = avs_byteenable[b];
assign local_mem[b].write = avs_write[b];
assign local_mem[b].read = avs_read[b];
assign avs_waitrequest[b] = local_mem[b].waitrequest;
assign avs_readdata[b] = local_mem[b].readdata;
assign avs_readdatavalid[b] = local_mem[b].readdatavalid;
@ -107,7 +108,7 @@ module ccip_std_afu #(
.reset (reset_T1),
.cp2af_sRxPort (cp2af_sRx_T1),
.af2cp_sTxPort (af2cp_sTx_T0),
.af2cp_sTxPort (af2cp_sTx_T0),
.avs_writedata (avs_writedata),
.avs_readdata (avs_readdata),
@ -121,3 +122,5 @@ module ccip_std_afu #(
);
endmodule
`endif

View file

@ -30,7 +30,17 @@
//`include "platform_afu_top_config.vh"
`ifdef PLATFORM_PROVIDES_LOCAL_MEMORY
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH (`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_DATA_WIDTH/8))
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH `PLATFORM_MEMORY_DATA_WIDTH
`endif
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH `PLATFORM_MEMORY_BURST_CNT_WIDTH
`endif
package local_mem_cfg_pkg;
@ -57,5 +67,3 @@ package local_mem_cfg_pkg;
typedef logic [LOCAL_MEM_DATA_N_BYTES-1:0] t_local_mem_byte_mask;
endpackage // local_mem_cfg_pkg
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY

View file

@ -18,6 +18,10 @@
`endif
`include "VX_define.vh"
`ifndef PLATFORM_MEMORY_INTERLEAVE
`define PLATFORM_MEMORY_INTERLEAVE 1
`endif
module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_gpu_pkg::*; #(
parameter NUM_LOCAL_MEM_BANKS = 2
) (
@ -40,16 +44,17 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
output t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS],
input wire avs_readdatavalid [NUM_LOCAL_MEM_BANKS]
);
localparam LMEM_DATA_WIDTH = $bits(t_local_mem_data);
localparam LMEM_DATA_SIZE = LMEM_DATA_WIDTH / 8;
localparam LMEM_ADDR_WIDTH = $bits(t_local_mem_addr);
localparam LMEM_ADDR_WIDTH = `VX_MEM_ADDR_WIDTH + ($clog2(`VX_MEM_DATA_WIDTH) - $clog2(LMEM_DATA_WIDTH));
localparam LMEM_BURST_CTRW = $bits(t_local_mem_burst_cnt);
localparam CCI_DATA_WIDTH = $bits(t_ccip_clData);
localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
localparam CCI_ADDR_WIDTH = $bits(t_ccip_clAddr);
localparam RESET_CTR_WIDTH = `CLOG2(`RESET_DELAY+1);
localparam AVS_RD_QUEUE_SIZE = 32;
localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH;
localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(`VX_MEM_DATA_WIDTH);
@ -64,6 +69,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
localparam AFU_ID_L = 16'h0002; // AFU ID Lower
localparam AFU_ID_H = 16'h0004; // AFU ID Higher
localparam CMD_IDLE = 0;
localparam CMD_MEM_READ = `AFU_IMAGE_CMD_MEM_READ;
localparam CMD_MEM_WRITE = `AFU_IMAGE_CMD_MEM_WRITE;
localparam CMD_DCR_WRITE = `AFU_IMAGE_CMD_DCR_WRITE;
@ -78,7 +84,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
localparam COUT_TID_WIDTH = `CLOG2(`VX_MEM_BYTEEN_WIDTH);
localparam COUT_QUEUE_DATAW = COUT_TID_WIDTH + 8;
localparam COUT_QUEUE_SIZE = 64;
localparam COUT_QUEUE_SIZE = 1024;
localparam MMIO_DEV_CAPS = `AFU_IMAGE_MMIO_DEV_CAPS;
localparam MMIO_ISA_CAPS = `AFU_IMAGE_MMIO_ISA_CAPS;
@ -96,7 +102,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [127:0] afu_id = `AFU_ACCEL_UUID;
wire [63:0] dev_caps = {16'b0,
wire [63:0] dev_caps = {8'b0,
5'(`PLATFORM_MEMORY_ADDR_WIDTH-20),
3'(`CLOG2(`PLATFORM_MEMORY_BANKS)),
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
@ -139,14 +147,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
// MMIO controller ////////////////////////////////////////////////////////////
t_ccip_c0_ReqMmioHdr mmio_hdr;
assign mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr);
`UNUSED_VAR (mmio_hdr)
t_ccip_c0_ReqMmioHdr mmio_req_hdr;
assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr[$bits(t_ccip_c0_ReqMmioHdr)-1:0]);
`UNUSED_VAR (mmio_req_hdr)
`STATIC_ASSERT(($bits(t_ccip_c0_ReqMmioHdr)-$bits(mmio_hdr.address)) == 12, ("Oops!"))
t_if_ccip_c2_Tx mmio_tx;
assign af2cp_sTxPort.c2 = mmio_tx;
t_if_ccip_c2_Tx mmio_rsp;
assign af2cp_sTxPort.c2 = mmio_rsp;
`ifdef SCOPE
@ -170,33 +176,35 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
if (reset) begin
cmd_scope_reading <= 0;
cmd_scope_writing <= 0;
scope_bus_in <= 0;
scope_bus_in <= 0;
end else begin
scope_bus_in <= 0;
if (scope_bus_out) begin
cmd_scope_reading <= 1;
scope_bus_ctr <= 63;
end
scope_bus_in <= 0;
if (cp2af_sRxPort.c0.mmioWrValid
&& (MMIO_SCOPE_WRITE == mmio_hdr.address)) begin
&& (MMIO_SCOPE_WRITE == mmio_req_hdr.address)) begin
cmd_scope_wdata <= 64'(cp2af_sRxPort.c0.data);
cmd_scope_writing <= 1;
scope_bus_ctr <= 63;
scope_bus_in <= 1;
end
end
if (cmd_scope_writing) begin
scope_bus_in <= 1'(cmd_scope_wdata >> scope_bus_ctr);
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_writing <= 0;
if (cmd_scope_writing) begin
scope_bus_in <= cmd_scope_wdata[scope_bus_ctr];
scope_bus_ctr <= scope_bus_ctr - 6'd1;
if (scope_bus_ctr == 0) begin
cmd_scope_writing <= 0;
scope_bus_ctr <= 0;
end
end
end
if (cmd_scope_reading) begin
cmd_scope_rdata <= {cmd_scope_rdata[62:0], scope_bus_out};
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_reading <= 0;
if (cmd_scope_reading) begin
cmd_scope_rdata <= {cmd_scope_rdata[62:0], scope_bus_out};
scope_bus_ctr <= scope_bus_ctr - 6'd1;
if (scope_bus_ctr == 0) begin
cmd_scope_reading <= 0;
scope_bus_ctr <= 0;
end
end
end
end
@ -206,6 +214,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [COUT_QUEUE_DATAW-1:0] cout_q_dout;
wire cout_q_full, cout_q_empty;
wire [COUT_QUEUE_DATAW-1:0] cout_q_dout_s = cout_q_dout & {COUT_QUEUE_DATAW{!cout_q_empty}};
`ifdef SIMULATION
`ifndef VERILATOR
// disable assertions until full reset
@ -226,60 +236,22 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
`endif
`endif
// MMIO controller ////////////////////////////////////////////////////////////
// Handle MMIO read requests
always @(posedge clk) begin
if (reset) begin
mmio_tx.mmioRdValid <= 0;
mmio_tx.hdr <= '0;
mmio_rsp.mmioRdValid <= 0;
end else begin
mmio_tx.mmioRdValid <= cp2af_sRxPort.c0.mmioRdValid;
mmio_tx.hdr.tid <= mmio_hdr.tid;
end
// serve MMIO write request
if (cp2af_sRxPort.c0.mmioWrValid) begin
case (mmio_hdr.address)
MMIO_CMD_ARG0: begin
cmd_args[0] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
MMIO_CMD_ARG1: begin
cmd_args[1] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
MMIO_CMD_ARG2: begin
cmd_args[2] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG2: data=%0d\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
MMIO_CMD_TYPE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_TYPE: data=%0d\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
`ifdef SCOPE
MMIO_SCOPE_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%0h\n", $time, cmd_scope_wdata));
`endif
end
`endif
default: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)));
`endif
end
endcase
mmio_rsp.mmioRdValid <= cp2af_sRxPort.c0.mmioRdValid;
end
// serve MMIO read requests
mmio_rsp.hdr.tid <= mmio_req_hdr.tid;
if (cp2af_sRxPort.c0.mmioRdValid) begin
case (mmio_hdr.address)
case (mmio_req_hdr.address)
// AFU header
16'h0000: mmio_tx.data <= {
16'h0000: mmio_rsp.data <= {
4'b0001, // Feature type = AFU
8'b0, // reserved
4'b0, // afu minor revision = 0
@ -289,105 +261,140 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
4'b0, // afu major revision = 0
12'b0 // feature ID = 0
};
AFU_ID_L: mmio_tx.data <= afu_id[63:0]; // afu id low
AFU_ID_H: mmio_tx.data <= afu_id[127:64]; // afu id hi
16'h0006: mmio_tx.data <= 64'h0; // next AFU
16'h0008: mmio_tx.data <= 64'h0; // reserved
AFU_ID_L: mmio_rsp.data <= afu_id[63:0]; // afu id low
AFU_ID_H: mmio_rsp.data <= afu_id[127:64]; // afu id hi
16'h0006: mmio_rsp.data <= 64'h0; // next AFU
16'h0008: mmio_rsp.data <= 64'h0; // reserved
MMIO_STATUS: begin
mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)});
mmio_rsp.data <= 64'({cout_q_dout_s, !cout_q_empty, 8'(state)});
`ifdef DBG_TRACE_AFU
if (state != STATE_WIDTH'(mmio_tx.data)) begin
`TRACE(2, ("%d: MMIO_STATUS: addr=0x%0h, state=%0d\n", $time, mmio_hdr.address, state));
if (state != STATE_WIDTH'(mmio_rsp.data)) begin
`TRACE(2, ("%t: AFU: MMIO_STATUS: addr=0x%0h, state=%0d\n", $time, mmio_req_hdr.address, state))
end
`endif
end
`ifdef SCOPE
MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata;
mmio_rsp.data <= cmd_scope_rdata;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%0h\n", $time, cmd_scope_rdata));
`TRACE(2, ("%t: AFU: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata))
`endif
end
`endif
MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps;
mmio_rsp.data <= dev_caps;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%0h\n", $time, dev_caps));
`TRACE(2, ("%t: AFU: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps))
`endif
end
MMIO_ISA_CAPS: begin
mmio_tx.data <= isa_caps;
mmio_rsp.data <= isa_caps;
`ifdef DBG_TRACE_AFU
if (state != STATE_WIDTH'(mmio_tx.data)) begin
`TRACE(2, ("%d: MMIO_ISA_CAPS: data=%0d\n", $time, isa_caps));
if (state != STATE_WIDTH'(mmio_rsp.data)) begin
`TRACE(2, ("%t: AFU: MMIO_ISA_CAPS: data=%0d\n", $time, isa_caps))
end
`endif
end
default: begin
mmio_tx.data <= 64'h0;
mmio_rsp.data <= 64'h0;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: Unknown MMIO Rd: addr=0x%0h\n", $time, mmio_hdr.address));
`TRACE(2, ("%t: AFU: Unknown MMIO Rd: addr=0x%0h\n", $time, mmio_req_hdr.address))
`endif
end
endcase
end
end
// Handle MMIO write requests
always @(posedge clk) begin
if (cp2af_sRxPort.c0.mmioWrValid) begin
case (mmio_req_hdr.address)
MMIO_CMD_ARG0: begin
cmd_args[0] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)))
`endif
end
MMIO_CMD_ARG1: begin
cmd_args[1] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)))
`endif
end
MMIO_CMD_ARG2: begin
cmd_args[2] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: MMIO_CMD_ARG2: data=%0d\n", $time, 64'(cp2af_sRxPort.c0.data)))
`endif
end
MMIO_CMD_TYPE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: MMIO_CMD_TYPE: data=%0d\n", $time, 64'(cp2af_sRxPort.c0.data)))
`endif
end
`ifdef SCOPE
MMIO_SCOPE_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: MMIO_SCOPE_WRITE: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)))
`endif
end
`endif
default: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_req_hdr.address, 64'(cp2af_sRxPort.c0.data)))
`endif
end
endcase
end
end
// COMMAND FSM ////////////////////////////////////////////////////////////////
wire cmd_mem_rd_done;
reg cmd_mem_wr_done;
reg [RESET_CTR_WIDTH-1:0] vx_reset_ctr;
reg vx_busy_wait;
reg vx_running;
reg vx_reset = 1; // asserted at initialization
wire vx_busy;
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
always @(posedge clk) begin
if (state == STATE_RUN) begin
vx_reset_ctr <= vx_reset_ctr + $bits(vx_reset_ctr)'(1);
end else begin
vx_reset_ctr <= '0;
end
end
wire is_mmio_wr_cmd = cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_hdr.address);
wire is_mmio_wr_cmd = cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_req_hdr.address);
wire [CMD_TYPE_WIDTH-1:0] cmd_type = is_mmio_wr_cmd ?
CMD_TYPE_WIDTH'(cp2af_sRxPort.c0.data) : CMD_TYPE_WIDTH'(0);
CMD_TYPE_WIDTH'(cp2af_sRxPort.c0.data) : CMD_TYPE_WIDTH'(CMD_IDLE);
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
vx_busy_wait <= 0;
vx_running <= 0;
state <= STATE_IDLE;
vx_reset <= 1;
end else begin
case (state)
STATE_IDLE: begin
case (cmd_type)
CMD_MEM_READ: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE MEM_READ: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size));
`TRACE(2, ("%t: AFU: Goto STATE MEM_READ: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size))
`endif
state <= STATE_MEM_READ;
end
CMD_MEM_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE MEM_WRITE: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size));
`TRACE(2, ("%t: AFU: Goto STATE MEM_WRITE: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size))
`endif
state <= STATE_MEM_WRITE;
end
CMD_DCR_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE DCR_WRITE: addr=0x%0h data=%0d\n", $time, cmd_dcr_addr, cmd_dcr_data));
`TRACE(2, ("%t: AFU: Goto STATE DCR_WRITE: addr=0x%0h data=%0d\n", $time, cmd_dcr_addr, cmd_dcr_data))
`endif
state <= STATE_DCR_WRITE;
end
CMD_RUN: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE RUN\n", $time));
`TRACE(2, ("%t: AFU: Goto STATE RUN\n", $time))
`endif
state <= STATE_RUN;
vx_running <= 0;
vx_reset_ctr <= RESET_CTR_WIDTH'(`RESET_DELAY-1);
vx_reset <= 1;
end
default: begin
state <= state;
@ -398,54 +405,56 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
if (cmd_mem_rd_done) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE IDLE\n", $time));
`TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time))
`endif
end
end
STATE_MEM_WRITE: begin
if (cmd_mem_wr_done) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE IDLE\n", $time));
`endif
end
end
STATE_DCR_WRITE: begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE IDLE\n", $time));
`TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time))
`endif
end
STATE_RUN: begin
if (vx_running) begin
if (vx_busy_wait) begin
// wait until the gpu goes busy
if (vx_busy) begin
vx_busy_wait <= 0;
end
end else begin
// wait until the gpu is not busy
if (~vx_busy) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: End execution\n", $time));
`TRACE(2, ("%d: STATE IDLE\n", $time));
`endif
end
end
if (vx_reset) begin
// wait until the reset network is ready
if (vx_reset_ctr == RESET_CTR_WIDTH'(0)) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
`endif
vx_busy_wait <= 1;
vx_reset <= 0;
end
end else begin
// wait until the reset sequence is complete
if (vx_reset_ctr == (`RESET_DELAY-1)) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
`endif
vx_running <= 1;
vx_busy_wait <= 1;
end
if (vx_busy_wait) begin
// wait until processor goes busy
if (vx_busy) begin
vx_busy_wait <= 0;
end
end else begin
// wait until the processor is not busy
if (~vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: End execution\n", $time))
`TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time))
`endif
state <= STATE_IDLE;
end
end
end
end
default:;
endcase
// ensure reset network initialization
if (vx_reset_ctr != RESET_CTR_WIDTH'(0)) begin
vx_reset_ctr <= vx_reset_ctr - RESET_CTR_WIDTH'(1);
end
end
end
@ -475,8 +484,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.TAG_WIDTH (AVS_REQ_TAGW)
) cci_vx_mem_bus_if[2]();
`RESET_RELAY (cci_adapter_reset, reset);
VX_mem_adapter #(
.SRC_DATA_WIDTH (CCI_DATA_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH),
@ -488,7 +495,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.RSP_OUT_BUF (0)
) cci_mem_adapter (
.clk (clk),
.reset (cci_adapter_reset),
.reset (reset),
.mem_req_valid_in (cci_mem_req_valid),
.mem_req_addr_in (cci_mem_req_addr),
@ -517,8 +524,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_rsp_ready_out (cci_vx_mem_bus_if[1].rsp_ready)
);
assign cci_vx_mem_bus_if[1].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[1].req_data.atype)
assign cci_vx_mem_bus_if[1].req_data.flags = '0;
//--
@ -528,8 +534,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
assign vx_mem_req_valid_qual = vx_mem_req_valid && ~vx_mem_is_cout;
`RESET_RELAY (vx_adapter_reset, reset);
VX_mem_adapter #(
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH),
@ -541,7 +545,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.RSP_OUT_BUF (2)
) vx_mem_adapter (
.clk (clk),
.reset (vx_adapter_reset),
.reset (reset),
.mem_req_valid_in (vx_mem_req_valid_qual),
.mem_req_addr_in (vx_mem_req_addr),
@ -570,8 +574,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_rsp_ready_out (cci_vx_mem_bus_if[0].rsp_ready)
);
assign cci_vx_mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[0].req_data.atype)
assign cci_vx_mem_bus_if[0].req_data.flags = '0;
//--
VX_mem_bus_if #(
@ -580,39 +583,37 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.TAG_WIDTH (AVS_REQ_TAGW+1)
) mem_bus_if[1]();
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (LMEM_DATA_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.TAG_WIDTH (AVS_REQ_TAGW),
.ARBITER ("P"),
.ARBITER ("P"), // prioritize VX requests
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.bus_in_if (cci_vx_mem_bus_if),
.bus_out_if (mem_bus_if)
);
//--
`RESET_RELAY (avs_adapter_reset, reset);
VX_avs_adapter #(
.DATA_WIDTH (LMEM_DATA_WIDTH),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.ADDR_WIDTH_IN (LMEM_ADDR_WIDTH),
.ADDR_WIDTH_OUT($bits(t_local_mem_addr)),
.BURST_WIDTH (LMEM_BURST_CTRW),
.NUM_BANKS (NUM_LOCAL_MEM_BANKS),
.TAG_WIDTH (AVS_REQ_TAGW + 1),
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE),
.BANK_INTERLEAVE(`PLATFORM_MEMORY_INTERLEAVE),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (0)
) avs_adapter (
.clk (clk),
.reset (avs_adapter_reset),
.reset (reset),
// Memory request
.mem_req_valid (mem_bus_if[0].req_valid),
@ -641,8 +642,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.avs_readdatavalid(avs_readdatavalid)
);
assign mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (mem_bus_if[0].req_data.atype)
`UNUSED_VAR (mem_bus_if[0].req_data.flags)
// CCI-P Read Request ///////////////////////////////////////////////////////////
@ -692,9 +692,11 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.reset (reset),
.incr (cci_rd_req_fire),
.decr (cci_rdq_pop),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_reads_full),
.size (cci_pending_reads),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
.size (cci_pending_reads)
);
`UNUSED_VAR (cci_pending_reads)
@ -748,7 +750,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + $bits(cci_rd_req_ctr)'(1);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Rd Req: addr=0x%0h, tag=0x%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads));
`TRACE(2, ("%t: AFU: CCI Rd Req: addr=0x%0h, tag=0x%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads))
`endif
end
@ -758,13 +760,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data));
`TRACE(2, ("%t: AFU: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data))
`endif
end
if (cci_rdq_pop) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads));
`TRACE(2, ("%t: AFU: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads))
`endif
end
@ -776,14 +778,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
end
end
`RESET_RELAY (cci_rdq_reset, reset);
VX_fifo_queue #(
.DATAW (CCI_RD_QUEUE_DATAW),
.DEPTH (CCI_RD_QUEUE_SIZE)
) cci_rd_req_queue (
.clk (clk),
.reset (cci_rdq_reset),
.reset (reset),
.push (cci_rdq_push),
.pop (cci_rdq_pop),
.data_in (cci_rdq_din),
@ -852,7 +852,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.incr (cci_mem_rd_rsp_fire),
.decr (cci_wr_rsp_fire),
.empty (cci_pending_writes_empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_writes_full),
`UNUSED_PIN (alm_full),
.size (cci_pending_writes)
);
@ -902,13 +904,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_wr_req_done <= 1;
end
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data));
`TRACE(2, ("%t: AFU: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data))
`endif
end
if (cci_wr_rsp_fire) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes));
`TRACE(2, ("%t: AFU: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes))
`endif
end
end
@ -926,17 +928,17 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
// Vortex ///////////////////////////////////////////////////////////////////
wire vx_dcr_wr_valid = (STATE_DCR_WRITE == state);
wire vx_dcr_wr_valid = (STATE_DCR_WRITE == state);
wire [`VX_DCR_ADDR_WIDTH-1:0] vx_dcr_wr_addr = cmd_dcr_addr;
wire [`VX_DCR_DATA_WIDTH-1:0] vx_dcr_wr_data = cmd_dcr_data;
`SCOPE_IO_SWITCH (2)
`SCOPE_IO_SWITCH (2);
Vortex vortex (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (reset || ~vx_running),
.reset (vx_reset),
// Memory request
.mem_req_valid (vx_mem_req_valid),
@ -966,7 +968,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [COUT_TID_WIDTH-1:0] cout_tid;
VX_onehot_encoder #(
VX_encoder #(
.N (`VX_MEM_BYTEEN_WIDTH)
) cout_tid_enc (
.data_in (vx_mem_req_byteen),
@ -987,7 +989,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire cout_q_push = vx_mem_req_valid && vx_mem_is_cout && ~cout_q_full;
wire cout_q_pop = cp2af_sRxPort.c0.mmioRdValid
&& (mmio_hdr.address == MMIO_STATUS)
&& (mmio_req_hdr.address == MMIO_STATUS)
&& ~cout_q_empty;
VX_fifo_queue #(
@ -1010,59 +1012,59 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU
`ifdef SCOPE
wire mem_req_fire = mem_bus_if[0].req_valid && mem_bus_if[0].req_ready;
wire mem_rsp_fire = mem_bus_if[0].rsp_valid && mem_bus_if[0].rsp_ready;
wire avs_write_fire = avs_write[0] && ~avs_waitrequest[0];
wire avs_read_fire = avs_read[0] && ~avs_waitrequest[0];
wire [$bits(t_local_mem_addr)-1:0] mem_bus_if_addr = mem_bus_if[0].req_data.addr;
reg [STATE_WIDTH-1:0] state_prev;
always @(posedge clk) begin
state_prev <= state;
end
wire state_changed = (state != state_prev);
wire state_changed = (state != state_prev);
wire vx_mem_req_fire = vx_mem_req_valid && vx_mem_req_ready;
wire vx_mem_rsp_fire = vx_mem_rsp_valid && vx_mem_rsp_ready;
wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0];
VX_scope_tap #(
.SCOPE_ID (0),
.TRIGGERW (24),
.PROBEW (431)
) scope_tap (
.clk(clk),
.reset(scope_reset_w[0]),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
state_changed,
mem_req_fire,
mem_rsp_fire,
avs_write_fire,
avs_read_fire,
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
vx_reset,
vx_busy,
vx_mem_req_valid,
vx_mem_req_ready,
vx_mem_rsp_valid,
vx_mem_rsp_ready,
avs_read[0],
avs_write[0],
avs_waitrequest[0],
avs_readdatavalid[0],
cp2af_sRxPort.c0.mmioRdValid,
cp2af_sRxPort.c0.mmioWrValid,
cp2af_sRxPort.c0.rspValid,
cp2af_sRxPort.c1.rspValid,
af2cp_sTxPort.c0.valid,
af2cp_sTxPort.c1.valid,
cp2af_sRxPort.c0TxAlmFull,
cp2af_sRxPort.c1TxAlmFull,
af2cp_sTxPort.c2.mmioRdValid,
cci_wr_req_fire,
cci_wr_rsp_fire,
cp2af_sRxPort.c1TxAlmFull
},{
state_changed,
vx_dcr_wr_valid, // ack-free
avs_readdatavalid[0], // ack-free
cp2af_sRxPort.c0.mmioRdValid, // ack-free
cp2af_sRxPort.c0.mmioWrValid, // ack-free
af2cp_sTxPort.c2.mmioRdValid, // ack-free
cp2af_sRxPort.c0.rspValid, // ack-free
cp2af_sRxPort.c1.rspValid, // ack-free
cci_rd_req_fire,
cci_rd_rsp_fire,
cci_pending_reads_full,
cci_pending_writes_empty,
cci_pending_writes_full
}),
.probes({
cci_wr_req_fire,
avs_req_fire,
vx_mem_req_fire,
vx_mem_rsp_fire
},{
cmd_type,
state,
mmio_hdr.address,
mmio_hdr.length,
vx_mem_req_rw,
vx_mem_req_byteen,
vx_mem_req_addr,
vx_mem_req_data,
vx_mem_req_tag,
vx_mem_rsp_data,
vx_mem_rsp_tag,
vx_dcr_wr_addr,
vx_dcr_wr_data,
mmio_req_hdr.address,
cp2af_sRxPort.c0.hdr.mdata,
af2cp_sTxPort.c0.hdr.address,
af2cp_sTxPort.c0.hdr.mdata,
@ -1074,15 +1076,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_mem_wr_req_ctr,
cci_rd_req_ctr,
cci_rd_rsp_ctr,
cci_wr_req_ctr,
mem_bus_if_addr
}),
.bus_in(scope_bus_in_w[0]),
.bus_out(scope_bus_out_w[0])
);
`endif
cci_wr_req_ctr
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED_W(0)
`SCOPE_IO_UNUSED(0)
`endif
///////////////////////////////////////////////////////////////////////////////
@ -1091,13 +1090,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
always @(posedge clk) begin
for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin
if (avs_write[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i]));
`TRACE(2, ("%t: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i]))
end
if (avs_read[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i]));
`TRACE(2, ("%t: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i]))
end
if (avs_readdatavalid[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h\n", $time, i, avs_readdata[i]));
`TRACE(2, ("%t: AVS Rd Rsp [%0d]: data=0x%h\n", $time, i, avs_readdata[i]))
end
end
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,9 +17,9 @@
`define AFU_ACCEL_NAME "vortex_afu"
`define AFU_ACCEL_UUID 128'h35F9452B_25C2_434C_93D5_6F8C60DB361C
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_WRITE 2
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_CMD_DCR_WRITE 4
`define AFU_IMAGE_CMD_MAX_VALUE 4

View file

@ -14,22 +14,20 @@
`include "vortex_afu.vh"
module VX_afu_ctrl #(
parameter AXI_ADDR_WIDTH = 8,
parameter AXI_DATA_WIDTH = 32,
parameter AXI_NUM_BANKS = 1
parameter S_AXI_ADDR_WIDTH = 8,
parameter S_AXI_DATA_WIDTH = 32
) (
// axi4 lite slave signals
input wire clk,
input wire reset,
input wire clk_en,
input wire s_axi_awvalid,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
output wire s_axi_awready,
input wire s_axi_wvalid,
input wire [AXI_DATA_WIDTH-1:0] s_axi_wdata,
input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
input wire [S_AXI_DATA_WIDTH-1:0] s_axi_wdata,
input wire [S_AXI_DATA_WIDTH/8-1:0]s_axi_wstrb,
output wire s_axi_wready,
output wire s_axi_bvalid,
@ -37,11 +35,11 @@ module VX_afu_ctrl #(
input wire s_axi_bready,
input wire s_axi_arvalid,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_araddr,
output wire s_axi_arready,
output wire s_axi_rvalid,
output wire [AXI_DATA_WIDTH-1:0] s_axi_rdata,
output wire [S_AXI_DATA_WIDTH-1:0] s_axi_rdata,
output wire [1:0] s_axi_rresp,
input wire s_axi_rready,
@ -57,8 +55,6 @@ module VX_afu_ctrl #(
output wire scope_bus_out,
`endif
output wire [63:0] mem_base [AXI_NUM_BANKS],
output wire dcr_wr_valid,
output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
output wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
@ -110,39 +106,36 @@ module VX_afu_ctrl #(
ADDR_DEV_0 = 8'h10,
ADDR_DEV_1 = 8'h14,
//ADDR_DEV_CTRL = 8'h18,
ADDR_ISA_0 = 8'h1C,
ADDR_ISA_1 = 8'h20,
//ADDR_ISA_CTRL = 8'h24,
ADDR_ISA_0 = 8'h18,
ADDR_ISA_1 = 8'h1C,
ADDR_DCR_0 = 8'h28,
ADDR_DCR_1 = 8'h2C,
//ADDR_DCR_CTRL = 8'h30,
ADDR_DCR_0 = 8'h20,
ADDR_DCR_1 = 8'h24,
`ifdef SCOPE
ADDR_SCP_0 = 8'h34,
ADDR_SCP_1 = 8'h38,
//ADDR_SCP_CTRL = 8'h3C,
ADDR_SCP_0 = 8'h28,
ADDR_SCP_1 = 8'h2C,
`endif
ADDR_MEM_0 = 8'h40,
ADDR_MEM_1 = 8'h44,
//ADDR_MEM_CTRL = 8'h48,
ADDR_BITS = 8;
localparam
WSTATE_IDLE = 2'd0,
WSTATE_ADDR = 2'd0,
WSTATE_DATA = 2'd1,
WSTATE_RESP = 2'd2;
WSTATE_RESP = 2'd2,
WSTATE_WIDTH = 2;
localparam
RSTATE_IDLE = 2'd0,
RSTATE_DATA = 2'd1;
RSTATE_ADDR = 2'd0,
RSTATE_DATA = 2'd1,
RSTATE_RESP = 2'd2,
RSTATE_WIDTH = 2;
// device caps
wire [63:0] dev_caps = {16'b0,
wire [63:0] dev_caps = {8'b0,
5'(`PLATFORM_MEMORY_ADDR_WIDTH-20),
3'(`CLOG2(`PLATFORM_MEMORY_BANKS)),
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
@ -153,16 +146,18 @@ module VX_afu_ctrl #(
2'(`CLOG2(`XLEN)-4),
30'(`MISA_STD)};
reg [1:0] wstate;
reg [WSTATE_WIDTH-1:0] wstate;
reg [ADDR_BITS-1:0] waddr;
wire [31:0] wmask;
wire s_axi_aw_fire;
wire s_axi_w_fire;
wire s_axi_b_fire;
reg [1:0] rstate;
logic [RSTATE_WIDTH-1:0] rstate;
reg [31:0] rdata;
wire [ADDR_BITS-1:0] raddr;
reg [ADDR_BITS-1:0] raddr;
wire s_axi_ar_fire;
wire s_axi_r_fire;
reg ap_reset_r;
reg ap_start_r;
@ -170,20 +165,23 @@ module VX_afu_ctrl #(
reg gie_r;
reg [1:0] ier_r;
reg [1:0] isr_r;
reg [63:0] mem_r [AXI_NUM_BANKS];
reg [31:0] dcra_r;
reg [31:0] dcrv_r;
reg dcr_wr_valid_r;
logic wready_stall;
logic rvalid_stall;
`ifdef SCOPE
reg [63:0] scope_bus_wdata;
reg [63:0] scope_bus_rdata;
reg [63:0] scope_bus_wdata, scope_bus_rdata;
reg [5:0] scope_bus_ctr;
reg cmd_scope_reading;
reg cmd_scope_writing;
reg cmd_scope_writing, cmd_scope_reading;
reg scope_bus_out_r;
reg scope_rdata_valid;
reg is_scope_waddr, is_scope_raddr;
always @(posedge clk) begin
if (reset) begin
@ -191,18 +189,33 @@ module VX_afu_ctrl #(
cmd_scope_writing <= 0;
scope_bus_ctr <= '0;
scope_bus_out_r <= 0;
end else if (clk_en) begin
is_scope_waddr <= 0;
is_scope_raddr <= 0;
scope_bus_rdata <= '0;
scope_rdata_valid <= 0;
end else begin
scope_bus_out_r <= 0;
if (s_axi_aw_fire) begin
is_scope_waddr <= (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|| (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_1);
end
if (s_axi_ar_fire) begin
is_scope_raddr <= (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|| (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_1);
end
if (s_axi_w_fire && waddr == ADDR_SCP_0) begin
scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask);
end
if (s_axi_w_fire && waddr == ADDR_SCP_1) begin
scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
cmd_scope_writing <= 1;
scope_rdata_valid <= 0;
scope_bus_out_r <= 1;
scope_bus_ctr <= 63;
end
if (scope_bus_in) begin
cmd_scope_reading <= 1;
scope_bus_rdata <= '0;
scope_bus_ctr <= 63;
end
if (cmd_scope_reading) begin
@ -210,13 +223,16 @@ module VX_afu_ctrl #(
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_reading <= 0;
scope_rdata_valid <= 1;
scope_bus_ctr <= 0;
end
end
if (cmd_scope_writing) begin
scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr);
scope_bus_out_r <= scope_bus_wdata[scope_bus_ctr];
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_writing <= 0;
scope_bus_ctr <= 0;
end
end
end
@ -224,41 +240,50 @@ module VX_afu_ctrl #(
assign scope_bus_out = scope_bus_out_r;
assign wready_stall = is_scope_waddr && cmd_scope_writing;
assign rvalid_stall = is_scope_raddr && ~scope_rdata_valid;
`else
assign wready_stall = 0;
assign rvalid_stall = 0;
`endif
// AXI Write
// AXI Write Request
assign s_axi_awready = (wstate == WSTATE_ADDR);
assign s_axi_wready = (wstate == WSTATE_DATA) && ~wready_stall;
assign s_axi_awready = (wstate == WSTATE_IDLE);
assign s_axi_wready = (wstate == WSTATE_DATA);
// AXI Write Response
assign s_axi_bvalid = (wstate == WSTATE_RESP);
assign s_axi_bresp = 2'b00; // OKAY
assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
assign s_axi_w_fire = s_axi_wvalid && s_axi_wready;
for (genvar i = 0; i < 4; ++i) begin
for (genvar i = 0; i < 4; ++i) begin : g_wmask
assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
end
assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
assign s_axi_w_fire = s_axi_wvalid && s_axi_wready;
assign s_axi_b_fire = s_axi_bvalid && s_axi_bready;
// wstate
always @(posedge clk) begin
if (reset) begin
wstate <= WSTATE_IDLE;
end else if (clk_en) begin
wstate <= WSTATE_ADDR;
end else begin
case (wstate)
WSTATE_IDLE: wstate <= s_axi_awvalid ? WSTATE_DATA : WSTATE_IDLE;
WSTATE_DATA: wstate <= s_axi_wvalid ? WSTATE_RESP : WSTATE_DATA;
WSTATE_RESP: wstate <= s_axi_bready ? WSTATE_IDLE : WSTATE_RESP;
default: wstate <= WSTATE_IDLE;
WSTATE_ADDR: wstate <= s_axi_aw_fire ? WSTATE_DATA : WSTATE_ADDR;
WSTATE_DATA: wstate <= s_axi_w_fire ? WSTATE_RESP : WSTATE_DATA;
WSTATE_RESP: wstate <= s_axi_b_fire ? WSTATE_ADDR : WSTATE_RESP;
default: wstate <= WSTATE_ADDR;
endcase
end
end
// waddr
always @(posedge clk) begin
if (clk_en) begin
if (s_axi_aw_fire)
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
if (s_axi_aw_fire) begin
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
end
end
@ -276,16 +301,13 @@ module VX_afu_ctrl #(
dcra_r <= '0;
dcrv_r <= '0;
dcr_wr_valid_r <= 0;
end else begin
dcr_wr_valid_r <= 0;
ap_reset_r <= 0;
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
mem_r[i] <= '0;
end
end else if (clk_en) begin
if (ap_ready)
ap_start_r <= auto_restart_r;
dcr_wr_valid_r <= 0;
if (s_axi_w_fire) begin
case (waddr)
ADDR_AP_CTRL: begin
@ -317,16 +339,7 @@ module VX_afu_ctrl #(
dcrv_r <= (s_axi_wdata & wmask) | (dcrv_r & ~wmask);
dcr_wr_valid_r <= 1;
end
default: begin
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
if (waddr == (ADDR_MEM_0 + 8'(i) * 8'd12)) begin
mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
end
if (waddr == (ADDR_MEM_1 + 8'(i) * 8'd12)) begin
mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
end
end
end
default:;
endcase
if (ier_r[0] & ap_done)
@ -337,83 +350,86 @@ module VX_afu_ctrl #(
end
end
// AXI Read
// AXI Read Request
assign s_axi_arready = (rstate == RSTATE_ADDR);
assign s_axi_arready = (rstate == RSTATE_IDLE);
assign s_axi_rvalid = (rstate == RSTATE_DATA);
// AXI Read Response
assign s_axi_rvalid = (rstate == RSTATE_RESP);
assign s_axi_rdata = rdata;
assign s_axi_rresp = 2'b00; // OKAY
assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready;
assign raddr = s_axi_araddr[ADDR_BITS-1:0];
assign s_axi_r_fire = s_axi_rvalid && s_axi_rready;
// rstate
always @(posedge clk) begin
if (reset) begin
rstate <= RSTATE_IDLE;
end else if (clk_en) begin
rstate <= RSTATE_ADDR;
end else begin
case (rstate)
RSTATE_IDLE: rstate <= s_axi_arvalid ? RSTATE_DATA : RSTATE_IDLE;
RSTATE_DATA: rstate <= (s_axi_rready & s_axi_rvalid) ? RSTATE_IDLE : RSTATE_DATA;
default: rstate <= RSTATE_IDLE;
RSTATE_ADDR: rstate <= s_axi_ar_fire ? RSTATE_DATA : RSTATE_ADDR;
RSTATE_DATA: rstate <= (~rvalid_stall) ? RSTATE_RESP : RSTATE_DATA;
RSTATE_RESP: rstate <= s_axi_r_fire ? RSTATE_ADDR : RSTATE_RESP;
default: rstate <= RSTATE_ADDR;
endcase
end
end
// raddr
always @(posedge clk) begin
if (s_axi_ar_fire) begin
raddr <= s_axi_araddr[ADDR_BITS-1:0];
end
end
// rdata
always @(posedge clk) begin
if (clk_en) begin
if (s_axi_ar_fire) begin
rdata <= '0;
case (raddr)
ADDR_AP_CTRL: begin
rdata[0] <= ap_start_r;
rdata[1] <= ap_done;
rdata[2] <= ap_idle;
rdata[3] <= ap_ready;
rdata[7] <= auto_restart_r;
end
ADDR_GIE: begin
rdata <= 32'(gie_r);
end
ADDR_IER: begin
rdata <= 32'(ier_r);
end
ADDR_ISR: begin
rdata <= 32'(isr_r);
end
ADDR_DEV_0: begin
rdata <= dev_caps[31:0];
end
ADDR_DEV_1: begin
rdata <= dev_caps[63:32];
end
ADDR_ISA_0: begin
rdata <= isa_caps[31:0];
end
ADDR_ISA_1: begin
rdata <= isa_caps[63:32];
end
`ifdef SCOPE
ADDR_SCP_0: begin
rdata <= scope_bus_rdata[31:0];
end
ADDR_SCP_1: begin
rdata <= scope_bus_rdata[63:32];
end
`endif
default:;
endcase
rdata <= '0;
case (raddr)
ADDR_AP_CTRL: begin
rdata[0] <= ap_start_r;
rdata[1] <= ap_done;
rdata[2] <= ap_idle;
rdata[3] <= ap_ready;
rdata[7] <= auto_restart_r;
end
end
ADDR_GIE: begin
rdata <= 32'(gie_r);
end
ADDR_IER: begin
rdata <= 32'(ier_r);
end
ADDR_ISR: begin
rdata <= 32'(isr_r);
end
ADDR_DEV_0: begin
rdata <= dev_caps[31:0];
end
ADDR_DEV_1: begin
rdata <= dev_caps[63:32];
end
ADDR_ISA_0: begin
rdata <= isa_caps[31:0];
end
ADDR_ISA_1: begin
rdata <= isa_caps[63:32];
end
`ifdef SCOPE
ADDR_SCP_0: begin
rdata <= scope_bus_rdata[31:0];
end
ADDR_SCP_1: begin
rdata <= scope_bus_rdata[63:32];
end
`endif
default:;
endcase
end
assign ap_reset = ap_reset_r;
assign ap_start = ap_start_r;
assign interrupt = gie_r & (| isr_r);
assign mem_base = mem_r;
assign dcr_wr_valid = dcr_wr_valid_r;
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);
assign dcr_wr_data = `VX_DCR_DATA_WIDTH'(dcrv_r);

View file

@ -16,17 +16,21 @@
module VX_afu_wrap #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
parameter C_M_AXI_MEM_ID_WIDTH = 32,
parameter C_M_AXI_MEM_DATA_WIDTH = 512,
parameter C_M_AXI_MEM_ADDR_WIDTH = 25,
parameter C_M_AXI_MEM_NUM_BANKS = 2
) (
// System signals
input wire ap_clk,
input wire ap_rst_n,
input wire clk,
input wire reset,
// AXI4 master interface
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
@ -48,11 +52,18 @@ module VX_afu_wrap #(
output wire interrupt
);
localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS;
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH + $clog2(`PLATFORM_MEMORY_BANKS);
`else
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
`endif
localparam STATE_IDLE = 0;
localparam STATE_RUN = 1;
localparam PENDING_SIZEW = 12; // max outstanding requests size
localparam C_M_AXI_MEM_NUM_BANKS_SW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
@ -80,19 +91,18 @@ module VX_afu_wrap #(
wire [1:0] m_axi_mem_rresp_a [C_M_AXI_MEM_NUM_BANKS];
// convert memory interface to array
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
wire reset = ~ap_rst_n;
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`endif
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [15:0] vx_pending_writes;
reg [PENDING_SIZEW-1:0] vx_pending_writes;
reg vx_busy_wait;
reg vx_running;
reg vx_reset = 1; // asserted at initialization
wire vx_busy;
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
wire dcr_wr_valid;
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
@ -101,8 +111,8 @@ module VX_afu_wrap #(
wire ap_reset;
wire ap_start;
wire ap_idle = ~vx_running;
wire ap_done = ~(state == STATE_RUN || vx_pending_writes != 0);
wire ap_idle = vx_reset;
wire ap_done = (state == STATE_IDLE) && (vx_pending_writes == '0);
wire ap_ready = 1'b1;
`ifdef SCOPE
@ -111,24 +121,33 @@ module VX_afu_wrap #(
wire scope_reset = reset;
`endif
always @(posedge ap_clk) begin
always @(posedge clk) begin
if (reset || ap_reset) begin
state <= STATE_IDLE;
vx_busy_wait <= 0;
vx_running <= 0;
state <= STATE_IDLE;
vx_reset <= 1;
end else begin
case (state)
STATE_IDLE: begin
if (ap_start) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE RUN\n", $time));
`TRACE(2, ("%t: AFU: Goto STATE RUN\n", $time))
`endif
state <= STATE_RUN;
vx_running <= 0;
vx_reset_ctr <= (`RESET_DELAY-1);
vx_reset <= 1;
end
end
STATE_RUN: begin
if (vx_running) begin
if (vx_reset) begin
// wait until the reset network is ready
if (vx_reset_ctr == 0) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
`endif
vx_busy_wait <= 1;
vx_reset <= 0;
end
end else begin
if (vx_busy_wait) begin
// wait until processor goes busy
if (vx_busy) begin
@ -137,67 +156,63 @@ module VX_afu_wrap #(
end else begin
// wait until the processor is not busy
if (~vx_busy) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: End execution\n", $time));
`TRACE(2, ("%d: STATE IDLE\n", $time));
`TRACE(2, ("%t: AFU: End execution\n", $time))
`TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time))
`endif
state <= STATE_IDLE;
end
end
end else begin
// wait until the reset sequence is complete
if (vx_reset_ctr == (`RESET_DELAY-1)) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
`endif
vx_running <= 1;
vx_busy_wait <= 1;
end
end
end
endcase
// ensure reset network initialization
if (vx_reset_ctr != '0) begin
vx_reset_ctr <= vx_reset_ctr - 1;
end
end
end
reg m_axi_mem_wfire;
reg m_axi_mem_bfire;
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
wire [C_M_AXI_MEM_NUM_BANKS_SW-1:0] cur_wr_reqs, cur_wr_rsps;
always @(*) begin
m_axi_mem_wfire = 0;
m_axi_mem_bfire = 0;
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
m_axi_mem_wfire |= m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i];
m_axi_mem_bfire |= m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
end
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_awfire
VX_axi_write_ack axi_write_ack (
.clk (clk),
.reset (reset),
.awvalid(m_axi_mem_awvalid_a[i]),
.awready(m_axi_mem_awready_a[i]),
.wvalid (m_axi_mem_wvalid_a[i]),
.wready (m_axi_mem_wready_a[i]),
.tx_ack (m_axi_wr_req_fire[i]),
`UNUSED_PIN (aw_ack),
`UNUSED_PIN (w_ack),
`UNUSED_PIN (tx_rdy)
);
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] & m_axi_mem_bready_a[i];
end
always @(posedge ap_clk) begin
if (reset || ap_reset) begin
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
`POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire);
wire signed [C_M_AXI_MEM_NUM_BANKS_SW:0] reqs_sub = (C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_reqs) -
(C_M_AXI_MEM_NUM_BANKS_SW+1)'(cur_wr_rsps);
always @(posedge clk) begin
if (reset) begin
vx_pending_writes <= '0;
end else begin
if (m_axi_mem_wfire && ~m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes + 1;
if (~m_axi_mem_wfire && m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes - 1;
end
end
always @(posedge ap_clk) begin
if (state == STATE_RUN) begin
vx_reset_ctr <= vx_reset_ctr + 1;
end else begin
vx_reset_ctr <= '0;
vx_pending_writes <= vx_pending_writes + PENDING_SIZEW'(reqs_sub);
end
end
VX_afu_ctrl #(
.AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
.AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
.S_AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
.S_AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH)
) afu_ctrl (
.clk (ap_clk),
.reset (reset || ap_reset),
.clk_en (1'b1),
.clk (clk),
.reset (reset),
.s_axi_awvalid (s_axi_ctrl_awvalid),
.s_axi_awready (s_axi_ctrl_awready),
@ -229,37 +244,36 @@ module VX_afu_wrap #(
.scope_bus_out (scope_bus_in),
`endif
.mem_base (mem_base),
.dcr_wr_valid (dcr_wr_valid),
.dcr_wr_addr (dcr_wr_addr),
.dcr_wr_data (dcr_wr_data)
);
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS];
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS];
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing
localparam [C_M_AXI_MEM_ADDR_WIDTH-1:0] BANK_OFFSET = C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET) + C_M_AXI_MEM_ADDR_WIDTH'(i) << M_AXI_MEM_ADDR_WIDTH;
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + BANK_OFFSET;
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + BANK_OFFSET;
end
`SCOPE_IO_SWITCH (2)
`SCOPE_IO_SWITCH (2);
Vortex_axi #(
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.AXI_ADDR_WIDTH (`MEM_ADDR_WIDTH),
.AXI_ADDR_WIDTH (M_AXI_MEM_ADDR_WIDTH),
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) vortex_axi (
`SCOPE_IO_BIND (1)
.clk (ap_clk),
.reset (reset || ap_reset || ~vx_running),
.clk (clk),
.reset (vx_reset),
.m_axi_awvalid (m_axi_mem_awvalid_a),
.m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_w),
.m_axi_awaddr (m_axi_mem_awaddr_u),
.m_axi_awid (m_axi_mem_awid_a),
.m_axi_awlen (m_axi_mem_awlen_a),
`UNUSED_PIN (m_axi_awsize),
@ -283,7 +297,7 @@ module VX_afu_wrap #(
.m_axi_arvalid (m_axi_mem_arvalid_a),
.m_axi_arready (m_axi_mem_arready_a),
.m_axi_araddr (m_axi_mem_araddr_w),
.m_axi_araddr (m_axi_mem_araddr_u),
.m_axi_arid (m_axi_mem_arid_a),
.m_axi_arlen (m_axi_mem_arlen_a),
`UNUSED_PIN (m_axi_arsize),
@ -310,42 +324,60 @@ module VX_afu_wrap #(
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU
`ifdef SCOPE
`define TRIGGERS { \
reset, \
ap_start, \
ap_done, \
ap_idle, \
interrupt, \
vx_busy_wait, \
vx_busy, \
vx_running \
}
`ifdef DBG_SCOPE_AFU
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
`define PROBES { \
vx_pending_writes \
}
VX_scope_tap #(
.SCOPE_ID (0),
.TRIGGERW ($bits(`TRIGGERS)),
.PROBEW ($bits(`PROBES))
) scope_tap (
.clk(clk),
.reset(scope_reset_w[0]),
.start(1'b0),
.stop(1'b0),
.triggers(`TRIGGERS),
.probes(`PROBES),
.bus_in(scope_bus_in_w[0]),
.bus_out(scope_bus_out_w[0])
);
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
ap_reset,
ap_start,
ap_done,
ap_idle,
interrupt,
vx_reset,
vx_busy,
m_axi_mem_awvalid_a[0],
m_axi_mem_awready_a[0],
m_axi_mem_wvalid_a[0],
m_axi_mem_wready_a[0],
m_axi_mem_bvalid_a[0],
m_axi_mem_bready_a[0],
m_axi_mem_arvalid_a[0],
m_axi_mem_arready_a[0],
m_axi_mem_rvalid_a[0],
m_axi_mem_rready_a[0]
}, {
dcr_wr_valid,
m_axi_mem_awfire_0,
m_axi_mem_arfire_0,
m_axi_mem_wfire_0,
m_axi_mem_bfire_0
},{
dcr_wr_addr,
dcr_wr_data,
vx_pending_writes,
m_axi_mem_awaddr_u[0],
m_axi_mem_awid_a[0],
m_axi_mem_bid_a[0],
m_axi_mem_araddr_u[0],
m_axi_mem_arid_a[0],
m_axi_mem_rid_a[0]
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
ila_afu ila_afu_inst (
.clk (ap_clk),
.clk (clk),
.probe0 ({
ap_reset,
ap_start,
ap_done,
ap_idle,
@ -355,13 +387,13 @@ module VX_afu_wrap #(
vx_pending_writes,
vx_busy_wait,
vx_busy,
vx_running
vx_reset,
dcr_wr_valid,
dcr_wr_addr,
dcr_wr_data
})
);
`endif
`else
`SCOPE_IO_UNUSED_W(0)
`endif
`ifdef SIMULATION
`ifndef VERILATOR
@ -371,7 +403,7 @@ module VX_afu_wrap #(
initial begin
$assertoff(0, vortex_axi);
end
always @(posedge ap_clk) begin
always @(posedge clk) begin
if (reset) begin
assert_delay_ctr <= '0;
assert_enabled <= 0;
@ -390,19 +422,19 @@ module VX_afu_wrap #(
`endif
`ifdef DBG_TRACE_AFU
always @(posedge ap_clk) begin
always @(posedge clk) begin
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i]));
`TRACE(2, ("%t: AXI Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]))
end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
end
end
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,16 +16,25 @@
module vortex_afu #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_WIDTH,
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
parameter C_M_AXI_MEM_NUM_BANKS = 1
`else
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_BANKS
`endif
) (
// System signals
input wire ap_clk,
input wire ap_rst_n,
// AXI4 master interface
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
@ -45,8 +54,8 @@ module vortex_afu #(
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
output wire interrupt
output wire interrupt
);
VX_afu_wrap #(
@ -54,16 +63,19 @@ module vortex_afu #(
.C_S_AXI_CTRL_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
.C_M_AXI_MEM_ID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.C_M_AXI_MEM_ADDR_WIDTH (C_M_AXI_MEM_ADDR_WIDTH),
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH)
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.C_M_AXI_MEM_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) afu_wrap (
.ap_clk (ap_clk),
.ap_rst_n (ap_rst_n),
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
.clk (ap_clk),
.reset (~ap_rst_n),
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
`endif
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready),
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
.s_axi_ctrl_wvalid (s_axi_ctrl_wvalid),
.s_axi_ctrl_wready (s_axi_ctrl_wready),
.s_axi_ctrl_wdata (s_axi_ctrl_wdata),
@ -81,5 +93,5 @@ module vortex_afu #(
.interrupt (interrupt)
);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,12 +14,24 @@
`ifndef VORTEX_AFU_VH
`define VORTEX_AFU_VH
`ifndef M_AXI_MEM_NUM_BANKS
`define M_AXI_MEM_NUM_BANKS 1
`ifndef PLATFORM_MEMORY_BANKS
`define PLATFORM_MEMORY_BANKS 2
`endif
`ifndef M_AXI_MEM_ID_WIDTH
`define M_AXI_MEM_ID_WIDTH 32
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
`define PLATFORM_MEMORY_ADDR_WIDTH 31
`endif
`ifndef PLATFORM_MEMORY_DATA_WIDTH
`define PLATFORM_MEMORY_DATA_WIDTH 512
`endif
`ifndef PLATFORM_MEMORY_OFFSET
`define PLATFORM_MEMORY_OFFSET 0
`endif
`ifndef PLATFORM_MEMORY_ID_WIDTH
`define PLATFORM_MEMORY_ID_WIDTH 32
`endif
`define GEN_AXI_MEM(i) \

129
hw/rtl/cache/VX_bank_flush.sv vendored Normal file
View file

@ -0,0 +1,129 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_bank_flush #(
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Enable cache writeback
parameter WRITEBACK = 0
) (
input wire clk,
input wire reset,
input wire flush_begin,
output wire flush_end,
output wire flush_init,
output wire flush_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
output wire [NUM_WAYS-1:0] flush_way,
input wire flush_ready,
input wire mshr_empty,
input wire bank_empty
);
// ways interation is only needed when eviction is enabled
localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);
localparam STATE_IDLE = 0;
localparam STATE_INIT = 1;
localparam STATE_WAIT1 = 2;
localparam STATE_FLUSH = 3;
localparam STATE_WAIT2 = 4;
localparam STATE_DONE = 5;
reg [2:0] state_r, state_n;
reg [CTR_WIDTH-1:0] counter_r;
always @(*) begin
state_n = state_r;
case (state_r)
STATE_IDLE: begin
if (flush_begin) begin
state_n = STATE_WAIT1;
end
end
STATE_INIT: begin
if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
state_n = STATE_IDLE;
end
end
STATE_WAIT1: begin
// wait for pending requests to complete
if (mshr_empty) begin
state_n = STATE_FLUSH;
end
end
STATE_FLUSH: begin
if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
end
end
STATE_WAIT2: begin
// ensure the bank is empty before notifying the cache flush unit,
// because the flush request to lower caches only goes through bank0
// and it is important that request gets send out last.
if (bank_empty) begin
state_n = STATE_DONE;
end
end
STATE_DONE: begin
// generate a completion pulse
state_n = STATE_IDLE;
end
endcase
end
always @(posedge clk) begin
if (reset) begin
state_r <= STATE_INIT;
counter_r <= '0;
end else begin
state_r <= state_n;
if (state_r != STATE_IDLE) begin
if ((state_r == STATE_INIT)
|| ((state_r == STATE_FLUSH) && flush_ready)) begin
counter_r <= counter_r + CTR_WIDTH'(1);
end
end else begin
counter_r <= '0;
end
end
end
assign flush_end = (state_r == STATE_DONE);
assign flush_init = (state_r == STATE_INIT);
assign flush_valid = (state_r == STATE_FLUSH);
assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0];
if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin : g_flush_way
VX_decoder #(
.N (`CS_WAY_SEL_BITS),
.D (NUM_WAYS)
) ctr_decoder (
.data_in (counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]),
.valid_in (1'b1),
.data_out (flush_way)
);
end else begin : g_flush_way_all
assign flush_way = {NUM_WAYS{1'b1}};
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,15 +14,15 @@
`include "VX_cache_define.vh"
module VX_cache import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter `STRING INSTANCE_ID = "",
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
@ -33,7 +33,7 @@ module VX_cache import VX_gpu_pkg::*; #(
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -42,6 +42,12 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -53,12 +59,12 @@ module VX_cache import VX_gpu_pkg::*; #(
// Memory request output register
parameter MEM_OUT_BUF = 0
) (
) (
// PERF
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
input wire clk,
input wire reset,
@ -66,23 +72,32 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
// In writeback mode, memory fill response may issue a new memory request to handle evicted blocks.
// We need to ensure that the memory request queue never fills up to avoid deadlock.
`STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH);
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WORD_WIDTH = WORD_SIZE * 8;
localparam WORD_SEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH;
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam CORE_RSP_REG_DISABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_REG_DISABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
@ -90,24 +105,33 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
`endif
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus2_if[NUM_REQS]();
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus_if[i].req_valid;
assign core_req_rw[i] = core_bus_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
`UNUSED_VAR (core_bus_if[i].req_data.atype)
end
wire [NUM_BANKS-1:0] per_bank_flush_begin;
wire [`UP(UUID_WIDTH)-1:0] flush_uuid;
wire [NUM_BANKS-1:0] per_bank_flush_end;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.UUID_WIDTH(UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_begin (per_bank_flush_begin),
.flush_uuid (flush_uuid),
.flush_end (per_bank_flush_end)
);
///////////////////////////////////////////////////////////////////////////
@ -117,99 +141,101 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
`RESET_RELAY (core_rsp_reset, reset);
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.SIZE (CORE_RSP_REG_DISABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset),
.reset (reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready)
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
///////////////////////////////////////////////////////////////////////////
// Memory request buffering
wire mem_req_valid_s;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire mem_req_rw_s;
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_ready_s;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
assign mem_bus_if.req_data.atype = '0;
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if();
// Memory response buffering
wire mem_rsp_valid_s;
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (mem_bus_if.rsp_valid),
.ready_in (mem_bus_if.rsp_ready),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
.valid_in (mem_bus_tmp_if.rsp_valid),
.ready_in (mem_bus_tmp_if.rsp_ready),
.data_in ({mem_bus_tmp_if.rsp_data.tag, mem_bus_tmp_if.rsp_data.data}),
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
.valid_out (mem_rsp_valid_s),
.ready_out (mem_rsp_ready_s)
);
///////////////////////////////////////////////////////////////////////
wire [BANK_MEM_TAG_WIDTH-1:0] bank_mem_rsp_tag;
wire [`UP(`CS_BANK_SEL_BITS)-1:0] mem_rsp_bank_id;
wire [`CS_LINE_SEL_BITS-1:0] init_line_sel;
wire init_enable;
if (NUM_BANKS > 1) begin : g_mem_rsp_tag_s_with_banks
assign bank_mem_rsp_tag = mem_rsp_tag_s[MEM_TAG_WIDTH-1:`CS_BANK_SEL_BITS];
assign mem_rsp_bank_id = mem_rsp_tag_s[`CS_BANK_SEL_BITS-1:0];
end else begin : g_mem_rsp_tag_s_no_bank
assign bank_mem_rsp_tag = mem_rsp_tag_s;
assign mem_rsp_bank_id = 0;
end
// this reset relay is required to sync with bank initialization
`RESET_RELAY (init_reset, reset);
// Memory request buffering
VX_cache_init #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS)
) cache_init (
wire mem_req_valid;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr;
wire mem_req_rw;
wire [LINE_SIZE-1:0] mem_req_byteen;
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_flush;
wire mem_req_ready;
wire mem_req_flush_b;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_REG_DISABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (init_reset),
.addr_out (init_line_sel),
.valid_out (init_enable)
.reset (reset),
.valid_in (mem_req_valid),
.ready_in (mem_req_ready),
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data, mem_req_tag, mem_req_flush}),
.data_out ({mem_bus_tmp_if.req_data.rw, mem_bus_tmp_if.req_data.byteen, mem_bus_tmp_if.req_data.addr, mem_bus_tmp_if.req_data.data, mem_bus_tmp_if.req_data.tag, mem_req_flush_b}),
.valid_out (mem_bus_tmp_if.req_valid),
.ready_out (mem_bus_tmp_if.req_ready)
);
///////////////////////////////////////////////////////////////////////
assign mem_bus_tmp_if.req_data.flags = mem_req_flush_b ? `MEM_REQ_FLAGS_WIDTH'(1 << `MEM_REQ_FLAG_FLUSH) : '0;
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if);
end
///////////////////////////////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0] per_bank_core_req_rw;
@ -218,81 +244,105 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_rsp_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_rsp_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_rsp_idx;
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_mem_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_req_tag;
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s)];
end
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[mem_rsp_bank_id];
// Bank requests dispatch
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_flush;
wire [NUM_REQS-1:0] core_req_ready;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] core_req_bid;
wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0] core_req_wsel;
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req
assign core_req_valid[i] = core_bus2_if[i].req_valid;
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
assign core_req_data[i] = core_bus2_if[i].req_data.data;
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
assign core_req_flush[i] = core_bus2_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH];
assign core_bus2_if[i].req_ready = core_req_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_wsel
if (WORDS_PER_LINE > 1) begin : g_wsel
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
end else begin
end else begin : g_no_wsel
assign core_req_wsel[i] = '0;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_line_addr
assign core_req_line_addr[i] = core_req_addr[i][(BANK_SEL_BITS + WORD_SEL_BITS) +: LINE_ADDR_WIDTH];
end
if (NUM_BANKS > 1) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_bid
if (NUM_BANKS > 1) begin : g_multibanks
assign core_req_bid[i] = core_req_addr[i][WORD_SEL_BITS +: BANK_SEL_BITS];
end else begin : g_singlebank
assign core_req_bid[i] = '0;
end
end else begin
assign core_req_bid = '0;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_data_in
assign core_req_data_in[i] = {
core_req_line_addr[i],
core_req_rw[i],
core_req_wsel[i],
core_req_byteen[i],
core_req_byteen[i],
core_req_data[i],
core_req_tag[i]};
core_req_tag[i],
core_req_flush[i]
};
end
`ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_collisions;
`endif
`RESET_RELAY (req_xbar_reset, reset);
VX_stream_xbar #(
VX_stream_xbar #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.OUT_BUF ((NUM_REQS > 4) ? 2 : 0)
.ARBITER ("R"),
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
.reset (req_xbar_reset),
.reset (reset),
`ifdef PERF_ENABLE
.collisions(perf_collisions),
`else
@ -308,32 +358,27 @@ module VX_cache import VX_gpu_pkg::*; #(
.ready_out (per_bank_core_req_ready)
);
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_req_data_out
assign {
per_bank_core_req_addr[i],
per_bank_core_req_rw[i],
per_bank_core_req_wsel[i],
per_bank_core_req_byteen[i],
per_bank_core_req_byteen[i],
per_bank_core_req_data[i],
per_bank_core_req_tag[i]} = core_req_data_out[i];
per_bank_core_req_tag[i],
per_bank_core_req_flush[i]
} = core_req_data_out[i];
end
// Banks access
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : g_banks
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid;
if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
end else begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == i);
end
wire curr_bank_mem_rsp_valid = mem_rsp_valid_s && (mem_rsp_bank_id == bank_id);
`RESET_RELAY (bank_reset, reset);
VX_cache_bank #(
.BANK_ID (i),
.INSTANCE_ID (INSTANCE_ID),
VX_cache_bank #(
.BANK_ID (bank_id),
.INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
@ -344,84 +389,87 @@ module VX_cache import VX_gpu_pkg::*; #(
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.DIRTY_BYTES (DIRTY_BYTES),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
) bank (
.CORE_OUT_REG (CORE_RSP_REG_DISABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)),
.MEM_OUT_REG (MEM_REQ_REG_DISABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF))
) bank (
.clk (clk),
.reset (bank_reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_read_misses (perf_read_miss_per_bank[i]),
.perf_write_misses (perf_write_miss_per_bank[i]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[i]),
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
`endif
// Core request
.core_req_valid (per_bank_core_req_valid[i]),
.core_req_addr (per_bank_core_req_addr[i]),
.core_req_rw (per_bank_core_req_rw[i]),
.core_req_wsel (per_bank_core_req_wsel[i]),
.core_req_byteen (per_bank_core_req_byteen[i]),
.core_req_data (per_bank_core_req_data[i]),
.core_req_tag (per_bank_core_req_tag[i]),
.core_req_idx (per_bank_core_req_idx[i]),
.core_req_ready (per_bank_core_req_ready[i]),
// Core response
.core_rsp_valid (per_bank_core_rsp_valid[i]),
.core_rsp_data (per_bank_core_rsp_data[i]),
.core_rsp_tag (per_bank_core_rsp_tag[i]),
.core_rsp_idx (per_bank_core_rsp_idx[i]),
.core_rsp_ready (per_bank_core_rsp_ready[i]),
// Core request
.core_req_valid (per_bank_core_req_valid[bank_id]),
.core_req_addr (per_bank_core_req_addr[bank_id]),
.core_req_rw (per_bank_core_req_rw[bank_id]),
.core_req_wsel (per_bank_core_req_wsel[bank_id]),
.core_req_byteen (per_bank_core_req_byteen[bank_id]),
.core_req_data (per_bank_core_req_data[bank_id]),
.core_req_tag (per_bank_core_req_tag[bank_id]),
.core_req_idx (per_bank_core_req_idx[bank_id]),
.core_req_flush (per_bank_core_req_flush[bank_id]),
.core_req_ready (per_bank_core_req_ready[bank_id]),
// Core response
.core_rsp_valid (per_bank_core_rsp_valid[bank_id]),
.core_rsp_data (per_bank_core_rsp_data[bank_id]),
.core_rsp_tag (per_bank_core_rsp_tag[bank_id]),
.core_rsp_idx (per_bank_core_rsp_idx[bank_id]),
.core_rsp_ready (per_bank_core_rsp_ready[bank_id]),
// Memory request
.mem_req_valid (per_bank_mem_req_valid[i]),
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_rw (per_bank_mem_req_rw[i]),
.mem_req_wsel (per_bank_mem_req_wsel[i]),
.mem_req_byteen (per_bank_mem_req_byteen[i]),
.mem_req_data (per_bank_mem_req_data[i]),
.mem_req_id (per_bank_mem_req_id[i]),
.mem_req_ready (per_bank_mem_req_ready[i]),
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_data (per_bank_mem_req_data[bank_id]),
.mem_req_tag (per_bank_mem_req_tag[bank_id]),
.mem_req_flush (per_bank_mem_req_flush[bank_id]),
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s),
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[i]),
.mem_rsp_tag (bank_mem_rsp_tag),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// initialization
.init_enable (init_enable),
.init_line_sel (init_line_sel)
// Flush request
.flush_begin (per_bank_flush_begin[bank_id]),
.flush_uuid (flush_uuid),
.flush_end (per_bank_flush_end[bank_id])
);
if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
end else begin
assign per_bank_mem_req_addr[i] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
if (NUM_BANKS == 1) begin : g_per_bank_mem_req_addr_multibanks
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
end else begin : g_per_bank_mem_req_addr_singlebank
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
end
end
end
// Bank responses gather
wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_rsp_data_in
assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
end
`RESET_RELAY (rsp_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW)
.DATAW (CORE_RSP_DATAW),
.ARBITER ("R")
) rsp_xbar (
.clk (clk),
.reset (rsp_xbar_reset),
.reset (reset),
`UNUSED_PIN (collisions),
.valid_in (per_bank_core_rsp_valid),
.data_in (core_rsp_data_in),
@ -433,38 +481,30 @@ module VX_cache import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_data_s
assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
end
///////////////////////////////////////////////////////////////////////////
wire mem_req_valid_p;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
wire mem_req_rw_p;
wire [WORD_SEL_WIDTH-1:0] mem_req_wsel_p;
wire [WORD_SIZE-1:0] mem_req_byteen_p;
wire [`CS_WORD_WIDTH-1:0] mem_req_data_p;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
wire mem_req_ready_p;
// Memory request arbitration
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH)-1:0] data_in;
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + 1)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_wsel[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i],
per_bank_mem_req_id[i]};
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_data_in
assign data_in[i] = {
per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i],
per_bank_mem_req_tag[i],
per_bank_mem_req_flush[i]
};
end
wire [BANK_MEM_TAG_WIDTH-1:0] bank_mem_req_tag;
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + 1),
.ARBITER ("R")
) mem_req_arb (
.clk (clk),
@ -472,65 +512,27 @@ module VX_cache import VX_gpu_pkg::*; #(
.valid_in (per_bank_mem_req_valid),
.ready_in (per_bank_mem_req_ready),
.data_in (data_in),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_wsel_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p}),
.valid_out (mem_req_valid_p),
.ready_out (mem_req_ready_p),
.data_out ({mem_req_addr, mem_req_rw, mem_req_byteen, mem_req_data, bank_mem_req_tag, mem_req_flush}),
.valid_out (mem_req_valid),
.ready_out (mem_req_ready),
`UNUSED_PIN (sel_out)
);
if (NUM_BANKS > 1) begin
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr_p);
assign mem_req_tag_p = MEM_TAG_WIDTH'({mem_req_bank_id, mem_req_id_p});
end else begin
assign mem_req_tag_p = MEM_TAG_WIDTH'(mem_req_id_p);
end
// Memory request multi-port handling
assign mem_req_valid_s = mem_req_valid_p;
assign mem_req_addr_s = mem_req_addr_p;
assign mem_req_tag_s = mem_req_tag_p;
assign mem_req_ready_p = mem_req_ready_s;
if (WRITE_ENABLE != 0) begin
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] mem_req_byteen_r;
reg [`CS_LINE_WIDTH-1:0] mem_req_data_r;
always @(*) begin
mem_req_byteen_r = '0;
mem_req_data_r = 'x;
mem_req_byteen_r[mem_req_wsel_p * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p;
mem_req_data_r[mem_req_wsel_p * `CS_WORD_WIDTH +: `CS_WORD_WIDTH] = mem_req_data_p;
end
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_r;
assign mem_req_data_s = mem_req_data_r;
end else begin
`UNUSED_VAR (mem_req_wsel_p)
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end
end else begin
`UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_wsel_p)
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
assign mem_req_rw_s = 0;
assign mem_req_byteen_s = {LINE_SIZE{1'b1}};
assign mem_req_data_s = '0;
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr);
assign mem_req_tag = MEM_TAG_WIDTH'({bank_mem_req_tag, mem_req_bank_id});
end else begin : g_mem_req_tag
assign mem_req_tag = MEM_TAG_WIDTH'(bank_mem_req_tag);
end
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
@ -539,16 +541,16 @@ module VX_cache import VX_gpu_pkg::*; #(
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
@ -561,7 +563,7 @@ module VX_cache import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin

View file

@ -41,19 +41,26 @@ module VX_cache_bank #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
// Core response output register
parameter CORE_OUT_REG = 0,
// Memory request output buffer
parameter MEM_OUT_BUF = 0,
// Memory request output register
parameter MEM_OUT_REG = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
parameter MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH,
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
parameter WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS)
) (
@ -69,12 +76,13 @@ module VX_cache_bank #(
// Core Request
input wire core_req_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
input wire core_req_rw,
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel,
input wire [WORD_SIZE-1:0] core_req_byteen,
input wire [`CS_WORD_WIDTH-1:0] core_req_data,
input wire [TAG_WIDTH-1:0] core_req_tag,
input wire [REQ_SEL_WIDTH-1:0] core_req_idx,
input wire core_req_rw, // write enable
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, // select the word in a cacheline, e.g. word size = 4 bytes, cacheline size = 64 bytes, it should have log(64/4)= 4 bits
input wire [WORD_SIZE-1:0] core_req_byteen,// which bytes in data to write
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
input wire core_req_flush, // flush enable
output wire core_req_ready,
// Core Response
@ -88,21 +96,22 @@ module VX_cache_bank #(
output wire mem_req_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire mem_req_rw,
output wire [WORD_SEL_WIDTH-1:0] mem_req_wsel,
output wire [WORD_SIZE-1:0] mem_req_byteen,
output wire [`CS_WORD_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
output wire mem_req_flush,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// initialization
input wire init_enable,
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel
// flush
input wire flush_begin,
input wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
output wire flush_end
);
localparam PIPELINE_STAGES = 2;
@ -113,6 +122,7 @@ module VX_cache_bank #(
wire crsp_queue_stall;
wire mshr_alm_full;
wire mreq_queue_empty;
wire mreq_queue_alm_full;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
@ -128,173 +138,269 @@ module VX_cache_bank #(
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire is_init_st0, is_init_st1;
wire is_flush_st0, is_flush_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_st0, tag_st1;
wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1;
wire rw_sel, rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1;
wire creq_flush_sel, creq_flush_st0, creq_flush_st1;
wire evict_dirty_st0, evict_dirty_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
wire mshr_pending_st0, mshr_pending_st1;
wire mshr_empty;
wire rdw_hazard_st0;
reg rdw_hazard_st1;
wire flush_valid;
wire init_valid;
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
wire [NUM_WAYS-1:0] flush_way;
wire flush_ready;
wire pipe_stall = crsp_queue_stall || rdw_hazard_st1;
// ensure we have no pending memory request in the bank
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
// flush unit
VX_bank_flush #(
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WRITEBACK (WRITEBACK)
) flush_unit (
.clk (clk),
.reset (reset),
.flush_begin (flush_begin),
.flush_end (flush_end),
.flush_init (init_valid),
.flush_valid (flush_valid),
.flush_line (flush_sel),
.flush_way (flush_way),
.flush_ready (flush_ready),
.mshr_empty (mshr_empty),
.bank_empty (no_pending_req)
);
wire rdw_hazard1_sel;
wire rdw_hazard2_sel;
reg rdw_hazard3_st1;
wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1;
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss.
wire replay_grant = ~init_enable;
// flush has precedence over core requests to ensure that the cache is in a consistent state.
wire replay_grant = ~init_valid;
wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~init_enable && ~replay_enable;
wire fill_grant = ~init_valid && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid;
wire creq_grant = ~init_enable && ~replay_enable && ~fill_enable;
wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && flush_valid;
wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
&& ~rdw_hazard_st0
&& ~pipe_stall;
&& ~rdw_hazard1_sel
&& ~pipe_stall;
assign mem_rsp_ready = fill_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign flush_ready = flush_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = init_enable;
wire init_fire = init_valid;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = flush_valid && flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
wire [TAG_WIDTH-1:0] mshr_creq_tag = replay_enable ? replay_tag : core_req_tag;
wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0];
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = mshr_creq_tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
wire [TAG_WIDTH-1:0] mem_rsp_tag_s;
if (TAG_WIDTH > MEM_TAG_WIDTH) begin : g_mem_rsp_tag_s_pad
assign mem_rsp_tag_s = {mem_rsp_tag, (TAG_WIDTH-MEM_TAG_WIDTH)'(1'b0)};
end else begin : g_mem_rsp_tag_s_cut
assign mem_rsp_tag_s = mem_rsp_tag[MEM_TAG_WIDTH-1 -: TAG_WIDTH];
`UNUSED_VAR (mem_rsp_tag)
end
`UNUSED_VAR (mshr_creq_tag)
wire [TAG_WIDTH-1:0] flush_tag;
if (UUID_WIDTH != 0) begin : g_flush_tag_uuid
assign flush_tag = {flush_uuid, (TAG_WIDTH-UUID_WIDTH)'(1'b0)};
end else begin : g_flush_tag_0
`UNUSED_VAR (flush_uuid)
assign flush_tag = '0;
end
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || core_req_fire;
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) :
(replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : core_req_tag));
assign creq_flush_sel = core_req_valid && core_req_flush;
assign addr_sel = init_enable ? `CS_LINE_ADDR_WIDTH'(init_line_sel) :
(replay_valid ? replay_addr :
(mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign data_sel[`CS_WORD_WIDTH-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : (replay_valid ? replay_data : core_req_data);
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i];
if (WRITE_ENABLE) begin : g_data_sel
for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i
if (i < `CS_WORD_WIDTH) begin : g_lo
assign data_sel[i] = replay_valid ? replay_data[i] : (mem_rsp_valid ? mem_rsp_data[i] : core_req_data[i]);
end else begin : g_hi
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
end
end else begin : g_data_sel_ro
assign data_sel = mem_rsp_data;
`UNUSED_VAR (core_req_data)
`UNUSED_VAR (replay_data)
end
if (UUID_WIDTH != 0) begin : g_req_uuid_sel
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_req_uuid_sel_0
assign req_uuid_sel = '0;
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({
valid_sel,
init_enable,
replay_enable,
fill_enable,
creq_enable,
addr_sel,
data_sel,
replay_valid ? replay_rw : core_req_rw,
replay_valid ? replay_byteen : core_req_byteen,
replay_valid ? replay_wsel : core_req_wsel,
replay_valid ? replay_idx : core_req_idx,
replay_valid ? replay_tag : core_req_tag,
replay_id
}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_creq_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
if (UUID_WIDTH != 0) begin : g_req_uuid_st0
assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_st0 = 0;
end else begin : g_req_uuid_st0_0
assign req_uuid_st0 = '0;
end
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0);
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0;
wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0;
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
`RESET_RELAY (tag_reset, reset);
wire [NUM_WAYS-1:0] evict_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
VX_cache_tags #(
.INSTANCE_ID(INSTANCE_ID),
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH)
) cache_tags (
.clk (clk),
.reset (tag_reset),
.reset (reset),
.req_uuid (req_uuid_st0),
.stall (pipe_stall),
// read/Fill
// init/flush/fill/write/lookup
.init (do_init_st0),
.flush (do_flush_st0),
.fill (do_fill_st0),
.write (do_cache_wr_st0),
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.fill (do_fill_st0),
.init (do_init_st0),
.way_sel (way_sel_st0),
.tag_matches(tag_matches_st0)
.way_sel (flush_way_st0),
.tag_matches(tag_matches_st0),
// replacement
.evict_dirty(evict_dirty_st0),
.evict_way (evict_way_st0),
.evict_tag (evict_tag_st0)
);
wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0;
wire is_flush2_st0 = WRITEBACK && is_flush_st0;
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0;
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_replay_st0, is_fill_st0, is_creq_st0, rw_st0, addr_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_replay_st1, is_fill_st1, is_creq_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1})
.data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
);
// we have a tag hit
wire is_hit_st1 = (| tag_matches_st1);
wire is_hit_st1 = (| way_sel_st1);
if (UUID_WIDTH != 0) begin
if (UUID_WIDTH != 0) begin : g_req_uuid_st1
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_st1 = 0;
end else begin : g_req_uuid_st1_0
assign req_uuid_st1 = '0;
end
wire do_creq_rd_st1 = valid_st1 && is_creq_st1 && ~rw_st1;
wire do_creq_wr_st1 = valid_st1 && is_creq_st1 && rw_st1;
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_init_st1 = valid_st1 && is_init_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
@ -304,25 +410,46 @@ module VX_cache_bank #(
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0];
`UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always get a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay"));
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("%t: missed mshr replay", $time))
// detect BRAM's read-during-write hazard
assign rdw_hazard_st0 = do_fill_st0; // after a fill
// both tag and data stores use BRAM with no read-during-write protection.
// we ned to stall the pipeline to prevent read-after-write hazards.
assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill
assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceeding write
always @(posedge clk) begin
rdw_hazard_st1 <= (do_creq_rd_st0 && do_write_hit_st1 && (addr_st0 == addr_st1))
&& ~rdw_hazard_st1; // after a write to same address
// stall reads following writes to same line address
rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1)
&& ~rdw_hazard3_st1; // release pipeline stall
end
wire [`CS_WORD_WIDTH-1:0] write_data_st1 = data_st1[`CS_WORD_WIDTH-1:0];
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
wire [LINE_SIZE-1:0] write_byteen_st1;
`RESET_RELAY (data_reset, reset);
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
if (`CS_WORDS_PER_LINE > 1) begin : g_write_byteen_st1_wsel
reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen_w;
always @(*) begin
write_byteen_w = '0;
write_byteen_w[wsel_st1] = byteen_st1;
end
assign write_byteen_st1 = write_byteen_w;
end else begin : g_write_byteen_st1
assign write_byteen_st1 = byteen_st1;
end
VX_cache_data #(
.INSTANCE_ID (INSTANCE_ID),
.INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -330,32 +457,49 @@ module VX_cache_bank #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH)
) cache_data (
.clk (clk),
.reset (data_reset),
.reset (reset),
.req_uuid (req_uuid_st1),
.stall (pipe_stall),
.read (do_read_hit_st1 || do_replay_rd_st1),
.init (do_init_st1),
.read (do_cache_rd_st1),
.fill (do_fill_st1),
.write (do_write_hit_st1 || do_replay_wr_st1),
.way_sel (way_sel_st1 | tag_matches_st1),
.flush (do_flush_st1),
.write (do_cache_wr_st1),
.way_sel (way_sel_st1),
.line_addr (addr_st1),
.wsel (wsel_st1),
.byteen (byteen_st1),
.fill_data (fill_data_st1),
.write_data (write_data_st1),
.read_data (read_data_st1)
.write_byteen(write_byteen_st1),
.read_data (read_data_st1),
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
);
wire [MSHR_SIZE-1:0] mshr_matches_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0;
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
wire mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
// release allocated mshr entry if we had a hit
wire mshr_release_st1;
if (WRITEBACK) begin : g_mshr_release_st1
assign mshr_release_st1 = is_hit_st1;
end else begin : g_mshr_release_st1_ro
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
// this can happen when writes are sent late, when the fill was already in flight.
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
end
VX_pending_size #(
.SIZE (MSHR_SIZE)
@ -364,15 +508,15 @@ module VX_cache_bank #(
.reset (reset),
.incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
.empty (mshr_empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
`RESET_RELAY (mshr_reset, reset);
VX_cache_mshr #(
.INSTANCE_ID (INSTANCE_ID),
.INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
@ -381,7 +525,7 @@ module VX_cache_bank #(
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
) cache_mshr (
.clk (clk),
.reset (mshr_reset),
.reset (reset),
.deq_req_uuid (req_uuid_sel),
.lkp_req_uuid (req_uuid_st0),
@ -412,7 +556,8 @@ module VX_cache_bank #(
// lookup
.lookup_valid (mshr_lookup_st0),
.lookup_addr (addr_st0),
.lookup_matches (mshr_matches_st0),
.lookup_pending (mshr_lookup_pending_st0),
.lookup_rw (mshr_lookup_rw_st0),
// finalize
.finalize_valid (mshr_finalize_st1),
@ -422,10 +567,12 @@ module VX_cache_bank #(
.finalize_prev (mshr_prev_st1)
);
// ignore allocated id from mshr matches
// check if there are pending requests to same line in the MSHR
wire [MSHR_SIZE-1:0] lookup_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = (i != mshr_alloc_id_st0) && mshr_matches_st0[i];
for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_lookup_matches
assign lookup_matches[i] = mshr_lookup_pending_st0[i]
&& (i != mshr_alloc_id_st0) // exclude current mshr id
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
end
assign mshr_pending_st0 = (| lookup_matches);
@ -436,21 +583,19 @@ module VX_cache_bank #(
wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
wire [TAG_WIDTH-1:0] crsp_queue_tag;
assign crsp_queue_valid = do_read_hit_st1 || do_replay_rd_st1;
assign crsp_queue_valid = do_cache_rd_st1;
assign crsp_queue_idx = req_idx_st1;
assign crsp_queue_data = read_data_st1;
assign crsp_queue_tag = tag_st1;
`RESET_RELAY (crsp_queue_reset, reset);
VX_elastic_buffer #(
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
.SIZE (CRSQ_SIZE),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
.OUT_REG (CORE_OUT_REG)
) core_rsp_queue (
.clk (clk),
.reset (crsp_queue_reset),
.valid_in (crsp_queue_valid && ~rdw_hazard_st1),
.reset (reset),
.valid_in (crsp_queue_valid && ~rdw_hazard3_st1),
.ready_in (crsp_queue_ready),
.data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
@ -462,40 +607,77 @@ module VX_cache_bank #(
// schedule memory request
wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty;
wire [`CS_WORD_WIDTH-1:0] mreq_queue_data;
wire [WORD_SIZE-1:0] mreq_queue_byteen;
wire [WORD_SEL_WIDTH-1:0] mreq_queue_wsel;
wire mreq_queue_push, mreq_queue_pop;
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag;
wire mreq_queue_rw;
wire mreq_queue_flush;
assign mreq_queue_push = (do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1;
wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1;
wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1;
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
if (WRITEBACK) begin : g_mreq_queue_push
if (DIRTY_BYTES) begin : g_dirty_bytes
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| dirty_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("%t: missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", $time, evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID)))
end
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~rdw_hazard3_st1;
end else begin : g_mreq_queue_push_ro
`UNUSED_VAR (do_writeback_st1)
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1)
&& ~rdw_hazard3_st1;
end
assign mreq_queue_rw = WRITE_ENABLE && rw_st1;
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_wsel = wsel_st1;
assign mreq_queue_byteen = byteen_st1;
assign mreq_queue_data = write_data_st1;
assign mreq_queue_flush = creq_flush_st1;
`RESET_RELAY (mreq_queue_reset, reset);
if (WRITE_ENABLE) begin : g_mreq_queue
if (WRITEBACK) begin : g_writeback
assign mreq_queue_rw = is_fill_or_flush_st1;
assign mreq_queue_data = dirty_data_st1;
assign mreq_queue_byteen = is_fill_or_flush_st1 ? dirty_byteen_st1 : '1;
end else begin : g_writethrough
assign mreq_queue_rw = rw_st1;
assign mreq_queue_data = write_data_st1;
assign mreq_queue_byteen = rw_st1 ? write_byteen_st1 : '1;
`UNUSED_VAR (is_fill_or_flush_st1)
`UNUSED_VAR (dirty_data_st1)
`UNUSED_VAR (dirty_byteen_st1)
end
end else begin : g_mreq_queue_ro
assign mreq_queue_rw = 0;
assign mreq_queue_data = '0;
assign mreq_queue_byteen = '1;
`UNUSED_VAR (dirty_data_st1)
`UNUSED_VAR (dirty_byteen_st1)
end
if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid
assign mreq_queue_tag = {req_uuid_st1, mshr_id_st1};
end else begin : g_mreq_queue_tag
assign mreq_queue_tag = mshr_id_st1;
end
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH),
.DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
.OUT_REG (MEM_OUT_REG)
) mem_req_queue (
.clk (clk),
.reset (mreq_queue_reset),
.reset (reset),
.push (mreq_queue_push),
.pop (mreq_queue_pop),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_wsel, mreq_queue_data}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_flush}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_flush}),
.empty (mreq_queue_empty),
.alm_full (mreq_queue_alm_full),
`UNUSED_PIN (full),
@ -515,35 +697,36 @@ module VX_cache_bank #(
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire);
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
always @(posedge clk) begin
if (pipeline_stall) begin
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full));
end
if (init_enable) begin
`TRACE(2, ("%d: %s-bank%0d init: addr=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
if (input_stall || pipe_stall) begin
`TRACE(3, ("%t: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1))
end
if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s-bank%0d fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
`TRACE(2, ("%t: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data, req_uuid_sel))
end
if (replay_fire) begin
`TRACE(2, ("%d: %s-bank%0d mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
`TRACE(2, ("%t: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel))
end
if (core_req_fire) begin
if (core_req_rw)
`TRACE(2, ("%d: %s-bank%0d core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else
`TRACE(2, ("%d: %s-bank%0d core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
if (core_req_rw) begin
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel))
end else begin
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel))
end
end
if (crsp_queue_fire) begin
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
`TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1))
end
if (mreq_queue_push) begin
if (do_creq_wr_st1)
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
if (do_creq_wr_st1 && !WRITEBACK) begin
`TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
end else if (do_writeback_st1) begin
`TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
end else begin
`TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mshr_id_st1, req_uuid_st1))
end
end
end
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -18,16 +18,16 @@ module VX_cache_bypass #(
parameter TAG_SEL_IDX = 0,
parameter PASSTHRU = 0,
parameter NC_ENABLE = 0,
parameter NC_ENABLE = 0,
parameter WORD_SIZE = 1,
parameter LINE_SIZE = 1,
parameter LINE_SIZE = 1,
parameter CORE_ADDR_WIDTH = 1,
parameter CORE_TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
@ -35,9 +35,9 @@ module VX_cache_bypass #(
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0,
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
) (
) (
input wire clk,
input wire reset,
@ -56,7 +56,8 @@ module VX_cache_bypass #(
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
@ -71,40 +72,39 @@ module VX_cache_bypass #(
wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_valids;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [REQ_SEL_WIDTH-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU != 0) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_nc
if (PASSTHRU != 0) begin : g_passthru
assign core_req_nc_idxs[i] = 1'b1;
end else if (NC_ENABLE) begin
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO];
end else begin
end else if (NC_ENABLE) begin : g_nc
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
end else begin : g_no_nc
assign core_req_nc_idxs[i] = 1'b0;
end
end
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
end
end
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P"),
.LOCK_ENABLE (1)
.TYPE (PASSTHRU ? "R" : "P")
) core_req_nc_arb (
.clk (clk),
.reset (reset),
.requests (core_req_nc_valids),
.reset (reset),
.requests (core_req_nc_valids),
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid),
.grant_unlock (core_req_nc_ready)
.grant_ready (core_req_nc_ready)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_if
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
: core_bus_out_if[i].req_ready;
end
@ -114,37 +114,37 @@ module VX_cache_bypass #(
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype;
wire [`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_out_flags;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
wire core_req_nc_sel_rw;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype;
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_sel_flags;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_nc_mux_in
assign core_req_nc_mux_in[i] = {
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.atype,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.tag
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.flags,
core_bus_in_if[i].req_data.tag
};
end
assign {
core_req_nc_sel_rw,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_atype,
core_req_nc_sel_data,
core_req_nc_sel_tag
core_req_nc_sel_byteen,
core_req_nc_sel_flags,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
@ -152,84 +152,82 @@ module VX_cache_bypass #(
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype;
assign mem_req_out_flags = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.flags : core_req_nc_sel_flags;
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
if (WORDS_PER_LINE > 1) begin
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
if (WORDS_PER_LINE > 1) begin : g_mem_req_multi_word_line
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_w;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_w;
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
always @(*) begin
mem_req_byteen_in_r = '0;
mem_req_byteen_in_r[req_wsel] = core_req_nc_sel_byteen;
mem_req_byteen_in_w = '0;
mem_req_byteen_in_w[req_wsel] = core_req_nc_sel_byteen;
mem_req_data_in_r = 'x;
mem_req_data_in_r[req_wsel] = core_req_nc_sel_data;
mem_req_data_in_w = 'x;
mem_req_data_in_w[req_wsel] = core_req_nc_sel_data;
end
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
if (NUM_REQS > 1) begin
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_w;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_w;
if (NUM_REQS > 1) begin : g_multiple_requests
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin
end else begin : g_single_request
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
end
end else begin
end else begin : g_mem_req_single_word_line
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
if (NUM_REQS > 1) begin
if (NUM_REQS > 1) begin : g_multiple_requests
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin
end else begin : g_single_request
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
end
end
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin
if (UUID_WIDTH != 0) begin : g_mem_req_tag_bypass_with_uuid
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
end else begin
end else begin : g_mem_req_tag_bypass
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end
if (PASSTHRU != 0) begin
if (PASSTHRU != 0) begin : g_mem_req_out_tag_passthru
assign mem_req_out_tag = mem_req_tag_bypass;
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
end else begin
if (NC_ENABLE) begin
VX_bits_insert #(
.N (MEM_TAG_OUT_WIDTH-1),
.S (1),
.POS (TAG_SEL_IDX)
) mem_req_tag_in_nc_insert (
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
.ins_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_out_tag)
);
end else begin
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
end
end
end else if (NC_ENABLE) begin : g_mem_req_out_tag_nc
VX_bits_insert #(
.N (MEM_TAG_OUT_WIDTH-1),
.S (1),
.POS (TAG_SEL_IDX)
) mem_req_tag_in_nc_insert (
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
.ins_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_out_tag)
);
end else begin : g_mem_req_out_tag
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
end
assign mem_bus_in_if.req_ready = mem_req_out_ready;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_flags, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.flags, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.ready_out (mem_bus_out_if.req_ready)
);
@ -241,19 +239,17 @@ module VX_cache_bypass #(
wire [NUM_REQS-1:0] core_rsp_in_ready;
wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin
if (PASSTHRU != 0) begin : g_is_mem_rsp_nc_passthru
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
end else begin
if (NC_ENABLE) begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin
assign is_mem_rsp_nc = 1'b0;
end
end else if (NC_ENABLE) begin : g_is_mem_rsp_nc
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin : g_is_no_mem_rsp_nc
assign is_mem_rsp_nc = 1'b0;
end
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
VX_bits_remove #(
VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (TAG_SEL_IDX)
@ -262,57 +258,52 @@ module VX_cache_bypass #(
.data_out (mem_rsp_tag_id_nc)
);
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
if (NUM_REQS > 1) begin
wire [REQ_SEL_WIDTH-1:0] rsp_idx;
if (NUM_REQS > 1) begin : g_rsp_idx
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin
end else begin : g_rsp_idx_0
assign rsp_idx = 1'b0;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_valid
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || (is_mem_rsp_nc && rsp_idx == REQ_SEL_WIDTH'(i));
end
reg [NUM_REQS-1:0] rsp_nc_valid_r;
always @(*) begin
rsp_nc_valid_r = '0;
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_ready
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || rsp_nc_valid_r[i];
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
end
if (WORDS_PER_LINE > 1) begin
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_data
if (WORDS_PER_LINE > 1) begin : g_wsel
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
end else begin : g_no_wsel
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
end
end
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
if (UUID_WIDTH != 0) begin
if (UUID_WIDTH != 0) begin : g_mem_rsp_tag_in_nc2_uuid
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
end else begin
end else begin : g_mem_rsp_tag_in_nc2
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_in_tag
if (PASSTHRU) begin : g_passthru
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
end else if (NC_ENABLE) begin
end else if (NC_ENABLE) begin : g_nc
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
end else begin
end else begin : g_no_nc
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
end
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.SIZE (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
@ -320,7 +311,7 @@ module VX_cache_bypass #(
.valid_in (core_rsp_in_valid[i]),
.ready_in (core_rsp_in_ready[i]),
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
.valid_out (core_bus_in_if[i].rsp_valid),
.ready_out (core_bus_in_if[i].rsp_ready)
);
@ -328,22 +319,22 @@ module VX_cache_bypass #(
// handle memory responses ////////////////////////////////////////////////
if (PASSTHRU != 0) begin
if (PASSTHRU != 0) begin : g_mem_bus_in_if_passthru
assign mem_bus_in_if.rsp_valid = 1'b0;
assign mem_bus_in_if.rsp_data.data = '0;
assign mem_bus_in_if.rsp_data.tag = '0;
end else if (NC_ENABLE) begin
end else if (NC_ENABLE) begin : g_mem_bus_in_if_nc
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
end else begin
end else begin : g_mem_bus_in_if
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
end
wire [NUM_REQS-1:0] core_rsp_out_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_out_valid
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,20 +24,20 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -46,6 +46,12 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -60,7 +66,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Memory request output buffer
parameter MEM_OUT_BUF = 0
) (
) (
input wire clk,
input wire reset,
@ -74,17 +80,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
);
localparam NUM_CACHES = `UP(NUM_UNITS);
localparam PASSTHRU = (NUM_UNITS == 0);
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH, UUID_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH));
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
`ifdef PERF_ENABLE
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
assign cache_perf = perf_cache_tmp[0];
cache_perf_t perf_cache_unit[NUM_CACHES];
`PERF_CACHE_ADD (cache_perf, perf_cache_unit, NUM_CACHES)
`endif
VX_mem_bus_if #(
@ -97,9 +102,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
`RESET_RELAY (arb_reset, reset);
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_arb
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
@ -110,7 +113,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_tmp_if[NUM_CACHES]();
for (genvar j = 0; j < NUM_INPUTS; ++j) begin
for (genvar j = 0; j < NUM_INPUTS; ++j) begin : g_core_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
@ -122,23 +125,20 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb (
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? CORE_OUT_BUF : 0)
) core_arb (
.clk (clk),
.reset (arb_reset),
.reset (reset),
.bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if)
);
for (genvar k = 0; k < NUM_CACHES; ++k) begin
for (genvar k = 0; k < NUM_CACHES; ++k) begin : g_arb_core_bus_if
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_if[k * NUM_REQS + i], arb_core_bus_tmp_if[k]);
end
end
`RESET_RELAY (cache_reset, reset);
for (genvar i = 0; i < NUM_CACHES; ++i) begin
for (genvar i = 0; i < NUM_CACHES; ++i) begin : g_cache_wrap
VX_cache_wrap #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)),
.CACHE_SIZE (CACHE_SIZE),
@ -152,6 +152,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
@ -164,7 +166,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.cache_perf (perf_cache_unit[i]),
`endif
.clk (clk),
.reset (cache_reset),
.reset (reset),
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
.mem_bus_if (cache_mem_bus_if[i])
);
@ -181,7 +183,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
@ -190,6 +192,10 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.bus_out_if (mem_bus_tmp_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if[0]);
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,17 +17,21 @@ module VX_cache_data #(
parameter `STRING INSTANCE_ID= "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -40,62 +44,105 @@ module VX_cache_data #(
input wire stall,
input wire init,
input wire read,
input wire fill,
input wire fill,
input wire flush,
input wire write,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
input wire [WORD_SIZE-1:0] byteen,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
input wire [`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (init)
`UNUSED_VAR (read)
`UNUSED_VAR (flush)
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata;
wire [BYTEENW-1:0] wren;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
reg [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] wdata_r;
reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
always @(*) begin
wdata_r = {`CS_WORDS_PER_LINE{write_data}};
wren_r = '0;
wren_r[wsel] = byteen;
end
// order the data layout to perform ways multiplexing last
// this allows performing onehot encoding of the way index in parallel with BRAM read.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = fill ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{wdata_r[i]}};
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign wren_w[i][j] = (fill ? {WORD_SIZE{1'b1}} : wren_r[i])
& {WORD_SIZE{((NUM_WAYS == 1) || way_sel[j])}};
end
end
assign wren = wren_w;
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (byteen)
`UNUSED_VAR (write_data)
assign wdata = fill_data;
assign wren = fill;
end
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
VX_onehot_encoder #(
if (WRITEBACK) begin : g_dirty_data
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] transposed_rdata;
VX_transpose #(
.DATAW (`CS_WORD_WIDTH),
.N (`CS_WORDS_PER_LINE),
.M (NUM_WAYS)
) transpose (
.data_in (line_rdata),
.data_out (transposed_rdata)
);
assign dirty_data = transposed_rdata[way_idx];
end else begin : g_dirty_data_0
assign dirty_data = '0;
end
if (DIRTY_BYTES) begin : g_dirty_byteen
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata;
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata;
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_bs_wdata
wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]);
assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]);
end
VX_sp_ram #(
.DATAW (LINE_SIZE * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK)
) byteen_store (
.clk (clk),
.reset (reset),
.read (write || fill || flush),
.write (init || write || fill || flush),
.wren (1'b1),
.addr (line_sel),
.wdata (bs_wdata),
.rdata (bs_rdata)
);
assign dirty_byteen = bs_rdata[way_idx];
end else begin : g_dirty_byteen_0
assign dirty_byteen = '1;
end
// order the data layout to perform ways multiplexing last.
// this allows converting way index to binary in parallel with BRAM readaccess and way selection.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [BYTEENW-1:0] line_wren;
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin : g_line_wdata
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_i
for (genvar j = 0; j < NUM_WAYS; ++j) begin : g_j
assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i];
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end
end
assign line_wren = wren_w;
end else begin : g_line_wdata_ro
`UNUSED_VAR (write)
`UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data)
assign line_wdata = fill_data;
assign line_wren = fill;
end
VX_encoder #(
.N (NUM_WAYS)
) way_enc (
.data_in (way_sel),
@ -103,50 +150,52 @@ module VX_cache_data #(
`UNUSED_PIN (valid_out)
);
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata;
wire line_read = (read && ~stall)
|| (WRITEBACK && (fill || flush));
wire line_write = write || fill;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK),
.WRENW (BYTEENW),
.NO_RWCHECK (1)
.NO_RWCHECK (1),
.RW_ASSERT (1)
) data_store (
.clk (clk),
.read (1'b1),
.write (write || fill),
.wren (wren),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),
.addr (line_sel),
.wdata (wdata),
.rdata (rdata)
.wdata (line_wdata),
.rdata (line_rdata)
);
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = rdata[wsel];
end else begin
if (`CS_WORDS_PER_LINE > 1) begin : g_per_way_rdata_wsel
assign per_way_rdata = line_rdata[wsel];
end else begin : g_per_way_rdata
`UNUSED_VAR (wsel)
assign per_way_rdata = rdata;
end
assign per_way_rdata = line_rdata;
end
assign read_data = per_way_rdata[way_idx];
`UNUSED_VAR (stall)
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
`TRACE(3, ("%t: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data))
end
if (flush && ~stall) begin
`TRACE(3, ("%t: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=0x%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data))
end
if (read && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-read: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, read_data, req_uuid));
end
`TRACE(3, ("%t: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid))
end
if (write && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, byteen, write_data, req_uuid));
end
end
`TRACE(3, ("%t: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid))
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`ifndef VX_CACHE_DEFINE_VH
`define VX_CACHE_DEFINE_VH
`include "VX_define.vh"
`include "VX_define.vh"
`define CS_REQ_SEL_BITS `CLOG2(NUM_REQS)
@ -50,28 +50,27 @@
`define CS_TAG_SEL_ADDR_START (1+`CS_LINE_SEL_ADDR_END)
`define CS_TAG_SEL_ADDR_END (`CS_WORD_ADDR_WIDTH-1)
`define CS_LINE_TAG_ADDR(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
`define CS_LINE_ADDR_TAG(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
///////////////////////////////////////////////////////////////////////////////
`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)}
`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS]
`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
`define CS_MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `CS_BANK_SEL_BITS]
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1))
`define PERF_CACHE_ADD(dst, src, count) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
`endif // VX_CACHE_DEFINE_VH

188
hw/rtl/cache/VX_cache_flush.sv vendored Normal file
View file

@ -0,0 +1,188 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_flush #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of banks
parameter NUM_BANKS = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Bank select latency
parameter BANK_SEL_LATENCY = 1
) (
input wire clk,
input wire reset,
VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
input wire [NUM_BANKS-1:0] bank_req_fire,
output wire [NUM_BANKS-1:0] flush_begin,
output wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
input wire [NUM_BANKS-1:0] flush_end
);
localparam STATE_IDLE = 0;
localparam STATE_WAIT1 = 1;
localparam STATE_FLUSH = 2;
localparam STATE_WAIT2 = 3;
localparam STATE_DONE = 4;
reg [2:0] state, state_n;
// track in-flight core requests
wire no_inflight_reqs;
if (BANK_SEL_LATENCY != 0) begin : g_bank_sel_latency
localparam NUM_REQS_W = `CLOG2(NUM_REQS+1);
localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);
wire [NUM_REQS-1:0] core_bus_out_fire;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_fire
assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready;
end
wire [NUM_REQS_W-1:0] core_bus_out_cnt;
wire [NUM_BANKS_W-1:0] bank_req_cnt;
`POP_COUNT(core_bus_out_cnt, core_bus_out_fire);
`POP_COUNT(bank_req_cnt, bank_req_fire);
`UNUSED_VAR (core_bus_out_cnt)
VX_pending_size #(
.SIZE (BANK_SEL_LATENCY * NUM_BANKS),
.INCRW (NUM_BANKS_W),
.DECRW (NUM_BANKS_W)
) pending_size (
.clk (clk),
.reset (reset),
.incr (NUM_BANKS_W'(core_bus_out_cnt)),
.decr (bank_req_cnt),
.empty (no_inflight_reqs),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end else begin : g_no_bank_sel_latency
assign no_inflight_reqs = 0;
`UNUSED_VAR (bank_req_fire)
end
reg [NUM_BANKS-1:0] flush_done, flush_done_n;
wire [NUM_REQS-1:0] flush_req_mask;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_flush_req_mask
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH];
end
wire flush_req_enable = (| flush_req_mask);
reg [NUM_REQS-1:0] lock_released, lock_released_n;
reg [`UP(UUID_WIDTH)-1:0] flush_uuid_r, flush_uuid_n;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_req
wire input_enable = ~flush_req_enable || lock_released[i];
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable;
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_bus_out_if[i].req_ready && input_enable;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_rsp
assign core_bus_in_if[i].rsp_valid = core_bus_out_if[i].rsp_valid;
assign core_bus_in_if[i].rsp_data = core_bus_out_if[i].rsp_data;
assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready;
end
reg [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] core_bus_out_uuid;
wire [NUM_REQS-1:0] core_bus_out_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_uuid
if (UUID_WIDTH != 0) begin : g_uuid
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_no_uuid
assign core_bus_out_uuid[i] = 0;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_ready
assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready;
end
always @(*) begin
state_n = state;
flush_done_n = flush_done;
lock_released_n = lock_released;
flush_uuid_n = flush_uuid_r;
case (state)
STATE_IDLE: begin
if (flush_req_enable) begin
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (flush_req_mask[i]) begin
flush_uuid_n = core_bus_out_uuid[i];
end
end
end
end
STATE_WAIT1: begin
if (no_inflight_reqs) begin
state_n = STATE_FLUSH;
end
end
STATE_FLUSH: begin
// generate a flush request pulse
state_n = STATE_WAIT2;
end
STATE_WAIT2: begin
// wait for all banks to finish flushing
flush_done_n = flush_done | flush_end;
if (flush_done_n == {NUM_BANKS{1'b1}}) begin
state_n = STATE_DONE;
flush_done_n = '0;
// only release current flush requests
// and keep normal requests locked
lock_released_n = flush_req_mask;
end
end
STATE_DONE: begin
// wait until released flush requests are issued
// when returning to IDLE state other requests will unlock
lock_released_n = lock_released & ~core_bus_out_ready;
if (lock_released_n == 0) begin
state_n = STATE_IDLE;
end
end
endcase
end
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
flush_done <= '0;
lock_released <= '0;
end else begin
state <= state_n;
flush_done <= flush_done_n;
lock_released <= lock_released_n;
end
flush_uuid_r <= flush_uuid_n;
end
assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}};
assign flush_uuid = flush_uuid_r;
endmodule

View file

@ -1,51 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_init #(
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1
) (
input wire clk,
input wire reset,
output wire [`CS_LINE_SEL_BITS-1:0] addr_out,
output wire valid_out
);
reg enabled;
reg [`CS_LINE_SEL_BITS-1:0] line_ctr;
always @(posedge clk) begin
if (reset) begin
enabled <= 1;
line_ctr <= '0;
end else begin
if (enabled) begin
if (line_ctr == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
enabled <= 0;
end
line_ctr <= line_ctr + `CS_LINE_SEL_BITS'(1);
end
end
end
assign addr_out = line_ctr;
assign valid_out = enabled;
endmodule

View file

@ -104,7 +104,8 @@ module VX_cache_mshr #(
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_matches,
output wire [MSHR_SIZE-1:0] lookup_pending,
output wire [MSHR_SIZE-1:0] lookup_rw,
// finalize
input wire finalize_valid,
@ -134,7 +135,7 @@ module VX_cache_mshr #(
wire dequeue_fire = dequeue_valid && dequeue_ready;
wire [MSHR_SIZE-1:0] addr_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_addr_matches
assign addr_matches[i] = valid_table[i] && (addr_table[i] == lookup_addr);
end
@ -147,7 +148,7 @@ module VX_cache_mshr #(
.valid_out (allocate_rdy_n)
);
VX_onehot_encoder #(
VX_encoder #(
.N (MSHR_SIZE)
) prev_sel (
.data_in (addr_matches & ~next_table_x),
@ -216,13 +217,13 @@ module VX_cache_mshr #(
next_table <= next_table_n;
end
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s-bank%0d inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s-bank%0d invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s-bank%0d invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
VX_dp_ram #(
@ -231,9 +232,10 @@ module VX_cache_mshr #(
.LUTRAM (1)
) entries (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (allocate_valid),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (allocate_id_r),
.wdata (allocate_data),
.raddr (dequeue_id_r),
@ -251,7 +253,9 @@ module VX_cache_mshr #(
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
assign lookup_matches = addr_matches & ~write_table;
// return pending entries for the given cache line
assign lookup_pending = addr_matches;
assign lookup_rw = write_table;
`UNUSED_VAR (lookup_valid)
@ -263,35 +267,42 @@ module VX_cache_mshr #(
end else begin
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
end
if (allocate_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid));
if (lookup_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_matches, lkp_req_uuid));
if (finalize_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid));
if (fill_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
if (dequeue_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
if (allocate_fire) begin
`TRACE(3, ("%t: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid))
end
if (lookup_valid) begin
`TRACE(3, ("%t: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid))
end
if (finalize_valid) begin
`TRACE(3, ("%t: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid))
end
if (fill_valid) begin
`TRACE(3, ("%t: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id))
end
if (dequeue_fire) begin
`TRACE(3, ("%t: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid))
end
if (show_table) begin
`TRACE(3, ("%d: %s-bank%0d mshr-table", $time, INSTANCE_ID, BANK_ID));
`TRACE(3, ("%t: %s table", $time, INSTANCE_ID))
for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));
if (write_table[i])
`TRACE(3, ("(w)"));
else
`TRACE(3, ("(r)"));
if (next_table[i])
`TRACE(3, ("->%0d", next_index[i]));
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)))
if (write_table[i]) begin
`TRACE(3, ("(w)"))
end else begin
`TRACE(3, ("(r)"))
end
if (next_table[i]) begin
`TRACE(3, ("->%0d", next_index[i]))
end
end
end
`TRACE(3, ("\n"));
`TRACE(3, ("\n"))
end
end
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,15 +17,17 @@ module VX_cache_tags #(
parameter `STRING INSTANCE_ID = "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
parameter WORD_SIZE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -38,79 +40,139 @@ module VX_cache_tags #(
input wire stall,
// read/fill
// init/fill/lookup
input wire init,
input wire flush,
input wire fill,
input wire write,
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire fill,
input wire init,
output wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches
input wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches,
// eviction
output wire evict_dirty,
output wire [NUM_WAYS-1:0] evict_way,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS;
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_TAG_ADDR(line_addr);
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] repl_way;
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
wire [NUM_WAYS-1:0] read_dirty;
if (NUM_WAYS > 1) begin : g_evict_way
reg [NUM_WAYS-1:0] evict_way_r;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
repl_way <= 1;
end else if (~stall) begin // hold the value on stalls prevent filling different slots twice
repl_way <= {repl_way[NUM_WAYS-2:0], repl_way[NUM_WAYS-1]};
evict_way_r <= 1;
end else if (~stall) begin // holding the value on stalls prevents filling different slots twice
evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]};
end
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign way_sel[i] = fill && repl_way[i];
end
end else begin
assign evict_way = fill ? evict_way_r : way_sel;
VX_onehot_mux #(
.DATAW (`CS_TAG_SEL_BITS),
.N (NUM_WAYS)
) evict_tag_sel (
.data_in (read_tag),
.sel_in (evict_way),
.data_out (evict_tag)
);
end else begin : g_evict_way_0
`UNUSED_VAR (stall)
assign way_sel = fill;
assign evict_way = 1'b1;
assign evict_tag = read_tag;
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [`CS_TAG_SEL_BITS-1:0] read_tag;
wire read_valid;
// fill and flush need to also read in writeback mode
wire fill_s = fill && (!WRITEBACK || ~stall);
wire flush_s = flush && (!WRITEBACK || ~stall);
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_store
wire do_fill = fill_s && evict_way[i];
wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i];
wire line_read = (WRITEBACK && (fill_s || flush_s));
wire line_write = init || do_fill || do_flush || do_write;
wire line_valid = ~(init || flush);
wire [TAG_WIDTH-1:0] line_wdata;
wire [TAG_WIDTH-1:0] line_rdata;
if (WRITEBACK) begin : g_writeback
assign line_wdata = {line_valid, write, line_tag};
assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata;
end else begin : g_writethrough
assign line_wdata = {line_valid, line_tag};
assign {read_valid[i], read_tag[i]} = line_rdata;
assign read_dirty[i] = 1'b0;
end
VX_sp_ram #(
.DATAW (TAG_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.NO_RWCHECK (1)
.NO_RWCHECK (1),
.RW_ASSERT (1)
) tag_store (
.clk (clk),
.read (1'b1),
.write (way_sel[i] || init),
`UNUSED_PIN (wren),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (1'b1),
.addr (line_sel),
.wdata ({~init, line_tag}),
.rdata ({read_valid, read_tag})
.wdata (line_wdata),
.rdata (line_rdata)
);
assign tag_matches[i] = read_valid && (line_tag == read_tag);
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_matches
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
assign evict_dirty = | (read_dirty & evict_way);
`ifdef DBG_TRACE_CACHE
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel};
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d tag-fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag));
`TRACE(3, ("%t: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID)))
end
if (init) begin
`TRACE(3, ("%d: %s-bank%0d tag-init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
`TRACE(3, ("%t: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel))
end
if (flush && ~stall) begin
`TRACE(3, ("%t: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty))
end
if (lookup && ~stall) begin
if (tag_matches != 0) begin
`TRACE(3, ("%d: %s-bank%0d tag-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag, req_uuid));
if (write) begin
`TRACE(3, ("%t: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid))
end else begin
`TRACE(3, ("%t: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid))
end
end else begin
`TRACE(3, ("%d: %s-bank%0d tag-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
if (write) begin
`TRACE(3, ("%t: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid))
end else begin
`TRACE(3, ("%t: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid))
end
end
end
end
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -20,20 +20,20 @@ module VX_cache_top import VX_gpu_pkg::*; #(
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -42,6 +42,12 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -55,7 +61,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
parameter MEM_OUT_BUF = 2,
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
) (
) (
input wire clk,
input wire reset,
@ -69,7 +75,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
input wire [NUM_REQS-1:0] core_req_rw,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype,
input wire [NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] core_req_flags,
input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_REQS-1:0] core_req_ready,
@ -82,17 +88,17 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
VX_mem_bus_if #(
@ -111,7 +117,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
assign core_bus_if[i].req_data.rw = core_req_rw[i];
assign core_bus_if[i].req_data.byteen = core_req_byteen[i];
assign core_bus_if[i].req_data.addr = core_req_addr[i];
assign core_bus_if[i].req_data.atype = core_req_atype[i];
assign core_bus_if[i].req_data.flags = core_req_flags[i];
assign core_bus_if[i].req_data.data = core_req_data[i];
assign core_bus_if[i].req_data.tag = core_req_tag[i];
assign core_req_ready[i] = core_bus_if[i].req_ready;
@ -127,18 +133,18 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Memory request
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
`UNUSED_VAR (mem_bus_if.req_data.flags)
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
VX_cache #(
@ -156,6 +162,8 @@ module VX_cache_top import VX_gpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.CORE_OUT_BUF (CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache (

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -23,20 +23,20 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -45,6 +45,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -63,7 +69,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Memory request output buffer
parameter MEM_OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -78,12 +84,11 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, UUID_WIDTH);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH, UUID_WIDTH) :
CACHE_MEM_TAG_WIDTH);
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
@ -97,9 +102,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
) mem_bus_cache_if();
if (NC_OR_BYPASS) begin
`RESET_RELAY (nc_bypass_reset, reset);
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if();
if (NC_OR_BYPASS) begin : g_bypass
VX_cache_bypass #(
.NUM_REQS (NUM_REQS),
@ -108,13 +116,13 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.PASSTHRU (PASSTHRU),
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
.WORD_SIZE (WORD_SIZE),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (LINE_SIZE),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_TAG_WIDTH (TAG_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
@ -124,51 +132,31 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache_bypass (
.clk (clk),
.reset (nc_bypass_reset),
.reset (reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if(core_bus_cache_if),
.mem_bus_in_if (mem_bus_cache_if),
.mem_bus_out_if (mem_bus_if)
.mem_bus_out_if (mem_bus_tmp_if)
);
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end else begin : g_no_bypass
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if, mem_bus_cache_if);
end
if (PASSTHRU != 0) begin
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if, mem_bus_tmp_if);
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
`UNUSED_VAR (core_bus_cache_if[i].req_data)
assign core_bus_cache_if[i].req_ready = 0;
assign core_bus_cache_if[i].rsp_valid = 0;
assign core_bus_cache_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
end
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
`UNUSED_VAR (mem_bus_cache_if.req_ready)
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
assign mem_bus_cache_if.rsp_ready = 0;
`ifdef PERF_ENABLE
assign cache_perf = '0;
`endif
end else begin
`RESET_RELAY (cache_reset, reset);
if (PASSTHRU == 0) begin : g_cache
VX_cache #(
.INSTANCE_ID (INSTANCE_ID),
@ -183,32 +171,57 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
) cache (
.clk (clk),
.reset (cache_reset),
.reset (reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.core_bus_if (core_bus_cache_if),
.mem_bus_if (mem_bus_cache_if)
);
);
end else begin : g_passthru
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
`UNUSED_VAR (core_bus_cache_if[i].req_data)
assign core_bus_cache_if[i].req_ready = 0;
assign core_bus_cache_if[i].rsp_valid = 0;
assign core_bus_cache_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
end
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
`UNUSED_VAR (mem_bus_cache_if.req_ready)
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
assign mem_bus_cache_if.rsp_ready = 0;
`ifdef PERF_ENABLE
assign cache_perf = '0;
`endif
end
`ifdef DBG_TRACE_CACHE
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid;
if (UUID_WIDTH != 0) begin
if (UUID_WIDTH != 0) begin : g_core_rsp_uuid
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
end else begin : g_no_core_rsp_uuid
assign core_req_uuid = 0;
assign core_rsp_uuid = 0;
end
@ -218,24 +231,25 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (core_req_fire) begin
if (core_bus_if[i].req_data.rw)
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
else
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
if (core_bus_if[i].req_data.rw) begin
`TRACE(1, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid))
end else begin
`TRACE(1, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid))
end
end
if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end
`TRACE(1, ("%t: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid))
end
end
end
end
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin : g_mem_req_uuid
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
end else begin : g_no_mem_req_uuid
assign mem_req_uuid = 0;
assign mem_rsp_uuid = 0;
end
@ -245,18 +259,19 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
if (mem_bus_if.req_data.rw) begin
`TRACE(1, ("%t: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid))
end else begin
`TRACE(1, ("%t: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid))
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
`TRACE(1, ("%t: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid))
end
end
end
`endif
endmodule

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_alu_int #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = "",
parameter BLOCK_IDX = 0,
parameter NUM_LANES = 1
) (
@ -29,7 +29,7 @@ module VX_alu_int #(
VX_branch_ctl_if.master branch_ctl_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -71,19 +71,19 @@ module VX_alu_int #(
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_add_result
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
assign add_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] + alu_in2_imm[i][31:0]));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_sub_result
wire [`XLEN:0] sub_in1 = {is_signed & alu_in1[i][`XLEN-1], alu_in1[i]};
wire [`XLEN:0] sub_in2 = {is_signed & alu_in2_br[i][`XLEN-1], alu_in2_br[i]};
assign sub_result[i] = sub_in1 - sub_in2;
assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0]));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_shr_result
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
always @(*) begin
case (alu_op[1:0])
@ -102,7 +102,7 @@ module VX_alu_int #(
assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_msc_result
always @(*) begin
case (alu_op[1:0])
2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND
@ -114,14 +114,14 @@ module VX_alu_int #(
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_alu_result
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
always @(*) begin
case ({is_alu_w, op_class})
3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR*
3'b010: alu_result[i] = shr_zic_result[i]; // SRL, SRA, SRLI, SRAI, CZERO*
3'b010: alu_result[i] = shr_zic_result[i]; // SRL, SRA, SRLI, SRAI, CZERO*
3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI
3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW
3'b101: alu_result[i] = sub_result_w[i]; // SUBW
@ -141,9 +141,9 @@ module VX_alu_int #(
assign cbr_dest = add_result[0][1 +: `PC_BITS];
if (LANE_BITS != 0) begin
if (LANE_BITS != 0) begin : g_tid
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
end else begin : g_tid_0
assign tid = 0;
end
@ -181,11 +181,11 @@ module VX_alu_int #(
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i];
end
@ -193,9 +193,9 @@ module VX_alu_int #(
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
`TRACE(1, ("%d: core%0d-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, CORE_ID, branch_ctl_if.wid, {commit_if.data.PC, 1'b0}, branch_ctl_if.taken, {branch_ctl_if.dest, 1'b0}, commit_if.data.uuid));
if (br_enable) begin
`TRACE(1, ("%t: %s branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid))
end
end
`endif

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_alu_muldiv #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1
) (
input wire clk,
@ -26,7 +26,7 @@ module VX_alu_muldiv #(
// Outputs
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
@ -68,8 +68,8 @@ module VX_alu_muldiv #(
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] mul_resultl, mul_resulth;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_tmp
reg [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin
@ -83,7 +83,7 @@ module VX_alu_muldiv #(
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.clk (clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}),
@ -103,7 +103,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN:0] mul_in1;
wire [NUM_LANES-1:0][`XLEN:0] mul_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_in
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
end
@ -149,7 +149,7 @@ module VX_alu_muldiv #(
`else
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_multiplier
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
@ -184,7 +184,7 @@ module VX_alu_muldiv #(
`endif
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_out
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
@ -219,7 +219,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_in
`ifdef XLEN_64
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
@ -234,8 +234,8 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] div_quotient, div_remainder;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_in
reg [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
end
@ -306,7 +306,7 @@ module VX_alu_muldiv #(
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_out
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
@ -324,7 +324,8 @@ module VX_alu_muldiv #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.OUT_BUF (1)
.ARBITER ("P"),
.OUT_BUF (2)
) rsp_buf (
.clk (clk),
.reset (reset),

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_alu_unit #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -27,23 +27,27 @@ module VX_alu_unit #(
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
localparam NUM_LANES = `NUM_ALU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
localparam PE_COUNT = 1 + `EXT_M_ENABLED;
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
localparam PE_IDX_INT = 0;
localparam PE_IDX_MDV = PE_IDX_INT + `EXT_M_ENABLED;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (PARTIAL_BW ? 1 : 0)
.OUT_BUF (PARTIAL_BW ? 3 : 0)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -51,106 +55,62 @@ module VX_alu_unit #(
.execute_if (per_block_execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`RESET_RELAY (block_reset, reset);
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_alus
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) int_execute_if();
) pe_execute_if[PE_COUNT]();
VX_commit_if #(
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) int_commit_if();
) pe_commit_if[PE_COUNT]();
assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = per_block_execute_if[block_idx].data;
reg [`UP(PE_SEL_BITS)-1:0] pe_select;
always @(*) begin
pe_select = PE_IDX_INT;
if (`EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV))
pe_select = PE_IDX_MDV;
end
`RESET_RELAY (int_reset, block_reset);
VX_pe_switch #(
.PE_COUNT (PE_COUNT),
.NUM_LANES (NUM_LANES),
.ARBITER ("R"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (PARTIAL_BW ? 1 : 3)
) pe_switch (
.clk (clk),
.reset (reset),
.pe_sel (pe_select),
.execute_in_if (per_block_execute_if[block_idx]),
.commit_out_if (per_block_commit_if[block_idx]),
.execute_out_if (pe_execute_if),
.commit_in_if (pe_commit_if)
);
VX_alu_int #(
.CORE_ID (CORE_ID),
.INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
.BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES)
) alu_int (
.clk (clk),
.reset (int_reset),
.execute_if (int_execute_if),
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_INT]),
.branch_ctl_if (branch_ctl_if[block_idx]),
.commit_if (int_commit_if)
.commit_if (pe_commit_if[PE_IDX_INT])
);
`ifdef EXT_M_ENABLE
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) mdv_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) mdv_commit_if();
assign mdv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
assign mdv_execute_if.data = per_block_execute_if[block_idx].data;
`RESET_RELAY (mdv_reset, block_reset);
VX_alu_muldiv #(
.CORE_ID (CORE_ID),
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
.NUM_LANES (NUM_LANES)
) mdv_unit (
) muldiv_unit (
.clk (clk),
.reset (mdv_reset),
.execute_if (mdv_execute_if),
.commit_if (mdv_commit_if)
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_MDV]),
.commit_if (pe_commit_if[PE_IDX_MDV])
);
`endif
assign per_block_execute_if[block_idx].ready =
`ifdef EXT_M_ENABLE
is_muldiv_op ? mdv_execute_if.ready :
`endif
int_execute_if.ready;
// send response
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) rsp_arb (
.clk (clk),
.reset (block_reset),
.valid_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.valid,
`endif
int_commit_if.valid
}),
.ready_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.ready,
`endif
int_commit_if.ready
}),
.data_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.data,
`endif
int_commit_if.data
}),
.data_out (per_block_commit_if[block_idx].data),
.valid_out (per_block_commit_if[block_idx].valid),
.ready_out (per_block_commit_if[block_idx].ready),
`UNUSED_PIN (sel_out)
);
end
VX_gather_unit #(

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_commit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -27,7 +27,7 @@ module VX_commit import VX_gpu_pkg::*; #(
VX_commit_csr_if.master commit_csr_if,
VX_commit_sched_if.master commit_sched_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
@ -36,20 +36,18 @@ module VX_commit import VX_gpu_pkg::*; #(
VX_commit_if commit_arb_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] per_issue_commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
`RESET_RELAY (arb_reset, reset);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_arbs
wire [`NUM_EX_UNITS-1:0] valid_in;
wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
wire [`NUM_EX_UNITS-1:0] ready_in;
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_data_in
assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid;
assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data;
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
@ -58,11 +56,11 @@ module VX_commit import VX_gpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (`NUM_EX_UNITS),
.DATAW (DATAW),
.ARBITER ("R"),
.ARBITER ("P"),
.OUT_BUF (1)
) commit_arb (
.clk (clk),
.reset (arb_reset),
.reset (reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
@ -72,10 +70,10 @@ module VX_commit import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
assign commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign commit_wid[i] = commit_arb_if[i].data.wid;
assign commit_eop[i] = commit_arb_if[i].data.eop;
assign per_issue_commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign per_issue_commit_tmask[i]= {`NUM_THREADS{per_issue_commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign per_issue_commit_wid[i] = commit_arb_if[i].data.wid;
assign per_issue_commit_eop[i] = commit_arb_if[i].data.eop;
end
// CSRs update
@ -84,11 +82,11 @@ module VX_commit import VX_gpu_pkg::*; #(
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire);
assign commit_fire_any = (| per_issue_commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_size
wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, commit_tmask[i]);
`POP_COUNT(count, per_issue_commit_tmask[i]);
assign commit_size[i] = count;
end
@ -136,24 +134,33 @@ module VX_commit import VX_gpu_pkg::*; #(
end
assign commit_csr_if.instret = instret;
// Committed instructions
// Track committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
reg [`NUM_WARPS-1:0] committed_warps;
always @(*) begin
committed_warps = 0;
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
if (per_issue_commit_fire[i] && per_issue_commit_eop[i]) begin
committed_warps[per_issue_commit_wid[i]] = 1;
end
end
end
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH)
.DATAW (`NUM_WARPS),
.RESETW (`NUM_WARPS)
) committed_pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
.data_in (committed_warps),
.data_out ({commit_sched_if.committed_warps})
);
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback
assign writeback_if[i].valid = commit_arb_if[i].valid && commit_arb_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid);
@ -167,15 +174,15 @@ module VX_commit import VX_gpu_pkg::*; #(
end
`ifdef DBG_TRACE_PIPELINE
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_trace
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_j
always @(posedge clk) begin
if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}))
trace_ex_type(1, j);
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop));
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid));
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop))
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS)
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid))
end
end
end

View file

@ -18,7 +18,8 @@
`endif
module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -74,33 +75,26 @@ module VX_core import VX_gpu_pkg::*; #(
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`RESET_RELAY (dcr_data_reset, reset);
`RESET_RELAY (schedule_reset, reset);
`RESET_RELAY (fetch_reset, reset);
`RESET_RELAY (decode_reset, reset);
`RESET_RELAY (issue_reset, reset);
`RESET_RELAY (execute_reset, reset);
`RESET_RELAY (commit_reset, reset);
base_dcrs_t base_dcrs;
VX_dcr_data dcr_data (
.clk (clk),
.reset (dcr_data_reset),
.reset (reset),
.dcr_bus_if (dcr_bus_if),
.base_dcrs (base_dcrs)
);
`SCOPE_IO_SWITCH (3)
`SCOPE_IO_SWITCH (3);
VX_schedule #(
.INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (schedule_reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule),
.sched_perf (pipeline_perf_if.sched),
`endif
.base_dcrs (base_dcrs),
@ -121,36 +115,36 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_fetch #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID))
) fetch (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (fetch_reset),
.reset (reset),
.icache_bus_if (icache_bus_if),
.schedule_if (schedule_if),
.fetch_if (fetch_if)
);
VX_decode #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID))
) decode (
.clk (clk),
.reset (decode_reset),
.reset (reset),
.fetch_if (fetch_if),
.decode_if (decode_if),
.decode_sched_if(decode_sched_if)
);
VX_issue #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID))
) issue (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (issue_reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue),
.issue_perf (pipeline_perf_if.issue),
`endif
.decode_if (decode_if),
@ -159,12 +153,13 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_execute #(
.INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) execute (
`SCOPE_IO_BIND (2)
.clk (clk),
.reset (execute_reset),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
@ -186,10 +181,10 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_commit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID))
) commit (
.clk (clk),
.reset (commit_reset),
.reset (reset),
.commit_if (commit_if),
@ -199,136 +194,18 @@ module VX_core import VX_gpu_pkg::*; #(
.commit_sched_if(commit_sched_if)
);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
`ifdef LMEM_ENABLE
`RESET_RELAY (lmem_unit_reset, reset);
VX_lmem_unit #(
.CORE_ID (CORE_ID)
) lmem_unit (
.clk (clk),
.reset (lmem_unit_reset),
VX_mem_unit #(
.INSTANCE_ID (INSTANCE_ID)
) mem_unit (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.lmem),
.lmem_perf (mem_perf_tmp_if.lmem),
`endif
.lsu_mem_in_if (lsu_mem_if),
.lsu_mem_out_if (lsu_dcache_if)
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_if)
);
`else
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
`RESET_RELAY (coalescer_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("core%0d-coalescer", CORE_ID)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) coalescer (
.clk (clk),
.reset (coalescer_reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_atype (lsu_dcache_if[i].req_data.atype),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if[i].req_valid),
.out_req_mask (dcache_coalesced_if[i].req_data.mask),
.out_req_rw (dcache_coalesced_if[i].req_data.rw),
.out_req_byteen (dcache_coalesced_if[i].req_data.byteen),
.out_req_addr (dcache_coalesced_if[i].req_data.addr),
.out_req_atype (dcache_coalesced_if[i].req_data.atype),
.out_req_data (dcache_coalesced_if[i].req_data.data),
.out_req_tag (dcache_coalesced_if[i].req_data.tag),
.out_req_ready (dcache_coalesced_if[i].req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if[i].rsp_valid),
.out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask),
.out_rsp_data (dcache_coalesced_if[i].rsp_data.data),
.out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if[i].rsp_ready)
);
end
end else begin
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
end
end
`RESET_RELAY (lsu_adapter_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (lsu_adapter_reset),
.lsu_mem_if (dcache_coalesced_if[i]),
.mem_bus_if (dcache_bus_tmp_if)
);
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
`ifdef PERF_ENABLE
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
@ -352,8 +229,8 @@ module VX_core import VX_gpu_pkg::*; #(
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_perf_dcache
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_j
assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;

View file

@ -32,7 +32,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype,
output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] dcache_req_flags,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
@ -96,7 +96,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype;
assign dcache_req_flags[i] = dcache_bus_if[i].req_data.flags;
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
@ -119,7 +119,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_req_data = icache_bus_if.req_data.data;
assign icache_req_tag = icache_bus_if.req_data.tag;
assign icache_bus_if.req_ready = icache_req_ready;
`UNUSED_VAR (icache_bus_if.req_data.atype)
`UNUSED_VAR (icache_bus_if.req_data.flags)
assign icache_bus_if.rsp_valid = icache_rsp_valid;
assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
@ -144,6 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
`endif
VX_core #(
.INSTANCE_ID ($sformatf("core")),
.CORE_ID (CORE_ID)
) core (
`SCOPE_IO_BIND (0)

View file

@ -26,13 +26,13 @@
addr+12'h80 : dst = 32'(src[$bits(src)-1:32])
`endif
module VX_csr_data
import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE
import VX_fpu_pkg::*;
`endif
#(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
input wire clk,
@ -83,7 +83,7 @@ import VX_fpu_pkg::*;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_write
assign fpu_write_enable[i] = fpu_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
@ -107,7 +107,7 @@ import VX_fpu_pkg::*;
end
end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_csr_read_frm
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end
@ -147,7 +147,7 @@ import VX_fpu_pkg::*;
mscratch <= write_data;
end
default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
`ASSERT(0, ("%t: *** %s invalid CSR write address: %0h (#%0d)", $time, INSTANCE_ID, write_addr, write_uuid));
end
endcase
end
@ -155,41 +155,41 @@ import VX_fpu_pkg::*;
// CSRs read //////////////////////////////////////////////////////////////
reg [`XLEN-1:0] read_data_ro_r;
reg [`XLEN-1:0] read_data_rw_r;
reg read_addr_valid_r;
reg [`XLEN-1:0] read_data_ro_w;
reg [`XLEN-1:0] read_data_rw_w;
reg read_addr_valid_w;
always @(*) begin
read_data_ro_r = '0;
read_data_rw_r = '0;
read_addr_valid_r = 1;
read_data_ro_w = '0;
read_data_rw_w = '0;
read_addr_valid_w = 1;
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`VX_CSR_MVENDORID : read_data_ro_w = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_w = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_w = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_w = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`VX_CSR_FFLAGS : read_data_rw_w = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_w = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_w = `XLEN'(fcsr[read_wid]);
`endif
`VX_CSR_MSCRATCH : read_data_rw_r = mscratch;
`VX_CSR_MSCRATCH : read_data_rw_w = mscratch;
`VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
`VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR);
`VX_CSR_WARP_ID : read_data_ro_w = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_w = `XLEN'(CORE_ID);
`VX_CSR_ACTIVE_THREADS: read_data_ro_w = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_w = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_w = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_w = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_w = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_w = `XLEN'(`LMEM_BASE_ADDR);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_w, cycles);
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED : read_data_ro_w = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_w = 'x;
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_w, commit_csr_if.instret);
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
@ -200,77 +200,77 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0);
`VX_CSR_PMPADDR0 : read_data_ro_w = `XLEN'(0);
default: begin
read_addr_valid_r = 0;
read_addr_valid_w = 0;
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
read_addr_valid_r = 1;
read_addr_valid_w = 1;
`ifdef PERF_ENABLE
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched_idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched_stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_ALU]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_FPU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf_if.load_latency);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, mem_perf_if.icache.mshr_stalls);
// PERF: dcache
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, mem_perf_if.dcache.mshr_stalls);
// PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, mem_perf_if.lmem.bank_stalls);
// PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l2cache.mshr_stalls);
// PERF: l3cache
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, mem_perf_if.l3cache.mshr_stalls);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency);
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, mem_perf_if.mem.latency);
default:;
endcase
end
@ -282,12 +282,12 @@ import VX_fpu_pkg::*;
endcase
end
assign read_data_ro = read_data_ro_r;
assign read_data_rw = read_data_rw_r;
assign read_data_ro = read_data_ro_w;
assign read_data_rw = read_data_rw_w;
`UNUSED_VAR (base_dcrs)
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
`UNUSED_VAR (mem_perf_if.icache);

View file

@ -14,6 +14,7 @@
`include "VX_define.vh"
module VX_csr_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0,
parameter NUM_LANES = 1
) (
@ -36,7 +37,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
VX_execute_if.slave execute_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
@ -65,14 +66,15 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
`UNUSED_VAR (rs1_data)
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_rs1_data
assign rs1_data[i] = execute_if.data.rs1_data[i];
end
wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW);
VX_csr_data #(
.CORE_ID (CORE_ID)
.INSTANCE_ID (INSTANCE_ID),
.CORE_ID (CORE_ID)
) csr_data (
.clk (clk),
.reset (reset),
@ -111,12 +113,15 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid;
for (genvar i = 0; i < NUM_LANES; ++i) begin
if (PID_BITS != 0) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_wtid
if (PID_BITS != 0) begin : g_pid
assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i);
end else begin
end else begin : g_no_pid
assign wtid[i] = `XLEN'(i);
end
end
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_gtid
assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
end

View file

@ -12,7 +12,6 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_dcr_data import VX_gpu_pkg::*; (
input wire clk,
@ -51,9 +50,9 @@ module VX_dcr_data import VX_gpu_pkg::*; (
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (dcr_bus_if.write_valid) begin
`TRACE(1, ("%d: base-dcr: state=", $time));
`TRACE(1, ("%t: base-dcr: state=", $time))
trace_base_dcr(1, dcr_bus_if.write_addr);
`TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data));
`TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data))
end
end
`endif

View file

@ -12,24 +12,23 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
`ifdef EXT_F_ENABLE
`define USED_IREG(x) \
x``_r = {1'b0, ``x}; \
x``_v = {1'b0, ``x}; \
use_``x = 1
`define USED_FREG(x) \
x``_r = {1'b1, ``x}; \
x``_v = {1'b1, ``x}; \
use_``x = 1
`else
`define USED_IREG(x) \
x``_r = ``x; \
x``_v = ``x; \
use_``x = 1
`endif
module VX_decode import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -44,14 +43,14 @@ module VX_decode import VX_gpu_pkg::*; #(
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
op_args_t op_args;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v;
reg use_rd, use_rs1, use_rs2, use_rs3;
reg is_wstall;
@ -145,15 +144,21 @@ module VX_decode import VX_gpu_pkg::*; #(
end
`endif
`STATIC_ASSERT($bits(alu_args_t) == $bits(op_args_t), ("alu_args_t size mismatch: current=%0d, expected=%0d", $bits(alu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(fpu_args_t) == $bits(op_args_t), ("fpu_args_t size mismatch: current=%0d, expected=%0d", $bits(fpu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(lsu_args_t) == $bits(op_args_t), ("lsu_args_t size mismatch: current=%0d, expected=%0d", $bits(lsu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(csr_args_t) == $bits(op_args_t), ("csr_args_t size mismatch: current=%0d, expected=%0d", $bits(csr_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(wctl_args_t) == $bits(op_args_t), ("wctl_args_t size mismatch: current=%0d, expected=%0d", $bits(wctl_args_t), $bits(op_args_t)));
always @(*) begin
ex_type = '0;
ex_type = 'x;
op_type = 'x;
op_args = 'x;
rd_r = '0;
rs1_r = '0;
rs2_r = '0;
rs3_r = '0;
rd_v = '0;
rs1_v = '0;
rs2_v = '0;
rs3_v = '0;
use_rd = 0;
use_rs1 = 0;
use_rs2 = 0;
@ -371,14 +376,16 @@ module VX_decode import VX_gpu_pkg::*; #(
`USED_IREG (rs2);
end
`ifdef EXT_F_ENABLE
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
`INST_FMADD, // 7'b1000011
`INST_FMSUB, // 7'b1000111
`INST_FNMSUB, // 7'b1001011
`INST_FNMADD: // 7'b1001111
begin
ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_type = `INST_OP_BITS'({2'b00, 1'b1, opcode[3]});
op_args.fpu.frm = func3;
op_args.fpu.fmt[0] = func2[0]; // float / double
op_args.fpu.fmt[1] = opcode[3] ^ opcode[2]; // SUB
use_rd = 1;
`USED_FREG (rd);
`USED_FREG (rs1);
@ -394,9 +401,10 @@ module VX_decode import VX_gpu_pkg::*; #(
case (func5)
5'b00000, // FADD
5'b00001, // FSUB
5'b00010, // FMUL
5'b00011: begin // FDIV
op_type = `INST_OP_BITS'(func5[1:0]);
5'b00010: // FMUL
begin
op_type = `INST_OP_BITS'({2'b00, 1'b0, func5[1]});
op_args.fpu.fmt[1] = func5[0]; // SUB
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
@ -425,6 +433,13 @@ module VX_decode import VX_gpu_pkg::*; #(
`USED_FREG (rs1);
end
`endif
5'b00011: begin
// FDIV
op_type = `INST_OP_BITS'(`INST_FPU_DIV);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b01011: begin
// FSQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
@ -522,7 +537,7 @@ module VX_decode import VX_gpu_pkg::*; #(
end
// disable write to integer register r0
wire wb = use_rd && (rd_r != 0);
wire wb = use_rd && (rd_v != 0);
VX_elastic_buffer #(
.DATAW (DATAW),
@ -532,7 +547,7 @@ module VX_decode import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
@ -542,9 +557,10 @@ module VX_decode import VX_gpu_pkg::*; #(
wire fetch_fire = fetch_if.valid && fetch_if.ready;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.unlock = ~is_wstall;
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif
@ -552,14 +568,14 @@ module VX_decode import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%d: core%0d-decode: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, CORE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr))
trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op="));
`TRACE(1, (", op="))
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b",
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3));
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3))
trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid));
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid))
end
end
`endif

View file

@ -12,10 +12,9 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -24,118 +23,81 @@ module VX_dispatch import VX_gpu_pkg::*; #(
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
`endif
// inputs
VX_operands_if.slave operands_if [`ISSUE_WIDTH],
VX_operands_if.slave operands_if,
// outputs
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_tids
assign tids[i] = `NT_WIDTH'(i);
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [`NT_WIDTH-1:0] last_active_tid;
wire [`NT_WIDTH-1:0] last_active_tid;
VX_find_first #(
.N (`NUM_THREADS),
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if.data.tmask),
.data_in (tids),
.data_out (last_active_tid),
`UNUSED_PIN (valid_out)
);
VX_find_first #(
.N (`NUM_THREADS),
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if[i].data.tmask),
.data_in (tids),
.data_out (last_active_tid),
`UNUSED_PIN (valid_out)
wire [`NUM_EX_UNITS-1:0] operands_ready_in;
assign operands_if.ready = operands_ready_in[operands_if.data.ex_type];
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_buffers
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (1)
) buffer (
.clk (clk),
.reset (reset),
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
.ready_in (operands_ready_in[i]),
.data_in ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.wb,
operands_if.data.rd,
last_active_tid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.data_out (dispatch_if[i].data),
.valid_out (dispatch_if[i].valid),
.ready_out (dispatch_if[i].ready)
);
wire [`NUM_EX_UNITS-1:0] operands_reset;
`RESET_RELAY (buf_reset, reset);
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) buffer (
.clk (clk),
.reset (buf_reset),
.valid_in (operands_if[i].valid && (operands_if[i].data.ex_type == j)),
.ready_in (operands_reset[j]),
.data_in (`TO_DISPATCH_DATA(operands_if[i].data, last_active_tid)),
.data_out (dispatch_if[j * `ISSUE_WIDTH + i].data),
.valid_out (dispatch_if[j * `ISSUE_WIDTH + i].valid),
.ready_out (dispatch_if[j * `ISSUE_WIDTH + i].ready)
);
end
assign operands_if[i].ready = operands_reset[operands_if[i].data.ex_type];
end
`ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(*) begin
perf_issue_unit_stalls_per_cycle[i] = '0;
if (operands_if[i].valid && ~operands_if[i].ready) begin
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
end
end
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) reduce (
.data_in (perf_issue_unit_stalls_per_cycle),
.data_out (perf_unit_stalls_per_cycle)
);
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_stalls
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;
end else begin
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(operands_if_stall && operands_if.data.ex_type == `EX_BITS'(i));
end
end
end
for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
assign perf_stalls[i] = perf_stalls_r[i];
end
`endif
`ifdef DBG_TRACE_PIPELINE
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), {operands_if[i].data.PC, 1'b0}));
trace_ex_type(1, operands_if[i].data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
end
end
end
`endif
endmodule

View file

@ -40,7 +40,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam FANOUT_ENABLE= (`NUM_THREADS > MAX_FANOUT);
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2));
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
localparam DATA_REGS_OFF = 0;
@ -49,13 +49,12 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_dispatch_data
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
@ -66,28 +65,53 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire batch_done = (& block_done);
// batch select logic
logic [BATCH_COUNT_W-1:0] batch_idx;
if (BATCH_COUNT != 1) begin
if (BATCH_COUNT != 1) begin : g_batch_idx
wire [BATCH_COUNT_W-1:0] batch_idx_n;
wire [BATCH_COUNT-1:0] valid_batches;
for (genvar i = 0; i < BATCH_COUNT; ++i) begin : g_valid_batches
assign valid_batches[i] = | dispatch_valid[i * BLOCK_SIZE +: BLOCK_SIZE];
end
VX_generic_arbiter #(
.NUM_REQS (BATCH_COUNT),
.TYPE ("P")
) batch_sel (
.clk (clk),
.reset (reset),
.requests (valid_batches),
.grant_index (batch_idx_n),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (grant_valid),
.grant_ready (batch_done)
);
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end else if (batch_done) begin
batch_idx <= batch_idx_n;
end
end
end else begin
end else begin : g_batch_idx_0
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_issue_indices
assign issue_indices[block_idx] = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
end
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_blocks
wire [ISSUE_W-1:0] issue_idx = issue_indices[block_idx];
wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin
if (`NUM_THREADS > NUM_LANES) begin : g_partial_threads
reg [NUM_PACKETS-1:0] sent_mask_p;
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
wire dispatch_valid_r;
@ -122,8 +146,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar j = 0; j < NUM_LANES; ++j) begin
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_per_packet_data
for (genvar j = 0; j < NUM_LANES; ++j) begin : g_j
localparam k = i * NUM_LANES + j;
assign per_packet_tmask[i][j] = dispatch_tmask[k];
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
@ -133,10 +157,12 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
end
wire [NUM_PACKETS-1:0] packet_valids;
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_valids
assign packet_valids[i] = (| per_packet_tmask[i]);
end
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_ids
assign packet_ids[i] = PID_WIDTH'(i);
end
@ -185,13 +211,13 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_pid[block_idx] = start_p;
assign block_sop[block_idx] = is_first_p;
assign block_eop[block_idx] = is_last_p;
if (FANOUT_ENABLE) begin
if (FANOUT_ENABLE) begin : g_block_ready_fanout
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
end else begin
end else begin : g_block_ready
assign block_ready[block_idx] = ready_p && block_enable;
end
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
end else begin
assign block_done[block_idx] = fire_eop || ~dispatch_valid[issue_idx];
end else begin : g_full_threads
assign valid_p = dispatch_valid[issue_idx];
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
@ -201,31 +227,31 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_sop[block_idx] = 1'b1;
assign block_eop[block_idx] = 1'b1;
assign block_ready[block_idx] = ready_p;
assign block_done[block_idx] = ~valid_p || ready_p;
assign block_done[block_idx] = ready_p || ~valid_p;
end
wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
if (BATCH_COUNT != 1) begin : g_isw_batch
if (BLOCK_SIZE != 1) begin : g_block
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin
end else begin : g_no_block
assign isw = batch_idx;
end
end else begin
end else begin : g_isw
assign isw = block_idx;
end
`RESET_RELAY(buf_out_reset, reset);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
logic [OUT_DATAW-1:0] execute_data, execute_data_w;
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out (
.clk (clk),
.reset (buf_out_reset),
.reset (reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
@ -239,17 +265,27 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
.data_out (execute_if[block_idx].data),
.data_out (execute_data),
.valid_out (execute_if[block_idx].valid),
.ready_out (execute_if[block_idx].ready)
);
if (`NUM_THREADS != NUM_LANES) begin : g_execute_data_w_partial
assign execute_data_w = execute_data;
end else begin : g_execute_data_w_full
always @(*) begin
execute_data_w = execute_data;
execute_data_w[2:0] = {1'b0, 1'b1, 1'b1}; // default pid, sop, and eop
end
end
assign execute_if[block_idx].data = execute_data_w;
end
reg [`ISSUE_WIDTH-1:0] ready_in;
always @(*) begin
ready_in = 0;
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
for (integer block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
ready_in[issue_indices[block_idx]] = block_ready[block_idx] && block_eop[block_idx];
end
end
assign dispatch_ready = ready_in;

View file

@ -14,6 +14,7 @@
`include "VX_define.vh"
module VX_execute import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
@ -50,41 +51,35 @@ module VX_execute import VX_gpu_pkg::*; #(
VX_fpu_csr_if fpu_csr_if[`NUM_FPU_BLOCKS]();
`endif
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);
VX_alu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID))
) alu_unit (
.clk (clk),
.reset (alu_reset),
.reset (reset),
.dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.branch_ctl_if (branch_ctl_if)
);
`SCOPE_IO_SWITCH (1)
`SCOPE_IO_SWITCH (1);
VX_lsu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID))
) lsu_unit (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (lsu_reset),
.reset (reset),
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.lsu_mem_if (lsu_mem_if)
);
`ifdef EXT_F_ENABLE
`RESET_RELAY (fpu_reset, reset);
VX_fpu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID))
) fpu_unit (
.clk (clk),
.reset (fpu_reset),
.reset (reset),
.dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.fpu_csr_if (fpu_csr_if)
@ -92,10 +87,11 @@ module VX_execute import VX_gpu_pkg::*; #(
`endif
VX_sfu_unit #(
.INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) sfu_unit (
.clk (clk),
.reset (sfu_reset),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if),

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_fetch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -30,7 +30,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
// outputs
VX_fetch_if.master fetch_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (reset)
wire icache_req_valid;
@ -56,9 +56,10 @@ module VX_fetch import VX_gpu_pkg::*; #(
.LUTRAM (1)
) tag_store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (icache_req_fire),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.raddr (rsp_tag),
@ -70,7 +71,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`NUM_WARPS-1:0] pending_ibuf_full;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_reads
VX_pending_size #(
.SIZE (`IBUF_SIZE)
) pending_reads (
@ -78,9 +79,11 @@ module VX_fetch import VX_gpu_pkg::*; #(
.reset (reset),
.incr (icache_req_fire && schedule_if.data.wid == i),
.decr (fetch_if.ibuf_pop[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
wire ibuf_ready = ~pending_ibuf_full[schedule_if.data.wid];
@ -89,7 +92,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
`endif
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, {schedule_if.data.PC, 1'b0}, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
("%t: *** %s invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, INSTANCE_ID, {schedule_if.data.PC, 1'b0}, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
// Icache Request
@ -113,9 +116,9 @@ module VX_fetch import VX_gpu_pkg::*; #(
.ready_out (icache_bus_if.req_ready)
);
assign icache_bus_if.req_data.atype = '0;
assign icache_bus_if.req_data.flags = '0;
assign icache_bus_if.req_data.rw = 0;
assign icache_bus_if.req_data.byteen = 4'b1111;
assign icache_bus_if.req_data.byteen = '1;
assign icache_bus_if.req_data.data = '0;
// Icache Response
@ -128,59 +131,57 @@ module VX_fetch import VX_gpu_pkg::*; #(
assign fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = fetch_if.ready;
`ifdef SCOPE
`ifdef DBG_SCOPE_FETCH
if (CORE_ID == 0) begin
`ifdef SCOPE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
ICACHE_TAG_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
(ICACHE_WORD_SIZE*8) + ICACHE_TAG_WIDTH)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
schedule_fire,
icache_req_fire,
icache_rsp_fire
}),
.probes({
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
);
`endif
end
`SCOPE_IO_SWITCH (1);
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire [`UUID_WIDTH-1:0] icache_bus_req_uuid = icache_bus_if.req_data.tag[ICACHE_TAG_WIDTH-1 -: `UUID_WIDTH];
wire [`UUID_WIDTH-1:0] icache_bus_rsp_uuid = icache_bus_if.rsp_data.tag[ICACHE_TAG_WIDTH-1 -: `UUID_WIDTH];
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 1, 6, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
`UUID_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
`UUID_WIDTH + (ICACHE_WORD_SIZE * 8)
), {
schedule_if.valid,
schedule_if.ready,
icache_bus_if.req_valid,
icache_bus_if.req_ready,
icache_bus_if.rsp_valid,
icache_bus_if.rsp_ready
}, {
schedule_fire,
icache_bus_req_fire,
icache_bus_rsp_fire
},{
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_req_uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_rsp_uuid, icache_bus_if.rsp_data.data
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED()
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({schedule_if.valid, schedule_if.data, schedule_if.ready}),
.probe1 ({icache_bus_if.req_valid, icache_bus_if.req_data, icache_bus_if.req_ready}),
.probe2 ({icache_bus_if.rsp_valid, icache_bus_if.rsp_data, icache_bus_if.rsp_ready})
);
`endif
`ifdef DBG_TRACE_MEM
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready;
always @(posedge clk) begin
if (schedule_fire) begin
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
if (schedule_if.valid && schedule_if.ready) begin
`TRACE(1, ("%t: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid))
end
if (fetch_fire) begin
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
if (fetch_if.valid && fetch_if.ready) begin
`TRACE(1, ("%t: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid))
end
end
`endif

View file

@ -14,7 +14,7 @@
`include "VX_fpu_define.vh"
module VX_fpu_unit import VX_fpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -26,7 +26,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_fpu_csr_if.master fpu_csr_if[`NUM_FPU_BLOCKS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -41,7 +41,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (PARTIAL_BW ? 1 : 0)
.OUT_BUF (PARTIAL_BW ? 3 : 0)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -53,12 +53,10 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_fpus
`UNUSED_VAR (per_block_execute_if[block_idx].data.tid)
`UNUSED_VAR (per_block_execute_if[block_idx].data.wb)
`RESET_RELAY (block_reset, reset);
// Store request info
wire fpu_req_valid, fpu_req_ready;
wire fpu_rsp_valid, fpu_rsp_ready;
@ -71,9 +69,9 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] fpu_rsp_tmask;
wire [`PC_BITS-1:0] fpu_rsp_PC;
wire [`NR_BITS-1:0] fpu_rsp_rd;
wire [PID_WIDTH-1:0] fpu_rsp_pid;
wire fpu_rsp_sop;
wire fpu_rsp_eop;
wire [PID_WIDTH-1:0] fpu_rsp_pid, fpu_rsp_pid_u;
wire fpu_rsp_sop, fpu_rsp_sop_u;
wire fpu_rsp_eop, fpu_rsp_eop_u;
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
wire mdata_full;
@ -89,17 +87,30 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (block_reset),
.reset (reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid_u, fpu_rsp_sop_u, fpu_rsp_eop_u}),
.read_addr (fpu_rsp_tag),
.release_en (fpu_rsp_fire),
.full (mdata_full),
`UNUSED_PIN (empty)
);
if (PID_BITS != 0) begin : g_fpu_rsp_pid
assign fpu_rsp_pid = fpu_rsp_pid_u;
assign fpu_rsp_sop = fpu_rsp_sop_u;
assign fpu_rsp_eop = fpu_rsp_eop_u;
end else begin : g_no_fpu_rsp_pid
`UNUSED_VAR (fpu_rsp_pid_u)
`UNUSED_VAR (fpu_rsp_sop_u)
`UNUSED_VAR (fpu_rsp_eop_u)
assign fpu_rsp_pid = 0;
assign fpu_rsp_sop = 1;
assign fpu_rsp_eop = 1;
end
// resolve dynamic FRM from CSR
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
@ -111,8 +122,6 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
assign fpu_req_valid = per_block_execute_if[block_idx].valid && ~mdata_full;
assign per_block_execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
`RESET_RELAY (fpu_reset, block_reset);
`ifdef FPU_DPI
VX_fpu_dpi #(
@ -121,7 +130,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dpi (
.clk (clk),
.reset (fpu_reset),
.reset (reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -150,7 +159,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_fpnew (
.clk (clk),
.reset (fpu_reset),
.reset (reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -179,7 +188,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dsp (
.clk (clk),
.reset (fpu_reset),
.reset (reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -202,27 +211,38 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
`endif
// handle FPU response
// handle CSR update
fflags_t fpu_rsp_fflags_q;
if (PID_BITS != 0) begin
if (PID_BITS != 0) begin : g_pid
fflags_t fpu_rsp_fflags_r;
always @(posedge clk) begin
if (block_reset) begin
if (reset) begin
fpu_rsp_fflags_r <= '0;
end else if (fpu_rsp_fire) begin
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
end
end
assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
end else begin
end else begin : g_no_pid
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
end
assign fpu_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
VX_fpu_csr_if fpu_csr_tmp_if();
assign fpu_csr_tmp_if.write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_csr_tmp_if.write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_csr_tmp_if.write_fflags = fpu_rsp_fflags_q;
VX_pipe_register #(
.DATAW (1 + `NW_WIDTH + $bits(fflags_t)),
.RESETW (1)
) fpu_csr_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({fpu_csr_tmp_if.write_enable, fpu_csr_tmp_if.write_wid, fpu_csr_tmp_if.write_fflags}),
.data_out ({fpu_csr_if[block_idx].write_enable, fpu_csr_if[block_idx].write_wid, fpu_csr_if[block_idx].write_fflags})
);
// send response
@ -231,7 +251,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.SIZE (0)
) rsp_buf (
.clk (clk),
.reset (block_reset),
.reset (reset),
.valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),

View file

@ -41,17 +41,17 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
wire [BLOCK_SIZE-1:0] commit_in_ready;
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in
assign commit_in_valid[i] = commit_in_if[i].valid;
assign commit_in_data[i] = commit_in_if[i].data;
assign commit_in_if[i].ready = commit_in_ready[i];
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
if (BLOCK_SIZE != 1) begin
if (BLOCK_SIZE != `ISSUE_WIDTH) begin : g_commit_in_isw_partial
if (BLOCK_SIZE != 1) begin : g_block
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
end else begin
end else begin : g_no_block
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
end
end else begin
end else begin : g_commit_in_isw_full
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
end
end
@ -70,24 +70,23 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
end
end
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in_ready
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin: g_out_bufs
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_tmp_if();
`RESET_RELAY(commit_out_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (commit_out_reset),
.reset (reset),
.valid_in (commit_out_valid[i]),
.ready_in (commit_out_ready[i]),
.data_in (commit_out_data[i]),
@ -96,31 +95,31 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
.ready_out (commit_tmp_if.ready)
);
logic [`NUM_THREADS-1:0] commit_tmask_r;
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_r;
if (PID_BITS != 0) begin
logic [`NUM_THREADS-1:0] commit_tmask_w;
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_w;
if (PID_BITS != 0) begin : g_commit_data_with_pid
always @(*) begin
commit_tmask_r = '0;
commit_data_r = 'x;
commit_tmask_w = '0;
commit_data_w = 'x;
for (integer j = 0; j < NUM_LANES; ++j) begin
commit_tmask_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
commit_data_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
commit_tmask_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
commit_data_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
end
end
end else begin
assign commit_tmask_r = commit_tmp_if.data.tmask;
assign commit_data_r = commit_tmp_if.data.data;
end else begin : g_commit_data_no_pid
assign commit_tmask_w = commit_tmp_if.data.tmask;
assign commit_data_w = commit_tmp_if.data.data;
end
assign commit_out_if[i].valid = commit_tmp_if.valid;
assign commit_out_if[i].data = {
commit_tmp_if.data.uuid,
commit_tmp_if.data.wid,
commit_tmask_r,
commit_tmask_w,
commit_tmp_if.data.PC,
commit_tmp_if.data.wb,
commit_tmp_if.data.rd,
commit_data_r,
commit_data_w,
1'b0, // PID
commit_tmp_if.data.sop,
commit_tmp_if.data.eop

View file

@ -1,286 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_gpr_slice import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
) (
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_operands_if.master operands_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
localparam STATE_IDLE = 2'd0;
localparam STATE_FETCH1 = 2'd1;
localparam STATE_FETCH2 = 2'd2;
localparam STATE_FETCH3 = 2'd3;
localparam STATE_BITS = 2;
wire [`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
reg [STATE_BITS-1:0] state, state_n;
reg [`NR_BITS-1:0] rs2, rs2_n;
reg [`NR_BITS-1:0] rs3, rs3_n;
reg rs2_ready, rs2_ready_n;
reg rs3_ready, rs3_ready_n;
reg data_ready, data_ready_n;
wire stg_valid_in, stg_ready_in;
wire is_rs1_zero = (scoreboard_if.data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if.data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if.data.rs3 == 0);
always @(*) begin
state_n = state;
rs2_n = rs2;
rs3_n = rs3;
rs2_ready_n = rs2_ready;
rs3_ready_n = rs3_ready;
rs1_data_n = rs1_data;
rs2_data_n = rs2_data;
rs3_data_n = rs3_data;
cache_data_n = cache_data;
cache_reg_n = cache_reg;
cache_tmask_n= cache_tmask;
cache_eop_n = cache_eop;
gpr_rd_rid_n = gpr_rd_rid;
gpr_rd_wis_n = gpr_rd_wis;
data_ready_n = data_ready;
case (state)
STATE_IDLE: begin
if (operands_if.valid && operands_if.ready) begin
data_ready_n = 0;
end
if (scoreboard_if.valid && data_ready_n == 0) begin
data_ready_n = 1;
if (is_rs3_zero || (CACHE_ENABLE != 0 &&
scoreboard_if.data.rs3 == cache_reg[scoreboard_if.data.wis] &&
(scoreboard_if.data.tmask & cache_tmask[scoreboard_if.data.wis]) == scoreboard_if.data.tmask)) begin
rs3_data_n = (is_rs3_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if.data.wis];
rs3_ready_n = 1;
end else begin
rs3_ready_n = 0;
gpr_rd_rid_n = scoreboard_if.data.rs3;
data_ready_n = 0;
state_n = STATE_FETCH3;
end
if (is_rs2_zero || (CACHE_ENABLE != 0 &&
scoreboard_if.data.rs2 == cache_reg[scoreboard_if.data.wis] &&
(scoreboard_if.data.tmask & cache_tmask[scoreboard_if.data.wis]) == scoreboard_if.data.tmask)) begin
rs2_data_n = (is_rs2_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if.data.wis];
rs2_ready_n = 1;
end else begin
rs2_ready_n = 0;
gpr_rd_rid_n = scoreboard_if.data.rs2;
data_ready_n = 0;
state_n = STATE_FETCH2;
end
if (is_rs1_zero || (CACHE_ENABLE != 0 &&
scoreboard_if.data.rs1 == cache_reg[scoreboard_if.data.wis] &&
(scoreboard_if.data.tmask & cache_tmask[scoreboard_if.data.wis]) == scoreboard_if.data.tmask)) begin
rs1_data_n = (is_rs1_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if.data.wis];
end else begin
gpr_rd_rid_n = scoreboard_if.data.rs1;
data_ready_n = 0;
state_n = STATE_FETCH1;
end
end
gpr_rd_wis_n = scoreboard_if.data.wis;
rs2_n = scoreboard_if.data.rs2;
rs3_n = scoreboard_if.data.rs3;
end
STATE_FETCH1: begin
rs1_data_n = gpr_rd_data;
if (~rs2_ready) begin
gpr_rd_rid_n = rs2;
state_n = STATE_FETCH2;
end else if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH2: begin
rs2_data_n = gpr_rd_data;
if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH3: begin
rs3_data_n = gpr_rd_data;
data_ready_n = 1;
state_n = STATE_IDLE;
end
endcase
if (CACHE_ENABLE != 0 && writeback_if.valid) begin
if ((cache_reg[writeback_if.data.wis] == writeback_if.data.rd)
|| (cache_eop[writeback_if.data.wis] && writeback_if.data.sop)) begin
for (integer j = 0; j < `NUM_THREADS; ++j) begin
if (writeback_if.data.tmask[j]) begin
cache_data_n[writeback_if.data.wis][j] = writeback_if.data.data[j];
end
end
cache_reg_n[writeback_if.data.wis] = writeback_if.data.rd;
cache_eop_n[writeback_if.data.wis] = writeback_if.data.eop;
cache_tmask_n[writeback_if.data.wis] = writeback_if.data.sop ? writeback_if.data.tmask :
(cache_tmask_n[writeback_if.data.wis] | writeback_if.data.tmask);
end
end
end
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
cache_eop <= {ISSUE_RATIO{1'b1}};
data_ready <= 0;
end else begin
state <= state_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
end
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
end
assign stg_valid_in = scoreboard_if.valid && data_ready;
assign scoreboard_if.ready = stg_ready_in && data_ready;
VX_toggle_buffer #(
.DATAW (DATAW)
) toggle_buffer (
.clk (clk),
.reset (reset),
.valid_in (stg_valid_in),
.data_in ({
scoreboard_if.data.uuid,
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd
}),
.ready_in (stg_ready_in),
.valid_out (operands_if.valid),
.data_out ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.wb,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd
}),
.ready_out (operands_if.ready)
);
assign operands_if.data.rs1_data = rs1_data;
assign operands_if.data.rs2_data = rs2_data;
assign operands_if.data.rs3_data = rs3_data;
// GPR banks
reg [RAM_ADDRW-1:0] gpr_rd_addr;
wire [RAM_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if.data.wis, writeback_if.data.rd};
always @(posedge clk) begin
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
end
end else begin
assign gpr_wr_addr = writeback_if.data.rd;
always @(posedge clk) begin
gpr_rd_addr <= gpr_rd_rid_n;
end
end
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`endif
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
`ifdef GPR_RESET
.INIT_ENABLE (1),
.INIT_VALUE (0),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.read (1'b1),
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if.valid && writeback_if.data.tmask[j]),
`else
.write (writeback_if.valid && writeback_if.data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data[j]),
.raddr (gpr_rd_addr),
.rdata (gpr_rd_data[j])
);
end
endmodule

View file

@ -14,33 +14,36 @@
`include "VX_define.vh"
module VX_ibuffer import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
// inputs
VX_decode_if.slave decode_if,
// outputs
VX_ibuffer_if.master ibuffer_if [`NUM_WARPS]
VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
wire [`NUM_WARPS-1:0] ibuf_ready_in;
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_instr_bufs
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (2) // use a 2-cycle FIFO
.OUT_REG (2) // 2-cycle EB for area reduction
) instr_buf (
.clk (clk),
.reset (reset),
.valid_in (decode_if.valid && decode_if.data.wid == i),
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(w)),
.data_in ({
decode_if.data.uuid,
decode_if.data.tmask,
@ -52,15 +55,32 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3}),
.ready_in (ibuf_ready_in[i]),
.valid_out(ibuffer_if[i].valid),
.data_out (ibuffer_if[i].data),
.ready_out(ibuffer_if[i].ready)
decode_if.data.rs3
}),
.ready_in (ibuf_ready_in[w]),
.valid_out(ibuffer_if[w].valid),
.data_out (ibuffer_if[w].data),
.ready_out(ibuffer_if[w].ready)
);
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
assign decode_if.ibuf_pop[w] = ibuffer_if[w].valid && ibuffer_if[w].ready;
`endif
end
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_if_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_if_stall);
end
end
assign perf_stalls = perf_ibf_stalls;
`endif
endmodule

View file

@ -48,9 +48,9 @@ module VX_ipdom_stack #(
empty_r <= 1;
full_r <= 0;
end else begin
`ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
`ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
`ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!"));
`ASSERT(~push || ~full, ("%t: runtime error: writing to a full stack!", $time));
`ASSERT(~pop || ~empty, ("%t: runtime error: reading an empty stack!", $time));
`ASSERT(~push || ~pop, ("%t: runtime error: push and pop in same cycle not supported!", $time));
if (push) begin
rd_ptr <= wr_ptr;
wr_ptr <= wr_ptr + ADDRW'(1);
@ -72,9 +72,10 @@ module VX_ipdom_stack #(
.LUTRAM (OUT_REG ? 0 : 1)
) store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.wren (1'b1),
.waddr (wr_ptr),
.wdata ({q1, q0}),
.raddr (rd_ptr),

View file

@ -12,10 +12,9 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_issue #(
parameter CORE_ID = 0
module VX_issue import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -23,137 +22,80 @@ module VX_issue #(
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.issue perf_issue_if,
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
);
VX_ibuffer_if ibuffer_if [`NUM_WARPS]();
VX_scoreboard_if scoreboard_if [`ISSUE_WIDTH]();
VX_operands_if operands_if [`ISSUE_WIDTH]();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.CORE_ID (CORE_ID)
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.CORE_ID (CORE_ID)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
VX_operands #(
.CORE_ID (CORE_ID)
) operands (
.clk (clk),
.reset (operands_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
VX_dispatch #(
.CORE_ID (CORE_ID)
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
if (CORE_ID == 0) begin
`ifdef SCOPE
wire operands_if_fire = operands_if[0].valid && operands_if[0].ready;
wire operands_if_not_ready = ~operands_if[0].ready;
wire writeback_if_valid = writeback_if[0].valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes({
operands_if[0].data.uuid,
operands_if[0].data.tmask,
operands_if[0].data.ex_type,
operands_if[0].data.op_type,
operands_if[0].data.wb,
operands_if[0].data.rd,
operands_if[0].data.rs1_data,
operands_if[0].data.rs2_data,
operands_if[0].data.rs3_data,
writeback_if[0].data.uuid,
writeback_if[0].data.tmask,
writeback_if[0].data.rd,
writeback_if[0].data.data,
writeback_if[0].data.eop
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({operands_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, operands_if.PC, operands_if.tmask, operands_if.wid, operands_if.ex_type, operands_if.op_type, operands_if.ready, operands_if.valid}),
.probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
`STATIC_ASSERT ((`ISSUE_WIDTH <= `NUM_WARPS), ("invalid parameter"))
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
end
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_issue_perf_units_uses
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_issue_perf_sfu_uses
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
`endif
wire [ISSUE_ISW_W-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
assign decode_if.ready = decode_ready_in[decode_isw];
`SCOPE_IO_SWITCH (`ISSUE_WIDTH);
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : g_issue_slices
VX_decode_if #(
.NUM_WARPS (PER_ISSUE_WARPS)
) per_issue_decode_if();
VX_dispatch_if per_issue_dispatch_if[`NUM_EX_UNITS]();
assign per_issue_decode_if.valid = decode_if.valid && (decode_isw == ISSUE_ISW_W'(issue_id));
assign per_issue_decode_if.data.uuid = decode_if.data.uuid;
assign per_issue_decode_if.data.wid = decode_wis;
assign per_issue_decode_if.data.tmask = decode_if.data.tmask;
assign per_issue_decode_if.data.PC = decode_if.data.PC;
assign per_issue_decode_if.data.ex_type = decode_if.data.ex_type;
assign per_issue_decode_if.data.op_type = decode_if.data.op_type;
assign per_issue_decode_if.data.op_args = decode_if.data.op_args;
assign per_issue_decode_if.data.wb = decode_if.data.wb;
assign per_issue_decode_if.data.rd = decode_if.data.rd;
assign per_issue_decode_if.data.rs1 = decode_if.data.rs1;
assign per_issue_decode_if.data.rs2 = decode_if.data.rs2;
assign per_issue_decode_if.data.rs3 = decode_if.data.rs3;
assign decode_ready_in[issue_id] = per_issue_decode_if.ready;
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop;
`endif
VX_issue_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
.ISSUE_ID (issue_id)
) issue_slice (
`SCOPE_IO_BIND(issue_id)
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.issue_perf (per_issue_perf[issue_id]),
`endif
.decode_if (per_issue_decode_if),
.writeback_if (writeback_if[issue_id]),
.dispatch_if (per_issue_dispatch_if)
);
// Assign transposed dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin : g_dispatch_if
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
end
end
endmodule

View file

@ -0,0 +1,157 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_issue_slice import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter ISSUE_ID = 0
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if,
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
`UNUSED_PARAM (ISSUE_ID)
VX_ibuffer_if ibuffer_if [PER_ISSUE_WARPS]();
VX_scoreboard_if scoreboard_if();
VX_operands_if operands_if();
VX_ibuffer #(
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
) ibuffer (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.ibf_stalls),
`endif
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
) scoreboard (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
.perf_sfu_uses (issue_perf.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
VX_operands #(
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
) operands (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.opd_stalls),
`endif
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
VX_dispatch #(
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
) dispatch (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
`ifdef SCOPE
`ifdef DBG_SCOPE_ISSUE
`SCOPE_IO_SWITCH (1);
wire operands_fire = operands_if.valid && operands_if.ready;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 2, 2, 2, (
`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1
), {
operands_if.valid,
operands_if.ready
}, {
operands_fire,
writeback_if.valid // ack-free
}, {
operands_if.data.uuid,
operands_if.data.tmask,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.wb,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data,
writeback_if.data.uuid,
writeback_if.data.tmask,
writeback_if.data.rd,
writeback_if.data.data,
writeback_if.data.eop
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({decode_if.valid, decode_if.data, decode_if.ready}),
.probe1 ({scoreboard_if.valid, scoreboard_if.data, scoreboard_if.ready}),
.probe2 ({operands_if.valid, operands_if.data, operands_if.ready}),
.probe3 ({writeback_if.valid, writeback_if.data})
);
`endif
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (operands_if.valid && operands_if.ready) begin
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}))
trace_ex_type(1, operands_if.data.ex_type);
`TRACE(1, (", op="))
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS)
`TRACE(1, (", rs2_data="))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS)
`TRACE(1, (", rs3_data="))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS)
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid))
end
end
`endif
endmodule

139
hw/rtl/core/VX_issue_top.sv Normal file
View file

@ -0,0 +1,139 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_issue_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "issue"
) (
// Clock
input wire clk,
input wire reset,
input wire decode_valid,
input wire [`UUID_WIDTH-1:0] decode_uuid,
input wire [`NW_WIDTH-1:0] decode_wid,
input wire [`NUM_THREADS-1:0] decode_tmask,
input wire [`PC_BITS-1:0] decode_PC,
input wire [`EX_BITS-1:0] decode_ex_type,
input wire [`INST_OP_BITS-1:0] decode_op_type,
input op_args_t decode_op_args,
input wire decode_wb,
input wire [`NR_BITS-1:0] decode_rd,
input wire [`NR_BITS-1:0] decode_rs1,
input wire [`NR_BITS-1:0] decode_rs2,
input wire [`NR_BITS-1:0] decode_rs3,
output wire decode_ready,
input wire writeback_valid[`ISSUE_WIDTH],
input wire [`UUID_WIDTH-1:0] writeback_uuid[`ISSUE_WIDTH],
input wire [ISSUE_WIS_W-1:0] writeback_wis[`ISSUE_WIDTH],
input wire [`NUM_THREADS-1:0] writeback_tmask[`ISSUE_WIDTH],
input wire [`PC_BITS-1:0] writeback_PC[`ISSUE_WIDTH],
input wire [`NR_BITS-1:0] writeback_rd[`ISSUE_WIDTH],
input wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data[`ISSUE_WIDTH],
input wire writeback_sop[`ISSUE_WIDTH],
input wire writeback_eop[`ISSUE_WIDTH],
output wire dispatch_valid[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`UUID_WIDTH-1:0] dispatch_uuid[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [ISSUE_WIS_W-1:0] dispatch_wis[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0] dispatch_tmask[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`PC_BITS-1:0] dispatch_PC[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`INST_ALU_BITS-1:0] dispatch_op_type[`NUM_EX_UNITS * `ISSUE_WIDTH],
output op_args_t dispatch_op_args[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire dispatch_wb[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NR_BITS-1:0] dispatch_rd[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NT_WIDTH-1:0] dispatch_tid[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
input wire dispatch_ready[`NUM_EX_UNITS * `ISSUE_WIDTH]
);
VX_decode_if decode_if();
VX_dispatch_if dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
assign decode_if.valid = decode_valid;
assign decode_if.data.uuid = decode_uuid;
assign decode_if.data.wid = decode_wid;
assign decode_if.data.tmask = decode_tmask;
assign decode_if.data.PC = decode_PC;
assign decode_if.data.ex_type = decode_ex_type;
assign decode_if.data.op_type = decode_op_type;
assign decode_if.data.op_args = decode_op_args;
assign decode_if.data.wb = decode_wb;
assign decode_if.data.rd = decode_rd;
assign decode_if.data.rs1 = decode_rs1;
assign decode_if.data.rs2 = decode_rs2;
assign decode_if.data.rs3 = decode_rs3;
assign decode_ready = decode_if.ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback_if
assign writeback_if[i].valid = writeback_valid[i];
assign writeback_if[i].data.uuid = writeback_uuid[i];
assign writeback_if[i].data.wis = writeback_wis[i];
assign writeback_if[i].data.tmask = writeback_tmask[i];
assign writeback_if[i].data.PC = writeback_PC[i];
assign writeback_if[i].data.rd = writeback_rd[i];
assign writeback_if[i].data.data = writeback_data[i];
assign writeback_if[i].data.sop = writeback_sop[i];
assign writeback_if[i].data.eop = writeback_eop[i];
end
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin : g_dispatch_if
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_uuid[i] = dispatch_if[i].data.uuid;
assign dispatch_wis[i] = dispatch_if[i].data.wis;
assign dispatch_tmask[i] = dispatch_if[i].data.tmask;
assign dispatch_PC[i] = dispatch_if[i].data.PC;
assign dispatch_op_type[i] = dispatch_if[i].data.op_type;
assign dispatch_op_args[i] = dispatch_if[i].data.op_args;
assign dispatch_wb[i] = dispatch_if[i].data.wb;
assign dispatch_rd[i] = dispatch_if[i].data.rd;
assign dispatch_tid[i] = dispatch_if[i].data.tid;
assign dispatch_rs1_data[i] = dispatch_if[i].data.rs1_data;
assign dispatch_rs2_data[i] = dispatch_if[i].data.rs2_data;
assign dispatch_rs3_data[i] = dispatch_if[i].data.rs3_data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
`ifdef PERF_ENABLE
issue_perf_t issue_perf = '0;
`endif
`ifdef SCOPE
wire [0:0] scope_reset_w = 1'b0;
wire [0:0] scope_bus_in_w = 1'b0;
wire [0:0] scope_bus_out_w;
`UNUSED_VAR (scope_bus_out_w)
`endif
VX_issue #(
.INSTANCE_ID (INSTANCE_ID)
) issue (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.issue_perf (issue_perf),
`endif
.decode_if (decode_if),
.writeback_if (writeback_if),
.dispatch_if (dispatch_if)
);
endmodule

View file

@ -1,227 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_lmem_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS],
VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
);
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `ADDR_TYPE_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_lsu_if[`NUM_LSU_BLOCKS]();
`RESET_RELAY (req_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
end
wire is_addr_global = | (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask);
wire is_addr_local = | (lsu_mem_in_if[i].req_data.mask & is_addr_local_mask);
wire req_global_ready;
wire req_local_ready;
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (2),
.OUT_REG (1)
) req_global_buf (
.clk (clk),
.reset (req_reset),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_global),
.data_in ({
lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_global_ready),
.valid_out (lsu_mem_out_if[i].req_valid),
.data_out ({
lsu_mem_out_if[i].req_data.mask,
lsu_mem_out_if[i].req_data.rw,
lsu_mem_out_if[i].req_data.byteen,
lsu_mem_out_if[i].req_data.addr,
lsu_mem_out_if[i].req_data.atype,
lsu_mem_out_if[i].req_data.data,
lsu_mem_out_if[i].req_data.tag
}),
.ready_out (lsu_mem_out_if[i].req_ready)
);
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (0),
.OUT_REG (0)
) req_local_buf (
.clk (clk),
.reset (req_reset),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_local),
.data_in ({
lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_local_ready),
.valid_out (lmem_lsu_if[i].req_valid),
.data_out ({
lmem_lsu_if[i].req_data.mask,
lmem_lsu_if[i].req_data.rw,
lmem_lsu_if[i].req_data.byteen,
lmem_lsu_if[i].req_data.addr,
lmem_lsu_if[i].req_data.atype,
lmem_lsu_if[i].req_data.data,
lmem_lsu_if[i].req_data.tag
}),
.ready_out (lmem_lsu_if[i].req_ready)
);
assign lsu_mem_in_if[i].req_ready = (req_global_ready && is_addr_global)
|| (req_local_ready && is_addr_local);
end
`RESET_RELAY (rsp_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
wire rsp_arb_valid;
wire rsp_arb_index;
wire rsp_arb_ready;
VX_generic_arbiter #(
.NUM_REQS (2),
.LOCK_ENABLE (1),
.TYPE ("R")
) arbiter (
.clk (clk),
.reset (rsp_reset),
.requests ({
lmem_lsu_if[i].rsp_valid,
lsu_mem_out_if[i].rsp_valid
}),
.grant_valid (rsp_arb_valid),
.grant_index (rsp_arb_index),
`UNUSED_PIN (grant_onehot),
.grant_unlock(rsp_arb_ready)
);
VX_elastic_buffer #(
.DATAW (RSP_DATAW),
.SIZE (2),
.OUT_REG (0)
) rsp_buf (
.clk (clk),
.reset (rsp_reset),
.valid_in (rsp_arb_valid),
.data_in ({
rsp_arb_index ? lmem_lsu_if[i].rsp_data.mask : lsu_mem_out_if[i].rsp_data.mask,
rsp_arb_index ? lmem_lsu_if[i].rsp_data.data : lsu_mem_out_if[i].rsp_data.data,
rsp_arb_index ? lmem_lsu_if[i].rsp_data.tag : lsu_mem_out_if[i].rsp_data.tag
}),
.ready_in (rsp_arb_ready),
.valid_out (lsu_mem_in_if[i].rsp_valid),
.data_out ({
lsu_mem_in_if[i].rsp_data.mask,
lsu_mem_in_if[i].rsp_data.data,
lsu_mem_in_if[i].rsp_data.tag
}),
.ready_out (lsu_mem_in_if[i].rsp_ready)
);
assign lsu_mem_out_if[i].rsp_ready = rsp_arb_ready && ~rsp_arb_index;
assign lmem_lsu_if[i].rsp_ready = rsp_arb_ready && rsp_arb_index;
end
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
`RESET_RELAY (adapter_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (1)
) lsu_adapter (
.clk (clk),
.reset (adapter_reset),
.lsu_mem_if (lmem_lsu_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
`RESET_RELAY (lmem_reset, reset);
VX_local_mem #(
.INSTANCE_ID($sformatf("core%0d-lmem", CORE_ID)),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH)
) local_mem (
.clk (clk),
.reset (lmem_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.mem_bus_if (lmem_bus_if)
);
endmodule

View file

@ -14,8 +14,7 @@
`include "VX_define.vh"
module VX_lsu_slice import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter BLOCK_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -60,25 +59,25 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire req_is_fence, rsp_is_fence;
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_full_addr
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset);
end
// address type calculation
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_flags
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
assign mem_req_flags[i][`MEM_REQ_FLAG_FLUSH] = req_is_fence;
assign mem_req_flags[i][`MEM_REQ_FLAG_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
`ifdef LMEM_ENABLE
// is local memory address
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
assign mem_req_flags[i][`MEM_REQ_FLAG_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
@ -88,7 +87,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] mem_req_addr;
reg [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
@ -103,8 +102,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
wire no_rsp_buf_valid, no_rsp_buf_ready;
@ -152,46 +149,49 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_addr
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_byteen_w
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_w;
always @(*) begin
mem_req_byteen[i] = '0;
mem_req_byteen_w = '0;
case (`INST_LSU_WSIZE(execute_if.data.op_type))
0: begin // 8-bit
mem_req_byteen[i][req_align[i]] = 1'b1;
mem_req_byteen_w[req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {LSU_WORD_SIZE{1'b1}};
// 3: 64 bit
default : mem_req_byteen_w = {LSU_WORD_SIZE{1'b1}};
endcase
end
assign mem_req_byteen[i] = mem_req_byteen_w;
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_missalign
wire lsu_req_fire = execute_if.valid && execute_if.ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
("%t: misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
$time, execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid))
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_data
always @(*) begin
mem_req_data[i] = execute_if.data.rs2_data[i];
case (req_align[i])
@ -213,7 +213,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
if (PID_BITS != 0) begin
if (PID_BITS != 0) begin : g_pids
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
@ -269,10 +269,10 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("%t: allocator full!", $time))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("%t: oops! broken sop request!", $time))
`UNUSED_VAR (mem_rsp_sop)
end else begin
end else begin : g_no_pids
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
@ -298,7 +298,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0] lsu_mem_req_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_mem_req_flags;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
wire lsu_mem_req_ready;
@ -309,16 +309,14 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
wire lsu_mem_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched%0d", CORE_ID, BLOCK_ID)),
.INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)),
.CORE_REQS (NUM_LANES),
.MEM_CHANNELS(NUM_LANES),
.WORD_SIZE (LSU_WORD_SIZE),
.LINE_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
@ -328,7 +326,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.CORE_OUT_BUF(0)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
.reset (reset),
// Input request
.core_req_valid (mem_req_valid),
@ -336,12 +334,12 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_atype (mem_req_atype),
.core_req_flags (mem_req_flags),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
`UNUSED_PIN (core_req_empty),
`UNUSED_PIN (core_req_sent),
`UNUSED_PIN (core_req_wr_notify),
// Output response
.core_rsp_valid (mem_rsp_valid),
@ -358,7 +356,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.mem_req_mask (lsu_mem_req_mask),
.mem_req_byteen (lsu_mem_req_byteen),
.mem_req_addr (lsu_mem_req_addr),
.mem_req_atype (lsu_mem_req_atype),
.mem_req_flags (lsu_mem_req_flags),
.mem_req_data (lsu_mem_req_data),
.mem_req_tag (lsu_mem_req_tag),
.mem_req_ready (lsu_mem_req_ready),
@ -376,7 +374,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
assign lsu_mem_if.req_data.flags = lsu_mem_req_flags;
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
@ -424,7 +422,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
for (genvar i = 0; i < NUM_LANES; i++) begin : g_rsp_data
`ifdef XLEN_64
wire [63:0] rsp_data64 = mem_rsp_data[i];
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
@ -481,6 +479,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
.valid_out (commit_no_rsp_if.valid),
.ready_out (commit_no_rsp_if.ready)
);
assign commit_no_rsp_if.data.rd = '0;
assign commit_no_rsp_if.data.wb = 1'b0;
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
@ -488,6 +487,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("P"), // prioritize commit_rsp_if
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
@ -504,67 +504,70 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (execute_if.valid && fence_lock) begin
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
`TRACE(1, ("%t: *** %s fence wait\n", $time, INSTANCE_ID))
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
`TRACE(1, ("%t: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES)
`TRACE(1, (", flags="))
`TRACE_ARRAY1D(1, "%b", mem_req_flags, NUM_LANES)
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen))
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES)
`TRACE(1, (", sop=%b, eop=%b, tag=0x%0h (#%0d)\n", execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid));
`TRACE(1, ("%t: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES)
`TRACE(1, (", flags="))
`TRACE_ARRAY1D(1, "%b", mem_req_flags, NUM_LANES)
`TRACE(1, (", byteen=0x%0h, rd=%0d, sop=%b, eop=%b, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
`TRACE(1, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop))
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES)
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid))
end
end
`endif
`ifdef SCOPE
`ifdef DBG_SCOPE_LSU
if (CORE_ID == 0 && BLOCK_ID == 0) begin
`ifdef SCOPE
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (`UUID_WIDTH+NUM_LANES*(`XLEN+4+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes({execute_if.data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
wire [31:0] full_addr_0 = full_addr[0];
wire [31:0] mem_req_data_0 = mem_req_data[0];
wire [31:0] rsp_data_0 = rsp_data[0];
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({mem_req_data_0, execute_if.data.uuid, execute_if.data.wid, execute_if.data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, mem_rsp_mask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
.probe2 ({lsu_mem_if.req_data.data, lsu_mem_if.req_data.tag, lsu_mem_if.req_data.byteen, lsu_mem_if.req_data.addr, lsu_mem_if.req_data.rw, lsu_mem_if.req_ready, lsu_mem_if.req_valid}),
.probe3 ({lsu_mem_if.rsp_data.data, lsu_mem_if.rsp_data.tag, lsu_mem_if.rsp_ready, lsu_mem_if.rsp_valid})
);
`endif
end
`SCOPE_IO_SWITCH (1);
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 3, 4, 2, (
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH
), {
mem_req_valid,
mem_req_ready,
mem_rsp_valid,
mem_rsp_ready
}, {
mem_req_fire,
mem_rsp_fire
}, {
mem_req_rw,
full_addr,
mem_req_byteen,
mem_req_data,
execute_if.data.uuid,
rsp_data,
rsp_uuid
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED()
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({execute_if.valid, execute_if.data, execute_if.ready}),
.probe1 ({lsu_mem_if.req_valid, lsu_mem_if.req_data, lsu_mem_if.req_ready}),
.probe2 ({lsu_mem_if.rsp_valid, lsu_mem_if.rsp_data, lsu_mem_if.rsp_ready})
);
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,8 +14,8 @@
`include "VX_define.vh"
module VX_lsu_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
input wire clk,
@ -24,18 +24,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// Outputs
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS]
);
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES;
`ifdef SCOPE
localparam scope_lsu = 0;
`SCOPE_IO_SWITCH (BLOCK_SIZE);
`endif
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
@ -43,7 +40,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (1)
.OUT_BUF (3)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -55,17 +52,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`RESET_RELAY (block_reset, reset);
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_lsus
VX_lsu_slice #(
.CORE_ID (CORE_ID),
.BLOCK_ID (block_idx)
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx))
) lsu_slice(
`SCOPE_IO_BIND (scope_lsu+block_idx)
`SCOPE_IO_BIND (block_idx)
.clk (clk),
.reset (block_reset),
.reset (reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.lsu_mem_if (lsu_mem_if[block_idx])
@ -82,5 +75,5 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.commit_in_if (per_block_commit_if),
.commit_out_if (commit_if)
);
endmodule

221
hw/rtl/core/VX_mem_unit.sv Normal file
View file

@ -0,0 +1,221 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_mem_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t lmem_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS]
);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
`ifdef LMEM_ENABLE
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_lmem_if[`NUM_LSU_BLOCKS]();
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_switches
VX_lmem_switch #(
.REQ0_OUT_BUF (3),
.REQ1_OUT_BUF (0),
.RSP_OUT_BUF (1),
.ARBITER ("P")
) lmem_switch (
.clk (clk),
.reset (reset),
.lsu_in_if (lsu_mem_if[i]),
.global_out_if(lsu_dcache_if[i]),
.local_out_if (lsu_lmem_if[i])
);
end
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_adapters
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lmem_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (lsu_lmem_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_lmem_bus_if
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
VX_local_mem #(
.INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.lmem_perf (lmem_perf),
`endif
.mem_bus_if (lmem_bus_if)
);
`else
`ifdef PERF_ENABLE
assign lmem_perf = '0;
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin : g_enabled
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) mem_coalescer (
.clk (clk),
.reset (reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_flags (lsu_dcache_if[i].req_data.flags),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if[i].req_valid),
.out_req_mask (dcache_coalesced_if[i].req_data.mask),
.out_req_rw (dcache_coalesced_if[i].req_data.rw),
.out_req_byteen (dcache_coalesced_if[i].req_data.byteen),
.out_req_addr (dcache_coalesced_if[i].req_data.addr),
.out_req_flags (dcache_coalesced_if[i].req_data.flags),
.out_req_data (dcache_coalesced_if[i].req_data.data),
.out_req_tag (dcache_coalesced_if[i].req_data.tag),
.out_req_ready (dcache_coalesced_if[i].req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if[i].rsp_valid),
.out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask),
.out_rsp_data (dcache_coalesced_if[i].rsp_data.data),
.out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if[i].rsp_ready)
);
end
end else begin : g_passthru
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
`ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
end
end
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_adapters
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) dcache_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (dcache_coalesced_if[i]),
.mem_bus_if (dcache_bus_tmp_if)
);
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin : g_dcache_bus_if
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
endmodule

View file

@ -0,0 +1,127 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_mem_unit_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter LSU_WORD_WIDTH = LSU_WORD_SIZE * 8
) (
// Clock
input wire clk,
input wire reset,
// LSU memory request
input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_valid,
input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_rw,
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_req_mask,
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_req_byteen,
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_req_addr,
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_req_flags,
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_req_data,
input wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_req_tag,
output wire [`NUM_LSU_BLOCKS-1:0] lsu_req_ready,
// LSU memory response
output wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_valid,
output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_rsp_mask,
output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_rsp_data,
output wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_rsp_tag,
input wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_ready,
// Memory request
output wire [DCACHE_NUM_REQS-1:0] mem_req_valid,
output wire [DCACHE_NUM_REQS-1:0] mem_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] mem_req_addr,
output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_req_tag,
input wire [DCACHE_NUM_REQS-1:0] mem_req_ready,
// Memory response
input wire [DCACHE_NUM_REQS-1:0] mem_rsp_valid,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_rsp_data,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_rsp_tag,
output wire [DCACHE_NUM_REQS-1:0] mem_rsp_ready
);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_mem_if[`NUM_LSU_BLOCKS]();
// LSU memory request
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_mem_req
assign lsu_mem_if[i].req_valid = lsu_req_valid[i];
assign lsu_mem_if[i].req_data.rw = lsu_req_rw[i];
assign lsu_mem_if[i].req_data.mask = lsu_req_mask[i];
assign lsu_mem_if[i].req_data.byteen = lsu_req_byteen[i];
assign lsu_mem_if[i].req_data.addr = lsu_req_addr[i];
assign lsu_mem_if[i].req_data.flags = lsu_req_flags[i];
assign lsu_mem_if[i].req_data.data = lsu_req_data[i];
assign lsu_mem_if[i].req_data.tag = lsu_req_tag[i];
assign lsu_req_ready[i] = lsu_mem_if[i].req_ready;
end
// LSU memory response
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_rsp
assign lsu_rsp_valid[i] = lsu_mem_if[i].rsp_valid;
assign lsu_rsp_mask[i] = lsu_mem_if[i].rsp_data.mask;
assign lsu_rsp_data[i] = lsu_mem_if[i].rsp_data.data;
assign lsu_rsp_tag[i] = lsu_mem_if[i].rsp_data.tag;
assign lsu_mem_if[i].rsp_ready = lsu_rsp_ready[i];
end
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) mem_bus_if[DCACHE_NUM_REQS]();
// memory request
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_req
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen;
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
assign mem_req_flags[i] = mem_bus_if[i].req_data.flags;
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
assign mem_bus_if[i].req_ready = mem_req_ready[i];
end
// memory response
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_bus_rsp
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
end
`ifdef PERF_ENABLE
cache_perf_t lmem_perf = '0;
`endif
VX_mem_unit #(
.INSTANCE_ID (INSTANCE_ID)
) mem_unit (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.lmem_perf (lmem_perf),
`endif
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (mem_bus_if)
);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,30 +13,289 @@
`include "VX_define.vh"
// reset all GPRs in debug mode
`ifdef SIMULATION
`ifndef NDEBUG
`define GPR_RESET
`endif
`endif
module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = "",
parameter NUM_BANKS = 4,
parameter OUT_BUF = 3
) (
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_scoreboard_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_operands_if.master operands_if [`ISSUE_WIDTH]
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_operands_if.master operands_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_OPDS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
localparam BYTEENW = `NUM_THREADS * XLEN_SIZE;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
`RESET_RELAY (slice_reset, reset);
`UNUSED_VAR (writeback_if.data.sop)
VX_gpr_slice #(
.CORE_ID (CORE_ID)
) gpr_slice (
.clk (clk),
.reset (slice_reset),
.writeback_if (writeback_if[i]),
.scoreboard_if(scoreboard_if[i]),
.operands_if (operands_if[i])
wire [NUM_SRC_OPDS-1:0] src_valid;
wire [NUM_SRC_OPDS-1:0] req_valid_in, req_ready_in;
wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_data_in;
wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st2;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
wire pipe_ready_in;
wire pipe_valid_st1, pipe_ready_st1;
wire pipe_valid_st2, pipe_ready_st2;
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [NUM_SRC_OPDS-1:0][(`NUM_THREADS * `XLEN)-1:0] src_data_st2, src_data_m_st2;
reg [NUM_SRC_OPDS-1:0] data_fetched_st1;
reg has_collision_n;
wire has_collision_st1;
wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds;
assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, scoreboard_if.data.rs1};
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_data_in
if (ISSUE_WIS != 0) begin : g_wis
assign req_data_in[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin : g_no_wis
assign req_data_in[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS];
end
end
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_bank_idx
if (NUM_BANKS != 1) begin : g_multibanks
assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0];
end else begin : g_singlebank
assign req_bank_idx[i] = '0;
end
end
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_src_valid
assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i];
end
assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_OPDS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
.reset (reset),
`UNUSED_PIN(collisions),
.valid_in (req_valid_in),
.data_in (req_data_in),
.sel_in (req_bank_idx),
.ready_in (req_ready_in),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out (gpr_rd_ready)
);
assign gpr_rd_ready = {NUM_BANKS{pipe_ready_in}};
always @(*) begin
has_collision_n = 0;
for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin
for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin
has_collision_n |= src_valid[i]
&& src_valid[j+i]
&& (req_bank_idx[i] == req_bank_idx[j+i]);
end
end
end
wire [NUM_SRC_OPDS-1:0] req_fire_in = req_valid_in & req_ready_in;
assign pipe_data = {
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd,
scoreboard_if.data.uuid
};
assign scoreboard_if.ready = pipe_ready_in && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
VX_pipe_buffer #(
.DATAW (NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH))
) pipe_reg1 (
.clk (clk),
.reset (reset),
.valid_in (scoreboard_if.valid),
.ready_in (pipe_ready_in),
.data_in ({gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}),
.valid_out(pipe_valid_st1),
.ready_out(pipe_ready_st1)
);
always @(posedge clk) begin
if (reset || scoreboard_if.ready) begin
data_fetched_st1 <= 0;
end else begin
data_fetched_st1 <= data_fetched_st1 | req_fire_in;
end
end
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
VX_pipe_buffer #(
.DATAW (NUM_BANKS + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH)
) pipe_reg2 (
.clk (clk),
.reset (reset),
.valid_in (pipe_valid2_st1),
.ready_in (pipe_ready_st1),
.data_in ({gpr_rd_valid_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
.data_out ({gpr_rd_valid_st2, pipe_data_st2, gpr_rd_req_idx_st2}),
.valid_out(pipe_valid_st2),
.ready_out(pipe_ready_st2)
);
always @(*) begin
src_data_m_st2 = src_data_st2;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid_st2[b]) begin
src_data_m_st2[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
end
end
end
always @(posedge clk) begin
if (reset || pipe_fire_st2) begin
src_data_st2 <= 0;
end else begin
src_data_st2 <= src_data_m_st2;
end
end
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (pipe_valid_st2),
.ready_in (pipe_ready_st2),
.data_in ({pipe_data_st2, src_data_m_st2}),
.data_out ({
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.wb,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.uuid,
operands_if.data.rs3_data,
operands_if.data.rs2_data,
operands_if.data.rs1_data
}),
.valid_out (operands_if.valid),
.ready_out (operands_if.ready)
);
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin : g_gpr_wr_addr
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
end else begin : g_gpr_wr_addr_no_wis
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
end
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
if (NUM_BANKS != 1) begin : g_gpr_wr_bank_idx
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
end else begin : g_gpr_wr_bank_idx_0
assign gpr_wr_bank_idx = '0;
end
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rams
wire gpr_wr_enabled;
if (BANK_SEL_BITS != 0) begin : g_gpr_wr_enabled_multibanks
assign gpr_wr_enabled = writeback_if.valid
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
end else begin : g_gpr_wr_enabled
assign gpr_wr_enabled = writeback_if.valid;
end
wire [BYTEENW-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_wren
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
VX_dp_ram #(
.DATAW (REGS_DATAW),
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.OUT_REG (1),
.READ_ENABLE (1),
.WRENW (BYTEENW),
`ifdef GPR_RESET
.RESET_RAM (1),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.reset (reset),
.read (pipe_fire_st1),
.wren (wren),
.write (gpr_wr_enabled),
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr_st1[b]),
.rdata (gpr_rd_data_st2[b])
);
end
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] collisions_r;
always @(posedge clk) begin
if (reset) begin
collisions_r <= '0;
end else begin
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_ready_in && has_collision_n);
end
end
assign perf_stalls = collisions_r;
`endif
endmodule

View file

@ -0,0 +1,92 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_pe_switch import VX_gpu_pkg::*; #(
parameter PE_COUNT = 0,
parameter NUM_LANES = 0,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter `STRING ARBITER = "R",
parameter PE_SEL_BITS = `CLOG2(PE_COUNT)
) (
input wire clk,
input wire reset,
input wire [`UP(PE_SEL_BITS)-1:0] pe_sel,
VX_execute_if.slave execute_in_if,
VX_commit_if.master commit_out_if,
VX_execute_if.master execute_out_if[PE_COUNT],
VX_commit_if .slave commit_in_if[PE_COUNT]
);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam REQ_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `INST_ALU_BITS + $bits(op_args_t) + 1 + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam RSP_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
wire [PE_COUNT-1:0] pe_req_valid;
wire [PE_COUNT-1:0][REQ_DATAW-1:0] pe_req_data;
wire [PE_COUNT-1:0] pe_req_ready;
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_OUTPUTS (PE_COUNT),
.OUT_BUF (REQ_OUT_BUF)
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (pe_sel),
.valid_in (execute_in_if.valid),
.ready_in (execute_in_if.ready),
.data_in (execute_in_if.data),
.data_out (pe_req_data),
.valid_out (pe_req_valid),
.ready_out (pe_req_ready)
);
for (genvar i = 0; i < PE_COUNT; ++i) begin : g_execute_out_if
assign execute_out_if[i].valid = pe_req_valid[i];
assign execute_out_if[i].data = pe_req_data[i];
assign pe_req_ready[i] = execute_out_if[i].ready;
end
///////////////////////////////////////////////////////////////////////////
wire [PE_COUNT-1:0] pe_rsp_valid;
wire [PE_COUNT-1:0][RSP_DATAW-1:0] pe_rsp_data;
wire [PE_COUNT-1:0] pe_rsp_ready;
for (genvar i = 0; i < PE_COUNT; ++i) begin : g_commit_in_if
assign pe_rsp_valid[i] = commit_in_if[i].valid;
assign pe_rsp_data[i] = commit_in_if[i].data;
assign commit_in_if[i].ready = pe_rsp_ready[i];
end
VX_stream_arb #(
.NUM_INPUTS (PE_COUNT),
.DATAW (RSP_DATAW),
.ARBITER (ARBITER),
.OUT_BUF (RSP_OUT_BUF)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (pe_rsp_valid),
.ready_in (pe_rsp_ready),
.data_in (pe_rsp_data),
.data_out (commit_out_if.data),
.valid_out (commit_out_if.valid),
.ready_out (commit_out_if.ready),
`UNUSED_PIN (sel_out)
);
endmodule

View file

@ -1,79 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_pending_instr #(
parameter CTR_WIDTH = 12,
parameter ALM_EMPTY = 1,
parameter DECR_COUNT = 1
) (
input wire clk,
input wire reset,
input wire incr,
input wire [`NW_WIDTH-1:0] incr_wid,
input wire [DECR_COUNT-1:0] decr,
input wire [DECR_COUNT-1:0][`NW_WIDTH-1:0] decr_wid,
input wire [`NW_WIDTH-1:0] alm_empty_wid,
output wire empty,
output wire alm_empty
);
localparam COUNTW = `CLOG2(DECR_COUNT+1);
reg [`NUM_WARPS-1:0][CTR_WIDTH-1:0] pending_instrs;
reg [`NUM_WARPS-1:0][COUNTW-1:0] decr_cnt;
reg [`NUM_WARPS-1:0][DECR_COUNT-1:0] decr_mask;
reg [`NUM_WARPS-1:0] incr_cnt, incr_cnt_n;
reg [`NUM_WARPS-1:0] alm_empty_r, empty_r;
always @(*) begin
incr_cnt_n = 0;
decr_mask = 0;
if (incr) begin
incr_cnt_n[incr_wid] = 1;
end
for (integer i = 0; i < DECR_COUNT; ++i) begin
if (decr[i]) begin
decr_mask[decr_wid[i]][i] = 1;
end
end
end
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
wire [COUNTW-1:0] decr_cnt_n;
`POP_COUNT(decr_cnt_n, decr_mask[i]);
wire [CTR_WIDTH-1:0] pending_instrs_n = pending_instrs[i] + CTR_WIDTH'(incr_cnt[i]) - CTR_WIDTH'(decr_cnt[i]);
always @(posedge clk) begin
if (reset) begin
incr_cnt[i] <= '0;
decr_cnt[i] <= '0;
pending_instrs[i] <= '0;
alm_empty_r[i] <= 0;
empty_r[i] <= 1;
end else begin
incr_cnt[i] <= incr_cnt_n[i];
decr_cnt[i] <= decr_cnt_n;
pending_instrs[i] <= pending_instrs_n;
alm_empty_r[i] <= (pending_instrs_n == ALM_EMPTY);
empty_r[i] <= (pending_instrs_n == 0);
end
end
end
assign alm_empty = alm_empty_r[alm_empty_wid];
assign empty = (& empty_r);
endmodule

View file

@ -14,13 +14,14 @@
`include "VX_define.vh"
module VX_schedule import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.schedule perf_schedule_if,
output sched_perf_t sched_perf,
`endif
// configuration
@ -42,6 +43,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
// status
output wire busy
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (CORE_ID)
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
@ -76,7 +78,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
wire [`NUM_ALU_BLOCKS-1:0][`PC_BITS-1:0] branch_dest;
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin : g_branch_init
assign branch_valid[i] = branch_ctl_if[i].valid;
assign branch_wid[i] = branch_ctl_if[i].wid;
assign branch_taken[i] = branch_ctl_if[i].taken;
@ -187,7 +189,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
// decode unlock
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
if (decode_sched_if.valid && decode_sched_if.unlock) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
@ -287,13 +289,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
// split/join handling
`RESET_RELAY (split_join_reset, reset);
VX_split_join #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID))
) split_join (
.clk (clk),
.reset (split_join_reset),
.reset (reset),
.valid (warp_ctl_if.valid),
.wid (warp_ctl_if.wid),
.split (warp_ctl_if.split),
@ -322,7 +322,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
);
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `PC_BITS)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_schedule_data
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
end
@ -331,61 +331,62 @@ module VX_schedule import VX_gpu_pkg::*; #(
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-5:0]
};
`ifndef NDEBUG
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
`ifdef SV_DPI
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 32'd0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid)));
end
end
wire [`UUID_WIDTH-1:0] instr_uuid;
`ifdef UUID_ENABLE
VX_uuid_gen #(
.CORE_ID (CORE_ID),
.UUID_WIDTH (`UUID_WIDTH)
) uuid_gen (
.clk (clk),
.reset (reset),
.incr (schedule_fire),
.wid (schedule_wid),
.uuid (instr_uuid)
);
`else
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
always @(*) begin
instr_uuid = `UUID_WIDTH'(w_uuid);
end
`endif
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
assign instr_uuid = '0;
`endif
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH)
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH + `UUID_WIDTH),
.SIZE (2), // need to buffer out ready_in
.OUT_REG (1) // should be registered for BRAM acces in fetch unit
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (schedule_valid),
.ready_in (schedule_ready),
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
.data_in ({schedule_tmask, schedule_pc, schedule_wid, instr_uuid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.uuid}),
.valid_out (schedule_if.valid),
.ready_out (schedule_if.ready)
);
assign schedule_if.data.uuid = instr_uuid;
// Track pending instructions per warp
`RESET_RELAY (pending_instr_reset, reset);
wire [`NUM_WARPS-1:0] pending_warp_empty;
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
wire no_pending_instr;
VX_pending_instr #(
.CTR_WIDTH (12),
.DECR_COUNT (`ISSUE_WIDTH),
.ALM_EMPTY (1)
) pending_instr(
.clk (clk),
.reset (pending_instr_reset),
.incr (schedule_if_fire),
.incr_wid (schedule_if.data.wid),
.decr (commit_sched_if.committed),
.decr_wid (commit_sched_if.committed_wid),
.alm_empty_wid (sched_csr_if.alm_empty_wid),
.alm_empty (sched_csr_if.alm_empty),
.empty (no_pending_instr)
);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_sizes
VX_pending_size #(
.SIZE (4096),
.ALM_EMPTY (1)
) counter (
.clk (clk),
.reset (reset),
.incr (schedule_if_fire && (schedule_if.data.wid == `NW_WIDTH'(i))),
.decr (commit_sched_if.committed_warps[i]),
.empty (pending_warp_empty[i]),
.alm_empty (pending_warp_alm_empty[i]),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
assign sched_csr_if.alm_empty = pending_warp_alm_empty[sched_csr_if.alm_empty_wid];
wire no_pending_instr = (& pending_warp_empty);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
@ -402,7 +403,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
timeout_ctr <= '0;
timeout_enable <= 0;
end else begin
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
if (decode_sched_if.valid && decode_sched_if.unlock) begin
timeout_enable <= 1;
end
if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin
@ -412,7 +413,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps))
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** %s timeout: stalled_warps=%b", $time, INSTANCE_ID, stalled_warps))
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
@ -431,8 +432,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
assign perf_schedule_if.sched_idles = perf_sched_idles;
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
assign sched_perf.idles = perf_sched_idles;
assign sched_perf.stalls = perf_sched_stalls;
`endif
endmodule

View file

@ -14,39 +14,39 @@
`include "VX_define.vh"
module VX_scoreboard import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`NUM_WARPS],
VX_scoreboard_if.master scoreboard_if [`ISSUE_WIDTH]
VX_writeback_if.slave writeback_if,
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_OPDS = 3;
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
reg [PER_ISSUE_WARPS-1:0] operands_ready;
`ifdef PERF_ENABLE
reg [`NUM_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`NUM_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`NUM_WARPS-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`NUM_WARPS+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`NUM_WARPS),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_units_reduce (
.data_in (perf_inuse_units_per_cycle),
@ -55,26 +55,32 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`NUM_WARPS),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_inuse_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
);
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(`NUM_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(`NUM_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
always @(posedge clk) begin
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stg_valid_in
assign stg_valid_in[w] = staging_if[w].valid;
end
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
always @(posedge clk) begin : g_perf_stalls
if (reset) begin
perf_scb_stalls <= '0;
perf_stalls <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
perf_stalls <= perf_stalls + `PERF_CTR_BITS'(perf_stall_per_cycle);
end
end
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_units_uses
always @(posedge clk) begin
if (reset) begin
perf_units_uses[i] <= '0;
@ -84,7 +90,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_perf_sfu_uses
always @(posedge clk) begin
if (reset) begin
perf_sfu_uses[i] <= '0;
@ -95,139 +101,75 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
VX_ibuffer_if staging_if [`NUM_WARPS]();
wire [`NUM_WARPS-1:0][3:0] staging_opds_busy;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stanging_bufs
VX_pipe_buffer #(
.DATAW (DATAW)
) stanging_buf (
.clk (clk),
.reset (reset),
.valid_in (ibuffer_if[i].valid),
.data_in (ibuffer_if[i].data),
.ready_in (ibuffer_if[i].ready),
.valid_out(staging_if[i].valid),
.data_out (staging_if[i].data),
.ready_out(staging_if[i].ready)
.valid_in (ibuffer_if[w].valid),
.data_in (ibuffer_if[w].data),
.ready_in (ibuffer_if[w].ready),
.valid_out(staging_if[w].valid),
.data_out (staging_if[w].data),
.ready_out(staging_if[w].ready)
);
end
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
reg [`NUM_REGS-1:0] inuse_regs;
reg [3:0] operands_busy_r, operands_busy_n;
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
localparam iw = i % `ISSUE_WIDTH;
localparam wis = i / `ISSUE_WIDTH;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
wire ibuffer_fire = ibuffer_if[i].valid && ibuffer_if[i].ready;
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
wire staging_fire = staging_if[i].valid && staging_if[i].ready;
wire writeback_fire = writeback_if.valid
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
wire writeback_fire = writeback_if[iw].valid
&& (writeback_if[iw].data.wis == ISSUE_WIS_W'(wis))
&& writeback_if[iw].data.eop;
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin
case (staging_if[i].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
default: sfu_type = `SFU_WCTL;
endcase
end
always @(*) begin
perf_inuse_units_per_cycle[i] = '0;
perf_inuse_sfu_per_cycle[i] = '0;
if (staging_if[i].valid) begin
if (operands_busy_r[0]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rd]] = 1;
if (inuse_units[staging_if[i].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rd]] = 1;
end
end
if (operands_busy_r[1]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs1]] = 1;
if (inuse_units[staging_if[i].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs1]] = 1;
end
end
if (operands_busy_r[2]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs2]] = 1;
if (inuse_units[staging_if[i].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs2]] = 1;
end
end
if (operands_busy_r[3]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs3]] = 1;
if (inuse_units[staging_if[i].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs3]] = 1;
perf_inuse_units_per_cycle[w] = '0;
perf_inuse_sfu_per_cycle[w] = '0;
for (integer i = 0; i < NUM_OPDS; ++i) begin
if (staging_if[w].valid && operands_busy[i]) begin
perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1;
if (inuse_units[stg_opds[i]] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1;
end
end
end
end
assign perf_issue_stalls_per_cycle[i] = staging_if[i].valid && ~staging_if[i].ready;
`endif
always @(*) begin
operands_busy_n = operands_busy_r;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[i].data.rs3],
inuse_regs[ibuffer_if[i].data.rs2],
inuse_regs[ibuffer_if[i].data.rs1],
inuse_regs[ibuffer_if[i].data.rd]
};
end
if (writeback_fire) begin
for (integer i = 0; i < NUM_OPDS; ++i) begin
operands_busy_n[i] = operands_busy[i];
if (ibuffer_fire) begin
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if[iw].data.rd == staging_if[i].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs3) begin
operands_busy_n[3] = 0;
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
end
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 0;
end
end else begin
if (writeback_if.data.rd == stg_opds[i]) begin
operands_busy_n[i] = 0;
end
end
end
end
if (staging_fire && staging_if[i].data.wb) begin
if (staging_if[i].data.rd == ibuffer_if[i].data.rd) begin
operands_busy_n[0] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs1) begin
operands_busy_n[1] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs2) begin
operands_busy_n[2] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs3) begin
operands_busy_n[3] = 1;
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 1;
end
end
end
@ -237,25 +179,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[iw].data.rd] <= 0;
inuse_regs[writeback_if.data.rd] <= 0;
end
if (staging_fire && staging_if[i].data.wb) begin
inuse_regs[staging_if[i].data.rd] <= 1;
if (staging_fire && staging_if[w].data.wb) begin
inuse_regs[staging_if[w].data.rd] <= 1;
end
end
operands_busy_r <= operands_busy_n;
operands_busy <= operands_busy_n;
operands_ready[w] <= ~(| operands_busy_n);
`ifdef PERF_ENABLE
if (staging_fire && staging_if[i].data.wb) begin
inuse_units[staging_if[i].data.rd] <= staging_if[i].data.ex_type;
if (staging_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[staging_if[i].data.rd] <= sfu_type;
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
if (staging_if[w].data.ex_type == `EX_SFU) begin
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
end
end
`endif
end
assign staging_opds_busy[i] = operands_busy_r;
`ifdef SIMULATION
reg [31:0] timeout_ctr;
@ -263,11 +204,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (reset) begin
timeout_ctr <= '0;
end else begin
if (staging_if[i].valid && ~staging_if[i].ready) begin
if (staging_if[w].valid && ~staging_if[w].ready) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy_r, staging_if[i].data.uuid));
`TRACE(3, ("%t: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid))
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_fire) begin
@ -277,59 +218,54 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy_r, staging_if[i].data.uuid));
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid))
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[iw].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, i, {writeback_if[iw].data.PC, 1'b0}, writeback_if[iw].data.tmask, writeback_if[iw].data.rd, writeback_if[iw].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid))
`endif
end
`RESET_RELAY (arb_reset, reset);
wire [PER_ISSUE_WARPS-1:0] arb_valid_in;
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [ISSUE_RATIO-1:0] valid_in;
wire [ISSUE_RATIO-1:0][DATAW-1:0] data_in;
wire [ISSUE_RATIO-1:0] ready_in;
for (genvar j = 0; j < ISSUE_RATIO; ++j) begin
wire operands_ready = ~(| staging_opds_busy[j * `ISSUE_WIDTH + i]);
assign valid_in[j] = staging_if[j * `ISSUE_WIDTH + i].valid && operands_ready;
assign data_in[j] = staging_if[j * `ISSUE_WIDTH + i].data;
assign staging_if[j * `ISSUE_WIDTH + i].ready = ready_in[j] && operands_ready;
end
VX_stream_arb #(
.NUM_INPUTS (ISSUE_RATIO),
.DATAW (DATAW),
.ARBITER ("R"),
.OUT_BUF (2)
) out_arb (
.clk (clk),
.reset (arb_reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.data_out ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_args,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.rd,
scoreboard_if[i].data.rs1,
scoreboard_if[i].data.rs2,
scoreboard_if[i].data.rs3
}),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready),
.sel_out (scoreboard_if[i].data.wis)
);
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_arb_data_in
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
assign arb_data_in[w] = staging_if[w].data;
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
end
VX_stream_arb #(
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("C"),
.OUT_BUF (3)
) out_arb (
.clk (clk),
.reset (reset),
.valid_in (arb_valid_in),
.ready_in (arb_ready_in),
.data_in (arb_data_in),
.data_out ({
scoreboard_if.data.uuid,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.wb,
scoreboard_if.data.rd,
scoreboard_if.data.rs1,
scoreboard_if.data.rs2,
scoreboard_if.data.rs3
}),
.valid_out (scoreboard_if.valid),
.ready_out (scoreboard_if.ready),
.sel_out (scoreboard_if.data.wis)
);
endmodule

View file

@ -14,6 +14,7 @@
`include "VX_define.vh"
module VX_sfu_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
input wire clk,
@ -39,25 +40,26 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_warp_ctl_if.master warp_ctl_if
);
`UNUSED_PARAM (CORE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `PC_BITS + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSRS = 1;
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PE_COUNT = 2;
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
localparam PE_IDX_WCTL = 0;
localparam PE_IDX_CSRS = 1;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (1)
.OUT_BUF (3)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -65,60 +67,58 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.execute_if (per_block_execute_if)
);
wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in;
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
// Warp control block
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) wctl_execute_if();
) pe_execute_if[PE_COUNT]();
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) wctl_commit_if();
) pe_commit_if[PE_COUNT]();
assign wctl_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_WCTL(per_block_execute_if[0].data.op_type);
assign wctl_execute_if.data = per_block_execute_if[0].data;
reg [PE_SEL_BITS-1:0] pe_select;
always @(*) begin
pe_select = PE_IDX_WCTL;
if (`INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type))
pe_select = PE_IDX_CSRS;
end
`RESET_RELAY (wctl_reset, reset);
VX_pe_switch #(
.PE_COUNT (PE_COUNT),
.NUM_LANES (NUM_LANES),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(3)
) pe_switch (
.clk (clk),
.reset (reset),
.pe_sel (pe_select),
.execute_in_if (per_block_execute_if[0]),
.commit_out_if (per_block_commit_if[0]),
.execute_out_if (pe_execute_if),
.commit_in_if (pe_commit_if)
);
VX_wctl_unit #(
.CORE_ID (CORE_ID),
.INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)),
.NUM_LANES (NUM_LANES)
) wctl_unit (
.clk (clk),
.reset (wctl_reset),
.execute_if (wctl_execute_if),
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_WCTL]),
.warp_ctl_if(warp_ctl_if),
.commit_if (wctl_commit_if)
.commit_if (pe_commit_if[PE_IDX_WCTL])
);
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.data;
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
// CSR unit
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) csr_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) csr_commit_if();
assign csr_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type);
assign csr_execute_if.data = per_block_execute_if[0].data;
`RESET_RELAY (csr_reset, reset);
VX_csr_unit #(
.INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)),
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) csr_unit (
.clk (clk),
.reset (csr_reset),
.reset (reset),
.base_dcrs (base_dcrs),
.execute_if (csr_execute_if),
.execute_if (pe_execute_if[PE_IDX_CSRS]),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
@ -131,57 +131,17 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.sched_csr_if (sched_csr_if),
.commit_csr_if (commit_csr_if),
.commit_if (csr_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
// can accept new request?
reg sfu_req_ready;
always @(*) begin
case (per_block_execute_if[0].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready;
endcase
end
assign per_block_execute_if[0].ready = sfu_req_ready;
// response arbitration
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) arb_commit_if[BLOCK_SIZE]();
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("R"),
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),
.data_out (arb_commit_if[0].data),
.valid_out (arb_commit_if[0].valid),
.ready_out (arb_commit_if[0].ready),
`UNUSED_PIN (sel_out)
.commit_if (pe_commit_if[PE_IDX_CSRS])
);
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (1)
.OUT_BUF (3)
) gather_unit (
.clk (clk),
.reset (reset),
.commit_in_if (arb_commit_if),
.clk (clk),
.reset (reset),
.commit_in_if (per_block_commit_if),
.commit_out_if (commit_if)
);

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_split_join import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -31,7 +31,7 @@ module VX_split_join import VX_gpu_pkg::*; #(
input wire [`NW_WIDTH-1:0] stack_wid,
output wire [`DV_STACK_SIZEW-1:0] stack_ptr
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [`DV_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
@ -45,16 +45,14 @@ module VX_split_join import VX_gpu_pkg::*; #(
wire ipdom_push = valid && split.valid && split.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin_is_dvg;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (ipdom_reset, reset);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_ipdom_stacks
VX_ipdom_stack #(
.WIDTH (`NUM_THREADS+`PC_BITS),
.DEPTH (`DV_STACK_SIZE)
.DEPTH (`DV_STACK_SIZE),
.OUT_REG (0)
) ipdom_stack (
.clk (clk),
.reset (ipdom_reset),
.reset (reset),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),

View file

@ -1,387 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TRACE_VH
`define VX_TRACE_VH
`ifdef SIMULATION
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
case (ex_type)
`EX_ALU: `TRACE(level, ("ALU"));
`EX_LSU: `TRACE(level, ("LSU"));
`EX_FPU: `TRACE(level, ("FPU"));
`EX_SFU: `TRACE(level, ("SFU"));
default: `TRACE(level, ("?"));
endcase
endtask
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
case (op_args.alu.xtype)
`ALU_TYPE_ARITH: begin
if (op_args.alu.is_w) begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDIW"));
`INST_ALU_SLL: `TRACE(level, ("SLLIW"));
`INST_ALU_SRL: `TRACE(level, ("SRLIW"));
`INST_ALU_SRA: `TRACE(level, ("SRAIW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDW"));
`INST_ALU_SUB: `TRACE(level, ("SUBW"));
`INST_ALU_SLL: `TRACE(level, ("SLLW"));
`INST_ALU_SRL: `TRACE(level, ("SRLW"));
`INST_ALU_SRA: `TRACE(level, ("SRAW"));
default: `TRACE(level, ("?"));
endcase
end
end else begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDI"));
`INST_ALU_SLL: `TRACE(level, ("SLLI"));
`INST_ALU_SRL: `TRACE(level, ("SRLI"));
`INST_ALU_SRA: `TRACE(level, ("SRAI"));
`INST_ALU_SLT: `TRACE(level, ("SLTI"));
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"));
`INST_ALU_XOR: `TRACE(level, ("XORI"));
`INST_ALU_OR: `TRACE(level, ("ORI"));
`INST_ALU_AND: `TRACE(level, ("ANDI"));
`INST_ALU_LUI: `TRACE(level, ("LUI"));
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADD"));
`INST_ALU_SUB: `TRACE(level, ("SUB"));
`INST_ALU_SLL: `TRACE(level, ("SLL"));
`INST_ALU_SRL: `TRACE(level, ("SRL"));
`INST_ALU_SRA: `TRACE(level, ("SRA"));
`INST_ALU_SLT: `TRACE(level, ("SLT"));
`INST_ALU_SLTU: `TRACE(level, ("SLTU"));
`INST_ALU_XOR: `TRACE(level, ("XOR"));
`INST_ALU_OR: `TRACE(level, ("OR"));
`INST_ALU_AND: `TRACE(level, ("AND"));
`INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"));
`INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"));
default: `TRACE(level, ("?"));
endcase
end
end
end
`ALU_TYPE_BRANCH: begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: `TRACE(level, ("BEQ"));
`INST_BR_NE: `TRACE(level, ("BNE"));
`INST_BR_LT: `TRACE(level, ("BLT"));
`INST_BR_GE: `TRACE(level, ("BGE"));
`INST_BR_LTU: `TRACE(level, ("BLTU"));
`INST_BR_GEU: `TRACE(level, ("BGEU"));
`INST_BR_JAL: `TRACE(level, ("JAL"));
`INST_BR_JALR: `TRACE(level, ("JALR"));
`INST_BR_ECALL: `TRACE(level, ("ECALL"));
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"));
`INST_BR_URET: `TRACE(level, ("URET"));
`INST_BR_SRET: `TRACE(level, ("SRET"));
`INST_BR_MRET: `TRACE(level, ("MRET"));
default: `TRACE(level, ("?"));
endcase
end
`ALU_TYPE_MULDIV: begin
if (op_args.alu.is_w) begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MULW"));
`INST_M_DIV: `TRACE(level, ("DIVW"));
`INST_M_DIVU: `TRACE(level, ("DIVUW"));
`INST_M_REM: `TRACE(level, ("REMW"));
`INST_M_REMU: `TRACE(level, ("REMUW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MUL"));
`INST_M_MULH: `TRACE(level, ("MULH"));
`INST_M_MULHSU:`TRACE(level, ("MULHSU"));
`INST_M_MULHU: `TRACE(level, ("MULHU"));
`INST_M_DIV: `TRACE(level, ("DIV"));
`INST_M_DIVU: `TRACE(level, ("DIVU"));
`INST_M_REM: `TRACE(level, ("REM"));
`INST_M_REMU: `TRACE(level, ("REMU"));
default: `TRACE(level, ("?"));
endcase
end
end
default: `TRACE(level, ("?"));
endcase
end
`EX_LSU: begin
if (op_args.lsu.is_float) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LW: `TRACE(level, ("FLW"));
`INST_LSU_LD: `TRACE(level, ("FLD"));
`INST_LSU_SW: `TRACE(level, ("FSW"));
`INST_LSU_SD: `TRACE(level, ("FSD"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LB: `TRACE(level, ("LB"));
`INST_LSU_LH: `TRACE(level, ("LH"));
`INST_LSU_LW: `TRACE(level, ("LW"));
`INST_LSU_LD: `TRACE(level, ("LD"));
`INST_LSU_LBU:`TRACE(level, ("LBU"));
`INST_LSU_LHU:`TRACE(level, ("LHU"));
`INST_LSU_LWU:`TRACE(level, ("LWU"));
`INST_LSU_SB: `TRACE(level, ("SB"));
`INST_LSU_SH: `TRACE(level, ("SH"));
`INST_LSU_SW: `TRACE(level, ("SW"));
`INST_LSU_SD: `TRACE(level, ("SD"));
`INST_LSU_FENCE:`TRACE(level,("FENCE"));
default: `TRACE(level, ("?"));
endcase
end
end
`EX_FPU: begin
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FADD.D"));
else
`TRACE(level, ("FADD.S"));
end
`INST_FPU_SUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FSUB.D"));
else
`TRACE(level, ("FSUB.S"));
end
`INST_FPU_MUL: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMUL.D"));
else
`TRACE(level, ("FMUL.S"));
end
`INST_FPU_DIV: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FDIV.D"));
else
`TRACE(level, ("FDIV.S"));
end
`INST_FPU_SQRT: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FSQRT.D"));
else
`TRACE(level, ("FSQRT.S"));
end
`INST_FPU_MADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMADD.D"));
else
`TRACE(level, ("FMADD.S"));
end
`INST_FPU_MSUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMSUB.D"));
else
`TRACE(level, ("FMSUB.S"));
end
`INST_FPU_NMADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FNMADD.D"));
else
`TRACE(level, ("FNMADD.S"));
end
`INST_FPU_NMSUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FNMSUB.D"));
else
`TRACE(level, ("FNMSUB.S"));
end
`INST_FPU_CMP: begin
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.D"));
1: `TRACE(level, ("FLT.D"));
2: `TRACE(level, ("FEQ.D"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.S"));
1: `TRACE(level, ("FLT.S"));
2: `TRACE(level, ("FEQ.S"));
default: `TRACE(level, ("?"));
endcase
end
end
`INST_FPU_F2F: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FCVT.D.S"));
end else begin
`TRACE(level, ("FCVT.S.D"));
end
end
`INST_FPU_F2I: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.D"));
end else begin
`TRACE(level, ("FCVT.W.D"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.S"));
end else begin
`TRACE(level, ("FCVT.W.S"));
end
end
end
`INST_FPU_F2U: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.D"));
end else begin
`TRACE(level, ("FCVT.WU.D"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.S"));
end else begin
`TRACE(level, ("FCVT.WU.S"));
end
end
end
`INST_FPU_I2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.L"));
end else begin
`TRACE(level, ("FCVT.D.W"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.L"));
end else begin
`TRACE(level, ("FCVT.S.W"));
end
end
end
`INST_FPU_U2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.LU"));
end else begin
`TRACE(level, ("FCVT.D.WU"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.LU"));
end else begin
`TRACE(level, ("FCVT.S.WU"));
end
end
end
`INST_FPU_MISC: begin
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.D"));
1: `TRACE(level, ("FSGNJN.D"));
2: `TRACE(level, ("FSGNJX.D"));
3: `TRACE(level, ("FCLASS.D"));
4: `TRACE(level, ("FMV.X.D"));
5: `TRACE(level, ("FMV.D.X"));
6: `TRACE(level, ("FMIN.D"));
7: `TRACE(level, ("FMAX.D"));
endcase
end else begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.S"));
1: `TRACE(level, ("FSGNJN.S"));
2: `TRACE(level, ("FSGNJX.S"));
3: `TRACE(level, ("FCLASS.S"));
4: `TRACE(level, ("FMV.X.S"));
5: `TRACE(level, ("FMV.S.X"));
6: `TRACE(level, ("FMIN.S"));
7: `TRACE(level, ("FMAX.S"));
endcase
end
end
default: `TRACE(level, ("?"));
endcase
end
`EX_SFU: begin
case (`INST_SFU_BITS'(op_type))
`INST_SFU_TMC: `TRACE(level, ("TMC"));
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
`INST_SFU_SPLIT: begin if (op_args.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
`INST_SFU_BAR: `TRACE(level, ("BAR"));
`INST_SFU_PRED: begin if (op_args.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
`INST_SFU_CSRRW: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
`INST_SFU_CSRRS: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
`INST_SFU_CSRRC: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
default: `TRACE(level, ("?"));
endcase
end
default: `TRACE(level, ("?"));
endcase
endtask
task trace_op_args(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm));
end
`EX_LSU: begin
`TRACE(level, (", offset=0x%0h", op_args.lsu.offset));
end
`EX_FPU: begin
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm));
end
`EX_SFU: begin
if (`INST_SFU_IS_CSR(op_type)) begin
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm));
end
end
default:;
endcase
endtask
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
case (addr)
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"));
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"));
`VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0"));
`VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1"));
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"));
default: `TRACE(level, ("?"));
endcase
endtask
`endif
`endif // VX_TRACE_VH

View file

@ -0,0 +1,44 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_uuid_gen import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter UUID_WIDTH = 48
) (
input wire clk,
input wire reset,
input wire incr,
input wire [`NW_WIDTH-1:0] wid,
output wire [UUID_WIDTH-1:0] uuid
);
localparam GNW_WIDTH = UUID_WIDTH - 32;
reg [31:0] uuid_cntrs [0:`NUM_WARPS-1];
reg [`NUM_WARPS-1:0] has_uuid_cntrs;
always @(posedge clk) begin
if (reset) begin
has_uuid_cntrs <= '0;
end else if (incr) begin
has_uuid_cntrs[wid] <= 1;
end
if (incr) begin
uuid_cntrs[wid] <= has_uuid_cntrs[wid] ? (uuid_cntrs[wid] + 1) : 1;
end
end
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid);
assign uuid = {g_wid, (has_uuid_cntrs[wid] ? uuid_cntrs[wid] : 0)};
endmodule

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_wctl_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1
) (
input wire clk,
@ -27,7 +27,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
@ -50,9 +50,9 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin
if (LANE_BITS != 0) begin : g_tid
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
end else begin : g_no_tid
assign tid = 0;
end
@ -63,7 +63,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire not_pred = execute_if.data.op_args.wctl.is_neg;
wire [NUM_LANES-1:0] taken;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_taken
assign taken[i] = (execute_if.data.rs1_data[i][0] ^ not_pred);
end
@ -131,7 +131,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
// wspawn
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_wspawn_wmask
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
end
assign wspawn.valid = is_wspawn;
@ -162,7 +162,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign warp_ctl_if.sjoin = sjoin_r;
assign warp_ctl_if.barrier = barrier_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit_if
assign commit_if.data.data[i] = `XLEN'(dvstack_ptr);
end

View file

@ -1,17 +1,17 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Modified port of cast module from fpnew Libray
// Modified port of cast module from fpnew Libray
// reference: https://github.com/pulp-platform/fpnew
`include "VX_fpu_define.vh"
@ -22,7 +22,8 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
parameter LATENCY = 1,
parameter INT_WIDTH = 32,
parameter MAN_BITS = 23,
parameter EXP_BITS = 8
parameter EXP_BITS = 8,
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
@ -35,10 +36,10 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
input wire is_signed,
input wire [31:0] dataa,
output wire [31:0] result,
output wire [31:0] result,
output wire [`FP_FLAGS_BITS-1:0] fflags
);
);
// Constants
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
@ -55,11 +56,11 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
localparam FMT_SHIFT_COMPENSATION = S_MAN_WIDTH - 1 - MAN_BITS;
localparam NUM_FP_STICKY = 2 * S_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
localparam NUM_INT_STICKY = 2 * S_MAN_WIDTH - INT_WIDTH; // removed int and R
// Input processing
fclass_t fclass;
VX_fp_classifier #(
fclass_t fclass;
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_classifier (
@ -69,9 +70,9 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
);
wire [S_MAN_WIDTH-1:0] input_mant;
wire [S_EXP_WIDTH-1:0] input_exp;
wire [S_EXP_WIDTH-1:0] input_exp;
wire input_sign;
wire i2f_sign = dataa[INT_WIDTH-1];
wire f2i_sign = dataa[INT_WIDTH-1] && is_signed;
wire [S_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa) : dataa;
@ -81,7 +82,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
assign input_sign = is_itof ? f2i_sign : i2f_sign;
// Pipeline stage0
wire is_itof_s0;
wire is_signed_s0;
wire [2:0] rnd_mode_s0;
@ -92,7 +93,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH),
.DEPTH (LATENCY > 2)
.DEPTH (LATENCY > 1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
@ -100,7 +101,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_in ({is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
);
// Normalization
wire [LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
@ -113,12 +114,12 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_out (renorm_shamt_s0),
.valid_out (mant_is_nonzero_s0)
);
wire mant_is_zero_s0 = ~mant_is_nonzero_s0;
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
wire [S_EXP_WIDTH-1:0] input_exp_n_s0; // unbiased true exponent
// Realign input mantissa, append zeroes if destination is wider
assign input_mant_n_s0 = encoded_mant_s0 << renorm_shamt_s0;
@ -140,7 +141,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH),
.DEPTH (LATENCY > 1)
.DEPTH (LATENCY > 2)
) pipe_reg1 (
.clk (clk),
.reset (reset),
@ -169,30 +170,30 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
wire of_before_round_s1 = overflow;
// Pipeline stage2
wire is_itof_s2;
wire is_signed_s2;
wire [2:0] rnd_mode_s2;
fclass_t fclass_s2;
fclass_t fclass_s2;
wire mant_is_zero_s2;
wire input_sign_s2;
wire [2*S_MAN_WIDTH:0] destination_mant_s2;
wire [EXP_BITS-1:0] final_exp_s2;
wire of_before_round_s2;
VX_pipe_register #(
.DATAW (1 + 1 + `INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1),
.DEPTH (LATENCY > 3)
.DEPTH (LATENCY > 0)
) pipe_reg2 (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
);
);
// Rouding and classification
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
@ -237,20 +238,20 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
wire is_itof_s3;
wire is_signed_s3;
fclass_t fclass_s3;
fclass_t fclass_s3;
wire mant_is_zero_s3;
wire input_sign_s3;
wire rounded_sign_s3;
wire [INT_WIDTH-1:0] rounded_abs_s3;
wire of_before_round_s3;
wire of_before_round_s3;
wire f2i_round_has_sticky_s3;
wire i2f_round_has_sticky_s3;
`UNUSED_VAR (fclass_s3)
`UNUSED_VAR (fclass_s3)
VX_pipe_register #(
.DATAW (1 + 1 + $bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1),
.DEPTH (LATENCY > 4)
.DEPTH (LATENCY > 3)
) pipe_reg3 (
.clk (clk),
.reset (reset),
@ -258,7 +259,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_in ({is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
.data_out ({is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
);
// Assemble regular result, nan box short ones. Int zeroes need to be detected
wire [INT_WIDTH-1:0] fmt_result_s3 = mant_is_zero_s3 ? 0 : {rounded_sign_s3, rounded_abs_s3[EXP_BITS+MAN_BITS-1:0]};
@ -278,18 +279,18 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
f2i_special_result_s3[INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
f2i_special_result_s3[INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
end
end
end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
wire f2i_result_is_special_s3 = fclass_s3.is_nan
wire f2i_result_is_special_s3 = fclass_s3.is_nan
| fclass_s3.is_inf
| of_before_round_s3
| (input_sign_s3 & ~is_signed_s3 & ~rounded_int_res_zero_s3);
fflags_t f2i_special_status_s3;
fflags_t i2f_status_s3, f2i_status_s3;
fflags_t tmp_fflags_s3;
// All integer special cases are invalid
assign f2i_special_status_s3 = {1'b1, 4'h0};
@ -306,7 +307,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (32 + `FP_FLAGS_BITS),
.DEPTH (LATENCY > 0)
.DEPTH (OUT_REG)
) pipe_reg4 (
.clk (clk),
.reset (reset),

View file

@ -1,17 +1,17 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Modified port of noncomp module from fpnew Libray
// Modified port of noncomp module from fpnew Libray
// reference: https://github.com/pulp-platform/fpnew
`include "VX_fpu_define.vh"
@ -19,9 +19,10 @@
`ifdef FPU_DSP
module VX_fncp_unit import VX_fpu_pkg::*; #(
parameter LATENCY = 2,
parameter LATENCY = 1,
parameter EXP_BITS = 8,
parameter MAN_BITS = 23
parameter MAN_BITS = 23,
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
@ -33,10 +34,10 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
input wire [31:0] dataa,
input wire [31:0] datab,
output wire [31:0] result,
output wire [31:0] result,
output wire [`FP_FLAGS_BITS-1:0] fflags
);
);
localparam NEG_INF = 32'h00000001,
NEG_NORM = 32'h00000002,
NEG_SUBNORM = 32'h00000004,
@ -55,15 +56,15 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
wire a_smaller, ab_equal;
// Setup
assign a_sign = dataa[31];
assign a_sign = dataa[31];
assign a_exponent = dataa[30:23];
assign a_mantissa = dataa[22:0];
assign b_sign = datab[31];
assign b_sign = datab[31];
assign b_exponent = datab[30:23];
assign b_mantissa = datab[22:0];
VX_fp_classifier #(
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_a (
@ -72,7 +73,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
.clss_o (a_fclass)
);
VX_fp_classifier #(
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_b (
@ -82,7 +83,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
);
assign a_smaller = (dataa < datab) ^ (a_sign || b_sign);
assign ab_equal = (dataa == datab)
assign ab_equal = (dataa == datab)
|| (a_fclass.is_zero && b_fclass.is_zero); // +0 == -0
// Pipeline stage0
@ -101,54 +102,54 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (4 + 2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1),
.DEPTH (LATENCY > 1)
.DEPTH (LATENCY > 0)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({op_mod, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}),
.data_out ({op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
);
);
// FCLASS
reg [31:0] fclass_mask_s0; // generate a 10-bit mask for integer reg
always @(*) begin
always @(*) begin
if (a_fclass_s0.is_normal) begin
fclass_mask_s0 = a_sign_s0 ? NEG_NORM : POS_NORM;
end
end
else if (a_fclass_s0.is_inf) begin
fclass_mask_s0 = a_sign_s0 ? NEG_INF : POS_INF;
end
end
else if (a_fclass_s0.is_zero) begin
fclass_mask_s0 = a_sign_s0 ? NEG_ZERO : POS_ZERO;
end
end
else if (a_fclass_s0.is_subnormal) begin
fclass_mask_s0 = a_sign_s0 ? NEG_SUBNORM : POS_SUBNORM;
end
end
else if (a_fclass_s0.is_nan) begin
fclass_mask_s0 = {22'h0, a_fclass_s0.is_quiet, a_fclass_s0.is_signaling, 8'h0};
end
else begin
end
else begin
fclass_mask_s0 = QUT_NAN;
end
end
// Min/Max
// Min/Max
reg [31:0] fminmax_res_s0;
always @(*) begin
if (a_fclass_s0.is_nan && b_fclass_s0.is_nan)
fminmax_res_s0 = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
else if (a_fclass_s0.is_nan)
else if (a_fclass_s0.is_nan)
fminmax_res_s0 = datab_s0;
else if (b_fclass_s0.is_nan)
else if (b_fclass_s0.is_nan)
fminmax_res_s0 = dataa_s0;
else begin
else begin
// FMIN, FMAX
fminmax_res_s0 = (op_mod_s0[0] ^ a_smaller_s0) ? dataa_s0 : datab_s0;
end
end
// Sign injection
// Sign injection
reg [31:0] fsgnj_res_s0; // result of sign injection
always @(*) begin
case (op_mod_s0[1:0])
@ -158,12 +159,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
endcase
end
// Comparison
// Comparison
reg fcmp_res_s0; // result of comparison
reg fcmp_fflags_NV_s0; // comparison fflags
always @(*) begin
case (op_mod_s0[1:0])
0: begin // LE
0: begin // LE
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
fcmp_res_s0 = 0;
fcmp_fflags_NV_s0 = 1;
@ -179,12 +180,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
end else begin
fcmp_res_s0 = (a_smaller_s0 & ~ab_equal_s0);
fcmp_fflags_NV_s0 = 0;
end
end
end
2: begin // EQ
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
fcmp_res_s0 = 0;
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
end else begin
fcmp_res_s0 = ab_equal_s0;
fcmp_fflags_NV_s0 = 0;
@ -192,7 +193,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
end
default: begin
fcmp_res_s0 = 'x;
fcmp_fflags_NV_s0 = 'x;
fcmp_fflags_NV_s0 = 'x;
end
endcase
end
@ -216,7 +217,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
// FMV
result_s0 = dataa_s0;
fflags_NV_s0 = 0;
end
end
6,7: begin
// MIN/MAX
result_s0 = fminmax_res_s0;
@ -229,7 +230,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (32 + 1),
.DEPTH (LATENCY > 0)
.DEPTH (OUT_REG)
) pipe_reg1 (
.clk (clk),
.reset (reset),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
output wire ready_in,
input wire valid_in,
@ -36,7 +36,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire is_signed,
input wire [NUM_LANES-1:0][31:0] dataa,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -45,56 +45,69 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
`UNUSED_VAR (frm)
);
localparam DATAW = 32 + `INST_FRM_BITS + 1 + 1;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][31:0] pe_data_in;
wire pe_enable;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: `INST_FRM_BITS] = frm;
assign data_in[i][32 + `INST_FRM_BITS +: 1] = is_itof;
assign data_in[i][32 + `INST_FRM_BITS + 1 +: 1] = is_signed;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FCVT),
.DATA_IN_WIDTH(32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (2)
) pe_serializer (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.data_in (dataa),
.data_in (data_in),
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fcvt_units
VX_fcvt_unit #(
.LATENCY (`LATENCY_FCVT)
.LATENCY (`LATENCY_FCVT),
.OUT_REG (1)
) fcvt_unit (
.clk (clk),
.reset (reset),
.enable (pe_enable),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.frm (pe_data_in[0][32 +: `INST_FRM_BITS]),
.is_itof (pe_data_in[0][32 + `INST_FRM_BITS +: 1]),
.is_signed (pe_data_in[0][32 + `INST_FRM_BITS + 1 +: 1]),
.dataa (pe_data_in[i][0 +: 32]),
.result (pe_data_out[i][0 +: 32]),
.fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS])

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
input wire valid_in,
output wire ready_in,
@ -31,10 +31,10 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
input wire [TAG_WIDTH-1:0] tag_in,
input wire [`INST_FRM_BITS-1:0] frm,
input wire [NUM_LANES-1:0][31:0] dataa,
input wire [NUM_LANES-1:0][31:0] datab,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -44,30 +44,33 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
output wire valid_out,
input wire ready_out
);
`UNUSED_VAR (frm)
localparam DATAW = 2 * 32 + `INST_FRM_BITS;
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
wire pe_enable;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
assign data_in[i][64 +: `INST_FRM_BITS] = frm;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FDIV),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (2)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -76,15 +79,17 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
@ -92,8 +97,8 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
fflags_t [NUM_LANES-1:0] per_lane_fflags;
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
acl_fdiv fdiv (
.clk (clk),
.areset (1'b0),
@ -103,15 +108,15 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
.q (pe_data_out[i][0 +: 32])
);
assign pe_data_out[i][32 +: `FP_FLAGS_BITS] = 'x;
end
end
assign has_fflags = 0;
assign per_lane_fflags = 'x;
`UNUSED_VAR (fflags_out)
`elsif VIVADO
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
wire [3:0] tuser;
xil_fdiv fdiv (
.aclk (clk),
@ -131,21 +136,21 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
assign has_fflags = 1;
assign per_lane_fflags = fflags_out;
`else
`else
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
reg [63:0] r;
`UNUSED_VAR (r)
`UNUSED_VAR (r)
fflags_t f;
always @(*) begin
always @(*) begin
dpi_fdiv (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
frm,
r,
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
pe_data_in[0][64 +: `INST_FRM_BITS], // frm
r,
f
);
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,7 +15,7 @@
`ifdef FPU_DPI
module VX_fpu_dpi import VX_fpu_pkg::*; #(
module VX_fpu_dpi import VX_fpu_pkg::*; #(
parameter NUM_LANES = 1,
parameter TAG_WIDTH = 1,
parameter OUT_BUF = 0
@ -29,7 +29,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0] mask_in,
input wire [TAG_WIDTH-1:0] tag_in,
input wire [`INST_FPU_BITS-1:0] op_type,
input wire [`INST_FMT_BITS-1:0] fmt,
input wire [`INST_FRM_BITS-1:0] frm,
@ -37,7 +37,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0][`XLEN-1:0] dataa,
input wire [NUM_LANES-1:0][`XLEN-1:0] datab,
input wire [NUM_LANES-1:0][`XLEN-1:0] datac,
output wire [NUM_LANES-1:0][`XLEN-1:0] result,
output wire [NUM_LANES-1:0][`XLEN-1:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -55,31 +55,30 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
localparam FPC_BITS = `LOG2UP(NUM_FPC);
localparam RSP_DATAW = (NUM_LANES * `XLEN) + 1 + $bits(fflags_t) + TAG_WIDTH;
wire [NUM_FPC-1:0] per_core_ready_in;
wire [NUM_FPC-1:0][NUM_LANES-1:0][`XLEN-1:0] per_core_result;
wire [NUM_FPC-1:0][TAG_WIDTH-1:0] per_core_tag_out;
reg [NUM_FPC-1:0] per_core_ready_out;
wire [NUM_FPC-1:0] per_core_valid_out;
wire [NUM_FPC-1:0] per_core_has_fflags;
fflags_t [NUM_FPC-1:0] per_core_fflags;
wire [NUM_FPC-1:0] per_core_valid_out;
wire [NUM_FPC-1:0] per_core_has_fflags;
fflags_t [NUM_FPC-1:0] per_core_fflags;
wire div_ready_in, sqrt_ready_in;
wire [NUM_LANES-1:0][`XLEN-1:0] div_result, sqrt_result;
wire [TAG_WIDTH-1:0] div_tag_out, sqrt_tag_out;
wire div_ready_out, sqrt_ready_out;
wire div_valid_out, sqrt_valid_out;
wire div_has_fflags, sqrt_has_fflags;
wire div_valid_out, sqrt_valid_out;
wire div_has_fflags, sqrt_has_fflags;
fflags_t div_fflags, sqrt_fflags;
reg [FPC_BITS-1:0] core_select;
reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;
reg dst_fmt, int_fmt;
reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;
reg [NUM_LANES-1:0][63:0] operands [3];
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
operands[0][i] = 64'(dataa[i]);
@ -88,43 +87,30 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
end
`UNUSED_VAR (fmt)
wire f_fmt = fmt[0];
wire i_fmt = fmt[1];
always @(*) begin
is_fadd = 0;
is_fsub = 0;
is_fmul = 0;
is_fsub = 0;
is_fmul = 0;
is_fmadd = 0;
is_fmsub = 0;
is_fnmadd = 0;
is_fnmsub = 0;
is_div = 0;
is_fnmadd = 0;
is_fnmsub = 0;
is_div = 0;
is_fcmp = 0;
is_itof = 0;
is_utof = 0;
is_ftoi = 0;
is_ftou = 0;
is_f2f = 0;
dst_fmt = 0;
int_fmt = 0;
`ifdef FLEN_64
dst_fmt = fmt[0];
`endif
`ifdef XLEN_64
int_fmt = fmt[1];
`endif
case (op_type)
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end
`INST_FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = ~i_fmt; is_fsub = i_fmt; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = ~i_fmt; is_fmsub = i_fmt; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = ~i_fmt; is_fnmsub = i_fmt; end
`INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
@ -132,23 +118,23 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
`INST_FPU_F2U: begin core_select = FPU_CVT; is_ftou = 1; end
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; end
`INST_FPU_U2F: begin core_select = FPU_CVT; is_utof = 1; end
`INST_FPU_F2F: begin core_select = FPU_CVT; is_f2f = 1; end
`INST_FPU_F2F: begin core_select = FPU_CVT; is_f2f = 1; end
default: begin core_select = FPU_NCP; end
endcase
end
generate
begin : fma
generate
begin : g_fma
reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
wire [NUM_LANES-1:0][63:0] result_fadd;
wire [NUM_LANES-1:0][63:0] result_fsub;
wire [NUM_LANES-1:0][63:0] result_fmul;
wire [NUM_LANES-1:0][63:0] result_fmadd;
wire [NUM_LANES-1:0][63:0] result_fmsub;
wire [NUM_LANES-1:0][63:0] result_fnmadd;
wire [NUM_LANES-1:0][63:0] result_fnmsub;
reg [NUM_LANES-1:0][63:0] result_fadd;
reg [NUM_LANES-1:0][63:0] result_fsub;
reg [NUM_LANES-1:0][63:0] result_fmul;
reg [NUM_LANES-1:0][63:0] result_fmadd;
reg [NUM_LANES-1:0][63:0] result_fmsub;
reg [NUM_LANES-1:0][63:0] result_fnmadd;
reg [NUM_LANES-1:0][63:0] result_fnmsub;
fflags_t [NUM_LANES-1:0] fflags_fma;
fflags_t [NUM_LANES-1:0] fflags_fadd;
fflags_t [NUM_LANES-1:0] fflags_fsub;
@ -162,33 +148,33 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
wire fma_ready = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
wire fma_fire = fma_valid && fma_ready;
always @(*) begin
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
dpi_fsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
dpi_fmul (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
dpi_fmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
dpi_fmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
dpi_fadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
dpi_fsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
dpi_fmul (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
dpi_fmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
dpi_fmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
dpi_fnmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
dpi_fnmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
result_fma[i] = is_fadd ? result_fadd[i][`XLEN-1:0] :
is_fsub ? result_fsub[i][`XLEN-1:0] :
is_fmul ? result_fmul[i][`XLEN-1:0] :
is_fmadd ? result_fmadd[i][`XLEN-1:0] :
is_fmadd ? result_fmadd[i][`XLEN-1:0] :
is_fmsub ? result_fmsub[i][`XLEN-1:0] :
is_fnmadd ? result_fnmadd[i][`XLEN-1:0] :
is_fnmadd ? result_fnmadd[i][`XLEN-1:0] :
is_fnmsub ? result_fnmsub[i][`XLEN-1:0] :
'0;
fflags_fma[i] = is_fadd ? fflags_fadd[i] :
is_fsub ? fflags_fsub[i] :
is_fmul ? fflags_fmul[i] :
is_fmadd ? fflags_fmadd[i] :
is_fmadd ? fflags_fmadd[i] :
is_fmsub ? fflags_fmsub[i] :
is_fnmadd ? fflags_fnmadd[i] :
is_fnmsub ? fflags_fnmsub[i] :
'0;
is_fnmadd ? fflags_fnmadd[i] :
is_fnmsub ? fflags_fnmsub[i] :
'0;
end
end
@ -213,20 +199,20 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
endgenerate
generate
begin : fdiv
generate
begin : g_fdiv
reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
wire [NUM_LANES-1:0][63:0] result_fdiv;
reg [NUM_LANES-1:0][63:0] result_fdiv;
fflags_t [NUM_LANES-1:0] fflags_fdiv;
wire fdiv_valid = (valid_in && core_select == FPU_DIVSQRT) && is_div;
wire fdiv_ready = div_ready_out || ~div_valid_out;
wire fdiv_fire = fdiv_valid && fdiv_ready;
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fdiv (fdiv_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0];
end
end
@ -252,20 +238,20 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
endgenerate
generate
begin : fsqrt
generate
begin : g_fsqrt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
wire [NUM_LANES-1:0][63:0] result_fsqrt;
reg [NUM_LANES-1:0][63:0] result_fsqrt;
fflags_t [NUM_LANES-1:0] fflags_fsqrt;
wire fsqrt_valid = (valid_in && core_select == FPU_DIVSQRT) && ~is_div;
wire fsqrt_ready = sqrt_ready_out || ~sqrt_valid_out;
wire fsqrt_ready = sqrt_ready_out || ~sqrt_valid_out;
wire fsqrt_fire = fsqrt_valid && fsqrt_ready;
always @(*) begin
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
dpi_fsqrt (fsqrt_fire, int'(f_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0];
end
end
@ -292,15 +278,15 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : fcvt
begin : g_fcvt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
wire [NUM_LANES-1:0][63:0] result_itof;
wire [NUM_LANES-1:0][63:0] result_utof;
wire [NUM_LANES-1:0][63:0] result_ftoi;
wire [NUM_LANES-1:0][63:0] result_ftou;
wire [NUM_LANES-1:0][63:0] result_f2f;
reg [NUM_LANES-1:0][63:0] result_itof;
reg [NUM_LANES-1:0][63:0] result_utof;
reg [NUM_LANES-1:0][63:0] result_ftoi;
reg [NUM_LANES-1:0][63:0] result_ftou;
reg [NUM_LANES-1:0][63:0] result_f2f;
fflags_t [NUM_LANES-1:0] fflags_fcvt;
fflags_t [NUM_LANES-1:0] fflags_itof;
fflags_t [NUM_LANES-1:0] fflags_utof;
@ -310,20 +296,20 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
wire fcvt_valid = (valid_in && core_select == FPU_CVT);
wire fcvt_ready = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
wire fcvt_fire = fcvt_valid && fcvt_ready;
always @(*) begin
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
dpi_f2f (fcvt_fire, int'(dst_fmt), operands[0][i], result_f2f[i]);
dpi_itof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
dpi_utof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
dpi_ftoi (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
dpi_ftou (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
dpi_f2f (fcvt_fire, int'(f_fmt), operands[0][i], result_f2f[i]);
result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] :
is_utof ? result_utof[i][`XLEN-1:0] :
is_ftoi ? result_ftoi[i][`XLEN-1:0] :
is_ftou ? result_ftou[i][`XLEN-1:0] :
is_f2f ? result_f2f[i][`XLEN-1:0] :
is_ftou ? result_ftou[i][`XLEN-1:0] :
is_f2f ? result_f2f[i][`XLEN-1:0] :
'0;
fflags_fcvt[i] = is_itof ? fflags_itof[i] :
@ -355,19 +341,19 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
endgenerate
generate
begin : fncp
generate
begin : g_fncp
reg [NUM_LANES-1:0][`XLEN-1:0] result_fncp;
wire [NUM_LANES-1:0][63:0] result_fclss;
wire [NUM_LANES-1:0][63:0] result_flt;
wire [NUM_LANES-1:0][63:0] result_fle;
wire [NUM_LANES-1:0][63:0] result_feq;
wire [NUM_LANES-1:0][63:0] result_fmin;
wire [NUM_LANES-1:0][63:0] result_fmax;
wire [NUM_LANES-1:0][63:0] result_fsgnj;
wire [NUM_LANES-1:0][63:0] result_fsgnjn;
wire [NUM_LANES-1:0][63:0] result_fsgnjx;
reg [NUM_LANES-1:0][63:0] result_fclss;
reg [NUM_LANES-1:0][63:0] result_flt;
reg [NUM_LANES-1:0][63:0] result_fle;
reg [NUM_LANES-1:0][63:0] result_feq;
reg [NUM_LANES-1:0][63:0] result_fmin;
reg [NUM_LANES-1:0][63:0] result_fmax;
reg [NUM_LANES-1:0][63:0] result_fsgnj;
reg [NUM_LANES-1:0][63:0] result_fsgnjn;
reg [NUM_LANES-1:0][63:0] result_fsgnjx;
reg [NUM_LANES-1:0][63:0] result_fmvx;
reg [NUM_LANES-1:0][63:0] result_fmvf;
@ -381,20 +367,20 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
wire fncp_valid = (valid_in && core_select == FPU_NCP);
wire fncp_ready = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
wire fncp_fire = fncp_valid && fncp_ready;
always @(*) begin
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fclss (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]);
dpi_fle (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
dpi_flt (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
dpi_feq (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
dpi_fmin (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
dpi_fmax (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
dpi_fsgnj (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
result_fmvf[i] = dst_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
dpi_fclss (fncp_fire, int'(f_fmt), operands[0][i], result_fclss[i]);
dpi_fle (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
dpi_flt (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
dpi_feq (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
dpi_fmin (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
dpi_fmax (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
dpi_fsgnj (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
dpi_fsgnjn (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
dpi_fsgnjx (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
result_fmvx[i] = f_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
result_fmvf[i] = f_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
end
end
@ -431,7 +417,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
.data_in ({fncp_valid, tag_in, has_fflags_fncp, result_fncp, fflags_merged}),
.data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
);
assign per_core_ready_in[FPU_NCP] = fncp_ready;
end
@ -443,15 +429,15 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.DATAW (RSP_DATAW),
.ARBITER ("P"),
.OUT_BUF (0)
) div_sqrt_arb (
.clk (clk),
.reset (reset),
.valid_in ({sqrt_valid_out, div_valid_out}),
.valid_in ({sqrt_valid_out, div_valid_out}),
.ready_in ({sqrt_ready_out, div_ready_out}),
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
{div_result, div_has_fflags, div_fflags, div_tag_out}}),
.data_out ({per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], per_core_fflags[FPU_DIVSQRT], per_core_tag_out[FPU_DIVSQRT]}),
.valid_out (per_core_valid_out[FPU_DIVSQRT]),
@ -463,19 +449,19 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out;
for (genvar i = 0; i < NUM_FPC; ++i) begin
for (genvar i = 0; i < NUM_FPC; ++i) begin : g_per_core_data_out
assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
end
VX_stream_arb #(
.NUM_INPUTS (NUM_FPC),
.DATAW (RSP_DATAW),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.OUT_BUF (OUT_BUF)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_out),
.valid_in (per_core_valid_out),
.ready_in (per_core_ready_out),
.data_in (per_core_data_out),
.data_out ({result, has_fflags, fflags, tag_out}),

View file

@ -51,68 +51,39 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
localparam FPU_DIVSQRT = 1;
localparam FPU_CVT = 2;
localparam FPU_NCP = 3;
localparam NUM_FPC = 4;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
localparam NUM_FPCORES = 4;
localparam FPCORES_BITS = `LOG2UP(NUM_FPCORES);
localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + 3 * (NUM_LANES * 32);
localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH;
`UNUSED_VAR (fmt)
wire [NUM_FPC-1:0] per_core_ready_in;
wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
wire [NUM_FPC-1:0][TAG_WIDTH-1:0] per_core_tag_out;
wire [NUM_FPC-1:0] per_core_ready_out;
wire [NUM_FPC-1:0] per_core_valid_out;
wire [NUM_FPC-1:0] per_core_has_fflags;
fflags_t [NUM_FPC-1:0] per_core_fflags;
wire [NUM_FPCORES-1:0] per_core_valid_in;
wire [NUM_FPCORES-1:0][REQ_DATAW-1:0] per_core_data_in;
wire [NUM_FPCORES-1:0] per_core_ready_in;
wire div_ready_in, sqrt_ready_in;
wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
wire [TAG_WIDTH-1:0] div_tag_out, sqrt_tag_out;
wire div_ready_out, sqrt_ready_out;
wire div_valid_out, sqrt_valid_out;
wire div_has_fflags, sqrt_has_fflags;
fflags_t div_fflags, sqrt_fflags;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0] per_core_mask_in;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_in;
wire [NUM_FPCORES-1:0][`INST_FPU_BITS-1:0] per_core_op_type;
wire [NUM_FPCORES-1:0][`INST_FMT_BITS-1:0] per_core_fmt;
wire [NUM_FPCORES-1:0][`INST_FRM_BITS-1:0] per_core_frm;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_dataa;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datab;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datac;
reg [FPC_BITS-1:0] core_select;
reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed;
always @(*) begin
is_madd = 0;
is_sub = 0;
is_neg = 0;
is_div = 0;
is_itof = 0;
is_signed = 0;
case (op_type)
`INST_FPU_ADD: begin core_select = FPU_FMA; end
`INST_FPU_SUB: begin core_select = FPU_FMA; is_sub = 1; end
`INST_FPU_MUL: begin core_select = FPU_FMA; is_neg = 1; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_madd = 1; end
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_madd = 1; is_neg = 1; end
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; is_neg = 1; end
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
`INST_FPU_F2I: begin core_select = FPU_CVT; is_signed = 1; end
`INST_FPU_F2U: begin core_select = FPU_CVT; end
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
`INST_FPU_U2F: begin core_select = FPU_CVT; is_itof = 1; end
default: begin core_select = FPU_NCP; end
endcase
end
`RESET_RELAY (fma_reset, reset);
`RESET_RELAY (div_reset, reset);
`RESET_RELAY (sqrt_reset, reset);
`RESET_RELAY (cvt_reset, reset);
`RESET_RELAY (ncp_reset, reset);
wire [NUM_FPCORES-1:0] per_core_valid_out;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_result;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_out;
wire [NUM_FPCORES-1:0] per_core_has_fflags;
fflags_t [NUM_FPCORES-1:0] per_core_fflags;
wire [NUM_FPCORES-1:0] per_core_ready_out;
wire [NUM_LANES-1:0][31:0] dataa_s;
wire [NUM_LANES-1:0][31:0] datab_s;
wire [NUM_LANES-1:0][31:0] datac_s;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data
assign dataa_s[i] = dataa[i][31:0];
assign datab_s[i] = datab[i][31:0];
assign datac_s[i] = datac[i][31:0];
@ -122,23 +93,60 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_VAR (datab)
`UNUSED_VAR (datac)
// Decode fpu core type
wire [FPCORES_BITS-1:0] core_select = op_type[3:2];
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_OUTPUTS (NUM_FPCORES)
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (core_select),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in ({mask_in, tag_in, fmt, frm, dataa_s, datab_s, datac_s, op_type}),
.data_out (per_core_data_in),
.valid_out (per_core_valid_in),
.ready_out (per_core_ready_in)
);
for (genvar i = 0; i < NUM_FPCORES; ++i) begin : g_per_core_data_in
assign {
per_core_mask_in[i],
per_core_tag_in[i],
per_core_fmt[i],
per_core_frm[i],
per_core_dataa[i],
per_core_datab[i],
per_core_datac[i],
per_core_op_type[i]
} = per_core_data_in[i];
end
// FMA core ///////////////////////////////////////////////////////////////
wire is_madd = per_core_op_type[FPU_FMA][1];
wire is_neg = per_core_op_type[FPU_FMA][0];
wire is_sub = per_core_fmt[FPU_FMA][1];
VX_fpu_fma #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH)
) fpu_fma (
.clk (clk),
.reset (fma_reset),
.valid_in (valid_in && (core_select == FPU_FMA)),
.reset (reset),
.valid_in (per_core_valid_in[FPU_FMA]),
.ready_in (per_core_ready_in[FPU_FMA]),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.mask_in (per_core_mask_in[FPU_FMA]),
.tag_in (per_core_tag_in[FPU_FMA]),
.frm (per_core_frm[FPU_FMA]),
.is_madd (is_madd),
.is_sub (is_sub),
.is_neg (is_neg),
.dataa (dataa_s),
.datab (datab_s),
.datac (datac_s),
.dataa (per_core_dataa[FPU_FMA]),
.datab (per_core_datab[FPU_FMA]),
.datac (per_core_datac[FPU_FMA]),
.has_fflags (per_core_has_fflags[FPU_FMA]),
.fflags (per_core_fflags[FPU_FMA]),
.result (per_core_result[FPU_FMA]),
@ -147,25 +155,99 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.valid_out (per_core_valid_out[FPU_FMA])
);
// Div/Sqrt cores /////////////////////////////////////////////////////////
wire [1:0] div_sqrt_valid_in;
wire [1:0][REQ_DATAW-1:0] div_sqrt_data_in;
wire [1:0] div_sqrt_ready_in;
wire [1:0][NUM_LANES-1:0] div_sqrt_mask_in;
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_in;
wire [1:0][`INST_FPU_BITS-1:0] div_sqrt_op_type;
wire [1:0][`INST_FMT_BITS-1:0] div_sqrt_fmt;
wire [1:0][`INST_FRM_BITS-1:0] div_sqrt_frm;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_dataa;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datab;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datac;
wire [1:0] div_sqrt_valid_out;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_result;
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_out;
wire [1:0] div_sqrt_has_fflags;
fflags_t [1:0] div_sqrt_fflags;
wire [1:0] div_sqrt_ready_out;
wire div_sqrt_valid_tmp_in;
wire [REQ_DATAW-1:0] div_sqrt_data_tmp_in;
wire div_sqrt_ready_tmp_in;
VX_elastic_buffer #(
.DATAW (REQ_DATAW)
) div_sqrt_req_buffer (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_DIVSQRT]),
.ready_in (per_core_ready_in[FPU_DIVSQRT]),
.data_in (per_core_data_in[FPU_DIVSQRT]),
.data_out (div_sqrt_data_tmp_in),
.valid_out (div_sqrt_valid_tmp_in),
.ready_out (div_sqrt_ready_tmp_in)
);
wire is_sqrt = div_sqrt_data_tmp_in[0]; // op_type[0]
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_OUTPUTS (2)
) div_sqrt_req_switch (
.clk (clk),
.reset (reset),
.sel_in (is_sqrt),
.valid_in (div_sqrt_valid_tmp_in),
.ready_in (div_sqrt_ready_tmp_in),
.data_in (div_sqrt_data_tmp_in),
.data_out (div_sqrt_data_in),
.valid_out (div_sqrt_valid_in),
.ready_out (div_sqrt_ready_in)
);
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_data_in
assign {
div_sqrt_mask_in[i],
div_sqrt_tag_in[i],
div_sqrt_fmt[i],
div_sqrt_frm[i],
div_sqrt_dataa[i],
div_sqrt_datab[i],
div_sqrt_datac[i],
div_sqrt_op_type[i]
} = div_sqrt_data_in[i];
end
`UNUSED_VAR (div_sqrt_op_type)
`UNUSED_VAR (div_sqrt_fmt)
`UNUSED_VAR (div_sqrt_datab)
`UNUSED_VAR (div_sqrt_datac)
VX_fpu_div #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH)
) fpu_div (
.clk (clk),
.reset (div_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && is_div),
.ready_in (div_ready_in),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.has_fflags (div_has_fflags),
.fflags (div_fflags),
.result (div_result),
.tag_out (div_tag_out),
.valid_out (div_valid_out),
.ready_out (div_ready_out)
.reset (reset),
.valid_in (div_sqrt_valid_in[0]),
.ready_in (div_sqrt_ready_in[0]),
.mask_in (div_sqrt_mask_in[0]),
.tag_in (div_sqrt_tag_in[0]),
.frm (div_sqrt_frm[0]),
.dataa (div_sqrt_dataa[0]),
.datab (div_sqrt_datab[0]),
.has_fflags (div_sqrt_has_fflags[0]),
.fflags (div_sqrt_fflags[0]),
.result (div_sqrt_result[0]),
.tag_out (div_sqrt_tag_out[0]),
.valid_out (div_sqrt_valid_out[0]),
.ready_out (div_sqrt_ready_out[0])
);
VX_fpu_sqrt #(
@ -173,92 +255,42 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH)
) fpu_sqrt (
.clk (clk),
.reset (sqrt_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
.ready_in (sqrt_ready_in),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.has_fflags (sqrt_has_fflags),
.fflags (sqrt_fflags),
.result (sqrt_result),
.tag_out (sqrt_tag_out),
.valid_out (sqrt_valid_out),
.ready_out (sqrt_ready_out)
.reset (reset),
.valid_in (div_sqrt_valid_in[1]),
.ready_in (div_sqrt_ready_in[1]),
.mask_in (div_sqrt_mask_in[1]),
.tag_in (div_sqrt_tag_in[1]),
.frm (div_sqrt_frm[1]),
.dataa (div_sqrt_dataa[1]),
.has_fflags (div_sqrt_has_fflags[1]),
.fflags (div_sqrt_fflags[1]),
.result (div_sqrt_result[1]),
.tag_out (div_sqrt_tag_out[1]),
.valid_out (div_sqrt_valid_out[1]),
.ready_out (div_sqrt_ready_out[1])
);
wire cvt_ret_int_in = ~is_itof;
wire cvt_ret_int_out;
VX_fpu_cvt #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+1)
) fpu_cvt (
.clk (clk),
.reset (cvt_reset),
.valid_in (valid_in && (core_select == FPU_CVT)),
.ready_in (per_core_ready_in[FPU_CVT]),
.mask_in (mask_in),
.tag_in ({cvt_ret_int_in, tag_in}),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (dataa_s),
.has_fflags (per_core_has_fflags[FPU_CVT]),
.fflags (per_core_fflags[FPU_CVT]),
.result (per_core_result[FPU_CVT]),
.tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
.valid_out (per_core_valid_out[FPU_CVT]),
.ready_out (per_core_ready_out[FPU_CVT])
);
wire ncp_ret_int_in = (op_type == `INST_FPU_CMP)
|| `INST_FPU_IS_CLASS(op_type, frm)
|| `INST_FPU_IS_MVXW(op_type, frm);
wire ncp_ret_int_out;
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
wire ncp_ret_sext_out;
VX_fpu_ncp #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+2)
) fpu_ncp (
.clk (clk),
.reset (ncp_reset),
.valid_in (valid_in && (core_select == FPU_NCP)),
.ready_in (per_core_ready_in[FPU_NCP]),
.mask_in (mask_in),
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, tag_in}),
.op_type (op_type),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.result (per_core_result[FPU_NCP]),
.has_fflags (per_core_has_fflags[FPU_NCP]),
.fflags (per_core_fflags[FPU_NCP]),
.tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
.valid_out (per_core_valid_out[FPU_NCP]),
.ready_out (per_core_ready_out[FPU_NCP])
);
///////////////////////////////////////////////////////////////////////////
assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
wire [1:0][RSP_DATAW-1:0] div_sqrt_arb_data_in;
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_arb_data_in
assign div_sqrt_arb_data_in[i] = {
div_sqrt_result[i],
div_sqrt_has_fflags[i],
div_sqrt_fflags[i],
div_sqrt_tag_out[i]
};
end
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.ARBITER ("P"),
.OUT_BUF (0)
) div_sqrt_arb (
) div_sqrt_rsp_arb (
.clk (clk),
.reset (reset),
.valid_in ({sqrt_valid_out, div_valid_out}),
.ready_in ({sqrt_ready_out, div_ready_out}),
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
{div_result, div_has_fflags, div_fflags, div_tag_out}}),
.valid_in (div_sqrt_valid_out),
.ready_in (div_sqrt_ready_out),
.data_in (div_sqrt_arb_data_in),
.data_out ({
per_core_result[FPU_DIVSQRT],
per_core_has_fflags[FPU_DIVSQRT],
@ -270,12 +302,73 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
// CVT core ///////////////////////////////////////////////////////////////
wire is_itof = per_core_op_type[FPU_CVT][1];
wire is_signed = ~per_core_op_type[FPU_CVT][0];
wire cvt_ret_int_in = ~is_itof;
wire cvt_ret_int_out;
VX_fpu_cvt #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (1+TAG_WIDTH)
) fpu_cvt (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_CVT]),
.ready_in (per_core_ready_in[FPU_CVT]),
.mask_in (per_core_mask_in[FPU_CVT]),
.tag_in ({cvt_ret_int_in, per_core_tag_in[FPU_CVT]}),
.frm (per_core_frm[FPU_CVT]),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (per_core_dataa[FPU_CVT]),
.has_fflags (per_core_has_fflags[FPU_CVT]),
.fflags (per_core_fflags[FPU_CVT]),
.result (per_core_result[FPU_CVT]),
.tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
.valid_out (per_core_valid_out[FPU_CVT]),
.ready_out (per_core_ready_out[FPU_CVT])
);
// NCP core ///////////////////////////////////////////////////////////////
wire ncp_ret_int_in = (per_core_op_type[FPU_NCP] == `INST_FPU_CMP)
|| `INST_FPU_IS_CLASS(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP])
|| `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
wire ncp_ret_int_out;
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
wire ncp_ret_sext_out;
VX_fpu_ncp #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+2)
) fpu_ncp (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_NCP]),
.ready_in (per_core_ready_in[FPU_NCP]),
.mask_in (per_core_mask_in[FPU_NCP]),
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, per_core_tag_in[FPU_NCP]}),
.op_type (per_core_op_type[FPU_NCP]),
.frm (per_core_frm[FPU_NCP]),
.dataa (per_core_dataa[FPU_NCP]),
.datab (per_core_datab[FPU_NCP]),
.result (per_core_result[FPU_NCP]),
.has_fflags (per_core_has_fflags[FPU_NCP]),
.fflags (per_core_fflags[FPU_NCP]),
.tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
.valid_out (per_core_valid_out[FPU_NCP]),
.ready_out (per_core_ready_out[FPU_NCP])
);
///////////////////////////////////////////////////////////////////////////
reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
reg [NUM_FPCORES-1:0][RSP_DATAW+2-1:0] per_core_data_out;
always @(*) begin
for (integer i = 0; i < NUM_FPC; ++i) begin
for (integer i = 0; i < NUM_FPCORES; ++i) begin
per_core_data_out[i][RSP_DATAW+1:2] = {
per_core_result[i],
per_core_has_fflags[i],
@ -289,12 +382,12 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
end
wire [NUM_LANES-1:0][31:0] result_s;
wire [1:0] op_ret_int_out;
`UNUSED_VAR (op_ret_int_out)
VX_stream_arb #(
.NUM_INPUTS (NUM_FPC),
.NUM_INPUTS (NUM_FPCORES),
.DATAW (RSP_DATAW + 2),
.ARBITER ("R"),
.OUT_BUF (OUT_BUF)
@ -310,25 +403,22 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
`ifdef FPU_RV64F
reg [`XLEN-1:0] result_r;
reg [`XLEN-1:0] result_w;
always @(*) begin
case (op_ret_int_out)
2'b11: result_r = `XLEN'($signed(result_s[i]));
2'b01: result_r = {32'h00000000, result_s[i]};
default: result_r = {32'hffffffff, result_s[i]};
2'b11: result_w = `XLEN'($signed(result_s[i]));
2'b01: result_w = {32'h00000000, result_s[i]};
default: result_w = {32'hffffffff, result_s[i]};
endcase
end
assign result[i] = result_r;
assign result[i] = result_w;
`else
assign result[i] = result_s[i];
`endif
end
// can accept new request?
assign ready_in = per_core_ready_in[core_select];
endmodule
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,7 +21,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
parameter TAG_WIDTH = 1
) (
input wire clk,
input wire reset,
input wire reset,
output wire ready_in,
input wire valid_in,
@ -29,7 +29,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0] mask_in,
input wire [TAG_WIDTH-1:0] tag_in,
input wire [`INST_FRM_BITS-1:0] frm,
input wire is_madd,
@ -39,7 +39,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0][31:0] dataa,
input wire [NUM_LANES-1:0][31:0] datab,
input wire [NUM_LANES-1:0][31:0] datac,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -49,26 +49,27 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
`UNUSED_VAR (frm)
localparam DATAW = 3 * 32 + `INST_FRM_BITS;
wire [NUM_LANES-1:0][3*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][3*32-1:0] pe_data_in;
wire pe_enable;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
reg [NUM_LANES-1:0][31:0] a, b, c;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_select
always @(*) begin
if (is_madd) begin
// MADD / MSUB / NMADD / NMSUB
a[i] = is_neg ? {~dataa[i][31], dataa[i][30:0]} : dataa[i];
a[i] = {is_neg ^ dataa[i][31], dataa[i][30:0]};
b[i] = datab[i];
c[i] = (is_neg ^ is_sub) ? {~datac[i][31], datac[i][30:0]} : datac[i];
c[i] = {is_neg ^ is_sub ^ datac[i][31], datac[i][30:0]};
end else begin
if (is_neg) begin
// MUL
@ -77,28 +78,30 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
c[i] = '0;
end else begin
// ADD / SUB
a[i] = 32'h3f800000; // 1.0f
b[i] = dataa[i];
c[i] = is_sub ? {~datab[i][31], datab[i][30:0]} : datab[i];
a[i] = dataa[i];
b[i] = 32'h3f800000; // 1.0f
c[i] = {is_sub ^ datab[i][31], datab[i][30:0]};
end
end
end
end
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = a[i];
assign data_in[i][32 +: 32] = b[i];
assign data_in[i][64 +: 32] = c[i];
assign data_in[i][96 +: `INST_FRM_BITS] = frm;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FMA),
.DATA_IN_WIDTH(3*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (1)
.PE_REG (0),
.OUT_BUF (2)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -107,15 +110,17 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
@ -123,8 +128,8 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
fflags_t [NUM_LANES-1:0] per_lane_fflags;
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas
acl_fmadd fmadd (
.clk (clk),
.areset (1'b0),
@ -136,15 +141,15 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
);
assign pe_data_out[i][32 +: `FP_FLAGS_BITS] = 'x;
end
assign has_fflags = 0;
assign per_lane_fflags = 'x;
`elsif VIVADO
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas
wire [2:0] tuser;
xil_fma fma (
.aclk (clk),
.aclken (pe_enable),
@ -167,20 +172,20 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
`else
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas
reg [63:0] r;
`UNUSED_VAR (r)
fflags_t f;
always @(*) begin
always @(*) begin
dpi_fmadd (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
{32'hffffffff, pe_data_in[i][64 +: 32]},
frm,
r,
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
{32'hffffffff, pe_data_in[i][64 +: 32]}, // c
pe_data_in[0][96 +: `INST_FRM_BITS], // frm
r,
f
);
end

View file

@ -90,7 +90,7 @@ module VX_fpu_fpnew
reg [TAG_WIDTH-1:0] fpu_tag_in, fpu_tag_out;
reg [2:0][NUM_LANES-1:0][`XLEN-1:0] fpu_operands;
logic [2:0][NUM_LANES-1:0][`XLEN-1:0] fpu_operands;
wire [NUM_LANES-1:0][`XLEN-1:0] fpu_result;
fpnew_pkg::status_t fpu_status;
@ -105,7 +105,7 @@ module VX_fpu_fpnew
`UNUSED_VAR (fmt)
always @(*) begin
fpu_op = 'x;
fpu_op = fpnew_pkg::operation_e'('x);
fpu_rnd = frm;
fpu_op_mod = 0;
fpu_has_fflags = 1;
@ -134,20 +134,13 @@ module VX_fpu_fpnew
fpu_op = fpnew_pkg::ADD;
fpu_operands[1] = dataa;
fpu_operands[2] = datab;
end
`INST_FPU_SUB: begin
fpu_op = fpnew_pkg::ADD;
fpu_operands[1] = dataa;
fpu_operands[2] = datab;
fpu_op_mod = 1;
fpu_op_mod = fmt[1]; // FADD or FSUB
end
`INST_FPU_MUL: begin fpu_op = fpnew_pkg::MUL; end
`INST_FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = fmt[1]; end
`INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = ~fmt[1]; end
`INST_FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end
`INST_FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end
`INST_FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end
`INST_FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end
`INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
`INST_FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end
`ifdef FLEN_64
`INST_FPU_F2F: begin fpu_op = fpnew_pkg::F2F; fpu_src_fmt = fmt[0] ? fpnew_pkg::FP32 : fpnew_pkg::FP64; end
`endif
@ -169,7 +162,7 @@ module VX_fpu_fpnew
end
`UNUSED_VAR (mask_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_fpnew_coreses
wire [(TAG_WIDTH+1)-1:0] fpu_tag;
wire fpu_valid_out_uq;
wire fpu_ready_in_uq;
@ -183,8 +176,7 @@ module VX_fpu_fpnew
.Features (FPU_FEATURES),
.Implementation (FPU_IMPLEMENTATION),
.TagType (logic[(TAG_WIDTH+1)-1:0]),
.TrueSIMDClass (1),
.EnableSIMDMask (1)
.DivSqrtSel (fpnew_pkg::PULP)
) fpnew_core (
.clk_i (clk),
.rst_ni (~reset),
@ -196,11 +188,11 @@ module VX_fpu_fpnew
.dst_fmt_i (fpu_dst_fmt),
.int_fmt_i (fpu_int_fmt),
.vectorial_op_i (1'b0),
.simd_mask_i (mask_in[i]),
.simd_mask_i (1'b1),
.tag_i ({fpu_tag_in, fpu_has_fflags}),
.in_valid_i (fpu_valid_in),
.in_ready_o (fpu_ready_in_uq),
.flush_i (reset),
.flush_i (1'b0),
.result_o (fpu_result[i]),
.status_o (fpu_status_uq),
.tag_o (fpu_tag),
@ -209,7 +201,7 @@ module VX_fpu_fpnew
`UNUSED_PIN (busy_o)
);
if (i == 0) begin
if (i == 0) begin : g_output_0
assign {fpu_tag_out, fpu_has_fflags_out} = fpu_tag;
assign fpu_valid_out = fpu_valid_out_uq;
assign fpu_ready_in = fpu_ready_in_uq;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -35,7 +35,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
input wire [NUM_LANES-1:0][31:0] dataa,
input wire [NUM_LANES-1:0][31:0] datab,
output wire [NUM_LANES-1:0][31:0] result,
output wire [NUM_LANES-1:0][31:0] result,
output wire has_fflags,
output wire [`FP_FLAGS_BITS-1:0] fflags,
@ -44,31 +44,35 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
`UNUSED_VAR (frm)
);
localparam DATAW = 2 * 32 + `INST_FRM_BITS + `INST_FPU_BITS;
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire pe_enable;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
assign data_in[i][64 +: `INST_FRM_BITS] = frm;
assign data_in[i][64 + `INST_FRM_BITS +: `INST_FPU_BITS] = op_type;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FNCP),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0)
.PE_REG (0),
.OUT_BUF (2)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -77,28 +81,31 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fncp_units
VX_fncp_unit #(
.LATENCY (`LATENCY_FNCP)
.LATENCY (`LATENCY_FNCP),
.OUT_REG (1)
) fncp_unit (
.clk (clk),
.reset (reset),
.enable (pe_enable),
.frm (frm),
.op_type (op_type),
.frm (pe_data_in[0][64 +: `INST_FRM_BITS]),
.op_type (pe_data_in[0][64 + `INST_FRM_BITS +: `INST_FPU_BITS]),
.dataa (pe_data_in[i][0 +: 32]),
.datab (pe_data_in[i][32 +: 32]),
.result (pe_data_out[i][0 +: 32]),

Some files were not shown because too many files have changed in this diff Show more