mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-21 12:28:15 -04:00
Compare commits
No commits in common. "master" and "v2.2" have entirely different histories.
398 changed files with 29997 additions and 54343 deletions
|
@ -1,8 +0,0 @@
|
|||
Language: Cpp
|
||||
BasedOnStyle: LLVM
|
||||
IndentWidth: 2
|
||||
TabWidth: 2
|
||||
ColumnLimit: 0
|
||||
UseTab: Never
|
||||
BreakBeforeBraces: Attach
|
||||
AlwaysBreakTemplateDeclarations: true
|
41
.github/workflows/ci.yml
vendored
41
.github/workflows/ci.yml
vendored
|
@ -17,17 +17,17 @@ on: [push, pull_request]
|
|||
|
||||
jobs:
|
||||
setup:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-20.04
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v2
|
||||
with:
|
||||
submodules: recursive
|
||||
|
||||
- name: Cache Toolchain Directory
|
||||
id: cache-toolchain
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: tools
|
||||
key: ${{ runner.os }}-toolchain-v0.1
|
||||
|
@ -36,7 +36,7 @@ jobs:
|
|||
|
||||
- name: Cache Third Party Directory
|
||||
id: cache-thirdparty
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: third_party
|
||||
key: ${{ runner.os }}-thirdparty-v0.1
|
||||
|
@ -46,7 +46,7 @@ jobs:
|
|||
- name: Install Dependencies
|
||||
if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
sudo bash ./ci/install_dependencies.sh
|
||||
sudo bash ./ci/system_updates.sh
|
||||
|
||||
- name: Setup Toolchain
|
||||
if: steps.cache-toolchain.outputs.cache-hit != 'true'
|
||||
|
@ -63,7 +63,7 @@ jobs:
|
|||
make -C third_party > /dev/null
|
||||
|
||||
build:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-20.04
|
||||
needs: setup
|
||||
strategy:
|
||||
matrix:
|
||||
|
@ -71,15 +71,15 @@ jobs:
|
|||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
sudo bash ./ci/install_dependencies.sh
|
||||
sudo bash ./ci/system_updates.sh
|
||||
|
||||
- name: Cache Toolchain Directory
|
||||
id: cache-toolchain
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: tools
|
||||
key: ${{ runner.os }}-toolchain-v0.1
|
||||
|
@ -88,7 +88,7 @@ jobs:
|
|||
|
||||
- name: Cache Third Party Directory
|
||||
id: cache-thirdparty
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: third_party
|
||||
key: ${{ runner.os }}-thirdparty-v0.1
|
||||
|
@ -106,31 +106,31 @@ jobs:
|
|||
make tests -s > /dev/null
|
||||
|
||||
- name: Upload Build Artifact
|
||||
uses: actions/upload-artifact@v4
|
||||
uses: actions/upload-artifact@v2
|
||||
with:
|
||||
name: build-${{ matrix.xlen }}
|
||||
path: build${{ matrix.xlen }}
|
||||
|
||||
tests:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-20.04
|
||||
needs: build
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
name: [regression, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, vector]
|
||||
name: [regression, opencl, cache, config1, config2, debug, stress]
|
||||
xlen: [32, 64]
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
uses: actions/checkout@v2
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
sudo bash ./ci/install_dependencies.sh
|
||||
sudo bash ./ci/system_updates.sh
|
||||
|
||||
- name: Cache Toolchain Directory
|
||||
id: cache-toolchain
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: tools
|
||||
key: ${{ runner.os }}-toolchain-v0.1
|
||||
|
@ -139,7 +139,7 @@ jobs:
|
|||
|
||||
- name: Cache Third Party Directory
|
||||
id: cache-thirdparty
|
||||
uses: actions/cache@v4
|
||||
uses: actions/cache@v2
|
||||
with:
|
||||
path: third_party
|
||||
key: ${{ runner.os }}-thirdparty-v0.1
|
||||
|
@ -147,7 +147,7 @@ jobs:
|
|||
${{ runner.os }}-thirdparty-
|
||||
|
||||
- name: Download Build Artifact
|
||||
uses: actions/download-artifact@v4
|
||||
uses: actions/download-artifact@v2
|
||||
with:
|
||||
name: build-${{ matrix.xlen }}
|
||||
path: build${{ matrix.xlen }}
|
||||
|
@ -161,15 +161,16 @@ jobs:
|
|||
./ci/regression.sh --unittest
|
||||
./ci/regression.sh --isa
|
||||
./ci/regression.sh --kernel
|
||||
./ci/regression.sh --synthesis
|
||||
./ci/regression.sh --regression
|
||||
else
|
||||
./ci/regression.sh --${{ matrix.name }}
|
||||
fi
|
||||
|
||||
complete:
|
||||
runs-on: ubuntu-22.04
|
||||
runs-on: ubuntu-20.04
|
||||
needs: tests
|
||||
|
||||
steps:
|
||||
- name: Check Completion
|
||||
run: echo "All matrix jobs passed"
|
||||
run: echo "All matrix jobs passed"
|
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -1,4 +1,3 @@
|
|||
/build*
|
||||
/.vscode
|
||||
*.cache
|
||||
*.code-workspace
|
||||
*.cache
|
6
.gitmodules
vendored
6
.gitmodules
vendored
|
@ -1,9 +1,9 @@
|
|||
[submodule "third_party/fpnew"]
|
||||
path = third_party/fpnew
|
||||
url = https://github.com/pulp-platform/fpnew.git
|
||||
[submodule "third_party/softfloat"]
|
||||
path = third_party/softfloat
|
||||
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
|
||||
[submodule "third_party/ramulator"]
|
||||
path = third_party/ramulator
|
||||
url = https://github.com/CMU-SAFARI/ramulator2.git
|
||||
[submodule "third_party/cvfpu"]
|
||||
path = third_party/cvfpu
|
||||
url = https://github.com/openhwgroup/cvfpu.git
|
||||
|
|
|
@ -1,20 +0,0 @@
|
|||
FROM ubuntu:20.04
|
||||
|
||||
LABEL "Udit Subramanya"="usubramanya3@gatech.edu"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y build-essential valgrind git wget libpng-dev libboost-all-dev uuid-dev ccache cmake
|
||||
|
||||
# Third-party repository (PPA) used to install g++-11 on the Ubuntu 20.04 base image
|
||||
RUN apt-get install -y manpages-dev software-properties-common
|
||||
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
||||
|
||||
RUN apt-get install -y gcc-11 g++-11
|
||||
|
||||
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
|
||||
RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11
|
||||
|
||||
# create a directory for mounting the volume
|
||||
WORKDIR /root/vortex
|
81
README.md
81
README.md
|
@ -1,35 +1,10 @@
|
|||
# Vortex GPGPU
|
||||
|
||||
Vortex is a full-stack open-source RISC-V GPGPU. Vortex supports multiple **backend drivers**, including our C++ simulator (simx), an RTL simulator, and physical Xilinx and Altera FPGAs -- all controlled by a single driver script. The chosen driver determines the corresponding code invoked to run Vortex. Generally, developers will prototype their intended design in simx before going forward with an RTL implementation. Alternatively, you can get up and running by selecting a driver of your choice and running a demo program.
|
||||
|
||||
## Website
|
||||
Vortex news can be found on its [website](https://vortex.cc.gatech.edu/)
|
||||
|
||||
## Citation
|
||||
```
|
||||
@inproceedings{10.1145/3466752.3480128,
|
||||
author = {Tine, Blaise and Yalamarthy, Krishna Praveen and Elsabbagh, Fares and Hyesoon, Kim},
|
||||
title = {Vortex: Extending the RISC-V ISA for GPGPU and 3D-Graphics},
|
||||
year = {2021},
|
||||
isbn = {9781450385572},
|
||||
publisher = {Association for Computing Machinery},
|
||||
address = {New York, NY, USA},
|
||||
url = {https://doi.org/10.1145/3466752.3480128},
|
||||
doi = {10.1145/3466752.3480128},
|
||||
abstract = {The importance of open-source hardware and software has been increasing. However, despite GPUs being one of the more popular accelerators across various applications, there is very little open-source GPU infrastructure in the public domain. We argue that one of the reasons for the lack of open-source infrastructure for GPUs is rooted in the complexity of their ISA and software stacks. In this work, we first propose an ISA extension to RISC-V that supports GPGPUs and graphics. The main goal of the ISA extension proposal is to minimize the ISA changes so that the corresponding changes to the open-source ecosystem are also minimal, which makes for a sustainable development ecosystem. To demonstrate the feasibility of the minimally extended RISC-V ISA, we implemented the complete software and hardware stacks of Vortex on FPGA. Vortex is a PCIe-based soft GPU that supports OpenCL and OpenGL. Vortex can be used in a variety of applications, including machine learning, graph analytics, and graphics rendering. Vortex can scale up to 32 cores on an Altera Stratix 10 FPGA, delivering a peak performance of 25.6 GFlops at 200 Mhz.},
|
||||
booktitle = {MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture},
|
||||
pages = {754–766},
|
||||
numpages = {13},
|
||||
keywords = {reconfigurable computing, memory systems., computer graphics},
|
||||
location = {Virtual Event, Greece},
|
||||
series = {MICRO '21}
|
||||
}
|
||||
```
|
||||
Vortex is a full-stack open-source RISC-V GPGPU.
|
||||
|
||||
## Specifications
|
||||
|
||||
- Support RISC-V RV32IMAF and RV64IMAFD
|
||||
|
||||
- Microarchitecture:
|
||||
- configurable number of cores, warps, and threads.
|
||||
- configurable number of ALU, FPU, LSU, and SFU units per core.
|
||||
|
@ -54,50 +29,48 @@ Vortex news can be found on its [website](https://vortex.cc.gatech.edu/)
|
|||
- `ci`: Continuous integration scripts.
|
||||
- `miscs`: Miscellaneous resources.
|
||||
|
||||
## Quick Start
|
||||
If you are interested in a stable release of Vortex, you can download the latest release [here](https://github.com/vortexgpgpu/vortex/releases/latest). Otherwise, you can pull the most recent, but (potentially) unstable version as shown below. The following steps demonstrate how to build and run Vortex with the default driver: SimX. If you are interested in a different backend, look [here](docs/simulation.md).
|
||||
|
||||
## Build Instructions
|
||||
More detailed build instructions can be found [here](docs/install_vortex.md).
|
||||
### Supported OS Platforms
|
||||
- Ubuntu 18.04, 20.04, 22.04, 24.04
|
||||
- Ubuntu 18.04, 20.04
|
||||
- Centos 7
|
||||
### Toolchain Dependencies
|
||||
The following dependencies will be fetched prebuilt by `toolchain_install.sh`.
|
||||
- [POCL](http://portablecl.org/)
|
||||
- [LLVM](https://llvm.org/)
|
||||
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
|
||||
- [Verilator](https://www.veripool.org/verilator)
|
||||
- [cvfpu](https://github.com/openhwgroup/cvfpu.git)
|
||||
- [FpNew](https://github.com/pulp-platform/fpnew.git)
|
||||
- [SoftFloat](https://github.com/ucb-bar/berkeley-softfloat-3.git)
|
||||
- [Ramulator](https://github.com/CMU-SAFARI/ramulator.git)
|
||||
- [Yosys](https://github.com/YosysHQ/yosys)
|
||||
- [Sv2v](https://github.com/zachjs/sv2v)
|
||||
### Install development tools
|
||||
```sh
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install binutils
|
||||
sudo apt-get install python
|
||||
sudo apt-get install uuid-dev
|
||||
sudo apt-get install git
|
||||
```
|
||||
### Install Vortex codebase
|
||||
```sh
|
||||
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
|
||||
cd vortex
|
||||
```
|
||||
### Install system dependencies
|
||||
```sh
|
||||
# ensure dependent libraries are present
|
||||
sudo ./ci/install_dependencies.sh
|
||||
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
|
||||
cd vortex
|
||||
```
|
||||
### Configure your build folder
|
||||
```sh
|
||||
mkdir build
|
||||
cd build
|
||||
# for 32bit
|
||||
../configure --xlen=32 --tooldir=$HOME/tools
|
||||
# for 64bit
|
||||
../configure --xlen=64 --tooldir=$HOME/tools
|
||||
mkdir build
|
||||
cd build
|
||||
../configure --xlen=32 --tooldir=$HOME/tools
|
||||
```
|
||||
### Install prebuilt toolchain
|
||||
```sh
|
||||
./ci/toolchain_install.sh --all
|
||||
./ci/toolchain_install.sh --all
|
||||
```
|
||||
### set environment variables
|
||||
### Set environment variables
|
||||
```sh
|
||||
# should always run before using the toolchain!
|
||||
source ./ci/toolchain_env.sh
|
||||
# should always run before using the toolchain!
|
||||
source ./ci/toolchain_env.sh
|
||||
```
|
||||
### Building Vortex
|
||||
```sh
|
||||
|
@ -115,20 +88,20 @@ make -s
|
|||
make -s
|
||||
make install
|
||||
```
|
||||
- Building Vortex 64-bit requires setting --xlen=64 configure option.
|
||||
- Building Vortex 64-bit simply requires using --xlen=64 configure option.
|
||||
```sh
|
||||
../configure --xlen=64 --tooldir=$HOME/tools
|
||||
../configure --xlen=32 --tooldir=$HOME/tools
|
||||
```
|
||||
- Sourcing "./ci/toolchain_env.sh" is required every time you start a new terminal. We recommend adding "source <build-path>/ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login.
|
||||
```sh
|
||||
echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
|
||||
```
|
||||
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again without any options to get changes propagated to your build folder.
|
||||
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get the changes propagated into your build folder.
|
||||
```sh
|
||||
../configure
|
||||
```
|
||||
- To debug the GPU, the simulation can generate a runtime trace for analysis. See /docs/debugging.md for more information.
|
||||
- To debug the GPU, you can generate a "run.log" trace. See /docs/debugging.md for more information.
|
||||
```sh
|
||||
./ci/blackbox.sh --app=demo --debug=3
|
||||
```
|
||||
- For additional information, check out the [documentation](docs/index.md)
|
||||
- For additional information, check out the /docs.
|
||||
|
|
455
ci/blackbox.sh
455
ci/blackbox.sh
|
@ -13,9 +13,6 @@
|
|||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
SCRIPT_DIR=$(dirname "$0")
|
||||
ROOT_DIR=$SCRIPT_DIR/..
|
||||
|
||||
show_usage()
|
||||
{
|
||||
echo "Vortex BlackBox Test Driver v1.0"
|
||||
|
@ -32,174 +29,302 @@ show_help()
|
|||
echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
|
||||
}
|
||||
|
||||
add_option() {
|
||||
if [ -n "$1" ]; then
|
||||
echo "$1 $2"
|
||||
else
|
||||
echo "$2"
|
||||
fi
|
||||
}
|
||||
SCRIPT_DIR=$(dirname "$0")
|
||||
ROOT_DIR=$SCRIPT_DIR/..
|
||||
|
||||
DEFAULTS() {
|
||||
DRIVER=simx
|
||||
APP=sgemm
|
||||
DEBUG=0
|
||||
DEBUG_LEVEL=0
|
||||
SCOPE=0
|
||||
HAS_ARGS=0
|
||||
PERF_CLASS=0
|
||||
CONFIGS="$CONFIGS"
|
||||
REBUILD=2
|
||||
TEMPBUILD=0
|
||||
LOGFILE=run.log
|
||||
}
|
||||
DRIVER=simx
|
||||
APP=sgemm
|
||||
CLUSTERS=1
|
||||
CORES=1
|
||||
WARPS=4
|
||||
THREADS=4
|
||||
L2=
|
||||
L3=
|
||||
DEBUG=0
|
||||
DEBUG_LEVEL=0
|
||||
SCOPE=0
|
||||
HAS_ARGS=0
|
||||
PERF_CLASS=0
|
||||
REBUILD=2
|
||||
TEMPBUILD=0
|
||||
LOGFILE=run.log
|
||||
|
||||
parse_args() {
|
||||
DEFAULTS
|
||||
for i in "$@"; do
|
||||
case $i in
|
||||
--driver=*) DRIVER=${i#*=} ;;
|
||||
--app=*) APP=${i#*=} ;;
|
||||
--clusters=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CLUSTERS=${i#*=}") ;;
|
||||
--cores=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CORES=${i#*=}") ;;
|
||||
--warps=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_WARPS=${i#*=}") ;;
|
||||
--threads=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_THREADS=${i#*=}") ;;
|
||||
--l2cache) CONFIGS=$(add_option "$CONFIGS" "-DL2_ENABLE") ;;
|
||||
--l3cache) CONFIGS=$(add_option "$CONFIGS" "-DL3_ENABLE") ;;
|
||||
--perf=*) CONFIGS=$(add_option "$CONFIGS" "-DPERF_ENABLE"); PERF_CLASS=${i#*=} ;;
|
||||
--debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;;
|
||||
--scope) SCOPE=1; ;;
|
||||
--args=*) HAS_ARGS=1; ARGS=${i#*=} ;;
|
||||
--rebuild=*) REBUILD=${i#*=} ;;
|
||||
--log=*) LOGFILE=${i#*=} ;;
|
||||
--help) show_help; exit 0 ;;
|
||||
*) show_usage; exit 1 ;;
|
||||
esac
|
||||
done
|
||||
for i in "$@"
|
||||
do
|
||||
case $i in
|
||||
--driver=*)
|
||||
DRIVER=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--app=*)
|
||||
APP=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--clusters=*)
|
||||
CLUSTERS=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--cores=*)
|
||||
CORES=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--warps=*)
|
||||
WARPS=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--threads=*)
|
||||
THREADS=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--l2cache)
|
||||
L2=-DL2_ENABLE
|
||||
shift
|
||||
;;
|
||||
--l3cache)
|
||||
L3=-DL3_ENABLE
|
||||
shift
|
||||
;;
|
||||
--debug=*)
|
||||
DEBUG_LEVEL=${i#*=}
|
||||
DEBUG=1
|
||||
shift
|
||||
;;
|
||||
--scope)
|
||||
SCOPE=1
|
||||
CORES=1
|
||||
shift
|
||||
;;
|
||||
--perf=*)
|
||||
PERF_FLAG=-DPERF_ENABLE
|
||||
PERF_CLASS=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--args=*)
|
||||
ARGS=${i#*=}
|
||||
HAS_ARGS=1
|
||||
shift
|
||||
;;
|
||||
--rebuild=*)
|
||||
REBUILD=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--log=*)
|
||||
LOGFILE=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--help)
|
||||
show_help
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
show_usage
|
||||
exit -1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
if [ $REBUILD -eq 3 ];
|
||||
if [ $REBUILD -eq 3 ];
|
||||
then
|
||||
REBUILD=1
|
||||
TEMPBUILD=1
|
||||
fi
|
||||
|
||||
case $DRIVER in
|
||||
gpu)
|
||||
DRIVER_PATH=
|
||||
;;
|
||||
simx)
|
||||
DRIVER_PATH=$ROOT_DIR/runtime/simx
|
||||
;;
|
||||
rtlsim)
|
||||
DRIVER_PATH=$ROOT_DIR/runtime/rtlsim
|
||||
;;
|
||||
opae)
|
||||
DRIVER_PATH=$ROOT_DIR/runtime/opae
|
||||
;;
|
||||
xrt)
|
||||
DRIVER_PATH=$ROOT_DIR/runtime/xrt
|
||||
;;
|
||||
*)
|
||||
echo "invalid driver: $DRIVER"
|
||||
exit -1
|
||||
;;
|
||||
esac
|
||||
|
||||
if [ -d "$ROOT_DIR/tests/opencl/$APP" ];
|
||||
then
|
||||
APP_PATH=$ROOT_DIR/tests/opencl/$APP
|
||||
elif [ -d "$ROOT_DIR/tests/regression/$APP" ];
|
||||
then
|
||||
APP_PATH=$ROOT_DIR/tests/regression/$APP
|
||||
else
|
||||
echo "Application folder not found: $APP"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
if [ "$DRIVER" = "gpu" ];
|
||||
then
|
||||
# running application
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
REBUILD=1
|
||||
TEMPBUILD=1
|
||||
fi
|
||||
}
|
||||
|
||||
set_driver_path() {
|
||||
case $DRIVER in
|
||||
gpu) DRIVER_PATH="" ;;
|
||||
simx|rtlsim|opae|xrt) DRIVER_PATH="$ROOT_DIR/runtime/$DRIVER" ;;
|
||||
*) echo "Invalid driver: $DRIVER"; exit 1 ;;
|
||||
esac
|
||||
}
|
||||
|
||||
set_app_path() {
|
||||
if [ -d "$ROOT_DIR/tests/opencl/$APP" ]; then
|
||||
APP_PATH="$ROOT_DIR/tests/opencl/$APP"
|
||||
elif [ -d "$ROOT_DIR/tests/regression/$APP" ]; then
|
||||
APP_PATH="$ROOT_DIR/tests/regression/$APP"
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
|
||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
|
||||
status=$?
|
||||
else
|
||||
echo "Application folder not found: $APP"
|
||||
exit 1
|
||||
fi
|
||||
}
|
||||
|
||||
build_driver() {
|
||||
local cmd_opts=""
|
||||
[ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=$DEBUG_LEVEL")
|
||||
[ $SCOPE -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "SCOPE=1")
|
||||
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DESTDIR=\"$TEMPDIR\"")
|
||||
[ -n "$CONFIGS" ] && cmd_opts=$(add_option "$cmd_opts" "CONFIGS=\"$CONFIGS\"")
|
||||
|
||||
if [ -n "$cmd_opts" ]; then
|
||||
echo "Running: $cmd_opts make -C $DRIVER_PATH > /dev/null"
|
||||
eval "$cmd_opts make -C $DRIVER_PATH > /dev/null"
|
||||
else
|
||||
echo "Running: make -C $DRIVER_PATH > /dev/null"
|
||||
make -C $DRIVER_PATH > /dev/null
|
||||
fi
|
||||
}
|
||||
|
||||
run_app() {
|
||||
local cmd_opts=""
|
||||
[ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1")
|
||||
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"")
|
||||
[ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"")
|
||||
|
||||
if [ $DEBUG -ne 0 ]; then
|
||||
if [ -n "$cmd_opts" ]; then
|
||||
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
eval "$cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
else
|
||||
echo "Running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
fi
|
||||
else
|
||||
if [ -n "$cmd_opts" ]; then
|
||||
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER"
|
||||
eval "$cmd_opts make -C $APP_PATH run-$DRIVER"
|
||||
else
|
||||
echo "Running: make -C $APP_PATH run-$DRIVER"
|
||||
make -C $APP_PATH run-$DRIVER
|
||||
fi
|
||||
fi
|
||||
status=$?
|
||||
return $status
|
||||
}
|
||||
|
||||
main() {
|
||||
parse_args "$@"
|
||||
set_driver_path
|
||||
set_app_path
|
||||
|
||||
# execute on default installed GPU
|
||||
if [ "$DRIVER" = "gpu" ]; then
|
||||
run_app
|
||||
exit $?
|
||||
fi
|
||||
|
||||
if [ -n "$CONFIGS" ]; then
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
fi
|
||||
|
||||
if [ $REBUILD -ne 0 ]; then
|
||||
BLACKBOX_CACHE=blackbox.$DRIVER.cache
|
||||
LAST_CONFIGS=$(cat "$BLACKBOX_CACHE" 2>/dev/null || echo "")
|
||||
|
||||
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; then
|
||||
make -C $DRIVER_PATH clean-driver > /dev/null
|
||||
echo "$CONFIGS+$DEBUG+$SCOPE" > "$BLACKBOX_CACHE"
|
||||
fi
|
||||
fi
|
||||
|
||||
export VORTEX_PROFILING=$PERF_CLASS
|
||||
|
||||
make -C "$ROOT_DIR/hw" config > /dev/null
|
||||
make -C "$ROOT_DIR/runtime/stub" > /dev/null
|
||||
|
||||
if [ $TEMPBUILD -eq 1 ]; then
|
||||
# setup temp directory
|
||||
TEMPDIR=$(mktemp -d)
|
||||
mkdir -p "$TEMPDIR"
|
||||
# build stub driver
|
||||
echo "running: DESTDIR=$TEMPDIR make -C $ROOT_DIR/runtime/stub"
|
||||
DESTDIR="$TEMPDIR" make -C $ROOT_DIR/runtime/stub > /dev/null
|
||||
# register tempdir cleanup on exit
|
||||
trap "rm -rf $TEMPDIR" EXIT
|
||||
fi
|
||||
|
||||
build_driver
|
||||
run_app
|
||||
status=$?
|
||||
|
||||
if [ $DEBUG -eq 1 ] && [ -f "$APP_PATH/trace.vcd" ]; then
|
||||
mv -f $APP_PATH/trace.vcd .
|
||||
fi
|
||||
|
||||
if [ $SCOPE -eq 1 ] && [ -f "$APP_PATH/scope.vcd" ]; then
|
||||
mv -f $APP_PATH/scope.vcd .
|
||||
echo "running: make -C $APP_PATH run-$DRIVER"
|
||||
make -C $APP_PATH run-$DRIVER
|
||||
status=$?
|
||||
fi
|
||||
|
||||
exit $status
|
||||
}
|
||||
fi
|
||||
|
||||
main "$@"
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
|
||||
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
|
||||
if [ $REBUILD -ne 0 ]
|
||||
then
|
||||
BLACKBOX_CACHE=blackbox.$DRIVER.cache
|
||||
if [ -f "$BLACKBOX_CACHE" ]
|
||||
then
|
||||
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
|
||||
fi
|
||||
|
||||
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
|
||||
then
|
||||
make -C $DRIVER_PATH clean-driver > /dev/null
|
||||
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
|
||||
fi
|
||||
fi
|
||||
|
||||
# export performance monitor class identifier
|
||||
export VORTEX_PROFILING=$PERF_CLASS
|
||||
|
||||
status=0
|
||||
|
||||
# ensure config update
|
||||
make -C $ROOT_DIR/hw config > /dev/null
|
||||
|
||||
# ensure the stub driver is present
|
||||
make -C $ROOT_DIR/runtime/stub > /dev/null
|
||||
|
||||
if [ $DEBUG -ne 0 ]
|
||||
then
|
||||
# running application
|
||||
if [ $TEMPBUILD -eq 1 ]
|
||||
then
|
||||
# setup temp directory
|
||||
TEMPDIR=$(mktemp -d)
|
||||
mkdir -p "$TEMPDIR/$DRIVER"
|
||||
|
||||
# driver initialization
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
else
|
||||
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
fi
|
||||
|
||||
# running application
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
else
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
fi
|
||||
|
||||
# cleanup temp directory
|
||||
trap "rm -rf $TEMPDIR" EXIT
|
||||
else
|
||||
# driver initialization
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
else
|
||||
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
fi
|
||||
|
||||
# running application
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
else
|
||||
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
fi
|
||||
fi
|
||||
|
||||
if [ -f "$APP_PATH/trace.vcd" ]
|
||||
then
|
||||
mv -f $APP_PATH/trace.vcd .
|
||||
fi
|
||||
else
|
||||
if [ $TEMPBUILD -eq 1 ]
|
||||
then
|
||||
# setup temp directory
|
||||
TEMPDIR=$(mktemp -d)
|
||||
mkdir -p "$TEMPDIR/$DRIVER"
|
||||
|
||||
# driver initialization
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
else
|
||||
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
fi
|
||||
|
||||
# running application
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
|
||||
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER
|
||||
status=$?
|
||||
else
|
||||
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER"
|
||||
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER
|
||||
status=$?
|
||||
fi
|
||||
|
||||
# cleanup temp directory
|
||||
trap "rm -rf $TEMPDIR" EXIT
|
||||
else
|
||||
|
||||
# driver initialization
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
else
|
||||
echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH"
|
||||
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
fi
|
||||
|
||||
# running application
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
|
||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
|
||||
status=$?
|
||||
else
|
||||
echo "running: make -C $APP_PATH run-$DRIVER"
|
||||
make -C $APP_PATH run-$DRIVER
|
||||
status=$?
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
exit $status
|
||||
|
|
|
@ -1,46 +0,0 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Copyright 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -e
|
||||
|
||||
# Function to check if GCC version is less than 11
|
||||
check_gcc_version() {
|
||||
local gcc_version
|
||||
gcc_version=$(gcc -dumpversion)
|
||||
if dpkg --compare-versions "$gcc_version" lt 11; then
|
||||
return 0 # GCC version is less than 11
|
||||
else
|
||||
return 1 # GCC version is 11 or greater
|
||||
fi
|
||||
}
|
||||
|
||||
# Update package list
|
||||
apt-get update -y
|
||||
|
||||
# install system dependencies
|
||||
apt-get install -y build-essential valgrind libstdc++6 binutils python3 uuid-dev ccache cmake libffi7
|
||||
|
||||
# Check and install GCC 11 if necessary
|
||||
if check_gcc_version; then
|
||||
echo "GCC version is less than 11. Installing GCC 11..."
|
||||
add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
||||
apt-get update
|
||||
apt-get install -y g++-11 gcc-11
|
||||
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100
|
||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100
|
||||
else
|
||||
echo "GCC version is 11 or greater. No need to install GCC 11."
|
||||
fi
|
|
@ -19,8 +19,6 @@ set -e
|
|||
# clear blackbox cache
|
||||
rm -f blackbox.*.cache
|
||||
|
||||
# HW: add a test "VM Test" to make sure VM feature is enabled
|
||||
|
||||
XLEN=${XLEN:=@XLEN@}
|
||||
|
||||
XSIZE=$((XLEN / 8))
|
||||
|
@ -43,23 +41,31 @@ isa()
|
|||
make -C tests/riscv/isa run-simx
|
||||
make -C tests/riscv/isa run-rtlsim
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
if [ "$XLEN" == "64" ]
|
||||
then
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-64d
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-64d
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-64f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-64f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64fx
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-64fx
|
||||
fi
|
||||
|
||||
# clean build
|
||||
|
@ -94,18 +100,10 @@ regression()
|
|||
# test global barrier
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tgbar" --cores=2
|
||||
|
||||
# test local barrier
|
||||
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
|
||||
./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar"
|
||||
./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tbar"
|
||||
|
||||
# test temp driver mode for
|
||||
./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3
|
||||
|
||||
# test for matmul
|
||||
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"
|
||||
|
||||
echo "regression tests done!"
|
||||
}
|
||||
|
@ -126,22 +124,6 @@ opencl()
|
|||
echo "opencl tests done!"
|
||||
}
|
||||
|
||||
vm(){
|
||||
echo "begin vm tests..."
|
||||
|
||||
make -C sim/simx clean && CONFIGS="-DVM_ENABLE" make -C sim/simx
|
||||
make -C runtime/simx clean && CONFIGS="-DVM_ENABLE" make -C runtime/simx
|
||||
make -C tests/opencl run-simx
|
||||
make -C tests/regression run-simx
|
||||
|
||||
make -C sim/simx clean && CONFIGS="-DVM_ENABLE -DVM_ADDR_MODE=BARE" make -C sim/simx
|
||||
make -C runtime/simx clean && CONFIGS="-DVM_ENABLE -DVM_ADDR_MODE=BARE" make -C runtime/simx
|
||||
make -C tests/opencl run-simx
|
||||
make -C tests/regression run-simx
|
||||
|
||||
echo "vm tests done!"
|
||||
}
|
||||
|
||||
cache()
|
||||
{
|
||||
echo "begin cache tests..."
|
||||
|
@ -158,33 +140,27 @@ cache()
|
|||
|
||||
# reduce l1 line size
|
||||
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
|
||||
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DDISABLE_L1" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
|
||||
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr
|
||||
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
|
||||
|
||||
# test cache ways
|
||||
CONFIGS="-DICACHE_NUM_WAYS=1 -DDCACHE_NUM_WAYS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
|
||||
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
|
||||
|
||||
# test cache banking
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=simx --app=sgemmx
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=simx --app=sgemmx --threads=8
|
||||
|
||||
# replacement policy
|
||||
CONFIGS="-DDCACHE_REPL_POLICY=0" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DDCACHE_REPL_POLICY=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DDCACHE_REPL_POLICY=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
|
||||
|
||||
# test writeback
|
||||
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=0 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress
|
||||
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress
|
||||
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=simx --app=mstress
|
||||
CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
|
||||
CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
|
||||
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
|
||||
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
|
||||
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
|
||||
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
|
||||
|
||||
# cache clustering
|
||||
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2
|
||||
|
@ -259,39 +235,33 @@ config2()
|
|||
# test opaesim
|
||||
./ci/blackbox.sh --driver=opae --app=printf
|
||||
./ci/blackbox.sh --driver=opae --app=diverge
|
||||
./ci/blackbox.sh --driver=xrt --app=diverge
|
||||
|
||||
# disable DPI
|
||||
if [ "$XLEN" == "64" ]; then
|
||||
# need to disable trig on 64-bit due to a bug inside fpnew's sqrt core.
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-xtrig -xbar -xgbar"
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-xtrig -xbar -xgbar"
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-xtrig -xbar -xgbar"
|
||||
else
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood
|
||||
fi
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
|
||||
|
||||
# custom program startup address
|
||||
make -C tests/regression/dogfood clean-kernel
|
||||
STARTUP_ADDR=0x80000000 make -C tests/regression/dogfood
|
||||
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
|
||||
./ci/blackbox.sh --driver=simx --app=dogfood
|
||||
./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
make -C tests/regression/dogfood clean-kernel
|
||||
|
||||
# disabling M & F extensions
|
||||
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32i
|
||||
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32i
|
||||
make -C sim/rtlsim clean
|
||||
|
||||
# disabling ZICOND extension
|
||||
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
|
||||
|
||||
# test 128-bit memory block
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
# test AXI bus
|
||||
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress
|
||||
|
||||
# test XLEN-bit memory block
|
||||
# test 128-bit MEM block
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
|
||||
# test XLEN-bit MEM block
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress
|
||||
|
||||
|
@ -299,35 +269,11 @@ config2()
|
|||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
|
||||
|
||||
# test single-bank memory
|
||||
if [ "$XLEN" == "64" ]; then
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
else
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
fi
|
||||
# test single-bank DRAM
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
|
||||
# test larger memory address
|
||||
if [ "$XLEN" == "64" ]; then
|
||||
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
else
|
||||
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=xrt --app=mstress
|
||||
fi
|
||||
|
||||
# test memory banks interleaving
|
||||
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=1" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
|
||||
# test memory ports
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8
|
||||
# test 27-bit DRAM address
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress
|
||||
|
||||
echo "configuration-2 tests done!"
|
||||
}
|
||||
|
@ -353,32 +299,20 @@ debug()
|
|||
|
||||
test_csv_trace
|
||||
|
||||
CONFIGS="-O0" ./ci/blackbox.sh --driver=opae --app=demo --args="-n1"
|
||||
CONFIGS="-O0" ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1"
|
||||
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
|
||||
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=xrt --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
|
||||
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
|
||||
|
||||
echo "debugging tests done!"
|
||||
}
|
||||
|
||||
scope()
|
||||
{
|
||||
echo "begin scope tests..."
|
||||
|
||||
SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=opae --app=demo --args="-n1" --scope
|
||||
SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1" --scope
|
||||
|
||||
echo "debugging scope done!"
|
||||
}
|
||||
|
||||
stress()
|
||||
{
|
||||
echo "begin stress tests..."
|
||||
|
||||
# test verilator reset values
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=xrt --app=sgemmx --args="-n128" --l2cache
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
|
||||
|
||||
echo "stress tests done!"
|
||||
}
|
||||
|
@ -388,25 +322,15 @@ synthesis()
|
|||
echo "begin synthesis tests..."
|
||||
|
||||
PREFIX=build_base make -C hw/syn/yosys clean
|
||||
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" make -C hw/syn/yosys synthesis
|
||||
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis
|
||||
|
||||
echo "synthesis tests done!"
|
||||
}
|
||||
|
||||
vector()
|
||||
{
|
||||
echo "begin vector tests..."
|
||||
|
||||
make -C sim/simx clean && CONFIGS="-DEXT_V_ENABLE" make -C sim/simx
|
||||
TOOLDIR=@TOOLDIR@ XLEN=@XLEN@ VLEN=256 REG_TESTS=1 ./tests/riscv/riscv-vector-tests/run-test.sh
|
||||
|
||||
echo "vector tests done!"
|
||||
}
|
||||
|
||||
show_usage()
|
||||
{
|
||||
echo "Vortex Regression Test"
|
||||
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [--h|--help]"
|
||||
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
|
||||
}
|
||||
|
||||
declare -a tests=()
|
||||
|
@ -435,9 +359,6 @@ while [ "$1" != "" ]; do
|
|||
--cache )
|
||||
tests+=("cache")
|
||||
;;
|
||||
--vm )
|
||||
tests+=("vm")
|
||||
;;
|
||||
--config1 )
|
||||
tests+=("config1")
|
||||
;;
|
||||
|
@ -447,18 +368,12 @@ while [ "$1" != "" ]; do
|
|||
--debug )
|
||||
tests+=("debug")
|
||||
;;
|
||||
--scope )
|
||||
tests+=("scope")
|
||||
;;
|
||||
--stress )
|
||||
tests+=("stress")
|
||||
;;
|
||||
--synthesis )
|
||||
tests+=("synthesis")
|
||||
;;
|
||||
--vector )
|
||||
tests+=("vector")
|
||||
;;
|
||||
--all )
|
||||
tests=()
|
||||
tests+=("unittest")
|
||||
|
@ -467,14 +382,11 @@ while [ "$1" != "" ]; do
|
|||
tests+=("regression")
|
||||
tests+=("opencl")
|
||||
tests+=("cache")
|
||||
tests+=("vm")
|
||||
tests+=("config1")
|
||||
tests+=("config2")
|
||||
tests+=("debug")
|
||||
tests+=("scope")
|
||||
tests+=("stress")
|
||||
tests+=("synthesis")
|
||||
tests+=("vector")
|
||||
;;
|
||||
-h | --help )
|
||||
show_usage
|
||||
|
|
27
ci/system_updates.sh
Executable file
27
ci/system_updates.sh
Executable file
|
@ -0,0 +1,27 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Copyright 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
set -e
|
||||
|
||||
apt-get update -y
|
||||
|
||||
add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
||||
apt-get update
|
||||
apt-get install -y g++-11 gcc-11
|
||||
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100
|
||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100
|
||||
|
||||
apt-get install -y build-essential valgrind libstdc++6 binutils python uuid-dev ccache
|
|
@ -1,13 +1,13 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Copyright 2019-2023
|
||||
#
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -15,6 +15,7 @@
|
|||
# limitations under the License.
|
||||
|
||||
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
|
||||
|
||||
export PATH=$TOOLDIR/verilator/bin:$PATH
|
||||
|
||||
export SV2V_PATH=$TOOLDIR/sv2v
|
||||
|
|
|
@ -23,9 +23,9 @@ OSVERSION=${OSVERSION:=@OSVERSION@}
|
|||
riscv32()
|
||||
{
|
||||
case $OSVERSION in
|
||||
"centos/7") parts=$(eval echo {a..l}) ;;
|
||||
"ubuntu/bionic") parts=$(eval echo {a..j}) ;;
|
||||
*) parts=$(eval echo {a..k}) ;;
|
||||
"centos/7") parts=$(eval echo {a..h}) ;;
|
||||
"ubuntu/focal") parts=$(eval echo {a..k}) ;;
|
||||
*) parts=$(eval echo {a..j}) ;;
|
||||
esac
|
||||
rm -f riscv32-gnu-toolchain.tar.bz2.parta*
|
||||
for x in $parts
|
||||
|
@ -41,7 +41,7 @@ riscv32()
|
|||
riscv64()
|
||||
{
|
||||
case $OSVERSION in
|
||||
"centos/7") parts=$(eval echo {a..l}) ;;
|
||||
"centos/7") parts=$(eval echo {a..h}) ;;
|
||||
*) parts=$(eval echo {a..j}) ;;
|
||||
esac
|
||||
rm -f riscv64-gnu-toolchain.tar.bz2.parta*
|
||||
|
|
|
@ -44,8 +44,7 @@ def load_config(filename):
|
|||
'num_barriers': int(config_match.group(7)),
|
||||
}
|
||||
return config
|
||||
print("Error: missing CONFIGS: header")
|
||||
sys.exit(1)
|
||||
return None
|
||||
|
||||
def parse_simx(log_lines):
|
||||
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
|
||||
|
@ -275,8 +274,6 @@ def split_log_file(log_filename):
|
|||
|
||||
if current_sublog is not None:
|
||||
sublogs.append(current_sublog)
|
||||
else:
|
||||
sublogs.append(log_lines)
|
||||
|
||||
return sublogs
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/env python3
|
||||
#!/usr/bin/env python
|
||||
|
||||
# Copyright 2019-2023
|
||||
#
|
||||
|
|
|
@ -31,4 +31,7 @@ RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv$(XLEN)-gnu-toolchain
|
|||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
||||
|
||||
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party
|
||||
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
|
||||
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
|
||||
|
||||
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party
|
10
configure
vendored
10
configure
vendored
|
@ -26,8 +26,6 @@ detect_osversion() {
|
|||
case "$VERSION_CODENAME" in
|
||||
bionic) osversion="ubuntu/bionic";;
|
||||
focal) osversion="ubuntu/focal";;
|
||||
jammy) osversion="ubuntu/focal";;
|
||||
noble) osversion="ubuntu/focal";;
|
||||
# Add new versions as needed
|
||||
esac
|
||||
;;
|
||||
|
@ -65,7 +63,7 @@ copy_files() {
|
|||
filename_no_ext="${filename%.in}"
|
||||
dest_file="$dest_dir/$filename_no_ext"
|
||||
mkdir -p "$dest_dir"
|
||||
sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g" "$file" > "$dest_file"
|
||||
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g" "$file" > "$dest_file"
|
||||
# apply permissions to bash scripts
|
||||
read -r firstline < "$dest_file"
|
||||
if [[ "$firstline" =~ ^#!.*bash ]]; then
|
||||
|
@ -169,8 +167,8 @@ fi
|
|||
SUBDIRS=("." "!ci" "!perf" "hw*" "kernel*" "runtime*" "sim*" "tests*")
|
||||
|
||||
# Get the directory of the script
|
||||
SOURCE_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
|
||||
|
||||
THIRD_PARTY_DIR=$SOURCE_DIR/third_party
|
||||
THIRD_PARTY_DIR=$SCRIPT_DIR/third_party
|
||||
|
||||
copy_files "$SOURCE_DIR" "$CURRENT_DIR"
|
||||
copy_files "$SCRIPT_DIR" "$CURRENT_DIR"
|
||||
|
|
79
docs/altera_fpga_guide.md
Normal file
79
docs/altera_fpga_guide.md
Normal file
|
@ -0,0 +1,79 @@
|
|||
# FPGA Startup and Configuration Guide
|
||||
|
||||
OPAE Environment Setup
|
||||
----------------------
|
||||
|
||||
$ source /opt/inteldevstack/init_env_user.sh
|
||||
$ export OPAE_HOME=/opt/opae/1.1.2
|
||||
$ export PATH=$OPAE_HOME/bin:$PATH
|
||||
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
|
||||
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
|
||||
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
|
||||
|
||||
OPAE Build
|
||||
------------------
|
||||
|
||||
The FPGA has the following configuration options:
|
||||
- DEVICE_FAMILY=arria10 | stratix10
|
||||
- NUM_CORES=#n
|
||||
|
||||
Command line:
|
||||
|
||||
$ cd hw/syn/altera/opae
|
||||
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
|
||||
|
||||
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
|
||||
Setting TARGET=ase will build the project for simulation using Intel ASE.
|
||||
|
||||
|
||||
OPAE Build Configuration
|
||||
------------------------
|
||||
|
||||
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
|
||||
- `NUM_WARPS`: Number of warps per cores
|
||||
- `NUM_THREADS`: Number of threads per warps
|
||||
- `PERF_ENABLE`: enable the use of all profile counters
|
||||
|
||||
You configure the synthesis build from the command line:
|
||||
|
||||
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
|
||||
|
||||
OPAE Build Progress
|
||||
-------------------
|
||||
|
||||
You could check the last 10 lines in the build log for possible errors until build completion.
|
||||
|
||||
$ tail -n 10 <build_dir>/build.log
|
||||
|
||||
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
|
||||
|
||||
$ ps -u <username>
|
||||
|
||||
If the build fails and you need to restart it, clean up the build folder using the following command:
|
||||
|
||||
$ make clean
|
||||
|
||||
The bitstream file `vortex_afu.gbs` should exist when the build is done:
|
||||
|
||||
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
|
||||
|
||||
|
||||
Signing the bitstream and Programming the FPGA
|
||||
----------------------------------------------
|
||||
|
||||
$ cd <build_dir>
|
||||
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
|
||||
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
|
||||
|
||||
Sample FPGA Run Test
|
||||
--------------------
|
||||
|
||||
Ensure you have the correct opae runtime for the FPGA target
|
||||
|
||||
$ make -C runtime/opae clean
|
||||
$ TARGET=FPGA make -C runtime/opae
|
||||
|
||||
Run the following from your Vortex build directory
|
||||
|
||||
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
|
||||
|
|
@ -1,37 +1,18 @@
|
|||
# Contributing to Vortex
|
||||
# Contributing to Vortex on Github
|
||||
|
||||
## Github
|
||||
Vortex uses Github to host its git repositories.
|
||||
There are a lot of ways to use the features on Github for collaboration.
|
||||
Therefore, this documentation details the standard procedure for contributing to Vortex.
|
||||
Development of Vortex is consolidated to this repo, `vortex` and any associated forks.
|
||||
Previously, there was active work done on a private repo named `vortex-dev`.
|
||||
`vortex-dev` has officially been deprecated and fully merged into this public repo, `vortex`.
|
||||
If you are returning to this project and have legacy versions of Vortex, you can use the releases branches to access older versions.
|
||||
## Github Details
|
||||
- There are two main repos, `vortex` (public, this one) and `vortex-dev` (private)
|
||||
- todo: Most current development is on `vortex`
|
||||
- If you have a legacy version of `vortex`, you can use the releases branch or tags to access the repo at that point in time
|
||||
|
||||
## Contribution Process
|
||||
In an effort to keep `vortex` organized, permissions to directly create branches and push code have been limited to admins.
|
||||
However, contributions are strongly encouraged and keep the project moving forward! Here is the procedure for contributing:
|
||||
- You should create a new branch from develop that is clearly named with the feature that you want to add
|
||||
- Avoid pushing directly to the `master` branch; instead, you will need to make a Pull Request (PR)
|
||||
- There should be protections in place that prevent pushing directly to the main branch, but don't rely on it
|
||||
- When you make a PR it will be tested against the continuous integration (ci) pipeline (see `continuous_integration.md`)
|
||||
- It is not sufficient to just write some tests, they need to be incorporated into the ci pipeline to make sure they are run
|
||||
- During a PR, you might receive feedback regarding your changes and you might need to make further commits to your branch
|
||||
|
||||
1. Create a fork of `vortex`
|
||||
2. In your fork, create a branch from `master` that briefly explains the work you are adding (ie: `develop-documentation`)
|
||||
3. Make your changes on the new branch in your fork. You may create as many commits as you need, which might be common if you are making multiple iterations
|
||||
4. Since you are the owner of your fork, you have full permissions to push commits to your fork
|
||||
4. When you are satisfied with the changes on your fork, you can open a PR from your fork using the online interface
|
||||
5. If you recently made a push, you will automatically get a prompt on Github online to create a PR, which you can press
|
||||
6. Otherwise, you can go to your fork on Github online and manually create a PR (todo)
|
||||
(todo): how to name and format your PR, what information you should add to the PR, does not need to be too strict if you are attending the weekly meetings*
|
||||
7. Github uses the following semantics: `base repository` gets the changes from your `head repository`
|
||||
8. Therefore, you should set the `base repository` to `vortexgpgpu/vortex` and the `base` branch to `master` since the master branch is protected by reviewed PRs.
|
||||
9. And you should assign the `head repository` to `<your-github-username>/vortex` (which represents your fork of vortex) and the `compare` branch to the one created in step 2
|
||||
10. Now that your intended PR has been specified, you should review the status. Check for merge conflicts, if all your commits are present, and all the modified files make sense
|
||||
11. You can still make a PR if there are issues in step 10, just make sure the structure is correct according to steps 7-9
|
||||
12. Once the PR is made, the CI pipeline will run automatically, testing your changes
|
||||
13. Remember, a PR is flexible: if you need to make changes to the code, you can go back to your branch of the fork to commit and push any updates
|
||||
14. As long as the `head repository`'s `compare` branch is the one you edited, the PR will automatically get the most recent changes
|
||||
15. When all merge conflicts are resolved, changes are made, and tests pass you can have an admin merge your PR
|
||||
|
||||
## What Makes a Good Contribution?
|
||||
- If you are contributing code changes, then review [testing.md](./testing.md) to ensure your tests are integrated into the [CI pipeline](continuous_integration.md)
|
||||
- During a PR, you should consider the advice you are provided by your reviewers. Remember, you can keep adding commits to an open PR!
|
||||
- If your change aims to fix an issue opened on Github, please tag that issue in the PR itself
|
||||
## Creating and Adding Tests
|
||||
see `testing.md`
|
|
@ -33,13 +33,7 @@ The recommended method to enable debugging is to pass the `--debug` flag to `bla
|
|||
// Running demo program on rtlsim in debug mode
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
|
||||
|
||||
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution.
|
||||
By default all library modules under the /libs/ folder are excluded from the trace to reduce the waveform file size. You can change that behavior by either explicitly commenting out `TRACING_OFF`/`TRACING_ON` inside a lib module source (e.g. VX_stream_buffer.sv) or simply enabling a full trace using the following command.
|
||||
|
||||
// Debugging the demo program with rtlsim in full tracing mode
|
||||
$ CONFIGS="-DTRACING_ALL" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
|
||||
|
||||
You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
|
||||
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
|
||||
|
||||
## FPGA Debugging
|
||||
|
||||
|
|
|
@ -1,19 +1,16 @@
|
|||
# Environment Setup
|
||||
|
||||
These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
|
||||
|
||||
## Set Up on Your Own System
|
||||
|
||||
## Set Up on Your Own System
|
||||
The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md).
|
||||
|
||||
|
||||
## Servers for Georgia Tech Students and Collaborators
|
||||
|
||||
### Volvo
|
||||
|
||||
Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up.
|
||||
|
||||
Setup on Volvo:
|
||||
|
||||
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
|
||||
2. `ssh volvo.cc.gatech.edu`
|
||||
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
|
||||
|
@ -22,11 +19,9 @@ Setup on Volvo:
|
|||
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
|
||||
|
||||
### Nio
|
||||
|
||||
Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio.
|
||||
|
||||
Setup on Nio:
|
||||
|
||||
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
|
||||
2. `ssh nio.cc.gatech.edu`
|
||||
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
|
||||
|
@ -34,12 +29,11 @@ Setup on Nio:
|
|||
5. `make -s` in the `vortex` root directory
|
||||
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
|
||||
|
||||
## Docker (Experimental)
|
||||
|
||||
## Docker (Experimental)
|
||||
Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported.
|
||||
|
||||
### Setup with Docker
|
||||
|
||||
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
|
||||
2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo.
|
||||
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .`
|
||||
|
|
|
@ -1,217 +0,0 @@
|
|||
# FPGA Startup and Configuration Guide
|
||||
|
||||
## Gaining Access to FPGAs with CRNCH
|
||||
If you are associated with Georgia Tech (or related workshops), you can use CRNCH's server to gain remote access to FPGAs. Otherwise, you can skip to the Xilinx or Intel (Altera) synthesis steps below.
|
||||
|
||||
## What is CRNCH?
|
||||
|
||||
**C**enter for **R**esearch into **N**ovel **C**omputing **H**ierarchies
|
||||
|
||||
## What does CRNCH Offer?
|
||||
|
||||
**The Rogues Gallery (RG)**: a new concept focused on developing our understanding of next-generation hardware with a focus on unorthodox and uncommon technologies. **RG** will acquire new and unique hardware (i.e., the aforementioned “*rogues*”) from vendors, research labs, and startups and make this hardware available to students, faculty, and industry collaborators within a managed data center environment.
|
||||
|
||||
## Why are the Rogues Important?
|
||||
|
||||
By exposing students and researchers to this set of unique hardware, we hope to foster cross-cutting discussions about hardware designs that will drive future *performance improvements in computing long after the Moore’s Law era of “cheap transistors” ends*. Specifically, the Rogues Gallery contains FPGAs onto which the Vortex hardware can be synthesized.
|
||||
|
||||
## How is the Rogues Gallery Funded?
|
||||
|
||||
The Rogues Gallery testbed is primarily supported by the National Science Foundation (NSF) under NSF Award Number [#2016701](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2016701&HistoricalAwards=false).
|
||||
|
||||
## Rogues Gallery Documentation
|
||||
|
||||
You can read about RG in more detail on its official documentation [page](https://gt-crnch-rg.readthedocs.io/en/main/index.html#).
|
||||
|
||||
You can listen to a talk about RG [here](https://mediaspace.gatech.edu/media/Jeff%20Young%20-%20Rogues%20Gallery%20-%20CRNCH%20Summit%202021/1_lqlgr0jj)
|
||||
|
||||
[CRNCH Summit 2023](https://github.com/gt-crnch/crnch-summit-2023/tree/main)
|
||||
|
||||
## Request Access for Rogues Gallery
|
||||
|
||||
You should use [this form](https://crnch-rg.cc.gatech.edu/request-rogues-gallery-access/) to request access to RG’s reconfigurable computing (vortex fpga) resources. You should receive an email when your ticket is created. Once it has been processed, you should get an email confirming that your access has been granted. Processing might take some time.
|
||||
|
||||
## How to Access Rogues Gallery?
|
||||
There are two methods of accessing CRNCH's Rogues Gallery:
|
||||
1) Web-based GUI: [rg-ood.crnch.gatech.edu](http://rg-ood.crnch.gatech.edu/)
|
||||
2) SSH: `ssh <your-gt-username>@rg-login.crnch.gatech.edu`
|
||||
|
||||
|
||||
## Where should I keep my files?
|
||||
The CRNCH servers have a folder called `USERSCRATCH` which can be found in your home directory: `echo $HOME`. You should keep all your files in this folder since it is available across all the Rogues Gallery nodes.
|
||||
|
||||
## **What Machines are Available in the Rogues Gallery?**
|
||||
|
||||
Complete list of machines can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/rg-hardware.html). Furthermore, you can find detailed information about the FPGA hardware [here](https://gt-crnch-rg.readthedocs.io/en/main/reconfig/xilinx/xilinx-getting-started.html).
|
||||
|
||||
## Allocate an FPGA Node
|
||||
Once you’ve connected to the CRNCH login node, you can use the Slurm scheduler to request an interactive job using `salloc`. This [page](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm.html) explains why we use Slurm to request resources. Documentation for `salloc` can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm-examples.html). And here.
|
||||
|
||||
|
||||
To request 16 cores and 64GB of RAM for 6 hours on an FPGA dev node (`flubber1` in this example):
|
||||
```bash
|
||||
salloc -p rg-fpga --nodes=1 --ntasks-per-node=16 --mem=64G --nodelist flubber1 --time=06:00:00
|
||||
```
|
||||
Synthesis for Xilinx Boards
|
||||
----------------------
|
||||
Once you are logged in, you will need to complete some first time configurations. If you are interested in the Intel (Altera) synthesis steps, scroll down below.
|
||||
|
||||
### Source Configuration Scripts
|
||||
```
|
||||
# From any directory
|
||||
$ source /opt/xilinx/xrt/setup.sh
|
||||
$ source /tools/reconfig/xilinx/Vitis/2023.1/settings64.sh
|
||||
```
|
||||
|
||||
### Check Installed FPGA Platforms
|
||||
`platforminfo -l` which tells us the correct name of the platform installed on the current fpga node. It should be used for the `PLATFORM` variable below. Otherwise, if there is an error then there was an issue with the previous two commands.
|
||||
|
||||
### Install Vortex Toolchain
|
||||
The Xilinx synthesis process requires verilator to generate the bitstream. Eventually, you will need the whole toolchain to run the bitstream on the FPGA. Therefore, the Vortex toolchain can be installed as follows. If you complete these steps properly, you should only need to complete them once and you can skip to `Activate Vortex Toolchain`.
|
||||
```
|
||||
# Make a build directory from root and configure scripts for your environment
|
||||
mkdir build && cd build && ../configure --tooldir=$HOME/tools
|
||||
|
||||
# Install the whole prebuilt toolchain
|
||||
./ci/toolchain_install.sh --all
|
||||
|
||||
# Add environment variables to bashrc
|
||||
echo "source <full-path-to-vortex-root>/vortex/build/ci/toolchain_env.sh" >> ~/.bashrc
|
||||
```
|
||||
|
||||
### Activate Vortex Toolchain
|
||||
```
|
||||
# From any directory
|
||||
source ~/.bashrc
|
||||
|
||||
# Check environment setup
|
||||
verilator --version
|
||||
```
|
||||
|
||||
### Build the FPGA Bitstream
|
||||
The root directory contains the path `hw/syn/xilinx/xrt` which has the makefile used to generate the Vortex bitstream.
|
||||
|
||||
```
|
||||
$ cd hw/syn/xilinx/xrt
|
||||
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=1 make > build_u250_hw_1c.log 2>&1 &
|
||||
```
|
||||
Will run the synthesis under new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
|
||||
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
|
||||
|
||||
For long-running jobs, this makefile can be invoked in the following form:
|
||||
|
||||
`[CONFIGS=<vortex macros>] [PREFIX=<prefix directory name>] [NUM_CORES=<#>] TARGET=hw|hw_emu PLATFORM=<platform baseName> nohup make > <log filename> 2>&1 &`
|
||||
|
||||
For example:
|
||||
|
||||
```bash
|
||||
CONFIGS="-DL2_ENABLE -DDCACHE_SIZE=8192" PREFIX=build_4c_u280 NUM_CORES=4 TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202310_1 nohup make > build_u250_hw_4c.log 2>&1 &
|
||||
```
|
||||
|
||||
The build is complete when the bitstream file `vortex_afu.xclbin` exists in `<prefix directory name>_<platform baseName>_<hw|hw_emu>/bin`.
|
||||
|
||||
### Running a Program on Xilinx FPGA
|
||||
|
||||
The [blackbox.sh](./simulation.md) script within the build directory can be used to run a test with Vortex’s xrt driver using the following command:
|
||||
|
||||
`FPGA_BIN_DIR=<path to bitstream directory> TARGET=hw|hw_emu PLATFORM=<platform baseName> ./ci/blackbox.sh --driver=xrt --app=<test name>`
|
||||
|
||||
For example:
|
||||
|
||||
```FPGA_BIN_DIR=<realpath> hw/syn/xilinx/xrt/build_4c_u280_xilinx_u280_gen3x16_xdma_1_202211_1_hw/bin TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202211_1 ./ci/blackbox.sh --driver=xrt --app=demo```
|
||||
|
||||
Synthesis for Intel (Altera) Boards
|
||||
----------------------
|
||||
|
||||
### OPAE Environment Setup
|
||||
|
||||
|
||||
$ source /opt/inteldevstack/init_env_user.sh
|
||||
$ export OPAE_HOME=/opt/opae/1.1.2
|
||||
$ export PATH=$OPAE_HOME/bin:$PATH
|
||||
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
|
||||
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
|
||||
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
|
||||
|
||||
### OPAE Build
|
||||
|
||||
The FPGA has the following configuration options:
|
||||
- DEVICE_FAMILY=arria10 | stratix10
|
||||
- NUM_CORES=#n
|
||||
|
||||
Command line:
|
||||
|
||||
$ cd hw/syn/altera/opae
|
||||
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
|
||||
|
||||
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
|
||||
Setting TARGET=ase will build the project for simulation using Intel ASE.
|
||||
|
||||
|
||||
### OPAE Build Configuration
|
||||
|
||||
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
|
||||
- `NUM_WARPS`: Number of warps per core
|
||||
- `NUM_THREADS`: Number of threads per warp
|
||||
- `PERF_ENABLE`: enable the use of all profile counters
|
||||
|
||||
You can configure the synthesis build from the command line:
|
||||
|
||||
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
|
||||
|
||||
### OPAE Build Progress
|
||||
|
||||
You could check the last 10 lines in the build log for possible errors until build completion.
|
||||
|
||||
$ tail -n 10 <build_dir>/build.log
|
||||
|
||||
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
|
||||
|
||||
$ ps -u <username>
|
||||
|
||||
If the build fails and you need to restart it, clean up the build folder using the following command:
|
||||
|
||||
$ make clean
|
||||
|
||||
The file `vortex_afu.gbs` should exist when the build is done:
|
||||
|
||||
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
|
||||
|
||||
|
||||
### Signing the bitstream and Programming the FPGA
|
||||
|
||||
$ cd <build_dir>
|
||||
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
|
||||
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
|
||||
|
||||
### Sample FPGA Run Test
|
||||
Ensure you have the correct opae runtime for the FPGA target
|
||||
|
||||
```
|
||||
$ TARGET=FPGA make -C runtime/opae
|
||||
```
|
||||
|
||||
Run the [blackbox.sh](./simulation.md) from your Vortex build directory
|
||||
|
||||
```
|
||||
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
|
||||
```
|
||||
|
||||
### FPGA sample test running OpenCL sgemm kernel
|
||||
|
||||
You can use the `blackbox.sh` script to run the following from your Vortex build directory
|
||||
|
||||
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
|
||||
|
||||
### Testing Vortex using OPAE with Intel ASE Simulation
|
||||
Building ASE synthesis
|
||||
|
||||
```$ TARGET=asesim make -C runtime/opae```
|
||||
|
||||
Building ASE runtime
|
||||
|
||||
```$ TARGET=asesim make -C runtime/opae```
|
||||
|
||||
Running ASE simulation
|
||||
|
||||
```$ ASE_LOG=0 ASE_WORKDIR=<build_dir>/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n16"```
|
|
@ -2,8 +2,32 @@
|
|||
|
||||
## Table of Contents
|
||||
|
||||
- [Codebase Layout](codebase.md): Summary of repo file tree
|
||||
- [Microarchitecture](microarchitecture.md): Vortex Pipeline and cache microarchitectural details and reconfigurability
|
||||
- [Simulation](simulation.md): Details for building and running each simulation driver
|
||||
- [Contributing](contributing.md): Process for contributing your own features including repo semantics and testing
|
||||
- [Debugging](debugging.md): Debugging configurations for each Vortex driver
|
||||
- [Codebase Layout](codebase.md)
|
||||
- [Microarchitecture](microarchitecture.md)
|
||||
- [Cache Subsystem](cache_subsystem.md)
|
||||
- [Software](software.md)
|
||||
- [Simulation](simulation.md)
|
||||
- [Altera FPGA Setup Guide](altera_fpga_guide.md)
|
||||
- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md)
|
||||
- [Debugging](debugging.md)
|
||||
- [Useful Links](references.md)
|
||||
|
||||
## Installation
|
||||
|
||||
- For the different environments Vortex supports, [read this document](environment_setup.md).
|
||||
- To install on your own system, [follow this document](install_vortex.md).
|
||||
|
||||
## Quick Start Scenarios
|
||||
|
||||
Running Vortex simulators with different configurations:
|
||||
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
|
||||
|
||||
- Run demo driver test with opae driver and Vortex config of 1 cluster, 4 cores, 4 warps, 2 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
|
||||
|
||||
- Run dogfood driver test with simx driver and Vortex config of 4 clusters, 4 cores, 8 warps, 6 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
|
||||
|
|
|
@ -77,7 +77,4 @@ Vortex has a 6-stage pipeline:
|
|||
- Sockets
|
||||
- Grouping multiple cores sharing L1 cache
|
||||
- Clusters
|
||||
- Grouping of sockets sharing L2 cache
|
||||
|
||||
### Vortex Cache Subsystem
|
||||
More details about the cache subsystem are provided [here](./cache_subsystem.md).
|
||||
- Grouping of sockets sharing L2 cache
|
|
@ -6,16 +6,13 @@
|
|||
|
||||
### Cycle-Approximate Simulation
|
||||
|
||||
SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simx` folder. The [readme](README.md) has the most detailed instructions for building and running simX.
|
||||
|
||||
- To install on your own system, [follow this document](install_vortex.md).
|
||||
- For the different Georgia Tech environments Vortex supports, [read this document](environment_setup.md).
|
||||
SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simX` folder.
|
||||
|
||||
### FPGA Simulation
|
||||
|
||||
The guide to build the fpga with specific configurations is located [here.](fpga_setup.md) You can find instructions for both Xilinx and Altera based FPGAs.
|
||||
The current target FPGA for simulation is the Arria10 Intel Accelerator Card v1.0. The guide to build the fpga with specific configurations is located [here.](fpga_setup.md)
|
||||
|
||||
### How to Test (using `blackbox.sh`)
|
||||
### How to Test
|
||||
|
||||
Running tests under specific drivers (rtlsim,simx,fpga) is done using the script named `blackbox.sh` located in the `ci` folder. Running command `./ci/blackbox.sh --help` from the Vortex root directory will display the following command line arguments for `blackbox.sh`:
|
||||
|
||||
|
@ -50,20 +47,4 @@ PERF: core1: instrs=90693, cycles=53108, IPC=1.707709
|
|||
PERF: core2: instrs=90849, cycles=53107, IPC=1.710678
|
||||
PERF: core3: instrs=90836, cycles=50347, IPC=1.804199
|
||||
PERF: instrs=363180, cycles=53108, IPC=6.838518
|
||||
```
|
||||
|
||||
## Additional Quick Start Scenarios
|
||||
|
||||
Running Vortex simulators with different configurations and drivers is supported. For example:
|
||||
|
||||
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
|
||||
|
||||
- Run demo driver test with opae driver and Vortex config of 1 cluster, 4 cores, 4 warps, 2 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
|
||||
|
||||
- Run dogfood driver test with simx driver and Vortex config of 4 clusters, 4 cores, 8 warps, 6 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
|
||||
```
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
## Running a Vortex application
|
||||
|
||||
The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree. It gets copied into the `build` directory with all the environment variables resolved, so you should run it from the `build` directory as follows:
|
||||
The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree.
|
||||
You can query the commandline options of the tool using:
|
||||
|
||||
$ ./ci/blackbox.sh --help
|
||||
|
@ -49,4 +49,4 @@ Compile your test: `$ make -C tests/regression/<test-name>`
|
|||
Run your test: `$ ./ci/blackbox.sh --driver=simx --app=<test-name> --debug`
|
||||
|
||||
## Adding Your Tests to the CI Pipeline
|
||||
If you are a contributor, then you will need to add tests that integrate into the continuous integration pipeline. Remember, Pull Requests cannot be merged unless new code has tests and existing tests do not regress. Furthermore, if you are contributing a new feature, it is recommended that you add the ability to enable / disable the new feature that you are adding. See more at [contributing.md](contributing.md) and [continuous_integration.md](continuous_integration.md).
|
||||
See `continuous_integration.md`
|
36
docs/xilinx_fpga_guide.md
Normal file
36
docs/xilinx_fpga_guide.md
Normal file
|
@ -0,0 +1,36 @@
|
|||
# FPGA Startup and Configuration Guide
|
||||
|
||||
XRT Environment Setup
|
||||
----------------------
|
||||
|
||||
$ source /opt/xilinx/Vitis/2023.1/settings64.sh
|
||||
$ source /opt/xilinx/xrt/setup.sh
|
||||
|
||||
|
||||
Check Installed FPGA Platforms
|
||||
------------------------------
|
||||
|
||||
$ platforminfo -l
|
||||
|
||||
|
||||
Build FPGA image
|
||||
----------------
|
||||
|
||||
$ cd hw/syn/xilinx/xrt
|
||||
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make
|
||||
|
||||
Will run the synthesis under new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
|
||||
|
||||
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
|
||||
|
||||
Sample FPGA Run Test
|
||||
--------------------
|
||||
|
||||
Ensure you have the correct xrt runtime for the FPGA target
|
||||
|
||||
$ make -C runtime/xrt clean
|
||||
$ TARGET=hw make -C runtime/xrt
|
||||
|
||||
Run the following from your Vortex build directory
|
||||
|
||||
$ TARGET=hw FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128"
|
|
@ -47,6 +47,8 @@ extern "C" {
|
|||
void dpi_trace(int level, const char* format, ...);
|
||||
void dpi_trace_start();
|
||||
void dpi_trace_stop();
|
||||
|
||||
uint64_t dpi_uuid_gen(bool reset, int wid);
|
||||
}
|
||||
|
||||
bool sim_trace_enabled();
|
||||
|
@ -202,3 +204,17 @@ void dpi_trace_start() {
|
|||
void dpi_trace_stop() {
|
||||
sim_trace_enable(false);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::unordered_map<uint32_t, uint32_t> g_uuid_gens;
|
||||
|
||||
// DPI-C entry point that produces a 64-bit UUID for the given `wid`.
//
// Parameters:
//   reset - when true, all per-`wid` counters in g_uuid_gens are cleared and
//           0 is returned without generating a UUID.
//   wid   - key selecting which counter to advance (presumably a warp id;
//           TODO confirm against the RTL caller).
//
// Returns: (wid << 32) | counter, where `counter` is the value of the
// per-`wid` 32-bit counter before this call's post-increment.
uint64_t dpi_uuid_gen(bool reset, int wid) {
|
||||
if (reset) {
|
||||
g_uuid_gens.clear(); // drop every per-wid counter so numbering restarts at 0
|
||||
return 0;
|
||||
}
|
||||
uint32_t instr_uuid = g_uuid_gens[wid]++; // operator[] creates a zero counter on first use of a wid
|
||||
uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid; // wid in the upper 32 bits, counter in the lower 32
|
||||
return uuid;
|
||||
}
|
|
@ -30,4 +30,6 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve
|
|||
import "DPI-C" function void dpi_trace_start();
|
||||
import "DPI-C" function void dpi_trace_stop();
|
||||
|
||||
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid);
|
||||
|
||||
`endif
|
||||
|
|
|
@ -24,14 +24,14 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
// DCRs
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
// Memory
|
||||
VX_mem_bus_if.master mem_bus_if [`L2_MEM_PORTS],
|
||||
VX_mem_bus_if.master mem_bus_if,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
|
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
cache_perf_t l2_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.l2cache = l2_perf;
|
||||
end
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.icache = 'x;
|
||||
assign mem_perf_tmp_if.dcache = 'x;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.lmem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
|
@ -56,21 +56,23 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
|
||||
VX_gbar_bus_if gbar_bus_if();
|
||||
|
||||
`RESET_RELAY (gbar_reset, reset);
|
||||
|
||||
VX_gbar_arb #(
|
||||
.NUM_REQS (`NUM_SOCKETS),
|
||||
.OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
|
||||
) gbar_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (gbar_reset),
|
||||
.bus_in_if (per_socket_gbar_bus_if),
|
||||
.bus_out_if (gbar_bus_if)
|
||||
);
|
||||
|
||||
VX_gbar_unit #(
|
||||
.INSTANCE_ID (`SFORMATF(("gbar%0d", CLUSTER_ID)))
|
||||
.INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID))
|
||||
) gbar_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (gbar_reset),
|
||||
.gbar_bus_if (gbar_bus_if)
|
||||
);
|
||||
|
||||
|
@ -79,19 +81,18 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) per_socket_mem_bus_if[`NUM_SOCKETS * `L1_MEM_PORTS]();
|
||||
) per_socket_mem_bus_if[`NUM_SOCKETS]();
|
||||
|
||||
`RESET_RELAY (l2_reset, reset);
|
||||
|
||||
VX_cache_wrap #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-l2cache", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)),
|
||||
.CACHE_SIZE (`L2_CACHE_SIZE),
|
||||
.LINE_SIZE (`L2_LINE_SIZE),
|
||||
.NUM_BANKS (`L2_NUM_BANKS),
|
||||
.NUM_WAYS (`L2_NUM_WAYS),
|
||||
.WORD_SIZE (L2_WORD_SIZE),
|
||||
.NUM_REQS (L2_NUM_REQS),
|
||||
.MEM_PORTS (`L2_MEM_PORTS),
|
||||
.CRSQ_SIZE (`L2_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`L2_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
||||
|
@ -99,19 +100,17 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH (L2_TAG_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.WRITEBACK (`L2_WRITEBACK),
|
||||
.DIRTY_BYTES (`L2_DIRTYBYTES),
|
||||
.REPL_POLICY (`L2_REPL_POLICY),
|
||||
.DIRTY_BYTES (`L2_WRITEBACK),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.CORE_OUT_BUF (3),
|
||||
.MEM_OUT_BUF (3),
|
||||
.CORE_OUT_BUF (2),
|
||||
.MEM_OUT_BUF (2),
|
||||
.NC_ENABLE (1),
|
||||
.PASSTHRU (!`L2_ENABLED)
|
||||
) l2cache (
|
||||
.clk (clk),
|
||||
.reset (l2_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (l2_perf),
|
||||
.cache_perf (mem_perf_tmp_if.l2cache),
|
||||
`endif
|
||||
.core_bus_if (per_socket_mem_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
|
@ -119,20 +118,24 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_dcr_bus_if socket_dcr_bus_tmp_if();
|
||||
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
|
||||
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
|
||||
assign socket_dcr_bus_tmp_if.write_data = dcr_bus_if.write_data;
|
||||
|
||||
wire [`NUM_SOCKETS-1:0] per_socket_busy;
|
||||
|
||||
VX_dcr_bus_if socket_dcr_bus_if();
|
||||
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
|
||||
|
||||
// Generate all sockets
|
||||
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : g_sockets
|
||||
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : sockets
|
||||
|
||||
`RESET_RELAY (socket_reset, reset);
|
||||
|
||||
VX_dcr_bus_if socket_dcr_bus_if();
|
||||
wire is_base_dcr_addr = (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
|
||||
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, dcr_bus_if, is_base_dcr_addr, (`NUM_SOCKETS > 1))
|
||||
|
||||
VX_socket #(
|
||||
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
|
||||
.INSTANCE_ID (`SFORMATF(("%s-socket%0d", INSTANCE_ID, socket_id)))
|
||||
.INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id))
|
||||
) socket (
|
||||
`SCOPE_IO_BIND (scope_socket+socket_id)
|
||||
|
||||
|
@ -140,12 +143,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.reset (socket_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (socket_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_socket_mem_bus_if[socket_id * `L1_MEM_PORTS +: `L1_MEM_PORTS]),
|
||||
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
|
||||
|
@ -155,6 +158,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
);
|
||||
end
|
||||
|
||||
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, 1, (`NUM_SOCKETS > 1));
|
||||
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifndef EXT_M_DISABLE
|
||||
`define EXT_M_ENABLE
|
||||
`endif
|
||||
|
@ -85,10 +86,6 @@
|
|||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef VLEN
|
||||
`define VLEN 256
|
||||
`endif
|
||||
|
||||
`ifndef NUM_CLUSTERS
|
||||
`define NUM_CLUSTERS 1
|
||||
`endif
|
||||
|
@ -113,24 +110,6 @@
|
|||
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
|
||||
`endif
|
||||
|
||||
// Size of Tensor Core
|
||||
`ifndef TC_SIZE
|
||||
`define TC_SIZE 8
|
||||
`endif
|
||||
|
||||
// Number of TCs per Warp
|
||||
`ifndef TC_NUM
|
||||
`define TC_NUM 4
|
||||
`endif
|
||||
|
||||
`ifndef NUM_TCU_LANES
|
||||
`define NUM_TCU_LANES `TC_NUM
|
||||
`endif
|
||||
|
||||
`ifndef NUM_TCU_BLOCKS
|
||||
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_ENABLED 1
|
||||
`else
|
||||
|
@ -172,28 +151,6 @@
|
|||
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`endif
|
||||
|
||||
// Platform memory parameters
|
||||
|
||||
`ifndef PLATFORM_MEMORY_NUM_BANKS
|
||||
`define PLATFORM_MEMORY_NUM_BANKS 2
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
|
||||
`ifdef XLEN_64
|
||||
`define PLATFORM_MEMORY_ADDR_WIDTH 48
|
||||
`else
|
||||
`define PLATFORM_MEMORY_ADDR_WIDTH 32
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_DATA_SIZE
|
||||
`define PLATFORM_MEMORY_DATA_SIZE 64
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_INTERLEAVE
|
||||
`define PLATFORM_MEMORY_INTERLEAVE 1
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
|
||||
`ifndef STACK_BASE_ADDR
|
||||
|
@ -212,14 +169,7 @@
|
|||
`define IO_BASE_ADDR 64'h000000040
|
||||
`endif
|
||||
|
||||
`ifdef VM_ENABLE
|
||||
`ifndef PAGE_TABLE_BASE_ADDR
|
||||
`define PAGE_TABLE_BASE_ADDR 64'h0F0000000
|
||||
`endif
|
||||
|
||||
`endif
|
||||
|
||||
`else // XLEN_32
|
||||
`else
|
||||
|
||||
`ifndef STACK_BASE_ADDR
|
||||
`define STACK_BASE_ADDR 32'hFFFF0000
|
||||
|
@ -237,13 +187,6 @@
|
|||
`define IO_BASE_ADDR 32'h00000040
|
||||
`endif
|
||||
|
||||
`ifdef VM_ENABLE
|
||||
`ifndef PAGE_TABLE_BASE_ADDR
|
||||
`define PAGE_TABLE_BASE_ADDR 32'hF0000000
|
||||
`endif
|
||||
|
||||
`endif
|
||||
|
||||
`endif
|
||||
|
||||
`define IO_END_ADDR `USER_BASE_ADDR
|
||||
|
@ -271,17 +214,15 @@
|
|||
`endif
|
||||
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
|
||||
|
||||
`define RESET_DELAY 8
|
||||
`define RESET_DELAY 8
|
||||
|
||||
`ifndef STALL_TIMEOUT
|
||||
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
||||
`endif
|
||||
|
||||
`ifndef SV_DPI
|
||||
`ifndef DPI_DISABLE
|
||||
`define DPI_DISABLE
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef FPU_FPNEW
|
||||
`ifndef FPU_DSP
|
||||
|
@ -310,59 +251,6 @@
|
|||
`define DEBUG_LEVEL 3
|
||||
`endif
|
||||
|
||||
`ifndef MEM_PAGE_SIZE
|
||||
`define MEM_PAGE_SIZE (4096)
|
||||
`endif
|
||||
`ifndef MEM_PAGE_LOG2_SIZE
|
||||
`define MEM_PAGE_LOG2_SIZE (12)
|
||||
`endif
|
||||
|
||||
// Virtual Memory Configuration ///////////////////////////////////////////////////////
|
||||
`ifdef VM_ENABLE
|
||||
`ifdef XLEN_32
|
||||
`ifndef VM_ADDR_MODE
|
||||
`define VM_ADDR_MODE SV32 //or BARE
|
||||
`endif
|
||||
`ifndef PT_LEVEL
|
||||
`define PT_LEVEL (2)
|
||||
`endif
|
||||
`ifndef PTE_SIZE
|
||||
`define PTE_SIZE (4)
|
||||
`endif
|
||||
`ifndef NUM_PTE_ENTRY
|
||||
`define NUM_PTE_ENTRY (1024)
|
||||
`endif
|
||||
`ifndef PT_SIZE_LIMIT
|
||||
`define PT_SIZE_LIMIT (1<<23)
|
||||
`endif
|
||||
`else
|
||||
`ifndef VM_ADDR_MODE
|
||||
`define VM_ADDR_MODE SV39 //or BARE
|
||||
`endif
|
||||
`ifndef PT_LEVEL
|
||||
`define PT_LEVEL (3)
|
||||
`endif
|
||||
`ifndef PTE_SIZE
|
||||
`define PTE_SIZE (8)
|
||||
`endif
|
||||
`ifndef NUM_PTE_ENTRY
|
||||
`define NUM_PTE_ENTRY (512)
|
||||
`endif
|
||||
`ifndef PT_SIZE_LIMIT
|
||||
`define PT_SIZE_LIMIT (1<<25)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef PT_SIZE
|
||||
`define PT_SIZE MEM_PAGE_SIZE
|
||||
`endif
|
||||
|
||||
`ifndef TLB_SIZE
|
||||
`define TLB_SIZE (32)
|
||||
`endif
|
||||
|
||||
`endif
|
||||
|
||||
// Pipeline Configuration /////////////////////////////////////////////////////
|
||||
|
||||
// Issue width
|
||||
|
@ -590,16 +478,7 @@
|
|||
|
||||
// Number of Associative Ways
|
||||
`ifndef ICACHE_NUM_WAYS
|
||||
`define ICACHE_NUM_WAYS 4
|
||||
`endif
|
||||
|
||||
// Replacement Policy
|
||||
`ifndef ICACHE_REPL_POLICY
|
||||
`define ICACHE_REPL_POLICY 1
|
||||
`endif
|
||||
|
||||
`ifndef ICACHE_MEM_PORTS
|
||||
`define ICACHE_MEM_PORTS 1
|
||||
`define ICACHE_NUM_WAYS 1
|
||||
`endif
|
||||
|
||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
@ -628,7 +507,7 @@
|
|||
|
||||
// Number of Banks
|
||||
`ifndef DCACHE_NUM_BANKS
|
||||
`define DCACHE_NUM_BANKS `MIN(DCACHE_NUM_REQS, 16)
|
||||
`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
|
@ -648,12 +527,12 @@
|
|||
|
||||
// Memory Response Queue Size
|
||||
`ifndef DCACHE_MRSQ_SIZE
|
||||
`define DCACHE_MRSQ_SIZE 4
|
||||
`define DCACHE_MRSQ_SIZE 0
|
||||
`endif
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef DCACHE_NUM_WAYS
|
||||
`define DCACHE_NUM_WAYS 4
|
||||
`define DCACHE_NUM_WAYS 1
|
||||
`endif
|
||||
|
||||
// Enable Cache Writeback
|
||||
|
@ -661,25 +540,6 @@
|
|||
`define DCACHE_WRITEBACK 0
|
||||
`endif
|
||||
|
||||
// Enable Cache Dirty bytes
|
||||
`ifndef DCACHE_DIRTYBYTES
|
||||
`define DCACHE_DIRTYBYTES `DCACHE_WRITEBACK
|
||||
`endif
|
||||
|
||||
// Replacement Policy
|
||||
`ifndef DCACHE_REPL_POLICY
|
||||
`define DCACHE_REPL_POLICY 1
|
||||
`endif
|
||||
|
||||
// Number of Memory Ports
|
||||
`ifndef L1_MEM_PORTS
|
||||
`ifdef L1_DISABLE
|
||||
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`else
|
||||
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// LMEM Configurable Knobs ////////////////////////////////////////////////////
|
||||
|
||||
`ifndef LMEM_DISABLE
|
||||
|
@ -702,12 +562,16 @@
|
|||
|
||||
// Cache Size
|
||||
`ifndef L2_CACHE_SIZE
|
||||
`ifdef ALTERA_S10
|
||||
`define L2_CACHE_SIZE 2097152
|
||||
`else
|
||||
`define L2_CACHE_SIZE 1048576
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Number of Banks
|
||||
`ifndef L2_NUM_BANKS
|
||||
`define L2_NUM_BANKS `MIN(L2_NUM_REQS, 16)
|
||||
`define L2_NUM_BANKS `MIN(4, `NUM_SOCKETS)
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
|
@ -727,12 +591,12 @@
|
|||
|
||||
// Memory Response Queue Size
|
||||
`ifndef L2_MRSQ_SIZE
|
||||
`define L2_MRSQ_SIZE 4
|
||||
`define L2_MRSQ_SIZE 0
|
||||
`endif
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef L2_NUM_WAYS
|
||||
`define L2_NUM_WAYS 8
|
||||
`define L2_NUM_WAYS 2
|
||||
`endif
|
||||
|
||||
// Enable Cache Writeback
|
||||
|
@ -740,35 +604,20 @@
|
|||
`define L2_WRITEBACK 0
|
||||
`endif
|
||||
|
||||
// Enable Cache Dirty bytes
|
||||
`ifndef L2_DIRTYBYTES
|
||||
`define L2_DIRTYBYTES `L2_WRITEBACK
|
||||
`endif
|
||||
|
||||
// Replacement Policy
|
||||
`ifndef L2_REPL_POLICY
|
||||
`define L2_REPL_POLICY 1
|
||||
`endif
|
||||
|
||||
// Number of Memory Ports
|
||||
`ifndef L2_MEM_PORTS
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`else
|
||||
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
// Cache Size
|
||||
`ifndef L3_CACHE_SIZE
|
||||
`ifdef ALTERA_S10
|
||||
`define L3_CACHE_SIZE 2097152
|
||||
`else
|
||||
`define L3_CACHE_SIZE 1048576
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Number of Banks
|
||||
`ifndef L3_NUM_BANKS
|
||||
`define L3_NUM_BANKS `MIN(L3_NUM_REQS, 16)
|
||||
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS)
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
|
@ -788,12 +637,12 @@
|
|||
|
||||
// Memory Response Queue Size
|
||||
`ifndef L3_MRSQ_SIZE
|
||||
`define L3_MRSQ_SIZE 4
|
||||
`define L3_MRSQ_SIZE 0
|
||||
`endif
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef L3_NUM_WAYS
|
||||
`define L3_NUM_WAYS 8
|
||||
`define L3_NUM_WAYS 4
|
||||
`endif
|
||||
|
||||
// Enable Cache Writeback
|
||||
|
@ -801,25 +650,6 @@
|
|||
`define L3_WRITEBACK 0
|
||||
`endif
|
||||
|
||||
// Enable Cache Dirty bytes
|
||||
`ifndef L3_DIRTYBYTES
|
||||
`define L3_DIRTYBYTES `L3_WRITEBACK
|
||||
`endif
|
||||
|
||||
// Replacement Policy
|
||||
`ifndef L3_REPL_POLICY
|
||||
`define L3_REPL_POLICY 1
|
||||
`endif
|
||||
|
||||
// Number of Memory Ports
|
||||
`ifndef L3_MEM_PORTS
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`else
|
||||
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// ISA Extensions /////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef EXT_A_ENABLE
|
||||
|
@ -852,12 +682,6 @@
|
|||
`define EXT_M_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_V_ENABLE
|
||||
`define EXT_V_ENABLED 1
|
||||
`else
|
||||
`define EXT_V_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_ZICOND_ENABLE
|
||||
`define EXT_ZICOND_ENABLED 1
|
||||
`else
|
||||
|
@ -874,7 +698,7 @@
|
|||
`define ISA_STD_N 13
|
||||
`define ISA_STD_Q 16
|
||||
`define ISA_STD_S 18
|
||||
`define ISA_STD_V 21
|
||||
`define ISA_STD_U 20
|
||||
|
||||
`define ISA_EXT_ICACHE 0
|
||||
`define ISA_EXT_DCACHE 1
|
||||
|
@ -911,7 +735,7 @@
|
|||
| (0 << 18) /* S - Supervisor mode implemented */ \
|
||||
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
|
||||
| (1 << 20) /* U - User mode implemented */ \
|
||||
| (`EXT_V_ENABLED << 21) /* V - Tentatively reserved for Vector extension */ \
|
||||
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
|
||||
| (0 << 22) /* W - Reserved */ \
|
||||
| (1 << 23) /* X - Non-standard extensions present */ \
|
||||
| (0 << 24) /* Y - Reserved */ \
|
||||
|
|
|
@ -50,16 +50,10 @@
|
|||
`define PERF_CTR_BITS 44
|
||||
|
||||
`ifndef NDEBUG
|
||||
`define UUID_ENABLE
|
||||
`define UUID_WIDTH 44
|
||||
`else
|
||||
`ifdef SCOPE
|
||||
`define UUID_ENABLE
|
||||
`define UUID_WIDTH 44
|
||||
`else
|
||||
`define UUID_WIDTH 1
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`define PC_BITS (`XLEN-1)
|
||||
`define OFFSET_BITS 12
|
||||
|
@ -233,19 +227,22 @@
|
|||
`define INST_FENCE_D 1'h0
|
||||
`define INST_FENCE_I 1'h1
|
||||
|
||||
`define INST_FPU_ADD 4'b0000 // SUB=fmt[1]
|
||||
`define INST_FPU_MUL 4'b0001
|
||||
`define INST_FPU_MADD 4'b0010 // SUB=fmt[1]
|
||||
`define INST_FPU_NMADD 4'b0011 // SUB=fmt[1]
|
||||
`define INST_FPU_DIV 4'b0100
|
||||
`define INST_FPU_SQRT 4'b0101
|
||||
`define INST_FPU_F2I 4'b1000 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_F2U 4'b1001 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_I2F 4'b1010 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_U2F 4'b1011 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_CMP 4'b1100 // frm: LE=0, LT=1, EQ=2
|
||||
`define INST_FPU_F2F 4'b1101 // fmt[0]: F32=0, F64=1
|
||||
`define INST_FPU_MISC 4'b1110 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||
`define INST_FPU_ADD 4'b0000
|
||||
`define INST_FPU_SUB 4'b0001
|
||||
`define INST_FPU_MUL 4'b0010
|
||||
`define INST_FPU_DIV 4'b0011
|
||||
`define INST_FPU_SQRT 4'b0100
|
||||
`define INST_FPU_CMP 4'b0101 // frm: LE=0, LT=1, EQ=2
|
||||
`define INST_FPU_F2F 4'b0110
|
||||
`define INST_FPU_MISC 4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||
`define INST_FPU_F2I 4'b1000
|
||||
`define INST_FPU_F2U 4'b1001
|
||||
`define INST_FPU_I2F 4'b1010
|
||||
`define INST_FPU_U2F 4'b1011
|
||||
`define INST_FPU_MADD 4'b1100
|
||||
`define INST_FPU_MSUB 4'b1101
|
||||
`define INST_FPU_NMSUB 4'b1110
|
||||
`define INST_FPU_NMADD 4'b1111
|
||||
`define INST_FPU_BITS 4
|
||||
`define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3)
|
||||
`define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4)
|
||||
|
@ -270,14 +267,14 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width) \
|
||||
(uuid_width + `CLOG2(mshr_size) + `CLOG2(`CDIV(num_banks, mem_ports)))
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
|
||||
(`CLOG2(mshr_size) + `CLOG2(num_banks))
|
||||
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width) \
|
||||
(`CLOG2(`CDIV(num_reqs, mem_ports)) + `CLOG2(line_size / word_size) + tag_width)
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
||||
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, uuid_width) \
|
||||
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width)) + 1)
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
|
||||
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -287,14 +284,14 @@
|
|||
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, num_caches, uuid_width) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), num_caches)
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -306,12 +303,11 @@
|
|||
`define L1_ENABLE
|
||||
`endif
|
||||
|
||||
`define MEM_REQ_FLAG_FLUSH 0
|
||||
`define MEM_REQ_FLAG_IO 1
|
||||
`define MEM_REQ_FLAG_LOCAL 2 // should be last since optional
|
||||
`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED)
|
||||
`define ADDR_TYPE_FLUSH 0
|
||||
`define ADDR_TYPE_IO 1
|
||||
`define ADDR_TYPE_LOCAL 2 // should be last since optional
|
||||
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
|
||||
|
||||
`define VX_MEM_PORTS `L3_MEM_PORTS
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
||||
|
@ -324,23 +320,12 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NEG_EDGE(dst, src) \
|
||||
VX_edge_trigger #( \
|
||||
.POS (0), \
|
||||
.INIT (0) \
|
||||
) __neg_edge`__LINE__ ( \
|
||||
.clk (clk), \
|
||||
.reset (1'b0), \
|
||||
.data_in (src), \
|
||||
.data_out (dst) \
|
||||
)
|
||||
|
||||
`define BUFFER_EX(dst, src, ena, resetw, latency) \
|
||||
`define BUFFER_EX(dst, src, ena, latency) \
|
||||
VX_pipe_register #( \
|
||||
.DATAW ($bits(dst)), \
|
||||
.RESETW (resetw), \
|
||||
.RESETW ($bits(dst)), \
|
||||
.DEPTH (latency) \
|
||||
) __buffer_ex`__LINE__ ( \
|
||||
) __``dst``__ ( \
|
||||
.clk (clk), \
|
||||
.reset (reset), \
|
||||
.enable (ena), \
|
||||
|
@ -348,13 +333,13 @@
|
|||
.data_out (dst) \
|
||||
)
|
||||
|
||||
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
|
||||
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
|
||||
|
||||
`define POP_COUNT_EX(out, in, model) \
|
||||
VX_popcount #( \
|
||||
.N ($bits(in)), \
|
||||
.MODEL (model) \
|
||||
) __pop_count_ex`__LINE__ ( \
|
||||
) __``out``__ ( \
|
||||
.data_in (in), \
|
||||
.data_out (out) \
|
||||
)
|
||||
|
@ -374,114 +359,50 @@
|
|||
assign src.rsp_data = dst.rsp_data; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_RO_IF(dst, src) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data.rw = 0; \
|
||||
assign dst.req_data.addr = src.req_data.addr; \
|
||||
assign dst.req_data.data = '0; \
|
||||
assign dst.req_data.byteen = '1; \
|
||||
assign dst.req_data.flags = src.req_data.flags; \
|
||||
assign dst.req_data.tag = src.req_data.tag; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF_EX(dst, src, TD, TS, UUID) \
|
||||
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data.rw = src.req_data.rw; \
|
||||
assign dst.req_data.addr = src.req_data.addr; \
|
||||
assign dst.req_data.data = src.req_data.data; \
|
||||
assign dst.req_data.byteen = src.req_data.byteen; \
|
||||
assign dst.req_data.flags = src.req_data.flags; \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (TD != TS) begin \
|
||||
if (UUID != 0) begin \
|
||||
if (TD > TS) begin \
|
||||
assign dst.req_data.tag = {src.req_data.tag.uuid, {(TD-TS){1'b0}}, src.req_data.tag.value}; \
|
||||
end else begin \
|
||||
assign dst.req_data.tag = {src.req_data.tag.uuid, src.req_data.tag.value[TD-UUID-1:0]}; \
|
||||
end \
|
||||
end else begin \
|
||||
if (TD > TS) begin \
|
||||
assign dst.req_data.tag = {{(TD-TS){1'b0}}, src.req_data.tag}; \
|
||||
end else begin \
|
||||
assign dst.req_data.tag = src.req_data.tag[TD-1:0]; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
assign dst.req_data.addr = src.req_data.addr; \
|
||||
assign dst.req_data.atype = src.req_data.atype; \
|
||||
assign dst.req_data.data = src.req_data.data; \
|
||||
if (TD != TS) \
|
||||
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
|
||||
else \
|
||||
assign dst.req_data.tag = src.req_data.tag; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */ \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (TD != TS) begin \
|
||||
if (UUID != 0) begin \
|
||||
if (TD > TS) begin \
|
||||
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, dst.rsp_data.tag.value[TS-UUID-1:0]}; \
|
||||
end else begin \
|
||||
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, {(TS-TD){1'b0}}, dst.rsp_data.tag.value}; \
|
||||
end \
|
||||
end else begin \
|
||||
if (TD > TS) begin \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TS-1:0]; \
|
||||
end else begin \
|
||||
assign src.rsp_data.tag = {{(TS-TD){1'b0}}, dst.rsp_data.tag}; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */ \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define INIT_VX_MEM_BUS_IF(itf) \
|
||||
assign itf.req_valid = 0; \
|
||||
assign itf.req_data = '0; \
|
||||
`UNUSED_VAR (itf.req_ready) \
|
||||
`UNUSED_VAR (itf.rsp_valid) \
|
||||
`UNUSED_VAR (itf.rsp_data) \
|
||||
assign itf.rsp_ready = 0;
|
||||
`define ASSIGN_VX_LSU_MEM_IF(dst, src) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data = src.req_data; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data = dst.rsp_data; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define UNUSED_VX_MEM_BUS_IF(itf) \
|
||||
`UNUSED_VAR (itf.req_valid) \
|
||||
`UNUSED_VAR (itf.req_data) \
|
||||
assign itf.req_ready = 0; \
|
||||
assign itf.rsp_valid = 0; \
|
||||
assign itf.rsp_data = '0; \
|
||||
`UNUSED_VAR (itf.rsp_ready)
|
||||
|
||||
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (latency != 0) begin \
|
||||
VX_pipe_register #( \
|
||||
.DATAW (1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH), \
|
||||
.DEPTH (latency) \
|
||||
) pipe_reg ( \
|
||||
.clk (clk), \
|
||||
.reset (1'b0), \
|
||||
.enable (1'b1), \
|
||||
.data_in ({src.write_valid && ena, src.write_addr, src.write_data}), \
|
||||
.data_out ({dst.write_valid, dst.write_addr, dst.write_data}) \
|
||||
); \
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
|
||||
if (enable) begin \
|
||||
reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \
|
||||
always @(posedge clk) begin \
|
||||
__dst <= {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \
|
||||
end else begin \
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid && ena, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end
|
||||
|
||||
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (count > 1) begin \
|
||||
wire [count-1:0][width-1:0] __reduce_add_i_field; \
|
||||
wire [width-1:0] __reduce_add_o_field; \
|
||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
assign __reduce_add_i_field[__i] = src[__i].``field; \
|
||||
end \
|
||||
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
__reduce_add_i_field, \
|
||||
__reduce_add_o_field \
|
||||
); \
|
||||
|
@ -500,11 +421,9 @@
|
|||
end \
|
||||
end else begin \
|
||||
assign dst.``field = src[0].``field; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
end
|
||||
|
||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (block_size != 1) begin \
|
||||
if (block_size != `NUM_WARPS) begin \
|
||||
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
||||
|
@ -513,7 +432,6 @@
|
|||
end \
|
||||
end else begin \
|
||||
assign dst = src; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
end
|
||||
|
||||
`endif // VX_DEFINE_VH
|
||||
|
|
|
@ -73,17 +73,6 @@ package VX_gpu_pkg;
|
|||
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
} cache_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] reads;
|
||||
logic [`PERF_CTR_BITS-1:0] writes;
|
||||
logic [`PERF_CTR_BITS-1:0] bank_stalls;
|
||||
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
} lmem_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] misses;
|
||||
} coalescer_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`PERF_CTR_BITS-1:0] reads;
|
||||
logic [`PERF_CTR_BITS-1:0] writes;
|
||||
|
@ -103,26 +92,6 @@ package VX_gpu_pkg;
|
|||
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
|
||||
} issue_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
cache_perf_t icache;
|
||||
cache_perf_t dcache;
|
||||
cache_perf_t l2cache;
|
||||
cache_perf_t l3cache;
|
||||
lmem_perf_t lmem;
|
||||
coalescer_perf_t coalescer;
|
||||
mem_perf_t mem;
|
||||
} sysmem_perf_t;
|
||||
|
||||
typedef struct packed {
|
||||
sched_perf_t sched;
|
||||
issue_perf_t issue;
|
||||
logic [`PERF_CTR_BITS-1:0] ifetches;
|
||||
logic [`PERF_CTR_BITS-1:0] loads;
|
||||
logic [`PERF_CTR_BITS-1:0] stores;
|
||||
logic [`PERF_CTR_BITS-1:0] ifetch_latency;
|
||||
logic [`PERF_CTR_BITS-1:0] load_latency;
|
||||
} pipeline_perf_t;
|
||||
|
||||
//////////////////////// instruction arguments ////////////////////////////
|
||||
|
||||
typedef struct packed {
|
||||
|
@ -176,7 +145,6 @@ package VX_gpu_pkg;
|
|||
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
|
||||
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
|
||||
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
|
||||
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
|
||||
|
||||
////////////////////////// Icache Parameters //////////////////////////////
|
||||
|
||||
|
@ -198,9 +166,9 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, 1, `NUM_ICACHES, `UUID_WIDTH);
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, 1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
|
@ -212,7 +180,7 @@ package VX_gpu_pkg;
|
|||
// Block size in bytes
|
||||
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size (using coalesced memory blocks)
|
||||
// Input request size
|
||||
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
|
||||
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
|
||||
|
||||
|
@ -229,27 +197,26 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef DCACHE_ENABLE
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`else
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||
|
||||
// arbitrate between icache and dcache
|
||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
|
||||
|
||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||
|
||||
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||
localparam ICACHE_MEM_ARB_IDX = 0;
|
||||
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
|
||||
|
||||
// Word size in bytes
|
||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam L2_NUM_REQS = `NUM_SOCKETS * `L1_MEM_PORTS;
|
||||
localparam L2_NUM_REQS = `NUM_SOCKETS;
|
||||
|
||||
// Core request tag bits
|
||||
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
|
||||
|
@ -259,9 +226,9 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef L2_ENABLE
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
`else
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L3 Parameters /////////////////////////////
|
||||
|
@ -270,7 +237,7 @@ package VX_gpu_pkg;
|
|||
localparam L3_WORD_SIZE = `L2_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam L3_NUM_REQS = `NUM_CLUSTERS * `L2_MEM_PORTS;
|
||||
localparam L3_NUM_REQS = `NUM_CLUSTERS;
|
||||
|
||||
// Core request tag bits
|
||||
localparam L3_TAG_WIDTH = L2_MEM_TAG_WIDTH;
|
||||
|
@ -280,9 +247,9 @@ package VX_gpu_pkg;
|
|||
|
||||
// Memory request tag bits
|
||||
`ifdef L3_ENABLE
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
`else
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// Issue parameters //////////////////////////
|
||||
|
@ -341,430 +308,6 @@ package VX_gpu_pkg;
|
|||
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
////////////////////////////////// Tracing ////////////////////////////////////
|
||||
|
||||
`ifdef SIMULATION
|
||||
|
||||
`ifdef SV_DPI
|
||||
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
|
||||
`endif
|
||||
|
||||
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
||||
case (ex_type)
|
||||
`EX_ALU: `TRACE(level, ("ALU"))
|
||||
`EX_LSU: `TRACE(level, ("LSU"))
|
||||
`EX_SFU: `TRACE(level, ("SFU"))
|
||||
`ifdef EXT_F_ENABLE
|
||||
`EX_FPU: `TRACE(level, ("FPU"))
|
||||
`endif
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
endtask
|
||||
|
||||
task trace_ex_op(input int level,
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_args_t op_args
|
||||
);
|
||||
case (ex_type)
|
||||
`EX_ALU: begin
|
||||
case (op_args.alu.xtype)
|
||||
`ALU_TYPE_ARITH: begin
|
||||
if (op_args.alu.is_w) begin
|
||||
if (op_args.alu.use_imm) begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDIW"))
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLIW"))
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRLIW"))
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRAIW"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDW"))
|
||||
`INST_ALU_SUB: `TRACE(level, ("SUBW"))
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLW"))
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRLW"))
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRAW"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
end else begin
|
||||
if (op_args.alu.use_imm) begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDI"))
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLI"))
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRLI"))
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRAI"))
|
||||
`INST_ALU_SLT: `TRACE(level, ("SLTI"))
|
||||
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"))
|
||||
`INST_ALU_XOR: `TRACE(level, ("XORI"))
|
||||
`INST_ALU_OR: `TRACE(level, ("ORI"))
|
||||
`INST_ALU_AND: `TRACE(level, ("ANDI"))
|
||||
`INST_ALU_LUI: `TRACE(level, ("LUI"))
|
||||
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADD"))
|
||||
`INST_ALU_SUB: `TRACE(level, ("SUB"))
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLL"))
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRL"))
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRA"))
|
||||
`INST_ALU_SLT: `TRACE(level, ("SLT"))
|
||||
`INST_ALU_SLTU: `TRACE(level, ("SLTU"))
|
||||
`INST_ALU_XOR: `TRACE(level, ("XOR"))
|
||||
`INST_ALU_OR: `TRACE(level, ("OR"))
|
||||
`INST_ALU_AND: `TRACE(level, ("AND"))
|
||||
`INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"))
|
||||
`INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
end
|
||||
end
|
||||
`ALU_TYPE_BRANCH: begin
|
||||
case (`INST_BR_BITS'(op_type))
|
||||
`INST_BR_EQ: `TRACE(level, ("BEQ"))
|
||||
`INST_BR_NE: `TRACE(level, ("BNE"))
|
||||
`INST_BR_LT: `TRACE(level, ("BLT"))
|
||||
`INST_BR_GE: `TRACE(level, ("BGE"))
|
||||
`INST_BR_LTU: `TRACE(level, ("BLTU"))
|
||||
`INST_BR_GEU: `TRACE(level, ("BGEU"))
|
||||
`INST_BR_JAL: `TRACE(level, ("JAL"))
|
||||
`INST_BR_JALR: `TRACE(level, ("JALR"))
|
||||
`INST_BR_ECALL: `TRACE(level, ("ECALL"))
|
||||
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"))
|
||||
`INST_BR_URET: `TRACE(level, ("URET"))
|
||||
`INST_BR_SRET: `TRACE(level, ("SRET"))
|
||||
`INST_BR_MRET: `TRACE(level, ("MRET"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
`ALU_TYPE_MULDIV: begin
|
||||
if (op_args.alu.is_w) begin
|
||||
case (`INST_M_BITS'(op_type))
|
||||
`INST_M_MUL: `TRACE(level, ("MULW"))
|
||||
`INST_M_DIV: `TRACE(level, ("DIVW"))
|
||||
`INST_M_DIVU: `TRACE(level, ("DIVUW"))
|
||||
`INST_M_REM: `TRACE(level, ("REMW"))
|
||||
`INST_M_REMU: `TRACE(level, ("REMUW"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_M_BITS'(op_type))
|
||||
`INST_M_MUL: `TRACE(level, ("MUL"))
|
||||
`INST_M_MULH: `TRACE(level, ("MULH"))
|
||||
`INST_M_MULHSU:`TRACE(level, ("MULHSU"))
|
||||
`INST_M_MULHU: `TRACE(level, ("MULHU"))
|
||||
`INST_M_DIV: `TRACE(level, ("DIV"))
|
||||
`INST_M_DIVU: `TRACE(level, ("DIVU"))
|
||||
`INST_M_REM: `TRACE(level, ("REM"))
|
||||
`INST_M_REMU: `TRACE(level, ("REMU"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
end
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
`EX_LSU: begin
|
||||
if (op_args.lsu.is_float) begin
|
||||
case (`INST_LSU_BITS'(op_type))
|
||||
`INST_LSU_LW: `TRACE(level, ("FLW"))
|
||||
`INST_LSU_LD: `TRACE(level, ("FLD"))
|
||||
`INST_LSU_SW: `TRACE(level, ("FSW"))
|
||||
`INST_LSU_SD: `TRACE(level, ("FSD"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_LSU_BITS'(op_type))
|
||||
`INST_LSU_LB: `TRACE(level, ("LB"))
|
||||
`INST_LSU_LH: `TRACE(level, ("LH"))
|
||||
`INST_LSU_LW: `TRACE(level, ("LW"))
|
||||
`INST_LSU_LD: `TRACE(level, ("LD"))
|
||||
`INST_LSU_LBU:`TRACE(level, ("LBU"))
|
||||
`INST_LSU_LHU:`TRACE(level, ("LHU"))
|
||||
`INST_LSU_LWU:`TRACE(level, ("LWU"))
|
||||
`INST_LSU_SB: `TRACE(level, ("SB"))
|
||||
`INST_LSU_SH: `TRACE(level, ("SH"))
|
||||
`INST_LSU_SW: `TRACE(level, ("SW"))
|
||||
`INST_LSU_SD: `TRACE(level, ("SD"))
|
||||
`INST_LSU_FENCE:`TRACE(level,("FENCE"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
end
|
||||
`EX_SFU: begin
|
||||
case (`INST_SFU_BITS'(op_type))
|
||||
`INST_SFU_TMC: `TRACE(level, ("TMC"))
|
||||
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"))
|
||||
`INST_SFU_SPLIT: begin
|
||||
if (op_args.wctl.is_neg) begin
|
||||
`TRACE(level, ("SPLIT.N"))
|
||||
end else begin
|
||||
`TRACE(level, ("SPLIT"))
|
||||
end
|
||||
end
|
||||
`INST_SFU_JOIN: `TRACE(level, ("JOIN"))
|
||||
`INST_SFU_BAR: `TRACE(level, ("BAR"))
|
||||
`INST_SFU_PRED: begin
|
||||
if (op_args.wctl.is_neg) begin
|
||||
`TRACE(level, ("PRED.N"))
|
||||
end else begin
|
||||
`TRACE(level, ("PRED"))
|
||||
end
|
||||
end
|
||||
`INST_SFU_CSRRW: begin
|
||||
if (op_args.csr.use_imm) begin
|
||||
`TRACE(level, ("CSRRWI"))
|
||||
end else begin
|
||||
`TRACE(level, ("CSRRW"))
|
||||
end
|
||||
end
|
||||
`INST_SFU_CSRRS: begin
|
||||
if (op_args.csr.use_imm) begin
|
||||
`TRACE(level, ("CSRRSI"))
|
||||
end else begin
|
||||
`TRACE(level, ("CSRRS"))
|
||||
end
|
||||
end
|
||||
`INST_SFU_CSRRC: begin
|
||||
if (op_args.csr.use_imm) begin
|
||||
`TRACE(level, ("CSRRCI"))
|
||||
end else begin
|
||||
`TRACE(level, ("CSRRC"))
|
||||
end
|
||||
end
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`EX_FPU: begin
|
||||
case (`INST_FPU_BITS'(op_type))
|
||||
`INST_FPU_ADD: begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FSUB.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FSUB.S"))
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FADD.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FADD.S"))
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_MADD: begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FMSUB.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FMSUB.S"))
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FMADD.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FMADD.S"))
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_NMADD: begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FNMSUB.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FNMSUB.S"))
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FNMADD.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FNMADD.S"))
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_MUL: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FMUL.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FMUL.S"))
|
||||
end
|
||||
end
|
||||
`INST_FPU_DIV: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FDIV.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FDIV.S"))
|
||||
end
|
||||
end
|
||||
`INST_FPU_SQRT: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FSQRT.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FSQRT.S"))
|
||||
end
|
||||
end
|
||||
`INST_FPU_CMP: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
case (op_args.fpu.frm[1:0])
|
||||
0: `TRACE(level, ("FLE.D"))
|
||||
1: `TRACE(level, ("FLT.D"))
|
||||
2: `TRACE(level, ("FEQ.D"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end else begin
|
||||
case (op_args.fpu.frm[1:0])
|
||||
0: `TRACE(level, ("FLE.S"))
|
||||
1: `TRACE(level, ("FLT.S"))
|
||||
2: `TRACE(level, ("FEQ.S"))
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
end
|
||||
`INST_FPU_F2F: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FCVT.D.S"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.D"))
|
||||
end
|
||||
end
|
||||
`INST_FPU_F2I: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.L.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.W.D"))
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.L.S"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.W.S"))
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_F2U: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.LU.D"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.WU.D"))
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.LU.S"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.WU.S"))
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_I2F: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.D.L"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.D.W"))
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.S.L"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.W"))
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_U2F: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.D.LU"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.D.WU"))
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.S.LU"))
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.WU"))
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_MISC: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
case (op_args.fpu.frm)
|
||||
0: `TRACE(level, ("FSGNJ.D"))
|
||||
1: `TRACE(level, ("FSGNJN.D"))
|
||||
2: `TRACE(level, ("FSGNJX.D"))
|
||||
3: `TRACE(level, ("FCLASS.D"))
|
||||
4: `TRACE(level, ("FMV.X.D"))
|
||||
5: `TRACE(level, ("FMV.D.X"))
|
||||
6: `TRACE(level, ("FMIN.D"))
|
||||
7: `TRACE(level, ("FMAX.D"))
|
||||
endcase
|
||||
end else begin
|
||||
case (op_args.fpu.frm)
|
||||
0: `TRACE(level, ("FSGNJ.S"))
|
||||
1: `TRACE(level, ("FSGNJN.S"))
|
||||
2: `TRACE(level, ("FSGNJX.S"))
|
||||
3: `TRACE(level, ("FCLASS.S"))
|
||||
4: `TRACE(level, ("FMV.X.S"))
|
||||
5: `TRACE(level, ("FMV.S.X"))
|
||||
6: `TRACE(level, ("FMIN.S"))
|
||||
7: `TRACE(level, ("FMAX.S"))
|
||||
endcase
|
||||
end
|
||||
end
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
end
|
||||
`endif
|
||||
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
endtask
|
||||
|
||||
// Trace the execution-unit-specific operand fields of a decoded instruction.
//
// Selects on `ex_type` and emits, through the `TRACE macro at verbosity
// `level`, the fields of `op_args` that belong to that unit. Every format
// string begins with ", ", i.e. the text is written as a continuation of
// whatever was traced before this call.
//
//   level   - verbosity level forwarded unchanged to `TRACE
//   ex_type - execution unit selector (`EX_ALU, `EX_LSU, `EX_SFU, `EX_FPU)
//   op_type - operation code; only consulted for `EX_SFU, to test for CSR ops
//   op_args - per-unit operand arguments; the alu/lsu/csr/fpu member that is
//             read is the one matching `ex_type`
task trace_op_args(input int level,
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_args_t op_args
|
||||
);
|
||||
case (ex_type)
|
||||
// ALU: PC-relative flag, immediate flag and the immediate value.
`EX_ALU: begin
|
||||
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm))
|
||||
end
|
||||
// LSU: only the address offset is printed.
`EX_LSU: begin
|
||||
`TRACE(level, (", offset=0x%0h", op_args.lsu.offset))
|
||||
end
|
||||
// SFU: arguments are printed only for operations that
// `INST_SFU_IS_CSR classifies as CSR accesses; other SFU ops print nothing.
`EX_SFU: begin
|
||||
if (`INST_SFU_IS_CSR(op_type)) begin
|
||||
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm))
|
||||
end
|
||||
end
|
||||
// The FPU case item exists only when EXT_F_ENABLE is defined.
`ifdef EXT_F_ENABLE
|
||||
`EX_FPU: begin
|
||||
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm))
|
||||
end
|
||||
`endif
|
||||
// Any other execution type: nothing is traced.
default:;
|
||||
endcase
|
||||
endtask
|
||||
|
||||
// Trace the symbolic name of a base DCR address.
//
// Compares `addr` against the `VX_DCR_BASE_* address macros and emits the
// matching register name through `TRACE at verbosity `level`. An address that
// matches none of the listed macros is traced as "?".
//
//   level - verbosity level forwarded unchanged to `TRACE
//   addr  - DCR address to decode
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
|
||||
case (addr)
|
||||
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"))
|
||||
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"))
|
||||
`VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0"))
|
||||
`VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1"))
|
||||
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"))
|
||||
// Unrecognized address.
default: `TRACE(level, ("?"))
|
||||
endcase
|
||||
endtask
|
||||
|
||||
`endif
|
||||
|
||||
endpackage
|
||||
|
||||
`endif // VX_GPU_PKG_VH
|
||||
|
|
|
@ -22,34 +22,36 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef SIMULATION
|
||||
|
||||
`define STATIC_ASSERT(cond, msg) \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (!(cond)) $error msg; \
|
||||
/* verilator lint_on GENUNNAMED */ \
|
||||
|
||||
`define ERROR(msg) \
|
||||
$error msg
|
||||
|
||||
`define ASSERT(cond, msg) \
|
||||
assert(cond) else $error msg
|
||||
|
||||
`define RUNTIME_ASSERT(cond, msg) \
|
||||
always @(posedge clk) begin \
|
||||
if (!reset) begin \
|
||||
`ASSERT(cond, msg); \
|
||||
end \
|
||||
end
|
||||
|
||||
`ifndef TRACING_ALL
|
||||
`define TRACING_ON /* verilator tracing_on */
|
||||
`define TRACING_OFF /* verilator tracing_off */
|
||||
`ifdef VIVADO
|
||||
`define STRING
|
||||
`else
|
||||
`define TRACING_ON
|
||||
`define TRACING_OFF
|
||||
`define STRING string
|
||||
`endif
|
||||
|
||||
`ifdef SYNTHESIS
|
||||
`define TRACING_ON
|
||||
`define TRACING_OFF
|
||||
`ifndef NDEBUG
|
||||
`define DEBUG_BLOCK(x) x
|
||||
`else
|
||||
`define DEBUG_BLOCK(x)
|
||||
`endif
|
||||
`define IGNORE_UNOPTFLAT_BEGIN
|
||||
`define IGNORE_UNOPTFLAT_END
|
||||
`define IGNORE_UNUSED_BEGIN
|
||||
`define IGNORE_UNUSED_END
|
||||
`define IGNORE_WARNINGS_BEGIN
|
||||
`define IGNORE_WARNINGS_END
|
||||
`define UNUSED_PARAM(x)
|
||||
`define UNUSED_SPARAM(x)
|
||||
`define UNUSED_VAR(x)
|
||||
`define UNUSED_PIN(x) . x ()
|
||||
`define UNUSED_ARG(x) x
|
||||
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
|
||||
`else
|
||||
`ifdef VERILATOR
|
||||
`define TRACING_ON /* verilator tracing_on */
|
||||
`define TRACING_OFF /* verilator tracing_off */
|
||||
`ifndef NDEBUG
|
||||
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
|
||||
x \
|
||||
|
@ -98,99 +100,74 @@
|
|||
localparam `STRING __``x = x; \
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \
|
||||
if (1) begin \
|
||||
`define UNUSED_VAR(x) if (1) begin \
|
||||
/* verilator lint_off UNUSED */ \
|
||||
wire [$bits(x)-1:0] __unused = x; \
|
||||
wire [$bits(x)-1:0] __x = x; \
|
||||
/* verilator lint_on UNUSED */ \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
end
|
||||
|
||||
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
|
||||
. x () \
|
||||
/* verilator lint_on PINCONNECTEMPTY */
|
||||
|
||||
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
|
||||
x \
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
`ifdef SV_DPI
|
||||
`define TRACE(level, args) dpi_trace(level, $sformatf args);
|
||||
`else
|
||||
`define TRACE(level, args) \
|
||||
if (level <= `DEBUG_LEVEL) begin \
|
||||
$write args; \
|
||||
end
|
||||
`endif
|
||||
|
||||
`define SFORMATF(x) $sformatf x
|
||||
`ifdef SV_DPI
|
||||
`define TRACE(level, args) dpi_trace(level, $sformatf args)
|
||||
`else
|
||||
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
|
||||
`endif
|
||||
|
||||
`else // SYNTHESIS
|
||||
`endif
|
||||
|
||||
`define STATIC_ASSERT(cond, msg)
|
||||
`define ERROR(msg) //
|
||||
`define ASSERT(cond, msg) //
|
||||
`define RUNTIME_ASSERT(cond, msg)
|
||||
`ifdef SIMULATION
|
||||
`define STATIC_ASSERT(cond, msg) \
|
||||
generate \
|
||||
if (!(cond)) $error msg; \
|
||||
endgenerate
|
||||
|
||||
`define DEBUG_BLOCK(x)
|
||||
`define TRACE(level, args)
|
||||
`define SFORMATF(x) ""
|
||||
`define ERROR(msg) \
|
||||
$error msg
|
||||
|
||||
`define TRACING_ON
|
||||
`define TRACING_OFF
|
||||
|
||||
`define IGNORE_UNOPTFLAT_BEGIN
|
||||
`define IGNORE_UNOPTFLAT_END
|
||||
`define IGNORE_UNUSED_BEGIN
|
||||
`define IGNORE_UNUSED_END
|
||||
`define IGNORE_WARNINGS_BEGIN
|
||||
`define IGNORE_WARNINGS_END
|
||||
`define UNUSED_PARAM(x)
|
||||
`define UNUSED_SPARAM(x)
|
||||
`define UNUSED_VAR(x)
|
||||
`define UNUSED_PIN(x) . x ()
|
||||
`define UNUSED_ARG(x) x
|
||||
`define ASSERT(cond, msg) \
|
||||
assert(cond) else $error msg
|
||||
|
||||
`define RUNTIME_ASSERT(cond, msg) \
|
||||
always @(posedge clk) begin \
|
||||
assert(cond) else $error msg; \
|
||||
end
|
||||
`else
|
||||
`define STATIC_ASSERT(cond, msg)
|
||||
`define ERROR(msg) //
|
||||
`define ASSERT(cond, msg) //
|
||||
`define RUNTIME_ASSERT(cond, msg)
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef QUARTUS
|
||||
`define MAX_FANOUT 8
|
||||
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
|
||||
`define USE_BLOCK_BRAM (* ramstyle = "block" *)
|
||||
`define IF_DATA_SIZE(x) $bits(x.data)
|
||||
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
|
||||
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
|
||||
`define RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams on" *)
|
||||
`define DISABLE_BRAM (* ramstyle = "logic" *)
|
||||
`define PRESERVE_NET (* preserve *)
|
||||
`define BLACKBOX_CELL (* black_box *)
|
||||
`define STRING string
|
||||
`elsif VIVADO
|
||||
`define MAX_FANOUT 8
|
||||
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
|
||||
`define USE_BLOCK_BRAM (* ram_style = "block" *)
|
||||
`define IF_DATA_SIZE(x) $bits(x.data)
|
||||
`define USE_FAST_BRAM (* ram_style = "distributed" *)
|
||||
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
|
||||
`define RW_RAM_CHECK (* rw_addr_collision = "yes" *)
|
||||
`define DISABLE_BRAM (* ram_style = "registers" *)
|
||||
`define PRESERVE_NET (* keep = "true" *)
|
||||
`define BLACKBOX_CELL (* black_box *)
|
||||
`define STRING
|
||||
`ifndef SIMULATION
|
||||
`define ASYNC_BRAM_PATCH
|
||||
`endif
|
||||
`else
|
||||
`define MAX_FANOUT 8
|
||||
`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256)
|
||||
`define USE_BLOCK_BRAM
|
||||
`define IF_DATA_SIZE(x) x.DATA_WIDTH
|
||||
`define USE_FAST_BRAM
|
||||
`define NO_RW_RAM_CHECK
|
||||
`define RW_RAM_CHECK
|
||||
`define DISABLE_BRAM
|
||||
`define PRESERVE_NET
|
||||
`define BLACKBOX_CELL
|
||||
`define STRING string
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -215,7 +192,7 @@
|
|||
|
||||
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
|
||||
|
||||
`define UP(x) (((x) > 0) ? (x) : 1)
|
||||
`define UP(x) (((x) != 0) ? (x) : 1)
|
||||
|
||||
`define CDIV(n,d) ((n + d - 1) / (d))
|
||||
|
||||
|
@ -227,23 +204,23 @@
|
|||
`define SEXT(len, x) {{(len-$bits(x)+1){x[$bits(x)-1]}}, x[$bits(x)-2:0]}
|
||||
|
||||
`define TRACE_ARRAY1D(lvl, fmt, arr, n) \
|
||||
`TRACE(lvl, ("{")) \
|
||||
`TRACE(lvl, ("{")); \
|
||||
for (integer __i = (n-1); __i >= 0; --__i) begin \
|
||||
if (__i != (n-1)) `TRACE(lvl, (", ")) \
|
||||
`TRACE(lvl, (fmt, arr[__i])) \
|
||||
if (__i != (n-1)) `TRACE(lvl, (", ")); \
|
||||
`TRACE(lvl, (fmt, arr[__i])); \
|
||||
end \
|
||||
`TRACE(lvl, ("}"))
|
||||
`TRACE(lvl, ("}"));
|
||||
|
||||
`define TRACE_ARRAY2D(lvl, fmt, arr, m, n) \
|
||||
`TRACE(lvl, ("{")) \
|
||||
`TRACE(lvl, ("{")); \
|
||||
for (integer __i = n-1; __i >= 0; --__i) begin \
|
||||
if (__i != (n-1)) `TRACE(lvl, (", ")) \
|
||||
`TRACE(lvl, ("{")) \
|
||||
if (__i != (n-1)) `TRACE(lvl, (", ")); \
|
||||
`TRACE(lvl, ("{")); \
|
||||
for (integer __j = (m-1); __j >= 0; --__j) begin \
|
||||
if (__j != (m-1)) `TRACE(lvl, (", "))\
|
||||
`TRACE(lvl, (fmt, arr[__i][__j])) \
|
||||
if (__j != (m-1)) `TRACE(lvl, (", "));\
|
||||
`TRACE(lvl, (fmt, arr[__i][__j])); \
|
||||
end \
|
||||
`TRACE(lvl, ("}")) \
|
||||
`TRACE(lvl, ("}")); \
|
||||
end \
|
||||
`TRACE(lvl, ("}"))
|
||||
|
||||
|
@ -262,13 +239,10 @@
|
|||
`RESET_RELAY_EX (dst, src, 1, 0)
|
||||
|
||||
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 2, 5 -> 2
|
||||
`define TO_OUT_BUF_SIZE(s) `MIN(s & 7, 2)
|
||||
`define TO_OUT_BUF_SIZE(s) `MIN(s, 2)
|
||||
|
||||
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 -> 3
|
||||
`define TO_OUT_BUF_REG(s) (((s & 7) < 2) ? (s & 7) : ((s & 7) - 2))
|
||||
|
||||
// lut(x): (x & 8) != 0
|
||||
`define TO_OUT_BUF_LUTRAM(s) ((s & 8) != 0)
|
||||
`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2))
|
||||
|
||||
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
|
||||
`define _REPEAT_0(f,s)
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -21,66 +21,47 @@
|
|||
input wire scope_bus_in, \
|
||||
output wire scope_bus_out,
|
||||
|
||||
`define SCOPE_IO_SWITCH(__count) \
|
||||
wire scope_bus_in_w [__count]; \
|
||||
wire scope_bus_out_w [__count]; \
|
||||
`RESET_RELAY_EX(scope_reset_w, scope_reset, __count, 4); \
|
||||
VX_scope_switch #( \
|
||||
.N (__count) \
|
||||
) scope_switch ( \
|
||||
.clk (clk), \
|
||||
.reset (scope_reset), \
|
||||
.req_in (scope_bus_in), \
|
||||
.rsp_out (scope_bus_out), \
|
||||
.req_out (scope_bus_in_w), \
|
||||
.rsp_in (scope_bus_out_w) \
|
||||
);
|
||||
|
||||
`define SCOPE_IO_BIND(__i) \
|
||||
.scope_reset (scope_reset_w[__i]), \
|
||||
.scope_bus_in (scope_bus_in_w[__i]), \
|
||||
.scope_bus_out (scope_bus_out_w[__i]),
|
||||
|
||||
`define SCOPE_IO_UNUSED(__i) \
|
||||
`define SCOPE_IO_UNUSED() \
|
||||
`UNUSED_VAR (scope_reset); \
|
||||
`UNUSED_VAR (scope_bus_in); \
|
||||
assign scope_bus_out = 0;
|
||||
|
||||
`define SCOPE_IO_UNUSED_W(__i) \
|
||||
`UNUSED_VAR (scope_reset_w[__i]); \
|
||||
`UNUSED_VAR (scope_bus_in_w[__i]); \
|
||||
assign scope_bus_out_w[__i] = 0;
|
||||
|
||||
`define SCOPE_IO_SWITCH(__count) \
|
||||
wire [__count-1:0] scope_bus_in_w; \
|
||||
wire [__count-1:0] scope_bus_out_w; \
|
||||
wire [__count-1:0] scope_reset_w = {__count{scope_reset}}; \
|
||||
VX_scope_switch #( \
|
||||
.N (__count) \
|
||||
) scope_switch ( \
|
||||
.clk (clk), \
|
||||
.reset (scope_reset), \
|
||||
.req_in (scope_bus_in), \
|
||||
.rsp_out (scope_bus_out), \
|
||||
.req_out (scope_bus_in_w), \
|
||||
.rsp_in (scope_bus_out_w) \
|
||||
)
|
||||
|
||||
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __htriggers_w, __probes_w, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
|
||||
VX_scope_tap #( \
|
||||
.SCOPE_ID (__id), \
|
||||
.XTRIGGERW(__xtriggers_w), \
|
||||
.HTRIGGERW(__htriggers_w), \
|
||||
.PROBEW (__probes_w), \
|
||||
.DEPTH (__depth) \
|
||||
) scope_tap_``idx ( \
|
||||
.clk (clk), \
|
||||
.reset (scope_reset_w[__idx]), \
|
||||
.start (__start), \
|
||||
.stop (__stop), \
|
||||
.xtriggers(__xtriggers), \
|
||||
.htriggers(__htriggers), \
|
||||
.probes (__probes), \
|
||||
.bus_in (scope_bus_in_w[__idx]), \
|
||||
.bus_out(scope_bus_out_w[__idx]) \
|
||||
)
|
||||
|
||||
`define SCOPE_TAP(__idx, __id, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
|
||||
`SCOPE_TAP_EX(__idx, __id, $bits(__xtriggers), $bits(__htriggers), $bits(__probes), __xtriggers, __htriggers, __probes, __start, __stop, __depth)
|
||||
|
||||
`else
|
||||
|
||||
`define SCOPE_IO_DECL
|
||||
|
||||
`define SCOPE_IO_BIND(__i)
|
||||
|
||||
`define SCOPE_IO_UNUSED(__i)
|
||||
|
||||
`define SCOPE_IO_SWITCH(__count)
|
||||
|
||||
`define SCOPE_TAP(__idx, __id, __xtriggers, __probes, __depth)
|
||||
`define SCOPE_IO_BIND(__i)
|
||||
|
||||
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __probes_w, __xtriggers, __probes, __depth)
|
||||
`define SCOPE_IO_UNUSED_W(__i)
|
||||
|
||||
`define SCOPE_IO_UNUSED(__i)
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -24,14 +24,14 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
// DCRs
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
// Memory
|
||||
VX_mem_bus_if.master mem_bus_if [`L1_MEM_PORTS],
|
||||
VX_mem_bus_if.master mem_bus_if,
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
// Barrier
|
||||
|
@ -49,12 +49,14 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
|
||||
|
||||
`RESET_RELAY (gbar_arb_reset, reset);
|
||||
|
||||
VX_gbar_arb #(
|
||||
.NUM_REQS (`SOCKET_SIZE),
|
||||
.OUT_BUF ((`SOCKET_SIZE > 1) ? 2 : 0)
|
||||
) gbar_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (gbar_arb_reset),
|
||||
.bus_in_if (per_core_gbar_bus_if),
|
||||
.bus_out_if (gbar_bus_if)
|
||||
);
|
||||
|
@ -63,13 +65,11 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
cache_perf_t icache_perf, dcache_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.icache = icache_perf;
|
||||
sysmem_perf_tmp.dcache = dcache_perf;
|
||||
end
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.lmem = 'x;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -82,12 +82,12 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
|
||||
) icache_mem_bus_if[1]();
|
||||
) icache_mem_bus_if();
|
||||
|
||||
`RESET_RELAY (icache_reset, reset);
|
||||
|
||||
VX_cache_cluster #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-icache", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)),
|
||||
.NUM_UNITS (`NUM_ICACHES),
|
||||
.NUM_INPUTS (`SOCKET_SIZE),
|
||||
.TAG_SEL_IDX (0),
|
||||
|
@ -97,22 +97,19 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (`ICACHE_NUM_WAYS),
|
||||
.WORD_SIZE (ICACHE_WORD_SIZE),
|
||||
.NUM_REQS (1),
|
||||
.MEM_PORTS (1),
|
||||
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
|
||||
.TAG_WIDTH (ICACHE_TAG_WIDTH),
|
||||
.FLAGS_WIDTH (0),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.WRITE_ENABLE (0),
|
||||
.REPL_POLICY (`ICACHE_REPL_POLICY),
|
||||
.NC_ENABLE (0),
|
||||
.CORE_OUT_BUF (3),
|
||||
.CORE_OUT_BUF (2),
|
||||
.MEM_OUT_BUF (2)
|
||||
) icache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (icache_perf),
|
||||
.cache_perf (mem_perf_tmp_if.icache),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (icache_reset),
|
||||
|
@ -130,12 +127,12 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_LINE_SIZE),
|
||||
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
|
||||
) dcache_mem_bus_if[`L1_MEM_PORTS]();
|
||||
) dcache_mem_bus_if();
|
||||
|
||||
`RESET_RELAY (dcache_reset, reset);
|
||||
|
||||
VX_cache_cluster #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-dcache", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)),
|
||||
.NUM_UNITS (`NUM_DCACHES),
|
||||
.NUM_INPUTS (`SOCKET_SIZE),
|
||||
.TAG_SEL_IDX (0),
|
||||
|
@ -145,24 +142,21 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (`DCACHE_NUM_WAYS),
|
||||
.WORD_SIZE (DCACHE_WORD_SIZE),
|
||||
.NUM_REQS (DCACHE_NUM_REQS),
|
||||
.MEM_PORTS (`L1_MEM_PORTS),
|
||||
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.WRITEBACK (`DCACHE_WRITEBACK),
|
||||
.DIRTY_BYTES (`DCACHE_DIRTYBYTES),
|
||||
.REPL_POLICY (`DCACHE_REPL_POLICY),
|
||||
.DIRTY_BYTES (`DCACHE_WRITEBACK),
|
||||
.NC_ENABLE (1),
|
||||
.CORE_OUT_BUF (3),
|
||||
.CORE_OUT_BUF (2),
|
||||
.MEM_OUT_BUF (2)
|
||||
) dcache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (dcache_perf),
|
||||
.cache_perf (mem_perf_tmp_if.dcache),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (dcache_reset),
|
||||
|
@ -172,64 +166,51 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
for (genvar i = 0; i < `L1_MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
if (i == 0) begin : g_i0
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||
) l1_mem_bus_if[2]();
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
|
||||
) l1_mem_bus_if[2]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if[1]();
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if[1]();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.NUM_OUTPUTS(1),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(0),
|
||||
.ARBITER ("P"), // prioritize the icache
|
||||
.REQ_OUT_BUF(3),
|
||||
.RSP_OUT_BUF(3)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (l1_mem_bus_if),
|
||||
.bus_out_if (l1_mem_arb_bus_if)
|
||||
);
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (0),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF (2),
|
||||
.RSP_OUT_BUF (2)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (l1_mem_bus_if),
|
||||
.bus_out_if (l1_mem_arb_bus_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[0], l1_mem_arb_bus_if[0]);
|
||||
end else begin : g_i
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], l1_mem_arb_bus_if);
|
||||
end
|
||||
end
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`SOCKET_SIZE-1:0] per_core_busy;
|
||||
|
||||
VX_dcr_bus_if core_dcr_bus_if();
|
||||
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
|
||||
|
||||
// Generate all cores
|
||||
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : g_cores
|
||||
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : cores
|
||||
|
||||
`RESET_RELAY (core_reset, reset);
|
||||
|
||||
VX_dcr_bus_if core_dcr_bus_if();
|
||||
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, 1'b1, (`SOCKET_SIZE > 1))
|
||||
|
||||
VX_core #(
|
||||
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id),
|
||||
.INSTANCE_ID (`SFORMATF(("%s-core%0d", INSTANCE_ID, core_id)))
|
||||
.INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id))
|
||||
) core (
|
||||
`SCOPE_IO_BIND (scope_core + core_id)
|
||||
|
||||
|
@ -237,7 +218,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.reset (core_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (core_dcr_bus_if),
|
||||
|
@ -254,6 +235,6 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
);
|
||||
end
|
||||
|
||||
`BUFFER_EX(busy, (| per_core_busy), 1'b1, 1, (`SOCKET_SIZE > 1));
|
||||
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -166,8 +166,6 @@
|
|||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
|
||||
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
|
||||
`define VX_CSR_MPM_MEM_BANK_ST 12'hB1E // bank conflicts
|
||||
`define VX_CSR_MPM_MEM_BANK_ST_H 12'hB9E
|
||||
// PERF: lmem
|
||||
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
|
||||
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B
|
||||
|
@ -175,9 +173,6 @@
|
|||
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
|
||||
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
|
||||
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
|
||||
// PERF: coalescer
|
||||
`define VX_CSR_MPM_COALESCER_MISS 12'hB1F // coalescer misses
|
||||
`define VX_CSR_MPM_COALESCER_MISS_H 12'hB9F
|
||||
|
||||
// Machine Performance-monitoring memory counters (class 3) ///////////////////
|
||||
// <Add your own counters: use addresses hB03..hB1F, hB83..hB9F>
|
||||
|
@ -189,19 +184,6 @@
|
|||
`define VX_CSR_MIMPID 12'hF13
|
||||
`define VX_CSR_MHARTID 12'hF14
|
||||
|
||||
// Vector CSRs
|
||||
|
||||
`define VX_CSR_VSTART 12'h008
|
||||
`define VX_CSR_VXSAT 12'h009
|
||||
`define VX_CSR_VXRM 12'h00A
|
||||
`define VX_CSR_VCSR 12'h00F
|
||||
`define VX_CSR_VL 12'hC20
|
||||
`define VX_CSR_VTYPE 12'hC21
|
||||
`define VX_CSR_VLENB 12'hC22
|
||||
`define VX_CSR_VCYCLE 12'hC00
|
||||
`define VX_CSR_VTIME 12'hC01
|
||||
`define VX_CSR_VINSTRET 12'hC02
|
||||
|
||||
// GPGPU CSRs
|
||||
|
||||
`define VX_CSR_THREAD_ID 12'hCC0
|
||||
|
@ -215,10 +197,4 @@
|
|||
`define VX_CSR_NUM_CORES 12'hFC2
|
||||
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
|
||||
|
||||
`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size
|
||||
`define VX_TC_NUM 12'hFC5
|
||||
`define VX_TC_SIZE 12'hFC6
|
||||
|
||||
|
||||
|
||||
`endif // VX_TYPES_VH
|
||||
|
|
151
hw/rtl/Vortex.sv
151
hw/rtl/Vortex.sv
|
@ -21,19 +21,19 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
input wire reset,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid [`VX_MEM_PORTS],
|
||||
output wire mem_req_rw [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS],
|
||||
input wire mem_req_ready [`VX_MEM_PORTS],
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
|
||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid [`VX_MEM_PORTS],
|
||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS],
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS],
|
||||
output wire mem_rsp_ready [`VX_MEM_PORTS],
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// DCR write request
|
||||
input wire dcr_wr_valid,
|
||||
|
@ -50,25 +50,22 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
cache_perf_t l3_perf;
|
||||
mem_perf_t mem_perf;
|
||||
sysmem_perf_t sysmem_perf;
|
||||
always @(*) begin
|
||||
sysmem_perf = '0;
|
||||
sysmem_perf.l3cache = l3_perf;
|
||||
sysmem_perf.mem = mem_perf;
|
||||
end
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = 'x;
|
||||
assign mem_perf_if.dcache = 'x;
|
||||
assign mem_perf_if.l2cache = 'x;
|
||||
assign mem_perf_if.lmem = 'x;
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L2_LINE_SIZE),
|
||||
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
|
||||
) per_cluster_mem_bus_if[`NUM_CLUSTERS * `L2_MEM_PORTS]();
|
||||
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L3_LINE_SIZE),
|
||||
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
|
||||
) mem_bus_if[`L3_MEM_PORTS]();
|
||||
) mem_bus_if();
|
||||
|
||||
`RESET_RELAY (l3_reset, reset);
|
||||
|
||||
|
@ -80,7 +77,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.NUM_WAYS (`L3_NUM_WAYS),
|
||||
.WORD_SIZE (L3_WORD_SIZE),
|
||||
.NUM_REQS (L3_NUM_REQS),
|
||||
.MEM_PORTS (`L3_MEM_PORTS),
|
||||
.CRSQ_SIZE (`L3_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`L3_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L3_MRSQ_SIZE),
|
||||
|
@ -88,12 +84,10 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.WRITEBACK (`L3_WRITEBACK),
|
||||
.DIRTY_BYTES (`L3_DIRTYBYTES),
|
||||
.REPL_POLICY (`L3_REPL_POLICY),
|
||||
.DIRTY_BYTES (`L3_WRITEBACK),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.CORE_OUT_BUF (3),
|
||||
.MEM_OUT_BUF (3),
|
||||
.CORE_OUT_BUF (2),
|
||||
.MEM_OUT_BUF (2),
|
||||
.NC_ENABLE (1),
|
||||
.PASSTHRU (!`L3_ENABLED)
|
||||
) l3cache (
|
||||
|
@ -101,28 +95,31 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.reset (l3_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (l3_perf),
|
||||
.cache_perf (mem_perf_if.l3cache),
|
||||
`endif
|
||||
|
||||
.core_bus_if (per_cluster_mem_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `L3_MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
|
||||
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
|
||||
assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen;
|
||||
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
|
||||
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
|
||||
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
|
||||
`UNUSED_VAR (mem_bus_if[i].req_data.flags)
|
||||
assign mem_bus_if[i].req_ready = mem_req_ready[i];
|
||||
assign mem_req_valid = mem_bus_if.req_valid;
|
||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||
assign mem_req_byteen= mem_bus_if.req_data.byteen;
|
||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
||||
assign mem_req_data = mem_bus_if.req_data.data;
|
||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
||||
assign mem_bus_if.req_ready = mem_req_ready;
|
||||
`UNUSED_VAR (mem_bus_if.req_data.atype)
|
||||
|
||||
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
|
||||
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
|
||||
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
|
||||
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
|
||||
end
|
||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
||||
|
||||
wire mem_req_fire = mem_req_valid && mem_req_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
`UNUSED_VAR (mem_req_fire)
|
||||
`UNUSED_VAR (mem_rsp_fire)
|
||||
|
||||
VX_dcr_bus_if dcr_bus_if();
|
||||
assign dcr_bus_if.write_valid = dcr_wr_valid;
|
||||
|
@ -132,16 +129,16 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
|
||||
|
||||
// Generate all clusters
|
||||
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : g_clusters
|
||||
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : clusters
|
||||
|
||||
`RESET_RELAY (cluster_reset, reset);
|
||||
|
||||
VX_dcr_bus_if cluster_dcr_bus_if();
|
||||
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, 1'b1, (`NUM_CLUSTERS > 1))
|
||||
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
|
||||
|
||||
VX_cluster #(
|
||||
.CLUSTER_ID (cluster_id),
|
||||
.INSTANCE_ID (`SFORMATF(("cluster%0d", cluster_id)))
|
||||
.INSTANCE_ID ($sformatf("cluster%0d", cluster_id))
|
||||
) cluster (
|
||||
`SCOPE_IO_BIND (scope_cluster + cluster_id)
|
||||
|
||||
|
@ -149,83 +146,59 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.reset (cluster_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.mem_perf_if (mem_perf_if),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (cluster_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_cluster_mem_bus_if[cluster_id * `L2_MEM_PORTS +: `L2_MEM_PORTS]),
|
||||
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
|
||||
|
||||
.busy (per_cluster_busy[cluster_id])
|
||||
);
|
||||
end
|
||||
|
||||
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, 1, (`NUM_CLUSTERS > 1));
|
||||
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
localparam MEM_PORTS_CTR_W = `CLOG2(`VX_MEM_PORTS+1);
|
||||
|
||||
wire [`VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire;
|
||||
wire [`VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire;
|
||||
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_perf_ctrs
|
||||
assign mem_req_fire[i] = mem_req_valid[i] & mem_req_ready[i];
|
||||
assign mem_rsp_fire[i] = mem_rsp_valid[i] & mem_rsp_ready[i];
|
||||
assign mem_rd_req_fire[i] = mem_req_fire[i] & ~mem_req_rw[i];
|
||||
assign mem_wr_req_fire[i] = mem_req_fire[i] & mem_req_rw[i];
|
||||
end
|
||||
|
||||
wire [MEM_PORTS_CTR_W-1:0] perf_mem_reads_per_cycle;
|
||||
wire [MEM_PORTS_CTR_W-1:0] perf_mem_writes_per_cycle;
|
||||
wire [MEM_PORTS_CTR_W-1:0] perf_mem_rsps_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_mem_reads_per_cycle, mem_rd_req_fire);
|
||||
`POP_COUNT(perf_mem_writes_per_cycle, mem_wr_req_fire);
|
||||
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
mem_perf_t mem_perf;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
|
||||
`PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle)));
|
||||
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
|
||||
end
|
||||
end
|
||||
|
||||
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
|
||||
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
mem_perf <= '0;
|
||||
end else begin
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(perf_mem_reads_per_cycle);
|
||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(perf_mem_writes_per_cycle);
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
|
||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
|
||||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
assign mem_perf_if.mem = mem_perf;
|
||||
|
||||
`endif
|
||||
|
||||
// dump device configuration
|
||||
initial begin
|
||||
`TRACE(0, ("CONFIGS: num_threads=%0d, num_warps=%0d, num_cores=%0d, num_clusters=%0d, socket_size=%0d, local_mem_base=0x%0h, num_barriers=%0d\n",
|
||||
`NUM_THREADS, `NUM_WARPS, `NUM_CORES, `NUM_CLUSTERS, `SOCKET_SIZE, `LMEM_BASE_ADDR, `NUM_BARRIERS))
|
||||
end
|
||||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_trace
|
||||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: MEM Wr Req[%0d]: addr=0x%0h, byteen=0x%h data=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: MEM Rd Req[%0d]: addr=0x%0h, byteen=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: MEM Rd Rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", $time, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_fire) begin
|
||||
if (mem_req_rw)
|
||||
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
|
||||
else
|
||||
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -82,26 +82,112 @@ module Vortex_axi import VX_gpu_pkg::*; #(
|
|||
// Status
|
||||
output wire busy
|
||||
);
|
||||
localparam DST_LDATAW = `CLOG2(AXI_DATA_WIDTH);
|
||||
localparam SRC_LDATAW = `CLOG2(`VX_MEM_DATA_WIDTH);
|
||||
localparam SUB_LDATAW = DST_LDATAW - SRC_LDATAW;
|
||||
localparam VX_MEM_TAG_A_WIDTH = `VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0);
|
||||
localparam VX_MEM_ADDR_A_WIDTH = `VX_MEM_ADDR_WIDTH - SUB_LDATAW;
|
||||
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
|
||||
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `MEM_ADDR_WIDTH), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH))
|
||||
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
|
||||
|
||||
wire mem_req_valid [`VX_MEM_PORTS];
|
||||
wire mem_req_rw [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS];
|
||||
wire mem_req_ready [`VX_MEM_PORTS];
|
||||
wire mem_req_valid;
|
||||
wire mem_req_rw;
|
||||
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen;
|
||||
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr;
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data;
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag;
|
||||
wire mem_req_ready;
|
||||
|
||||
wire mem_rsp_valid [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS];
|
||||
wire mem_rsp_ready [`VX_MEM_PORTS];
|
||||
wire mem_rsp_valid;
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data;
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
|
||||
wire mem_rsp_ready;
|
||||
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
wire [`MEM_ADDR_WIDTH-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
|
||||
wire [`MEM_ADDR_WIDTH-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
|
||||
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
|
||||
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
|
||||
|
||||
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
|
||||
assign m_axi_awaddr[i] = `MEM_ADDR_WIDTH'(m_axi_awaddr_unqual[i]);
|
||||
assign m_axi_araddr[i] = `MEM_ADDR_WIDTH'(m_axi_araddr_unqual[i]);
|
||||
|
||||
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
|
||||
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
|
||||
|
||||
assign m_axi_rid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_rid[i]);
|
||||
assign m_axi_bid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_bid[i]);
|
||||
end
|
||||
|
||||
VX_axi_adapter #(
|
||||
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`MEM_ADDR_WIDTH),
|
||||
.TAG_WIDTH (`VX_MEM_TAG_WIDTH),
|
||||
.NUM_BANKS (AXI_NUM_BANKS),
|
||||
.RSP_OUT_BUF((AXI_NUM_BANKS > 1) ? 2 : 0)
|
||||
) axi_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.mem_req_valid (mem_req_valid),
|
||||
.mem_req_rw (mem_req_rw),
|
||||
.mem_req_byteen (mem_req_byteen),
|
||||
.mem_req_addr (mem_req_addr),
|
||||
.mem_req_data (mem_req_data),
|
||||
.mem_req_tag (mem_req_tag),
|
||||
.mem_req_ready (mem_req_ready),
|
||||
|
||||
.mem_rsp_valid (mem_rsp_valid),
|
||||
.mem_rsp_data (mem_rsp_data),
|
||||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_ready (mem_rsp_ready),
|
||||
|
||||
.m_axi_awvalid (m_axi_awvalid),
|
||||
.m_axi_awready (m_axi_awready),
|
||||
.m_axi_awaddr (m_axi_awaddr_unqual),
|
||||
.m_axi_awid (m_axi_awid_unqual),
|
||||
.m_axi_awlen (m_axi_awlen),
|
||||
.m_axi_awsize (m_axi_awsize),
|
||||
.m_axi_awburst (m_axi_awburst),
|
||||
.m_axi_awlock (m_axi_awlock),
|
||||
.m_axi_awcache (m_axi_awcache),
|
||||
.m_axi_awprot (m_axi_awprot),
|
||||
.m_axi_awqos (m_axi_awqos),
|
||||
.m_axi_awregion (m_axi_awregion),
|
||||
|
||||
.m_axi_wvalid (m_axi_wvalid),
|
||||
.m_axi_wready (m_axi_wready),
|
||||
.m_axi_wdata (m_axi_wdata),
|
||||
.m_axi_wstrb (m_axi_wstrb),
|
||||
.m_axi_wlast (m_axi_wlast),
|
||||
|
||||
.m_axi_bvalid (m_axi_bvalid),
|
||||
.m_axi_bready (m_axi_bready),
|
||||
.m_axi_bid (m_axi_bid_unqual),
|
||||
.m_axi_bresp (m_axi_bresp),
|
||||
|
||||
.m_axi_arvalid (m_axi_arvalid),
|
||||
.m_axi_arready (m_axi_arready),
|
||||
.m_axi_araddr (m_axi_araddr_unqual),
|
||||
.m_axi_arid (m_axi_arid_unqual),
|
||||
.m_axi_arlen (m_axi_arlen),
|
||||
.m_axi_arsize (m_axi_arsize),
|
||||
.m_axi_arburst (m_axi_arburst),
|
||||
.m_axi_arlock (m_axi_arlock),
|
||||
.m_axi_arcache (m_axi_arcache),
|
||||
.m_axi_arprot (m_axi_arprot),
|
||||
.m_axi_arqos (m_axi_arqos),
|
||||
.m_axi_arregion (m_axi_arregion),
|
||||
|
||||
.m_axi_rvalid (m_axi_rvalid),
|
||||
.m_axi_rready (m_axi_rready),
|
||||
.m_axi_rdata (m_axi_rdata),
|
||||
.m_axi_rlast (m_axi_rlast) ,
|
||||
.m_axi_rid (m_axi_rid_unqual),
|
||||
.m_axi_rresp (m_axi_rresp)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (1)
|
||||
|
||||
Vortex vortex (
|
||||
`SCOPE_IO_BIND (0)
|
||||
|
@ -129,133 +215,4 @@ module Vortex_axi import VX_gpu_pkg::*; #(
|
|||
.busy (busy)
|
||||
);
|
||||
|
||||
wire mem_req_valid_a [`VX_MEM_PORTS];
|
||||
wire mem_req_rw_a [`VX_MEM_PORTS];
|
||||
wire [(AXI_DATA_WIDTH/8)-1:0] mem_req_byteen_a [`VX_MEM_PORTS];
|
||||
wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a [`VX_MEM_PORTS];
|
||||
wire [AXI_DATA_WIDTH-1:0] mem_req_data_a [`VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a [`VX_MEM_PORTS];
|
||||
wire mem_req_ready_a [`VX_MEM_PORTS];
|
||||
|
||||
wire mem_rsp_valid_a [`VX_MEM_PORTS];
|
||||
wire [AXI_DATA_WIDTH-1:0] mem_rsp_data_a [`VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a [`VX_MEM_PORTS];
|
||||
wire mem_rsp_ready_a [`VX_MEM_PORTS];
|
||||
|
||||
// Adjust memory data width to match AXI interface
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; i++) begin : g_mem_adapter
|
||||
VX_mem_data_adapter #(
|
||||
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
|
||||
.DST_DATA_WIDTH (AXI_DATA_WIDTH),
|
||||
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
|
||||
.DST_ADDR_WIDTH (VX_MEM_ADDR_A_WIDTH),
|
||||
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
|
||||
.DST_TAG_WIDTH (VX_MEM_TAG_A_WIDTH),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (0)
|
||||
) mem_data_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.mem_req_valid_in (mem_req_valid[i]),
|
||||
.mem_req_addr_in (mem_req_addr[i]),
|
||||
.mem_req_rw_in (mem_req_rw[i]),
|
||||
.mem_req_byteen_in (mem_req_byteen[i]),
|
||||
.mem_req_data_in (mem_req_data[i]),
|
||||
.mem_req_tag_in (mem_req_tag[i]),
|
||||
.mem_req_ready_in (mem_req_ready[i]),
|
||||
|
||||
.mem_rsp_valid_in (mem_rsp_valid[i]),
|
||||
.mem_rsp_data_in (mem_rsp_data[i]),
|
||||
.mem_rsp_tag_in (mem_rsp_tag[i]),
|
||||
.mem_rsp_ready_in (mem_rsp_ready[i]),
|
||||
|
||||
.mem_req_valid_out (mem_req_valid_a[i]),
|
||||
.mem_req_addr_out (mem_req_addr_a[i]),
|
||||
.mem_req_rw_out (mem_req_rw_a[i]),
|
||||
.mem_req_byteen_out (mem_req_byteen_a[i]),
|
||||
.mem_req_data_out (mem_req_data_a[i]),
|
||||
.mem_req_tag_out (mem_req_tag_a[i]),
|
||||
.mem_req_ready_out (mem_req_ready_a[i]),
|
||||
|
||||
.mem_rsp_valid_out (mem_rsp_valid_a[i]),
|
||||
.mem_rsp_data_out (mem_rsp_data_a[i]),
|
||||
.mem_rsp_tag_out (mem_rsp_tag_a[i]),
|
||||
.mem_rsp_ready_out (mem_rsp_ready_a[i])
|
||||
);
|
||||
end
|
||||
|
||||
VX_axi_adapter #(
|
||||
.DATA_WIDTH (AXI_DATA_WIDTH),
|
||||
.ADDR_WIDTH_IN (VX_MEM_ADDR_A_WIDTH),
|
||||
.ADDR_WIDTH_OUT (AXI_ADDR_WIDTH),
|
||||
.TAG_WIDTH_IN (VX_MEM_TAG_A_WIDTH),
|
||||
.TAG_WIDTH_OUT (AXI_TID_WIDTH),
|
||||
.NUM_PORTS_IN (`VX_MEM_PORTS),
|
||||
.NUM_BANKS_OUT (AXI_NUM_BANKS),
|
||||
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
|
||||
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
|
||||
) axi_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.mem_req_valid (mem_req_valid_a),
|
||||
.mem_req_rw (mem_req_rw_a),
|
||||
.mem_req_byteen (mem_req_byteen_a),
|
||||
.mem_req_addr (mem_req_addr_a),
|
||||
.mem_req_data (mem_req_data_a),
|
||||
.mem_req_tag (mem_req_tag_a),
|
||||
.mem_req_ready (mem_req_ready_a),
|
||||
|
||||
.mem_rsp_valid (mem_rsp_valid_a),
|
||||
.mem_rsp_data (mem_rsp_data_a),
|
||||
.mem_rsp_tag (mem_rsp_tag_a),
|
||||
.mem_rsp_ready (mem_rsp_ready_a),
|
||||
|
||||
.m_axi_awvalid (m_axi_awvalid),
|
||||
.m_axi_awready (m_axi_awready),
|
||||
.m_axi_awaddr (m_axi_awaddr),
|
||||
.m_axi_awid (m_axi_awid),
|
||||
.m_axi_awlen (m_axi_awlen),
|
||||
.m_axi_awsize (m_axi_awsize),
|
||||
.m_axi_awburst (m_axi_awburst),
|
||||
.m_axi_awlock (m_axi_awlock),
|
||||
.m_axi_awcache (m_axi_awcache),
|
||||
.m_axi_awprot (m_axi_awprot),
|
||||
.m_axi_awqos (m_axi_awqos),
|
||||
.m_axi_awregion (m_axi_awregion),
|
||||
|
||||
.m_axi_wvalid (m_axi_wvalid),
|
||||
.m_axi_wready (m_axi_wready),
|
||||
.m_axi_wdata (m_axi_wdata),
|
||||
.m_axi_wstrb (m_axi_wstrb),
|
||||
.m_axi_wlast (m_axi_wlast),
|
||||
|
||||
.m_axi_bvalid (m_axi_bvalid),
|
||||
.m_axi_bready (m_axi_bready),
|
||||
.m_axi_bid (m_axi_bid),
|
||||
.m_axi_bresp (m_axi_bresp),
|
||||
|
||||
.m_axi_arvalid (m_axi_arvalid),
|
||||
.m_axi_arready (m_axi_arready),
|
||||
.m_axi_araddr (m_axi_araddr),
|
||||
.m_axi_arid (m_axi_arid),
|
||||
.m_axi_arlen (m_axi_arlen),
|
||||
.m_axi_arsize (m_axi_arsize),
|
||||
.m_axi_arburst (m_axi_arburst),
|
||||
.m_axi_arlock (m_axi_arlock),
|
||||
.m_axi_arcache (m_axi_arcache),
|
||||
.m_axi_arprot (m_axi_arprot),
|
||||
.m_axi_arqos (m_axi_arqos),
|
||||
.m_axi_arregion (m_axi_arregion),
|
||||
|
||||
.m_axi_rvalid (m_axi_rvalid),
|
||||
.m_axi_rready (m_axi_rready),
|
||||
.m_axi_rdata (m_axi_rdata),
|
||||
.m_axi_rlast (m_axi_rlast),
|
||||
.m_axi_rid (m_axi_rid),
|
||||
.m_axi_rresp (m_axi_rresp)
|
||||
);
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -28,19 +28,9 @@
|
|||
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
// POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
`include "VX_define.vh"
|
||||
//`include "platform_afu_top_config.vh"
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH ((`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS)) - $clog2(`PLATFORM_MEMORY_DATA_SIZE))
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH (`PLATFORM_MEMORY_DATA_SIZE * 8)
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
|
||||
`endif
|
||||
`ifdef PLATFORM_PROVIDES_LOCAL_MEMORY
|
||||
|
||||
package local_mem_cfg_pkg;
|
||||
|
||||
|
@ -67,3 +57,5 @@ package local_mem_cfg_pkg;
|
|||
typedef logic [LOCAL_MEM_DATA_N_BYTES-1:0] t_local_mem_byte_mask;
|
||||
|
||||
endpackage // local_mem_cfg_pkg
|
||||
|
||||
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -17,9 +17,9 @@
|
|||
`define AFU_ACCEL_NAME "vortex_afu"
|
||||
`define AFU_ACCEL_UUID 128'h35F9452B_25C2_434C_93D5_6F8C60DB361C
|
||||
|
||||
`define AFU_IMAGE_CMD_MEM_READ 1
|
||||
`define AFU_IMAGE_CMD_MEM_READ 1
|
||||
`define AFU_IMAGE_CMD_MEM_WRITE 2
|
||||
`define AFU_IMAGE_CMD_RUN 3
|
||||
`define AFU_IMAGE_CMD_RUN 3
|
||||
`define AFU_IMAGE_CMD_DCR_WRITE 4
|
||||
`define AFU_IMAGE_CMD_MAX_VALUE 4
|
||||
|
||||
|
|
|
@ -14,20 +14,22 @@
|
|||
`include "vortex_afu.vh"
|
||||
|
||||
module VX_afu_ctrl #(
|
||||
parameter S_AXI_ADDR_WIDTH = 8,
|
||||
parameter S_AXI_DATA_WIDTH = 32
|
||||
parameter AXI_ADDR_WIDTH = 8,
|
||||
parameter AXI_DATA_WIDTH = 32,
|
||||
parameter AXI_NUM_BANKS = 1
|
||||
) (
|
||||
// axi4 lite slave signals
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire clk_en,
|
||||
|
||||
input wire s_axi_awvalid,
|
||||
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
|
||||
input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
|
||||
output wire s_axi_awready,
|
||||
|
||||
input wire s_axi_wvalid,
|
||||
input wire [S_AXI_DATA_WIDTH-1:0] s_axi_wdata,
|
||||
input wire [S_AXI_DATA_WIDTH/8-1:0]s_axi_wstrb,
|
||||
input wire [AXI_DATA_WIDTH-1:0] s_axi_wdata,
|
||||
input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
|
||||
output wire s_axi_wready,
|
||||
|
||||
output wire s_axi_bvalid,
|
||||
|
@ -35,11 +37,11 @@ module VX_afu_ctrl #(
|
|||
input wire s_axi_bready,
|
||||
|
||||
input wire s_axi_arvalid,
|
||||
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_araddr,
|
||||
input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
|
||||
output wire s_axi_arready,
|
||||
|
||||
output wire s_axi_rvalid,
|
||||
output wire [S_AXI_DATA_WIDTH-1:0] s_axi_rdata,
|
||||
output wire [AXI_DATA_WIDTH-1:0] s_axi_rdata,
|
||||
output wire [1:0] s_axi_rresp,
|
||||
input wire s_axi_rready,
|
||||
|
||||
|
@ -50,13 +52,13 @@ module VX_afu_ctrl #(
|
|||
input wire ap_idle,
|
||||
output wire interrupt,
|
||||
|
||||
output wire ap_ctrl_read,
|
||||
|
||||
`ifdef SCOPE
|
||||
input wire scope_bus_in,
|
||||
output wire scope_bus_out,
|
||||
`endif
|
||||
|
||||
output wire [63:0] mem_base [AXI_NUM_BANKS],
|
||||
|
||||
output wire dcr_wr_valid,
|
||||
output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
output wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
|
||||
|
@ -108,38 +110,39 @@ module VX_afu_ctrl #(
|
|||
|
||||
ADDR_DEV_0 = 8'h10,
|
||||
ADDR_DEV_1 = 8'h14,
|
||||
//ADDR_DEV_CTRL = 8'h18,
|
||||
|
||||
ADDR_ISA_0 = 8'h18,
|
||||
ADDR_ISA_1 = 8'h1C,
|
||||
ADDR_ISA_0 = 8'h1C,
|
||||
ADDR_ISA_1 = 8'h20,
|
||||
//ADDR_ISA_CTRL = 8'h24,
|
||||
|
||||
ADDR_DCR_0 = 8'h20,
|
||||
ADDR_DCR_1 = 8'h24,
|
||||
ADDR_DCR_0 = 8'h28,
|
||||
ADDR_DCR_1 = 8'h2C,
|
||||
//ADDR_DCR_CTRL = 8'h30,
|
||||
|
||||
`ifdef SCOPE
|
||||
ADDR_SCP_0 = 8'h28,
|
||||
ADDR_SCP_1 = 8'h2C,
|
||||
ADDR_SCP_0 = 8'h34,
|
||||
ADDR_SCP_1 = 8'h38,
|
||||
//ADDR_SCP_CTRL = 8'h3C,
|
||||
`endif
|
||||
|
||||
ADDR_MEM_0 = 8'h40,
|
||||
ADDR_MEM_1 = 8'h44,
|
||||
//ADDR_MEM_CTRL = 8'h48,
|
||||
|
||||
ADDR_BITS = 8;
|
||||
|
||||
localparam
|
||||
WSTATE_ADDR = 2'd0,
|
||||
WSTATE_IDLE = 2'd0,
|
||||
WSTATE_DATA = 2'd1,
|
||||
WSTATE_RESP = 2'd2,
|
||||
WSTATE_WIDTH = 2;
|
||||
WSTATE_RESP = 2'd2;
|
||||
|
||||
localparam
|
||||
RSTATE_ADDR = 2'd0,
|
||||
RSTATE_DATA = 2'd1,
|
||||
RSTATE_RESP = 2'd2,
|
||||
RSTATE_WIDTH = 2;
|
||||
|
||||
localparam MEMORY_BANK_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - `CLOG2(`PLATFORM_MEMORY_NUM_BANKS);
|
||||
RSTATE_IDLE = 2'd0,
|
||||
RSTATE_DATA = 2'd1;
|
||||
|
||||
// device caps
|
||||
wire [63:0] dev_caps = {8'b0,
|
||||
5'(MEMORY_BANK_ADDR_WIDTH-20),
|
||||
3'(`CLOG2(`PLATFORM_MEMORY_NUM_BANKS)),
|
||||
wire [63:0] dev_caps = {16'b0,
|
||||
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
|
@ -150,18 +153,16 @@ module VX_afu_ctrl #(
|
|||
2'(`CLOG2(`XLEN)-4),
|
||||
30'(`MISA_STD)};
|
||||
|
||||
reg [WSTATE_WIDTH-1:0] wstate;
|
||||
reg [1:0] wstate;
|
||||
reg [ADDR_BITS-1:0] waddr;
|
||||
wire [31:0] wmask;
|
||||
wire s_axi_aw_fire;
|
||||
wire s_axi_w_fire;
|
||||
wire s_axi_b_fire;
|
||||
|
||||
logic [RSTATE_WIDTH-1:0] rstate;
|
||||
reg [1:0] rstate;
|
||||
reg [31:0] rdata;
|
||||
reg [ADDR_BITS-1:0] raddr;
|
||||
wire [ADDR_BITS-1:0] raddr;
|
||||
wire s_axi_ar_fire;
|
||||
wire s_axi_r_fire;
|
||||
|
||||
reg ap_reset_r;
|
||||
reg ap_start_r;
|
||||
|
@ -169,23 +170,20 @@ module VX_afu_ctrl #(
|
|||
reg gie_r;
|
||||
reg [1:0] ier_r;
|
||||
reg [1:0] isr_r;
|
||||
reg [63:0] mem_r [AXI_NUM_BANKS];
|
||||
reg [31:0] dcra_r;
|
||||
reg [31:0] dcrv_r;
|
||||
reg dcr_wr_valid_r;
|
||||
|
||||
logic wready_stall;
|
||||
logic rvalid_stall;
|
||||
|
||||
`ifdef SCOPE
|
||||
|
||||
reg [63:0] scope_bus_wdata, scope_bus_rdata;
|
||||
reg [63:0] scope_bus_wdata;
|
||||
reg [63:0] scope_bus_rdata;
|
||||
reg [5:0] scope_bus_ctr;
|
||||
|
||||
reg cmd_scope_writing, cmd_scope_reading;
|
||||
reg cmd_scope_reading;
|
||||
reg cmd_scope_writing;
|
||||
reg scope_bus_out_r;
|
||||
reg scope_rdata_valid;
|
||||
|
||||
reg is_scope_waddr, is_scope_raddr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -193,33 +191,18 @@ module VX_afu_ctrl #(
|
|||
cmd_scope_writing <= 0;
|
||||
scope_bus_ctr <= '0;
|
||||
scope_bus_out_r <= 0;
|
||||
is_scope_waddr <= 0;
|
||||
is_scope_raddr <= 0;
|
||||
scope_bus_rdata <= '0;
|
||||
scope_rdata_valid <= 0;
|
||||
end else begin
|
||||
scope_bus_out_r <= 0;
|
||||
if (s_axi_aw_fire) begin
|
||||
is_scope_waddr <= (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|
||||
|| (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_1);
|
||||
end
|
||||
if (s_axi_ar_fire) begin
|
||||
is_scope_raddr <= (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|
||||
|| (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_1);
|
||||
end
|
||||
end else if (clk_en) begin
|
||||
if (s_axi_w_fire && waddr == ADDR_SCP_0) begin
|
||||
scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask);
|
||||
end
|
||||
if (s_axi_w_fire && waddr == ADDR_SCP_1) begin
|
||||
scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
|
||||
cmd_scope_writing <= 1;
|
||||
scope_rdata_valid <= 0;
|
||||
scope_bus_out_r <= 1;
|
||||
scope_bus_ctr <= 63;
|
||||
end
|
||||
if (scope_bus_in) begin
|
||||
cmd_scope_reading <= 1;
|
||||
scope_bus_rdata <= '0;
|
||||
scope_bus_ctr <= 63;
|
||||
end
|
||||
if (cmd_scope_reading) begin
|
||||
|
@ -227,16 +210,13 @@ module VX_afu_ctrl #(
|
|||
scope_bus_ctr <= scope_bus_ctr - 1;
|
||||
if (scope_bus_ctr == 0) begin
|
||||
cmd_scope_reading <= 0;
|
||||
scope_rdata_valid <= 1;
|
||||
scope_bus_ctr <= 0;
|
||||
end
|
||||
end
|
||||
if (cmd_scope_writing) begin
|
||||
scope_bus_out_r <= scope_bus_wdata[scope_bus_ctr];
|
||||
scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr);
|
||||
scope_bus_ctr <= scope_bus_ctr - 1;
|
||||
if (scope_bus_ctr == 0) begin
|
||||
cmd_scope_writing <= 0;
|
||||
scope_bus_ctr <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -244,50 +224,41 @@ module VX_afu_ctrl #(
|
|||
|
||||
assign scope_bus_out = scope_bus_out_r;
|
||||
|
||||
assign wready_stall = is_scope_waddr && cmd_scope_writing;
|
||||
assign rvalid_stall = is_scope_raddr && ~scope_rdata_valid;
|
||||
|
||||
`else
|
||||
|
||||
assign wready_stall = 0;
|
||||
assign rvalid_stall = 0;
|
||||
|
||||
`endif
|
||||
|
||||
// AXI Write Request
|
||||
assign s_axi_awready = (wstate == WSTATE_ADDR);
|
||||
assign s_axi_wready = (wstate == WSTATE_DATA) && ~wready_stall;
|
||||
// AXI Write
|
||||
|
||||
// AXI Write Response
|
||||
assign s_axi_awready = (wstate == WSTATE_IDLE);
|
||||
assign s_axi_wready = (wstate == WSTATE_DATA);
|
||||
assign s_axi_bvalid = (wstate == WSTATE_RESP);
|
||||
assign s_axi_bresp = 2'b00; // OKAY
|
||||
|
||||
for (genvar i = 0; i < 4; ++i) begin : g_wmask
|
||||
assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
|
||||
end
|
||||
|
||||
assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
|
||||
assign s_axi_w_fire = s_axi_wvalid && s_axi_wready;
|
||||
assign s_axi_b_fire = s_axi_bvalid && s_axi_bready;
|
||||
|
||||
for (genvar i = 0; i < 4; ++i) begin
|
||||
assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
|
||||
end
|
||||
|
||||
// wstate
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
wstate <= WSTATE_ADDR;
|
||||
end else begin
|
||||
wstate <= WSTATE_IDLE;
|
||||
end else if (clk_en) begin
|
||||
case (wstate)
|
||||
WSTATE_ADDR: wstate <= s_axi_aw_fire ? WSTATE_DATA : WSTATE_ADDR;
|
||||
WSTATE_DATA: wstate <= s_axi_w_fire ? WSTATE_RESP : WSTATE_DATA;
|
||||
WSTATE_RESP: wstate <= s_axi_b_fire ? WSTATE_ADDR : WSTATE_RESP;
|
||||
default: wstate <= WSTATE_ADDR;
|
||||
WSTATE_IDLE: wstate <= s_axi_awvalid ? WSTATE_DATA : WSTATE_IDLE;
|
||||
WSTATE_DATA: wstate <= s_axi_wvalid ? WSTATE_RESP : WSTATE_DATA;
|
||||
WSTATE_RESP: wstate <= s_axi_bready ? WSTATE_IDLE : WSTATE_RESP;
|
||||
default: wstate <= WSTATE_IDLE;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// waddr
|
||||
always @(posedge clk) begin
|
||||
if (s_axi_aw_fire) begin
|
||||
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
|
||||
if (clk_en) begin
|
||||
if (s_axi_aw_fire)
|
||||
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -305,13 +276,16 @@ module VX_afu_ctrl #(
|
|||
dcra_r <= '0;
|
||||
dcrv_r <= '0;
|
||||
dcr_wr_valid_r <= 0;
|
||||
end else begin
|
||||
dcr_wr_valid_r <= 0;
|
||||
ap_reset_r <= 0;
|
||||
|
||||
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
|
||||
mem_r[i] <= '0;
|
||||
end
|
||||
end else if (clk_en) begin
|
||||
if (ap_ready)
|
||||
ap_start_r <= auto_restart_r;
|
||||
|
||||
dcr_wr_valid_r <= 0;
|
||||
|
||||
if (s_axi_w_fire) begin
|
||||
case (waddr)
|
||||
ADDR_AP_CTRL: begin
|
||||
|
@ -343,7 +317,16 @@ module VX_afu_ctrl #(
|
|||
dcrv_r <= (s_axi_wdata & wmask) | (dcrv_r & ~wmask);
|
||||
dcr_wr_valid_r <= 1;
|
||||
end
|
||||
default:;
|
||||
default: begin
|
||||
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
|
||||
if (waddr == (ADDR_MEM_0 + 8'(i) * 8'd12)) begin
|
||||
mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
|
||||
end
|
||||
if (waddr == (ADDR_MEM_1 + 8'(i) * 8'd12)) begin
|
||||
mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
|
||||
end
|
||||
end
|
||||
end
|
||||
endcase
|
||||
|
||||
if (ier_r[0] & ap_done)
|
||||
|
@ -354,87 +337,82 @@ module VX_afu_ctrl #(
|
|||
end
|
||||
end
|
||||
|
||||
// AXI Read Request
|
||||
assign s_axi_arready = (rstate == RSTATE_ADDR);
|
||||
// AXI Read
|
||||
|
||||
// AXI Read Response
|
||||
assign s_axi_rvalid = (rstate == RSTATE_RESP);
|
||||
assign s_axi_arready = (rstate == RSTATE_IDLE);
|
||||
assign s_axi_rvalid = (rstate == RSTATE_DATA);
|
||||
assign s_axi_rdata = rdata;
|
||||
assign s_axi_rresp = 2'b00; // OKAY
|
||||
|
||||
assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready;
|
||||
assign s_axi_r_fire = s_axi_rvalid && s_axi_rready;
|
||||
assign raddr = s_axi_araddr[ADDR_BITS-1:0];
|
||||
|
||||
// rstate
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
rstate <= RSTATE_ADDR;
|
||||
end else begin
|
||||
rstate <= RSTATE_IDLE;
|
||||
end else if (clk_en) begin
|
||||
case (rstate)
|
||||
RSTATE_ADDR: rstate <= s_axi_ar_fire ? RSTATE_DATA : RSTATE_ADDR;
|
||||
RSTATE_DATA: rstate <= rvalid_stall ? RSTATE_DATA : RSTATE_RESP;
|
||||
RSTATE_RESP: rstate <= s_axi_r_fire ? RSTATE_ADDR : RSTATE_RESP;
|
||||
default: rstate <= RSTATE_ADDR;
|
||||
RSTATE_IDLE: rstate <= s_axi_arvalid ? RSTATE_DATA : RSTATE_IDLE;
|
||||
RSTATE_DATA: rstate <= (s_axi_rready & s_axi_rvalid) ? RSTATE_IDLE : RSTATE_DATA;
|
||||
default: rstate <= RSTATE_IDLE;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// raddr
|
||||
always @(posedge clk) begin
|
||||
if (s_axi_ar_fire) begin
|
||||
raddr <= s_axi_araddr[ADDR_BITS-1:0];
|
||||
end
|
||||
end
|
||||
|
||||
// rdata
|
||||
always @(posedge clk) begin
|
||||
rdata <= '0;
|
||||
case (raddr)
|
||||
ADDR_AP_CTRL: begin
|
||||
rdata[0] <= ap_start_r;
|
||||
rdata[1] <= ap_done;
|
||||
rdata[2] <= ap_idle;
|
||||
rdata[3] <= ap_ready;
|
||||
rdata[7] <= auto_restart_r;
|
||||
if (clk_en) begin
|
||||
if (s_axi_ar_fire) begin
|
||||
rdata <= '0;
|
||||
case (raddr)
|
||||
ADDR_AP_CTRL: begin
|
||||
rdata[0] <= ap_start_r;
|
||||
rdata[1] <= ap_done;
|
||||
rdata[2] <= ap_idle;
|
||||
rdata[3] <= ap_ready;
|
||||
rdata[7] <= auto_restart_r;
|
||||
end
|
||||
ADDR_GIE: begin
|
||||
rdata <= 32'(gie_r);
|
||||
end
|
||||
ADDR_IER: begin
|
||||
rdata <= 32'(ier_r);
|
||||
end
|
||||
ADDR_ISR: begin
|
||||
rdata <= 32'(isr_r);
|
||||
end
|
||||
ADDR_DEV_0: begin
|
||||
rdata <= dev_caps[31:0];
|
||||
end
|
||||
ADDR_DEV_1: begin
|
||||
rdata <= dev_caps[63:32];
|
||||
end
|
||||
ADDR_ISA_0: begin
|
||||
rdata <= isa_caps[31:0];
|
||||
end
|
||||
ADDR_ISA_1: begin
|
||||
rdata <= isa_caps[63:32];
|
||||
end
|
||||
`ifdef SCOPE
|
||||
ADDR_SCP_0: begin
|
||||
rdata <= scope_bus_rdata[31:0];
|
||||
end
|
||||
ADDR_SCP_1: begin
|
||||
rdata <= scope_bus_rdata[63:32];
|
||||
end
|
||||
`endif
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
ADDR_GIE: begin
|
||||
rdata <= 32'(gie_r);
|
||||
end
|
||||
ADDR_IER: begin
|
||||
rdata <= 32'(ier_r);
|
||||
end
|
||||
ADDR_ISR: begin
|
||||
rdata <= 32'(isr_r);
|
||||
end
|
||||
ADDR_DEV_0: begin
|
||||
rdata <= dev_caps[31:0];
|
||||
end
|
||||
ADDR_DEV_1: begin
|
||||
rdata <= dev_caps[63:32];
|
||||
end
|
||||
ADDR_ISA_0: begin
|
||||
rdata <= isa_caps[31:0];
|
||||
end
|
||||
ADDR_ISA_1: begin
|
||||
rdata <= isa_caps[63:32];
|
||||
end
|
||||
`ifdef SCOPE
|
||||
ADDR_SCP_0: begin
|
||||
rdata <= scope_bus_rdata[31:0];
|
||||
end
|
||||
ADDR_SCP_1: begin
|
||||
rdata <= scope_bus_rdata[63:32];
|
||||
end
|
||||
`endif
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
assign ap_reset = ap_reset_r;
|
||||
assign ap_start = ap_start_r;
|
||||
assign interrupt = gie_r & (| isr_r);
|
||||
|
||||
assign ap_ctrl_read = s_axi_r_fire && (raddr == ADDR_AP_CTRL);
|
||||
assign mem_base = mem_r;
|
||||
|
||||
assign dcr_wr_valid = dcr_wr_valid_r;
|
||||
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);
|
||||
|
|
|
@ -10,93 +10,68 @@
|
|||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
// Reference: https://www.xilinx.com/developer/articles/porting-rtl-designs-to-vitis-rtl-kernels.html
|
||||
|
||||
`include "vortex_afu.vh"
|
||||
|
||||
module VX_afu_wrap #(
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_SIZE * 8,
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = 1
|
||||
`else
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
|
||||
`endif
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
|
||||
) (
|
||||
// System signals
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire ap_clk,
|
||||
input wire ap_rst_n,
|
||||
|
||||
// AXI4 master interface
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`endif
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
|
||||
// AXI4-Lite slave interface
|
||||
input wire s_axi_ctrl_awvalid,
|
||||
output wire s_axi_ctrl_awready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
|
||||
|
||||
input wire s_axi_ctrl_wvalid,
|
||||
output wire s_axi_ctrl_wready,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
|
||||
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
output wire s_axi_ctrl_arready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
|
||||
|
||||
output wire s_axi_ctrl_rvalid,
|
||||
input wire s_axi_ctrl_rready,
|
||||
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
|
||||
output wire [1:0] s_axi_ctrl_rresp,
|
||||
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
input wire s_axi_ctrl_bready,
|
||||
output wire [1:0] s_axi_ctrl_bresp,
|
||||
|
||||
output wire interrupt
|
||||
);
|
||||
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
|
||||
localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS;
|
||||
|
||||
typedef enum logic [1:0] {
|
||||
STATE_IDLE = 0,
|
||||
STATE_INIT = 1,
|
||||
STATE_RUN = 2,
|
||||
STATE_DONE = 3
|
||||
} state_e;
|
||||
|
||||
localparam PENDING_WR_SIZEW = 12; // max outstanding requests size
|
||||
localparam NUM_MEM_BANKS_SIZEW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
|
||||
localparam STATE_IDLE = 0;
|
||||
localparam STATE_RUN = 1;
|
||||
|
||||
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
@ -105,31 +80,30 @@ module VX_afu_wrap #(
|
|||
wire [1:0] m_axi_mem_rresp_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
// convert memory interface to array
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
`endif
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
|
||||
wire reset = ~ap_rst_n;
|
||||
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
|
||||
reg [PENDING_WR_SIZEW-1:0] vx_pending_writes;
|
||||
reg vx_reset = 1; // asserted at initialization
|
||||
reg [15:0] vx_pending_writes;
|
||||
reg vx_busy_wait;
|
||||
reg vx_running;
|
||||
|
||||
wire vx_busy;
|
||||
|
||||
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire dcr_wr_valid;
|
||||
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
|
||||
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
|
||||
|
||||
state_e state;
|
||||
reg state;
|
||||
|
||||
wire ap_reset;
|
||||
wire ap_start;
|
||||
wire ap_ctrl_read;
|
||||
wire ap_idle = (state == STATE_IDLE);
|
||||
wire ap_done = (state == STATE_DONE) && (vx_pending_writes == '0);
|
||||
wire ap_ready = ap_done;
|
||||
|
||||
wire ap_done_ack = ap_done && ap_ctrl_read;
|
||||
wire ap_idle = ~vx_running;
|
||||
wire ap_done = ~(state == STATE_RUN || vx_pending_writes != 0);
|
||||
wire ap_ready = 1'b1;
|
||||
|
||||
`ifdef SCOPE
|
||||
wire scope_bus_in;
|
||||
|
@ -137,129 +111,108 @@ module VX_afu_wrap #(
|
|||
wire scope_reset = reset;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
always @(posedge ap_clk) begin
|
||||
if (reset || ap_reset) begin
|
||||
state <= STATE_IDLE;
|
||||
vx_reset <= 1;
|
||||
state <= STATE_IDLE;
|
||||
vx_busy_wait <= 0;
|
||||
vx_running <= 0;
|
||||
end else begin
|
||||
case (state)
|
||||
STATE_IDLE: begin
|
||||
if (ap_start) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Begin initialization\n", $time))
|
||||
`TRACE(2, ("%d: STATE RUN\n", $time));
|
||||
`endif
|
||||
state <= STATE_INIT;
|
||||
vx_reset_ctr <= (`RESET_DELAY-1);
|
||||
vx_reset <= 1;
|
||||
end
|
||||
end
|
||||
STATE_INIT: begin
|
||||
if (vx_reset) begin
|
||||
// wait for reset to complete
|
||||
if (vx_reset_ctr == 0) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Initialization completed\n", $time))
|
||||
`endif
|
||||
vx_reset <= 0;
|
||||
end
|
||||
end else begin
|
||||
// wait until processor goes busy
|
||||
if (vx_busy) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
|
||||
`endif
|
||||
state <= STATE_RUN;
|
||||
end
|
||||
state <= STATE_RUN;
|
||||
vx_running <= 0;
|
||||
end
|
||||
end
|
||||
STATE_RUN: begin
|
||||
// wait until the processor is not busy
|
||||
if (~vx_busy) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Execution completed\n", $time))
|
||||
`endif
|
||||
state <= STATE_DONE;
|
||||
end
|
||||
end
|
||||
STATE_DONE: begin
|
||||
// wait for host's done acknowledgement
|
||||
if (ap_done_ack) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%t: AFU: Processor idle\n", $time))
|
||||
`endif
|
||||
state <= STATE_IDLE;
|
||||
if (vx_running) begin
|
||||
if (vx_busy_wait) begin
|
||||
// wait until processor goes busy
|
||||
if (vx_busy) begin
|
||||
vx_busy_wait <= 0;
|
||||
end
|
||||
end else begin
|
||||
// wait until the processor is not busy
|
||||
if (~vx_busy) begin
|
||||
state <= STATE_IDLE;
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%d: AFU: End execution\n", $time));
|
||||
`TRACE(2, ("%d: STATE IDLE\n", $time));
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
// wait until the reset sequence is complete
|
||||
if (vx_reset_ctr == (`RESET_DELAY-1)) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
|
||||
`endif
|
||||
vx_running <= 1;
|
||||
vx_busy_wait <= 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
endcase
|
||||
|
||||
// ensure reset network initialization
|
||||
if (vx_reset_ctr != '0) begin
|
||||
vx_reset_ctr <= vx_reset_ctr - 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
|
||||
wire [NUM_MEM_BANKS_SIZEW-1:0] cur_wr_reqs, cur_wr_rsps;
|
||||
reg m_axi_mem_wfire;
|
||||
reg m_axi_mem_bfire;
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
|
||||
VX_axi_write_ack axi_write_ack (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.awvalid(m_axi_mem_awvalid_a[i]),
|
||||
.awready(m_axi_mem_awready_a[i]),
|
||||
.wvalid (m_axi_mem_wvalid_a[i]),
|
||||
.wready (m_axi_mem_wready_a[i]),
|
||||
.tx_ack (m_axi_wr_req_fire[i]),
|
||||
`UNUSED_PIN (aw_ack),
|
||||
`UNUSED_PIN (w_ack),
|
||||
`UNUSED_PIN (tx_rdy)
|
||||
);
|
||||
always @(*) begin
|
||||
m_axi_mem_wfire = 0;
|
||||
m_axi_mem_bfire = 0;
|
||||
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
|
||||
m_axi_mem_wfire |= m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i];
|
||||
m_axi_mem_bfire |= m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_rsp_fire
|
||||
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
|
||||
end
|
||||
|
||||
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
|
||||
`POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire);
|
||||
|
||||
wire signed [NUM_MEM_BANKS_SIZEW:0] reqs_sub = (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_reqs) -
|
||||
(NUM_MEM_BANKS_SIZEW+1)'(cur_wr_rsps);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
always @(posedge ap_clk) begin
|
||||
if (reset || ap_reset) begin
|
||||
vx_pending_writes <= '0;
|
||||
end else begin
|
||||
vx_pending_writes <= vx_pending_writes + PENDING_WR_SIZEW'(reqs_sub);
|
||||
if (m_axi_mem_wfire && ~m_axi_mem_bfire)
|
||||
vx_pending_writes <= vx_pending_writes + 1;
|
||||
if (~m_axi_mem_wfire && m_axi_mem_bfire)
|
||||
vx_pending_writes <= vx_pending_writes - 1;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge ap_clk) begin
|
||||
if (state == STATE_RUN) begin
|
||||
vx_reset_ctr <= vx_reset_ctr + 1;
|
||||
end else begin
|
||||
vx_reset_ctr <= '0;
|
||||
end
|
||||
end
|
||||
|
||||
VX_afu_ctrl #(
|
||||
.S_AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
|
||||
.S_AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH)
|
||||
.AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
|
||||
.AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
|
||||
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
|
||||
) afu_ctrl (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.clk (ap_clk),
|
||||
.reset (reset || ap_reset),
|
||||
.clk_en (1'b1),
|
||||
|
||||
.s_axi_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_awready (s_axi_ctrl_awready),
|
||||
.s_axi_awaddr (s_axi_ctrl_awaddr),
|
||||
|
||||
.s_axi_wvalid (s_axi_ctrl_wvalid),
|
||||
.s_axi_wready (s_axi_ctrl_wready),
|
||||
.s_axi_wdata (s_axi_ctrl_wdata),
|
||||
.s_axi_wstrb (s_axi_ctrl_wstrb),
|
||||
|
||||
.s_axi_arvalid (s_axi_ctrl_arvalid),
|
||||
.s_axi_arready (s_axi_ctrl_arready),
|
||||
.s_axi_araddr (s_axi_ctrl_araddr),
|
||||
|
||||
.s_axi_rvalid (s_axi_ctrl_rvalid),
|
||||
.s_axi_rready (s_axi_ctrl_rready),
|
||||
.s_axi_rdata (s_axi_ctrl_rdata),
|
||||
.s_axi_rresp (s_axi_ctrl_rresp),
|
||||
|
||||
.s_axi_bvalid (s_axi_ctrl_bvalid),
|
||||
.s_axi_bready (s_axi_ctrl_bready),
|
||||
.s_axi_bresp (s_axi_ctrl_bresp),
|
||||
|
@ -271,42 +224,42 @@ module VX_afu_wrap #(
|
|||
.ap_idle (ap_idle),
|
||||
.interrupt (interrupt),
|
||||
|
||||
.ap_ctrl_read (ap_ctrl_read),
|
||||
|
||||
`ifdef SCOPE
|
||||
.scope_bus_in (scope_bus_out),
|
||||
.scope_bus_out (scope_bus_in),
|
||||
`endif
|
||||
|
||||
.mem_base (mem_base),
|
||||
|
||||
.dcr_wr_valid (dcr_wr_valid),
|
||||
.dcr_wr_addr (dcr_wr_addr),
|
||||
.dcr_wr_data (dcr_wr_data)
|
||||
);
|
||||
|
||||
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing
|
||||
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
|
||||
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
|
||||
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
|
||||
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
|
||||
end
|
||||
|
||||
`SCOPE_IO_SWITCH (2);
|
||||
`SCOPE_IO_SWITCH (2)
|
||||
|
||||
Vortex_axi #(
|
||||
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
|
||||
.AXI_ADDR_WIDTH (M_AXI_MEM_ADDR_WIDTH),
|
||||
.AXI_ADDR_WIDTH (`MEM_ADDR_WIDTH),
|
||||
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
|
||||
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
|
||||
) vortex_axi (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (vx_reset),
|
||||
.clk (ap_clk),
|
||||
.reset (reset || ap_reset || ~vx_running),
|
||||
|
||||
.m_axi_awvalid (m_axi_mem_awvalid_a),
|
||||
.m_axi_awready (m_axi_mem_awready_a),
|
||||
.m_axi_awaddr (m_axi_mem_awaddr_u),
|
||||
.m_axi_awaddr (m_axi_mem_awaddr_w),
|
||||
.m_axi_awid (m_axi_mem_awid_a),
|
||||
.m_axi_awlen (m_axi_mem_awlen_a),
|
||||
`UNUSED_PIN (m_axi_awsize),
|
||||
|
@ -330,7 +283,7 @@ module VX_afu_wrap #(
|
|||
|
||||
.m_axi_arvalid (m_axi_mem_arvalid_a),
|
||||
.m_axi_arready (m_axi_mem_arready_a),
|
||||
.m_axi_araddr (m_axi_mem_araddr_u),
|
||||
.m_axi_araddr (m_axi_mem_araddr_w),
|
||||
.m_axi_arid (m_axi_mem_arid_a),
|
||||
.m_axi_arlen (m_axi_mem_arlen_a),
|
||||
`UNUSED_PIN (m_axi_arsize),
|
||||
|
@ -357,79 +310,38 @@ module VX_afu_wrap #(
|
|||
|
||||
// SCOPE //////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef SCOPE
|
||||
`ifdef DBG_SCOPE_AFU
|
||||
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
|
||||
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
|
||||
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
|
||||
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP (0, 0, {
|
||||
ap_reset,
|
||||
ap_start,
|
||||
ap_done,
|
||||
ap_idle,
|
||||
interrupt,
|
||||
vx_reset,
|
||||
vx_busy,
|
||||
state,
|
||||
m_axi_mem_awvalid_a[0],
|
||||
m_axi_mem_awready_a[0],
|
||||
m_axi_mem_wvalid_a[0],
|
||||
m_axi_mem_wready_a[0],
|
||||
m_axi_mem_bvalid_a[0],
|
||||
m_axi_mem_bready_a[0],
|
||||
m_axi_mem_arvalid_a[0],
|
||||
m_axi_mem_arready_a[0],
|
||||
m_axi_mem_rvalid_a[0],
|
||||
m_axi_mem_rready_a[0]
|
||||
}, {
|
||||
dcr_wr_valid,
|
||||
m_axi_mem_awfire_0,
|
||||
m_axi_mem_arfire_0,
|
||||
m_axi_mem_wfire_0,
|
||||
m_axi_mem_bfire_0
|
||||
}, {
|
||||
dcr_wr_addr,
|
||||
dcr_wr_data,
|
||||
vx_pending_writes,
|
||||
m_axi_mem_awaddr_u[0],
|
||||
m_axi_mem_awid_a[0],
|
||||
m_axi_mem_bid_a[0],
|
||||
m_axi_mem_araddr_u[0],
|
||||
m_axi_mem_arid_a[0],
|
||||
m_axi_mem_rid_a[0]
|
||||
},
|
||||
reset_negedge, 1'b0, 4096
|
||||
);
|
||||
`else
|
||||
`SCOPE_IO_UNUSED(0)
|
||||
`endif
|
||||
`endif
|
||||
`define TRIGGERS { \
|
||||
reset, \
|
||||
ap_start, \
|
||||
ap_done, \
|
||||
ap_idle, \
|
||||
interrupt, \
|
||||
vx_busy_wait, \
|
||||
vx_busy, \
|
||||
vx_running \
|
||||
}
|
||||
|
||||
`ifdef CHIPSCOPE
|
||||
`ifdef DBG_SCOPE_AFU
|
||||
ila_afu ila_afu_inst (
|
||||
.clk (clk),
|
||||
.probe0 ({
|
||||
ap_reset,
|
||||
ap_start,
|
||||
ap_done,
|
||||
ap_idle,
|
||||
state,
|
||||
interrupt
|
||||
}),
|
||||
.probe1 ({
|
||||
vx_pending_writes,
|
||||
vx_busy,
|
||||
vx_reset,
|
||||
dcr_wr_valid,
|
||||
dcr_wr_addr,
|
||||
dcr_wr_data
|
||||
})
|
||||
`define PROBES { \
|
||||
vx_pending_writes \
|
||||
}
|
||||
|
||||
VX_scope_tap #(
|
||||
.SCOPE_ID (0),
|
||||
.TRIGGERW ($bits(`TRIGGERS)),
|
||||
.PROBEW ($bits(`PROBES))
|
||||
) scope_tap (
|
||||
.clk (clk),
|
||||
.reset (scope_reset_w[0]),
|
||||
.start (1'b0),
|
||||
.stop (1'b0),
|
||||
.triggers (`TRIGGERS),
|
||||
.probes (`PROBES),
|
||||
.bus_in (scope_bus_in_w[0]),
|
||||
.bus_out (scope_bus_out_w[0])
|
||||
);
|
||||
`endif
|
||||
`else
|
||||
`SCOPE_IO_UNUSED_W(0)
|
||||
`endif
|
||||
|
||||
`ifdef SIMULATION
|
||||
|
@ -440,7 +352,7 @@ module VX_afu_wrap #(
|
|||
initial begin
|
||||
$assertoff(0, vortex_axi);
|
||||
end
|
||||
always @(posedge clk) begin
|
||||
always @(posedge ap_clk) begin
|
||||
if (reset) begin
|
||||
assert_delay_ctr <= '0;
|
||||
assert_enabled <= 0;
|
||||
|
@ -459,22 +371,19 @@ module VX_afu_wrap #(
|
|||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_AFU
|
||||
always @(posedge clk) begin
|
||||
always @(posedge ap_clk) begin
|
||||
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
|
||||
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
|
||||
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
|
||||
end
|
||||
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Wr Req [%0d]: strb=0x%h, data=0x%h\n", $time, i, m_axi_mem_wstrb_a[i], m_axi_mem_wdata_a[i]))
|
||||
end
|
||||
if (m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Wr Rsp [%0d]: id=0x%0h\n", $time, i, m_axi_mem_bid_a[i]))
|
||||
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]));
|
||||
end
|
||||
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
|
||||
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
|
||||
end
|
||||
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
|
||||
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, id=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
|
||||
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -16,50 +16,37 @@
|
|||
module vortex_afu #(
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = 1
|
||||
`else
|
||||
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
|
||||
`endif
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
|
||||
) (
|
||||
// System signals
|
||||
input wire ap_clk,
|
||||
input wire ap_rst_n,
|
||||
|
||||
|
||||
// AXI4 master interface
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
`endif
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
|
||||
// AXI4-Lite slave interface
|
||||
input wire s_axi_ctrl_awvalid,
|
||||
output wire s_axi_ctrl_awready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
|
||||
|
||||
input wire s_axi_ctrl_wvalid,
|
||||
output wire s_axi_ctrl_wready,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
|
||||
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
output wire s_axi_ctrl_arready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
|
||||
|
||||
output wire s_axi_ctrl_rvalid,
|
||||
input wire s_axi_ctrl_rready,
|
||||
input wire s_axi_ctrl_rready,
|
||||
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
|
||||
output wire [1:0] s_axi_ctrl_rresp,
|
||||
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
input wire s_axi_ctrl_bready,
|
||||
output wire [1:0] s_axi_ctrl_bresp,
|
||||
|
||||
output wire interrupt
|
||||
|
||||
output wire interrupt
|
||||
);
|
||||
|
||||
VX_afu_wrap #(
|
||||
|
@ -67,39 +54,32 @@ module vortex_afu #(
|
|||
.C_S_AXI_CTRL_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
|
||||
.C_M_AXI_MEM_ID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
|
||||
.C_M_AXI_MEM_ADDR_WIDTH (C_M_AXI_MEM_ADDR_WIDTH),
|
||||
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
|
||||
.C_M_AXI_MEM_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
|
||||
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH)
|
||||
) afu_wrap (
|
||||
.clk (ap_clk),
|
||||
.reset (~ap_rst_n),
|
||||
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
|
||||
`REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
`else
|
||||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
`endif
|
||||
.ap_clk (ap_clk),
|
||||
.ap_rst_n (ap_rst_n),
|
||||
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
|
||||
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_ctrl_awready (s_axi_ctrl_awready),
|
||||
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
|
||||
|
||||
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
|
||||
.s_axi_ctrl_wvalid (s_axi_ctrl_wvalid),
|
||||
.s_axi_ctrl_wready (s_axi_ctrl_wready),
|
||||
.s_axi_ctrl_wdata (s_axi_ctrl_wdata),
|
||||
.s_axi_ctrl_wstrb (s_axi_ctrl_wstrb),
|
||||
|
||||
.s_axi_ctrl_arvalid (s_axi_ctrl_arvalid),
|
||||
.s_axi_ctrl_arready (s_axi_ctrl_arready),
|
||||
.s_axi_ctrl_araddr (s_axi_ctrl_araddr),
|
||||
|
||||
.s_axi_ctrl_rvalid (s_axi_ctrl_rvalid),
|
||||
.s_axi_ctrl_rready (s_axi_ctrl_rready),
|
||||
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
|
||||
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
|
||||
|
||||
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
|
||||
.s_axi_ctrl_bready (s_axi_ctrl_bready),
|
||||
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),
|
||||
|
||||
.interrupt (interrupt)
|
||||
);
|
||||
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -14,12 +14,12 @@
|
|||
`ifndef VORTEX_AFU_VH
|
||||
`define VORTEX_AFU_VH
|
||||
|
||||
`ifndef PLATFORM_MEMORY_OFFSET
|
||||
`define PLATFORM_MEMORY_OFFSET 0
|
||||
`ifndef M_AXI_MEM_NUM_BANKS
|
||||
`define M_AXI_MEM_NUM_BANKS 1
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_MEMORY_ID_WIDTH
|
||||
`define PLATFORM_MEMORY_ID_WIDTH 32
|
||||
`ifndef M_AXI_MEM_ID_WIDTH
|
||||
`define M_AXI_MEM_ID_WIDTH 32
|
||||
`endif
|
||||
|
||||
`define GEN_AXI_MEM(i) \
|
||||
|
|
54
hw/rtl/cache/VX_bank_flush.sv
vendored
54
hw/rtl/cache/VX_bank_flush.sv
vendored
|
@ -33,7 +33,7 @@ module VX_bank_flush #(
|
|||
output wire flush_init,
|
||||
output wire flush_valid,
|
||||
output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
|
||||
output wire [`CS_WAY_SEL_WIDTH-1:0] flush_way,
|
||||
output wire [NUM_WAYS-1:0] flush_way,
|
||||
input wire flush_ready,
|
||||
input wire mshr_empty,
|
||||
input wire bank_empty
|
||||
|
@ -48,21 +48,20 @@ module VX_bank_flush #(
|
|||
localparam STATE_WAIT2 = 4;
|
||||
localparam STATE_DONE = 5;
|
||||
|
||||
reg [2:0] state, state_n;
|
||||
reg [2:0] state_r, state_n;
|
||||
|
||||
reg [CTR_WIDTH-1:0] counter;
|
||||
reg [CTR_WIDTH-1:0] counter_r;
|
||||
|
||||
always @(*) begin
|
||||
state_n = state;
|
||||
case (state)
|
||||
//STATE_IDLE:
|
||||
default : begin
|
||||
state_n = state_r;
|
||||
case (state_r)
|
||||
STATE_IDLE: begin
|
||||
if (flush_begin) begin
|
||||
state_n = STATE_WAIT1;
|
||||
end
|
||||
end
|
||||
STATE_INIT: begin
|
||||
if (counter == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
|
||||
if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
|
||||
state_n = STATE_IDLE;
|
||||
end
|
||||
end
|
||||
|
@ -73,7 +72,7 @@ module VX_bank_flush #(
|
|||
end
|
||||
end
|
||||
STATE_FLUSH: begin
|
||||
if (counter == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
|
||||
if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
|
||||
state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
|
||||
end
|
||||
end
|
||||
|
@ -94,30 +93,35 @@ module VX_bank_flush #(
|
|||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
state <= STATE_INIT;
|
||||
counter <= '0;
|
||||
state_r <= STATE_INIT;
|
||||
counter_r <= '0;
|
||||
end else begin
|
||||
state <= state_n;
|
||||
if (state != STATE_IDLE) begin
|
||||
if ((state == STATE_INIT)
|
||||
|| ((state == STATE_FLUSH) && flush_ready)) begin
|
||||
counter <= counter + CTR_WIDTH'(1);
|
||||
state_r <= state_n;
|
||||
if (state_r != STATE_IDLE) begin
|
||||
if ((state_r == STATE_INIT)
|
||||
|| ((state_r == STATE_FLUSH) && flush_ready)) begin
|
||||
counter_r <= counter_r + CTR_WIDTH'(1);
|
||||
end
|
||||
end else begin
|
||||
counter <= '0;
|
||||
counter_r <= '0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign flush_end = (state == STATE_DONE);
|
||||
assign flush_init = (state == STATE_INIT);
|
||||
assign flush_valid = (state == STATE_FLUSH);
|
||||
assign flush_line = counter[`CS_LINE_SEL_BITS-1:0];
|
||||
assign flush_end = (state_r == STATE_DONE);
|
||||
assign flush_init = (state_r == STATE_INIT);
|
||||
assign flush_valid = (state_r == STATE_FLUSH);
|
||||
assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0];
|
||||
|
||||
if (WRITEBACK && (NUM_WAYS > 1)) begin : g_flush_way
|
||||
assign flush_way = counter[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS];
|
||||
end else begin : g_flush_way_all
|
||||
assign flush_way = '0;
|
||||
if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
|
||||
reg [NUM_WAYS-1:0] flush_way_r;
|
||||
always @(*) begin
|
||||
flush_way_r = '0;
|
||||
flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
|
||||
end
|
||||
assign flush_way = flush_way_r;
|
||||
end else begin
|
||||
assign flush_way = {NUM_WAYS{1'b1}};
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
|
525
hw/rtl/cache/VX_cache.sv
vendored
525
hw/rtl/cache/VX_cache.sv
vendored
|
@ -19,26 +19,23 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 32768,
|
||||
parameter CACHE_SIZE = 4096,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
parameter NUM_WAYS = 1,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 16,
|
||||
parameter WORD_SIZE = `XLEN/8,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 4,
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 16,
|
||||
parameter MSHR_SIZE = 8,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 4,
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
|
@ -51,23 +48,17 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
// Enable dirty bytes on writeback
|
||||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_BUF = 3,
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
|
||||
// Memory request output register
|
||||
parameter MEM_OUT_BUF = 3
|
||||
parameter MEM_OUT_BUF = 0
|
||||
) (
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
|
@ -78,37 +69,34 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
|
||||
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
|
||||
VX_mem_bus_if.master mem_bus_if
|
||||
);
|
||||
|
||||
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
|
||||
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
|
||||
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
|
||||
`STATIC_ASSERT(NUM_BANKS >= MEM_PORTS, ("invalid parameter: number of banks must be greater or equal to number of memory ports"))
|
||||
|
||||
// In writeback mode, memory fill response may issue a new memory request to handle evicted blocks.
|
||||
// We need to ensure that the memory request queue never fills up to avoid deadlock.
|
||||
`STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE"))
|
||||
|
||||
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
|
||||
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
|
||||
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
|
||||
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
|
||||
localparam MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
|
||||
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
|
||||
localparam WORD_WIDTH = WORD_SIZE * 8;
|
||||
localparam WORD_SEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
|
||||
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
|
||||
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
|
||||
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH);
|
||||
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
|
||||
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
|
||||
localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH;
|
||||
localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH));
|
||||
localparam MEM_RSP_DATAW = `CS_LINE_WIDTH + MEM_TAG_WIDTH;
|
||||
localparam MEM_PORTS_SEL_BITS = `CLOG2(MEM_PORTS);
|
||||
localparam MEM_PORTS_SEL_WIDTH = `UP(MEM_PORTS_SEL_BITS);
|
||||
localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS));
|
||||
localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS);
|
||||
|
||||
localparam REQ_XBAR_BUF = (NUM_REQS > 2) ? 2 : 0;
|
||||
localparam CORE_RSP_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
|
||||
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
|
||||
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
|
||||
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
|
||||
|
||||
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
|
||||
|
@ -122,7 +110,6 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
) core_bus2_if[NUM_REQS]();
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_flush_begin;
|
||||
wire [`UP(UUID_WIDTH)-1:0] flush_uuid;
|
||||
wire [NUM_BANKS-1:0] per_bank_flush_end;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
|
||||
|
@ -130,9 +117,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
VX_cache_flush #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.UUID_WIDTH(UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency
|
||||
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
|
||||
) flush_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -140,101 +125,92 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.core_bus_out_if (core_bus2_if),
|
||||
.bank_req_fire (per_bank_core_req_fire),
|
||||
.flush_begin (per_bank_flush_begin),
|
||||
.flush_uuid (flush_uuid),
|
||||
.flush_end (per_bank_flush_end)
|
||||
);
|
||||
|
||||
// Memory response gather /////////////////////////////////////////////////
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) mem_bus_tmp_if[MEM_PORTS]();
|
||||
// Core response buffering
|
||||
wire [NUM_REQS-1:0] core_rsp_valid_s;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
|
||||
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
|
||||
wire [NUM_REQS-1:0] core_rsp_ready_s;
|
||||
|
||||
wire [MEM_PORTS-1:0] mem_rsp_queue_valid;
|
||||
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-1:0] mem_rsp_queue_data;
|
||||
wire [MEM_PORTS-1:0] mem_rsp_queue_ready;
|
||||
`RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (MEM_RSP_DATAW),
|
||||
.SIZE (MRSQ_SIZE),
|
||||
.OUT_REG (MRSQ_SIZE > 2)
|
||||
) mem_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_bus_tmp_if[i].rsp_valid),
|
||||
.data_in (mem_bus_tmp_if[i].rsp_data),
|
||||
.ready_in (mem_bus_tmp_if[i].rsp_ready),
|
||||
.valid_out (mem_rsp_queue_valid[i]),
|
||||
.data_out (mem_rsp_queue_data[i]),
|
||||
.ready_out (mem_rsp_queue_ready[i])
|
||||
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
|
||||
.SIZE (CORE_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (core_rsp_reset[i]),
|
||||
.valid_in (core_rsp_valid_s[i]),
|
||||
.ready_in (core_rsp_ready_s[i]),
|
||||
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
|
||||
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
|
||||
.valid_out (core_bus2_if[i].rsp_valid),
|
||||
.ready_out (core_bus2_if[i].rsp_ready)
|
||||
);
|
||||
end
|
||||
|
||||
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] mem_rsp_queue_data_s;
|
||||
wire [MEM_PORTS-1:0][BANK_SEL_WIDTH-1:0] mem_rsp_queue_sel;
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_data_s
|
||||
wire [BANK_MEM_TAG_WIDTH-1:0] mem_rsp_tag_s = mem_rsp_queue_data[i][MEM_TAG_WIDTH-1:MEM_ARB_SEL_BITS];
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s = mem_rsp_queue_data[i][MEM_RSP_DATAW-1:MEM_TAG_WIDTH];
|
||||
assign mem_rsp_queue_data_s[i] = {mem_rsp_data_s, mem_rsp_tag_s};
|
||||
end
|
||||
// Memory request buffering
|
||||
wire mem_req_valid_s;
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
|
||||
wire mem_req_rw_s;
|
||||
wire [LINE_SIZE-1:0] mem_req_byteen_s;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
|
||||
wire mem_req_flush_s;
|
||||
wire mem_req_ready_s;
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_sel
|
||||
if (NUM_BANKS > 1) begin : g_multibanks
|
||||
if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel
|
||||
VX_bits_concat #(
|
||||
.L (MEM_ARB_SEL_BITS),
|
||||
.R (MEM_PORTS_SEL_BITS)
|
||||
) mem_rsp_sel_concat (
|
||||
.left_in (mem_rsp_queue_data[i][MEM_ARB_SEL_BITS-1:0]),
|
||||
.right_in (MEM_PORTS_SEL_WIDTH'(i)),
|
||||
.data_out (mem_rsp_queue_sel[i])
|
||||
);
|
||||
end else begin : g_no_arb_sel
|
||||
assign mem_rsp_queue_sel[i] = MEM_PORTS_SEL_WIDTH'(i);
|
||||
end
|
||||
end else begin : g_singlebank
|
||||
assign mem_rsp_queue_sel[i] = 0;
|
||||
end
|
||||
end
|
||||
wire mem_bus_if_flush;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_rsp_valid;
|
||||
wire [NUM_BANKS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] per_bank_mem_rsp_pdata;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
|
||||
|
||||
VX_stream_omega #(
|
||||
.NUM_INPUTS (MEM_PORTS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (MEM_RSP_DATAW-MEM_ARB_SEL_BITS),
|
||||
.ARBITER ("R"),
|
||||
.OUT_BUF (3)
|
||||
) mem_rsp_xbar (
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
|
||||
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_rsp_queue_valid),
|
||||
.data_in (mem_rsp_queue_data_s),
|
||||
.sel_in (mem_rsp_queue_sel),
|
||||
.ready_in (mem_rsp_queue_ready),
|
||||
.valid_out (per_bank_mem_rsp_valid),
|
||||
.data_out (per_bank_mem_rsp_pdata),
|
||||
`UNUSED_PIN (sel_out),
|
||||
.ready_out (per_bank_mem_rsp_ready),
|
||||
`UNUSED_PIN (collisions)
|
||||
.valid_in (mem_req_valid_s),
|
||||
.ready_in (mem_req_ready_s),
|
||||
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
|
||||
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}),
|
||||
.valid_out (mem_bus_if.req_valid),
|
||||
.ready_out (mem_bus_if.req_ready)
|
||||
);
|
||||
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_rsp_data;
|
||||
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_rsp_tag;
|
||||
assign mem_bus_if.req_data.atype = mem_bus_if_flush ? `ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_rsp_data
|
||||
assign {
|
||||
per_bank_mem_rsp_data[i],
|
||||
per_bank_mem_rsp_tag[i]
|
||||
} = per_bank_mem_rsp_pdata[i];
|
||||
end
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Core requests dispatch /////////////////////////////////////////////////
|
||||
// Memory response buffering
|
||||
wire mem_rsp_valid_s;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
|
||||
wire mem_rsp_ready_s;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
|
||||
.SIZE (MRSQ_SIZE),
|
||||
.OUT_REG (MRSQ_SIZE > 2)
|
||||
) mem_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_bus_if.rsp_valid),
|
||||
.ready_in (mem_bus_if.rsp_ready),
|
||||
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
|
||||
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
|
||||
.valid_out (mem_rsp_valid_s),
|
||||
.ready_out (mem_rsp_ready_s)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
|
||||
|
@ -244,7 +220,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
|
||||
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
|
||||
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
|
||||
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_core_req_flags;
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
|
||||
|
@ -254,21 +230,33 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
|
||||
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
|
||||
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
|
||||
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_req_tag;
|
||||
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags;
|
||||
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
|
||||
|
||||
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
|
||||
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
|
||||
end else begin
|
||||
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s)];
|
||||
end
|
||||
|
||||
// Bank requests dispatch
|
||||
|
||||
wire [NUM_REQS-1:0] core_req_valid;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
|
||||
wire [NUM_REQS-1:0] core_req_rw;
|
||||
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
|
||||
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
|
||||
wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags;
|
||||
wire [NUM_REQS-1:0] core_req_flush;
|
||||
wire [NUM_REQS-1:0] core_req_ready;
|
||||
|
||||
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
|
||||
|
@ -278,38 +266,35 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
|
||||
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_valid[i] = core_bus2_if[i].req_valid;
|
||||
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
|
||||
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
|
||||
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
|
||||
assign core_req_data[i] = core_bus2_if[i].req_data.data;
|
||||
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
|
||||
assign core_req_flags[i] = `UP(FLAGS_WIDTH)'(core_bus2_if[i].req_data.flags);
|
||||
assign core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
|
||||
assign core_bus2_if[i].req_ready = core_req_ready[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_wsel
|
||||
if (WORDS_PER_LINE > 1) begin : g_wsel
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (WORDS_PER_LINE > 1) begin
|
||||
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
|
||||
end else begin : g_no_wsel
|
||||
end else begin
|
||||
assign core_req_wsel[i] = '0;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_line_addr
|
||||
assign core_req_line_addr[i] = core_req_addr[i][(BANK_SEL_BITS + WORD_SEL_BITS) +: LINE_ADDR_WIDTH];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_bid
|
||||
if (NUM_BANKS > 1) begin : g_multibanks
|
||||
if (NUM_BANKS > 1) begin
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_bid[i] = core_req_addr[i][WORD_SEL_BITS +: BANK_SEL_BITS];
|
||||
end else begin : g_singlebank
|
||||
assign core_req_bid[i] = '0;
|
||||
end
|
||||
end else begin
|
||||
assign core_req_bid = '0;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_data_in
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_data_in[i] = {
|
||||
core_req_line_addr[i],
|
||||
core_req_rw[i],
|
||||
|
@ -317,26 +302,26 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
core_req_byteen[i],
|
||||
core_req_data[i],
|
||||
core_req_tag[i],
|
||||
core_req_flags[i]
|
||||
core_req_flush[i]
|
||||
};
|
||||
end
|
||||
|
||||
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_collisions;
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (req_xbar_reset, reset);
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (CORE_REQ_DATAW),
|
||||
.PERF_CTR_BITS (`PERF_CTR_BITS),
|
||||
.ARBITER ("R"),
|
||||
.ARBITER ("F"),
|
||||
.OUT_BUF (REQ_XBAR_BUF)
|
||||
) req_xbar (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (req_xbar_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.collisions(perf_collisions),
|
||||
`else
|
||||
|
@ -352,7 +337,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.ready_out (per_bank_core_req_ready)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_req_data_out
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
assign {
|
||||
per_bank_core_req_addr[i],
|
||||
per_bank_core_req_rw[i],
|
||||
|
@ -360,42 +345,50 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
per_bank_core_req_byteen[i],
|
||||
per_bank_core_req_data[i],
|
||||
per_bank_core_req_tag[i],
|
||||
per_bank_core_req_flags[i]
|
||||
per_bank_core_req_flush[i]
|
||||
} = core_req_data_out[i];
|
||||
end
|
||||
|
||||
// Banks access ///////////////////////////////////////////////////////////
|
||||
// Banks access
|
||||
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
|
||||
wire curr_bank_mem_rsp_valid;
|
||||
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
|
||||
end else begin
|
||||
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == bank_id);
|
||||
end
|
||||
|
||||
`RESET_RELAY (bank_reset, reset);
|
||||
|
||||
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : g_banks
|
||||
VX_cache_bank #(
|
||||
.BANK_ID (bank_id),
|
||||
.INSTANCE_ID (`SFORMATF(("%s-bank%0d", INSTANCE_ID, bank_id))),
|
||||
.INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
.REPL_POLICY (REPL_POLICY),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.FLAGS_WIDTH (FLAGS_WIDTH),
|
||||
.CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)),
|
||||
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
|
||||
) bank (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (bank_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_read_miss (perf_read_miss_per_bank[bank_id]),
|
||||
.perf_write_miss (perf_write_miss_per_bank[bank_id]),
|
||||
.perf_mshr_stall (perf_mshr_stall_per_bank[bank_id]),
|
||||
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
|
||||
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
|
||||
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
|
@ -407,7 +400,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.core_req_data (per_bank_core_req_data[bank_id]),
|
||||
.core_req_tag (per_bank_core_req_tag[bank_id]),
|
||||
.core_req_idx (per_bank_core_req_idx[bank_id]),
|
||||
.core_req_flags (per_bank_core_req_flags[bank_id]),
|
||||
.core_req_flush (per_bank_core_req_flush[bank_id]),
|
||||
.core_req_ready (per_bank_core_req_ready[bank_id]),
|
||||
|
||||
// Core response
|
||||
|
@ -419,49 +412,50 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
|
||||
// Memory request
|
||||
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
|
||||
.mem_req_addr (per_bank_mem_req_addr[bank_id]),
|
||||
.mem_req_addr (curr_bank_mem_req_addr),
|
||||
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
|
||||
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
|
||||
.mem_req_data (per_bank_mem_req_data[bank_id]),
|
||||
.mem_req_tag (per_bank_mem_req_tag[bank_id]),
|
||||
.mem_req_flags (per_bank_mem_req_flags[bank_id]),
|
||||
.mem_req_id (per_bank_mem_req_id[bank_id]),
|
||||
.mem_req_flush (per_bank_mem_req_flush[bank_id]),
|
||||
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (per_bank_mem_rsp_valid[bank_id]),
|
||||
.mem_rsp_data (per_bank_mem_rsp_data[bank_id]),
|
||||
.mem_rsp_tag (per_bank_mem_rsp_tag[bank_id]),
|
||||
.mem_rsp_valid (curr_bank_mem_rsp_valid),
|
||||
.mem_rsp_data (mem_rsp_data_s),
|
||||
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
|
||||
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
|
||||
|
||||
// Flush request
|
||||
.flush_begin (per_bank_flush_begin[bank_id]),
|
||||
.flush_uuid (flush_uuid),
|
||||
.flush_end (per_bank_flush_end[bank_id])
|
||||
);
|
||||
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
|
||||
end else begin
|
||||
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
|
||||
end
|
||||
end
|
||||
|
||||
// Core responses gather //////////////////////////////////////////////////
|
||||
// Bank responses gather
|
||||
|
||||
wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
|
||||
wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out;
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_valid_s;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
|
||||
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
|
||||
wire [NUM_REQS-1:0] core_rsp_ready_s;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_rsp_data_in
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
|
||||
end
|
||||
|
||||
`RESET_RELAY (rsp_xbar_reset, reset);
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_BANKS),
|
||||
.NUM_OUTPUTS (NUM_REQS),
|
||||
.DATAW (CORE_RSP_DATAW),
|
||||
.ARBITER ("R")
|
||||
.ARBITER ("F")
|
||||
) rsp_xbar (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (rsp_xbar_reset),
|
||||
`UNUSED_PIN (collisions),
|
||||
.valid_in (per_bank_core_rsp_valid),
|
||||
.data_in (core_rsp_data_in),
|
||||
|
@ -473,170 +467,113 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_data_s
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
|
||||
.SIZE (CORE_RSP_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (core_rsp_valid_s[i]),
|
||||
.ready_in (core_rsp_ready_s[i]),
|
||||
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
|
||||
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
|
||||
.valid_out (core_bus2_if[i].rsp_valid),
|
||||
.ready_out (core_bus2_if[i].rsp_ready)
|
||||
);
|
||||
end
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Memory request arbitration /////////////////////////////////////////////
|
||||
wire mem_req_valid_p;
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
|
||||
wire mem_req_rw_p;
|
||||
wire [LINE_SIZE-1:0] mem_req_byteen_p;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_data_p;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
|
||||
wire mem_req_flush_p;
|
||||
wire mem_req_ready_p;
|
||||
|
||||
wire [NUM_BANKS-1:0][MEM_REQ_DATAW-1:0] per_bank_mem_req_pdata;
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_req_pdata
|
||||
assign per_bank_mem_req_pdata[i] = {
|
||||
per_bank_mem_req_rw[i],
|
||||
// Memory request arbitration
|
||||
|
||||
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
assign data_in[i] = {
|
||||
per_bank_mem_req_addr[i],
|
||||
per_bank_mem_req_data[i],
|
||||
per_bank_mem_req_rw[i],
|
||||
per_bank_mem_req_byteen[i],
|
||||
per_bank_mem_req_flags[i],
|
||||
per_bank_mem_req_tag[i]
|
||||
per_bank_mem_req_data[i],
|
||||
per_bank_mem_req_id[i],
|
||||
per_bank_mem_req_flush[i]
|
||||
};
|
||||
end
|
||||
|
||||
wire [MEM_PORTS-1:0] mem_req_valid;
|
||||
wire [MEM_PORTS-1:0][MEM_REQ_DATAW-1:0] mem_req_pdata;
|
||||
wire [MEM_PORTS-1:0] mem_req_ready;
|
||||
wire [MEM_PORTS-1:0][MEM_ARB_SEL_WIDTH-1:0] mem_req_sel_out;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_BANKS),
|
||||
.NUM_OUTPUTS(MEM_PORTS),
|
||||
.DATAW (MEM_REQ_DATAW),
|
||||
.ARBITER ("R")
|
||||
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
|
||||
.ARBITER ("F")
|
||||
) mem_req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_bank_mem_req_valid),
|
||||
.data_in (per_bank_mem_req_pdata),
|
||||
.ready_in (per_bank_mem_req_ready),
|
||||
.valid_out (mem_req_valid),
|
||||
.data_out (mem_req_pdata),
|
||||
.ready_out (mem_req_ready),
|
||||
.sel_out (mem_req_sel_out)
|
||||
.data_in (data_in),
|
||||
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}),
|
||||
.valid_out (mem_req_valid_p),
|
||||
.ready_out (mem_req_ready_p),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_req_buf
|
||||
wire mem_req_rw;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
|
||||
wire [LINE_SIZE-1:0] mem_req_byteen;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
|
||||
wire [BANK_MEM_TAG_WIDTH-1:0] mem_req_tag;
|
||||
if (NUM_BANKS > 1) begin
|
||||
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr_p);
|
||||
assign mem_req_tag_p = MEM_TAG_WIDTH'({mem_req_bank_id, mem_req_id_p});
|
||||
end else begin
|
||||
assign mem_req_tag_p = MEM_TAG_WIDTH'(mem_req_id_p);
|
||||
end
|
||||
|
||||
assign {
|
||||
mem_req_rw,
|
||||
mem_req_addr,
|
||||
mem_req_data,
|
||||
mem_req_byteen,
|
||||
mem_req_flags,
|
||||
mem_req_tag
|
||||
} = mem_req_pdata[i];
|
||||
// Memory request multi-port handling
|
||||
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_w;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_w;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags_w;
|
||||
assign mem_req_valid_s = mem_req_valid_p;
|
||||
assign mem_req_addr_s = mem_req_addr_p;
|
||||
assign mem_req_tag_s = mem_req_tag_p;
|
||||
assign mem_req_flush_s = mem_req_flush_p;
|
||||
assign mem_req_ready_p = mem_req_ready_s;
|
||||
|
||||
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
|
||||
if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel
|
||||
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id;
|
||||
VX_bits_concat #(
|
||||
.L (MEM_ARB_SEL_BITS),
|
||||
.R (MEM_PORTS_SEL_BITS)
|
||||
) bank_id_concat (
|
||||
.left_in (mem_req_sel_out[i]),
|
||||
.right_in (MEM_PORTS_SEL_WIDTH'(i)),
|
||||
.data_out (mem_req_bank_id)
|
||||
);
|
||||
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, mem_req_bank_id});
|
||||
assign mem_req_tag_w = {mem_req_tag, mem_req_sel_out[i]};
|
||||
end else begin : g_no_arb_sel
|
||||
`UNUSED_VAR (mem_req_sel_out)
|
||||
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, MEM_PORTS_SEL_WIDTH'(i)});
|
||||
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
|
||||
end
|
||||
end else begin : g_mem_req_tag
|
||||
`UNUSED_VAR (mem_req_sel_out)
|
||||
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'(mem_req_addr);
|
||||
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
|
||||
end
|
||||
if (WRITE_ENABLE != 0) begin
|
||||
assign mem_req_rw_s = mem_req_rw_p;
|
||||
assign mem_req_byteen_s = mem_req_byteen_p;
|
||||
assign mem_req_data_s = mem_req_data_p;
|
||||
end else begin
|
||||
`UNUSED_VAR (mem_req_byteen_p)
|
||||
`UNUSED_VAR (mem_req_data_p)
|
||||
`UNUSED_VAR (mem_req_rw_p)
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_valid[i]),
|
||||
.ready_in (mem_req_ready[i]),
|
||||
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr_w, mem_req_data, mem_req_tag_w, mem_req_flags}),
|
||||
.data_out ({mem_bus_tmp_if[i].req_data.rw, mem_bus_tmp_if[i].req_data.byteen, mem_bus_tmp_if[i].req_data.addr, mem_bus_tmp_if[i].req_data.data, mem_bus_tmp_if[i].req_data.tag, mem_req_flags_w}),
|
||||
.valid_out (mem_bus_tmp_if[i].req_valid),
|
||||
.ready_out (mem_bus_tmp_if[i].req_ready)
|
||||
);
|
||||
|
||||
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
|
||||
assign mem_bus_tmp_if[i].req_data.flags = mem_req_flags_w;
|
||||
end else begin : g_no_mem_req_flags
|
||||
assign mem_bus_tmp_if[i].req_data.flags = '0;
|
||||
`UNUSED_VAR (mem_req_flags_w)
|
||||
end
|
||||
|
||||
if (WRITE_ENABLE) begin : g_mem_bus_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end else begin : g_mem_bus_if_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end
|
||||
assign mem_req_rw_s = 0;
|
||||
assign mem_req_byteen_s = {LINE_SIZE{1'b1}};
|
||||
assign mem_req_data_s = '0;
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
|
||||
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
|
||||
// per cycle: core_reads, core_writes
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
|
||||
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
|
||||
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
|
||||
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
|
||||
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
|
||||
end
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
|
||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
|
||||
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
||||
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
|
||||
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
|
||||
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
|
||||
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
|
||||
|
||||
wire perf_mem_stall_per_cycle = mem_bus_if.req_valid && ~mem_bus_if.req_ready;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
|
|
644
hw/rtl/cache/VX_cache_bank.sv
vendored
644
hw/rtl/cache/VX_cache_bank.sv
vendored
|
@ -47,26 +47,19 @@ module VX_cache_bank #(
|
|||
// Enable dirty bytes on writeback
|
||||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
// Core response output buffer
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_REG = 0,
|
||||
|
||||
// Memory request output register
|
||||
parameter MEM_OUT_REG = 0,
|
||||
// Memory request output buffer
|
||||
parameter MEM_OUT_BUF = 0,
|
||||
|
||||
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
|
||||
parameter MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH,
|
||||
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
|
||||
parameter WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS)
|
||||
) (
|
||||
|
@ -74,9 +67,9 @@ module VX_cache_bank #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire perf_read_miss,
|
||||
output wire perf_write_miss,
|
||||
output wire perf_mshr_stall,
|
||||
output wire perf_read_misses,
|
||||
output wire perf_write_misses,
|
||||
output wire perf_mshr_stalls,
|
||||
`endif
|
||||
|
||||
// Core Request
|
||||
|
@ -88,7 +81,7 @@ module VX_cache_bank #(
|
|||
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
|
||||
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
|
||||
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
|
||||
input wire [`UP(FLAGS_WIDTH)-1:0] core_req_flags,
|
||||
input wire core_req_flush, // flush enable
|
||||
output wire core_req_ready,
|
||||
|
||||
// Core Response
|
||||
|
@ -104,19 +97,18 @@ module VX_cache_bank #(
|
|||
output wire mem_req_rw,
|
||||
output wire [LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
output wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags,
|
||||
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr
|
||||
output wire mem_req_flush,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// flush
|
||||
input wire flush_begin,
|
||||
input wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
|
||||
output wire flush_end
|
||||
);
|
||||
|
||||
|
@ -144,45 +136,43 @@ module VX_cache_bank #(
|
|||
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
|
||||
wire replay_ready;
|
||||
|
||||
|
||||
wire valid_sel, valid_st0, valid_st1;
|
||||
wire is_init_st0;
|
||||
wire is_creq_st0, is_creq_st1;
|
||||
wire is_fill_st0, is_fill_st1;
|
||||
wire is_init_st0, is_init_st1;
|
||||
wire is_flush_st0, is_flush_st1;
|
||||
wire [`CS_WAY_SEL_WIDTH-1:0] flush_way_st0, evict_way_st0;
|
||||
wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_st0, way_idx_st1;
|
||||
wire [NUM_WAYS-1:0] flush_way_st0;
|
||||
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
|
||||
wire [`CS_LINE_SEL_BITS-1:0] line_idx_st0, line_idx_st1;
|
||||
wire [`CS_TAG_SEL_BITS-1:0] line_tag_st0, line_tag_st1;
|
||||
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0, evict_tag_st1;
|
||||
wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1;
|
||||
wire rw_sel, rw_st0, rw_st1;
|
||||
wire [WORD_SEL_WIDTH-1:0] word_idx_sel, word_idx_st0, word_idx_st1;
|
||||
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
|
||||
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
|
||||
wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
|
||||
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
|
||||
wire [`CS_WORD_WIDTH-1:0] write_word_st0, write_word_st1;
|
||||
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
|
||||
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0;
|
||||
wire is_dirty_st0, is_dirty_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
|
||||
wire valid_sel, valid_st0, valid_st1;
|
||||
wire is_creq_st0, is_creq_st1;
|
||||
wire is_fill_st0, is_fill_st1;
|
||||
wire is_replay_st0, is_replay_st1;
|
||||
wire is_hit_st0, is_hit_st1;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] flags_sel, flags_st0, flags_st1;
|
||||
wire creq_flush_sel, creq_flush_st0, creq_flush_st1;
|
||||
wire evict_dirty_st0, evict_dirty_st1;
|
||||
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
|
||||
wire [NUM_WAYS-1:0] tag_matches_st0;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
|
||||
wire mshr_pending_st0, mshr_pending_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1;
|
||||
wire mshr_empty;
|
||||
|
||||
wire flush_valid;
|
||||
wire init_valid;
|
||||
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
|
||||
wire [`CS_WAY_SEL_WIDTH-1:0] flush_way;
|
||||
wire [NUM_WAYS-1:0] flush_way;
|
||||
wire flush_ready;
|
||||
|
||||
// ensure we have no pending memory request in the bank
|
||||
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
|
||||
|
||||
// flush unit
|
||||
VX_bank_flush #(
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
|
@ -204,7 +194,11 @@ module VX_cache_bank #(
|
|||
.bank_empty (no_pending_req)
|
||||
);
|
||||
|
||||
wire pipe_stall = crsp_queue_stall;
|
||||
wire rdw_hazard1_sel;
|
||||
wire rdw_hazard2_sel;
|
||||
reg rdw_hazard3_st1;
|
||||
|
||||
wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1;
|
||||
|
||||
// inputs arbitration:
|
||||
// mshr replay has highest priority to maximize utilization since there is no miss.
|
||||
|
@ -223,217 +217,216 @@ module VX_cache_bank #(
|
|||
wire creq_enable = creq_grant && core_req_valid;
|
||||
|
||||
assign replay_ready = replay_grant
|
||||
&& ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) // needed for writethrough
|
||||
&& ~rdw_hazard1_sel
|
||||
&& ~pipe_stall;
|
||||
|
||||
assign mem_rsp_ready = fill_grant
|
||||
&& ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
|
||||
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
|
||||
&& ~rdw_hazard2_sel
|
||||
&& ~pipe_stall;
|
||||
|
||||
assign flush_ready = flush_grant
|
||||
&& ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
|
||||
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
|
||||
&& ~rdw_hazard2_sel
|
||||
&& ~pipe_stall;
|
||||
|
||||
assign core_req_ready = creq_grant
|
||||
&& ~mreq_queue_alm_full // needed for fill requests
|
||||
&& ~mshr_alm_full // needed for mshr allocation
|
||||
&& ~mreq_queue_alm_full
|
||||
&& ~mshr_alm_full
|
||||
&& ~pipe_stall;
|
||||
|
||||
wire init_fire = init_valid;
|
||||
wire replay_fire = replay_valid && replay_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
wire flush_fire = flush_valid && flush_ready;
|
||||
wire flush_fire = flush_valid && flush_ready;
|
||||
wire core_req_fire = core_req_valid && core_req_ready;
|
||||
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0];
|
||||
|
||||
wire [TAG_WIDTH-1:0] mem_rsp_tag_s;
|
||||
if (TAG_WIDTH > MEM_TAG_WIDTH) begin : g_mem_rsp_tag_s_pad
|
||||
assign mem_rsp_tag_s = {mem_rsp_tag, (TAG_WIDTH-MEM_TAG_WIDTH)'(1'b0)};
|
||||
end else begin : g_mem_rsp_tag_s_cut
|
||||
assign mem_rsp_tag_s = mem_rsp_tag[MEM_TAG_WIDTH-1 -: TAG_WIDTH];
|
||||
`UNUSED_VAR (mem_rsp_tag)
|
||||
end
|
||||
|
||||
wire [TAG_WIDTH-1:0] flush_tag;
|
||||
if (UUID_WIDTH != 0) begin : g_flush_tag_uuid
|
||||
assign flush_tag = {flush_uuid, (TAG_WIDTH-UUID_WIDTH)'(1'b0)};
|
||||
end else begin : g_flush_tag_0
|
||||
`UNUSED_VAR (flush_uuid)
|
||||
assign flush_tag = '0;
|
||||
end
|
||||
|
||||
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
|
||||
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
|
||||
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
|
||||
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
|
||||
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
|
||||
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
|
||||
assign creq_flush_sel = core_req_valid && core_req_flush;
|
||||
|
||||
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
|
||||
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
|
||||
assign word_idx_sel= replay_valid ? replay_wsel : core_req_wsel;
|
||||
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
|
||||
assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) :
|
||||
(replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : core_req_tag));
|
||||
assign flags_sel = core_req_valid ? core_req_flags : '0;
|
||||
|
||||
if (WRITE_ENABLE) begin : g_data_sel
|
||||
for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i
|
||||
if (i < `CS_WORD_WIDTH) begin : g_lo
|
||||
assign data_sel[i] = replay_valid ? replay_data[i] : (mem_rsp_valid ? mem_rsp_data[i] : core_req_data[i]);
|
||||
end else begin : g_hi
|
||||
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
|
||||
end
|
||||
end
|
||||
end else begin : g_data_sel_ro
|
||||
assign data_sel = mem_rsp_data;
|
||||
if (WRITE_ENABLE) begin
|
||||
assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data);
|
||||
end else begin
|
||||
assign data_sel[`CS_WORD_WIDTH-1:0] = mem_rsp_data[`CS_WORD_WIDTH-1:0];
|
||||
`UNUSED_VAR (core_req_data)
|
||||
`UNUSED_VAR (replay_data)
|
||||
end
|
||||
|
||||
if (UUID_WIDTH != 0) begin : g_req_uuid_sel
|
||||
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin : g_req_uuid_sel_0
|
||||
assign req_uuid_sel = '0;
|
||||
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
|
||||
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
|
||||
end
|
||||
|
||||
wire is_init_sel = init_valid;
|
||||
wire is_creq_sel = creq_enable || replay_enable;
|
||||
wire is_fill_sel = fill_enable;
|
||||
wire is_flush_sel = flush_enable;
|
||||
wire is_replay_sel = replay_enable;
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign req_uuid_sel = 0;
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~pipe_stall),
|
||||
.data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, flags_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id}),
|
||||
.data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, flags_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0})
|
||||
.data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
|
||||
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
|
||||
);
|
||||
|
||||
if (UUID_WIDTH != 0) begin : g_req_uuid_st0
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin : g_req_uuid_st0_0
|
||||
assign req_uuid_st0 = '0;
|
||||
end else begin
|
||||
assign req_uuid_st0 = 0;
|
||||
end
|
||||
|
||||
wire is_read_st0 = is_creq_st0 && ~rw_st0;
|
||||
wire is_write_st0 = is_creq_st0 && rw_st0;
|
||||
wire do_init_st0 = valid_st0 && is_init_st0;
|
||||
wire do_flush_st0 = valid_st0 && is_flush_st0;
|
||||
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
|
||||
wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0;
|
||||
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
|
||||
wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0;
|
||||
wire do_fill_st0 = valid_st0 && is_fill_st0;
|
||||
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
|
||||
wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0;
|
||||
wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0;
|
||||
|
||||
wire do_init_st0 = valid_st0 && is_init_st0;
|
||||
wire do_flush_st0 = valid_st0 && is_flush_st0;
|
||||
wire do_read_st0 = valid_st0 && is_read_st0;
|
||||
wire do_write_st0 = valid_st0 && is_write_st0;
|
||||
wire do_fill_st0 = valid_st0 && is_fill_st0;
|
||||
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
|
||||
|
||||
wire is_read_st1 = is_creq_st1 && ~rw_st1;
|
||||
wire is_write_st1 = is_creq_st1 && rw_st1;
|
||||
assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
|
||||
|
||||
wire do_read_st1 = valid_st1 && is_read_st1;
|
||||
wire do_write_st1 = valid_st1 && is_write_st1;
|
||||
|
||||
assign line_idx_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
|
||||
assign line_tag_st0 = `CS_LINE_ADDR_TAG(addr_st0);
|
||||
|
||||
assign write_word_st0 = data_st0[`CS_WORD_WIDTH-1:0];
|
||||
|
||||
wire do_lookup_st0 = do_read_st0 || do_write_st0;
|
||||
wire do_lookup_st1 = do_read_st1 || do_write_st1;
|
||||
|
||||
wire [`CS_WAY_SEL_WIDTH-1:0] victim_way_st0;
|
||||
wire [NUM_WAYS-1:0] tag_matches_st0;
|
||||
|
||||
VX_cache_repl #(
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.REPL_POLICY (REPL_POLICY)
|
||||
) cache_repl (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (pipe_stall),
|
||||
.init (do_init_st0),
|
||||
.lookup_valid(do_lookup_st1 && ~pipe_stall),
|
||||
.lookup_hit (is_hit_st1),
|
||||
.lookup_line(line_idx_st1),
|
||||
.lookup_way (way_idx_st1),
|
||||
.repl_valid (do_fill_st0 && ~pipe_stall),
|
||||
.repl_line (line_idx_st0),
|
||||
.repl_way (victim_way_st0)
|
||||
);
|
||||
|
||||
assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0;
|
||||
wire [NUM_WAYS-1:0] evict_way_st0;
|
||||
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
|
||||
|
||||
VX_cache_tags #(
|
||||
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.WRITEBACK (WRITEBACK)
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.UUID_WIDTH (UUID_WIDTH)
|
||||
) cache_tags (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
// inputs
|
||||
|
||||
.req_uuid (req_uuid_st0),
|
||||
|
||||
.stall (pipe_stall),
|
||||
|
||||
// init/flush/fill/write/lookup
|
||||
.init (do_init_st0),
|
||||
.flush (do_flush_st0 && ~pipe_stall),
|
||||
.fill (do_fill_st0 && ~pipe_stall),
|
||||
.read (do_read_st0 && ~pipe_stall),
|
||||
.write (do_write_st0 && ~pipe_stall),
|
||||
.line_idx (line_idx_st0),
|
||||
.line_tag (line_tag_st0),
|
||||
.evict_way (evict_way_st0),
|
||||
// outputs
|
||||
.flush (do_flush_st0),
|
||||
.fill (do_fill_st0),
|
||||
.write (do_cache_wr_st0),
|
||||
.lookup (do_lookup_st0),
|
||||
.line_addr (addr_st0),
|
||||
.way_sel (flush_way_st0),
|
||||
.tag_matches(tag_matches_st0),
|
||||
.evict_dirty(is_dirty_st0),
|
||||
|
||||
// replacement
|
||||
.evict_dirty(evict_dirty_st0),
|
||||
.evict_way (evict_way_st0),
|
||||
.evict_tag (evict_tag_st0)
|
||||
);
|
||||
|
||||
wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0;
|
||||
VX_onehot_encoder #(
|
||||
.N (NUM_WAYS)
|
||||
) way_idx_enc (
|
||||
.data_in (tag_matches_st0),
|
||||
.data_out (hit_idx_st0),
|
||||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0;
|
||||
|
||||
assign way_idx_st0 = is_creq_st0 ? hit_idx_st0 : evict_way_st0;
|
||||
assign is_hit_st0 = (| tag_matches_st0);
|
||||
wire is_flush2_st0 = WRITEBACK && is_flush_st0;
|
||||
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
|
||||
assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0;
|
||||
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
|
||||
|
||||
assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0;
|
||||
|
||||
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1),
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~pipe_stall),
|
||||
.data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, flags_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, data_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0}),
|
||||
.data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, flags_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, data_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1})
|
||||
.data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
|
||||
.data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
|
||||
);
|
||||
|
||||
if (UUID_WIDTH != 0) begin : g_req_uuid_st1
|
||||
// we have a tag hit
|
||||
wire is_hit_st1 = (| way_sel_st1);
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin : g_req_uuid_st1_0
|
||||
assign req_uuid_st1 = '0;
|
||||
end else begin
|
||||
assign req_uuid_st1 = 0;
|
||||
end
|
||||
|
||||
assign addr_st1 = {line_tag_st1, line_idx_st1};
|
||||
wire is_read_st1 = is_creq_st1 && ~rw_st1;
|
||||
wire is_write_st1 = is_creq_st1 && rw_st1;
|
||||
|
||||
wire do_init_st1 = valid_st1 && is_init_st1;
|
||||
wire do_fill_st1 = valid_st1 && is_fill_st1;
|
||||
wire do_flush_st1 = valid_st1 && is_flush_st1;
|
||||
|
||||
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
|
||||
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
|
||||
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
|
||||
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
|
||||
|
||||
wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1;
|
||||
wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1;
|
||||
|
||||
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
|
||||
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
|
||||
|
||||
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
|
||||
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
|
||||
|
||||
assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0];
|
||||
|
||||
`UNUSED_VAR (do_write_miss_st1)
|
||||
|
||||
// ensure mshr replay always get a hit
|
||||
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1 && ~is_hit_st1), ("%t: missed mshr replay", $time))
|
||||
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay"));
|
||||
|
||||
assign write_word_st1 = data_st1[`CS_WORD_WIDTH-1:0];
|
||||
`UNUSED_VAR (data_st1)
|
||||
// both tag and data stores use BRAM with no read-during-write protection.
|
||||
// we need to stall the pipeline to prevent read-after-write hazards.
|
||||
assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill
|
||||
assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceeding write
|
||||
always @(posedge clk) begin
|
||||
// stall reads following writes to same line address
|
||||
rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1)
|
||||
&& ~rdw_hazard3_st1; // release pipeline stall
|
||||
end
|
||||
|
||||
wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_st1;
|
||||
wire [LINE_SIZE-1:0] evict_byteen_st1;
|
||||
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
|
||||
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
|
||||
wire [LINE_SIZE-1:0] write_byteen_st1;
|
||||
|
||||
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
|
||||
wire [LINE_SIZE-1:0] dirty_byteen_st1;
|
||||
|
||||
if (`CS_WORDS_PER_LINE > 1) begin
|
||||
reg [LINE_SIZE-1:0] write_byteen_r;
|
||||
always @(*) begin
|
||||
write_byteen_r = '0;
|
||||
write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1;
|
||||
end
|
||||
assign write_byteen_st1 = write_byteen_r;
|
||||
end else begin
|
||||
assign write_byteen_st1 = byteen_st1;
|
||||
end
|
||||
|
||||
VX_cache_data #(
|
||||
.INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
|
@ -441,57 +434,56 @@ module VX_cache_bank #(
|
|||
.WORD_SIZE (WORD_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES)
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
.UUID_WIDTH (UUID_WIDTH)
|
||||
) cache_data (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
// inputs
|
||||
.init (do_init_st0),
|
||||
.fill (do_fill_st0 && ~pipe_stall),
|
||||
.flush (do_flush_st0 && ~pipe_stall),
|
||||
.read (do_read_st0 && ~pipe_stall),
|
||||
.write (do_write_st0 && ~pipe_stall),
|
||||
.evict_way (evict_way_st0),
|
||||
.tag_matches(tag_matches_st0),
|
||||
.line_idx (line_idx_st0),
|
||||
.fill_data (data_st0),
|
||||
.write_word (write_word_st0),
|
||||
.word_idx (word_idx_st0),
|
||||
.write_byteen(byteen_st0),
|
||||
.way_idx_r (way_idx_st1),
|
||||
// outputs
|
||||
|
||||
.req_uuid (req_uuid_st1),
|
||||
|
||||
.stall (pipe_stall),
|
||||
|
||||
.init (do_init_st1),
|
||||
.read (do_cache_rd_st1),
|
||||
.fill (do_fill_st1),
|
||||
.flush (do_flush_st1),
|
||||
.write (do_cache_wr_st1),
|
||||
.way_sel (way_sel_st1),
|
||||
.line_addr (addr_st1),
|
||||
.wsel (wsel_st1),
|
||||
.fill_data (fill_data_st1),
|
||||
.write_data (write_data_st1),
|
||||
.write_byteen(write_byteen_st1),
|
||||
.read_data (read_data_st1),
|
||||
.evict_byteen(evict_byteen_st1)
|
||||
.dirty_data (dirty_data_st1),
|
||||
.dirty_byteen(dirty_byteen_st1)
|
||||
);
|
||||
|
||||
// only allocate MSHR entries for non-replay core requests
|
||||
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~is_replay_st0;
|
||||
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~is_replay_st1;
|
||||
wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
|
||||
wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0;
|
||||
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
|
||||
wire mshr_lookup_st0 = mshr_allocate_st0;
|
||||
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
|
||||
|
||||
// release allocated mshr entry if we had a hit
|
||||
wire mshr_release_st1;
|
||||
if (WRITEBACK) begin : g_mshr_release
|
||||
if (WRITEBACK) begin
|
||||
assign mshr_release_st1 = is_hit_st1;
|
||||
end else begin : g_mshr_release_ro
|
||||
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address.
|
||||
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content.
|
||||
// this can happen when writes are sent to memory late, when a related fill was already in flight.
|
||||
end else begin
|
||||
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
|
||||
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
|
||||
// this can happen when writes are sent late, when the fill was already in flight.
|
||||
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
|
||||
end
|
||||
|
||||
wire mshr_release_fire = mshr_finalize_st1 && mshr_release_st1 && ~pipe_stall;
|
||||
|
||||
wire [1:0] mshr_dequeue;
|
||||
`POP_COUNT(mshr_dequeue, {replay_fire, mshr_release_fire});
|
||||
|
||||
VX_pending_size #(
|
||||
.SIZE (MSHR_SIZE),
|
||||
.DECRW (2)
|
||||
.SIZE (MSHR_SIZE)
|
||||
) mshr_pending_size (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (core_req_fire),
|
||||
.decr (mshr_dequeue),
|
||||
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
|
||||
.empty (mshr_empty),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
.full (mshr_alm_full),
|
||||
|
@ -500,12 +492,11 @@ module VX_cache_bank #(
|
|||
);
|
||||
|
||||
VX_cache_mshr #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-mshr", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
|
||||
.BANK_ID (BANK_ID),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
|
||||
) cache_mshr (
|
||||
|
@ -513,7 +504,7 @@ module VX_cache_bank #(
|
|||
.reset (reset),
|
||||
|
||||
.deq_req_uuid (req_uuid_sel),
|
||||
.alc_req_uuid (req_uuid_st0),
|
||||
.lkp_req_uuid (req_uuid_st0),
|
||||
.fin_req_uuid (req_uuid_st1),
|
||||
|
||||
// memory fill
|
||||
|
@ -530,23 +521,37 @@ module VX_cache_bank #(
|
|||
.dequeue_ready (replay_ready),
|
||||
|
||||
// allocate
|
||||
.allocate_valid (mshr_allocate_st0 && ~pipe_stall),
|
||||
.allocate_valid (mshr_allocate_st0),
|
||||
.allocate_addr (addr_st0),
|
||||
.allocate_rw (rw_st0),
|
||||
.allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0}),
|
||||
.allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}),
|
||||
.allocate_id (mshr_alloc_id_st0),
|
||||
.allocate_pending(mshr_pending_st0),
|
||||
.allocate_previd(mshr_previd_st0),
|
||||
.allocate_prev (mshr_prev_st0),
|
||||
`UNUSED_PIN (allocate_ready),
|
||||
|
||||
// lookup
|
||||
.lookup_valid (mshr_lookup_st0),
|
||||
.lookup_addr (addr_st0),
|
||||
.lookup_pending (mshr_lookup_pending_st0),
|
||||
.lookup_rw (mshr_lookup_rw_st0),
|
||||
|
||||
// finalize
|
||||
.finalize_valid (mshr_finalize_st1 && ~pipe_stall),
|
||||
.finalize_is_release(mshr_release_st1),
|
||||
.finalize_is_pending(mshr_pending_st1),
|
||||
.finalize_valid (mshr_finalize_st1),
|
||||
.finalize_release(mshr_release_st1),
|
||||
.finalize_pending(mshr_pending_st1),
|
||||
.finalize_id (mshr_id_st1),
|
||||
.finalize_previd(mshr_previd_st1)
|
||||
.finalize_prev (mshr_prev_st1)
|
||||
);
|
||||
|
||||
// check if there are pending requests to same line in the MSHR
|
||||
wire [MSHR_SIZE-1:0] lookup_matches;
|
||||
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
|
||||
assign lookup_matches[i] = mshr_lookup_pending_st0[i]
|
||||
&& (i != mshr_alloc_id_st0) // exclude current mshr id
|
||||
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
|
||||
end
|
||||
assign mshr_pending_st0 = (| lookup_matches);
|
||||
|
||||
// schedule core response
|
||||
|
||||
wire crsp_queue_valid, crsp_queue_ready;
|
||||
|
@ -554,19 +559,19 @@ module VX_cache_bank #(
|
|||
wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
|
||||
wire [TAG_WIDTH-1:0] crsp_queue_tag;
|
||||
|
||||
assign crsp_queue_valid = do_read_st1 && is_hit_st1;
|
||||
assign crsp_queue_valid = do_cache_rd_st1;
|
||||
assign crsp_queue_idx = req_idx_st1;
|
||||
assign crsp_queue_data = read_data_st1[word_idx_st1];
|
||||
assign crsp_queue_data = read_data_st1;
|
||||
assign crsp_queue_tag = tag_st1;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
|
||||
.SIZE (CRSQ_SIZE),
|
||||
.OUT_REG (CORE_OUT_REG)
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (crsp_queue_valid),
|
||||
.valid_in (crsp_queue_valid && ~rdw_hazard3_st1),
|
||||
.ready_in (crsp_queue_ready),
|
||||
.data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
|
||||
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
|
||||
|
@ -582,93 +587,59 @@ module VX_cache_bank #(
|
|||
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
|
||||
wire [LINE_SIZE-1:0] mreq_queue_byteen;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
|
||||
wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
|
||||
wire mreq_queue_rw;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mreq_queue_flags;
|
||||
wire mreq_queue_flush;
|
||||
|
||||
wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK);
|
||||
wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1;
|
||||
wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
|
||||
wire do_writeback_st1 = do_fill_or_flush_st1 && is_dirty_st1;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_st1 = {evict_tag_st1, line_idx_st1};
|
||||
wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1;
|
||||
|
||||
if (WRITE_ENABLE) begin : g_mreq_queue
|
||||
if (WRITEBACK) begin : g_wb
|
||||
if (DIRTY_BYTES) begin : g_dirty_bytes
|
||||
// ensure dirty bytes match the tag info
|
||||
wire has_dirty_bytes = (| evict_byteen_st1);
|
||||
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (is_dirty_st1 == has_dirty_bytes), ("%t: missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", $time, is_dirty_st1, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID)))
|
||||
end
|
||||
// issue a fill request on a read/write miss
|
||||
// issue a writeback on a dirty line eviction
|
||||
assign mreq_queue_push = ((do_lookup_st1 && ~is_hit_st1 && ~mshr_pending_st1)
|
||||
|| do_writeback_st1)
|
||||
&& ~pipe_stall;
|
||||
assign mreq_queue_addr = is_fill_or_flush_st1 ? evict_addr_st1 : addr_st1;
|
||||
assign mreq_queue_rw = is_fill_or_flush_st1;
|
||||
assign mreq_queue_data = read_data_st1;
|
||||
assign mreq_queue_byteen = is_fill_or_flush_st1 ? evict_byteen_st1 : '1;
|
||||
`UNUSED_VAR (write_word_st1)
|
||||
`UNUSED_VAR (byteen_st1)
|
||||
end else begin : g_wt
|
||||
wire [LINE_SIZE-1:0] line_byteen;
|
||||
VX_demux #(
|
||||
.DATAW (WORD_SIZE),
|
||||
.N (`CS_WORDS_PER_LINE)
|
||||
) byteen_demux (
|
||||
.sel_in (word_idx_st1),
|
||||
.data_in (byteen_st1),
|
||||
.data_out (line_byteen)
|
||||
);
|
||||
// issue a fill request on a read miss
|
||||
// issue a memory write on a write request
|
||||
assign mreq_queue_push = ((do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1)
|
||||
|| do_write_st1)
|
||||
&& ~pipe_stall;
|
||||
assign mreq_queue_addr = addr_st1;
|
||||
assign mreq_queue_rw = rw_st1;
|
||||
assign mreq_queue_data = {`CS_WORDS_PER_LINE{write_word_st1}};
|
||||
assign mreq_queue_byteen = rw_st1 ? line_byteen : '1;
|
||||
`UNUSED_VAR (is_fill_or_flush_st1)
|
||||
`UNUSED_VAR (do_writeback_st1)
|
||||
`UNUSED_VAR (evict_addr_st1)
|
||||
`UNUSED_VAR (evict_byteen_st1)
|
||||
if (WRITEBACK) begin
|
||||
if (DIRTY_BYTES) begin
|
||||
// ensure dirty bytes match the tag info
|
||||
wire has_dirty_bytes = (| dirty_byteen_st1);
|
||||
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID)));
|
||||
end
|
||||
end else begin : g_mreq_queue_ro
|
||||
// issue a fill request on a read miss
|
||||
assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1)
|
||||
&& ~pipe_stall;
|
||||
assign mreq_queue_addr = addr_st1;
|
||||
assign mreq_queue_rw = 0;
|
||||
assign mreq_queue_data = '0;
|
||||
assign mreq_queue_byteen = '1;
|
||||
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|
||||
|| do_writeback_st1)
|
||||
&& ~rdw_hazard3_st1;
|
||||
end else begin
|
||||
`UNUSED_VAR (do_writeback_st1)
|
||||
`UNUSED_VAR (evict_addr_st1)
|
||||
`UNUSED_VAR (evict_byteen_st1)
|
||||
`UNUSED_VAR (write_word_st1)
|
||||
`UNUSED_VAR (byteen_st1)
|
||||
end
|
||||
|
||||
if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid
|
||||
assign mreq_queue_tag = {req_uuid_st1, mshr_id_st1};
|
||||
end else begin : g_mreq_queue_tag
|
||||
assign mreq_queue_tag = mshr_id_st1;
|
||||
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|
||||
|| do_creq_wr_st1)
|
||||
&& ~rdw_hazard3_st1;
|
||||
end
|
||||
|
||||
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
|
||||
assign mreq_queue_flags = flags_st1;
|
||||
assign mreq_queue_addr = addr_st1;
|
||||
assign mreq_queue_id = mshr_id_st1;
|
||||
assign mreq_queue_flush = creq_flush_st1;
|
||||
|
||||
if (WRITE_ENABLE) begin
|
||||
assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1;
|
||||
assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1;
|
||||
assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1;
|
||||
end else begin
|
||||
assign mreq_queue_rw = 0;
|
||||
assign mreq_queue_data = 0;
|
||||
assign mreq_queue_byteen = 0;
|
||||
`UNUSED_VAR (dirty_data_st1)
|
||||
`UNUSED_VAR (dirty_byteen_st1)
|
||||
end
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
|
||||
.DEPTH (MREQ_SIZE),
|
||||
.ALM_FULL (MREQ_SIZE - PIPELINE_STAGES),
|
||||
.OUT_REG (MEM_OUT_REG)
|
||||
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (mreq_queue_push),
|
||||
.pop (mreq_queue_pop),
|
||||
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_flags}),
|
||||
.data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_flags}),
|
||||
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
|
||||
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}),
|
||||
.empty (mreq_queue_empty),
|
||||
.alm_full (mreq_queue_alm_full),
|
||||
`UNUSED_PIN (full),
|
||||
|
@ -678,101 +649,44 @@ module VX_cache_bank #(
|
|||
|
||||
assign mem_req_valid = ~mreq_queue_empty;
|
||||
|
||||
`UNUSED_VAR (do_lookup_st0)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_read_miss = do_read_st1 && ~is_hit_st1;
|
||||
assign perf_write_miss = do_write_st1 && ~is_hit_st1;
|
||||
assign perf_mshr_stall = mshr_alm_full;
|
||||
assign perf_read_misses = do_read_miss_st1;
|
||||
assign perf_write_misses = do_write_miss_st1;
|
||||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
|
||||
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
|
||||
&& ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
|
||||
|
||||
wire [`XLEN-1:0] mem_rsp_full_addr = `CS_BANK_TO_FULL_ADDR(mem_rsp_addr, BANK_ID);
|
||||
wire [`XLEN-1:0] replay_full_addr = `CS_BANK_TO_FULL_ADDR(replay_addr, BANK_ID);
|
||||
wire [`XLEN-1:0] core_req_full_addr = `CS_BANK_TO_FULL_ADDR(core_req_addr, BANK_ID);
|
||||
wire [`XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(addr_st0, BANK_ID);
|
||||
wire [`XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID);
|
||||
wire [`XLEN-1:0] mreq_queue_full_addr = `CS_BANK_TO_FULL_ADDR(mreq_queue_addr, BANK_ID);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (input_stall || pipe_stall) begin
|
||||
`TRACE(4, ("%t: *** %s stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID,
|
||||
crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full))
|
||||
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1));
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(2, ("%t: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
mem_rsp_full_addr, mem_rsp_id, mem_rsp_data, req_uuid_sel))
|
||||
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
|
||||
end
|
||||
if (replay_fire) begin
|
||||
`TRACE(2, ("%t: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
replay_full_addr, replay_tag, replay_idx, req_uuid_sel))
|
||||
`TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
|
||||
end
|
||||
if (core_req_fire) begin
|
||||
if (core_req_rw) begin
|
||||
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
core_req_full_addr, core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
core_req_full_addr, core_req_tag, core_req_idx, req_uuid_sel))
|
||||
end
|
||||
end
|
||||
if (do_init_st0) begin
|
||||
`TRACE(3, ("%t: %s tags-init: addr=0x%0h, line=%0d\n", $time, INSTANCE_ID, full_addr_st0, line_idx_st0))
|
||||
end
|
||||
if (do_fill_st0 && ~pipe_stall) begin
|
||||
`TRACE(3, ("%t: %s tags-fill: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
|
||||
end
|
||||
if (do_flush_st0 && ~pipe_stall) begin
|
||||
`TRACE(3, ("%t: %s tags-flush: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
|
||||
end
|
||||
if (do_lookup_st0 && ~pipe_stall) begin
|
||||
if (is_hit_st0) begin
|
||||
`TRACE(3, ("%t: %s tags-hit: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
|
||||
end else begin
|
||||
`TRACE(3, ("%t: %s tags-miss: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
|
||||
end
|
||||
end
|
||||
if (do_fill_st0 && ~pipe_stall) begin
|
||||
`TRACE(3, ("%t: %s data-fill: addr=0x%0h, way=%0d, line=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st0, way_idx_st0, line_idx_st0, data_st0, req_uuid_st0))
|
||||
end
|
||||
if (do_flush_st0 && ~pipe_stall) begin
|
||||
`TRACE(3, ("%t: %s data-flush: addr=0x%0h, way=%0d, line=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st0, way_idx_st0, line_idx_st0, req_uuid_st0))
|
||||
end
|
||||
if (do_read_st1 && is_hit_st1 && ~pipe_stall) begin
|
||||
`TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, crsp_queue_data, req_uuid_st1))
|
||||
end
|
||||
if (do_write_st1 && is_hit_st1 && ~pipe_stall) begin
|
||||
`TRACE(3, ("%t: %s data-write: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, byteen_st1, write_word_st1, req_uuid_st1))
|
||||
if (core_req_rw)
|
||||
`TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
|
||||
else
|
||||
`TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
|
||||
end
|
||||
if (crsp_queue_fire) begin
|
||||
`TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
full_addr_st1, crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1))
|
||||
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
|
||||
end
|
||||
if (mreq_queue_push) begin
|
||||
if (!WRITEBACK && do_write_st1) begin
|
||||
`TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
|
||||
end else if (WRITEBACK && do_writeback_st1) begin
|
||||
`TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
|
||||
mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
mreq_queue_full_addr, mshr_id_st1, req_uuid_st1))
|
||||
end
|
||||
if (do_creq_wr_st1 && !WRITEBACK)
|
||||
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
|
||||
else if (do_writeback_st1)
|
||||
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
|
||||
else
|
||||
`TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
461
hw/rtl/cache/VX_cache_bypass.sv
vendored
461
hw/rtl/cache/VX_cache_bypass.sv
vendored
|
@ -15,10 +15,10 @@
|
|||
|
||||
module VX_cache_bypass #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter MEM_PORTS = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
parameter CACHE_ENABLE = 0,
|
||||
parameter PASSTHRU = 0,
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
parameter WORD_SIZE = 1,
|
||||
parameter LINE_SIZE = 1,
|
||||
|
@ -29,11 +29,14 @@ module VX_cache_bypass #(
|
|||
|
||||
parameter MEM_ADDR_WIDTH = 1,
|
||||
parameter MEM_TAG_IN_WIDTH = 1,
|
||||
parameter MEM_TAG_OUT_WIDTH = 1,
|
||||
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
parameter MEM_OUT_BUF = 0
|
||||
parameter MEM_OUT_BUF = 0,
|
||||
|
||||
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -45,222 +48,304 @@ module VX_cache_bypass #(
|
|||
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
|
||||
|
||||
// Memory request in
|
||||
VX_mem_bus_if.slave mem_bus_in_if [MEM_PORTS],
|
||||
VX_mem_bus_if.slave mem_bus_in_if,
|
||||
|
||||
// Memory request out
|
||||
VX_mem_bus_if.master mem_bus_out_if [MEM_PORTS]
|
||||
VX_mem_bus_if.master mem_bus_out_if
|
||||
);
|
||||
localparam DIRECT_PASSTHRU = !CACHE_ENABLE && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == MEM_PORTS);
|
||||
localparam CORE_DATA_WIDTH = WORD_SIZE * 8;
|
||||
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
|
||||
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
|
||||
|
||||
localparam CORE_TAG_ID_WIDTH = CORE_TAG_WIDTH - UUID_WIDTH;
|
||||
localparam MEM_TAG_ID_WIDTH = `CLOG2(`CDIV(NUM_REQS, MEM_PORTS)) + CORE_TAG_ID_WIDTH;
|
||||
localparam MEM_TAG_NC1_WIDTH = UUID_WIDTH + MEM_TAG_ID_WIDTH;
|
||||
localparam MEM_TAG_NC2_WIDTH = MEM_TAG_NC1_WIDTH + WSEL_BITS;
|
||||
localparam MEM_TAG_OUT_WIDTH = CACHE_ENABLE ? `MAX(MEM_TAG_IN_WIDTH, MEM_TAG_NC2_WIDTH) : MEM_TAG_NC2_WIDTH;
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
|
||||
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
|
||||
|
||||
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
|
||||
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
|
||||
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
|
||||
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
|
||||
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
|
||||
|
||||
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
|
||||
// handle non-cacheable core request switch ///////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH)
|
||||
) core_bus_nc_switch_if[(CACHE_ENABLE ? 2 : 1) * NUM_REQS]();
|
||||
// handle core requests ///////////////////////////////////////////////////
|
||||
|
||||
wire core_req_nc_valid;
|
||||
wire [NUM_REQS-1:0] core_req_nc_valids;
|
||||
wire [NUM_REQS-1:0] core_req_nc_idxs;
|
||||
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
|
||||
wire [NUM_REQS-1:0] core_req_nc_sel;
|
||||
wire core_req_nc_ready;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_is_nc
|
||||
if (CACHE_ENABLE) begin : g_cache
|
||||
assign core_req_nc_sel[i] = ~core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
|
||||
end else begin : g_no_cache
|
||||
assign core_req_nc_sel[i] = 1'b0;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (PASSTHRU != 0) begin
|
||||
assign core_req_nc_idxs[i] = 1'b1;
|
||||
end else if (NC_ENABLE) begin
|
||||
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO];
|
||||
end else begin
|
||||
assign core_req_nc_idxs[i] = 1'b0;
|
||||
end
|
||||
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
|
||||
end
|
||||
|
||||
VX_mem_switch #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS ((CACHE_ENABLE ? 2 : 1) * NUM_REQS),
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF))
|
||||
) core_bus_nc_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_sel (core_req_nc_sel),
|
||||
.bus_in_if (core_bus_in_if),
|
||||
.bus_out_if(core_bus_nc_switch_if)
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.TYPE (PASSTHRU ? "R" : "P")
|
||||
) core_req_nc_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.requests (core_req_nc_valids),
|
||||
.grant_index (core_req_nc_idx),
|
||||
.grant_onehot (core_req_nc_sel),
|
||||
.grant_valid (core_req_nc_valid),
|
||||
.grant_ready (core_req_nc_ready)
|
||||
);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH)
|
||||
) core_bus_in_nc_if[NUM_REQS]();
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_nc_switch_if
|
||||
|
||||
assign core_bus_in_nc_if[i].req_valid = core_bus_nc_switch_if[0 * NUM_REQS + i].req_valid;
|
||||
assign core_bus_in_nc_if[i].req_data = core_bus_nc_switch_if[0 * NUM_REQS + i].req_data;
|
||||
assign core_bus_nc_switch_if[0 * NUM_REQS + i].req_ready = core_bus_in_nc_if[i].req_ready;
|
||||
|
||||
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_valid = core_bus_in_nc_if[i].rsp_valid;
|
||||
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_data = core_bus_in_nc_if[i].rsp_data;
|
||||
assign core_bus_in_nc_if[i].rsp_ready = core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_ready;
|
||||
|
||||
if (CACHE_ENABLE) begin : g_cache
|
||||
assign core_bus_out_if[i].req_valid = core_bus_nc_switch_if[1 * NUM_REQS + i].req_valid;
|
||||
assign core_bus_out_if[i].req_data = core_bus_nc_switch_if[1 * NUM_REQS + i].req_data;
|
||||
assign core_bus_nc_switch_if[1 * NUM_REQS + i].req_ready = core_bus_out_if[i].req_ready;
|
||||
|
||||
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_valid = core_bus_out_if[i].rsp_valid;
|
||||
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_data = core_bus_out_if[i].rsp_data;
|
||||
assign core_bus_out_if[i].rsp_ready = core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_ready;
|
||||
end else begin : g_no_cache
|
||||
`INIT_VX_MEM_BUS_IF (core_bus_out_if[i])
|
||||
end
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
|
||||
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
|
||||
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
|
||||
: core_bus_out_if[i].req_ready;
|
||||
end
|
||||
|
||||
// handle memory requests /////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_NC1_WIDTH)
|
||||
) core_bus_nc_arb_if[MEM_PORTS]();
|
||||
wire mem_req_out_valid;
|
||||
wire mem_req_out_rw;
|
||||
wire [LINE_SIZE-1:0] mem_req_out_byteen;
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
|
||||
wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
|
||||
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
|
||||
wire mem_req_out_ready;
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS(MEM_PORTS),
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_WIDTH),
|
||||
.TAG_SEL_IDX(TAG_SEL_IDX),
|
||||
.ARBITER (CACHE_ENABLE ? "P" : "R"),
|
||||
.REQ_OUT_BUF(0),
|
||||
.RSP_OUT_BUF(0)
|
||||
) core_bus_nc_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (core_bus_in_nc_if),
|
||||
.bus_out_if (core_bus_nc_arb_if)
|
||||
);
|
||||
wire core_req_nc_sel_rw;
|
||||
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
|
||||
wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
|
||||
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_NC2_WIDTH)
|
||||
) mem_bus_out_nc_if[MEM_PORTS]();
|
||||
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_nc_mux_in[i] = {
|
||||
core_bus_in_if[i].req_data.rw,
|
||||
core_bus_in_if[i].req_data.byteen,
|
||||
core_bus_in_if[i].req_data.addr,
|
||||
core_bus_in_if[i].req_data.atype,
|
||||
core_bus_in_if[i].req_data.data,
|
||||
core_bus_in_if[i].req_data.tag
|
||||
};
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_nc
|
||||
wire core_req_nc_arb_rw;
|
||||
wire [WORD_SIZE-1:0] core_req_nc_arb_byteen;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_arb_addr;
|
||||
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_arb_flags;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_nc_arb_data;
|
||||
wire [MEM_TAG_NC1_WIDTH-1:0] core_req_nc_arb_tag;
|
||||
assign {
|
||||
core_req_nc_sel_rw,
|
||||
core_req_nc_sel_byteen,
|
||||
core_req_nc_sel_addr,
|
||||
core_req_nc_sel_atype,
|
||||
core_req_nc_sel_data,
|
||||
core_req_nc_sel_tag
|
||||
} = core_req_nc_mux_in[core_req_nc_idx];
|
||||
|
||||
assign {
|
||||
core_req_nc_arb_rw,
|
||||
core_req_nc_arb_addr,
|
||||
core_req_nc_arb_data,
|
||||
core_req_nc_arb_byteen,
|
||||
core_req_nc_arb_flags,
|
||||
core_req_nc_arb_tag
|
||||
} = core_bus_nc_arb_if[i].req_data;
|
||||
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
|
||||
|
||||
logic [MEM_ADDR_WIDTH-1:0] core_req_nc_arb_addr_w;
|
||||
logic [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] core_req_nc_arb_byteen_w;
|
||||
logic [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] core_req_nc_arb_data_w;
|
||||
logic [CORE_DATA_WIDTH-1:0] core_rsp_nc_arb_data_w;
|
||||
wire [MEM_TAG_NC2_WIDTH-1:0] core_req_nc_arb_tag_w;
|
||||
wire [MEM_TAG_NC1_WIDTH-1:0] core_rsp_nc_arb_tag_w;
|
||||
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
|
||||
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
|
||||
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
|
||||
assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype;
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin : g_multi_word_line
|
||||
wire [WSEL_BITS-1:0] rsp_wsel;
|
||||
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_arb_addr[WSEL_BITS-1:0];
|
||||
always @(*) begin
|
||||
core_req_nc_arb_byteen_w = '0;
|
||||
core_req_nc_arb_byteen_w[req_wsel] = core_req_nc_arb_byteen;
|
||||
core_req_nc_arb_data_w = 'x;
|
||||
core_req_nc_arb_data_w[req_wsel] = core_req_nc_arb_data;
|
||||
end
|
||||
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
|
||||
|
||||
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin
|
||||
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
|
||||
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
|
||||
|
||||
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
|
||||
|
||||
always @(*) begin
|
||||
mem_req_byteen_in_r = '0;
|
||||
mem_req_byteen_in_r[req_wsel] = core_req_nc_sel_byteen;
|
||||
|
||||
mem_req_data_in_r = 'x;
|
||||
mem_req_data_in_r[req_wsel] = core_req_nc_sel_data;
|
||||
end
|
||||
|
||||
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
|
||||
end else begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
|
||||
end
|
||||
end else begin
|
||||
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
|
||||
end else begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
|
||||
end
|
||||
end
|
||||
|
||||
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
|
||||
end else begin
|
||||
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
|
||||
end
|
||||
|
||||
if (PASSTHRU != 0) begin
|
||||
assign mem_req_out_tag = mem_req_tag_bypass;
|
||||
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
|
||||
end else begin
|
||||
if (NC_ENABLE) begin
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_NC1_WIDTH),
|
||||
.S (WSEL_BITS),
|
||||
.N (MEM_TAG_OUT_WIDTH-1),
|
||||
.S (1),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) wsel_insert (
|
||||
.data_in (core_req_nc_arb_tag),
|
||||
.ins_in (req_wsel),
|
||||
.data_out (core_req_nc_arb_tag_w)
|
||||
) mem_req_tag_in_nc_insert (
|
||||
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
|
||||
.ins_in (~mem_bus_in_if.req_valid),
|
||||
.data_out (mem_req_out_tag)
|
||||
);
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_NC2_WIDTH),
|
||||
.S (WSEL_BITS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) wsel_remove (
|
||||
.data_in (mem_bus_out_nc_if[i].rsp_data.tag),
|
||||
.sel_out (rsp_wsel),
|
||||
.data_out (core_rsp_nc_arb_tag_w)
|
||||
);
|
||||
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
|
||||
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
|
||||
end else begin : g_single_word_line
|
||||
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr;
|
||||
assign core_req_nc_arb_byteen_w = core_req_nc_arb_byteen;
|
||||
assign core_req_nc_arb_data_w = core_req_nc_arb_data;
|
||||
assign core_req_nc_arb_tag_w = MEM_TAG_NC2_WIDTH'(core_req_nc_arb_tag);
|
||||
|
||||
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data;
|
||||
assign core_rsp_nc_arb_tag_w = MEM_TAG_NC1_WIDTH'(mem_bus_out_nc_if[i].rsp_data.tag);
|
||||
end
|
||||
|
||||
assign mem_bus_out_nc_if[i].req_valid = core_bus_nc_arb_if[i].req_valid;
|
||||
assign mem_bus_out_nc_if[i].req_data = {
|
||||
core_req_nc_arb_rw,
|
||||
core_req_nc_arb_addr_w,
|
||||
core_req_nc_arb_data_w,
|
||||
core_req_nc_arb_byteen_w,
|
||||
core_req_nc_arb_flags,
|
||||
core_req_nc_arb_tag_w
|
||||
};
|
||||
assign core_bus_nc_arb_if[i].req_ready = mem_bus_out_nc_if[i].req_ready;
|
||||
|
||||
assign core_bus_nc_arb_if[i].rsp_valid = mem_bus_out_nc_if[i].rsp_valid;
|
||||
assign core_bus_nc_arb_if[i].rsp_data = {
|
||||
core_rsp_nc_arb_data_w,
|
||||
core_rsp_nc_arb_tag_w
|
||||
};
|
||||
assign mem_bus_out_nc_if[i].rsp_ready = core_bus_nc_arb_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_OUT_WIDTH)
|
||||
) mem_bus_out_src_if[(CACHE_ENABLE ? 2 : 1) * MEM_PORTS]();
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_src
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[0 * MEM_PORTS + i], mem_bus_out_nc_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_NC2_WIDTH, UUID_WIDTH);
|
||||
if (CACHE_ENABLE) begin : g_cache
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[1 * MEM_PORTS + i], mem_bus_in_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_IN_WIDTH, UUID_WIDTH);
|
||||
end else begin : g_no_cache
|
||||
`UNUSED_VX_MEM_BUS_IF(mem_bus_in_if[i])
|
||||
end else begin
|
||||
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
|
||||
end
|
||||
end
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS ((CACHE_ENABLE ? 2 : 1) * MEM_PORTS),
|
||||
.NUM_OUTPUTS(MEM_PORTS),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_OUT_WIDTH),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF(DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
|
||||
.RSP_OUT_BUF(0)
|
||||
) mem_bus_out_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (mem_bus_out_src_if),
|
||||
.bus_out_if (mem_bus_out_if)
|
||||
assign mem_bus_in_if.req_ready = mem_req_out_ready;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
|
||||
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_out_valid),
|
||||
.ready_in (mem_req_out_ready),
|
||||
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
|
||||
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
|
||||
.valid_out (mem_bus_out_if.req_valid),
|
||||
.ready_out (mem_bus_out_if.req_ready)
|
||||
);
|
||||
|
||||
// handle core responses //////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_in_valid;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
|
||||
wire [NUM_REQS-1:0] core_rsp_in_ready;
|
||||
|
||||
wire is_mem_rsp_nc;
|
||||
if (PASSTHRU != 0) begin
|
||||
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
|
||||
end else begin
|
||||
if (NC_ENABLE) begin
|
||||
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
|
||||
end else begin
|
||||
assign is_mem_rsp_nc = 1'b0;
|
||||
end
|
||||
end
|
||||
|
||||
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_OUT_WIDTH),
|
||||
.S (NC_ENABLE),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) mem_rsp_tag_in_nc_remove (
|
||||
.data_in (mem_bus_out_if.rsp_data.tag),
|
||||
.data_out (mem_rsp_tag_id_nc)
|
||||
);
|
||||
|
||||
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
|
||||
end else begin
|
||||
assign rsp_idx = 1'b0;
|
||||
end
|
||||
|
||||
reg [NUM_REQS-1:0] rsp_nc_valid_r;
|
||||
always @(*) begin
|
||||
rsp_nc_valid_r = '0;
|
||||
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || rsp_nc_valid_r[i];
|
||||
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
|
||||
end
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin
|
||||
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
|
||||
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
|
||||
end
|
||||
end else begin
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
|
||||
end
|
||||
end
|
||||
|
||||
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
|
||||
end else begin
|
||||
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (PASSTHRU) begin
|
||||
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
|
||||
end else if (NC_ENABLE) begin
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
|
||||
end else begin
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
|
||||
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (core_rsp_in_valid[i]),
|
||||
.ready_in (core_rsp_in_ready[i]),
|
||||
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
|
||||
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
|
||||
.valid_out (core_bus_in_if[i].rsp_valid),
|
||||
.ready_out (core_bus_in_if[i].rsp_ready)
|
||||
);
|
||||
end
|
||||
|
||||
// handle memory responses ////////////////////////////////////////////////
|
||||
|
||||
if (PASSTHRU != 0) begin
|
||||
assign mem_bus_in_if.rsp_valid = 1'b0;
|
||||
assign mem_bus_in_if.rsp_data.data = '0;
|
||||
assign mem_bus_in_if.rsp_data.tag = '0;
|
||||
end else if (NC_ENABLE) begin
|
||||
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
|
||||
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
|
||||
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
|
||||
end else begin
|
||||
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
|
||||
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
|
||||
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
|
||||
end
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_out_valid;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
|
||||
end
|
||||
|
||||
assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? (~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready;
|
||||
|
||||
endmodule
|
||||
|
|
125
hw/rtl/cache/VX_cache_cluster.sv
vendored
125
hw/rtl/cache/VX_cache_cluster.sv
vendored
|
@ -23,26 +23,23 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
// Number of requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 32768,
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 16,
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 4,
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 16,
|
||||
parameter MSHR_SIZE = 8,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 4,
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
|
@ -55,26 +52,20 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
// Enable dirty bytes on writeback
|
||||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
// Core response output buffer
|
||||
parameter CORE_OUT_BUF = 3,
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
|
||||
// Memory request output buffer
|
||||
parameter MEM_OUT_BUF = 3
|
||||
parameter MEM_OUT_BUF = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -85,16 +76,14 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
|
||||
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
|
||||
VX_mem_bus_if.master mem_bus_if
|
||||
);
|
||||
localparam NUM_CACHES = `UP(NUM_UNITS);
|
||||
localparam PASSTHRU = (NUM_UNITS == 0);
|
||||
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
|
||||
|
||||
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
|
||||
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH);
|
||||
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
|
||||
|
||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||
|
||||
|
@ -106,14 +95,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) cache_mem_bus_if[NUM_CACHES * MEM_PORTS]();
|
||||
) cache_mem_bus_if[NUM_CACHES]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (ARB_TAG_WIDTH)
|
||||
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_arb
|
||||
`RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
|
@ -124,7 +115,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH (ARB_TAG_WIDTH)
|
||||
) arb_core_bus_tmp_if[NUM_CACHES]();
|
||||
|
||||
for (genvar j = 0; j < NUM_INPUTS; ++j) begin : g_core_bus_tmp_if
|
||||
for (genvar j = 0; j < NUM_INPUTS; ++j) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
|
||||
end
|
||||
|
||||
|
@ -136,40 +127,40 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? CORE_OUT_BUF : 0)
|
||||
) core_arb (
|
||||
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
|
||||
) cache_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (cache_arb_reset[i]),
|
||||
.bus_in_if (core_bus_tmp_if),
|
||||
.bus_out_if (arb_core_bus_tmp_if)
|
||||
);
|
||||
|
||||
for (genvar k = 0; k < NUM_CACHES; ++k) begin : g_arb_core_bus_if
|
||||
for (genvar k = 0; k < NUM_CACHES; ++k) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_if[k * NUM_REQS + i], arb_core_bus_tmp_if[k]);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_CACHES; ++i) begin : g_cache_wrap
|
||||
for (genvar i = 0; i < NUM_CACHES; ++i) begin : caches
|
||||
|
||||
`RESET_RELAY (cache_reset, reset);
|
||||
|
||||
VX_cache_wrap #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, i))),
|
||||
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
.REPL_POLICY (REPL_POLICY),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (ARB_TAG_WIDTH),
|
||||
.FLAGS_WIDTH (FLAGS_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
|
||||
|
@ -180,48 +171,32 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.cache_perf (perf_cache_unit[i]),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (cache_reset),
|
||||
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
|
||||
.mem_bus_if (cache_mem_bus_if[i * MEM_PORTS +: MEM_PORTS])
|
||||
.mem_bus_if (cache_mem_bus_if[i])
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) arb_core_bus_tmp_if[NUM_CACHES]();
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
|
||||
) mem_bus_tmp_if[1]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
|
||||
) mem_bus_tmp_if[1]();
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_CACHES),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (cache_mem_bus_if),
|
||||
.bus_out_if (mem_bus_tmp_if)
|
||||
);
|
||||
|
||||
for (genvar j = 0; j < NUM_CACHES; ++j) begin : g_arb_core_bus_tmp_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_tmp_if[j], cache_mem_bus_if[j * MEM_PORTS + i]);
|
||||
end
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_CACHES),
|
||||
.NUM_OUTPUTS (1),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
|
||||
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.bus_in_if (arb_core_bus_tmp_if),
|
||||
.bus_out_if (mem_bus_tmp_if)
|
||||
);
|
||||
|
||||
if (WRITE_ENABLE) begin : g_we
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
|
||||
end else begin : g_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
|
||||
end
|
||||
end
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
||||
|
||||
endmodule
|
||||
|
|
227
hw/rtl/cache/VX_cache_data.sv
vendored
227
hw/rtl/cache/VX_cache_data.sv
vendored
|
@ -14,6 +14,8 @@
|
|||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_data #(
|
||||
parameter `STRING INSTANCE_ID= "",
|
||||
parameter BANK_ID = 0,
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1024,
|
||||
// Size of line inside a bank in bytes
|
||||
|
@ -29,116 +31,169 @@ module VX_cache_data #(
|
|||
// Enable cache writeback
|
||||
parameter WRITEBACK = 0,
|
||||
// Enable dirty bytes on writeback
|
||||
parameter DIRTY_BYTES = 0
|
||||
parameter DIRTY_BYTES = 0,
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
// inputs
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
input wire[`UP(UUID_WIDTH)-1:0] req_uuid,
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
input wire stall,
|
||||
|
||||
input wire init,
|
||||
input wire read,
|
||||
input wire fill,
|
||||
input wire flush,
|
||||
input wire read,
|
||||
input wire write,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] line_idx,
|
||||
input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way,
|
||||
input wire [NUM_WAYS-1:0] tag_matches,
|
||||
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
|
||||
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
|
||||
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
|
||||
input wire [`CS_WORD_WIDTH-1:0] write_word,
|
||||
input wire [WORD_SIZE-1:0] write_byteen,
|
||||
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] word_idx,
|
||||
input wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_r,
|
||||
// outputs
|
||||
output wire [`CS_LINE_WIDTH-1:0] read_data,
|
||||
output wire [LINE_SIZE-1:0] evict_byteen
|
||||
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
|
||||
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
|
||||
input wire [NUM_WAYS-1:0] way_sel,
|
||||
output wire [`CS_WORD_WIDTH-1:0] read_data,
|
||||
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
|
||||
output wire [LINE_SIZE-1:0] dirty_byteen
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
`UNUSED_PARAM (BANK_ID)
|
||||
`UNUSED_PARAM (WORD_SIZE)
|
||||
`UNUSED_VAR (stall)
|
||||
`UNUSED_VAR (line_addr)
|
||||
`UNUSED_VAR (init)
|
||||
`UNUSED_VAR (read)
|
||||
`UNUSED_VAR (flush)
|
||||
|
||||
wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask;
|
||||
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_write_mask
|
||||
wire word_en = (`CS_WORDS_PER_LINE == 1) || (word_idx == i);
|
||||
assign write_mask[i] = write_byteen & {WORD_SIZE{word_en}};
|
||||
end
|
||||
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
|
||||
|
||||
if (DIRTY_BYTES != 0) begin : g_dirty_bytes
|
||||
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
|
||||
|
||||
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] byteen_rdata;
|
||||
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
|
||||
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
|
||||
|
||||
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_byteen_store
|
||||
wire [LINE_SIZE-1:0] byteen_wdata = {LINE_SIZE{write}}; // only asserted on writes
|
||||
wire [LINE_SIZE-1:0] byteen_wren = {LINE_SIZE{init || fill || flush}} | write_mask;
|
||||
wire byteen_write = ((fill || flush) && ((NUM_WAYS == 1) || (evict_way == i)))
|
||||
|| (write && tag_matches[i])
|
||||
|| init;
|
||||
wire byteen_read = fill || flush;
|
||||
if (WRITEBACK) begin
|
||||
if (DIRTY_BYTES) begin
|
||||
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata;
|
||||
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata;
|
||||
|
||||
for (genvar i = 0; i < NUM_WAYS; ++i) begin
|
||||
wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]);
|
||||
assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]);
|
||||
end
|
||||
|
||||
VX_sp_ram #(
|
||||
.DATAW (LINE_SIZE),
|
||||
.WRENW (LINE_SIZE),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.OUT_REG (1),
|
||||
.RDW_MODE ("R")
|
||||
.DATAW (LINE_SIZE * NUM_WAYS),
|
||||
.SIZE (`CS_LINES_PER_BANK)
|
||||
) byteen_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (byteen_read),
|
||||
.write (byteen_write),
|
||||
.wren (byteen_wren),
|
||||
.addr (line_idx),
|
||||
.wdata (byteen_wdata),
|
||||
.rdata (byteen_rdata[i])
|
||||
.read (write || fill || flush),
|
||||
.write (init || write || fill || flush),
|
||||
.wren (1'b1),
|
||||
.addr (line_sel),
|
||||
.wdata (bs_wdata),
|
||||
.rdata (bs_rdata)
|
||||
);
|
||||
|
||||
assign dirty_byteen = bs_rdata[way_idx];
|
||||
end else begin
|
||||
assign dirty_byteen = {LINE_SIZE{1'b1}};
|
||||
end
|
||||
|
||||
assign evict_byteen = byteen_rdata[way_idx_r];
|
||||
|
||||
end else begin : g_no_dirty_bytes
|
||||
`UNUSED_VAR (init)
|
||||
`UNUSED_VAR (flush)
|
||||
assign evict_byteen = '1; // update whole line
|
||||
end
|
||||
|
||||
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
|
||||
|
||||
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_data_store
|
||||
|
||||
localparam WRENW = WRITE_ENABLE ? LINE_SIZE : 1;
|
||||
|
||||
wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
|
||||
wire [WRENW-1:0] line_wren;
|
||||
|
||||
if (WRITE_ENABLE) begin : g_wren
|
||||
assign line_wdata = fill ? fill_data : {`CS_WORDS_PER_LINE{write_word}};
|
||||
assign line_wren = {LINE_SIZE{fill}} | write_mask;
|
||||
end else begin : g_no_wren
|
||||
`UNUSED_VAR (write_word)
|
||||
`UNUSED_VAR (write_mask)
|
||||
assign line_wdata = fill_data;
|
||||
assign line_wren = 1'b1;
|
||||
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata;
|
||||
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
|
||||
for (genvar j = 0; j < NUM_WAYS; ++j) begin
|
||||
assign flipped_rdata[j][i] = line_rdata[i][j];
|
||||
end
|
||||
end
|
||||
|
||||
wire line_write = (fill && ((NUM_WAYS == 1) || (evict_way == i)))
|
||||
|| (write && tag_matches[i] && WRITE_ENABLE);
|
||||
|
||||
wire line_read = read || ((fill || flush) && WRITEBACK);
|
||||
|
||||
VX_sp_ram #(
|
||||
.DATAW (`CS_LINE_WIDTH),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.WRENW (WRENW),
|
||||
.OUT_REG (1),
|
||||
.RDW_MODE ("R")
|
||||
) data_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (line_read),
|
||||
.write (line_write),
|
||||
.wren (line_wren),
|
||||
.addr (line_idx),
|
||||
.wdata (line_wdata),
|
||||
.rdata (line_rdata[i])
|
||||
);
|
||||
assign dirty_data = flipped_rdata[way_idx];
|
||||
end else begin
|
||||
assign dirty_byteen = '0;
|
||||
assign dirty_data = '0;
|
||||
end
|
||||
|
||||
assign read_data = line_rdata[way_idx_r];
|
||||
// order the data layout to perform ways multiplexing last.
|
||||
// this allows converting way index to binary in parallel with BRAM read access and way selection.
|
||||
|
||||
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
|
||||
wire [BYTEENW-1:0] line_wren;
|
||||
|
||||
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
|
||||
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
|
||||
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
|
||||
for (genvar j = 0; j < NUM_WAYS; ++j) begin
|
||||
assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i];
|
||||
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
|
||||
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
|
||||
end
|
||||
end
|
||||
assign line_wren = wren_w;
|
||||
end else begin
|
||||
`UNUSED_VAR (write)
|
||||
`UNUSED_VAR (write_byteen)
|
||||
`UNUSED_VAR (write_data)
|
||||
assign line_wdata = fill_data;
|
||||
assign line_wren = fill;
|
||||
end
|
||||
|
||||
VX_onehot_encoder #(
|
||||
.N (NUM_WAYS)
|
||||
) way_enc (
|
||||
.data_in (way_sel),
|
||||
.data_out (way_idx),
|
||||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
wire line_read = (read && ~stall)
|
||||
|| (WRITEBACK && (fill || flush));
|
||||
|
||||
wire line_write = write || fill;
|
||||
|
||||
VX_sp_ram #(
|
||||
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.WRENW (BYTEENW),
|
||||
.NO_RWCHECK (1),
|
||||
.RW_ASSERT (1)
|
||||
) data_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (line_read),
|
||||
.write (line_write),
|
||||
.wren (line_wren),
|
||||
.addr (line_sel),
|
||||
.wdata (line_wdata),
|
||||
.rdata (line_rdata)
|
||||
);
|
||||
|
||||
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
|
||||
if (`CS_WORDS_PER_LINE > 1) begin
|
||||
assign per_way_rdata = line_rdata[wsel];
|
||||
end else begin
|
||||
`UNUSED_VAR (wsel)
|
||||
assign per_way_rdata = line_rdata;
|
||||
end
|
||||
assign read_data = per_way_rdata[way_idx];
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
always @(posedge clk) begin
|
||||
if (fill && ~stall) begin
|
||||
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
|
||||
end
|
||||
if (flush && ~stall) begin
|
||||
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data));
|
||||
end
|
||||
if (read && ~stall) begin
|
||||
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
|
||||
end
|
||||
if (write && ~stall) begin
|
||||
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
14
hw/rtl/cache/VX_cache_define.vh
vendored
14
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -22,7 +22,6 @@
|
|||
`define CS_LINE_WIDTH (8 * LINE_SIZE)
|
||||
`define CS_BANK_SIZE (CACHE_SIZE / NUM_BANKS)
|
||||
`define CS_WAY_SEL_BITS `CLOG2(NUM_WAYS)
|
||||
`define CS_WAY_SEL_WIDTH `UP(`CS_WAY_SEL_BITS)
|
||||
|
||||
`define CS_LINES_PER_BANK (`CS_BANK_SIZE / (LINE_SIZE * NUM_WAYS))
|
||||
`define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE)
|
||||
|
@ -55,7 +54,12 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CS_BANK_TO_FULL_ADDR(x, b) {x, (`XLEN-$bits(x))'(b << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
||||
`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)}
|
||||
`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS]
|
||||
`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
|
||||
`define CS_MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `CS_BANK_SEL_BITS]
|
||||
|
||||
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
|
||||
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -70,10 +74,4 @@
|
|||
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CS_REPL_RANDOM 0
|
||||
`define CS_REPL_FIFO 1
|
||||
`define CS_REPL_PLRU 2
|
||||
|
||||
`endif // VX_CACHE_DEFINE_VH
|
||||
|
|
44
hw/rtl/cache/VX_cache_flush.sv
vendored
44
hw/rtl/cache/VX_cache_flush.sv
vendored
|
@ -18,10 +18,6 @@ module VX_cache_flush #(
|
|||
parameter NUM_REQS = 4,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 1,
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
// Bank select latency
|
||||
parameter BANK_SEL_LATENCY = 1
|
||||
) (
|
||||
|
@ -31,11 +27,8 @@ module VX_cache_flush #(
|
|||
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
|
||||
input wire [NUM_BANKS-1:0] bank_req_fire,
|
||||
output wire [NUM_BANKS-1:0] flush_begin,
|
||||
output wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
|
||||
input wire [NUM_BANKS-1:0] flush_end
|
||||
);
|
||||
`UNUSED_PARAM (TAG_WIDTH)
|
||||
|
||||
localparam STATE_IDLE = 0;
|
||||
localparam STATE_WAIT1 = 1;
|
||||
localparam STATE_FLUSH = 2;
|
||||
|
@ -48,13 +41,13 @@ module VX_cache_flush #(
|
|||
|
||||
wire no_inflight_reqs;
|
||||
|
||||
if (BANK_SEL_LATENCY != 0) begin : g_bank_sel_latency
|
||||
if (BANK_SEL_LATENCY != 0) begin
|
||||
|
||||
localparam NUM_REQS_W = `CLOG2(NUM_REQS+1);
|
||||
localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);
|
||||
|
||||
wire [NUM_REQS-1:0] core_bus_out_fire;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_fire
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready;
|
||||
end
|
||||
|
||||
|
@ -81,7 +74,7 @@ module VX_cache_flush #(
|
|||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
end else begin : g_no_bank_sel_latency
|
||||
end else begin
|
||||
assign no_inflight_reqs = 0;
|
||||
`UNUSED_VAR (bank_req_fire)
|
||||
end
|
||||
|
@ -89,38 +82,28 @@ module VX_cache_flush #(
|
|||
reg [NUM_BANKS-1:0] flush_done, flush_done_n;
|
||||
|
||||
wire [NUM_REQS-1:0] flush_req_mask;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_flush_req_mask
|
||||
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH];
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
|
||||
end
|
||||
wire flush_req_enable = (| flush_req_mask);
|
||||
|
||||
reg [NUM_REQS-1:0] lock_released, lock_released_n;
|
||||
reg [`UP(UUID_WIDTH)-1:0] flush_uuid_r, flush_uuid_n;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_req
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
wire input_enable = ~flush_req_enable || lock_released[i];
|
||||
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable;
|
||||
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
|
||||
assign core_bus_in_if[i].req_ready = core_bus_out_if[i].req_ready && input_enable;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_rsp
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_bus_in_if[i].rsp_valid = core_bus_out_if[i].rsp_valid;
|
||||
assign core_bus_in_if[i].rsp_data = core_bus_out_if[i].rsp_data;
|
||||
assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
reg [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] core_bus_out_uuid;
|
||||
wire [NUM_REQS-1:0] core_bus_out_ready;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_uuid
|
||||
if (UUID_WIDTH != 0) begin : g_uuid
|
||||
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag.uuid;
|
||||
end else begin : g_no_uuid
|
||||
assign core_bus_out_uuid[i] = 0;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_ready
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready;
|
||||
end
|
||||
|
||||
|
@ -128,17 +111,10 @@ module VX_cache_flush #(
|
|||
state_n = state;
|
||||
flush_done_n = flush_done;
|
||||
lock_released_n = lock_released;
|
||||
flush_uuid_n = flush_uuid_r;
|
||||
case (state)
|
||||
//STATE_IDLE:
|
||||
default: begin
|
||||
STATE_IDLE: begin
|
||||
if (flush_req_enable) begin
|
||||
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH;
|
||||
for (integer i = NUM_REQS-1; i >= 0; --i) begin
|
||||
if (flush_req_mask[i]) begin
|
||||
flush_uuid_n = core_bus_out_uuid[i];
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
STATE_WAIT1: begin
|
||||
|
@ -182,10 +158,8 @@ module VX_cache_flush #(
|
|||
flush_done <= flush_done_n;
|
||||
lock_released <= lock_released_n;
|
||||
end
|
||||
flush_uuid_r <= flush_uuid_n;
|
||||
end
|
||||
|
||||
assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}};
|
||||
assign flush_uuid = flush_uuid_r;
|
||||
|
||||
endmodule
|
||||
|
|
177
hw/rtl/cache/VX_cache_mshr.sv
vendored
177
hw/rtl/cache/VX_cache_mshr.sv
vendored
|
@ -24,23 +24,36 @@
|
|||
// arrival and are dequeued in the same order.
|
||||
// Each entry has a next pointer to the next entry pending for the same cache line.
|
||||
//
|
||||
// During the fill request, the MSHR will dequeue the MSHR entry at the fill_id location
|
||||
// During the fill operation, the MSHR will release the MSHR entry at fill_id
|
||||
// which represents the first request in the pending list that initiated the memory fill.
|
||||
//
|
||||
// The dequeue response directly follows the fill request and will release
|
||||
// The dequeue operation directly follows the fill operation and will release
|
||||
// all the subsequent entries linked to fill_id (pending the same cache line).
|
||||
//
|
||||
// During the allocation request, the MSHR will allocate the next free slot
|
||||
// During the allocation operation, the MSHR will allocate the next free slot
|
||||
// for the incoming core request. We return the allocated slot id as well as
|
||||
// the slot id of the previous entry for the same cache line. This is used to
|
||||
// link the new entry to the pending list.
|
||||
// link the new entry to the pending list during finalization.
|
||||
//
|
||||
// The finalize request is used to persist or release the currently allocated MSHR entry
|
||||
// if we had a cache miss or a hit, respectively.
|
||||
// The lookup operation is used to find all pending entries for a given cache line.
|
||||
// This is used by the cache bank to determine if a cache miss is already pending
|
||||
// and therefore avoid issuing a memory fill request.
|
||||
//
|
||||
// The finalize operation is used to release the allocated MSHR entry if we had a hit.
|
||||
// If we had a miss and finalize_pending is true, we link the allocated entry to
|
||||
// its corresponding pending list (via finalize_prev).
|
||||
//
|
||||
// Warning: This MSHR implementation is strongly coupled with the bank pipeline
|
||||
// and as such changes to either module require careful evaluation.
|
||||
//
|
||||
// This architecture implements three pipeline stages:
|
||||
// - Arbitration: cache bank arbitration before entering pipeline.
|
||||
// fill and dequeue operations are executed at this stage.
|
||||
// - stage 0: cache bank tag access stage.
|
||||
// allocate and lookup operations are executed at this stage.
|
||||
// - stage 1: cache bank data access stage.
|
||||
// finalize operation is executed at this stage.
|
||||
//
|
||||
|
||||
module VX_cache_mshr #(
|
||||
parameter `STRING INSTANCE_ID= "",
|
||||
|
@ -55,9 +68,6 @@ module VX_cache_mshr #(
|
|||
parameter UUID_WIDTH = 0,
|
||||
// MSHR parameters
|
||||
parameter DATA_WIDTH = 1,
|
||||
// Enable cache writeback
|
||||
parameter WRITEBACK = 0,
|
||||
|
||||
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE)
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -65,7 +75,7 @@ module VX_cache_mshr #(
|
|||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
input wire[`UP(UUID_WIDTH)-1:0] deq_req_uuid,
|
||||
input wire[`UP(UUID_WIDTH)-1:0] alc_req_uuid,
|
||||
input wire[`UP(UUID_WIDTH)-1:0] lkp_req_uuid,
|
||||
input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid,
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
|
@ -88,21 +98,26 @@ module VX_cache_mshr #(
|
|||
input wire allocate_rw,
|
||||
input wire [DATA_WIDTH-1:0] allocate_data,
|
||||
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
|
||||
output wire allocate_pending,
|
||||
output wire [MSHR_ADDR_WIDTH-1:0] allocate_previd,
|
||||
output wire [MSHR_ADDR_WIDTH-1:0] allocate_prev,
|
||||
output wire allocate_ready,
|
||||
|
||||
// lookup
|
||||
input wire lookup_valid,
|
||||
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
|
||||
output wire [MSHR_SIZE-1:0] lookup_pending,
|
||||
output wire [MSHR_SIZE-1:0] lookup_rw,
|
||||
|
||||
// finalize
|
||||
input wire finalize_valid,
|
||||
input wire finalize_is_release,
|
||||
input wire finalize_is_pending,
|
||||
input wire [MSHR_ADDR_WIDTH-1:0] finalize_previd,
|
||||
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id
|
||||
input wire finalize_release,
|
||||
input wire finalize_pending,
|
||||
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id,
|
||||
input wire [MSHR_ADDR_WIDTH-1:0] finalize_prev
|
||||
);
|
||||
`UNUSED_PARAM (BANK_ID)
|
||||
|
||||
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [0:MSHR_SIZE-1];
|
||||
reg [MSHR_ADDR_WIDTH-1:0] next_index [0:MSHR_SIZE-1];
|
||||
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0];
|
||||
reg [MSHR_ADDR_WIDTH-1:0] next_index [MSHR_SIZE-1:0];
|
||||
|
||||
reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
|
||||
reg [MSHR_SIZE-1:0] next_table, next_table_x, next_table_n;
|
||||
|
@ -120,8 +135,8 @@ module VX_cache_mshr #(
|
|||
wire dequeue_fire = dequeue_valid && dequeue_ready;
|
||||
|
||||
wire [MSHR_SIZE-1:0] addr_matches;
|
||||
for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_addr_matches
|
||||
assign addr_matches[i] = valid_table[i] && (addr_table[i] == allocate_addr);
|
||||
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
|
||||
assign addr_matches[i] = valid_table[i] && (addr_table[i] == lookup_addr);
|
||||
end
|
||||
|
||||
VX_lzc #(
|
||||
|
@ -133,13 +148,11 @@ module VX_cache_mshr #(
|
|||
.valid_out (allocate_rdy_n)
|
||||
);
|
||||
|
||||
// find matching tail-entry
|
||||
VX_priority_encoder #(
|
||||
VX_onehot_encoder #(
|
||||
.N (MSHR_SIZE)
|
||||
) prev_sel (
|
||||
.data_in (addr_matches & ~next_table_x),
|
||||
.index_out (prev_idx),
|
||||
`UNUSED_PIN (onehot_out),
|
||||
.data_out (prev_idx),
|
||||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
|
@ -158,22 +171,17 @@ module VX_cache_mshr #(
|
|||
valid_table_n[dequeue_id] = 0;
|
||||
if (next_table[dequeue_id]) begin
|
||||
dequeue_id_n = next_index[dequeue_id];
|
||||
end else if (finalize_valid && finalize_is_pending && (finalize_previd == dequeue_id)) begin
|
||||
dequeue_id_n = finalize_id;
|
||||
end else begin
|
||||
dequeue_val_n = 0;
|
||||
end
|
||||
end
|
||||
|
||||
if (finalize_valid) begin
|
||||
if (finalize_is_release) begin
|
||||
if (finalize_release) begin
|
||||
valid_table_n[finalize_id] = 0;
|
||||
end
|
||||
// warning: This code allows 'finalize_is_pending' to be asserted regardless of hit/miss
|
||||
// to reduce its propagation delay into the MSHR. This is safe because wrong updates
|
||||
// to 'next_table_n' will be cleared during 'allocate_fire' below.
|
||||
if (finalize_is_pending) begin
|
||||
next_table_x[finalize_previd] = 1;
|
||||
if (finalize_pending) begin
|
||||
next_table_x[finalize_prev] = 1;
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -196,12 +204,12 @@ module VX_cache_mshr #(
|
|||
end
|
||||
|
||||
if (allocate_fire) begin
|
||||
addr_table[allocate_id] <= allocate_addr;
|
||||
addr_table[allocate_id] <= allocate_addr;
|
||||
write_table[allocate_id] <= allocate_rw;
|
||||
end
|
||||
|
||||
if (finalize_valid && finalize_is_pending) begin
|
||||
next_index[finalize_previd] <= finalize_id;
|
||||
if (finalize_valid && finalize_pending) begin
|
||||
next_index[finalize_prev] <= finalize_id;
|
||||
end
|
||||
|
||||
dequeue_id_r <= dequeue_id_n;
|
||||
|
@ -209,21 +217,20 @@ module VX_cache_mshr #(
|
|||
next_table <= next_table_n;
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT(~(allocate_fire && valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
|
||||
`CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, alc_req_uuid))
|
||||
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
|
||||
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
|
||||
|
||||
`RUNTIME_ASSERT(~(finalize_valid && ~valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
|
||||
`CS_BANK_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
|
||||
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
|
||||
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
|
||||
|
||||
`RUNTIME_ASSERT(~(fill_valid && ~valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
|
||||
`CS_BANK_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
|
||||
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
|
||||
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (DATA_WIDTH),
|
||||
.SIZE (MSHR_SIZE),
|
||||
.RDW_MODE ("R"),
|
||||
.RADDR_REG (1)
|
||||
) mshr_store (
|
||||
.DATAW (DATA_WIDTH),
|
||||
.SIZE (MSHR_SIZE),
|
||||
.LUTRAM (1)
|
||||
) entries (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (1'b1),
|
||||
|
@ -238,20 +245,19 @@ module VX_cache_mshr #(
|
|||
assign fill_addr = addr_table[fill_id];
|
||||
|
||||
assign allocate_ready = allocate_rdy;
|
||||
assign allocate_id = allocate_id_r;
|
||||
assign allocate_previd = prev_idx;
|
||||
assign allocate_id = allocate_id_r;
|
||||
assign allocate_prev = prev_idx;
|
||||
|
||||
if (WRITEBACK) begin : g_pending_wb
|
||||
assign allocate_pending = |addr_matches;
|
||||
end else begin : g_pending_wt
|
||||
// exclude write requests if writethrough
|
||||
assign allocate_pending = |(addr_matches & ~write_table);
|
||||
end
|
||||
assign dequeue_valid = dequeue_val;
|
||||
assign dequeue_addr = addr_table[dequeue_id_r];
|
||||
assign dequeue_rw = write_table[dequeue_id_r];
|
||||
assign dequeue_id = dequeue_id_r;
|
||||
|
||||
assign dequeue_valid = dequeue_val;
|
||||
assign dequeue_addr = addr_table[dequeue_id_r];
|
||||
assign dequeue_rw = write_table[dequeue_id_r];
|
||||
assign dequeue_id = dequeue_id_r;
|
||||
// return pending entries for the given cache line
|
||||
assign lookup_pending = addr_matches;
|
||||
assign lookup_rw = write_table;
|
||||
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
reg show_table;
|
||||
|
@ -259,42 +265,37 @@ module VX_cache_mshr #(
|
|||
if (reset) begin
|
||||
show_table <= 0;
|
||||
end else begin
|
||||
show_table <= allocate_fire || finalize_valid || fill_valid || dequeue_fire;
|
||||
end
|
||||
if (allocate_fire) begin
|
||||
`TRACE(3, ("%t: %s allocate: addr=0x%0h, id=%0d, pending=%b, prev=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
`CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id, allocate_pending, prev_idx, alc_req_uuid))
|
||||
end
|
||||
if (finalize_valid && finalize_is_release) begin
|
||||
`TRACE(3, ("%t: %s release: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid))
|
||||
end
|
||||
if (finalize_valid && finalize_is_pending) begin
|
||||
`TRACE(3, ("%t: %s finalize: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid))
|
||||
end
|
||||
if (fill_valid) begin
|
||||
`TRACE(3, ("%t: %s fill: addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
|
||||
`CS_BANK_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id))
|
||||
end
|
||||
if (dequeue_fire) begin
|
||||
`TRACE(3, ("%t: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
`CS_BANK_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid))
|
||||
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
|
||||
end
|
||||
if (allocate_fire)
|
||||
`TRACE(3, ("%d: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid));
|
||||
if (lookup_valid)
|
||||
`TRACE(3, ("%d: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID,
|
||||
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid));
|
||||
if (finalize_valid)
|
||||
`TRACE(3, ("%d: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid));
|
||||
if (fill_valid)
|
||||
`TRACE(3, ("%d: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
|
||||
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
|
||||
if (dequeue_fire)
|
||||
`TRACE(3, ("%d: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
|
||||
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
|
||||
if (show_table) begin
|
||||
`TRACE(3, ("%t: %s table", $time, INSTANCE_ID))
|
||||
`TRACE(3, ("%d: %s table", $time, INSTANCE_ID));
|
||||
for (integer i = 0; i < MSHR_SIZE; ++i) begin
|
||||
if (valid_table[i]) begin
|
||||
`TRACE(3, (" %0d=0x%0h", i, `CS_BANK_TO_FULL_ADDR(addr_table[i], BANK_ID)))
|
||||
if (write_table[i]) begin
|
||||
`TRACE(3, ("(w)"))
|
||||
end else begin
|
||||
`TRACE(3, ("(r)"))
|
||||
end
|
||||
if (next_table[i]) begin
|
||||
`TRACE(3, ("->%0d", next_index[i]))
|
||||
end
|
||||
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));
|
||||
if (write_table[i])
|
||||
`TRACE(3, ("(w)"));
|
||||
else
|
||||
`TRACE(3, ("(r)"));
|
||||
if (next_table[i])
|
||||
`TRACE(3, ("->%0d", next_index[i]));
|
||||
end
|
||||
end
|
||||
`TRACE(3, ("\n"))
|
||||
`TRACE(3, ("\n"));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
210
hw/rtl/cache/VX_cache_repl.sv
vendored
210
hw/rtl/cache/VX_cache_repl.sv
vendored
|
@ -1,210 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
// Fast PLRU encoder and decoder utility
|
||||
// Adapted from BaseJump STL: http://bjump.org/data_out.html
|
||||
|
||||
// plru_decoder: combinationally converts an accessed way index into an update
// for a tree of NUM_WAYS-1 PLRU bits.
//   way_idx  - index of the way being accessed
//   lru_data - per-node value: the inverse of the way_idx bit at that node's level
//   lru_mask - per-node flag: 1 when the node lies on the path selected by way_idx
// Nodes use heap-style numbering: node 0 is the root and the parent of node i
// is (i-1)/2 for odd i and (i-2)/2 for even i.
// In VX_cache_repl, lru_mask drives the per-bit write enable and lru_data the
// write data of the PLRU store.
module plru_decoder #(
|
||||
parameter NUM_WAYS = 1,
|
||||
parameter WAY_IDX_BITS = $clog2(NUM_WAYS),
|
||||
parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
|
||||
) (
|
||||
input wire [WAY_IDX_WIDTH-1:0] way_idx,
|
||||
output wire [`UP(NUM_WAYS-1)-1:0] lru_data,
|
||||
output wire [`UP(NUM_WAYS-1)-1:0] lru_mask
|
||||
);
|
||||
if (NUM_WAYS > 1) begin : g_dec
|
||||
wire [`UP(NUM_WAYS-1)-1:0] data;
|
||||
// mask bits are computed from other mask bits (parent -> child);
// NOTE(review): the IGNORE_UNOPTFLAT macros presumably silence the matching lint warning - confirm
`IGNORE_UNOPTFLAT_BEGIN
|
||||
wire [`UP(NUM_WAYS-1)-1:0] mask;
|
||||
`IGNORE_UNOPTFLAT_END
|
||||
for (genvar i = 0; i < NUM_WAYS-1; ++i) begin : g_i
|
||||
if (i == 0) begin : g_i_0
|
||||
// the root node is on every path
assign mask[i] = 1'b1;
|
||||
end else if (i % 2 == 1) begin : g_i_odd
|
||||
// odd node: on the path when its parent is and the parent-level way_idx bit is 0
assign mask[i] = mask[(i-1)/2] & ~way_idx[WAY_IDX_BITS-$clog2(i+2)+1];
|
||||
end else begin : g_i_even
|
||||
// even node: on the path when its parent is and the parent-level way_idx bit is 1
assign mask[i] = mask[(i-2)/2] & way_idx[WAY_IDX_BITS-$clog2(i+2)+1];
|
||||
end
|
||||
// node value is the inverse of the way_idx bit belonging to this node's level
assign data[i] = ~way_idx[WAY_IDX_BITS-$clog2(i+2)];
|
||||
end
|
||||
assign lru_data = data;
|
||||
assign lru_mask = mask;
|
||||
end else begin : g_no_dec
|
||||
// single way: there is no tree to update, outputs are tied to zero
`UNUSED_VAR (way_idx)
|
||||
assign lru_data = '0;
|
||||
assign lru_mask = '0;
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
// plru_encoder: combinationally derives a way index from the PLRU tree bits.
//   lru_in  - NUM_WAYS-1 tree bits in heap-style numbering (bit 0 is the root)
//   way_idx - resulting way index, resolved from the most significant bit down
// In VX_cache_repl, lru_in is the word read from the PLRU store and way_idx
// is used as the replacement way.
module plru_encoder #(
|
||||
parameter NUM_WAYS = 1,
|
||||
parameter WAY_IDX_BITS = $clog2(NUM_WAYS),
|
||||
parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
|
||||
) (
|
||||
input wire [`UP(NUM_WAYS-1)-1:0] lru_in,
|
||||
output wire [WAY_IDX_WIDTH-1:0] way_idx
|
||||
);
|
||||
if (NUM_WAYS > 1) begin : g_enc
|
||||
wire [WAY_IDX_BITS-1:0] tmp;
|
||||
for (genvar i = 0; i < WAY_IDX_BITS; ++i) begin : g_i
|
||||
if (i == 0) begin : g_i_0
|
||||
// the top index bit comes straight from the root node.
// NOTE(review): indexed with WAY_IDX_WIDTH-1 although tmp is WAY_IDX_BITS wide;
// the two presumably coincide when NUM_WAYS > 1 (depends on `UP) - confirm
assign tmp[WAY_IDX_WIDTH-1] = lru_in[0];
|
||||
end else begin : g_i_n
|
||||
// level i owns 2**i node bits starting at index 2**i-1; the i upper bits of tmp
// resolved so far select one of them for the next index bit
// (assumes VX_mux outputs data_in[sel_in] - confirm)
VX_mux #(
|
||||
.N (2**i)
|
||||
) mux (
|
||||
.data_in (lru_in[((2**i)-1)+:(2**i)]),
|
||||
.sel_in (tmp[WAY_IDX_BITS-1-:i]),
|
||||
.data_out (tmp[WAY_IDX_BITS-1-i])
|
||||
);
|
||||
end
|
||||
end
|
||||
assign way_idx = tmp;
|
||||
end else begin : g_no_enc
|
||||
// single way: the only possible index is 0
`UNUSED_VAR (lru_in)
|
||||
assign way_idx = '0;
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
||||
// VX_cache_repl: selects the way to replace for a given cache line.
// REPL_POLICY picks one of three implementations when NUM_WAYS > 1:
//   `CS_REPL_PLRU - tree of PLRU bits per line, updated on lookup hits
//   `CS_REPL_FIFO - per-line way counter, advanced on each replacement request
//   otherwise     - a single free-running counter shared by all lines
// With NUM_WAYS == 1, repl_way is constant zero.
// Ports:
//   stall                  - holds the counter of the fallback ("random") policy
//   init                   - clears the per-line state held in the PLRU/FIFO stores
//   lookup_valid/hit/line/way - access information, consumed by the PLRU policy only
//   repl_valid, repl_line  - replacement request and its line index
//   repl_way               - selected way
// NOTE(review): CACHE_SIZE, LINE_SIZE and NUM_BANKS are not referenced directly here;
// presumably they are consumed by the `CS_* macros - confirm.
module VX_cache_repl #(
|
||||
parameter CACHE_SIZE = 1024,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 1,
|
||||
// replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire stall,
|
||||
input wire init,
|
||||
input wire lookup_valid,
|
||||
input wire lookup_hit,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] lookup_line,
|
||||
input wire [`CS_WAY_SEL_WIDTH-1:0] lookup_way,
|
||||
input wire repl_valid,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] repl_line,
|
||||
output wire [`CS_WAY_SEL_WIDTH-1:0] repl_way
|
||||
);
|
||||
localparam WAY_SEL_WIDTH = `CS_WAY_SEL_WIDTH;
|
||||
// not every policy uses these inputs; presumably `UNUSED_VAR silences the lint warning
`UNUSED_VAR (reset)
|
||||
`UNUSED_VAR (init)
|
||||
`UNUSED_VAR (stall)
|
||||
|
||||
if (NUM_WAYS > 1) begin : g_enable
|
||||
if (REPL_POLICY == `CS_REPL_PLRU) begin : g_plru
|
||||
// Pseudo Least Recently Used replacement policy
|
||||
localparam LRU_WIDTH = `UP(NUM_WAYS-1);
|
||||
|
||||
wire [LRU_WIDTH-1:0] plru_rdata;
|
||||
wire [LRU_WIDTH-1:0] plru_wdata;
|
||||
wire [LRU_WIDTH-1:0] plru_wmask;
|
||||
|
||||
// One LRU_WIDTH-bit entry per line. Written on init (all bits cleared) or on a
// lookup hit (only the bits enabled by plru_wmask); read at repl_line on repl_valid.
// NOTE(review): the init write is addressed by lookup_line, so the caller presumably
// drives lookup_line while init is asserted - confirm.
// NOTE(review): RADDR_REG(1) suggests a registered read address, so repl_way would
// lag repl_valid - confirm against VX_dp_ram.
VX_dp_ram #(
|
||||
.DATAW (LRU_WIDTH),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.WRENW (LRU_WIDTH),
|
||||
.RDW_MODE ("R"),
|
||||
.RADDR_REG (1)
|
||||
) plru_store (
|
||||
.clk (clk),
|
||||
.reset (1'b0),
|
||||
.read (repl_valid),
|
||||
.write (init || (lookup_valid && lookup_hit)),
|
||||
.wren (init ? '1 : plru_wmask),
|
||||
.waddr (lookup_line),
|
||||
.raddr (repl_line),
|
||||
.wdata (init ? '0 : plru_wdata),
|
||||
.rdata (plru_rdata)
|
||||
);
|
||||
|
||||
// accessed way -> tree bits to write and their write mask
plru_decoder #(
|
||||
.NUM_WAYS (NUM_WAYS)
|
||||
) plru_dec (
|
||||
.way_idx (lookup_way),
|
||||
.lru_data (plru_wdata),
|
||||
.lru_mask (plru_wmask)
|
||||
);
|
||||
|
||||
// stored tree bits -> way to replace
plru_encoder #(
|
||||
.NUM_WAYS (NUM_WAYS)
|
||||
) plru_enc (
|
||||
.lru_in (plru_rdata),
|
||||
.way_idx (repl_way)
|
||||
);
|
||||
|
||||
end else if (REPL_POLICY == `CS_REPL_FIFO) begin : g_fifo
|
||||
// FIFO replacement policy
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
`UNUSED_VAR (lookup_hit)
|
||||
`UNUSED_VAR (lookup_line)
|
||||
`UNUSED_VAR (lookup_way)
|
||||
|
||||
wire [WAY_SEL_WIDTH-1:0] fifo_rdata;
|
||||
// next counter value is the RAM read output plus one (wraps at WAY_SEL_WIDTH bits)
wire [WAY_SEL_WIDTH-1:0] fifo_wdata = fifo_rdata + 1;
|
||||
|
||||
// Per-line way counter, read and written at repl_line: cleared on init,
// otherwise written with fifo_wdata on every replacement request.
VX_sp_ram #(
|
||||
.DATAW (WAY_SEL_WIDTH),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.RDW_MODE ("R"),
|
||||
.RADDR_REG (1)
|
||||
) fifo_store (
|
||||
.clk (clk),
|
||||
.reset (1'b0),
|
||||
.read (repl_valid),
|
||||
.write (init || repl_valid),
|
||||
.wren (1'b1),
|
||||
.addr (repl_line),
|
||||
.wdata (init ? '0 : fifo_wdata),
|
||||
.rdata (fifo_rdata)
|
||||
);
|
||||
|
||||
assign repl_way = fifo_rdata;
|
||||
end else begin : g_random
|
||||
// Random replacement policy
// NOTE: implemented as a free-running counter (reset to 0, incremented on every
// clock edge without stall), not a pseudo-random generator
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
`UNUSED_VAR (lookup_hit)
|
||||
`UNUSED_VAR (lookup_line)
|
||||
`UNUSED_VAR (lookup_way)
|
||||
`UNUSED_VAR (repl_valid)
|
||||
`UNUSED_VAR (repl_line)
|
||||
reg [WAY_SEL_WIDTH-1:0] victim_idx;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
victim_idx <= 0;
|
||||
end else if (~stall) begin
|
||||
victim_idx <= victim_idx + 1;
|
||||
end
|
||||
end
|
||||
assign repl_way = victim_idx;
|
||||
end
|
||||
end else begin : g_disable
|
||||
// single way: nothing to choose, always way 0
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
`UNUSED_VAR (lookup_hit)
|
||||
`UNUSED_VAR (lookup_line)
|
||||
`UNUSED_VAR (lookup_way)
|
||||
`UNUSED_VAR (repl_valid)
|
||||
`UNUSED_VAR (repl_line)
|
||||
assign repl_way = 1'b0;
|
||||
end
|
||||
|
||||
endmodule
|
133
hw/rtl/cache/VX_cache_tags.sv
vendored
133
hw/rtl/cache/VX_cache_tags.sv
vendored
|
@ -14,6 +14,8 @@
|
|||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_tags #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter BANK_ID = 0,
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1024,
|
||||
// Size of line inside a bank in bytes
|
||||
|
@ -25,61 +27,96 @@ module VX_cache_tags #(
|
|||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 1,
|
||||
// Enable cache writeback
|
||||
parameter WRITEBACK = 0
|
||||
parameter WRITEBACK = 0,
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// inputs
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
input wire [`UP(UUID_WIDTH)-1:0] req_uuid,
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
input wire stall,
|
||||
|
||||
// init/fill/lookup
|
||||
input wire init,
|
||||
input wire flush,
|
||||
input wire fill,
|
||||
input wire read,
|
||||
input wire write,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] line_idx,
|
||||
input wire [`CS_TAG_SEL_BITS-1:0] line_tag,
|
||||
input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way,
|
||||
|
||||
// outputs
|
||||
input wire lookup,
|
||||
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
|
||||
input wire [NUM_WAYS-1:0] way_sel,
|
||||
output wire [NUM_WAYS-1:0] tag_matches,
|
||||
|
||||
// eviction
|
||||
output wire evict_dirty,
|
||||
output wire [NUM_WAYS-1:0] evict_way,
|
||||
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
|
||||
);
|
||||
// valid, dirty, tag
|
||||
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
`UNUSED_PARAM (BANK_ID)
|
||||
`UNUSED_VAR (lookup)
|
||||
|
||||
// valid, dirty, tag
|
||||
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
|
||||
|
||||
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
|
||||
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
|
||||
|
||||
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
|
||||
wire [NUM_WAYS-1:0] read_valid;
|
||||
wire [NUM_WAYS-1:0] read_dirty;
|
||||
`UNUSED_VAR (read)
|
||||
|
||||
if (WRITEBACK) begin : g_evict_tag_wb
|
||||
assign evict_dirty = read_dirty[evict_way];
|
||||
assign evict_tag = read_tag[evict_way];
|
||||
end else begin : g_evict_tag_wt
|
||||
`UNUSED_VAR (read_dirty)
|
||||
assign evict_dirty = 1'b0;
|
||||
assign evict_tag = '0;
|
||||
if (NUM_WAYS > 1) begin
|
||||
reg [NUM_WAYS-1:0] evict_way_r;
|
||||
// cyclic assignment of replacement way
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
evict_way_r <= 1;
|
||||
end else if (~stall) begin // holding the value on stalls prevents filling different slots twice
|
||||
evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]};
|
||||
end
|
||||
end
|
||||
|
||||
assign evict_way = fill ? evict_way_r : way_sel;
|
||||
|
||||
VX_onehot_mux #(
|
||||
.DATAW (`CS_TAG_SEL_BITS),
|
||||
.N (NUM_WAYS)
|
||||
) evict_tag_sel (
|
||||
.data_in (read_tag),
|
||||
.sel_in (evict_way),
|
||||
.data_out (evict_tag)
|
||||
);
|
||||
end else begin
|
||||
`UNUSED_VAR (stall)
|
||||
assign evict_way = 1'b1;
|
||||
assign evict_tag = read_tag;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_store
|
||||
wire way_en = (NUM_WAYS == 1) || (evict_way == i);
|
||||
wire do_init = init; // init all ways
|
||||
wire do_fill = fill && way_en;
|
||||
wire do_flush = flush && (!WRITEBACK || way_en); // flush the whole line in writethrough mode
|
||||
wire do_write = WRITEBACK && write && tag_matches[i]; // only write on tag hit
|
||||
// fill and flush need to also read in writeback mode
|
||||
wire fill_s = fill && (!WRITEBACK || ~stall);
|
||||
wire flush_s = flush && (!WRITEBACK || ~stall);
|
||||
|
||||
wire line_read = read || write || (WRITEBACK && (fill || flush));
|
||||
wire line_write = do_init || do_fill || do_flush || do_write;
|
||||
wire line_valid = fill || write;
|
||||
for (genvar i = 0; i < NUM_WAYS; ++i) begin
|
||||
|
||||
wire do_fill = fill_s && evict_way[i];
|
||||
wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode
|
||||
wire do_write = WRITEBACK && write && tag_matches[i];
|
||||
|
||||
wire line_read = (WRITEBACK && (fill_s || flush_s));
|
||||
wire line_write = init || do_fill || do_flush || do_write;
|
||||
wire line_valid = ~(init || flush);
|
||||
|
||||
wire [TAG_WIDTH-1:0] line_wdata;
|
||||
wire [TAG_WIDTH-1:0] line_rdata;
|
||||
|
||||
if (WRITEBACK) begin : g_wdata
|
||||
if (WRITEBACK) begin
|
||||
assign line_wdata = {line_valid, write, line_tag};
|
||||
assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata;
|
||||
end else begin : g_wdata
|
||||
end else begin
|
||||
assign line_wdata = {line_valid, line_tag};
|
||||
assign {read_valid[i], read_tag[i]} = line_rdata;
|
||||
assign read_dirty[i] = 1'b0;
|
||||
|
@ -88,22 +125,52 @@ module VX_cache_tags #(
|
|||
VX_sp_ram #(
|
||||
.DATAW (TAG_WIDTH),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.RDW_MODE ("W"),
|
||||
.RADDR_REG (1)
|
||||
.NO_RWCHECK (1),
|
||||
.RW_ASSERT (1)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (line_read),
|
||||
.write (line_write),
|
||||
.wren (1'b1),
|
||||
.addr (line_idx),
|
||||
.addr (line_sel),
|
||||
.wdata (line_wdata),
|
||||
.rdata (line_rdata)
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_matches
|
||||
for (genvar i = 0; i < NUM_WAYS; ++i) begin
|
||||
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
|
||||
end
|
||||
|
||||
assign evict_dirty = | (read_dirty & evict_way);
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel};
|
||||
always @(posedge clk) begin
|
||||
if (fill && ~stall) begin
|
||||
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID)));
|
||||
end
|
||||
if (init) begin
|
||||
`TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
|
||||
end
|
||||
if (flush && ~stall) begin
|
||||
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty));
|
||||
end
|
||||
if (lookup && ~stall) begin
|
||||
if (tag_matches != 0) begin
|
||||
if (write)
|
||||
`TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
|
||||
else
|
||||
`TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
|
||||
end else begin
|
||||
if (write)
|
||||
`TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
|
||||
else
|
||||
`TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
109
hw/rtl/cache/VX_cache_top.sv
vendored
109
hw/rtl/cache/VX_cache_top.sv
vendored
|
@ -19,11 +19,8 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 65536,
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
|
@ -31,39 +28,39 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 16,
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 8,
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss reservation queue (MSHR) size
|
||||
parameter MSHR_SIZE = 16,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 8,
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 8,
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
// Enable cache writes
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
||||
// Enable cache writeback
|
||||
parameter WRITEBACK = 1,
|
||||
parameter WRITEBACK = 0,
|
||||
|
||||
// Enable dirty bytes on writeback
|
||||
parameter DIRTY_BYTES = 1,
|
||||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = 32,
|
||||
parameter TAG_WIDTH = 16,
|
||||
|
||||
// Core response output buffer
|
||||
parameter CORE_OUT_BUF = 3,
|
||||
parameter CORE_OUT_BUF = 2,
|
||||
|
||||
// Memory request output buffer
|
||||
parameter MEM_OUT_BUF = 3,
|
||||
parameter MEM_OUT_BUF = 2,
|
||||
|
||||
parameter MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH)
|
||||
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -74,35 +71,35 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
// Core request
|
||||
input wire core_req_valid [NUM_REQS],
|
||||
input wire core_req_rw [NUM_REQS],
|
||||
input wire[WORD_SIZE-1:0] core_req_byteen [NUM_REQS],
|
||||
input wire[`CS_WORD_ADDR_WIDTH-1:0] core_req_addr [NUM_REQS],
|
||||
input wire[`MEM_REQ_FLAGS_WIDTH-1:0] core_req_flags [NUM_REQS],
|
||||
input wire[`CS_WORD_WIDTH-1:0] core_req_data [NUM_REQS],
|
||||
input wire[TAG_WIDTH-1:0] core_req_tag [NUM_REQS],
|
||||
output wire core_req_ready [NUM_REQS],
|
||||
input wire [NUM_REQS-1:0] core_req_valid,
|
||||
input wire [NUM_REQS-1:0] core_req_rw,
|
||||
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
|
||||
input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
|
||||
input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype,
|
||||
input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
|
||||
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
|
||||
output wire [NUM_REQS-1:0] core_req_ready,
|
||||
|
||||
// Core response
|
||||
output wire core_rsp_valid [NUM_REQS],
|
||||
output wire[`CS_WORD_WIDTH-1:0] core_rsp_data [NUM_REQS],
|
||||
output wire[TAG_WIDTH-1:0] core_rsp_tag [NUM_REQS],
|
||||
input wire core_rsp_ready [NUM_REQS],
|
||||
output wire [NUM_REQS-1:0] core_rsp_valid,
|
||||
output wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
|
||||
output wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire [NUM_REQS-1:0] core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid [MEM_PORTS],
|
||||
output wire mem_req_rw [MEM_PORTS],
|
||||
output wire [LINE_SIZE-1:0] mem_req_byteen [MEM_PORTS],
|
||||
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr [MEM_PORTS],
|
||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data [MEM_PORTS],
|
||||
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag [MEM_PORTS],
|
||||
input wire mem_req_ready [MEM_PORTS],
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid [MEM_PORTS],
|
||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data [MEM_PORTS],
|
||||
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag [MEM_PORTS],
|
||||
output wire mem_rsp_ready [MEM_PORTS]
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready
|
||||
);
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
|
@ -112,7 +109,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) mem_bus_if[MEM_PORTS]();
|
||||
) mem_bus_if();
|
||||
|
||||
// Core request
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
|
@ -120,7 +117,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
assign core_bus_if[i].req_data.rw = core_req_rw[i];
|
||||
assign core_bus_if[i].req_data.byteen = core_req_byteen[i];
|
||||
assign core_bus_if[i].req_data.addr = core_req_addr[i];
|
||||
assign core_bus_if[i].req_data.flags = core_req_flags[i];
|
||||
assign core_bus_if[i].req_data.atype = core_req_atype[i];
|
||||
assign core_bus_if[i].req_data.data = core_req_data[i];
|
||||
assign core_bus_if[i].req_data.tag = core_req_tag[i];
|
||||
assign core_req_ready[i] = core_bus_if[i].req_ready;
|
||||
|
@ -128,32 +125,29 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
|
||||
// Core response
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_valid[i]= core_bus_if[i].rsp_valid;
|
||||
assign core_rsp_valid[i] = core_bus_if[i].rsp_valid;
|
||||
assign core_rsp_data[i] = core_bus_if[i].rsp_data.data;
|
||||
assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag;
|
||||
assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag;
|
||||
assign core_bus_if[i].rsp_ready = core_rsp_ready[i];
|
||||
end
|
||||
|
||||
// Memory request
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin
|
||||
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
|
||||
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
|
||||
assign mem_req_byteen[i]= mem_bus_if[i].req_data.byteen;
|
||||
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
|
||||
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
|
||||
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
|
||||
assign mem_bus_if[i].req_ready = mem_req_ready[i];
|
||||
end
|
||||
assign mem_req_valid = mem_bus_if.req_valid;
|
||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||
assign mem_req_byteen = mem_bus_if.req_data.byteen;
|
||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
||||
assign mem_req_data = mem_bus_if.req_data.data;
|
||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
||||
assign mem_bus_if.req_ready = mem_req_ready;
|
||||
`UNUSED_VAR (mem_bus_if.req_data.atype)
|
||||
|
||||
// Memory response
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin
|
||||
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
|
||||
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
|
||||
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
|
||||
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
|
||||
end
|
||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
||||
|
||||
VX_cache_wrap #(
|
||||
VX_cache #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
|
@ -161,7 +155,6 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
|
|
253
hw/rtl/cache/VX_cache_wrap.sv
vendored
253
hw/rtl/cache/VX_cache_wrap.sv
vendored
|
@ -21,26 +21,24 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Number of memory ports
|
||||
parameter MEM_PORTS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 4096,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
parameter NUM_WAYS = 1,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 16,
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 4,
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss reservation queue (MSHR) size
|
||||
parameter MSHR_SIZE = 16,
|
||||
parameter MSHR_SIZE = 8,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 4,
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
|
@ -53,18 +51,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
// Enable dirty bytes on writeback
|
||||
parameter DIRTY_BYTES = 0,
|
||||
|
||||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
|
@ -72,10 +64,10 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
parameter PASSTHRU = 0,
|
||||
|
||||
// Core response output buffer
|
||||
parameter CORE_OUT_BUF = 3,
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
|
||||
// Memory request output buffer
|
||||
parameter MEM_OUT_BUF = 3
|
||||
parameter MEM_OUT_BUF = 0
|
||||
) (
|
||||
|
||||
input wire clk,
|
||||
|
@ -87,16 +79,19 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
|
||||
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
|
||||
VX_mem_bus_if.master mem_bus_if
|
||||
);
|
||||
|
||||
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
|
||||
|
||||
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
|
||||
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, TAG_WIDTH);
|
||||
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
|
||||
localparam BYPASS_ENABLE = (NC_ENABLE || PASSTHRU);
|
||||
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
|
||||
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
|
||||
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
|
||||
|
||||
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
|
@ -106,21 +101,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
|
||||
) mem_bus_cache_if[MEM_PORTS]();
|
||||
) mem_bus_cache_if();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) mem_bus_tmp_if[MEM_PORTS]();
|
||||
if (NC_OR_BYPASS) begin
|
||||
|
||||
if (BYPASS_ENABLE) begin : g_bypass
|
||||
`RESET_RELAY (nc_bypass_reset, reset);
|
||||
|
||||
VX_cache_bypass #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
|
||||
.CACHE_ENABLE (!PASSTHRU),
|
||||
.PASSTHRU (PASSTHRU),
|
||||
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
|
||||
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
|
@ -130,6 +122,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
|
||||
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
|
||||
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
|
||||
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
|
||||
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
|
||||
|
@ -137,35 +130,51 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.MEM_OUT_BUF (MEM_OUT_BUF)
|
||||
) cache_bypass (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (nc_bypass_reset),
|
||||
|
||||
.core_bus_in_if (core_bus_if),
|
||||
.core_bus_out_if(core_bus_cache_if),
|
||||
|
||||
.mem_bus_in_if (mem_bus_cache_if),
|
||||
.mem_bus_out_if (mem_bus_tmp_if)
|
||||
.mem_bus_out_if (mem_bus_if)
|
||||
);
|
||||
|
||||
end else begin : g_no_bypass
|
||||
end else begin
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_tmp_if
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if[i], mem_bus_cache_if[i]);
|
||||
end
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
if (WRITE_ENABLE) begin : g_we
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end else begin : g_ro
|
||||
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
|
||||
end
|
||||
end
|
||||
if (PASSTHRU != 0) begin
|
||||
|
||||
if (PASSTHRU == 0) begin : g_cache
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
|
||||
`UNUSED_VAR (core_bus_cache_if[i].req_data)
|
||||
assign core_bus_cache_if[i].req_ready = 0;
|
||||
|
||||
assign core_bus_cache_if[i].rsp_valid = 0;
|
||||
assign core_bus_cache_if[i].rsp_data = '0;
|
||||
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
|
||||
end
|
||||
|
||||
assign mem_bus_cache_if.req_valid = 0;
|
||||
assign mem_bus_cache_if.req_data = '0;
|
||||
`UNUSED_VAR (mem_bus_cache_if.req_ready)
|
||||
|
||||
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
|
||||
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
|
||||
assign mem_bus_cache_if.rsp_ready = 0;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign cache_perf = '0;
|
||||
`endif
|
||||
|
||||
end else begin
|
||||
|
||||
`RESET_RELAY (cache_reset, reset);
|
||||
|
||||
VX_cache #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
|
@ -175,23 +184,20 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MEM_PORTS (MEM_PORTS),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
.REPL_POLICY (REPL_POLICY),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.FLAGS_WIDTH (FLAGS_WIDTH),
|
||||
.CORE_OUT_BUF (BYPASS_ENABLE ? 1 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF (BYPASS_ENABLE ? 1 : MEM_OUT_BUF)
|
||||
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
|
||||
) cache (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (cache_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (cache_perf),
|
||||
`endif
|
||||
|
@ -199,105 +205,64 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.mem_bus_if (mem_bus_cache_if)
|
||||
);
|
||||
|
||||
end else begin : g_passthru
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
|
||||
`UNUSED_VX_MEM_BUS_IF (core_bus_cache_if[i])
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_cache_if
|
||||
`INIT_VX_MEM_BUS_IF (mem_bus_cache_if[i])
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_req;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_req;
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
|
||||
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
|
||||
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
|
||||
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
|
||||
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
|
||||
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
|
||||
end
|
||||
|
||||
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_core_reads <= '0;
|
||||
perf_core_writes <= '0;
|
||||
perf_mem_stalls <= '0;
|
||||
perf_crsp_stalls <= '0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
assign cache_perf.reads = perf_core_reads;
|
||||
assign cache_perf.writes = perf_core_writes;
|
||||
assign cache_perf.read_misses = '0;
|
||||
assign cache_perf.write_misses = '0;
|
||||
assign cache_perf.bank_stalls = '0;
|
||||
assign cache_perf.mshr_stalls = '0;
|
||||
assign cache_perf.mem_stalls = perf_mem_stalls;
|
||||
assign cache_perf.crsp_stalls = perf_crsp_stalls;
|
||||
`endif
|
||||
|
||||
end
|
||||
|
||||
`ifdef DBG_TRACE_CACHE
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace_core
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
|
||||
wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid;
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign core_req_uuid = 0;
|
||||
assign core_rsp_uuid = 0;
|
||||
end
|
||||
|
||||
wire core_req_fire = core_bus_if[i].req_valid && core_bus_if[i].req_ready;
|
||||
wire core_rsp_fire = core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
|
||||
if (core_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
if (core_req_fire) begin
|
||||
if (core_bus_if[i].req_data.rw)
|
||||
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
|
||||
else
|
||||
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
|
||||
end
|
||||
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
|
||||
if (core_rsp_fire) begin
|
||||
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_trace_mem
|
||||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(2, ("%t: %s mem-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.uuid))
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s mem-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
|
||||
end
|
||||
end
|
||||
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(2, ("%t: %s mem-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
|
||||
end
|
||||
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
|
||||
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
|
||||
|
||||
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
|
||||
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign mem_req_uuid = 0;
|
||||
assign mem_rsp_uuid = 0;
|
||||
end
|
||||
|
||||
wire mem_req_fire = mem_bus_if.req_valid && mem_bus_if.req_ready;
|
||||
wire mem_rsp_fire = mem_bus_if.rsp_valid && mem_bus_if.rsp_ready;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_fire) begin
|
||||
if (mem_bus_if.req_data.rw)
|
||||
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
|
||||
else
|
||||
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
|
||||
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -71,19 +71,19 @@ module VX_alu_int #(
|
|||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_add_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
|
||||
assign add_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] + alu_in2_imm[i][31:0]));
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_sub_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [`XLEN:0] sub_in1 = {is_signed & alu_in1[i][`XLEN-1], alu_in1[i]};
|
||||
wire [`XLEN:0] sub_in2 = {is_signed & alu_in2_br[i][`XLEN-1], alu_in2_br[i]};
|
||||
assign sub_result[i] = sub_in1 - sub_in2;
|
||||
assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0]));
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_shr_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
|
||||
always @(*) begin
|
||||
case (alu_op[1:0])
|
||||
|
@ -102,7 +102,7 @@ module VX_alu_int #(
|
|||
assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_msc_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
case (alu_op[1:0])
|
||||
2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND
|
||||
|
@ -114,7 +114,7 @@ module VX_alu_int #(
|
|||
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_alu_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
|
||||
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
|
||||
always @(*) begin
|
||||
|
@ -141,9 +141,9 @@ module VX_alu_int #(
|
|||
|
||||
assign cbr_dest = add_result[0][1 +: `PC_BITS];
|
||||
|
||||
if (LANE_BITS != 0) begin : g_tid
|
||||
if (LANE_BITS != 0) begin
|
||||
assign tid = execute_if.data.tid[0 +: LANE_BITS];
|
||||
end else begin : g_tid_0
|
||||
end else begin
|
||||
assign tid = 0;
|
||||
end
|
||||
|
||||
|
@ -185,7 +185,7 @@ module VX_alu_int #(
|
|||
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i];
|
||||
end
|
||||
|
||||
|
@ -194,8 +194,8 @@ module VX_alu_int #(
|
|||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (br_enable) begin
|
||||
`TRACE(2, ("%t: %s branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid))
|
||||
`TRACE(1, ("%d: %s-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -68,7 +68,7 @@ module VX_alu_muldiv #(
|
|||
|
||||
wire mul_fire_in = mul_valid_in && mul_ready_in;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_tmp
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [`XLEN-1:0] mul_resultl, mul_resulth;
|
||||
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
|
||||
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
|
||||
|
@ -103,7 +103,7 @@ module VX_alu_muldiv #(
|
|||
wire [NUM_LANES-1:0][`XLEN:0] mul_in1;
|
||||
wire [NUM_LANES-1:0][`XLEN:0] mul_in2;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_in
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
|
||||
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
|
||||
end
|
||||
|
@ -149,7 +149,7 @@ module VX_alu_muldiv #(
|
|||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_multiplier
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
|
||||
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
|
||||
|
||||
|
@ -184,7 +184,7 @@ module VX_alu_muldiv #(
|
|||
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_out
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef XLEN_64
|
||||
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
|
||||
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
|
||||
|
@ -219,7 +219,7 @@ module VX_alu_muldiv #(
|
|||
wire [NUM_LANES-1:0][`XLEN-1:0] div_in1;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_in
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef XLEN_64
|
||||
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
|
||||
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
|
||||
|
@ -234,7 +234,7 @@ module VX_alu_muldiv #(
|
|||
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
|
||||
wire div_fire_in = div_valid_in && div_ready_in;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_in
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [`XLEN-1:0] div_quotient, div_remainder;
|
||||
always @(*) begin
|
||||
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
|
||||
|
@ -306,7 +306,7 @@ module VX_alu_muldiv #(
|
|||
|
||||
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_out
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef XLEN_64
|
||||
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
|
||||
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
|
||||
|
@ -324,8 +324,8 @@ module VX_alu_muldiv #(
|
|||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
|
||||
.ARBITER ("P"),
|
||||
.OUT_BUF (2)
|
||||
.ARBITER ("F"),
|
||||
.OUT_BUF (1)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -30,24 +30,20 @@ module VX_alu_unit #(
|
|||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
|
||||
localparam NUM_LANES = `NUM_ALU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
localparam PE_COUNT = 1 + `EXT_M_ENABLED;
|
||||
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
|
||||
localparam PE_IDX_INT = 0;
|
||||
localparam PE_IDX_MDV = PE_IDX_INT + `EXT_M_ENABLED;
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_execute_if[BLOCK_SIZE]();
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_commit_if[BLOCK_SIZE]();
|
||||
|
||||
VX_dispatch_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_BUF (PARTIAL_BW ? 3 : 0)
|
||||
.OUT_BUF (PARTIAL_BW ? 1 : 0)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -55,62 +51,103 @@ module VX_alu_unit #(
|
|||
.execute_if (per_block_execute_if)
|
||||
);
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_alus
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_commit_if[BLOCK_SIZE]();
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
|
||||
`RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1));
|
||||
|
||||
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) pe_execute_if[PE_COUNT]();
|
||||
) int_execute_if();
|
||||
|
||||
VX_commit_if#(
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) pe_commit_if[PE_COUNT]();
|
||||
) int_commit_if();
|
||||
|
||||
reg [`UP(PE_SEL_BITS)-1:0] pe_select;
|
||||
always @(*) begin
|
||||
pe_select = PE_IDX_INT;
|
||||
if (`EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV))
|
||||
pe_select = PE_IDX_MDV;
|
||||
end
|
||||
|
||||
VX_pe_switch #(
|
||||
.PE_COUNT (PE_COUNT),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (PARTIAL_BW ? 1 : 3)
|
||||
) pe_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.pe_sel (pe_select),
|
||||
.execute_in_if (per_block_execute_if[block_idx]),
|
||||
.commit_out_if (per_block_commit_if[block_idx]),
|
||||
.execute_out_if (pe_execute_if),
|
||||
.commit_in_if (pe_commit_if)
|
||||
);
|
||||
assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op;
|
||||
assign int_execute_if.data = per_block_execute_if[block_idx].data;
|
||||
|
||||
VX_alu_int #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-int%0d", INSTANCE_ID, block_idx))),
|
||||
.INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
|
||||
.BLOCK_IDX (block_idx),
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) alu_int (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.execute_if (pe_execute_if[PE_IDX_INT]),
|
||||
.reset (block_reset),
|
||||
.execute_if (int_execute_if),
|
||||
.branch_ctl_if (branch_ctl_if[block_idx]),
|
||||
.commit_if (pe_commit_if[PE_IDX_INT])
|
||||
.commit_if (int_commit_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_M_ENABLE
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) muldiv_execute_if();
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) muldiv_commit_if();
|
||||
|
||||
assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
|
||||
assign muldiv_execute_if.data = per_block_execute_if[block_idx].data;
|
||||
|
||||
VX_alu_muldiv #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-muldiv%0d", INSTANCE_ID, block_idx))),
|
||||
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) muldiv_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.execute_if (pe_execute_if[PE_IDX_MDV]),
|
||||
.commit_if (pe_commit_if[PE_IDX_MDV])
|
||||
.reset (block_reset),
|
||||
.execute_if (muldiv_execute_if),
|
||||
.commit_if (muldiv_commit_if)
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
assign per_block_execute_if[block_idx].ready =
|
||||
`ifdef EXT_M_ENABLE
|
||||
is_muldiv_op ? muldiv_execute_if.ready :
|
||||
`endif
|
||||
int_execute_if.ready;
|
||||
|
||||
// send response
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (RSP_ARB_SIZE),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.OUT_BUF (PARTIAL_BW ? 1 : 3),
|
||||
.ARBITER ("F")
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (block_reset),
|
||||
.valid_in ({
|
||||
`ifdef EXT_M_ENABLE
|
||||
muldiv_commit_if.valid,
|
||||
`endif
|
||||
int_commit_if.valid
|
||||
}),
|
||||
.ready_in ({
|
||||
`ifdef EXT_M_ENABLE
|
||||
muldiv_commit_if.ready,
|
||||
`endif
|
||||
int_commit_if.ready
|
||||
}),
|
||||
.data_in ({
|
||||
`ifdef EXT_M_ENABLE
|
||||
muldiv_commit_if.data,
|
||||
`endif
|
||||
int_commit_if.data
|
||||
}),
|
||||
.data_out (per_block_commit_if[block_idx].data),
|
||||
.valid_out (per_block_commit_if[block_idx].valid),
|
||||
.ready_out (per_block_commit_if[block_idx].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
end
|
||||
|
||||
VX_gather_unit #(
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_commit import VX_gpu_pkg::*; #(
|
||||
module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -41,26 +41,28 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
|
||||
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_arbs
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
|
||||
wire [`NUM_EX_UNITS-1:0] valid_in;
|
||||
wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
|
||||
wire [`NUM_EX_UNITS-1:0] ready_in;
|
||||
|
||||
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_data_in
|
||||
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
|
||||
assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid;
|
||||
assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data;
|
||||
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
|
||||
end
|
||||
|
||||
`RESET_RELAY (arb_reset, reset);
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (`NUM_EX_UNITS),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER ("P"),
|
||||
.ARBITER ("R"),
|
||||
.OUT_BUF (1)
|
||||
) commit_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (arb_reset),
|
||||
.valid_in (valid_in),
|
||||
.ready_in (ready_in),
|
||||
.data_in (data_in),
|
||||
|
@ -84,7 +86,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
|
||||
assign commit_fire_any = (| per_issue_commit_fire);
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_size
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
wire [COMMIT_SIZEW-1:0] count;
|
||||
`POP_COUNT(count, per_issue_commit_tmask[i]);
|
||||
assign commit_size[i] = count;
|
||||
|
@ -101,7 +103,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
.data_out ({commit_fire_any_r, commit_size_r})
|
||||
);
|
||||
|
||||
VX_reduce_tree #(
|
||||
VX_reduce #(
|
||||
.DATAW_IN (COMMIT_SIZEW),
|
||||
.DATAW_OUT (COMMIT_ALL_SIZEW),
|
||||
.N (`ISSUE_WIDTH),
|
||||
|
@ -160,7 +162,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
|
||||
// Writeback
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign writeback_if[i].valid = commit_arb_if[i].valid && commit_arb_if[i].data.wb;
|
||||
assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid;
|
||||
assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid);
|
||||
|
@ -174,15 +176,15 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_trace
|
||||
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_j
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
|
||||
always @(posedge clk) begin
|
||||
if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
|
||||
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}))
|
||||
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
|
||||
trace_ex_type(1, j);
|
||||
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop))
|
||||
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS)
|
||||
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid))
|
||||
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS);
|
||||
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid));
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
@ -65,37 +65,44 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
) lsu_mem_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
lmem_perf_t lmem_perf;
|
||||
coalescer_perf_t coalescer_perf;
|
||||
pipeline_perf_t pipeline_perf;
|
||||
sysmem_perf_t sysmem_perf_tmp;
|
||||
always @(*) begin
|
||||
sysmem_perf_tmp = sysmem_perf;
|
||||
sysmem_perf_tmp.lmem = lmem_perf;
|
||||
sysmem_perf_tmp.coalescer = coalescer_perf;
|
||||
end
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
VX_pipeline_perf_if pipeline_perf_if();
|
||||
|
||||
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
|
||||
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
|
||||
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
|
||||
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
|
||||
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (dcr_data_reset, reset);
|
||||
`RESET_RELAY (schedule_reset, reset);
|
||||
`RESET_RELAY (fetch_reset, reset);
|
||||
`RESET_RELAY (decode_reset, reset);
|
||||
`RESET_RELAY (issue_reset, reset);
|
||||
`RESET_RELAY (execute_reset, reset);
|
||||
`RESET_RELAY (commit_reset, reset);
|
||||
|
||||
base_dcrs_t base_dcrs;
|
||||
|
||||
VX_dcr_data dcr_data (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (dcr_data_reset),
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
.base_dcrs (base_dcrs)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (3);
|
||||
`SCOPE_IO_SWITCH (3)
|
||||
|
||||
VX_schedule #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-schedule", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)),
|
||||
.CORE_ID (CORE_ID)
|
||||
) schedule (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (schedule_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sched_perf (pipeline_perf.sched),
|
||||
.sched_perf (pipeline_perf_if.sched),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
@ -116,36 +123,36 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_fetch #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-fetch", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID))
|
||||
) fetch (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (fetch_reset),
|
||||
.icache_bus_if (icache_bus_if),
|
||||
.schedule_if (schedule_if),
|
||||
.fetch_if (fetch_if)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-decode", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID))
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (decode_reset),
|
||||
.fetch_if (fetch_if),
|
||||
.decode_if (decode_if),
|
||||
.decode_sched_if(decode_sched_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-issue", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID))
|
||||
) issue (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.issue_perf (pipeline_perf.issue),
|
||||
.issue_perf (pipeline_perf_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
|
@ -154,17 +161,17 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_execute #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-execute", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)),
|
||||
.CORE_ID (CORE_ID)
|
||||
) execute (
|
||||
`SCOPE_IO_BIND (2)
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (execute_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf_tmp),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
.mem_perf_if (mem_perf_tmp_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
|
@ -182,10 +189,10 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_commit #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-commit", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID))
|
||||
) commit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (commit_reset),
|
||||
|
||||
.commit_if (commit_if),
|
||||
|
||||
|
@ -195,19 +202,134 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.commit_sched_if(commit_sched_if)
|
||||
);
|
||||
|
||||
VX_mem_unit #(
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH)
|
||||
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef LMEM_ENABLE
|
||||
|
||||
`RESET_RELAY (lmem_unit_reset, reset);
|
||||
|
||||
VX_lmem_unit #(
|
||||
.INSTANCE_ID (INSTANCE_ID)
|
||||
) mem_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
) lmem_unit (
|
||||
.clk (clk),
|
||||
.reset (lmem_unit_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.lmem_perf (lmem_perf),
|
||||
.coalescer_perf(coalescer_perf),
|
||||
.cache_perf (mem_perf_tmp_if.lmem),
|
||||
`endif
|
||||
.lsu_mem_if (lsu_mem_if),
|
||||
.dcache_bus_if (dcache_bus_if)
|
||||
.lsu_mem_in_if (lsu_mem_if),
|
||||
.lsu_mem_out_if (lsu_dcache_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
`ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
|
||||
end
|
||||
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
.NUM_LANES (DCACHE_CHANNELS),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_coalesced_if();
|
||||
|
||||
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
|
||||
|
||||
`RESET_RELAY (mem_coalescer_reset, reset);
|
||||
|
||||
VX_mem_coalescer #(
|
||||
.INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)),
|
||||
.NUM_REQS (`NUM_LSU_LANES),
|
||||
.DATA_IN_SIZE (LSU_WORD_SIZE),
|
||||
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
|
||||
.ADDR_WIDTH (LSU_ADDR_WIDTH),
|
||||
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
|
||||
) mem_coalescer (
|
||||
.clk (clk),
|
||||
.reset (mem_coalescer_reset),
|
||||
|
||||
// Input request
|
||||
.in_req_valid (lsu_dcache_if[i].req_valid),
|
||||
.in_req_mask (lsu_dcache_if[i].req_data.mask),
|
||||
.in_req_rw (lsu_dcache_if[i].req_data.rw),
|
||||
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
|
||||
.in_req_addr (lsu_dcache_if[i].req_data.addr),
|
||||
.in_req_atype (lsu_dcache_if[i].req_data.atype),
|
||||
.in_req_data (lsu_dcache_if[i].req_data.data),
|
||||
.in_req_tag (lsu_dcache_if[i].req_data.tag),
|
||||
.in_req_ready (lsu_dcache_if[i].req_ready),
|
||||
|
||||
// Input response
|
||||
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
|
||||
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
|
||||
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
|
||||
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
|
||||
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
|
||||
|
||||
// Output request
|
||||
.out_req_valid (dcache_coalesced_if.req_valid),
|
||||
.out_req_mask (dcache_coalesced_if.req_data.mask),
|
||||
.out_req_rw (dcache_coalesced_if.req_data.rw),
|
||||
.out_req_byteen (dcache_coalesced_if.req_data.byteen),
|
||||
.out_req_addr (dcache_coalesced_if.req_data.addr),
|
||||
.out_req_atype (dcache_coalesced_if.req_data.atype),
|
||||
.out_req_data (dcache_coalesced_if.req_data.data),
|
||||
.out_req_tag (dcache_coalesced_if.req_data.tag),
|
||||
.out_req_ready (dcache_coalesced_if.req_ready),
|
||||
|
||||
// Output response
|
||||
.out_rsp_valid (dcache_coalesced_if.rsp_valid),
|
||||
.out_rsp_mask (dcache_coalesced_if.rsp_data.mask),
|
||||
.out_rsp_data (dcache_coalesced_if.rsp_data.data),
|
||||
.out_rsp_tag (dcache_coalesced_if.rsp_data.tag),
|
||||
.out_rsp_ready (dcache_coalesced_if.rsp_ready)
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
||||
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if, lsu_dcache_if[i]);
|
||||
|
||||
end
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
|
||||
|
||||
`RESET_RELAY (lsu_adapter_reset, reset);
|
||||
|
||||
VX_lsu_adapter #(
|
||||
.NUM_LANES (DCACHE_CHANNELS),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (0)
|
||||
) lsu_adapter (
|
||||
.clk (clk),
|
||||
.reset (lsu_adapter_reset),
|
||||
.lsu_mem_if (dcache_coalesced_if),
|
||||
.mem_bus_if (dcache_bus_tmp_if)
|
||||
);
|
||||
|
||||
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
|
||||
end
|
||||
|
||||
end
|
||||
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
|
||||
|
@ -231,8 +353,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
|
||||
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_perf_dcache
|
||||
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_j
|
||||
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
|
||||
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
|
||||
assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
|
||||
assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
|
||||
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;
|
||||
|
@ -278,11 +400,12 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
assign pipeline_perf.ifetches = perf_ifetches;
|
||||
assign pipeline_perf.loads = perf_loads;
|
||||
assign pipeline_perf.stores = perf_stores;
|
||||
assign pipeline_perf.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf_if.ifetches = perf_ifetches;
|
||||
assign pipeline_perf_if.loads = perf_loads;
|
||||
assign pipeline_perf_if.stores = perf_stores;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
|
||||
assign pipeline_perf_if.load_latency = perf_dcache_lat;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
|
||||
output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] dcache_req_flags,
|
||||
output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
|
||||
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
|
||||
|
@ -96,7 +96,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
|
||||
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
|
||||
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
|
||||
assign dcache_req_flags[i] = dcache_bus_if[i].req_data.flags;
|
||||
assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype;
|
||||
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
|
||||
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
|
||||
assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
|
||||
|
@ -119,7 +119,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
assign icache_req_data = icache_bus_if.req_data.data;
|
||||
assign icache_req_tag = icache_bus_if.req_data.tag;
|
||||
assign icache_bus_if.req_ready = icache_req_ready;
|
||||
`UNUSED_VAR (icache_bus_if.req_data.flags)
|
||||
`UNUSED_VAR (icache_bus_if.req_data.atype)
|
||||
|
||||
assign icache_bus_if.rsp_valid = icache_rsp_valid;
|
||||
assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
|
||||
|
@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
assign icache_rsp_ready = icache_bus_if.rsp_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
sysmem_perf_t mem_perf;
|
||||
assign mem_perf.icache = '0;
|
||||
assign mem_perf.dcache = '0;
|
||||
assign mem_perf.l2cache = '0;
|
||||
assign mem_perf.l3cache = '0;
|
||||
assign mem_perf.lmem = '0;
|
||||
assign mem_perf.mem = '0;
|
||||
VX_mem_perf_if mem_perf_if();
|
||||
assign mem_perf_if.icache = '0;
|
||||
assign mem_perf_if.dcache = '0;
|
||||
assign mem_perf_if.l2cache = '0;
|
||||
assign mem_perf_if.l3cache = '0;
|
||||
assign mem_perf_if.lmem = '0;
|
||||
assign mem_perf_if.mem = '0;
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
|
@ -144,7 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
VX_core #(
|
||||
.INSTANCE_ID (`SFORMATF(("core"))),
|
||||
.INSTANCE_ID ($sformatf("core")),
|
||||
.CORE_ID (CORE_ID)
|
||||
) core (
|
||||
`SCOPE_IO_BIND (0)
|
||||
|
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.mem_perf_if (mem_perf_if),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (dcr_bus_if),
|
||||
|
|
|
@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
|
|||
input base_dcrs_t base_dcrs,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
`endif
|
||||
|
||||
VX_commit_csr_if.slave commit_csr_if,
|
||||
|
@ -83,7 +83,7 @@ import VX_fpu_pkg::*;
|
|||
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
|
||||
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
|
||||
|
||||
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_write
|
||||
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
|
||||
assign fpu_write_enable[i] = fpu_csr_if[i].write_enable;
|
||||
assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
|
||||
assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
|
||||
|
@ -107,7 +107,7 @@ import VX_fpu_pkg::*;
|
|||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_csr_read_frm
|
||||
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
|
||||
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
|
||||
end
|
||||
|
||||
|
@ -155,41 +155,41 @@ import VX_fpu_pkg::*;
|
|||
|
||||
// CSRs read //////////////////////////////////////////////////////////////
|
||||
|
||||
reg [`XLEN-1:0] read_data_ro_w;
|
||||
reg [`XLEN-1:0] read_data_rw_w;
|
||||
reg read_addr_valid_w;
|
||||
reg [`XLEN-1:0] read_data_ro_r;
|
||||
reg [`XLEN-1:0] read_data_rw_r;
|
||||
reg read_addr_valid_r;
|
||||
|
||||
always @(*) begin
|
||||
read_data_ro_w = '0;
|
||||
read_data_rw_w = '0;
|
||||
read_addr_valid_w = 1;
|
||||
read_data_ro_r = '0;
|
||||
read_data_rw_r = '0;
|
||||
read_addr_valid_r = 1;
|
||||
case (read_addr)
|
||||
`VX_CSR_MVENDORID : read_data_ro_w = `XLEN'(`VENDOR_ID);
|
||||
`VX_CSR_MARCHID : read_data_ro_w = `XLEN'(`ARCHITECTURE_ID);
|
||||
`VX_CSR_MIMPID : read_data_ro_w = `XLEN'(`IMPLEMENTATION_ID);
|
||||
`VX_CSR_MISA : read_data_ro_w = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
|
||||
`VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
|
||||
`VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
|
||||
`VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
|
||||
`VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
|
||||
`ifdef EXT_F_ENABLE
|
||||
`VX_CSR_FFLAGS : read_data_rw_w = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
|
||||
`VX_CSR_FRM : read_data_rw_w = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
|
||||
`VX_CSR_FCSR : read_data_rw_w = `XLEN'(fcsr[read_wid]);
|
||||
`VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
|
||||
`VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
|
||||
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
|
||||
`endif
|
||||
`VX_CSR_MSCRATCH : read_data_rw_w = mscratch;
|
||||
`VX_CSR_MSCRATCH : read_data_rw_r = mscratch;
|
||||
|
||||
`VX_CSR_WARP_ID : read_data_ro_w = `XLEN'(read_wid);
|
||||
`VX_CSR_CORE_ID : read_data_ro_w = `XLEN'(CORE_ID);
|
||||
`VX_CSR_ACTIVE_THREADS: read_data_ro_w = `XLEN'(thread_masks[read_wid]);
|
||||
`VX_CSR_ACTIVE_WARPS: read_data_ro_w = `XLEN'(active_warps);
|
||||
`VX_CSR_NUM_THREADS: read_data_ro_w = `XLEN'(`NUM_THREADS);
|
||||
`VX_CSR_NUM_WARPS : read_data_ro_w = `XLEN'(`NUM_WARPS);
|
||||
`VX_CSR_NUM_CORES : read_data_ro_w = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
|
||||
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_w = `XLEN'(`LMEM_BASE_ADDR);
|
||||
`VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
|
||||
`VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
|
||||
`VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
|
||||
`VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps);
|
||||
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
|
||||
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
|
||||
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
|
||||
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR);
|
||||
|
||||
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_w, cycles);
|
||||
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
|
||||
|
||||
`VX_CSR_MPM_RESERVED : read_data_ro_w = 'x;
|
||||
`VX_CSR_MPM_RESERVED_H : read_data_ro_w = 'x;
|
||||
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
|
||||
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
|
||||
|
||||
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_w, commit_csr_if.instret);
|
||||
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
|
||||
|
||||
`VX_CSR_SATP,
|
||||
`VX_CSR_MSTATUS,
|
||||
|
@ -200,79 +200,77 @@ import VX_fpu_pkg::*;
|
|||
`VX_CSR_MTVEC,
|
||||
`VX_CSR_MEPC,
|
||||
`VX_CSR_PMPCFG0,
|
||||
`VX_CSR_PMPADDR0 : read_data_ro_w = `XLEN'(0);
|
||||
`VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0);
|
||||
|
||||
default: begin
|
||||
read_addr_valid_w = 0;
|
||||
read_addr_valid_r = 0;
|
||||
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|
||||
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
|
||||
read_addr_valid_w = 1;
|
||||
read_addr_valid_r = 1;
|
||||
`ifdef PERF_ENABLE
|
||||
case (base_dcrs.mpm_class)
|
||||
`VX_DCR_MPM_CLASS_CORE: begin
|
||||
case (read_addr)
|
||||
// PERF: pipeline
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
|
||||
`else
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0));
|
||||
`endif
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`VX_DCR_MPM_CLASS_MEM: begin
|
||||
case (read_addr)
|
||||
// PERF: icache
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls);
|
||||
// PERF: dcache
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
|
||||
// PERF: lmem
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
|
||||
// PERF: l2cache
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
|
||||
// PERF: l3cache
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
|
||||
// PERF: coalescer
|
||||
`CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes);
|
||||
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
@ -284,16 +282,16 @@ import VX_fpu_pkg::*;
|
|||
endcase
|
||||
end
|
||||
|
||||
assign read_data_ro = read_data_ro_w;
|
||||
assign read_data_rw = read_data_rw_w;
|
||||
assign read_data_ro = read_data_ro_r;
|
||||
assign read_data_rw = read_data_rw_r;
|
||||
|
||||
`UNUSED_VAR (base_dcrs)
|
||||
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`UNUSED_VAR (sysmem_perf.icache);
|
||||
`UNUSED_VAR (sysmem_perf.lmem);
|
||||
`UNUSED_VAR (mem_perf_if.icache);
|
||||
`UNUSED_VAR (mem_perf_if.lmem);
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
input base_dcrs_t base_dcrs,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -66,7 +66,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
|
||||
`UNUSED_VAR (rs1_data)
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_rs1_data
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign rs1_data[i] = execute_if.data.rs1_data[i];
|
||||
end
|
||||
|
||||
|
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
.base_dcrs (base_dcrs),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
.commit_csr_if (commit_csr_if),
|
||||
|
@ -113,15 +113,12 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_wtid
|
||||
if (PID_BITS != 0) begin : g_pid
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
if (PID_BITS != 0) begin
|
||||
assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i);
|
||||
end else begin : g_no_pid
|
||||
end else begin
|
||||
assign wtid[i] = `XLEN'(i);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_gtid
|
||||
assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
|
||||
end
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_dcr_data import VX_gpu_pkg::*; (
|
||||
module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
|
@ -50,9 +50,9 @@ module VX_dcr_data import VX_gpu_pkg::*; (
|
|||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (dcr_bus_if.write_valid) begin
|
||||
`TRACE(2, ("%t: base-dcr: state=", $time))
|
||||
`TRACE(1, ("%d: base-dcr: state=", $time));
|
||||
trace_base_dcr(1, dcr_bus_if.write_addr);
|
||||
`TRACE(2, (", data=0x%h\n", dcr_bus_if.write_data))
|
||||
`TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -15,19 +15,19 @@
|
|||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define USED_IREG(x) \
|
||||
x``_v = {1'b0, ``x}; \
|
||||
x``_r = {1'b0, ``x}; \
|
||||
use_``x = 1
|
||||
|
||||
`define USED_FREG(x) \
|
||||
x``_v = {1'b1, ``x}; \
|
||||
x``_r = {1'b1, ``x}; \
|
||||
use_``x = 1
|
||||
`else
|
||||
`define USED_IREG(x) \
|
||||
x``_v = ``x; \
|
||||
x``_r = ``x; \
|
||||
use_``x = 1
|
||||
`endif
|
||||
|
||||
module VX_decode import VX_gpu_pkg::*; #(
|
||||
module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -50,7 +50,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
reg [`EX_BITS-1:0] ex_type;
|
||||
reg [`INST_OP_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v;
|
||||
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
|
||||
reg use_rd, use_rs1, use_rs2, use_rs3;
|
||||
reg is_wstall;
|
||||
|
||||
|
@ -152,13 +152,13 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
|
||||
ex_type = 'x;
|
||||
ex_type = '0;
|
||||
op_type = 'x;
|
||||
op_args = 'x;
|
||||
rd_v = '0;
|
||||
rs1_v = '0;
|
||||
rs2_v = '0;
|
||||
rs3_v = '0;
|
||||
rd_r = '0;
|
||||
rs1_r = '0;
|
||||
rs2_r = '0;
|
||||
rs3_r = '0;
|
||||
use_rd = 0;
|
||||
use_rs1 = 0;
|
||||
use_rs2 = 0;
|
||||
|
@ -376,16 +376,14 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_IREG (rs2);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`INST_FMADD, // 7'b1000011
|
||||
`INST_FMSUB, // 7'b1000111
|
||||
`INST_FNMSUB, // 7'b1001011
|
||||
`INST_FNMADD: // 7'b1001111
|
||||
begin
|
||||
`INST_FMADD,
|
||||
`INST_FMSUB,
|
||||
`INST_FNMSUB,
|
||||
`INST_FNMADD: begin
|
||||
ex_type = `EX_FPU;
|
||||
op_type = `INST_OP_BITS'({2'b00, 1'b1, opcode[3]});
|
||||
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
|
||||
op_args.fpu.frm = func3;
|
||||
op_args.fpu.fmt[0] = func2[0]; // float / double
|
||||
op_args.fpu.fmt[1] = opcode[3] ^ opcode[2]; // SUB
|
||||
use_rd = 1;
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
|
@ -401,10 +399,9 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
case (func5)
|
||||
5'b00000, // FADD
|
||||
5'b00001, // FSUB
|
||||
5'b00010: // FMUL
|
||||
begin
|
||||
op_type = `INST_OP_BITS'({2'b00, 1'b0, func5[1]});
|
||||
op_args.fpu.fmt[1] = func5[0]; // SUB
|
||||
5'b00010, // FMUL
|
||||
5'b00011: begin // FDIV
|
||||
op_type = `INST_OP_BITS'(func5[1:0]);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
|
@ -433,13 +430,6 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_FREG (rs1);
|
||||
end
|
||||
`endif
|
||||
5'b00011: begin
|
||||
// FDIV
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_DIV);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
5'b01011: begin
|
||||
// FSQRT
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
|
||||
|
@ -537,7 +527,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
// disable write to integer register r0
|
||||
wire wb = use_rd && (rd_v != 0);
|
||||
wire wb = use_rd && (rd_r != 0);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
|
@ -547,7 +537,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
.valid_in (fetch_if.valid),
|
||||
.ready_in (fetch_if.ready),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v}),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
|
||||
.valid_out (decode_if.valid),
|
||||
.ready_out (decode_if.ready)
|
||||
|
@ -557,10 +547,9 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
|
||||
wire fetch_fire = fetch_if.valid && fetch_if.ready;
|
||||
|
||||
assign decode_sched_if.valid = fetch_fire;
|
||||
assign decode_sched_if.wid = fetch_if.data.wid;
|
||||
assign decode_sched_if.unlock = ~is_wstall;
|
||||
|
||||
assign decode_sched_if.valid = fetch_fire;
|
||||
assign decode_sched_if.wid = fetch_if.data.wid;
|
||||
assign decode_sched_if.is_wstall = is_wstall;
|
||||
`ifndef L1_ENABLE
|
||||
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
|
||||
`endif
|
||||
|
@ -568,14 +557,14 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr))
|
||||
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
|
||||
trace_ex_type(1, decode_if.data.ex_type);
|
||||
`TRACE(1, (", op="))
|
||||
`TRACE(1, (", op="));
|
||||
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
|
||||
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b",
|
||||
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3))
|
||||
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3));
|
||||
trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
|
||||
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid))
|
||||
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -33,7 +33,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
|
||||
|
||||
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_tids
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign tids[i] = `NT_WIDTH'(i);
|
||||
end
|
||||
|
||||
|
@ -50,19 +50,23 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
wire [`NUM_EX_UNITS-1:0] operands_ready_in;
|
||||
assign operands_if.ready = operands_ready_in[operands_if.data.ex_type];
|
||||
wire [`NUM_EX_UNITS-1:0] operands_reset;
|
||||
assign operands_if.ready = operands_reset[operands_if.data.ex_type];
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
|
||||
`RESET_RELAY (buffer_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_buffers
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (2), // 2-cycle EB for area reduction
|
||||
.LUTRAM (1)
|
||||
) buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (buffer_reset),
|
||||
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
|
||||
.ready_in (operands_ready_in[i]),
|
||||
.ready_in (operands_reset[i]),
|
||||
.data_in ({
|
||||
operands_if.data.uuid,
|
||||
operands_if.data.wis,
|
||||
|
@ -88,7 +92,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
|
||||
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_stalls
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls_r[i] <= '0;
|
||||
|
|
|
@ -49,12 +49,13 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
|
||||
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_dispatch_data
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign dispatch_valid[i] = dispatch_if[i].valid;
|
||||
assign dispatch_data[i] = dispatch_if[i].data;
|
||||
assign dispatch_if[i].ready = dispatch_ready[i];
|
||||
end
|
||||
|
||||
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
|
||||
wire [BLOCK_SIZE-1:0] block_ready;
|
||||
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
|
||||
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
|
||||
|
@ -65,53 +66,30 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
wire batch_done = (& block_done);
|
||||
|
||||
// batch select logic
|
||||
|
||||
logic [BATCH_COUNT_W-1:0] batch_idx;
|
||||
|
||||
if (BATCH_COUNT != 1) begin : g_batch_idx
|
||||
wire [BATCH_COUNT_W-1:0] batch_idx_n;
|
||||
wire [BATCH_COUNT-1:0] valid_batches;
|
||||
for (genvar i = 0; i < BATCH_COUNT; ++i) begin : g_valid_batches
|
||||
assign valid_batches[i] = | dispatch_valid[i * BLOCK_SIZE +: BLOCK_SIZE];
|
||||
end
|
||||
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (BATCH_COUNT),
|
||||
.TYPE ("P")
|
||||
) batch_sel (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.requests (valid_batches),
|
||||
.grant_index (batch_idx_n),
|
||||
`UNUSED_PIN (grant_onehot),
|
||||
`UNUSED_PIN (grant_valid),
|
||||
.grant_ready (batch_done)
|
||||
);
|
||||
|
||||
if (BATCH_COUNT != 1) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
batch_idx <= '0;
|
||||
end else if (batch_done) begin
|
||||
batch_idx <= batch_idx_n;
|
||||
end else begin
|
||||
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
|
||||
end
|
||||
end
|
||||
end else begin : g_batch_idx_0
|
||||
end else begin
|
||||
assign batch_idx = 0;
|
||||
`UNUSED_VAR (batch_done)
|
||||
end
|
||||
|
||||
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_issue_indices
|
||||
assign issue_indices[block_idx] = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
|
||||
end
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_blocks
|
||||
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
|
||||
assign issue_indices[block_idx] = issue_idx;
|
||||
|
||||
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
|
||||
|
||||
wire [ISSUE_W-1:0] issue_idx = issue_indices[block_idx];
|
||||
wire valid_p, ready_p;
|
||||
|
||||
if (`NUM_THREADS > NUM_LANES) begin : g_partial_threads
|
||||
if (`NUM_THREADS != NUM_LANES) begin
|
||||
reg [NUM_PACKETS-1:0] sent_mask_p;
|
||||
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
|
||||
wire dispatch_valid_r;
|
||||
|
@ -124,7 +102,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
wire fire_eop = fire_p && is_last_p;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
if (block_reset) begin
|
||||
sent_mask_p <= '0;
|
||||
is_first_p <= 1;
|
||||
end else begin
|
||||
|
@ -146,8 +124,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
|
||||
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_per_packet_data
|
||||
for (genvar j = 0; j < NUM_LANES; ++j) begin : g_j
|
||||
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
|
||||
for (genvar j = 0; j < NUM_LANES; ++j) begin
|
||||
localparam k = i * NUM_LANES + j;
|
||||
assign per_packet_tmask[i][j] = dispatch_tmask[k];
|
||||
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
|
||||
|
@ -157,12 +135,10 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
wire [NUM_PACKETS-1:0] packet_valids;
|
||||
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_valids
|
||||
assign packet_valids[i] = (| per_packet_tmask[i]);
|
||||
end
|
||||
|
||||
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
|
||||
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_ids
|
||||
|
||||
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
|
||||
assign packet_valids[i] = (| per_packet_tmask[i]);
|
||||
assign packet_ids[i] = PID_WIDTH'(i);
|
||||
end
|
||||
|
||||
|
@ -211,13 +187,13 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
assign block_pid[block_idx] = start_p;
|
||||
assign block_sop[block_idx] = is_first_p;
|
||||
assign block_eop[block_idx] = is_last_p;
|
||||
if (FANOUT_ENABLE) begin : g_block_ready_fanout
|
||||
if (FANOUT_ENABLE) begin
|
||||
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
|
||||
end else begin : g_block_ready
|
||||
end else begin
|
||||
assign block_ready[block_idx] = ready_p && block_enable;
|
||||
end
|
||||
assign block_done[block_idx] = fire_eop || ~dispatch_valid[issue_idx];
|
||||
end else begin : g_full_threads
|
||||
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
|
||||
end else begin
|
||||
assign valid_p = dispatch_valid[issue_idx];
|
||||
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
|
||||
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
|
@ -227,31 +203,29 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
assign block_sop[block_idx] = 1'b1;
|
||||
assign block_eop[block_idx] = 1'b1;
|
||||
assign block_ready[block_idx] = ready_p;
|
||||
assign block_done[block_idx] = ready_p || ~valid_p;
|
||||
assign block_done[block_idx] = ~valid_p || ready_p;
|
||||
end
|
||||
|
||||
wire [ISSUE_ISW_W-1:0] isw;
|
||||
if (BATCH_COUNT != 1) begin : g_isw_batch
|
||||
if (BLOCK_SIZE != 1) begin : g_block
|
||||
if (BATCH_COUNT != 1) begin
|
||||
if (BLOCK_SIZE != 1) begin
|
||||
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
|
||||
end else begin : g_no_block
|
||||
end else begin
|
||||
assign isw = batch_idx;
|
||||
end
|
||||
end else begin : g_isw
|
||||
end else begin
|
||||
assign isw = block_idx;
|
||||
end
|
||||
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
|
||||
|
||||
logic [OUT_DATAW-1:0] execute_data, execute_data_w;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (OUT_DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
) buf_out (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (block_reset),
|
||||
.valid_in (valid_p),
|
||||
.ready_in (ready_p),
|
||||
.data_in ({
|
||||
|
@ -265,27 +239,17 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
block_pid[block_idx],
|
||||
block_sop[block_idx],
|
||||
block_eop[block_idx]}),
|
||||
.data_out (execute_data),
|
||||
.data_out (execute_if[block_idx].data),
|
||||
.valid_out (execute_if[block_idx].valid),
|
||||
.ready_out (execute_if[block_idx].ready)
|
||||
);
|
||||
|
||||
if (`NUM_THREADS != NUM_LANES) begin : g_execute_data_w_partial
|
||||
assign execute_data_w = execute_data;
|
||||
end else begin : g_execute_data_w_full
|
||||
always @(*) begin
|
||||
execute_data_w = execute_data;
|
||||
execute_data_w[2:0] = {1'b0, 1'b1, 1'b1}; // default pid, sop, and eop
|
||||
end
|
||||
end
|
||||
assign execute_if[block_idx].data = execute_data_w;
|
||||
end
|
||||
|
||||
reg [`ISSUE_WIDTH-1:0] ready_in;
|
||||
always @(*) begin
|
||||
ready_in = 0;
|
||||
for (integer block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
ready_in[issue_indices[block_idx]] = block_ready[block_idx] && block_eop[block_idx];
|
||||
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
|
||||
end
|
||||
end
|
||||
assign dispatch_ready = ready_in;
|
||||
|
|
|
@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
`endif
|
||||
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
@ -51,35 +51,41 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
VX_fpu_csr_if fpu_csr_if[`NUM_FPU_BLOCKS]();
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (alu_reset, reset);
|
||||
`RESET_RELAY (lsu_reset, reset);
|
||||
`RESET_RELAY (sfu_reset, reset);
|
||||
|
||||
VX_alu_unit #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-alu", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID))
|
||||
) alu_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (alu_reset),
|
||||
.dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.branch_ctl_if (branch_ctl_if)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
`SCOPE_IO_SWITCH (1)
|
||||
|
||||
VX_lsu_unit #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-lsu", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID))
|
||||
) lsu_unit (
|
||||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (lsu_reset),
|
||||
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.lsu_mem_if (lsu_mem_if)
|
||||
);
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`RESET_RELAY (fpu_reset, reset);
|
||||
|
||||
VX_fpu_unit #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-fpu", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID))
|
||||
) fpu_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (fpu_reset),
|
||||
.dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.fpu_csr_if (fpu_csr_if)
|
||||
|
@ -87,14 +93,14 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
VX_sfu_unit #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-sfu", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)),
|
||||
.CORE_ID (CORE_ID)
|
||||
) sfu_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (sfu_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if (pipeline_perf_if),
|
||||
`endif
|
||||
.base_dcrs (base_dcrs),
|
||||
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
|
|
|
@ -51,9 +51,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (`PC_BITS + `NUM_THREADS),
|
||||
.SIZE (`NUM_WARPS),
|
||||
.RDW_MODE ("R"),
|
||||
.DATAW (`PC_BITS + `NUM_THREADS),
|
||||
.SIZE (`NUM_WARPS),
|
||||
.LUTRAM (1)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
|
@ -72,7 +71,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests.
|
||||
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
|
||||
wire [`NUM_WARPS-1:0] pending_ibuf_full;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_reads
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
VX_pending_size #(
|
||||
.SIZE (`IBUF_SIZE)
|
||||
) pending_reads (
|
||||
|
@ -117,9 +116,9 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
.ready_out (icache_bus_if.req_ready)
|
||||
);
|
||||
|
||||
assign icache_bus_if.req_data.flags = '0;
|
||||
assign icache_bus_if.req_data.atype = '0;
|
||||
assign icache_bus_if.req_data.rw = 0;
|
||||
assign icache_bus_if.req_data.byteen = '1;
|
||||
assign icache_bus_if.req_data.byteen = 4'b1111;
|
||||
assign icache_bus_if.req_data.data = '0;
|
||||
|
||||
// Icache Response
|
||||
|
@ -132,59 +131,47 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
assign fetch_if.data.uuid = rsp_uuid;
|
||||
assign icache_bus_if.rsp_ready = fetch_if.ready;
|
||||
|
||||
`ifdef SCOPE
|
||||
`ifdef DBG_SCOPE_FETCH
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
wire schedule_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 1, 6, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
|
||||
`UUID_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
|
||||
`UUID_WIDTH + (ICACHE_WORD_SIZE * 8)
|
||||
), {
|
||||
schedule_if.valid,
|
||||
schedule_if.ready,
|
||||
icache_bus_if.req_valid,
|
||||
icache_bus_if.req_ready,
|
||||
icache_bus_if.rsp_valid,
|
||||
icache_bus_if.rsp_ready
|
||||
}, {
|
||||
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
VX_scope_tap #(
|
||||
.SCOPE_ID (1),
|
||||
.TRIGGERW (4),
|
||||
.PROBEW (`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
|
||||
ICACHE_TAG_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
|
||||
(ICACHE_WORD_SIZE*8) + ICACHE_TAG_WIDTH)
|
||||
) scope_tap (
|
||||
.clk (clk),
|
||||
.reset (scope_reset),
|
||||
.start (1'b0),
|
||||
.stop (1'b0),
|
||||
.triggers ({
|
||||
reset,
|
||||
schedule_fire,
|
||||
icache_bus_req_fire,
|
||||
icache_bus_rsp_fire
|
||||
},{
|
||||
icache_req_fire,
|
||||
icache_rsp_fire
|
||||
}),
|
||||
.probes ({
|
||||
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
|
||||
icache_bus_if.req_data.tag.uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
|
||||
icache_bus_if.rsp_data.tag.uuid, icache_bus_if.rsp_data.data
|
||||
},
|
||||
reset_negedge, 1'b0, 4096
|
||||
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
|
||||
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
|
||||
}),
|
||||
.bus_in (scope_bus_in),
|
||||
.bus_out (scope_bus_out)
|
||||
);
|
||||
`else
|
||||
`SCOPE_IO_UNUSED(0)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef CHIPSCOPE
|
||||
`ifdef DBG_SCOPE_FETCH
|
||||
ila_fetch ila_fetch_inst (
|
||||
.clk (clk),
|
||||
.probe0 ({schedule_if.valid, schedule_if.data, schedule_if.ready}),
|
||||
.probe1 ({icache_bus_if.req_valid, icache_bus_if.req_data, icache_bus_if.req_ready}),
|
||||
.probe2 ({icache_bus_if.rsp_valid, icache_bus_if.rsp_data, icache_bus_if.rsp_ready})
|
||||
);
|
||||
`endif
|
||||
`SCOPE_IO_UNUSED()
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
wire schedule_fire = schedule_if.valid && schedule_if.ready;
|
||||
wire fetch_fire = fetch_if.valid && fetch_if.ready;
|
||||
always @(posedge clk) begin
|
||||
if (schedule_if.valid && schedule_if.ready) begin
|
||||
`TRACE(1, ("%t: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid))
|
||||
if (schedule_fire) begin
|
||||
`TRACE(1, ("%d: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
|
||||
end
|
||||
if (fetch_if.valid && fetch_if.ready) begin
|
||||
`TRACE(1, ("%t: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid))
|
||||
if (fetch_fire) begin
|
||||
`TRACE(1, ("%d: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -41,7 +41,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
VX_dispatch_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_BUF (PARTIAL_BW ? 3 : 0)
|
||||
.OUT_BUF (PARTIAL_BW ? 1 : 0)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -53,10 +53,12 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_commit_if[BLOCK_SIZE]();
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_fpus
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
|
||||
`UNUSED_VAR (per_block_execute_if[block_idx].data.tid)
|
||||
`UNUSED_VAR (per_block_execute_if[block_idx].data.wb)
|
||||
|
||||
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
|
||||
|
||||
// Store request info
|
||||
wire fpu_req_valid, fpu_req_ready;
|
||||
wire fpu_rsp_valid, fpu_rsp_ready;
|
||||
|
@ -69,9 +71,9 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0] fpu_rsp_tmask;
|
||||
wire [`PC_BITS-1:0] fpu_rsp_PC;
|
||||
wire [`NR_BITS-1:0] fpu_rsp_rd;
|
||||
wire [PID_WIDTH-1:0] fpu_rsp_pid, fpu_rsp_pid_u;
|
||||
wire fpu_rsp_sop, fpu_rsp_sop_u;
|
||||
wire fpu_rsp_eop, fpu_rsp_eop_u;
|
||||
wire [PID_WIDTH-1:0] fpu_rsp_pid;
|
||||
wire fpu_rsp_sop;
|
||||
wire fpu_rsp_eop;
|
||||
|
||||
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
|
||||
wire mdata_full;
|
||||
|
@ -87,30 +89,17 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
.SIZE (`FPUQ_SIZE)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (block_reset),
|
||||
.acquire_en (execute_fire),
|
||||
.write_addr (fpu_req_tag),
|
||||
.write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
|
||||
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid_u, fpu_rsp_sop_u, fpu_rsp_eop_u}),
|
||||
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
|
||||
.read_addr (fpu_rsp_tag),
|
||||
.release_en (fpu_rsp_fire),
|
||||
.full (mdata_full),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
|
||||
if (PID_BITS != 0) begin : g_fpu_rsp_pid
|
||||
assign fpu_rsp_pid = fpu_rsp_pid_u;
|
||||
assign fpu_rsp_sop = fpu_rsp_sop_u;
|
||||
assign fpu_rsp_eop = fpu_rsp_eop_u;
|
||||
end else begin : g_no_fpu_rsp_pid
|
||||
`UNUSED_VAR (fpu_rsp_pid_u)
|
||||
`UNUSED_VAR (fpu_rsp_sop_u)
|
||||
`UNUSED_VAR (fpu_rsp_eop_u)
|
||||
assign fpu_rsp_pid = 0;
|
||||
assign fpu_rsp_sop = 1;
|
||||
assign fpu_rsp_eop = 1;
|
||||
end
|
||||
|
||||
// resolve dynamic FRM from CSR
|
||||
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
|
||||
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
|
@ -130,7 +119,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
.OUT_BUF (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_dpi (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (block_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.mask_in (per_block_execute_if[block_idx].data.tmask),
|
||||
|
@ -159,7 +148,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
.OUT_BUF (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_fpnew (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (block_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.mask_in (per_block_execute_if[block_idx].data.tmask),
|
||||
|
@ -188,7 +177,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
.OUT_BUF (PARTIAL_BW ? 1 : 3)
|
||||
) fpu_dsp (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (block_reset),
|
||||
|
||||
.valid_in (fpu_req_valid),
|
||||
.mask_in (per_block_execute_if[block_idx].data.tmask),
|
||||
|
@ -211,38 +200,27 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
`endif
|
||||
|
||||
// handle CSR update
|
||||
// handle FPU response
|
||||
|
||||
fflags_t fpu_rsp_fflags_q;
|
||||
|
||||
if (PID_BITS != 0) begin : g_pid
|
||||
if (PID_BITS != 0) begin
|
||||
fflags_t fpu_rsp_fflags_r;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
if (block_reset) begin
|
||||
fpu_rsp_fflags_r <= '0;
|
||||
end else if (fpu_rsp_fire) begin
|
||||
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
|
||||
end
|
||||
end
|
||||
assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
|
||||
end else begin : g_no_pid
|
||||
end else begin
|
||||
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
|
||||
end
|
||||
|
||||
VX_fpu_csr_if fpu_csr_tmp_if();
|
||||
assign fpu_csr_tmp_if.write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
|
||||
`ASSIGN_BLOCKED_WID (fpu_csr_tmp_if.write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
assign fpu_csr_tmp_if.write_fflags = fpu_rsp_fflags_q;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_WIDTH + $bits(fflags_t)),
|
||||
.RESETW (1)
|
||||
) fpu_csr_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({fpu_csr_tmp_if.write_enable, fpu_csr_tmp_if.write_wid, fpu_csr_tmp_if.write_fflags}),
|
||||
.data_out ({fpu_csr_if[block_idx].write_enable, fpu_csr_if[block_idx].write_wid, fpu_csr_if[block_idx].write_fflags})
|
||||
);
|
||||
assign fpu_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
|
||||
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
assign fpu_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
|
||||
|
||||
// send response
|
||||
|
||||
|
@ -251,7 +229,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
.SIZE (0)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (block_reset),
|
||||
.valid_in (fpu_rsp_valid),
|
||||
.ready_in (fpu_rsp_ready),
|
||||
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
|
||||
|
|
|
@ -41,17 +41,17 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||
wire [BLOCK_SIZE-1:0] commit_in_ready;
|
||||
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
|
||||
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
assign commit_in_valid[i] = commit_in_if[i].valid;
|
||||
assign commit_in_data[i] = commit_in_if[i].data;
|
||||
assign commit_in_if[i].ready = commit_in_ready[i];
|
||||
if (BLOCK_SIZE != `ISSUE_WIDTH) begin : g_commit_in_isw_partial
|
||||
if (BLOCK_SIZE != 1) begin : g_block
|
||||
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
|
||||
if (BLOCK_SIZE != 1) begin
|
||||
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
|
||||
end else begin : g_no_block
|
||||
end else begin
|
||||
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
|
||||
end
|
||||
end else begin : g_commit_in_isw_full
|
||||
end else begin
|
||||
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
|
||||
end
|
||||
end
|
||||
|
@ -70,12 +70,11 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in_ready
|
||||
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
|
||||
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin: g_out_bufs
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_tmp_if();
|
||||
|
@ -95,31 +94,31 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||
.ready_out (commit_tmp_if.ready)
|
||||
);
|
||||
|
||||
logic [`NUM_THREADS-1:0] commit_tmask_w;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_w;
|
||||
if (PID_BITS != 0) begin : g_commit_data_with_pid
|
||||
logic [`NUM_THREADS-1:0] commit_tmask_r;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_r;
|
||||
if (PID_BITS != 0) begin
|
||||
always @(*) begin
|
||||
commit_tmask_w = '0;
|
||||
commit_data_w = 'x;
|
||||
commit_tmask_r = '0;
|
||||
commit_data_r = 'x;
|
||||
for (integer j = 0; j < NUM_LANES; ++j) begin
|
||||
commit_tmask_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
|
||||
commit_data_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
|
||||
commit_tmask_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
|
||||
commit_data_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
|
||||
end
|
||||
end
|
||||
end else begin : g_commit_data_no_pid
|
||||
assign commit_tmask_w = commit_tmp_if.data.tmask;
|
||||
assign commit_data_w = commit_tmp_if.data.data;
|
||||
end else begin
|
||||
assign commit_tmask_r = commit_tmp_if.data.tmask;
|
||||
assign commit_data_r = commit_tmp_if.data.data;
|
||||
end
|
||||
|
||||
assign commit_out_if[i].valid = commit_tmp_if.valid;
|
||||
assign commit_out_if[i].data = {
|
||||
commit_tmp_if.data.uuid,
|
||||
commit_tmp_if.data.wid,
|
||||
commit_tmask_w,
|
||||
commit_tmask_r,
|
||||
commit_tmp_if.data.PC,
|
||||
commit_tmp_if.data.wb,
|
||||
commit_tmp_if.data.rd,
|
||||
commit_data_w,
|
||||
commit_data_r,
|
||||
1'b0, // PID
|
||||
commit_tmp_if.data.sop,
|
||||
commit_tmp_if.data.eop
|
||||
|
|
|
@ -35,11 +35,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
|
||||
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
|
||||
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_instr_bufs
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`IBUF_SIZE),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (2) // 2-cycle EB for area reduction
|
||||
) instr_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
module VX_ipdom_stack #(
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1,
|
||||
parameter OUT_REG = 0,
|
||||
parameter ADDRW = `LOG2UP(DEPTH)
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -30,63 +31,76 @@ module VX_ipdom_stack #(
|
|||
output wire empty,
|
||||
output wire full
|
||||
);
|
||||
reg [ADDRW-1:0] rd_ptr, rd_ptr_n, wr_ptr;
|
||||
reg slot_set [DEPTH-1:0];
|
||||
|
||||
reg [ADDRW-1:0] rd_ptr, wr_ptr;
|
||||
|
||||
reg empty_r, full_r;
|
||||
|
||||
wire [WIDTH-1:0] d0, d1;
|
||||
|
||||
wire d_set_r;
|
||||
|
||||
always @(*) begin
|
||||
rd_ptr_n = rd_ptr;
|
||||
if (push) begin
|
||||
rd_ptr_n = wr_ptr;
|
||||
end else if (pop) begin
|
||||
rd_ptr_n = rd_ptr - ADDRW'(d_set_r);
|
||||
end
|
||||
end
|
||||
wire d_set_n = slot_set[rd_ptr];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
rd_ptr <= '0;
|
||||
wr_ptr <= '0;
|
||||
empty_r <= 1;
|
||||
full_r <= 0;
|
||||
rd_ptr <= '0;
|
||||
end else begin
|
||||
`ASSERT(~push || ~full, ("%t: runtime error: writing to a full stack!", $time));
|
||||
`ASSERT(~pop || ~empty, ("%t: runtime error: reading an empty stack!", $time));
|
||||
`ASSERT(~push || ~pop, ("%t: runtime error: push and pop in same cycle not supported!", $time));
|
||||
`ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
|
||||
`ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
|
||||
`ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!"));
|
||||
if (push) begin
|
||||
rd_ptr <= wr_ptr;
|
||||
wr_ptr <= wr_ptr + ADDRW'(1);
|
||||
empty_r <= 0;
|
||||
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
|
||||
end else if (pop) begin
|
||||
wr_ptr <= wr_ptr - ADDRW'(d_set_r);
|
||||
empty_r <= (rd_ptr == 0) && d_set_r;
|
||||
wr_ptr <= wr_ptr - ADDRW'(d_set_n);
|
||||
rd_ptr <= rd_ptr - ADDRW'(d_set_n);
|
||||
empty_r <= (rd_ptr == 0) && (d_set_n == 1);
|
||||
full_r <= 0;
|
||||
end
|
||||
rd_ptr <= rd_ptr_n;
|
||||
end
|
||||
end
|
||||
|
||||
wire [WIDTH * 2:0] qout = push ? {1'b0, q1, q0} : {1'b1, d1, d0};
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (1 + WIDTH * 2),
|
||||
.SIZE (DEPTH),
|
||||
.OUT_REG (1),
|
||||
.RDW_MODE ("R")
|
||||
) ipdom_store (
|
||||
.DATAW (WIDTH * 2),
|
||||
.SIZE (DEPTH),
|
||||
.OUT_REG (OUT_REG ? 1 : 0),
|
||||
.LUTRAM (OUT_REG ? 0 : 1)
|
||||
) store (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (1'b1),
|
||||
.write (push || pop),
|
||||
.write (push),
|
||||
.wren (1'b1),
|
||||
.waddr (push ? wr_ptr : rd_ptr),
|
||||
.wdata (qout),
|
||||
.raddr (rd_ptr_n),
|
||||
.rdata ({d_set_r, d1, d0})
|
||||
.waddr (wr_ptr),
|
||||
.wdata ({q1, q0}),
|
||||
.raddr (rd_ptr),
|
||||
.rdata ({d1, d0})
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (push) begin
|
||||
slot_set[wr_ptr] <= 0;
|
||||
end else if (pop) begin
|
||||
slot_set[rd_ptr] <= 1;
|
||||
end
|
||||
end
|
||||
|
||||
wire d_set_r;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1),
|
||||
.DEPTH (OUT_REG)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in (d_set_n),
|
||||
.data_out (d_set_r)
|
||||
);
|
||||
|
||||
assign d = d_set_r ? d0 : d1;
|
||||
|
|
|
@ -29,17 +29,16 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
|
||||
);
|
||||
`STATIC_ASSERT ((`ISSUE_WIDTH <= `NUM_WARPS), ("invalid parameter"))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_issue_perf_units_uses
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_issue_perf_sfu_uses
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
`endif
|
||||
|
@ -50,9 +49,9 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
|
||||
assign decode_if.ready = decode_ready_in[decode_isw];
|
||||
|
||||
`SCOPE_IO_SWITCH (`ISSUE_WIDTH);
|
||||
`SCOPE_IO_SWITCH (`ISSUE_WIDTH)
|
||||
|
||||
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : g_slices
|
||||
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : issue_slices
|
||||
VX_decode_if #(
|
||||
.NUM_WARPS (PER_ISSUE_WARPS)
|
||||
) per_issue_decode_if();
|
||||
|
@ -77,13 +76,15 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop;
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (slice_reset, reset);
|
||||
|
||||
VX_issue_slice #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, issue_id))),
|
||||
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
|
||||
.ISSUE_ID (issue_id)
|
||||
) issue_slice (
|
||||
`SCOPE_IO_BIND(issue_id)
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (slice_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.issue_perf (per_issue_perf[issue_id]),
|
||||
`endif
|
||||
|
@ -93,7 +94,7 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
// Assign transposed dispatch_if
|
||||
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin : g_dispatch_if
|
||||
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin
|
||||
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
|
||||
end
|
||||
end
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_issue_slice import VX_gpu_pkg::*; #(
|
||||
module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter ISSUE_ID = 0
|
||||
) (
|
||||
|
@ -36,11 +36,16 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
VX_scoreboard_if scoreboard_if();
|
||||
VX_operands_if operands_if();
|
||||
|
||||
`RESET_RELAY (ibuf_reset, reset);
|
||||
`RESET_RELAY (scoreboard_reset, reset);
|
||||
`RESET_RELAY (operands_reset, reset);
|
||||
`RESET_RELAY (dispatch_reset, reset);
|
||||
|
||||
VX_ibuffer #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-ibuffer", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
|
||||
) ibuffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (ibuf_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (issue_perf.ibf_stalls),
|
||||
`endif
|
||||
|
@ -49,10 +54,10 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_scoreboard #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-scoreboard", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
|
||||
) scoreboard (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (scoreboard_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (issue_perf.scb_stalls),
|
||||
.perf_units_uses(issue_perf.units_uses),
|
||||
|
@ -64,10 +69,10 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_operands #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-operands", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
|
||||
) operands (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (operands_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (issue_perf.opd_stalls),
|
||||
`endif
|
||||
|
@ -77,10 +82,10 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_dispatch #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-dispatch", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
|
||||
) dispatch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (dispatch_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
`UNUSED_PIN (perf_stalls),
|
||||
`endif
|
||||
|
@ -88,90 +93,65 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
.dispatch_if (dispatch_if)
|
||||
);
|
||||
|
||||
`ifdef SCOPE
|
||||
`ifdef DBG_SCOPE_ISSUE
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
wire decode_fire = decode_if.valid && decode_if.ready;
|
||||
wire operands_fire = operands_if.valid && operands_if.ready;
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 2, 4, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +
|
||||
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS + (3 * `XLEN) +
|
||||
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * `XLEN) + 1
|
||||
), {
|
||||
decode_if.valid,
|
||||
decode_if.ready,
|
||||
operands_if.valid,
|
||||
operands_if.ready
|
||||
}, {
|
||||
decode_fire,
|
||||
operands_fire,
|
||||
writeback_if.valid // ack-free
|
||||
}, {
|
||||
decode_if.data.uuid,
|
||||
decode_if.data.wid,
|
||||
decode_if.data.tmask,
|
||||
decode_if.data.PC,
|
||||
decode_if.data.ex_type,
|
||||
decode_if.data.op_type,
|
||||
decode_if.data.wb,
|
||||
decode_if.data.rd,
|
||||
decode_if.data.rs1,
|
||||
decode_if.data.rs2,
|
||||
decode_if.data.rs3,
|
||||
wire operands_if_fire = operands_if.valid && operands_if.ready;
|
||||
wire operands_if_not_ready = ~operands_if.ready;
|
||||
wire writeback_if_valid = writeback_if.valid;
|
||||
VX_scope_tap #(
|
||||
.SCOPE_ID (2),
|
||||
.TRIGGERW (4),
|
||||
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
|
||||
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
|
||||
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
|
||||
) scope_tap (
|
||||
.clk (clk),
|
||||
.reset (scope_reset),
|
||||
.start (1'b0),
|
||||
.stop (1'b0),
|
||||
.triggers ({
|
||||
reset,
|
||||
operands_if_fire,
|
||||
operands_if_not_ready,
|
||||
writeback_if_valid
|
||||
}),
|
||||
.probes ({
|
||||
operands_if.data.uuid,
|
||||
operands_if.data.wis,
|
||||
operands_if.data.tmask,
|
||||
operands_if.data.PC,
|
||||
operands_if.data.ex_type,
|
||||
operands_if.data.op_type,
|
||||
operands_if.data.wb,
|
||||
operands_if.data.rd,
|
||||
operands_if.data.rs1_data[0],
|
||||
operands_if.data.rs2_data[0],
|
||||
operands_if.data.rs3_data[0],
|
||||
operands_if.data.rs1_data,
|
||||
operands_if.data.rs2_data,
|
||||
operands_if.data.rs3_data,
|
||||
writeback_if.data.uuid,
|
||||
writeback_if.data.wis,
|
||||
writeback_if.data.tmask,
|
||||
writeback_if.data.rd,
|
||||
writeback_if.data.data,
|
||||
writeback_if.data.eop
|
||||
},
|
||||
reset_negedge, 1'b0, 4096
|
||||
}),
|
||||
.bus_in (scope_bus_in),
|
||||
.bus_out (scope_bus_out)
|
||||
);
|
||||
`else
|
||||
`SCOPE_IO_UNUSED(0)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef CHIPSCOPE
|
||||
`ifdef DBG_SCOPE_ISSUE
|
||||
ila_issue ila_issue_inst (
|
||||
.clk (clk),
|
||||
.probe0 ({decode_if.valid, decode_if.data, decode_if.ready}),
|
||||
.probe1 ({scoreboard_if.valid, scoreboard_if.data, scoreboard_if.ready}),
|
||||
.probe2 ({operands_if.valid, operands_if.data, operands_if.ready}),
|
||||
.probe3 ({writeback_if.valid, writeback_if.data})
|
||||
);
|
||||
`endif
|
||||
`SCOPE_IO_UNUSED()
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (operands_if.valid && operands_if.ready) begin
|
||||
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}))
|
||||
`TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}));
|
||||
trace_ex_type(1, operands_if.data.ex_type);
|
||||
`TRACE(1, (", op="))
|
||||
`TRACE(1, (", op="));
|
||||
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
|
||||
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd))
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS)
|
||||
`TRACE(1, (", rs2_data="))
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS)
|
||||
`TRACE(1, (", rs3_data="))
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS)
|
||||
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS);
|
||||
`TRACE(1, (", rs2_data="));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS);
|
||||
`TRACE(1, (", rs3_data="));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS);
|
||||
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
|
||||
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid))
|
||||
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -80,7 +80,7 @@ module VX_issue_top import VX_gpu_pkg::*; #(
|
|||
assign decode_if.data.rs3 = decode_rs3;
|
||||
assign decode_ready = decode_if.ready;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback_if
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
|
||||
assign writeback_if[i].valid = writeback_valid[i];
|
||||
assign writeback_if[i].data.uuid = writeback_uuid[i];
|
||||
assign writeback_if[i].data.wis = writeback_wis[i];
|
||||
|
@ -92,7 +92,7 @@ module VX_issue_top import VX_gpu_pkg::*; #(
|
|||
assign writeback_if[i].data.eop = writeback_eop[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin : g_dispatch_if
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin
|
||||
assign dispatch_valid[i] = dispatch_if[i].valid;
|
||||
assign dispatch_uuid[i] = dispatch_if[i].data.uuid;
|
||||
assign dispatch_wis[i] = dispatch_if[i].data.wis;
|
||||
|
@ -113,13 +113,6 @@ module VX_issue_top import VX_gpu_pkg::*; #(
|
|||
issue_perf_t issue_perf = '0;
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
wire [0:0] scope_reset_w = 1'b0;
|
||||
wire [0:0] scope_bus_in_w = 1'b0;
|
||||
wire [0:0] scope_bus_out_w;
|
||||
`UNUSED_VAR (scope_bus_out_w)
|
||||
`endif
|
||||
|
||||
VX_issue #(
|
||||
.INSTANCE_ID (INSTANCE_ID)
|
||||
) issue (
|
||||
|
|
201
hw/rtl/core/VX_lmem_unit.sv
Normal file
201
hw/rtl/core/VX_lmem_unit.sv
Normal file
|
@ -0,0 +1,201 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Local-memory unit.
//
// For every LSU block this module:
//   1. splits an incoming LSU request into a "global" part (lanes whose
//      atype[`ADDR_TYPE_LOCAL] bit is clear) and a "local" part (lanes whose
//      bit is set), by masking the lane mask accordingly;
//   2. forwards the global part to lsu_mem_out_if[i] and the local part to an
//      internal interface (lsu_switch_if[i]);
//   3. arbitrates the two response streams back onto lsu_mem_in_if[i].
// The local parts of all blocks are unpacked per lane by VX_lsu_adapter and
// served by a single VX_local_mem instance.
//
// Ports:
//   clk, reset      - clock and reset (reset is relayed per block / per memory)
//   cache_perf      - local memory perf counters (only with PERF_ENABLE)
//   lsu_mem_in_if   - per-block LSU request/response interface (slave side)
//   lsu_mem_out_if  - per-block interface carrying the non-local lanes (master side)
module VX_lmem_unit import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = ""
) (
    input wire clk,
    input wire reset,

`ifdef PERF_ENABLE
    output cache_perf_t cache_perf,
`endif

    VX_lsu_mem_if.slave  lsu_mem_in_if [`NUM_LSU_BLOCKS],
    VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
);
    `STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
    `STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))

    // Packed request width: mask + rw + per-lane {byteen, addr, atype, data} + tag.
    localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `ADDR_TYPE_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
    // Packed response width handed to the response arbiter.
    localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
    // Word-granular address width of the local memory.
    localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);

    // Local-memory side of the request splitter, one interface per LSU block.
    VX_lsu_mem_if #(
        .NUM_LANES (`NUM_LSU_LANES),
        .DATA_SIZE (LSU_WORD_SIZE),
        .TAG_WIDTH (LSU_TAG_WIDTH)
    ) lsu_switch_if[`NUM_LSU_BLOCKS]();

    `RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1);

    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_switch

        // Per-lane flag: lane targets local memory.
        wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
        for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_is_addr_local_mask
            assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
        end

        // At least one active lane goes to global / local memory.
        // Both can be set for the same request (mixed request).
        wire is_addr_global = | (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask);
        wire is_addr_local  = | (lsu_mem_in_if[i].req_data.mask & is_addr_local_mask);

        wire req_global_ready;
        wire req_local_ready;

        // A path "can go" when it either has nothing to take from this request
        // or its buffer is ready. A mixed request must be accepted by both
        // buffers in the same cycle; otherwise the half sent to the not-ready
        // buffer would be dropped while the upstream sees the request fire.
        // NOTE(review): this gates each valid_in with the other path's ready;
        // it assumes the ready outputs of the buffers / adapter do not
        // combinationally depend on their valid inputs - confirm.
        wire req_global_go = req_global_ready || ~is_addr_global;
        wire req_local_go  = req_local_ready  || ~is_addr_local;

        // Global path: only the non-local lanes stay enabled in the mask.
        VX_elastic_buffer #(
            .DATAW   (REQ_DATAW),
            .SIZE    (2),
            .OUT_REG (1)
        ) req_global_buf (
            .clk       (clk),
            .reset     (block_reset[i]),
            .valid_in  (lsu_mem_in_if[i].req_valid && is_addr_global && req_local_go),
            .data_in   ({
                lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
                lsu_mem_in_if[i].req_data.rw,
                lsu_mem_in_if[i].req_data.byteen,
                lsu_mem_in_if[i].req_data.addr,
                lsu_mem_in_if[i].req_data.atype,
                lsu_mem_in_if[i].req_data.data,
                lsu_mem_in_if[i].req_data.tag
            }),
            .ready_in  (req_global_ready),
            .valid_out (lsu_mem_out_if[i].req_valid),
            .data_out  ({
                lsu_mem_out_if[i].req_data.mask,
                lsu_mem_out_if[i].req_data.rw,
                lsu_mem_out_if[i].req_data.byteen,
                lsu_mem_out_if[i].req_data.addr,
                lsu_mem_out_if[i].req_data.atype,
                lsu_mem_out_if[i].req_data.data,
                lsu_mem_out_if[i].req_data.tag
            }),
            .ready_out (lsu_mem_out_if[i].req_ready)
        );

        // Local path: only the local lanes stay enabled in the mask.
        VX_elastic_buffer #(
            .DATAW   (REQ_DATAW),
            .SIZE    (0),
            .OUT_REG (0)
        ) req_local_buf (
            .clk       (clk),
            .reset     (block_reset[i]),
            .valid_in  (lsu_mem_in_if[i].req_valid && is_addr_local && req_global_go),
            .data_in   ({
                lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
                lsu_mem_in_if[i].req_data.rw,
                lsu_mem_in_if[i].req_data.byteen,
                lsu_mem_in_if[i].req_data.addr,
                lsu_mem_in_if[i].req_data.atype,
                lsu_mem_in_if[i].req_data.data,
                lsu_mem_in_if[i].req_data.tag
            }),
            .ready_in  (req_local_ready),
            .valid_out (lsu_switch_if[i].req_valid),
            .data_out  ({
                lsu_switch_if[i].req_data.mask,
                lsu_switch_if[i].req_data.rw,
                lsu_switch_if[i].req_data.byteen,
                lsu_switch_if[i].req_data.addr,
                lsu_switch_if[i].req_data.atype,
                lsu_switch_if[i].req_data.data,
                lsu_switch_if[i].req_data.tag
            }),
            .ready_out (lsu_switch_if[i].req_ready)
        );

        // Ready only when every path the request needs can take it this cycle.
        // The (global || local) term keeps ready low for a request with no
        // active lanes, as before.
        assign lsu_mem_in_if[i].req_ready = (is_addr_global || is_addr_local)
                                         && req_global_go
                                         && req_local_go;

        // Merge local and global responses back onto the input interface.
        VX_stream_arb #(
            .NUM_INPUTS (2),
            .DATAW      (RSP_DATAW),
            .ARBITER    ("R"),
            .OUT_BUF    (1)
        ) rsp_arb (
            .clk       (clk),
            .reset     (block_reset[i]),
            .valid_in  ({
                lsu_switch_if[i].rsp_valid,
                lsu_mem_out_if[i].rsp_valid
            }),
            .ready_in  ({
                lsu_switch_if[i].rsp_ready,
                lsu_mem_out_if[i].rsp_ready
            }),
            .data_in   ({
                lsu_switch_if[i].rsp_data,
                lsu_mem_out_if[i].rsp_data
            }),
            .data_out  (lsu_mem_in_if[i].rsp_data),
            .valid_out (lsu_mem_in_if[i].rsp_valid),
            .ready_out (lsu_mem_in_if[i].rsp_ready),
            `UNUSED_PIN (sel_out)
        );
    end

    // Flat array of per-lane buses into the local memory (all blocks, all lanes).
    VX_mem_bus_if #(
        .DATA_SIZE (LSU_WORD_SIZE),
        .TAG_WIDTH (LSU_TAG_WIDTH)
    ) lmem_bus_if[LSU_NUM_REQS]();

    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_adapter
        VX_mem_bus_if #(
            .DATA_SIZE (LSU_WORD_SIZE),
            .TAG_WIDTH (LSU_TAG_WIDTH)
        ) lmem_bus_tmp_if[`NUM_LSU_LANES]();

        // Convert the block's multi-lane LSU interface into per-lane buses.
        VX_lsu_adapter #(
            .NUM_LANES    (`NUM_LSU_LANES),
            .DATA_SIZE    (LSU_WORD_SIZE),
            .TAG_WIDTH    (LSU_TAG_WIDTH),
            .TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
            .ARBITER      ("P"),
            .REQ_OUT_BUF  (3),
            .RSP_OUT_BUF  (0)
        ) lsu_adapter (
            .clk        (clk),
            .reset      (block_reset[i]),
            .lsu_mem_if (lsu_switch_if[i]),
            .mem_bus_if (lmem_bus_tmp_if)
        );

        // Lane j of block i maps to flat request index i * NUM_LSU_LANES + j.
        for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_lmem_bus_if
            `ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
        end
    end

    `RESET_RELAY (lmem_reset, reset);

    VX_local_mem #(
        .INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)),
        .SIZE       (1 << `LMEM_LOG_SIZE),
        .NUM_REQS   (LSU_NUM_REQS),
        .NUM_BANKS  (`LMEM_NUM_BANKS),
        .WORD_SIZE  (LSU_WORD_SIZE),
        .ADDR_WIDTH (LMEM_ADDR_WIDTH),
        .UUID_WIDTH (`UUID_WIDTH),
        .TAG_WIDTH  (LSU_TAG_WIDTH),
        .OUT_BUF    (3)
    ) local_mem (
        .clk        (clk),
        .reset      (lmem_reset),
    `ifdef PERF_ENABLE
        .cache_perf (cache_perf),
    `endif
        .mem_bus_if (lmem_bus_if)
    );

endmodule
|
|
@ -29,7 +29,7 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if.master mem_bus_if [NUM_LANES]
|
||||
);
|
||||
localparam REQ_ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE);
|
||||
localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + DATA_SIZE * 8;
|
||||
localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `ADDR_TYPE_WIDTH + DATA_SIZE * 8;
|
||||
localparam RSP_DATA_WIDTH = DATA_SIZE * 8;
|
||||
|
||||
// handle request unpacking
|
||||
|
@ -41,16 +41,29 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0][TAG_WIDTH-1:0] req_tag_out;
|
||||
wire [NUM_LANES-1:0] req_ready_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_req_data_in
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign req_data_in[i] = {
|
||||
lsu_mem_if.req_data.rw,
|
||||
lsu_mem_if.req_data.addr[i],
|
||||
lsu_mem_if.req_data.data[i],
|
||||
lsu_mem_if.req_data.byteen[i],
|
||||
lsu_mem_if.req_data.flags[i]
|
||||
lsu_mem_if.req_data.addr[i],
|
||||
lsu_mem_if.req_data.atype[i],
|
||||
lsu_mem_if.req_data.data[i]
|
||||
};
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign mem_bus_if[i].req_valid = req_valid_out[i];
|
||||
assign {
|
||||
mem_bus_if[i].req_data.rw,
|
||||
mem_bus_if[i].req_data.byteen,
|
||||
mem_bus_if[i].req_data.addr,
|
||||
mem_bus_if[i].req_data.atype,
|
||||
mem_bus_if[i].req_data.data
|
||||
} = req_data_out[i];
|
||||
assign mem_bus_if[i].req_data.tag = req_tag_out[i];
|
||||
assign req_ready_out[i] = mem_bus_if[i].req_ready;
|
||||
end
|
||||
|
||||
VX_stream_unpack #(
|
||||
.NUM_REQS (NUM_LANES),
|
||||
.DATA_WIDTH (REQ_DATA_WIDTH),
|
||||
|
@ -70,19 +83,6 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
|
|||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_req
|
||||
assign mem_bus_if[i].req_valid = req_valid_out[i];
|
||||
assign {
|
||||
mem_bus_if[i].req_data.rw,
|
||||
mem_bus_if[i].req_data.addr,
|
||||
mem_bus_if[i].req_data.data,
|
||||
mem_bus_if[i].req_data.byteen,
|
||||
mem_bus_if[i].req_data.flags
|
||||
} = req_data_out[i];
|
||||
assign mem_bus_if[i].req_data.tag = req_tag_out[i];
|
||||
assign req_ready_out[i] = mem_bus_if[i].req_ready;
|
||||
end
|
||||
|
||||
// handle response packing
|
||||
|
||||
wire [NUM_LANES-1:0] rsp_valid_out;
|
||||
|
@ -90,10 +90,10 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0][TAG_WIDTH-1:0] rsp_tag_out;
|
||||
wire [NUM_LANES-1:0] rsp_ready_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_rsp
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign rsp_valid_out[i] = mem_bus_if[i].rsp_valid;
|
||||
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
|
||||
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
|
||||
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
|
||||
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
|
||||
assign mem_bus_if[i].rsp_ready = rsp_ready_out[i];
|
||||
end
|
||||
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_lsu_slice import VX_gpu_pkg::*; #(
|
||||
module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
@ -59,25 +59,25 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire req_is_fence, rsp_is_fence;
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_full_addr
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset);
|
||||
end
|
||||
|
||||
// address type calculation
|
||||
|
||||
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_flags
|
||||
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
|
||||
// is I/O address
|
||||
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
|
||||
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
|
||||
assign mem_req_flags[i][`MEM_REQ_FLAG_FLUSH] = req_is_fence;
|
||||
assign mem_req_flags[i][`MEM_REQ_FLAG_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
|
||||
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
|
||||
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
|
||||
`ifdef LMEM_ENABLE
|
||||
// is local memory address
|
||||
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
|
||||
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
|
||||
assign mem_req_flags[i][`MEM_REQ_FLAG_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
|
||||
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
|
||||
`endif
|
||||
end
|
||||
|
||||
|
@ -102,6 +102,8 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
wire mem_req_fire = mem_req_valid && mem_req_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
`UNUSED_VAR (mem_req_fire)
|
||||
`UNUSED_VAR (mem_rsp_fire)
|
||||
|
||||
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
|
||||
wire no_rsp_buf_valid, no_rsp_buf_ready;
|
||||
|
@ -149,49 +151,49 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_addr
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
|
||||
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
|
||||
end
|
||||
|
||||
// byte enable formatting
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_byteen_w
|
||||
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_w;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_r;
|
||||
always @(*) begin
|
||||
mem_req_byteen_w = '0;
|
||||
mem_req_byteen_r = '0;
|
||||
case (`INST_LSU_WSIZE(execute_if.data.op_type))
|
||||
0: begin // 8-bit
|
||||
mem_req_byteen_w[req_align[i]] = 1'b1;
|
||||
mem_req_byteen_r[req_align[i]] = 1'b1;
|
||||
end
|
||||
1: begin // 16 bit
|
||||
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
|
||||
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
|
||||
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
|
||||
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
|
||||
end
|
||||
`ifdef XLEN_64
|
||||
2: begin // 32 bit
|
||||
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
|
||||
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
|
||||
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
|
||||
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
|
||||
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
|
||||
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
|
||||
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
|
||||
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
|
||||
end
|
||||
`endif
|
||||
// 3: 64 bit
|
||||
default : mem_req_byteen_w = {LSU_WORD_SIZE{1'b1}};
|
||||
default : mem_req_byteen_r = {LSU_WORD_SIZE{1'b1}};
|
||||
endcase
|
||||
end
|
||||
assign mem_req_byteen[i] = mem_req_byteen_w;
|
||||
assign mem_req_byteen[i] = mem_req_byteen_r;
|
||||
end
|
||||
|
||||
// memory misalignment not supported!
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_missalign
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire lsu_req_fire = execute_if.valid && execute_if.ready;
|
||||
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
|
||||
("%t: misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
|
||||
$time, execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid))
|
||||
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
|
||||
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
|
||||
end
|
||||
|
||||
// store data formatting
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_data
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
mem_req_data[i] = execute_if.data.rs2_data[i];
|
||||
case (req_align[i])
|
||||
|
@ -213,7 +215,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
|
||||
|
||||
if (PID_BITS != 0) begin : g_pids
|
||||
if (PID_BITS != 0) begin
|
||||
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
|
||||
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
|
||||
|
||||
|
@ -269,10 +271,10 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
|
||||
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
|
||||
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("%t: allocator full!", $time))
|
||||
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("%t: oops! broken sop request!", $time))
|
||||
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
|
||||
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
|
||||
`UNUSED_VAR (mem_rsp_sop)
|
||||
end else begin : g_no_pids
|
||||
end else begin
|
||||
assign pkt_waddr = 0;
|
||||
assign mem_rsp_sop_pkt = mem_rsp_sop;
|
||||
assign mem_rsp_eop_pkt = mem_rsp_eop;
|
||||
|
@ -298,7 +300,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0] lsu_mem_req_mask;
|
||||
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
|
||||
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
|
||||
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_mem_req_flags;
|
||||
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
|
||||
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
|
||||
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
|
||||
wire lsu_mem_req_ready;
|
||||
|
@ -309,14 +311,16 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
|
||||
wire lsu_mem_rsp_ready;
|
||||
|
||||
`RESET_RELAY (mem_scheduler_reset, reset);
|
||||
|
||||
VX_mem_scheduler #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-memsched", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)),
|
||||
.CORE_REQS (NUM_LANES),
|
||||
.MEM_CHANNELS(NUM_LANES),
|
||||
.WORD_SIZE (LSU_WORD_SIZE),
|
||||
.LINE_SIZE (LSU_WORD_SIZE),
|
||||
.ADDR_WIDTH (LSU_ADDR_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
|
||||
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
|
||||
|
@ -326,7 +330,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.CORE_OUT_BUF(0)
|
||||
) mem_scheduler (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (mem_scheduler_reset),
|
||||
|
||||
// Input request
|
||||
.core_req_valid (mem_req_valid),
|
||||
|
@ -334,12 +338,12 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.core_req_mask (mem_req_mask),
|
||||
.core_req_byteen(mem_req_byteen),
|
||||
.core_req_addr (mem_req_addr),
|
||||
.core_req_flags (mem_req_flags),
|
||||
.core_req_atype (mem_req_atype),
|
||||
.core_req_data (mem_req_data),
|
||||
.core_req_tag (mem_req_tag),
|
||||
.core_req_ready (mem_req_ready),
|
||||
`UNUSED_PIN (core_req_empty),
|
||||
`UNUSED_PIN (core_req_wr_notify),
|
||||
`UNUSED_PIN (core_req_sent),
|
||||
|
||||
// Output response
|
||||
.core_rsp_valid (mem_rsp_valid),
|
||||
|
@ -356,7 +360,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.mem_req_mask (lsu_mem_req_mask),
|
||||
.mem_req_byteen (lsu_mem_req_byteen),
|
||||
.mem_req_addr (lsu_mem_req_addr),
|
||||
.mem_req_flags (lsu_mem_req_flags),
|
||||
.mem_req_atype (lsu_mem_req_atype),
|
||||
.mem_req_data (lsu_mem_req_data),
|
||||
.mem_req_tag (lsu_mem_req_tag),
|
||||
.mem_req_ready (lsu_mem_req_ready),
|
||||
|
@ -374,7 +378,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
|
||||
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
|
||||
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
|
||||
assign lsu_mem_if.req_data.flags = lsu_mem_req_flags;
|
||||
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
|
||||
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
|
||||
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
|
||||
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
|
||||
|
@ -422,7 +426,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; i++) begin : g_rsp_data
|
||||
for (genvar i = 0; i < NUM_LANES; i++) begin
|
||||
`ifdef XLEN_64
|
||||
wire [63:0] rsp_data64 = mem_rsp_data[i];
|
||||
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
|
||||
|
@ -479,7 +483,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.valid_out (commit_no_rsp_if.valid),
|
||||
.ready_out (commit_no_rsp_if.ready)
|
||||
);
|
||||
|
||||
assign commit_no_rsp_if.data.rd = '0;
|
||||
assign commit_no_rsp_if.data.wb = 1'b0;
|
||||
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
|
||||
|
@ -504,74 +507,51 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
`ifdef DBG_TRACE_MEM
|
||||
always @(posedge clk) begin
|
||||
if (execute_if.valid && fence_lock) begin
|
||||
`TRACE(2, ("%t: *** %s fence wait\n", $time, INSTANCE_ID))
|
||||
`TRACE(1, ("%d: *** %s fence wait\n", $time, INSTANCE_ID));
|
||||
end
|
||||
if (mem_req_fire) begin
|
||||
if (mem_req_rw) begin
|
||||
`TRACE(2, ("%t: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
|
||||
`TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES)
|
||||
`TRACE(2, (", flags="))
|
||||
`TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES)
|
||||
`TRACE(2, (", byteen=0x%0h, data=", mem_req_byteen))
|
||||
`TRACE_ARRAY1D(2, "0x%0h", mem_req_data, NUM_LANES)
|
||||
`TRACE(2, (", sop=%b, eop=%b, tag=0x%0h (#%0d)\n", execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
|
||||
`TRACE(1, ("%d: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
|
||||
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
|
||||
`TRACE(1, (", atype="));
|
||||
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
|
||||
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
|
||||
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
|
||||
end else begin
|
||||
`TRACE(2, ("%t: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
|
||||
`TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES)
|
||||
`TRACE(2, (", flags="))
|
||||
`TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES)
|
||||
`TRACE(2, (", byteen=0x%0h, rd=%0d, sop=%b, eop=%b, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
|
||||
`TRACE(1, ("%d: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
|
||||
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
|
||||
`TRACE(1, (", atype="));
|
||||
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
|
||||
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid));
|
||||
end
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(2, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
|
||||
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop))
|
||||
`TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data, NUM_LANES)
|
||||
`TRACE(2, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid))
|
||||
`TRACE(1, ("%d: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
|
||||
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
|
||||
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
|
||||
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef SCOPE
|
||||
`ifdef DBG_SCOPE_LSU
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 3, 4, 2, (
|
||||
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH
|
||||
), {
|
||||
mem_req_valid,
|
||||
mem_req_ready,
|
||||
mem_rsp_valid,
|
||||
mem_rsp_ready
|
||||
}, {
|
||||
mem_req_fire,
|
||||
mem_rsp_fire
|
||||
}, {
|
||||
mem_req_rw,
|
||||
full_addr,
|
||||
mem_req_byteen,
|
||||
mem_req_data,
|
||||
execute_if.data.uuid,
|
||||
rsp_data,
|
||||
rsp_uuid
|
||||
},
|
||||
reset_negedge, 1'b0, 4096
|
||||
VX_scope_tap #(
|
||||
.SCOPE_ID (3),
|
||||
.TRIGGERW (3),
|
||||
.PROBEW (1 + NUM_LANES*(`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE*8) + `UUID_WIDTH + NUM_LANES*LSU_WORD_SIZE*8 + `UUID_WIDTH)
|
||||
) scope_tap (
|
||||
.clk (clk),
|
||||
.reset (scope_reset),
|
||||
.start (1'b0),
|
||||
.stop (1'b0),
|
||||
.triggers({reset, mem_req_fire, mem_rsp_fire}),
|
||||
.probes ({mem_req_rw, full_addr, mem_req_byteen, mem_req_data, execute_if.data.uuid, rsp_data, rsp_uuid}),
|
||||
.bus_in (scope_bus_in),
|
||||
.bus_out(scope_bus_out)
|
||||
);
|
||||
`else
|
||||
`SCOPE_IO_UNUSED(0)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef CHIPSCOPE
|
||||
`ifdef DBG_SCOPE_LSU
|
||||
ila_lsu ila_lsu_inst (
|
||||
.clk (clk),
|
||||
.probe0 ({execute_if.valid, execute_if.data, execute_if.ready}),
|
||||
.probe1 ({lsu_mem_if.req_valid, lsu_mem_if.req_data, lsu_mem_if.req_ready}),
|
||||
.probe2 ({lsu_mem_if.rsp_valid, lsu_mem_if.rsp_data, lsu_mem_if.rsp_ready})
|
||||
);
|
||||
`endif
|
||||
`SCOPE_IO_UNUSED()
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -31,7 +31,9 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
|
||||
localparam NUM_LANES = `NUM_LSU_LANES;
|
||||
|
||||
`ifdef SCOPE
|
||||
`SCOPE_IO_SWITCH (BLOCK_SIZE);
|
||||
`endif
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
|
@ -40,7 +42,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
VX_dispatch_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_BUF (3)
|
||||
.OUT_BUF (1)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -52,13 +54,16 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_commit_if[BLOCK_SIZE]();
|
||||
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_slices
|
||||
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : lsu_slices
|
||||
|
||||
`RESET_RELAY (slice_reset, reset);
|
||||
|
||||
VX_lsu_slice #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, block_idx)))
|
||||
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx))
|
||||
) lsu_slice(
|
||||
`SCOPE_IO_BIND (block_idx)
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (slice_reset),
|
||||
.execute_if (per_block_execute_if[block_idx]),
|
||||
.commit_if (per_block_commit_if[block_idx]),
|
||||
.lsu_mem_if (lsu_mem_if[block_idx])
|
||||
|
|
|
@ -1,260 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// VX_mem_unit
//
// Connects the per-block LSU memory interfaces to the data-cache request ports.
// Stages, in order of instantiation below:
//   1. (LMEM_ENABLE only) an lmem switch per LSU block that hands each request
//      to either the dcache-bound interface or the local-memory interface,
//      followed by an arbiter, a lane adapter and the local memory itself.
//   2. An optional memory coalescer per LSU block (instantiated only when there
//      is more than one LSU lane and the LSU and dcache word sizes differ).
//   3. A lane adapter per LSU block that fans the coalesced interface out onto
//      DCACHE_CHANNELS entries of dcache_bus_if.
module VX_mem_unit import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = ""
) (
    input wire clk,
    input wire reset,

`ifdef PERF_ENABLE
    output lmem_perf_t      lmem_perf,
    output coalescer_perf_t coalescer_perf,
`endif

    VX_lsu_mem_if.slave  lsu_mem_if [`NUM_LSU_BLOCKS],
    VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS]
);

    // ------------------------------------------------------------------
    // LSU traffic headed for the data cache (one interface per LSU block)
    // ------------------------------------------------------------------
    VX_lsu_mem_if #(
        .TAG_WIDTH (LSU_TAG_WIDTH),
        .DATA_SIZE (LSU_WORD_SIZE),
        .NUM_LANES (`NUM_LSU_LANES)
    ) lsu_dcache_if[`NUM_LSU_BLOCKS]();

`ifdef LMEM_ENABLE

    // Elaboration-time checks on the local memory size and base address.
    `STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
    `STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))

    // Word-granular address width of the local memory.
    localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);

    // LSU traffic headed for local memory (one interface per LSU block)
    VX_lsu_mem_if #(
        .TAG_WIDTH (LSU_TAG_WIDTH),
        .DATA_SIZE (LSU_WORD_SIZE),
        .NUM_LANES (`NUM_LSU_LANES)
    ) lsu_lmem_if[`NUM_LSU_BLOCKS]();

    // Per-block switch: LSU input -> {global (dcache), local (lmem)} outputs.
    for (genvar blk = 0; blk < `NUM_LSU_BLOCKS; ++blk) begin : g_lmem_switches
        VX_lmem_switch #(
            .ARBITER      ("P"),
            .REQ0_OUT_BUF (1),
            .REQ1_OUT_BUF (0),
            .RSP_OUT_BUF  (1)
        ) lmem_switch (
            .clk          (clk),
            .reset        (reset),
            .lsu_in_if    (lsu_mem_if[blk]),
            .global_out_if(lsu_dcache_if[blk]),
            .local_out_if (lsu_lmem_if[blk])
        );
    end

    // Single arbitrated local-memory stream; its tag is LMEM_TAG_WIDTH wide.
    VX_lsu_mem_if #(
        .TAG_WIDTH (LMEM_TAG_WIDTH),
        .DATA_SIZE (LSU_WORD_SIZE),
        .NUM_LANES (`NUM_LSU_LANES)
    ) lmem_arb_if[1]();

    VX_lsu_mem_arb #(
        .NUM_INPUTS (`NUM_LSU_BLOCKS),
        .NUM_OUTPUTS(1),
        .NUM_LANES  (`NUM_LSU_LANES),
        .DATA_SIZE  (LSU_WORD_SIZE),
        .TAG_WIDTH  (LSU_TAG_WIDTH),
        .TAG_SEL_IDX(0),
        .ARBITER    ("R"),
        .REQ_OUT_BUF(0),
        .RSP_OUT_BUF(2)
    ) lmem_arb (
        .clk        (clk),
        .reset      (reset),
        .bus_in_if  (lsu_lmem_if),
        .bus_out_if (lmem_arb_if)
    );

    // Per-lane memory buses feeding the local memory.
    VX_mem_bus_if #(
        .TAG_WIDTH (LMEM_TAG_WIDTH),
        .DATA_SIZE (LSU_WORD_SIZE)
    ) lmem_adapt_if[`NUM_LSU_LANES]();

    VX_lsu_adapter #(
        .NUM_LANES    (`NUM_LSU_LANES),
        .DATA_SIZE    (LSU_WORD_SIZE),
        .TAG_WIDTH    (LMEM_TAG_WIDTH),
        .TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
        .ARBITER      ("P"),
        .REQ_OUT_BUF  (3),
        .RSP_OUT_BUF  (0)
    ) lmem_adapter (
        .clk        (clk),
        .reset      (reset),
        .lsu_mem_if (lmem_arb_if[0]),
        .mem_bus_if (lmem_adapt_if)
    );

    VX_local_mem #(
        .INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
        .SIZE       (1 << `LMEM_LOG_SIZE),
        .NUM_BANKS  (`LMEM_NUM_BANKS),
        .NUM_REQS   (`NUM_LSU_LANES),
        .WORD_SIZE  (LSU_WORD_SIZE),
        .ADDR_WIDTH (LMEM_ADDR_WIDTH),
        .TAG_WIDTH  (LMEM_TAG_WIDTH),
        .UUID_WIDTH (`UUID_WIDTH),
        .OUT_BUF    (3)
    ) local_mem (
        .clk        (clk),
        .reset      (reset),
    `ifdef PERF_ENABLE
        .lmem_perf  (lmem_perf),
    `endif
        .mem_bus_if (lmem_adapt_if)
    );

`else

    // No local memory: forward every LSU block straight to the dcache path.
    for (genvar blk = 0; blk < `NUM_LSU_BLOCKS; ++blk) begin : g_lsu_dcache_if
        `ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[blk], lsu_mem_if[blk]);
    end

`ifdef PERF_ENABLE
    // Local-memory counters are tied to zero when the memory is absent.
    assign lmem_perf = '0;
`endif

`endif

    // ------------------------------------------------------------------
    // Coalescing stage
    // ------------------------------------------------------------------
    VX_lsu_mem_if #(
        .TAG_WIDTH (DCACHE_TAG_WIDTH),
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .NUM_LANES (DCACHE_CHANNELS)
    ) dcache_coalesced_if[`NUM_LSU_BLOCKS]();

`ifdef PERF_ENABLE
    // Sum the per-block coalescer miss counters and expose the buffered total.
    wire [`PERF_CTR_BITS-1:0] coalescer_misses;
    wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;

    VX_reduce_tree #(
        .N         (`NUM_LSU_BLOCKS),
        .OP        ("+"),
        .DATAW_IN  (`PERF_CTR_BITS),
        .DATAW_OUT (`PERF_CTR_BITS)
    ) coalescer_reduce (
        .data_in  (per_block_coalescer_misses),
        .data_out (coalescer_misses)
    );

    `BUFFER(coalescer_perf.misses, coalescer_misses);
`endif

    if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled

        for (genvar blk = 0; blk < `NUM_LSU_BLOCKS; ++blk) begin : g_coalescers
            VX_mem_coalescer #(
                .INSTANCE_ID    (`SFORMATF(("%s-coalescer%0d", INSTANCE_ID, blk))),
                .NUM_REQS       (`NUM_LSU_LANES),
                .DATA_IN_SIZE   (LSU_WORD_SIZE),
                .DATA_OUT_SIZE  (DCACHE_WORD_SIZE),
                .ADDR_WIDTH     (LSU_ADDR_WIDTH),
                .FLAGS_WIDTH    (`MEM_REQ_FLAGS_WIDTH),
                .TAG_WIDTH      (LSU_TAG_WIDTH),
                .UUID_WIDTH     (`UUID_WIDTH),
                .QUEUE_SIZE     (`LSUQ_OUT_SIZE),
                .PERF_CTR_BITS  (`PERF_CTR_BITS)
            ) mem_coalescer (
                .clk            (clk),
                .reset          (reset),

            `ifdef PERF_ENABLE
                .misses         (per_block_coalescer_misses[blk]),
            `else
                `UNUSED_PIN (misses),
            `endif

                // LSU-side request
                .in_req_valid   (lsu_dcache_if[blk].req_valid),
                .in_req_mask    (lsu_dcache_if[blk].req_data.mask),
                .in_req_rw      (lsu_dcache_if[blk].req_data.rw),
                .in_req_byteen  (lsu_dcache_if[blk].req_data.byteen),
                .in_req_addr    (lsu_dcache_if[blk].req_data.addr),
                .in_req_flags   (lsu_dcache_if[blk].req_data.flags),
                .in_req_data    (lsu_dcache_if[blk].req_data.data),
                .in_req_tag     (lsu_dcache_if[blk].req_data.tag),
                .in_req_ready   (lsu_dcache_if[blk].req_ready),

                // LSU-side response
                .in_rsp_valid   (lsu_dcache_if[blk].rsp_valid),
                .in_rsp_mask    (lsu_dcache_if[blk].rsp_data.mask),
                .in_rsp_data    (lsu_dcache_if[blk].rsp_data.data),
                .in_rsp_tag     (lsu_dcache_if[blk].rsp_data.tag),
                .in_rsp_ready   (lsu_dcache_if[blk].rsp_ready),

                // dcache-side request
                .out_req_valid  (dcache_coalesced_if[blk].req_valid),
                .out_req_mask   (dcache_coalesced_if[blk].req_data.mask),
                .out_req_rw     (dcache_coalesced_if[blk].req_data.rw),
                .out_req_byteen (dcache_coalesced_if[blk].req_data.byteen),
                .out_req_addr   (dcache_coalesced_if[blk].req_data.addr),
                .out_req_flags  (dcache_coalesced_if[blk].req_data.flags),
                .out_req_data   (dcache_coalesced_if[blk].req_data.data),
                .out_req_tag    (dcache_coalesced_if[blk].req_data.tag),
                .out_req_ready  (dcache_coalesced_if[blk].req_ready),

                // dcache-side response
                .out_rsp_valid  (dcache_coalesced_if[blk].rsp_valid),
                .out_rsp_mask   (dcache_coalesced_if[blk].rsp_data.mask),
                .out_rsp_data   (dcache_coalesced_if[blk].rsp_data.data),
                .out_rsp_tag    (dcache_coalesced_if[blk].rsp_data.tag),
                .out_rsp_ready  (dcache_coalesced_if[blk].rsp_ready)
            );
        end

    end else begin : g_passthru

        // Coalescing not applicable: wire each block through and report zero misses.
        for (genvar blk = 0; blk < `NUM_LSU_BLOCKS; ++blk) begin : g_dcache_coalesced_if
        `ifdef PERF_ENABLE
            assign per_block_coalescer_misses[blk] = '0;
        `endif
            `ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[blk], lsu_dcache_if[blk]);
        end

    end

    // ------------------------------------------------------------------
    // Fan each block's coalesced interface out onto the flat dcache port array
    // ------------------------------------------------------------------
    for (genvar blk = 0; blk < `NUM_LSU_BLOCKS; ++blk) begin : g_dcache_adapters

        VX_mem_bus_if #(
            .TAG_WIDTH (DCACHE_TAG_WIDTH),
            .DATA_SIZE (DCACHE_WORD_SIZE)
        ) dcache_bus_tmp_if[DCACHE_CHANNELS]();

        VX_lsu_adapter #(
            .NUM_LANES    (DCACHE_CHANNELS),
            .DATA_SIZE    (DCACHE_WORD_SIZE),
            .TAG_WIDTH    (DCACHE_TAG_WIDTH),
            .TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
            .ARBITER      ("P"),
            .REQ_OUT_BUF  (0),
            .RSP_OUT_BUF  (0)
        ) dcache_adapter (
            .clk        (clk),
            .reset      (reset),
            .lsu_mem_if (dcache_coalesced_if[blk]),
            .mem_bus_if (dcache_bus_tmp_if)
        );

        // Block blk owns dcache ports [blk*DCACHE_CHANNELS +: DCACHE_CHANNELS].
        for (genvar ch = 0; ch < DCACHE_CHANNELS; ++ch) begin : g_dcache_bus_if
            `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[blk * DCACHE_CHANNELS + ch], dcache_bus_tmp_if[ch]);
        end

    end

endmodule
|
|
@ -1,127 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// VX_mem_unit_top
//
// Flat-port wrapper around VX_mem_unit: every SystemVerilog interface of the
// wrapped module is broken out into plain packed-array ports.
//
// Ports:
//   lsu_req_* / lsu_rsp_*  - per-LSU-block request/response channels, indexed
//                            [block][lane] where a lane dimension is present.
//   mem_req_* / mem_rsp_*  - per-dcache-port request/response channels
//                            (DCACHE_NUM_REQS entries).
//
// Performance counters produced by VX_mem_unit (PERF_ENABLE builds) are not
// exported by this wrapper; they are captured locally and marked unused.
module VX_mem_unit_top import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = "",
    parameter LSU_WORD_WIDTH = LSU_WORD_SIZE * 8
) (
    // Clock
    input wire                              clk,
    input wire                              reset,

    // LSU memory request
    input  wire [`NUM_LSU_BLOCKS-1:0]                                                 lsu_req_valid,
    input  wire [`NUM_LSU_BLOCKS-1:0]                                                 lsu_req_rw,
    input  wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0]                             lsu_req_mask,
    input  wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_SIZE-1:0]          lsu_req_byteen,
    input  wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_ADDR_WIDTH-1:0]         lsu_req_addr,
    input  wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0]   lsu_req_flags,
    input  wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0]         lsu_req_data,
    input  wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0]                              lsu_req_tag,
    output wire [`NUM_LSU_BLOCKS-1:0]                                                 lsu_req_ready,

    // LSU memory response
    output wire [`NUM_LSU_BLOCKS-1:0]                                                 lsu_rsp_valid,
    output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0]                             lsu_rsp_mask,
    output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0]         lsu_rsp_data,
    output wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0]                              lsu_rsp_tag,
    input  wire [`NUM_LSU_BLOCKS-1:0]                                                 lsu_rsp_ready,

    // Memory request
    output wire [DCACHE_NUM_REQS-1:0]                            mem_req_valid,
    output wire [DCACHE_NUM_REQS-1:0]                            mem_req_rw,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0]      mem_req_byteen,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0]     mem_req_addr,
    output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0]  mem_req_flags,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0]    mem_req_data,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0]      mem_req_tag,
    input  wire [DCACHE_NUM_REQS-1:0]                            mem_req_ready,

    // Memory response
    input  wire [DCACHE_NUM_REQS-1:0]                            mem_rsp_valid,
    input  wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0]    mem_rsp_data,
    input  wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0]      mem_rsp_tag,
    output wire [DCACHE_NUM_REQS-1:0]                            mem_rsp_ready
);
    // Interface bundle handed to VX_mem_unit on the LSU side.
    VX_lsu_mem_if #(
        .NUM_LANES (`NUM_LSU_LANES),
        .DATA_SIZE (LSU_WORD_SIZE),
        .TAG_WIDTH (LSU_TAG_WIDTH)
    ) lsu_mem_if[`NUM_LSU_BLOCKS]();

    // LSU memory request: flat ports -> interface
    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_mem_req
        assign lsu_mem_if[i].req_valid       = lsu_req_valid[i];
        assign lsu_mem_if[i].req_data.rw     = lsu_req_rw[i];
        assign lsu_mem_if[i].req_data.mask   = lsu_req_mask[i];
        assign lsu_mem_if[i].req_data.byteen = lsu_req_byteen[i];
        assign lsu_mem_if[i].req_data.addr   = lsu_req_addr[i];
        assign lsu_mem_if[i].req_data.flags  = lsu_req_flags[i];
        assign lsu_mem_if[i].req_data.data   = lsu_req_data[i];
        assign lsu_mem_if[i].req_data.tag    = lsu_req_tag[i];
        assign lsu_req_ready[i] = lsu_mem_if[i].req_ready;
    end

    // LSU memory response: interface -> flat ports
    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_rsp
        assign lsu_rsp_valid[i] = lsu_mem_if[i].rsp_valid;
        assign lsu_rsp_mask[i]  = lsu_mem_if[i].rsp_data.mask;
        assign lsu_rsp_data[i]  = lsu_mem_if[i].rsp_data.data;
        assign lsu_rsp_tag[i]   = lsu_mem_if[i].rsp_data.tag;
        assign lsu_mem_if[i].rsp_ready = lsu_rsp_ready[i];
    end

    // Interface bundle handed to VX_mem_unit on the dcache side.
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_TAG_WIDTH)
    ) mem_bus_if[DCACHE_NUM_REQS]();

    // memory request: interface -> flat ports
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_req
        assign mem_req_valid[i]  = mem_bus_if[i].req_valid;
        assign mem_req_rw[i]     = mem_bus_if[i].req_data.rw;
        assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen;
        assign mem_req_addr[i]   = mem_bus_if[i].req_data.addr;
        assign mem_req_flags[i]  = mem_bus_if[i].req_data.flags;
        assign mem_req_data[i]   = mem_bus_if[i].req_data.data;
        assign mem_req_tag[i]    = mem_bus_if[i].req_data.tag;
        assign mem_bus_if[i].req_ready = mem_req_ready[i];
    end

    // memory response: flat ports -> interface
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_bus_rsp
        assign mem_bus_if[i].rsp_valid     = mem_rsp_valid[i];
        assign mem_bus_if[i].rsp_data.tag  = mem_rsp_tag[i];
        assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
        assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
    end

`ifdef PERF_ENABLE
    // Sinks for VX_mem_unit's perf outputs. They use the same types as the
    // ports they connect to (lmem_perf_t / coalescer_perf_t) and carry no
    // initializer, because the only driver is the mem_unit output port.
    lmem_perf_t      lmem_perf;
    coalescer_perf_t coalescer_perf;
    `UNUSED_VAR (lmem_perf)
    `UNUSED_VAR (coalescer_perf)
`endif

    VX_mem_unit #(
        .INSTANCE_ID (INSTANCE_ID)
    ) mem_unit (
        .clk            (clk),
        .reset          (reset),
    `ifdef PERF_ENABLE
        .lmem_perf      (lmem_perf),
        .coalescer_perf (coalescer_perf),
    `endif
        .lsu_mem_if     (lsu_mem_if),
        .dcache_bus_if  (mem_bus_if)
    );

endmodule
|
|
@ -23,7 +23,7 @@
|
|||
module VX_operands import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter NUM_BANKS = 4,
|
||||
parameter OUT_BUF = 3
|
||||
parameter OUT_BUF = 4 // using 2-cycle EB for area reduction
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -37,15 +37,15 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
VX_operands_if.master operands_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam NUM_SRC_OPDS = 3;
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS);
|
||||
localparam NUM_SRC_REGS = 3;
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
|
||||
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
|
||||
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
|
||||
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
|
||||
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
|
||||
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
|
||||
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
|
||||
localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW;
|
||||
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
|
||||
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
|
||||
localparam XLEN_SIZE = `XLEN / 8;
|
||||
|
@ -53,80 +53,87 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
|
||||
`UNUSED_VAR (writeback_if.data.sop)
|
||||
|
||||
wire [NUM_SRC_OPDS-1:0] src_valid;
|
||||
wire [NUM_SRC_OPDS-1:0] req_valid_in, req_ready_in;
|
||||
wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_data_in;
|
||||
wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
|
||||
wire [NUM_SRC_REGS-1:0] src_valid;
|
||||
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
|
||||
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
|
||||
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
|
||||
|
||||
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
|
||||
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
|
||||
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
|
||||
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st2;
|
||||
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2;
|
||||
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
|
||||
|
||||
wire pipe_ready_in;
|
||||
wire pipe_valid_st1, pipe_ready_st1;
|
||||
wire pipe_valid_st2, pipe_ready_st2;
|
||||
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
|
||||
|
||||
reg [NUM_SRC_OPDS-1:0][(`NUM_THREADS * `XLEN)-1:0] src_data_st2, src_data_m_st2;
|
||||
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
|
||||
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
|
||||
|
||||
reg [NUM_SRC_OPDS-1:0] data_fetched_st1;
|
||||
reg [NUM_SRC_REGS-1:0] data_fetched_n;
|
||||
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
|
||||
|
||||
reg has_collision_n;
|
||||
wire has_collision_st1;
|
||||
|
||||
wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds;
|
||||
assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, scoreboard_if.data.rs1};
|
||||
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
|
||||
scoreboard_if.data.rs2,
|
||||
scoreboard_if.data.rs1};
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_data_in
|
||||
if (ISSUE_WIS != 0) begin : g_wis
|
||||
assign req_data_in[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
|
||||
end else begin : g_no_wis
|
||||
assign req_data_in[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS];
|
||||
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
|
||||
if (ISSUE_WIS != 0) begin
|
||||
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
|
||||
end else begin
|
||||
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_bank_idx
|
||||
if (NUM_BANKS != 1) begin : g_multibanks
|
||||
assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0];
|
||||
end else begin : g_singlebank
|
||||
if (NUM_BANKS != 1) begin
|
||||
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
|
||||
end else begin
|
||||
assign req_bank_idx[i] = '0;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_src_valid
|
||||
assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i];
|
||||
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
|
||||
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
|
||||
end
|
||||
|
||||
assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
|
||||
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_SRC_OPDS),
|
||||
.NUM_INPUTS (NUM_SRC_REGS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (PER_BANK_ADDRW),
|
||||
.ARBITER ("P"), // use priority arbiter
|
||||
.PERF_CTR_BITS(`PERF_CTR_BITS),
|
||||
.OUT_BUF (0) // no output buffering
|
||||
) req_xbar (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`UNUSED_PIN(collisions),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in),
|
||||
.valid_in (req_in_valid),
|
||||
.data_in (req_in_data),
|
||||
.sel_in (req_bank_idx),
|
||||
.ready_in (req_ready_in),
|
||||
.ready_in (req_in_ready),
|
||||
.valid_out (gpr_rd_valid),
|
||||
.data_out (gpr_rd_addr),
|
||||
.sel_out (gpr_rd_req_idx),
|
||||
.ready_out (gpr_rd_ready)
|
||||
);
|
||||
|
||||
assign gpr_rd_ready = {NUM_BANKS{pipe_ready_in}};
|
||||
wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1;
|
||||
|
||||
assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}};
|
||||
|
||||
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
|
||||
|
||||
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
|
||||
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
|
||||
|
||||
always @(*) begin
|
||||
has_collision_n = 0;
|
||||
for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin
|
||||
for (integer i = 0; i < NUM_SRC_REGS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin
|
||||
has_collision_n |= src_valid[i]
|
||||
&& src_valid[j+i]
|
||||
&& (req_bank_idx[i] == req_bank_idx[j+i]);
|
||||
|
@ -134,7 +141,14 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
wire [NUM_SRC_OPDS-1:0] req_fire_in = req_valid_in & req_ready_in;
|
||||
always @(*) begin
|
||||
data_fetched_n = data_fetched_st1;
|
||||
if (scoreboard_if.ready) begin
|
||||
data_fetched_n = '0;
|
||||
end else begin
|
||||
data_fetched_n = data_fetched_st1 | req_in_ready;
|
||||
end
|
||||
end
|
||||
|
||||
assign pipe_data = {
|
||||
scoreboard_if.data.wis,
|
||||
|
@ -148,74 +162,61 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
scoreboard_if.data.uuid
|
||||
};
|
||||
|
||||
assign scoreboard_if.ready = pipe_ready_in && ~has_collision_n;
|
||||
|
||||
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
|
||||
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
|
||||
|
||||
VX_pipe_buffer #(
|
||||
.DATAW (NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH))
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
|
||||
.RESETW (1 + NUM_SRC_REGS)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (scoreboard_if.valid),
|
||||
.ready_in (pipe_ready_in),
|
||||
.data_in ({gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
|
||||
.data_out ({gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}),
|
||||
.valid_out(pipe_valid_st1),
|
||||
.ready_out(pipe_ready_st1)
|
||||
.enable (pipe_in_ready),
|
||||
.data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
|
||||
.data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1})
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || scoreboard_if.ready) begin
|
||||
data_fetched_st1 <= 0;
|
||||
end else begin
|
||||
data_fetched_st1 <= data_fetched_st1 | req_fire_in;
|
||||
end
|
||||
end
|
||||
assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2;
|
||||
|
||||
assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n;
|
||||
|
||||
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
|
||||
|
||||
VX_pipe_buffer #(
|
||||
.DATAW (NUM_BANKS * (1 + REQ_SEL_WIDTH) + META_DATAW)
|
||||
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
|
||||
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
|
||||
) pipe_reg2 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (pipe_valid2_st1),
|
||||
.ready_in (pipe_ready_st1),
|
||||
.data_in ({gpr_rd_valid_st1, gpr_rd_req_idx_st1, pipe_data_st1}),
|
||||
.data_out ({gpr_rd_valid_st2, gpr_rd_req_idx_st2, pipe_data_st2}),
|
||||
.valid_out(pipe_valid_st2),
|
||||
.ready_out(pipe_ready_st2)
|
||||
.reset (pipe2_reset),
|
||||
.enable (pipe_ready_st1),
|
||||
.data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
|
||||
.data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2})
|
||||
);
|
||||
|
||||
always @(*) begin
|
||||
src_data_m_st2 = src_data_st2;
|
||||
src_data_n = src_data_st2;
|
||||
for (integer b = 0; b < NUM_BANKS; ++b) begin
|
||||
if (gpr_rd_valid_st2[b]) begin
|
||||
src_data_m_st2[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
|
||||
src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || pipe_fire_st2) begin
|
||||
src_data_st2 <= 0;
|
||||
end else begin
|
||||
src_data_st2 <= src_data_m_st2;
|
||||
end
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
|
||||
.LUTRAM (1)
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (pipe_valid_st2),
|
||||
.ready_in (pipe_ready_st2),
|
||||
.data_in ({pipe_data_st2, src_data_m_st2}),
|
||||
.data_in ({
|
||||
pipe_data_st2,
|
||||
src_data_n[0],
|
||||
src_data_n[1],
|
||||
src_data_n[2]
|
||||
}),
|
||||
.data_out ({
|
||||
operands_if.data.wis,
|
||||
operands_if.data.tmask,
|
||||
|
@ -226,39 +227,51 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
operands_if.data.op_args,
|
||||
operands_if.data.rd,
|
||||
operands_if.data.uuid,
|
||||
operands_if.data.rs3_data,
|
||||
operands_if.data.rs1_data,
|
||||
operands_if.data.rs2_data,
|
||||
operands_if.data.rs1_data
|
||||
operands_if.data.rs3_data
|
||||
}),
|
||||
.valid_out (operands_if.valid),
|
||||
.ready_out (operands_if.ready)
|
||||
);
|
||||
|
||||
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
|
||||
if (ISSUE_WIS != 0) begin : g_gpr_wr_addr
|
||||
if (ISSUE_WIS != 0) begin
|
||||
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
|
||||
end else begin : g_gpr_wr_addr_no_wis
|
||||
end else begin
|
||||
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
|
||||
end
|
||||
|
||||
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
|
||||
if (NUM_BANKS != 1) begin : g_gpr_wr_bank_idx
|
||||
if (NUM_BANKS != 1) begin
|
||||
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
|
||||
end else begin : g_gpr_wr_bank_idx_0
|
||||
end else begin
|
||||
assign gpr_wr_bank_idx = '0;
|
||||
end
|
||||
|
||||
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rams
|
||||
`ifdef GPR_RESET
|
||||
reg wr_enabled = 0;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
wr_enabled <= 1;
|
||||
end
|
||||
end
|
||||
`else
|
||||
wire wr_enabled = 1;
|
||||
`endif
|
||||
|
||||
for (genvar b = 0; b < NUM_BANKS; ++b) begin
|
||||
wire gpr_wr_enabled;
|
||||
if (BANK_SEL_BITS != 0) begin : g_gpr_wr_enabled_multibanks
|
||||
assign gpr_wr_enabled = writeback_if.valid
|
||||
if (BANK_SEL_BITS != 0) begin
|
||||
assign gpr_wr_enabled = wr_enabled
|
||||
&& writeback_if.valid
|
||||
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
|
||||
end else begin : g_gpr_wr_enabled
|
||||
assign gpr_wr_enabled = writeback_if.valid;
|
||||
end else begin
|
||||
assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
|
||||
end
|
||||
|
||||
wire [BYTEENW-1:0] wren;
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_wren
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
|
||||
end
|
||||
|
||||
|
@ -269,8 +282,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
`ifdef GPR_RESET
|
||||
.RESET_RAM (1),
|
||||
`endif
|
||||
.OUT_REG (1),
|
||||
.RDW_MODE ("R")
|
||||
.NO_RWCHECK (1)
|
||||
) gpr_ram (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -280,7 +292,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if.data.data),
|
||||
.raddr (gpr_rd_addr_st1[b]),
|
||||
.rdata (gpr_rd_data_st2[b])
|
||||
.rdata (gpr_rd_data_st1[b])
|
||||
);
|
||||
end
|
||||
|
||||
|
@ -290,7 +302,7 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
if (reset) begin
|
||||
collisions_r <= '0;
|
||||
end else begin
|
||||
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_ready_in && has_collision_n);
|
||||
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_in_ready && has_collision_n);
|
||||
end
|
||||
end
|
||||
assign perf_stalls = collisions_r;
|
||||
|
|
|
@ -1,93 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// VX_pe_switch
//
// Connects one execute/commit interface pair to PE_COUNT processing-element
// (PE) interface pairs:
//   * request path : execute_in_if is fed to a VX_stream_switch (1 input,
//                    PE_COUNT outputs) steered by pe_sel; each switch output
//                    drives one execute_out_if[i].
//   * response path: every commit_in_if[i] is fed to a VX_stream_arb
//                    (PE_COUNT inputs) whose single output drives
//                    commit_out_if.
//
// Parameters:
//   PE_COUNT    - number of PE ports. Defaults to 0; presumably must be
//                 overridden by the instantiating module - TODO confirm.
//   NUM_LANES   - lane count used in the data-width sums below. Defaults to 0;
//                 it is used as a divisor in PID_BITS, so presumably must be
//                 overridden - TODO confirm.
//   REQ_OUT_BUF - forwarded as OUT_BUF to the request VX_stream_switch.
//   RSP_OUT_BUF - forwarded as OUT_BUF to the response VX_stream_arb.
//   ARBITER     - arbitration policy string forwarded to VX_stream_arb.
//   PE_SEL_BITS - derived from PE_COUNT; sets the width of pe_sel.
//
// NOTE(review): the exact routing/arbitration semantics live in
// VX_stream_switch and VX_stream_arb, which are not visible here.
module VX_pe_switch import VX_gpu_pkg::*; #(
|
||||
parameter PE_COUNT = 0,
|
||||
parameter NUM_LANES = 0,
|
||||
parameter REQ_OUT_BUF = 0,
|
||||
parameter RSP_OUT_BUF = 0,
|
||||
parameter `STRING ARBITER = "R",
|
||||
parameter PE_SEL_BITS = `CLOG2(PE_COUNT)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire [`UP(PE_SEL_BITS)-1:0] pe_sel, // PE selector, wired to the request switch's sel_in
|
||||
VX_execute_if.slave execute_in_if, // single incoming request stream
|
||||
VX_commit_if.master commit_out_if, // single outgoing response stream
|
||||
VX_execute_if.master execute_out_if[PE_COUNT], // per-PE request streams
|
||||
VX_commit_if .slave commit_in_if[PE_COUNT] // per-PE response streams
|
||||
);
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
// Flattened payload widths handed to the switch/arbiter as DATAW.
// NOTE(review): presumably these sums must equal the packed widths of
// execute_in_if.data and commit_in_if[i].data respectively - confirm against
// the VX_execute_if / VX_commit_if declarations.
localparam REQ_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `INST_ALU_BITS + $bits(op_args_t) + 1 + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
|
||||
// Request path: flat per-PE valid/data/ready vectors produced by the switch.
wire [PE_COUNT-1:0] pe_req_valid;
|
||||
wire [PE_COUNT-1:0][REQ_DATAW-1:0] pe_req_data;
|
||||
wire [PE_COUNT-1:0] pe_req_ready;
|
||||
|
||||
VX_stream_switch #(
|
||||
.DATAW (REQ_DATAW),
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_OUTPUTS (PE_COUNT),
|
||||
.OUT_BUF (REQ_OUT_BUF)
|
||||
) req_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (pe_sel),
|
||||
.valid_in (execute_in_if.valid),
|
||||
.ready_in (execute_in_if.ready),
|
||||
.data_in (execute_in_if.data),
|
||||
.data_out (pe_req_data),
|
||||
.valid_out (pe_req_valid),
|
||||
.ready_out (pe_req_ready)
|
||||
);
|
||||
|
||||
// Unpack the flat request vectors onto the per-PE interface array.
for (genvar i = 0; i < PE_COUNT; ++i) begin : g_execute_out_if
|
||||
assign execute_out_if[i].valid = pe_req_valid[i];
|
||||
assign execute_out_if[i].data = pe_req_data[i];
|
||||
assign pe_req_ready[i] = execute_out_if[i].ready;
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Response path: flat per-PE valid/data/ready vectors consumed by the arbiter.
wire [PE_COUNT-1:0] pe_rsp_valid;
|
||||
wire [PE_COUNT-1:0][RSP_DATAW-1:0] pe_rsp_data;
|
||||
wire [PE_COUNT-1:0] pe_rsp_ready;
|
||||
|
||||
// Pack the per-PE commit interfaces into the flat response vectors.
for (genvar i = 0; i < PE_COUNT; ++i) begin : g_commit_in_if
|
||||
assign pe_rsp_valid[i] = commit_in_if[i].valid;
|
||||
assign pe_rsp_data[i] = commit_in_if[i].data;
|
||||
assign commit_in_if[i].ready = pe_rsp_ready[i];
|
||||
end
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (PE_COUNT),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.OUT_BUF (RSP_OUT_BUF)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (pe_rsp_valid),
|
||||
.ready_in (pe_rsp_ready),
|
||||
.data_in (pe_rsp_data),
|
||||
.data_out (commit_out_if.data),
|
||||
.valid_out (commit_out_if.valid),
|
||||
.ready_out (commit_out_if.ready),
|
||||
`UNUSED_PIN (sel_out) // the arbiter's sel_out port is left unconnected here
|
||||
);
|
||||
|
||||
endmodule
|
|
@ -68,6 +68,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
reg [`PERF_CTR_BITS-1:0] cycles;
|
||||
|
||||
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
|
||||
|
||||
wire schedule_fire = schedule_valid && schedule_ready;
|
||||
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
|
||||
|
||||
|
@ -76,7 +78,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
|
||||
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
|
||||
wire [`NUM_ALU_BLOCKS-1:0][`PC_BITS-1:0] branch_dest;
|
||||
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin : g_branch_init
|
||||
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
|
||||
assign branch_valid[i] = branch_ctl_if[i].valid;
|
||||
assign branch_wid[i] = branch_ctl_if[i].wid;
|
||||
assign branch_taken[i] = branch_ctl_if[i].taken;
|
||||
|
@ -111,16 +113,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
barrier_stalls_n= barrier_stalls;
|
||||
warp_pcs_n = warp_pcs;
|
||||
|
||||
// decode unlock
|
||||
if (decode_sched_if.valid && decode_sched_if.unlock) begin
|
||||
stalled_warps_n[decode_sched_if.wid] = 0;
|
||||
end
|
||||
|
||||
// CSR unlock
|
||||
if (sched_csr_if.unlock_warp) begin
|
||||
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
|
||||
end
|
||||
|
||||
// wspawn handling
|
||||
if (wspawn.valid && is_single_warp) begin
|
||||
active_warps_n |= wspawn.wmask;
|
||||
|
@ -178,11 +170,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
|
||||
end
|
||||
end
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin
|
||||
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
|
||||
barrier_masks_n[gbar_bus_if.rsp_data.id] = '0; // reset barrier mask
|
||||
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
|
||||
stalled_warps_n = '0; // unlock all warps
|
||||
end
|
||||
`endif
|
||||
|
@ -197,6 +188,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
// decode unlock
|
||||
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
|
||||
stalled_warps_n[decode_sched_if.wid] = 0;
|
||||
end
|
||||
|
||||
// CSR unlock
|
||||
if (sched_csr_if.unlock_warp) begin
|
||||
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
|
||||
end
|
||||
|
||||
// stall the warp until decode stage
|
||||
if (schedule_fire) begin
|
||||
stalled_warps_n[schedule_wid] = 1;
|
||||
|
@ -222,6 +223,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
active_warps <= '0;
|
||||
thread_masks <= '0;
|
||||
barrier_stalls <= '0;
|
||||
issued_instrs <= '0;
|
||||
cycles <= '0;
|
||||
wspawn.valid <= 0;
|
||||
|
||||
|
@ -266,6 +268,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`endif
|
||||
|
||||
if (schedule_if_fire) begin
|
||||
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
|
||||
end
|
||||
|
||||
if (busy) begin
|
||||
cycles <= cycles + 1;
|
||||
end
|
||||
|
@ -275,19 +281,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
// barrier handling
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
assign gbar_bus_if.req_valid = gbar_req_valid;
|
||||
assign gbar_bus_if.req_data.id = gbar_req_id;
|
||||
assign gbar_bus_if.req_data.size_m1 = gbar_req_size_m1;
|
||||
assign gbar_bus_if.req_data.core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
assign gbar_bus_if.req_valid = gbar_req_valid;
|
||||
assign gbar_bus_if.req_id = gbar_req_id;
|
||||
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
|
||||
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
`endif
|
||||
|
||||
// split/join handling
|
||||
|
||||
`RESET_RELAY (split_join_reset, reset);
|
||||
|
||||
VX_split_join #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-splitjoin", INSTANCE_ID)))
|
||||
.INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID))
|
||||
) split_join (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (split_join_reset),
|
||||
.valid (warp_ctl_if.valid),
|
||||
.wid (warp_ctl_if.wid),
|
||||
.split (warp_ctl_if.split),
|
||||
|
@ -316,7 +324,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `PC_BITS)-1:0] schedule_data;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_schedule_data
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
|
||||
end
|
||||
|
||||
|
@ -325,50 +333,67 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-5:0]
|
||||
};
|
||||
|
||||
wire [`UUID_WIDTH-1:0] instr_uuid;
|
||||
`ifdef UUID_ENABLE
|
||||
VX_uuid_gen #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.UUID_WIDTH (`UUID_WIDTH)
|
||||
) uuid_gen (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (schedule_fire),
|
||||
.wid (schedule_wid),
|
||||
.uuid (instr_uuid)
|
||||
);
|
||||
`ifndef NDEBUG
|
||||
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
|
||||
reg [`UUID_WIDTH-1:0] instr_uuid;
|
||||
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
|
||||
`ifdef SV_DPI
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 32'd0));
|
||||
end else if (schedule_fire) begin
|
||||
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid)));
|
||||
end
|
||||
end
|
||||
`else
|
||||
assign instr_uuid = '0;
|
||||
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
|
||||
always @(*) begin
|
||||
instr_uuid = `UUID_WIDTH'(w_uuid);
|
||||
end
|
||||
`endif
|
||||
`else
|
||||
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
|
||||
`endif
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH + `UUID_WIDTH),
|
||||
.SIZE (2), // need to buffer out ready_in
|
||||
.OUT_REG (1) // should be registered for BRAM acces in fetch unit
|
||||
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH)
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (schedule_valid),
|
||||
.ready_in (schedule_ready),
|
||||
.data_in ({schedule_tmask, schedule_pc, schedule_wid, instr_uuid}),
|
||||
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.uuid}),
|
||||
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
|
||||
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
|
||||
.valid_out (schedule_if.valid),
|
||||
.ready_out (schedule_if.ready)
|
||||
);
|
||||
|
||||
assign schedule_if.data.uuid = instr_uuid;
|
||||
|
||||
// Track pending instructions per warp
|
||||
|
||||
reg [`NUM_WARPS-1:0] per_warp_incr;
|
||||
always @(*) begin
|
||||
per_warp_incr = 0;
|
||||
if (schedule_if_fire) begin
|
||||
per_warp_incr[schedule_if.data.wid] = 1;
|
||||
end
|
||||
end
|
||||
|
||||
wire [`NUM_WARPS-1:0] pending_warp_empty;
|
||||
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_sizes
|
||||
`RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT);
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
|
||||
VX_pending_size #(
|
||||
.SIZE (4096),
|
||||
.ALM_EMPTY (1)
|
||||
) counter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (schedule_if_fire && (schedule_if.data.wid == `NW_WIDTH'(i))),
|
||||
.reset (pending_instr_reset[i]),
|
||||
.incr (per_warp_incr[i]),
|
||||
.decr (commit_sched_if.committed_warps[i]),
|
||||
.empty (pending_warp_empty[i]),
|
||||
.alm_empty (pending_warp_alm_empty[i]),
|
||||
|
@ -382,7 +407,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
wire no_pending_instr = (& pending_warp_empty);
|
||||
|
||||
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1, 1);
|
||||
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
|
||||
|
||||
// export CSRs
|
||||
assign sched_csr_if.cycles = cycles;
|
||||
|
@ -397,7 +422,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
timeout_ctr <= '0;
|
||||
timeout_enable <= 0;
|
||||
end else begin
|
||||
if (decode_sched_if.valid && decode_sched_if.unlock) begin
|
||||
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
|
||||
timeout_enable <= 1;
|
||||
end
|
||||
if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin
|
||||
|
|
|
@ -30,8 +30,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
VX_scoreboard_if.master scoreboard_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam NUM_SRC_OPDS = 3;
|
||||
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
|
||||
|
||||
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
|
||||
|
@ -44,7 +42,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
VX_reduce_tree #(
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
|
@ -53,7 +51,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
.data_out (perf_units_per_cycle)
|
||||
);
|
||||
|
||||
VX_reduce_tree #(
|
||||
VX_reduce #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
|
@ -62,17 +60,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
.data_out (perf_sfu_per_cycle)
|
||||
);
|
||||
|
||||
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
|
||||
|
||||
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stg_valid_in
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
assign stg_valid_in[w] = staging_if[w].valid;
|
||||
end
|
||||
|
||||
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
|
||||
|
||||
always @(posedge clk) begin : g_perf_stalls
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls <= '0;
|
||||
end else begin
|
||||
|
@ -80,7 +78,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_units_uses
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_units_uses[i] <= '0;
|
||||
|
@ -90,7 +88,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_perf_sfu_uses
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sfu_uses[i] <= '0;
|
||||
|
@ -101,9 +99,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`endif
|
||||
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stanging_bufs
|
||||
VX_pipe_buffer #(
|
||||
.DATAW (DATAW)
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (1)
|
||||
) stanging_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -116,10 +115,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
);
|
||||
end
|
||||
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
reg [`NUM_REGS-1:0] inuse_regs;
|
||||
|
||||
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
|
||||
reg [3:0] operands_busy, operands_busy_n;
|
||||
|
||||
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
|
||||
|
||||
|
@ -129,10 +128,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
|
||||
&& writeback_if.data.eop;
|
||||
|
||||
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
|
||||
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
|
||||
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
@ -140,36 +135,86 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
always @(*) begin
|
||||
perf_inuse_units_per_cycle[w] = '0;
|
||||
perf_inuse_sfu_per_cycle[w] = '0;
|
||||
for (integer i = 0; i < NUM_OPDS; ++i) begin
|
||||
if (staging_if[w].valid && operands_busy[i]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1;
|
||||
if (inuse_units[stg_opds[i]] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1;
|
||||
if (staging_if[w].valid) begin
|
||||
if (operands_busy[0]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[1]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[2]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
|
||||
end
|
||||
end
|
||||
if (operands_busy[3]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
|
||||
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n
|
||||
always @(*) begin
|
||||
operands_busy_n[i] = operands_busy[i];
|
||||
always @(*) begin
|
||||
operands_busy_n = operands_busy;
|
||||
if (ibuffer_fire) begin
|
||||
operands_busy_n = {
|
||||
inuse_regs[ibuffer_if[w].data.rs3],
|
||||
inuse_regs[ibuffer_if[w].data.rs2],
|
||||
inuse_regs[ibuffer_if[w].data.rs1],
|
||||
inuse_regs[ibuffer_if[w].data.rd]
|
||||
};
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
if (ibuffer_fire) begin
|
||||
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
|
||||
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 1;
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 0;
|
||||
end
|
||||
end else begin
|
||||
if (writeback_if.data.rd == staging_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 0;
|
||||
end
|
||||
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 0;
|
||||
end
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
if (ibuffer_fire) begin
|
||||
if (writeback_if.data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 0;
|
||||
end
|
||||
end else begin
|
||||
if (writeback_if.data.rd == stg_opds[i]) begin
|
||||
operands_busy_n[i] = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
|
||||
operands_busy_n[0] = 1;
|
||||
end
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
|
||||
operands_busy_n[1] = 1;
|
||||
end
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
|
||||
operands_busy_n[2] = 1;
|
||||
end
|
||||
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
|
||||
operands_busy_n[3] = 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -185,10 +230,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
inuse_regs[staging_if[w].data.rd] <= 1;
|
||||
end
|
||||
end
|
||||
|
||||
operands_busy <= operands_busy_n;
|
||||
operands_ready[w] <= ~(| operands_busy_n);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
|
||||
|
@ -208,9 +251,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end else begin
|
||||
if (staging_if[w].valid && ~staging_if[w].ready) begin
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
`TRACE(4, ("%t: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
|
||||
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
|
||||
operands_busy, staging_if[w].data.uuid))
|
||||
operands_busy, staging_if[w].data.uuid));
|
||||
`endif
|
||||
timeout_ctr <= timeout_ctr + 1;
|
||||
end else if (ibuffer_fire) begin
|
||||
|
@ -222,11 +265,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
|
||||
operands_busy, staging_if[w].data.uuid))
|
||||
operands_busy, staging_if[w].data.uuid));
|
||||
|
||||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
|
||||
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid))
|
||||
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
|
||||
`endif
|
||||
|
||||
end
|
||||
|
@ -235,20 +278,23 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
|
||||
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
|
||||
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_arb_data_in
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
|
||||
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
|
||||
assign arb_data_in[w] = staging_if[w].data;
|
||||
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
|
||||
end
|
||||
|
||||
`RESET_RELAY (arb_reset, reset);
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (PER_ISSUE_WARPS),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER ("C"),
|
||||
.OUT_BUF (3)
|
||||
.ARBITER ("F"),
|
||||
.LUTRAM (1),
|
||||
.OUT_BUF (4) // using 2-cycle EB for area reduction
|
||||
) out_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (arb_reset),
|
||||
.valid_in (arb_valid_in),
|
||||
.ready_in (arb_ready_in),
|
||||
.data_in (arb_data_in),
|
||||
|
|
|
@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
input sysmem_perf_t sysmem_perf,
|
||||
input pipeline_perf_t pipeline_perf,
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
VX_pipeline_perf_if.slave pipeline_perf_if,
|
||||
`endif
|
||||
|
||||
input base_dcrs_t base_dcrs,
|
||||
|
@ -41,25 +41,24 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
VX_warp_ctl_if.master warp_ctl_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam BLOCK_SIZE = 1;
|
||||
localparam NUM_LANES = `NUM_SFU_LANES;
|
||||
localparam PE_COUNT = 2;
|
||||
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
|
||||
localparam PE_IDX_WCTL = 0;
|
||||
localparam PE_IDX_CSRS = 1;
|
||||
localparam BLOCK_SIZE = 1;
|
||||
localparam NUM_LANES = `NUM_SFU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
|
||||
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `PC_BITS + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_ARB_SIZE = 1 + 1;
|
||||
localparam RSP_ARB_IDX_WCTL = 0;
|
||||
localparam RSP_ARB_IDX_CSRS = 1;
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_execute_if[BLOCK_SIZE]();
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) per_block_commit_if[BLOCK_SIZE]();
|
||||
|
||||
VX_dispatch_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_BUF (3)
|
||||
.OUT_BUF (1)
|
||||
) dispatch_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -67,62 +66,65 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
.execute_if (per_block_execute_if)
|
||||
);
|
||||
|
||||
wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in;
|
||||
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
|
||||
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
|
||||
|
||||
// Warp control block
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) pe_execute_if[PE_COUNT]();
|
||||
|
||||
) wctl_execute_if();
|
||||
VX_commit_if#(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) pe_commit_if[PE_COUNT]();
|
||||
) wctl_commit_if();
|
||||
|
||||
reg [PE_SEL_BITS-1:0] pe_select;
|
||||
always @(*) begin
|
||||
pe_select = PE_IDX_WCTL;
|
||||
if (`INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type))
|
||||
pe_select = PE_IDX_CSRS;
|
||||
end
|
||||
assign wctl_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_WCTL(per_block_execute_if[0].data.op_type);
|
||||
assign wctl_execute_if.data = per_block_execute_if[0].data;
|
||||
|
||||
VX_pe_switch #(
|
||||
.PE_COUNT (PE_COUNT),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF(0),
|
||||
.RSP_OUT_BUF(3)
|
||||
) pe_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.pe_sel (pe_select),
|
||||
.execute_in_if (per_block_execute_if[0]),
|
||||
.commit_out_if (per_block_commit_if[0]),
|
||||
.execute_out_if (pe_execute_if),
|
||||
.commit_in_if (pe_commit_if)
|
||||
);
|
||||
`RESET_RELAY (wctl_reset, reset);
|
||||
|
||||
VX_wctl_unit #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-wctl", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)),
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) wctl_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.execute_if (pe_execute_if[PE_IDX_WCTL]),
|
||||
.reset (wctl_reset),
|
||||
.execute_if (wctl_execute_if),
|
||||
.warp_ctl_if(warp_ctl_if),
|
||||
.commit_if (pe_commit_if[PE_IDX_WCTL])
|
||||
.commit_if (wctl_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.data;
|
||||
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
|
||||
|
||||
// CSR unit
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) csr_execute_if();
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) csr_commit_if();
|
||||
|
||||
assign csr_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type);
|
||||
assign csr_execute_if.data = per_block_execute_if[0].data;
|
||||
|
||||
`RESET_RELAY (csr_reset, reset);
|
||||
|
||||
VX_csr_unit #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-csr", INSTANCE_ID))),
|
||||
.INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)),
|
||||
.CORE_ID (CORE_ID),
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) csr_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (csr_reset),
|
||||
|
||||
.base_dcrs (base_dcrs),
|
||||
.execute_if (pe_execute_if[PE_IDX_CSRS]),
|
||||
.execute_if (csr_execute_if),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.sysmem_perf (sysmem_perf),
|
||||
.pipeline_perf (pipeline_perf),
|
||||
.mem_perf_if (mem_perf_if),
|
||||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -131,7 +133,47 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
.sched_csr_if (sched_csr_if),
|
||||
.commit_csr_if (commit_csr_if),
|
||||
.commit_if (pe_commit_if[PE_IDX_CSRS])
|
||||
.commit_if (csr_commit_if)
|
||||
);
|
||||
|
||||
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
|
||||
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
|
||||
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
|
||||
|
||||
// can accept new request?
|
||||
|
||||
reg sfu_req_ready;
|
||||
always @(*) begin
|
||||
case (per_block_execute_if[0].data.op_type)
|
||||
`INST_SFU_CSRRW,
|
||||
`INST_SFU_CSRRS,
|
||||
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
|
||||
default: sfu_req_ready = wctl_execute_if.ready;
|
||||
endcase
|
||||
end
|
||||
assign per_block_execute_if[0].ready = sfu_req_ready;
|
||||
|
||||
// response arbitration
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) arb_commit_if[BLOCK_SIZE]();
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (RSP_ARB_SIZE),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_BUF (3)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_arb_valid_in),
|
||||
.ready_in (rsp_arb_ready_in),
|
||||
.data_in (rsp_arb_data_in),
|
||||
.data_out (arb_commit_if[0].data),
|
||||
.valid_out (arb_commit_if[0].valid),
|
||||
.ready_out (arb_commit_if[0].ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
VX_gather_unit #(
|
||||
|
@ -139,9 +181,9 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_BUF (3)
|
||||
) gather_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.commit_in_if (per_block_commit_if),
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.commit_in_if (arb_commit_if),
|
||||
.commit_out_if (commit_if)
|
||||
);
|
||||
|
||||
|
|
|
@ -45,13 +45,16 @@ module VX_split_join import VX_gpu_pkg::*; #(
|
|||
wire ipdom_push = valid && split.valid && split.is_dvg;
|
||||
wire ipdom_pop = valid && sjoin.valid && sjoin_is_dvg;
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_ipdom_stacks
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
|
||||
`RESET_RELAY (ipdom_reset, reset);
|
||||
|
||||
VX_ipdom_stack #(
|
||||
.WIDTH (`NUM_THREADS+`PC_BITS),
|
||||
.DEPTH (`DV_STACK_SIZE)
|
||||
) ipdom_stack (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (ipdom_reset),
|
||||
.q0 (ipdom_q0),
|
||||
.q1 (ipdom_q1),
|
||||
.d (ipdom_data[i]),
|
||||
|
|
399
hw/rtl/core/VX_trace_pkg.sv
Normal file
399
hw/rtl/core/VX_trace_pkg.sv
Normal file
|
@ -0,0 +1,399 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_TRACE_PKG_VH
|
||||
`define VX_TRACE_PKG_VH
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
package VX_trace_pkg;
|
||||
|
||||
`ifdef SIMULATION
|
||||
|
||||
`ifdef SV_DPI
|
||||
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
|
||||
`endif
|
||||
|
||||
import VX_gpu_pkg::*;
|
||||
|
||||
// Emits a short name for an execution-unit type through the `TRACE macro.
//   level   : forwarded unchanged as the first argument of every `TRACE call.
//   ex_type : execution-unit selector, compared against the `EX_* constants.
// Prints "ALU", "LSU", "FPU" or "SFU"; any other value prints "?".
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
|
||||
case (ex_type)
|
||||
`EX_ALU: `TRACE(level, ("ALU"));
|
||||
`EX_LSU: `TRACE(level, ("LSU"));
|
||||
`EX_FPU: `TRACE(level, ("FPU"));
|
||||
`EX_SFU: `TRACE(level, ("SFU"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
endtask
|
||||
|
||||
// Emits the assembly-style mnemonic of an instruction through the `TRACE macro.
//   level   : forwarded unchanged as the first argument of every `TRACE call.
//   ex_type : execution-unit selector (`EX_ALU / `EX_LSU / `EX_FPU / `EX_SFU).
//   op_type : operation code; it is cast to the per-unit width
//             (`INST_ALU_BITS, `INST_BR_BITS, `INST_M_BITS, `INST_LSU_BITS,
//             `INST_FPU_BITS, `INST_SFU_BITS) before being matched.
//   op_args : per-unit argument union; only the member matching ex_type is read.
// Any combination that is not listed prints "?".
task trace_ex_op(input int level,
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_args_t op_args
|
||||
);
|
||||
case (ex_type)
|
||||
// ALU: op_args.alu.xtype picks the sub-class (arithmetic / branch / mul-div).
`EX_ALU: begin
|
||||
case (op_args.alu.xtype)
|
||||
`ALU_TYPE_ARITH: begin
|
||||
// is_w selects the "...W" mnemonics; use_imm selects the immediate ("...I") forms.
if (op_args.alu.is_w) begin
|
||||
if (op_args.alu.use_imm) begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDIW"));
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLIW"));
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRLIW"));
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRAIW"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDW"));
|
||||
`INST_ALU_SUB: `TRACE(level, ("SUBW"));
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLW"));
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRLW"));
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRAW"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
end else begin
|
||||
if (op_args.alu.use_imm) begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADDI"));
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLLI"));
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRLI"));
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRAI"));
|
||||
`INST_ALU_SLT: `TRACE(level, ("SLTI"));
|
||||
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"));
|
||||
`INST_ALU_XOR: `TRACE(level, ("XORI"));
|
||||
`INST_ALU_OR: `TRACE(level, ("ORI"));
|
||||
`INST_ALU_AND: `TRACE(level, ("ANDI"));
|
||||
`INST_ALU_LUI: `TRACE(level, ("LUI"));
|
||||
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: `TRACE(level, ("ADD"));
|
||||
`INST_ALU_SUB: `TRACE(level, ("SUB"));
|
||||
`INST_ALU_SLL: `TRACE(level, ("SLL"));
|
||||
`INST_ALU_SRL: `TRACE(level, ("SRL"));
|
||||
`INST_ALU_SRA: `TRACE(level, ("SRA"));
|
||||
`INST_ALU_SLT: `TRACE(level, ("SLT"));
|
||||
`INST_ALU_SLTU: `TRACE(level, ("SLTU"));
|
||||
`INST_ALU_XOR: `TRACE(level, ("XOR"));
|
||||
`INST_ALU_OR: `TRACE(level, ("OR"));
|
||||
`INST_ALU_AND: `TRACE(level, ("AND"));
|
||||
`INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"));
|
||||
`INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
end
|
||||
end
|
||||
`ALU_TYPE_BRANCH: begin
|
||||
case (`INST_BR_BITS'(op_type))
|
||||
`INST_BR_EQ: `TRACE(level, ("BEQ"));
|
||||
`INST_BR_NE: `TRACE(level, ("BNE"));
|
||||
`INST_BR_LT: `TRACE(level, ("BLT"));
|
||||
`INST_BR_GE: `TRACE(level, ("BGE"));
|
||||
`INST_BR_LTU: `TRACE(level, ("BLTU"));
|
||||
`INST_BR_GEU: `TRACE(level, ("BGEU"));
|
||||
`INST_BR_JAL: `TRACE(level, ("JAL"));
|
||||
`INST_BR_JALR: `TRACE(level, ("JALR"));
|
||||
`INST_BR_ECALL: `TRACE(level, ("ECALL"));
|
||||
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"));
|
||||
`INST_BR_URET: `TRACE(level, ("URET"));
|
||||
`INST_BR_SRET: `TRACE(level, ("SRET"));
|
||||
`INST_BR_MRET: `TRACE(level, ("MRET"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
`ALU_TYPE_MULDIV: begin
|
||||
// The is_w branch lists no MULH* entries, so those print "?" there.
if (op_args.alu.is_w) begin
|
||||
case (`INST_M_BITS'(op_type))
|
||||
`INST_M_MUL: `TRACE(level, ("MULW"));
|
||||
`INST_M_DIV: `TRACE(level, ("DIVW"));
|
||||
`INST_M_DIVU: `TRACE(level, ("DIVUW"));
|
||||
`INST_M_REM: `TRACE(level, ("REMW"));
|
||||
`INST_M_REMU: `TRACE(level, ("REMUW"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_M_BITS'(op_type))
|
||||
`INST_M_MUL: `TRACE(level, ("MUL"));
|
||||
`INST_M_MULH: `TRACE(level, ("MULH"));
|
||||
`INST_M_MULHSU:`TRACE(level, ("MULHSU"));
|
||||
`INST_M_MULHU: `TRACE(level, ("MULHU"));
|
||||
`INST_M_DIV: `TRACE(level, ("DIV"));
|
||||
`INST_M_DIVU: `TRACE(level, ("DIVU"));
|
||||
`INST_M_REM: `TRACE(level, ("REM"));
|
||||
`INST_M_REMU: `TRACE(level, ("REMU"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
end
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
// LSU: op_args.lsu.is_float selects the F-prefixed load/store names.
`EX_LSU: begin
|
||||
if (op_args.lsu.is_float) begin
|
||||
case (`INST_LSU_BITS'(op_type))
|
||||
`INST_LSU_LW: `TRACE(level, ("FLW"));
|
||||
`INST_LSU_LD: `TRACE(level, ("FLD"));
|
||||
`INST_LSU_SW: `TRACE(level, ("FSW"));
|
||||
`INST_LSU_SD: `TRACE(level, ("FSD"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_LSU_BITS'(op_type))
|
||||
`INST_LSU_LB: `TRACE(level, ("LB"));
|
||||
`INST_LSU_LH: `TRACE(level, ("LH"));
|
||||
`INST_LSU_LW: `TRACE(level, ("LW"));
|
||||
`INST_LSU_LD: `TRACE(level, ("LD"));
|
||||
`INST_LSU_LBU:`TRACE(level, ("LBU"));
|
||||
`INST_LSU_LHU:`TRACE(level, ("LHU"));
|
||||
`INST_LSU_LWU:`TRACE(level, ("LWU"));
|
||||
`INST_LSU_SB: `TRACE(level, ("SB"));
|
||||
`INST_LSU_SH: `TRACE(level, ("SH"));
|
||||
`INST_LSU_SW: `TRACE(level, ("SW"));
|
||||
`INST_LSU_SD: `TRACE(level, ("SD"));
|
||||
`INST_LSU_FENCE:`TRACE(level,("FENCE"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
end
|
||||
// FPU: op_args.fpu.fmt[0] set selects the ".D" names, clear selects ".S".
`EX_FPU: begin
|
||||
case (`INST_FPU_BITS'(op_type))
|
||||
`INST_FPU_ADD: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FADD.D"));
|
||||
else
|
||||
`TRACE(level, ("FADD.S"));
|
||||
end
|
||||
`INST_FPU_SUB: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FSUB.D"));
|
||||
else
|
||||
`TRACE(level, ("FSUB.S"));
|
||||
end
|
||||
`INST_FPU_MUL: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FMUL.D"));
|
||||
else
|
||||
`TRACE(level, ("FMUL.S"));
|
||||
end
|
||||
`INST_FPU_DIV: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FDIV.D"));
|
||||
else
|
||||
`TRACE(level, ("FDIV.S"));
|
||||
end
|
||||
`INST_FPU_SQRT: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FSQRT.D"));
|
||||
else
|
||||
`TRACE(level, ("FSQRT.S"));
|
||||
end
|
||||
`INST_FPU_MADD: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FMADD.D"));
|
||||
else
|
||||
`TRACE(level, ("FMADD.S"));
|
||||
end
|
||||
`INST_FPU_MSUB: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FMSUB.D"));
|
||||
else
|
||||
`TRACE(level, ("FMSUB.S"));
|
||||
end
|
||||
`INST_FPU_NMADD: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FNMADD.D"));
|
||||
else
|
||||
`TRACE(level, ("FNMADD.S"));
|
||||
end
|
||||
`INST_FPU_NMSUB: begin
|
||||
if (op_args.fpu.fmt[0])
|
||||
`TRACE(level, ("FNMSUB.D"));
|
||||
else
|
||||
`TRACE(level, ("FNMSUB.S"));
|
||||
end
|
||||
`INST_FPU_CMP: begin
|
||||
// The comparison kind is taken from the low two bits of frm (0=FLE, 1=FLT, 2=FEQ).
if (op_args.fpu.fmt[0]) begin
|
||||
case (op_args.fpu.frm[1:0])
|
||||
0: `TRACE(level, ("FLE.D"));
|
||||
1: `TRACE(level, ("FLT.D"));
|
||||
2: `TRACE(level, ("FEQ.D"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end else begin
|
||||
case (op_args.fpu.frm[1:0])
|
||||
0: `TRACE(level, ("FLE.S"));
|
||||
1: `TRACE(level, ("FLT.S"));
|
||||
2: `TRACE(level, ("FEQ.S"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
end
|
||||
`INST_FPU_F2F: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
`TRACE(level, ("FCVT.D.S"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.D"));
|
||||
end
|
||||
end
|
||||
// Float<->integer conversions: fmt[0] picks D vs S, fmt[1] picks L/LU vs W/WU.
`INST_FPU_F2I: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.L.D"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.W.D"));
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.L.S"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.W.S"));
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_F2U: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.LU.D"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.WU.D"));
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.LU.S"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.WU.S"));
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_I2F: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.D.L"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.D.W"));
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.S.L"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.W"));
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_U2F: begin
|
||||
if (op_args.fpu.fmt[0]) begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.D.LU"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.D.WU"));
|
||||
end
|
||||
end else begin
|
||||
if (op_args.fpu.fmt[1]) begin
|
||||
`TRACE(level, ("FCVT.S.LU"));
|
||||
end else begin
|
||||
`TRACE(level, ("FCVT.S.WU"));
|
||||
end
|
||||
end
|
||||
end
|
||||
`INST_FPU_MISC: begin
|
||||
// The full frm value selects the operation; these two case statements have no default item.
if (op_args.fpu.fmt[0]) begin
|
||||
case (op_args.fpu.frm)
|
||||
0: `TRACE(level, ("FSGNJ.D"));
|
||||
1: `TRACE(level, ("FSGNJN.D"));
|
||||
2: `TRACE(level, ("FSGNJX.D"));
|
||||
3: `TRACE(level, ("FCLASS.D"));
|
||||
4: `TRACE(level, ("FMV.X.D"));
|
||||
5: `TRACE(level, ("FMV.D.X"));
|
||||
6: `TRACE(level, ("FMIN.D"));
|
||||
7: `TRACE(level, ("FMAX.D"));
|
||||
endcase
|
||||
end else begin
|
||||
case (op_args.fpu.frm)
|
||||
0: `TRACE(level, ("FSGNJ.S"));
|
||||
1: `TRACE(level, ("FSGNJN.S"));
|
||||
2: `TRACE(level, ("FSGNJX.S"));
|
||||
3: `TRACE(level, ("FCLASS.S"));
|
||||
4: `TRACE(level, ("FMV.X.S"));
|
||||
5: `TRACE(level, ("FMV.S.X"));
|
||||
6: `TRACE(level, ("FMIN.S"));
|
||||
7: `TRACE(level, ("FMAX.S"));
|
||||
endcase
|
||||
end
|
||||
end
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
// SFU: warp-control ops use op_args.wctl.is_neg for the ".N" suffix;
// CSR ops use op_args.csr.use_imm for the "I" suffix.
`EX_SFU: begin
|
||||
case (`INST_SFU_BITS'(op_type))
|
||||
`INST_SFU_TMC: `TRACE(level, ("TMC"));
|
||||
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
|
||||
`INST_SFU_SPLIT: begin if (op_args.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
|
||||
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
|
||||
`INST_SFU_BAR: `TRACE(level, ("BAR"));
|
||||
`INST_SFU_PRED: begin if (op_args.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
|
||||
`INST_SFU_CSRRW: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
|
||||
`INST_SFU_CSRRS: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
|
||||
`INST_SFU_CSRRC: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
end
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
endtask
|
||||
|
||||
// Emits the unit-specific instruction arguments through the `TRACE macro.
// Each emitted string starts with ", ", so the output reads as a
// continuation of text that was printed earlier.
//   level   : forwarded unchanged as the first argument of every `TRACE call.
//   ex_type : execution-unit selector; decides which op_args member is read.
//   op_type : only consulted for `EX_SFU, to detect CSR operations.
//   op_args : per-unit argument union.
// Nothing is printed for non-CSR SFU operations or for unknown ex_type values.
task trace_op_args(input int level,
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input VX_gpu_pkg::op_args_t op_args
|
||||
);
|
||||
case (ex_type)
|
||||
`EX_ALU: begin
|
||||
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm));
|
||||
end
|
||||
`EX_LSU: begin
|
||||
`TRACE(level, (", offset=0x%0h", op_args.lsu.offset));
|
||||
end
|
||||
`EX_FPU: begin
|
||||
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm));
|
||||
end
|
||||
`EX_SFU: begin
|
||||
// Only CSR operations print arguments here.
if (`INST_SFU_IS_CSR(op_type)) begin
|
||||
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm));
|
||||
end
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
endtask
|
||||
|
||||
// Emits the symbolic name of a base DCR address through the `TRACE macro.
//   level : forwarded unchanged as the first argument of every `TRACE call.
//   addr  : DCR address, compared against the `VX_DCR_BASE_* constants.
// Addresses that match none of the listed constants print "?".
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
|
||||
case (addr)
|
||||
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"));
|
||||
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"));
|
||||
`VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0"));
|
||||
`VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1"));
|
||||
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"));
|
||||
default: `TRACE(level, ("?"));
|
||||
endcase
|
||||
endtask
|
||||
|
||||
`endif
|
||||
|
||||
endpackage
|
||||
|
||||
`endif // VX_TRACE_PKG_VH
|
|
@ -1,44 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Per-warp instruction UUID generator.
//
// The UUID is the concatenation {g_wid, count}:
//   * g_wid : (CORE_ID << `NW_BITS) + wid, truncated to UUID_WIDTH-32 bits.
//   * count : 32-bit counter kept separately for each warp.
//
// The counter array itself is never reset. Instead, a per-warp "seen" flag is
// cleared on reset; while the flag is clear the warp's count reads as 0, and
// the first `incr` for that warp stores 1 and sets the flag.
//
// Ports:
//   clk   : clock.
//   reset : synchronous reset; clears every "seen" flag.
//   incr  : when high at a clock edge, advances the counter of warp `wid`.
//   wid   : warp index that selects the counter.
//   uuid  : combinational {g_wid, count} for the currently selected warp.
module VX_uuid_gen import VX_gpu_pkg::*; #(
    parameter CORE_ID    = 0,
    parameter UUID_WIDTH = 48
) (
    input  wire                  clk,
    input  wire                  reset,
    input  wire                  incr,
    input  wire [`NW_WIDTH-1:0]  wid,
    output wire [UUID_WIDTH-1:0] uuid
);
    // Upper UUID field width: whatever is left above the 32-bit counter.
    localparam GNW_WIDTH = UUID_WIDTH - 32;

    reg [31:0]           uuid_cntrs [0:`NUM_WARPS-1];
    reg [`NUM_WARPS-1:0] has_uuid_cntrs;

    // Counter view of the selected warp: current (masked) and next values.
    wire        wid_seen   = has_uuid_cntrs[wid];
    wire [31:0] count_cur  = wid_seen ? uuid_cntrs[wid] : 0;
    wire [31:0] count_next = wid_seen ? (uuid_cntrs[wid] + 1) : 1;

    // "Seen" flags: cleared by reset, set on the first increment of a warp.
    always @(posedge clk) begin
        if (reset) begin
            has_uuid_cntrs <= '0;
        end else if (incr) begin
            has_uuid_cntrs[wid] <= 1;
        end
    end

    // Counter storage: updated on every increment, independent of reset.
    always @(posedge clk) begin
        if (incr) begin
            uuid_cntrs[wid] <= count_next;
        end
    end

    // Global warp id: core index placed above the local warp index bits.
    wire [GNW_WIDTH-1:0] core_base = GNW_WIDTH'(CORE_ID) << `NW_BITS;
    wire [GNW_WIDTH-1:0] g_wid     = core_base + GNW_WIDTH'(wid);

    assign uuid = {g_wid, count_cur};

endmodule
|
|
@ -50,9 +50,9 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
|
||||
|
||||
wire [`UP(LANE_BITS)-1:0] tid;
|
||||
if (LANE_BITS != 0) begin : g_tid
|
||||
if (LANE_BITS != 0) begin
|
||||
assign tid = execute_if.data.tid[0 +: LANE_BITS];
|
||||
end else begin : g_no_tid
|
||||
end else begin
|
||||
assign tid = 0;
|
||||
end
|
||||
|
||||
|
@ -63,7 +63,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
wire not_pred = execute_if.data.op_args.wctl.is_neg;
|
||||
|
||||
wire [NUM_LANES-1:0] taken;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_taken
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign taken[i] = (execute_if.data.rs1_data[i][0] ^ not_pred);
|
||||
end
|
||||
|
||||
|
@ -131,7 +131,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
// wspawn
|
||||
|
||||
wire [`NUM_WARPS-1:0] wspawn_wmask;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_wspawn_wmask
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
|
||||
end
|
||||
assign wspawn.valid = is_wspawn;
|
||||
|
@ -162,7 +162,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
assign warp_ctl_if.sjoin = sjoin_r;
|
||||
assign warp_ctl_if.barrier = barrier_r;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit_if
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign commit_if.data.data[i] = `XLEN'(dvstack_ptr);
|
||||
end
|
||||
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Modified port of cast module from fpnew Library
|
||||
// Modified port of cast module from fpnew Library
|
||||
// reference: https://github.com/pulp-platform/fpnew
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
@ -22,8 +22,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
parameter LATENCY = 1,
|
||||
parameter INT_WIDTH = 32,
|
||||
parameter MAN_BITS = 23,
|
||||
parameter EXP_BITS = 8,
|
||||
parameter OUT_REG = 0
|
||||
parameter EXP_BITS = 8
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -36,10 +35,10 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
input wire is_signed,
|
||||
|
||||
input wire [31:0] dataa,
|
||||
output wire [31:0] result,
|
||||
output wire [31:0] result,
|
||||
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags
|
||||
);
|
||||
);
|
||||
// Constants
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
|
||||
|
@ -56,11 +55,11 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
localparam FMT_SHIFT_COMPENSATION = S_MAN_WIDTH - 1 - MAN_BITS;
|
||||
localparam NUM_FP_STICKY = 2 * S_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
|
||||
localparam NUM_INT_STICKY = 2 * S_MAN_WIDTH - INT_WIDTH; // removed int and R
|
||||
|
||||
|
||||
// Input processing
|
||||
|
||||
fclass_t fclass;
|
||||
VX_fp_classifier #(
|
||||
|
||||
fclass_t fclass;
|
||||
VX_fp_classifier #(
|
||||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_classifier (
|
||||
|
@ -70,9 +69,9 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
);
|
||||
|
||||
wire [S_MAN_WIDTH-1:0] input_mant;
|
||||
wire [S_EXP_WIDTH-1:0] input_exp;
|
||||
wire [S_EXP_WIDTH-1:0] input_exp;
|
||||
wire input_sign;
|
||||
|
||||
|
||||
wire i2f_sign = dataa[INT_WIDTH-1];
|
||||
wire f2i_sign = dataa[INT_WIDTH-1] && is_signed;
|
||||
wire [S_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa) : dataa;
|
||||
|
@ -82,7 +81,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
assign input_sign = is_itof ? f2i_sign : i2f_sign;
|
||||
|
||||
// Pipeline stage0
|
||||
|
||||
|
||||
wire is_itof_s0;
|
||||
wire is_signed_s0;
|
||||
wire [2:0] rnd_mode_s0;
|
||||
|
@ -93,7 +92,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH),
|
||||
.DEPTH (LATENCY > 1)
|
||||
.DEPTH (LATENCY > 2)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -101,7 +100,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
.data_in ({is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
||||
.data_out ({is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
||||
);
|
||||
|
||||
|
||||
// Normalization
|
||||
|
||||
wire [LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
|
||||
|
@ -114,12 +113,12 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
.data_out (renorm_shamt_s0),
|
||||
.valid_out (mant_is_nonzero_s0)
|
||||
);
|
||||
|
||||
|
||||
wire mant_is_zero_s0 = ~mant_is_nonzero_s0;
|
||||
|
||||
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
|
||||
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
|
||||
wire [S_EXP_WIDTH-1:0] input_exp_n_s0; // unbiased true exponent
|
||||
|
||||
|
||||
// Realign input mantissa, append zeroes if destination is wider
|
||||
assign input_mant_n_s0 = encoded_mant_s0 << renorm_shamt_s0;
|
||||
|
||||
|
@ -141,7 +140,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH),
|
||||
.DEPTH (LATENCY > 2)
|
||||
.DEPTH (LATENCY > 1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -170,30 +169,30 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
wire of_before_round_s1 = overflow;
|
||||
|
||||
// Pipeline stage2
|
||||
|
||||
|
||||
wire is_itof_s2;
|
||||
wire is_signed_s2;
|
||||
wire [2:0] rnd_mode_s2;
|
||||
fclass_t fclass_s2;
|
||||
fclass_t fclass_s2;
|
||||
wire mant_is_zero_s2;
|
||||
wire input_sign_s2;
|
||||
wire [2*S_MAN_WIDTH:0] destination_mant_s2;
|
||||
wire [EXP_BITS-1:0] final_exp_s2;
|
||||
wire of_before_round_s2;
|
||||
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + `INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1),
|
||||
.DEPTH (LATENCY > 0)
|
||||
.DEPTH (LATENCY > 3)
|
||||
) pipe_reg2 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
||||
.data_out ({is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||
);
|
||||
|
||||
);
|
||||
|
||||
// Rounding and classification
|
||||
|
||||
|
||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
|
||||
|
@ -238,20 +237,20 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
wire is_itof_s3;
|
||||
wire is_signed_s3;
|
||||
fclass_t fclass_s3;
|
||||
fclass_t fclass_s3;
|
||||
wire mant_is_zero_s3;
|
||||
wire input_sign_s3;
|
||||
wire rounded_sign_s3;
|
||||
wire [INT_WIDTH-1:0] rounded_abs_s3;
|
||||
wire of_before_round_s3;
|
||||
wire of_before_round_s3;
|
||||
wire f2i_round_has_sticky_s3;
|
||||
wire i2f_round_has_sticky_s3;
|
||||
|
||||
`UNUSED_VAR (fclass_s3)
|
||||
`UNUSED_VAR (fclass_s3)
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + $bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1),
|
||||
.DEPTH (LATENCY > 3)
|
||||
.DEPTH (LATENCY > 4)
|
||||
) pipe_reg3 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -259,7 +258,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
.data_in ({is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
|
||||
.data_out ({is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
|
||||
);
|
||||
|
||||
|
||||
// Assemble regular result, nan box short ones. Int zeroes need to be detected
|
||||
wire [INT_WIDTH-1:0] fmt_result_s3 = mant_is_zero_s3 ? 0 : {rounded_sign_s3, rounded_abs_s3[EXP_BITS+MAN_BITS-1:0]};
|
||||
|
||||
|
@ -279,18 +278,18 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
f2i_special_result_s3[INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
|
||||
f2i_special_result_s3[INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
|
||||
wire f2i_result_is_special_s3 = fclass_s3.is_nan
|
||||
wire f2i_result_is_special_s3 = fclass_s3.is_nan
|
||||
| fclass_s3.is_inf
|
||||
| of_before_round_s3
|
||||
| (input_sign_s3 & ~is_signed_s3 & ~rounded_int_res_zero_s3);
|
||||
|
||||
|
||||
fflags_t f2i_special_status_s3;
|
||||
fflags_t i2f_status_s3, f2i_status_s3;
|
||||
fflags_t tmp_fflags_s3;
|
||||
|
||||
|
||||
// All integer special cases are invalid
|
||||
assign f2i_special_status_s3 = {1'b1, 4'h0};
|
||||
|
||||
|
@ -307,7 +306,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
VX_pipe_register #(
|
||||
.DATAW (32 + `FP_FLAGS_BITS),
|
||||
.DEPTH (OUT_REG)
|
||||
.DEPTH (LATENCY > 0)
|
||||
) pipe_reg4 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -1,17 +1,17 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
// Modified port of noncomp module from fpnew Library
|
||||
// Modified port of noncomp module from fpnew Library
|
||||
// reference: https://github.com/pulp-platform/fpnew
|
||||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
@ -19,10 +19,9 @@
|
|||
`ifdef FPU_DSP
|
||||
|
||||
module VX_fncp_unit import VX_fpu_pkg::*; #(
|
||||
parameter LATENCY = 1,
|
||||
parameter LATENCY = 2,
|
||||
parameter EXP_BITS = 8,
|
||||
parameter MAN_BITS = 23,
|
||||
parameter OUT_REG = 0
|
||||
parameter MAN_BITS = 23
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -34,10 +33,10 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [31:0] dataa,
|
||||
input wire [31:0] datab,
|
||||
output wire [31:0] result,
|
||||
output wire [31:0] result,
|
||||
|
||||
output wire [`FP_FLAGS_BITS-1:0] fflags
|
||||
);
|
||||
);
|
||||
localparam NEG_INF = 32'h00000001,
|
||||
NEG_NORM = 32'h00000002,
|
||||
NEG_SUBNORM = 32'h00000004,
|
||||
|
@ -56,15 +55,15 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
wire a_smaller, ab_equal;
|
||||
|
||||
// Setup
|
||||
assign a_sign = dataa[31];
|
||||
assign a_sign = dataa[31];
|
||||
assign a_exponent = dataa[30:23];
|
||||
assign a_mantissa = dataa[22:0];
|
||||
|
||||
assign b_sign = datab[31];
|
||||
assign b_sign = datab[31];
|
||||
assign b_exponent = datab[30:23];
|
||||
assign b_mantissa = datab[22:0];
|
||||
|
||||
VX_fp_classifier #(
|
||||
VX_fp_classifier #(
|
||||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_class_a (
|
||||
|
@ -73,7 +72,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
.clss_o (a_fclass)
|
||||
);
|
||||
|
||||
VX_fp_classifier #(
|
||||
VX_fp_classifier #(
|
||||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_class_b (
|
||||
|
@ -83,7 +82,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
);
|
||||
|
||||
assign a_smaller = (dataa < datab) ^ (a_sign || b_sign);
|
||||
assign ab_equal = (dataa == datab)
|
||||
assign ab_equal = (dataa == datab)
|
||||
|| (a_fclass.is_zero && b_fclass.is_zero); // +0 == -0
|
||||
|
||||
// Pipeline stage0
|
||||
|
@ -102,54 +101,54 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
VX_pipe_register #(
|
||||
.DATAW (4 + 2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1),
|
||||
.DEPTH (LATENCY > 0)
|
||||
.DEPTH (LATENCY > 1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({op_mod, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}),
|
||||
.data_out ({op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
|
||||
);
|
||||
);
|
||||
|
||||
// FCLASS
|
||||
reg [31:0] fclass_mask_s0; // generate a 10-bit mask for integer reg
|
||||
always @(*) begin
|
||||
always @(*) begin
|
||||
if (a_fclass_s0.is_normal) begin
|
||||
fclass_mask_s0 = a_sign_s0 ? NEG_NORM : POS_NORM;
|
||||
end
|
||||
end
|
||||
else if (a_fclass_s0.is_inf) begin
|
||||
fclass_mask_s0 = a_sign_s0 ? NEG_INF : POS_INF;
|
||||
end
|
||||
end
|
||||
else if (a_fclass_s0.is_zero) begin
|
||||
fclass_mask_s0 = a_sign_s0 ? NEG_ZERO : POS_ZERO;
|
||||
end
|
||||
end
|
||||
else if (a_fclass_s0.is_subnormal) begin
|
||||
fclass_mask_s0 = a_sign_s0 ? NEG_SUBNORM : POS_SUBNORM;
|
||||
end
|
||||
end
|
||||
else if (a_fclass_s0.is_nan) begin
|
||||
fclass_mask_s0 = {22'h0, a_fclass_s0.is_quiet, a_fclass_s0.is_signaling, 8'h0};
|
||||
end
|
||||
else begin
|
||||
end
|
||||
else begin
|
||||
fclass_mask_s0 = QUT_NAN;
|
||||
end
|
||||
end
|
||||
|
||||
// Min/Max
|
||||
// Min/Max
|
||||
reg [31:0] fminmax_res_s0;
|
||||
always @(*) begin
|
||||
if (a_fclass_s0.is_nan && b_fclass_s0.is_nan)
|
||||
fminmax_res_s0 = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
|
||||
else if (a_fclass_s0.is_nan)
|
||||
else if (a_fclass_s0.is_nan)
|
||||
fminmax_res_s0 = datab_s0;
|
||||
else if (b_fclass_s0.is_nan)
|
||||
else if (b_fclass_s0.is_nan)
|
||||
fminmax_res_s0 = dataa_s0;
|
||||
else begin
|
||||
else begin
|
||||
// FMIN, FMAX
|
||||
fminmax_res_s0 = (op_mod_s0[0] ^ a_smaller_s0) ? dataa_s0 : datab_s0;
|
||||
end
|
||||
end
|
||||
|
||||
// Sign injection
|
||||
// Sign injection
|
||||
reg [31:0] fsgnj_res_s0; // result of sign injection
|
||||
always @(*) begin
|
||||
case (op_mod_s0[1:0])
|
||||
|
@ -159,12 +158,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
endcase
|
||||
end
|
||||
|
||||
// Comparison
|
||||
// Comparison
|
||||
reg fcmp_res_s0; // result of comparison
|
||||
reg fcmp_fflags_NV_s0; // comparison fflags
|
||||
always @(*) begin
|
||||
case (op_mod_s0[1:0])
|
||||
0: begin // LE
|
||||
0: begin // LE
|
||||
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
|
||||
fcmp_res_s0 = 0;
|
||||
fcmp_fflags_NV_s0 = 1;
|
||||
|
@ -180,12 +179,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
end else begin
|
||||
fcmp_res_s0 = (a_smaller_s0 & ~ab_equal_s0);
|
||||
fcmp_fflags_NV_s0 = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
2: begin // EQ
|
||||
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
|
||||
fcmp_res_s0 = 0;
|
||||
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
|
||||
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
|
||||
end else begin
|
||||
fcmp_res_s0 = ab_equal_s0;
|
||||
fcmp_fflags_NV_s0 = 0;
|
||||
|
@ -193,7 +192,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
end
|
||||
default: begin
|
||||
fcmp_res_s0 = 'x;
|
||||
fcmp_fflags_NV_s0 = 'x;
|
||||
fcmp_fflags_NV_s0 = 'x;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
@ -217,7 +216,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
// FMV
|
||||
result_s0 = dataa_s0;
|
||||
fflags_NV_s0 = 0;
|
||||
end
|
||||
end
|
||||
6,7: begin
|
||||
// MIN/MAX
|
||||
result_s0 = fminmax_res_s0;
|
||||
|
@ -230,7 +229,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
VX_pipe_register #(
|
||||
.DATAW (32 + 1),
|
||||
.DEPTH (OUT_REG)
|
||||
.DEPTH (LATENCY > 0)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -46,68 +46,56 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam DATAW = 32 + `INST_FRM_BITS + 1 + 1;
|
||||
|
||||
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
|
||||
`UNUSED_VAR (frm)
|
||||
|
||||
wire [NUM_LANES-1:0] mask_out;
|
||||
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
|
||||
fflags_t [NUM_LANES-1:0] fflags_out;
|
||||
|
||||
wire pe_enable;
|
||||
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
|
||||
wire [NUM_PES-1:0][31:0] pe_data_in;
|
||||
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
|
||||
assign data_in[i][0 +: 32] = dataa[i];
|
||||
assign data_in[i][32 +: `INST_FRM_BITS] = frm;
|
||||
assign data_in[i][32 + `INST_FRM_BITS +: 1] = is_itof;
|
||||
assign data_in[i][32 + `INST_FRM_BITS + 1 +: 1] = is_signed;
|
||||
end
|
||||
|
||||
VX_pe_serializer #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.NUM_PES (NUM_PES),
|
||||
.LATENCY (`LATENCY_FCVT),
|
||||
.DATA_IN_WIDTH (DATAW),
|
||||
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
|
||||
.DATA_IN_WIDTH(32),
|
||||
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
|
||||
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
|
||||
.PE_REG (0),
|
||||
.OUT_BUF (2)
|
||||
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
|
||||
) pe_serializer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in),
|
||||
.data_in (data_in),
|
||||
.data_in (dataa),
|
||||
.tag_in ({mask_in, tag_in}),
|
||||
.ready_in (ready_in),
|
||||
.pe_enable (pe_enable),
|
||||
.pe_data_out(pe_data_in),
|
||||
.pe_data_in (pe_data_out),
|
||||
.pe_data_in (pe_data_in),
|
||||
.pe_data_out(pe_data_out),
|
||||
.valid_out (valid_out),
|
||||
.data_out (data_out),
|
||||
.tag_out ({mask_out, tag_out}),
|
||||
.ready_out (ready_out)
|
||||
);
|
||||
|
||||
`UNUSED_VAR (pe_data_in)
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign result[i] = data_out[i][0 +: 32];
|
||||
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fcvt_units
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin
|
||||
VX_fcvt_unit #(
|
||||
.LATENCY (`LATENCY_FCVT),
|
||||
.OUT_REG (1)
|
||||
.LATENCY (`LATENCY_FCVT)
|
||||
) fcvt_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (pe_enable),
|
||||
.frm (pe_data_in[0][32 +: `INST_FRM_BITS]),
|
||||
.is_itof (pe_data_in[0][32 + `INST_FRM_BITS +: 1]),
|
||||
.is_signed (pe_data_in[0][32 + `INST_FRM_BITS + 1 +: 1]),
|
||||
.frm (frm),
|
||||
.is_itof (is_itof),
|
||||
.is_signed (is_signed),
|
||||
.dataa (pe_data_in[i][0 +: 32]),
|
||||
.result (pe_data_out[i][0 +: 32]),
|
||||
.fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS])
|
||||
|
|
|
@ -44,33 +44,31 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
output wire valid_out,
|
||||
input wire ready_out
|
||||
);
|
||||
localparam DATAW = 2 * 32 + `INST_FRM_BITS;
|
||||
|
||||
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
|
||||
`UNUSED_VAR (frm)
|
||||
|
||||
wire [NUM_LANES-1:0][2*32-1:0] data_in;
|
||||
wire [NUM_LANES-1:0] mask_out;
|
||||
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
|
||||
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
|
||||
|
||||
wire pe_enable;
|
||||
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
|
||||
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
|
||||
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign data_in[i][0 +: 32] = dataa[i];
|
||||
assign data_in[i][32 +: 32] = datab[i];
|
||||
assign data_in[i][64 +: `INST_FRM_BITS] = frm;
|
||||
end
|
||||
|
||||
VX_pe_serializer #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.NUM_PES (NUM_PES),
|
||||
.LATENCY (`LATENCY_FDIV),
|
||||
.DATA_IN_WIDTH (DATAW),
|
||||
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
|
||||
.DATA_IN_WIDTH(2*32),
|
||||
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
|
||||
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
|
||||
.PE_REG (0),
|
||||
.OUT_BUF (2)
|
||||
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
|
||||
) pe_serializer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -79,17 +77,15 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
.tag_in ({mask_in, tag_in}),
|
||||
.ready_in (ready_in),
|
||||
.pe_enable (pe_enable),
|
||||
.pe_data_out(pe_data_in),
|
||||
.pe_data_in (pe_data_out),
|
||||
.pe_data_in (pe_data_in),
|
||||
.pe_data_out(pe_data_out),
|
||||
.valid_out (valid_out),
|
||||
.data_out (data_out),
|
||||
.tag_out ({mask_out, tag_out}),
|
||||
.ready_out (ready_out)
|
||||
);
|
||||
|
||||
`UNUSED_VAR (pe_data_in)
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign result[i] = data_out[i][0 +: 32];
|
||||
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
|
||||
end
|
||||
|
@ -98,7 +94,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
|
||||
`ifdef QUARTUS
|
||||
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin
|
||||
acl_fdiv fdiv (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
|
@ -116,7 +112,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
|
||||
`elsif VIVADO
|
||||
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin
|
||||
wire [3:0] tuser;
|
||||
xil_fdiv fdiv (
|
||||
.aclk (clk),
|
||||
|
@ -138,7 +134,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
|
||||
for (genvar i = 0; i < NUM_PES; ++i) begin
|
||||
reg [63:0] r;
|
||||
`UNUSED_VAR (r)
|
||||
fflags_t f;
|
||||
|
@ -147,9 +143,9 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
dpi_fdiv (
|
||||
pe_enable,
|
||||
int'(0),
|
||||
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
|
||||
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
|
||||
pe_data_in[0][64 +: `INST_FRM_BITS], // frm
|
||||
{32'hffffffff, pe_data_in[i][0 +: 32]},
|
||||
{32'hffffffff, pe_data_in[i][32 +: 32]},
|
||||
frm,
|
||||
r,
|
||||
f
|
||||
);
|
||||
|
|
|
@ -76,6 +76,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
|
||||
reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;
|
||||
reg dst_fmt, int_fmt;
|
||||
|
||||
reg [NUM_LANES-1:0][63:0] operands [3];
|
||||
|
||||
|
@ -87,8 +88,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
wire f_fmt = fmt[0];
|
||||
wire i_fmt = fmt[1];
|
||||
`UNUSED_VAR (fmt)
|
||||
|
||||
always @(*) begin
|
||||
is_fadd = 0;
|
||||
|
@ -106,11 +106,25 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
is_ftou = 0;
|
||||
is_f2f = 0;
|
||||
|
||||
dst_fmt = 0;
|
||||
int_fmt = 0;
|
||||
|
||||
`ifdef FLEN_64
|
||||
dst_fmt = fmt[0];
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
int_fmt = fmt[1];
|
||||
`endif
|
||||
|
||||
case (op_type)
|
||||
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = ~i_fmt; is_fsub = i_fmt; end
|
||||
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = ~i_fmt; is_fmsub = i_fmt; end
|
||||
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = ~i_fmt; is_fnmsub = i_fmt; end
|
||||
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end
|
||||
`INST_FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end
|
||||
`INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
|
||||
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end
|
||||
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end
|
||||
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
|
||||
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
|
||||
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
|
||||
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
|
||||
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
|
||||
|
@ -124,7 +138,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
end
|
||||
|
||||
generate
|
||||
begin : g_fma
|
||||
begin : fma
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
|
||||
reg [NUM_LANES-1:0][63:0] result_fadd;
|
||||
|
@ -150,13 +164,13 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
|
||||
dpi_fsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
|
||||
dpi_fmul (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
|
||||
dpi_fmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
|
||||
dpi_fmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
|
||||
dpi_fnmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
|
||||
dpi_fnmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
|
||||
dpi_fadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
|
||||
dpi_fsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
|
||||
dpi_fmul (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
|
||||
dpi_fmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
|
||||
dpi_fmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
|
||||
dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
|
||||
dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
|
||||
|
||||
result_fma[i] = is_fadd ? result_fadd[i][`XLEN-1:0] :
|
||||
is_fsub ? result_fsub[i][`XLEN-1:0] :
|
||||
|
@ -200,7 +214,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
endgenerate
|
||||
|
||||
generate
|
||||
begin : g_fdiv
|
||||
begin : fdiv
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
|
||||
reg [NUM_LANES-1:0][63:0] result_fdiv;
|
||||
|
@ -212,7 +226,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fdiv (fdiv_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
|
||||
dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
|
||||
result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0];
|
||||
end
|
||||
end
|
||||
|
@ -239,7 +253,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
endgenerate
|
||||
|
||||
generate
|
||||
begin : g_fsqrt
|
||||
begin : fsqrt
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
|
||||
reg [NUM_LANES-1:0][63:0] result_fsqrt;
|
||||
|
@ -251,7 +265,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fsqrt (fsqrt_fire, int'(f_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
|
||||
dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
|
||||
result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0];
|
||||
end
|
||||
end
|
||||
|
@ -278,7 +292,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
endgenerate
|
||||
|
||||
generate
|
||||
begin : g_fcvt
|
||||
begin : fcvt
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
|
||||
reg [NUM_LANES-1:0][63:0] result_itof;
|
||||
|
@ -299,11 +313,11 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_itof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
|
||||
dpi_utof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
|
||||
dpi_ftoi (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
|
||||
dpi_ftou (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
|
||||
dpi_f2f (fcvt_fire, int'(f_fmt), operands[0][i], result_f2f[i]);
|
||||
dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
|
||||
dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
|
||||
dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
|
||||
dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
|
||||
dpi_f2f (fcvt_fire, int'(dst_fmt), operands[0][i], result_f2f[i]);
|
||||
|
||||
result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] :
|
||||
is_utof ? result_utof[i][`XLEN-1:0] :
|
||||
|
@ -342,7 +356,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
endgenerate
|
||||
|
||||
generate
|
||||
begin : g_fncp
|
||||
begin : fncp
|
||||
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] result_fncp;
|
||||
reg [NUM_LANES-1:0][63:0] result_fclss;
|
||||
|
@ -370,17 +384,17 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
dpi_fclss (fncp_fire, int'(f_fmt), operands[0][i], result_fclss[i]);
|
||||
dpi_fle (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
|
||||
dpi_flt (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
|
||||
dpi_feq (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
|
||||
dpi_fmin (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
|
||||
dpi_fmax (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
|
||||
dpi_fsgnj (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
|
||||
dpi_fsgnjn (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
|
||||
dpi_fsgnjx (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
|
||||
result_fmvx[i] = f_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
|
||||
result_fmvf[i] = f_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
|
||||
dpi_fclss (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]);
|
||||
dpi_fle (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
|
||||
dpi_flt (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
|
||||
dpi_feq (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
|
||||
dpi_fmin (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
|
||||
dpi_fmax (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
|
||||
dpi_fsgnj (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
|
||||
dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
|
||||
dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
|
||||
result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
|
||||
result_fmvf[i] = dst_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -430,7 +444,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER ("P"),
|
||||
.ARBITER ("R"),
|
||||
.OUT_BUF (0)
|
||||
) div_sqrt_arb (
|
||||
.clk (clk),
|
||||
|
@ -449,14 +463,14 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_FPC; ++i) begin : g_per_core_data_out
|
||||
for (genvar i = 0; i < NUM_FPC; ++i) begin
|
||||
assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
|
||||
end
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_FPC),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.ARBITER ("F"),
|
||||
.OUT_BUF (OUT_BUF)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
|
|
|
@ -51,39 +51,68 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
localparam FPU_DIVSQRT = 1;
|
||||
localparam FPU_CVT = 2;
|
||||
localparam FPU_NCP = 3;
|
||||
localparam NUM_FPCORES = 4;
|
||||
localparam FPCORES_BITS = `LOG2UP(NUM_FPCORES);
|
||||
localparam NUM_FPC = 4;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + 3 * (NUM_LANES * 32);
|
||||
localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH;
|
||||
|
||||
`UNUSED_VAR (fmt)
|
||||
|
||||
wire [NUM_FPCORES-1:0] per_core_valid_in;
|
||||
wire [NUM_FPCORES-1:0][REQ_DATAW-1:0] per_core_data_in;
|
||||
wire [NUM_FPCORES-1:0] per_core_ready_in;
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
|
||||
wire [NUM_FPC-1:0][TAG_WIDTH-1:0] per_core_tag_out;
|
||||
wire [NUM_FPC-1:0] per_core_ready_out;
|
||||
wire [NUM_FPC-1:0] per_core_valid_out;
|
||||
wire [NUM_FPC-1:0] per_core_has_fflags;
|
||||
fflags_t [NUM_FPC-1:0] per_core_fflags;
|
||||
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0] per_core_mask_in;
|
||||
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_in;
|
||||
wire [NUM_FPCORES-1:0][`INST_FPU_BITS-1:0] per_core_op_type;
|
||||
wire [NUM_FPCORES-1:0][`INST_FMT_BITS-1:0] per_core_fmt;
|
||||
wire [NUM_FPCORES-1:0][`INST_FRM_BITS-1:0] per_core_frm;
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_dataa;
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datab;
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datac;
|
||||
wire div_ready_in, sqrt_ready_in;
|
||||
wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
|
||||
wire [TAG_WIDTH-1:0] div_tag_out, sqrt_tag_out;
|
||||
wire div_ready_out, sqrt_ready_out;
|
||||
wire div_valid_out, sqrt_valid_out;
|
||||
wire div_has_fflags, sqrt_has_fflags;
|
||||
fflags_t div_fflags, sqrt_fflags;
|
||||
|
||||
wire [NUM_FPCORES-1:0] per_core_valid_out;
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_result;
|
||||
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_out;
|
||||
wire [NUM_FPCORES-1:0] per_core_has_fflags;
|
||||
fflags_t [NUM_FPCORES-1:0] per_core_fflags;
|
||||
wire [NUM_FPCORES-1:0] per_core_ready_out;
|
||||
reg [FPC_BITS-1:0] core_select;
|
||||
reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed;
|
||||
|
||||
always @(*) begin
|
||||
is_madd = 0;
|
||||
is_sub = 0;
|
||||
is_neg = 0;
|
||||
is_div = 0;
|
||||
is_itof = 0;
|
||||
is_signed = 0;
|
||||
case (op_type)
|
||||
`INST_FPU_ADD: begin core_select = FPU_FMA; end
|
||||
`INST_FPU_SUB: begin core_select = FPU_FMA; is_sub = 1; end
|
||||
`INST_FPU_MUL: begin core_select = FPU_FMA; is_neg = 1; end
|
||||
`INST_FPU_MADD: begin core_select = FPU_FMA; is_madd = 1; end
|
||||
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; end
|
||||
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_madd = 1; is_neg = 1; end
|
||||
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; is_neg = 1; end
|
||||
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
|
||||
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
|
||||
`INST_FPU_F2I: begin core_select = FPU_CVT; is_signed = 1; end
|
||||
`INST_FPU_F2U: begin core_select = FPU_CVT; end
|
||||
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
|
||||
`INST_FPU_U2F: begin core_select = FPU_CVT; is_itof = 1; end
|
||||
default: begin core_select = FPU_NCP; end
|
||||
endcase
|
||||
end
|
||||
|
||||
`RESET_RELAY (fma_reset, reset);
|
||||
`RESET_RELAY (div_reset, reset);
|
||||
`RESET_RELAY (sqrt_reset, reset);
|
||||
`RESET_RELAY (cvt_reset, reset);
|
||||
`RESET_RELAY (ncp_reset, reset);
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] dataa_s;
|
||||
wire [NUM_LANES-1:0][31:0] datab_s;
|
||||
wire [NUM_LANES-1:0][31:0] datac_s;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign dataa_s[i] = dataa[i][31:0];
|
||||
assign datab_s[i] = datab[i][31:0];
|
||||
assign datac_s[i] = datac[i][31:0];
|
||||
|
@ -93,60 +122,23 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
`UNUSED_VAR (datab)
|
||||
`UNUSED_VAR (datac)
|
||||
|
||||
// Decode fpu core type
|
||||
wire [FPCORES_BITS-1:0] core_select = op_type[3:2];
|
||||
|
||||
VX_stream_switch #(
|
||||
.DATAW (REQ_DATAW),
|
||||
.NUM_OUTPUTS (NUM_FPCORES)
|
||||
) req_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (core_select),
|
||||
.valid_in (valid_in),
|
||||
.ready_in (ready_in),
|
||||
.data_in ({mask_in, tag_in, fmt, frm, dataa_s, datab_s, datac_s, op_type}),
|
||||
.data_out (per_core_data_in),
|
||||
.valid_out (per_core_valid_in),
|
||||
.ready_out (per_core_ready_in)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_FPCORES; ++i) begin : g_per_core_data_in
|
||||
assign {
|
||||
per_core_mask_in[i],
|
||||
per_core_tag_in[i],
|
||||
per_core_fmt[i],
|
||||
per_core_frm[i],
|
||||
per_core_dataa[i],
|
||||
per_core_datab[i],
|
||||
per_core_datac[i],
|
||||
per_core_op_type[i]
|
||||
} = per_core_data_in[i];
|
||||
end
|
||||
|
||||
// FMA core ///////////////////////////////////////////////////////////////
|
||||
|
||||
wire is_madd = per_core_op_type[FPU_FMA][1];
|
||||
wire is_neg = per_core_op_type[FPU_FMA][0];
|
||||
wire is_sub = per_core_fmt[FPU_FMA][1];
|
||||
|
||||
VX_fpu_fma #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) fpu_fma (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_core_valid_in[FPU_FMA]),
|
||||
.reset (fma_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_FMA)),
|
||||
.ready_in (per_core_ready_in[FPU_FMA]),
|
||||
.mask_in (per_core_mask_in[FPU_FMA]),
|
||||
.tag_in (per_core_tag_in[FPU_FMA]),
|
||||
.frm (per_core_frm[FPU_FMA]),
|
||||
.mask_in (mask_in),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.is_madd (is_madd),
|
||||
.is_sub (is_sub),
|
||||
.is_neg (is_neg),
|
||||
.dataa (per_core_dataa[FPU_FMA]),
|
||||
.datab (per_core_datab[FPU_FMA]),
|
||||
.datac (per_core_datac[FPU_FMA]),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.datac (datac_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_FMA]),
|
||||
.fflags (per_core_fflags[FPU_FMA]),
|
||||
.result (per_core_result[FPU_FMA]),
|
||||
|
@ -155,99 +147,25 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
.valid_out (per_core_valid_out[FPU_FMA])
|
||||
);
|
||||
|
||||
// Div/Sqrt cores /////////////////////////////////////////////////////////
|
||||
|
||||
wire [1:0] div_sqrt_valid_in;
|
||||
wire [1:0][REQ_DATAW-1:0] div_sqrt_data_in;
|
||||
wire [1:0] div_sqrt_ready_in;
|
||||
|
||||
wire [1:0][NUM_LANES-1:0] div_sqrt_mask_in;
|
||||
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_in;
|
||||
wire [1:0][`INST_FPU_BITS-1:0] div_sqrt_op_type;
|
||||
wire [1:0][`INST_FMT_BITS-1:0] div_sqrt_fmt;
|
||||
wire [1:0][`INST_FRM_BITS-1:0] div_sqrt_frm;
|
||||
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_dataa;
|
||||
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datab;
|
||||
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datac;
|
||||
|
||||
wire [1:0] div_sqrt_valid_out;
|
||||
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_result;
|
||||
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_out;
|
||||
wire [1:0] div_sqrt_has_fflags;
|
||||
fflags_t [1:0] div_sqrt_fflags;
|
||||
wire [1:0] div_sqrt_ready_out;
|
||||
|
||||
wire div_sqrt_valid_tmp_in;
|
||||
wire [REQ_DATAW-1:0] div_sqrt_data_tmp_in;
|
||||
wire div_sqrt_ready_tmp_in;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (REQ_DATAW)
|
||||
) div_sqrt_req_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_core_valid_in[FPU_DIVSQRT]),
|
||||
.ready_in (per_core_ready_in[FPU_DIVSQRT]),
|
||||
.data_in (per_core_data_in[FPU_DIVSQRT]),
|
||||
.data_out (div_sqrt_data_tmp_in),
|
||||
.valid_out (div_sqrt_valid_tmp_in),
|
||||
.ready_out (div_sqrt_ready_tmp_in)
|
||||
);
|
||||
|
||||
wire is_sqrt = div_sqrt_data_tmp_in[0]; // op_type[0]
|
||||
|
||||
VX_stream_switch #(
|
||||
.DATAW (REQ_DATAW),
|
||||
.NUM_OUTPUTS (2)
|
||||
) div_sqrt_req_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (is_sqrt),
|
||||
.valid_in (div_sqrt_valid_tmp_in),
|
||||
.ready_in (div_sqrt_ready_tmp_in),
|
||||
.data_in (div_sqrt_data_tmp_in),
|
||||
.data_out (div_sqrt_data_in),
|
||||
.valid_out (div_sqrt_valid_in),
|
||||
.ready_out (div_sqrt_ready_in)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_data_in
|
||||
assign {
|
||||
div_sqrt_mask_in[i],
|
||||
div_sqrt_tag_in[i],
|
||||
div_sqrt_fmt[i],
|
||||
div_sqrt_frm[i],
|
||||
div_sqrt_dataa[i],
|
||||
div_sqrt_datab[i],
|
||||
div_sqrt_datac[i],
|
||||
div_sqrt_op_type[i]
|
||||
} = div_sqrt_data_in[i];
|
||||
end
|
||||
|
||||
`UNUSED_VAR (div_sqrt_op_type)
|
||||
`UNUSED_VAR (div_sqrt_fmt)
|
||||
`UNUSED_VAR (div_sqrt_datab)
|
||||
`UNUSED_VAR (div_sqrt_datac)
|
||||
|
||||
VX_fpu_div #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) fpu_div (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (div_sqrt_valid_in[0]),
|
||||
.ready_in (div_sqrt_ready_in[0]),
|
||||
.mask_in (div_sqrt_mask_in[0]),
|
||||
.tag_in (div_sqrt_tag_in[0]),
|
||||
.frm (div_sqrt_frm[0]),
|
||||
.dataa (div_sqrt_dataa[0]),
|
||||
.datab (div_sqrt_datab[0]),
|
||||
.has_fflags (div_sqrt_has_fflags[0]),
|
||||
.fflags (div_sqrt_fflags[0]),
|
||||
.result (div_sqrt_result[0]),
|
||||
.tag_out (div_sqrt_tag_out[0]),
|
||||
.valid_out (div_sqrt_valid_out[0]),
|
||||
.ready_out (div_sqrt_ready_out[0])
|
||||
.reset (div_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && is_div),
|
||||
.ready_in (div_ready_in),
|
||||
.mask_in (mask_in),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.has_fflags (div_has_fflags),
|
||||
.fflags (div_fflags),
|
||||
.result (div_result),
|
||||
.tag_out (div_tag_out),
|
||||
.valid_out (div_valid_out),
|
||||
.ready_out (div_ready_out)
|
||||
);
|
||||
|
||||
VX_fpu_sqrt #(
|
||||
|
@ -255,42 +173,92 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) fpu_sqrt (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (div_sqrt_valid_in[1]),
|
||||
.ready_in (div_sqrt_ready_in[1]),
|
||||
.mask_in (div_sqrt_mask_in[1]),
|
||||
.tag_in (div_sqrt_tag_in[1]),
|
||||
.frm (div_sqrt_frm[1]),
|
||||
.dataa (div_sqrt_dataa[1]),
|
||||
.has_fflags (div_sqrt_has_fflags[1]),
|
||||
.fflags (div_sqrt_fflags[1]),
|
||||
.result (div_sqrt_result[1]),
|
||||
.tag_out (div_sqrt_tag_out[1]),
|
||||
.valid_out (div_sqrt_valid_out[1]),
|
||||
.ready_out (div_sqrt_ready_out[1])
|
||||
.reset (sqrt_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
|
||||
.ready_in (sqrt_ready_in),
|
||||
.mask_in (mask_in),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa_s),
|
||||
.has_fflags (sqrt_has_fflags),
|
||||
.fflags (sqrt_fflags),
|
||||
.result (sqrt_result),
|
||||
.tag_out (sqrt_tag_out),
|
||||
.valid_out (sqrt_valid_out),
|
||||
.ready_out (sqrt_ready_out)
|
||||
);
|
||||
|
||||
wire [1:0][RSP_DATAW-1:0] div_sqrt_arb_data_in;
|
||||
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_arb_data_in
|
||||
assign div_sqrt_arb_data_in[i] = {
|
||||
div_sqrt_result[i],
|
||||
div_sqrt_has_fflags[i],
|
||||
div_sqrt_fflags[i],
|
||||
div_sqrt_tag_out[i]
|
||||
};
|
||||
end
|
||||
wire cvt_ret_int_in = ~is_itof;
|
||||
wire cvt_ret_int_out;
|
||||
|
||||
VX_fpu_cvt #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAG_WIDTH (TAG_WIDTH+1)
|
||||
) fpu_cvt (
|
||||
.clk (clk),
|
||||
.reset (cvt_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_CVT)),
|
||||
.ready_in (per_core_ready_in[FPU_CVT]),
|
||||
.mask_in (mask_in),
|
||||
.tag_in ({cvt_ret_int_in, tag_in}),
|
||||
.frm (frm),
|
||||
.is_itof (is_itof),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_CVT]),
|
||||
.fflags (per_core_fflags[FPU_CVT]),
|
||||
.result (per_core_result[FPU_CVT]),
|
||||
.tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
|
||||
.valid_out (per_core_valid_out[FPU_CVT]),
|
||||
.ready_out (per_core_ready_out[FPU_CVT])
|
||||
);
|
||||
|
||||
wire ncp_ret_int_in = (op_type == `INST_FPU_CMP)
|
||||
|| `INST_FPU_IS_CLASS(op_type, frm)
|
||||
|| `INST_FPU_IS_MVXW(op_type, frm);
|
||||
wire ncp_ret_int_out;
|
||||
|
||||
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
|
||||
wire ncp_ret_sext_out;
|
||||
|
||||
VX_fpu_ncp #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAG_WIDTH (TAG_WIDTH+2)
|
||||
) fpu_ncp (
|
||||
.clk (clk),
|
||||
.reset (ncp_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_NCP)),
|
||||
.ready_in (per_core_ready_in[FPU_NCP]),
|
||||
.mask_in (mask_in),
|
||||
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, tag_in}),
|
||||
.op_type (op_type),
|
||||
.frm (frm),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.result (per_core_result[FPU_NCP]),
|
||||
.has_fflags (per_core_has_fflags[FPU_NCP]),
|
||||
.fflags (per_core_fflags[FPU_NCP]),
|
||||
.tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
|
||||
.valid_out (per_core_valid_out[FPU_NCP]),
|
||||
.ready_out (per_core_ready_out[FPU_NCP])
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER ("P"),
|
||||
.ARBITER ("R"),
|
||||
.OUT_BUF (0)
|
||||
) div_sqrt_rsp_arb (
|
||||
) div_sqrt_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (div_sqrt_valid_out),
|
||||
.ready_in (div_sqrt_ready_out),
|
||||
.data_in (div_sqrt_arb_data_in),
|
||||
.valid_in ({sqrt_valid_out, div_valid_out}),
|
||||
.ready_in ({sqrt_ready_out, div_ready_out}),
|
||||
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
|
||||
{div_result, div_has_fflags, div_fflags, div_tag_out}}),
|
||||
.data_out ({
|
||||
per_core_result[FPU_DIVSQRT],
|
||||
per_core_has_fflags[FPU_DIVSQRT],
|
||||
|
@ -302,73 +270,12 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
// CVT core ///////////////////////////////////////////////////////////////
|
||||
|
||||
wire is_itof = per_core_op_type[FPU_CVT][1];
|
||||
wire is_signed = ~per_core_op_type[FPU_CVT][0];
|
||||
wire cvt_ret_int_in = ~is_itof;
|
||||
wire cvt_ret_int_out;
|
||||
|
||||
VX_fpu_cvt #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAG_WIDTH (1+TAG_WIDTH)
|
||||
) fpu_cvt (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_core_valid_in[FPU_CVT]),
|
||||
.ready_in (per_core_ready_in[FPU_CVT]),
|
||||
.mask_in (per_core_mask_in[FPU_CVT]),
|
||||
.tag_in ({cvt_ret_int_in, per_core_tag_in[FPU_CVT]}),
|
||||
.frm (per_core_frm[FPU_CVT]),
|
||||
.is_itof (is_itof),
|
||||
.is_signed (is_signed),
|
||||
.dataa (per_core_dataa[FPU_CVT]),
|
||||
.has_fflags (per_core_has_fflags[FPU_CVT]),
|
||||
.fflags (per_core_fflags[FPU_CVT]),
|
||||
.result (per_core_result[FPU_CVT]),
|
||||
.tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
|
||||
.valid_out (per_core_valid_out[FPU_CVT]),
|
||||
.ready_out (per_core_ready_out[FPU_CVT])
|
||||
);
|
||||
|
||||
// NCP core ///////////////////////////////////////////////////////////////
|
||||
|
||||
wire ncp_ret_int_in = (per_core_op_type[FPU_NCP] == `INST_FPU_CMP)
|
||||
|| `INST_FPU_IS_CLASS(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP])
|
||||
|| `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
|
||||
wire ncp_ret_int_out;
|
||||
|
||||
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
|
||||
wire ncp_ret_sext_out;
|
||||
|
||||
VX_fpu_ncp #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAG_WIDTH (TAG_WIDTH+2)
|
||||
) fpu_ncp (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (per_core_valid_in[FPU_NCP]),
|
||||
.ready_in (per_core_ready_in[FPU_NCP]),
|
||||
.mask_in (per_core_mask_in[FPU_NCP]),
|
||||
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, per_core_tag_in[FPU_NCP]}),
|
||||
.op_type (per_core_op_type[FPU_NCP]),
|
||||
.frm (per_core_frm[FPU_NCP]),
|
||||
.dataa (per_core_dataa[FPU_NCP]),
|
||||
.datab (per_core_datab[FPU_NCP]),
|
||||
.result (per_core_result[FPU_NCP]),
|
||||
.has_fflags (per_core_has_fflags[FPU_NCP]),
|
||||
.fflags (per_core_fflags[FPU_NCP]),
|
||||
.tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
|
||||
.valid_out (per_core_valid_out[FPU_NCP]),
|
||||
.ready_out (per_core_ready_out[FPU_NCP])
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
reg [NUM_FPCORES-1:0][RSP_DATAW+2-1:0] per_core_data_out;
|
||||
reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_FPCORES; ++i) begin
|
||||
for (integer i = 0; i < NUM_FPC; ++i) begin
|
||||
per_core_data_out[i][RSP_DATAW+1:2] = {
|
||||
per_core_result[i],
|
||||
per_core_has_fflags[i],
|
||||
|
@ -387,9 +294,9 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
`UNUSED_VAR (op_ret_int_out)
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_FPCORES),
|
||||
.NUM_INPUTS (NUM_FPC),
|
||||
.DATAW (RSP_DATAW + 2),
|
||||
.ARBITER ("R"),
|
||||
.ARBITER ("F"),
|
||||
.OUT_BUF (OUT_BUF)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
|
@ -403,22 +310,25 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef FPU_RV64F
|
||||
reg [`XLEN-1:0] result_w;
|
||||
reg [`XLEN-1:0] result_r;
|
||||
always @(*) begin
|
||||
case (op_ret_int_out)
|
||||
2'b11: result_w = `XLEN'($signed(result_s[i]));
|
||||
2'b01: result_w = {32'h00000000, result_s[i]};
|
||||
default: result_w = {32'hffffffff, result_s[i]};
|
||||
2'b11: result_r = `XLEN'($signed(result_s[i]));
|
||||
2'b01: result_r = {32'h00000000, result_s[i]};
|
||||
default: result_r = {32'hffffffff, result_s[i]};
|
||||
endcase
|
||||
end
|
||||
assign result[i] = result_w;
|
||||
assign result[i] = result_r;
|
||||
`else
|
||||
assign result[i] = result_s[i];
|
||||
`endif
|
||||
end
|
||||
|
||||
// can accept new request?
|
||||
assign ready_in = per_core_ready_in[core_select];
|
||||
|
||||
endmodule
|
||||
|
||||
`endif
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue