diff --git a/.clang-format b/.clang-format new file mode 100644 index 000000000..5a8564956 --- /dev/null +++ b/.clang-format @@ -0,0 +1,8 @@ +Language: Cpp +BasedOnStyle: LLVM +IndentWidth: 2 +TabWidth: 2 +ColumnLimit: 0 +UseTab: Never +BreakBeforeBraces: Attach +AlwaysBreakTemplateDeclarations: true \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f49dd42bf..877039497 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,17 +17,17 @@ on: [push, pull_request] jobs: setup: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 with: submodules: recursive - name: Cache Toolchain Directory id: cache-toolchain - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: tools key: ${{ runner.os }}-toolchain-v0.1 @@ -36,7 +36,7 @@ jobs: - name: Cache Third Party Directory id: cache-thirdparty - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: third_party key: ${{ runner.os }}-thirdparty-v0.1 @@ -46,7 +46,7 @@ jobs: - name: Install Dependencies if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true' run: | - sudo bash ./ci/system_updates.sh + sudo bash ./ci/install_dependencies.sh - name: Setup Toolchain if: steps.cache-toolchain.outputs.cache-hit != 'true' @@ -63,7 +63,7 @@ jobs: make -C third_party > /dev/null build: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: setup strategy: matrix: @@ -71,15 +71,15 @@ jobs: steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install Dependencies run: | - sudo bash ./ci/system_updates.sh + sudo bash ./ci/install_dependencies.sh - name: Cache Toolchain Directory id: cache-toolchain - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: tools key: ${{ runner.os }}-toolchain-v0.1 @@ -88,7 +88,7 @@ jobs: - name: Cache Third Party Directory id: cache-thirdparty - uses: 
actions/cache@v2 + uses: actions/cache@v4 with: path: third_party key: ${{ runner.os }}-thirdparty-v0.1 @@ -106,31 +106,31 @@ jobs: make tests -s > /dev/null - name: Upload Build Artifact - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v4 with: name: build-${{ matrix.xlen }} path: build${{ matrix.xlen }} tests: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: build strategy: fail-fast: false matrix: - name: [regression, opencl, cache, config1, config2, debug, stress] + name: [regression, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, vector] xlen: [32, 64] steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v4 - name: Install Dependencies run: | - sudo bash ./ci/system_updates.sh + sudo bash ./ci/install_dependencies.sh - name: Cache Toolchain Directory id: cache-toolchain - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: tools key: ${{ runner.os }}-toolchain-v0.1 @@ -139,7 +139,7 @@ jobs: - name: Cache Third Party Directory id: cache-thirdparty - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: third_party key: ${{ runner.os }}-thirdparty-v0.1 @@ -147,7 +147,7 @@ jobs: ${{ runner.os }}-thirdparty- - name: Download Build Artifact - uses: actions/download-artifact@v2 + uses: actions/download-artifact@v4 with: name: build-${{ matrix.xlen }} path: build${{ matrix.xlen }} @@ -161,16 +161,15 @@ jobs: ./ci/regression.sh --unittest ./ci/regression.sh --isa ./ci/regression.sh --kernel - ./ci/regression.sh --synthesis ./ci/regression.sh --regression else ./ci/regression.sh --${{ matrix.name }} fi complete: - runs-on: ubuntu-20.04 + runs-on: ubuntu-22.04 needs: tests steps: - name: Check Completion - run: echo "All matrix jobs passed" \ No newline at end of file + run: echo "All matrix jobs passed" diff --git a/.gitignore b/.gitignore index 039456040..43388e9cb 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /build* /.vscode -*.cache \ No newline at end of file 
+*.cache +*.code-workspace diff --git a/.gitmodules b/.gitmodules index df3ca47e2..32abfe9cb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,9 @@ -[submodule "third_party/fpnew"] - path = third_party/fpnew - url = https://github.com/pulp-platform/fpnew.git [submodule "third_party/softfloat"] path = third_party/softfloat url = https://github.com/ucb-bar/berkeley-softfloat-3.git [submodule "third_party/ramulator"] path = third_party/ramulator url = https://github.com/CMU-SAFARI/ramulator2.git +[submodule "third_party/cvfpu"] + path = third_party/cvfpu + url = https://github.com/openhwgroup/cvfpu.git diff --git a/Dockerfile.dev b/Dockerfile.dev new file mode 100644 index 000000000..22cd74155 --- /dev/null +++ b/Dockerfile.dev @@ -0,0 +1,20 @@ +FROM ubuntu:20.04 + +LABEL "Udit Subramanya"="usubramanya3@gatech.edu" + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && \ + apt-get install -y build-essential valgrind git wget libpng-dev libboost-all-dev uuid-dev ccache cmake + +# Third-Party Repository to Install g++11 on Ubuntu 20.04 +RUN apt-get install -y manpages-dev software-properties-common +RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test + +RUN apt-get install -y gcc-11 g++-11 + +RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11 +RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11 + +# create a directory for mounting the volume +WORKDIR /root/vortex \ No newline at end of file diff --git a/README.md b/README.md index 7cafd498d..a7228e772 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,35 @@ # Vortex GPGPU -Vortex is a full-stack open-source RISC-V GPGPU. +Vortex is a full-stack open-source RISC-V GPGPU. Vortex supports multiple **backend drivers**, including our C++ simulator (simx), an RTL simulator, and physical Xilinx and Altera FPGAs-- all controlled by a single driver script. The chosen driver determines the corresponding code invoked to run Vortex.
Generally, developers will prototype their intended design in simx, before going forward with an RTL implementation. Alternatively, you can get up and running by selecting a driver of your choice and running a demo program. + +## Website +Vortex news can be found on its [website](https://vortex.cc.gatech.edu/) + +## Citation +``` +@inproceedings{10.1145/3466752.3480128, + author = {Tine, Blaise and Yalamarthy, Krishna Praveen and Elsabbagh, Fares and Hyesoon, Kim}, + title = {Vortex: Extending the RISC-V ISA for GPGPU and 3D-Graphics}, + year = {2021}, + isbn = {9781450385572}, + publisher = {Association for Computing Machinery}, + address = {New York, NY, USA}, + url = {https://doi.org/10.1145/3466752.3480128}, + doi = {10.1145/3466752.3480128}, + abstract = {The importance of open-source hardware and software has been increasing. However, despite GPUs being one of the more popular accelerators across various applications, there is very little open-source GPU infrastructure in the public domain. We argue that one of the reasons for the lack of open-source infrastructure for GPUs is rooted in the complexity of their ISA and software stacks. In this work, we first propose an ISA extension to RISC-V that supports GPGPUs and graphics. The main goal of the ISA extension proposal is to minimize the ISA changes so that the corresponding changes to the open-source ecosystem are also minimal, which makes for a sustainable development ecosystem. To demonstrate the feasibility of the minimally extended RISC-V ISA, we implemented the complete software and hardware stacks of Vortex on FPGA. Vortex is a PCIe-based soft GPU that supports OpenCL and OpenGL. Vortex can be used in a variety of applications, including machine learning, graph analytics, and graphics rendering.
Vortex can scale up to 32 cores on an Altera Stratix 10 FPGA, delivering a peak performance of 25.6 GFlops at 200 Mhz.}, + booktitle = {MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture}, + pages = {754–766}, + numpages = {13}, + keywords = {reconfigurable computing, memory systems., computer graphics}, + location = {Virtual Event, Greece}, + series = {MICRO '21} +} +``` ## Specifications - Support RISC-V RV32IMAF and RV64IMAFD + - Microarchitecture: - configurable number of cores, warps, and threads. - configurable number of ALU, FPU, LSU, and SFU units per core. @@ -29,48 +54,50 @@ Vortex is a full-stack open-source RISC-V GPGPU. - `ci`: Continuous integration scripts. - `miscs`: Miscellaneous resources. -## Build Instructions -More detailed build instructions can be found [here](docs/install_vortex.md). +## Quick Start +If you are interested in a stable release of Vortex, you can download the latest release [here](https://github.com/vortexgpgpu/vortex/releases/latest). Otherwise, you can pull the most recent, but (potentially) unstable version as shown below. The following steps demonstrate how to build and run Vortex with the default driver: SimX. If you are interested in a different backend, look [here](docs/simulation.md). + ### Supported OS Platforms -- Ubuntu 18.04, 20.04 +- Ubuntu 18.04, 20.04, 22.04, 24.04 - Centos 7 ### Toolchain Dependencies +The following dependencies will be fetched prebuilt by `toolchain_install.sh`. 
- [POCL](http://portablecl.org/) - [LLVM](https://llvm.org/) - [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain) - [Verilator](https://www.veripool.org/verilator) -- [FpNew](https://github.com/pulp-platform/fpnew.git) +- [cvfpu](https://github.com/openhwgroup/cvfpu.git) - [SoftFloat](https://github.com/ucb-bar/berkeley-softfloat-3.git) - [Ramulator](https://github.com/CMU-SAFARI/ramulator.git) - [Yosys](https://github.com/YosysHQ/yosys) - [Sv2v](https://github.com/zachjs/sv2v) -### Install development tools -```sh -sudo apt-get install build-essential -sudo apt-get install binutils -sudo apt-get install python -sudo apt-get install uuid-dev -sudo apt-get install git -``` ### Install Vortex codebase ```sh -git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git -cd vortex + git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git + cd vortex +``` +### Install system dependencies +```sh +# ensure dependent libraries are present +sudo ./ci/install_dependencies.sh ``` ### Configure your build folder ```sh -mkdir build -cd build -../configure --xlen=32 --tooldir=$HOME/tools + mkdir build + cd build + # for 32bit + ../configure --xlen=32 --tooldir=$HOME/tools + # for 64bit + ../configure --xlen=64 --tooldir=$HOME/tools ``` ### Install prebuilt toolchain ```sh -./ci/toolchain_install.sh --all + ./ci/toolchain_install.sh --all ``` -### Set environment variables +### set environment variables ```sh -# should always run before using the toolchain! -source ./ci/toolchain_env.sh + # should always run before using the toolchain! + source ./ci/toolchain_env.sh ``` ### Building Vortex ```sh @@ -88,20 +115,20 @@ make -s make -s make install ``` -- Building Vortex 64-bit simply requires using --xlen=64 configure option. +- Building Vortex 64-bit requires setting --xlen=64 configure option. 
```sh -../configure --xlen=32 --tooldir=$HOME/tools +../configure --xlen=64 --tooldir=$HOME/tools ``` - Sourcing "./ci/toolchain_env.sh" is required everytime you start a new terminal. we recommend adding "source /ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login. ```sh echo "source /ci/toolchain_env.sh" >> ~/.bashrc ``` -- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder. +- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again without any options to get changes propagated to your build folder. ```sh ../configure ``` -- To debug the GPU, you can generate a "run.log" trace. see /docs/debugging.md for more information. +- To debug the GPU, the simulation can generate a runtime trace for analysis. See /docs/debugging.md for more information. ```sh ./ci/blackbox.sh --app=demo --debug=3 ``` -- For additional information, check out the /docs. +- For additional information, check out the [documentation](docs/index.md) diff --git a/ci/blackbox.sh b/ci/blackbox.sh index fe94677aa..27a43781b 100755 --- a/ci/blackbox.sh +++ b/ci/blackbox.sh @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +SCRIPT_DIR=$(dirname "$0") +ROOT_DIR=$SCRIPT_DIR/.. + show_usage() { echo "Vortex BlackBox Test Driver v1.0" @@ -29,302 +32,174 @@ show_help() echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp" } -SCRIPT_DIR=$(dirname "$0") -ROOT_DIR=$SCRIPT_DIR/.. 
- -DRIVER=simx -APP=sgemm -CLUSTERS=1 -CORES=1 -WARPS=4 -THREADS=4 -L2= -L3= -DEBUG=0 -DEBUG_LEVEL=0 -SCOPE=0 -HAS_ARGS=0 -PERF_CLASS=0 -REBUILD=2 -TEMPBUILD=0 -LOGFILE=run.log - -for i in "$@" -do -case $i in - --driver=*) - DRIVER=${i#*=} - shift - ;; - --app=*) - APP=${i#*=} - shift - ;; - --clusters=*) - CLUSTERS=${i#*=} - shift - ;; - --cores=*) - CORES=${i#*=} - shift - ;; - --warps=*) - WARPS=${i#*=} - shift - ;; - --threads=*) - THREADS=${i#*=} - shift - ;; - --l2cache) - L2=-DL2_ENABLE - shift - ;; - --l3cache) - L3=-DL3_ENABLE - shift - ;; - --debug=*) - DEBUG_LEVEL=${i#*=} - DEBUG=1 - shift - ;; - --scope) - SCOPE=1 - CORES=1 - shift - ;; - --perf=*) - PERF_FLAG=-DPERF_ENABLE - PERF_CLASS=${i#*=} - shift - ;; - --args=*) - ARGS=${i#*=} - HAS_ARGS=1 - shift - ;; - --rebuild=*) - REBUILD=${i#*=} - shift - ;; - --log=*) - LOGFILE=${i#*=} - shift - ;; - --help) - show_help - exit 0 - ;; - *) - show_usage - exit -1 - ;; -esac -done - -if [ $REBUILD -eq 3 ]; -then - REBUILD=1 - TEMPBUILD=1 -fi - -case $DRIVER in - gpu) - DRIVER_PATH= - ;; - simx) - DRIVER_PATH=$ROOT_DIR/runtime/simx - ;; - rtlsim) - DRIVER_PATH=$ROOT_DIR/runtime/rtlsim - ;; - opae) - DRIVER_PATH=$ROOT_DIR/runtime/opae - ;; - xrt) - DRIVER_PATH=$ROOT_DIR/runtime/xrt - ;; - *) - echo "invalid driver: $DRIVER" - exit -1 - ;; -esac - -if [ -d "$ROOT_DIR/tests/opencl/$APP" ]; -then - APP_PATH=$ROOT_DIR/tests/opencl/$APP -elif [ -d "$ROOT_DIR/tests/regression/$APP" ]; -then - APP_PATH=$ROOT_DIR/tests/regression/$APP -else - echo "Application folder not found: $APP" - exit -1 -fi - -if [ "$DRIVER" = "gpu" ]; -then - # running application - if [ $HAS_ARGS -eq 1 ] - then - echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER" - OPTS=$ARGS make -C $APP_PATH run-$DRIVER - status=$? +add_option() { + if [ -n "$1" ]; then + echo "$1 $2" else - echo "running: make -C $APP_PATH run-$DRIVER" - make -C $APP_PATH run-$DRIVER - status=$? 
+ echo "$2" + fi +} + +DEFAULTS() { + DRIVER=simx + APP=sgemm + DEBUG=0 + DEBUG_LEVEL=0 + SCOPE=0 + HAS_ARGS=0 + PERF_CLASS=0 + CONFIGS="$CONFIGS" + REBUILD=2 + TEMPBUILD=0 + LOGFILE=run.log +} + +parse_args() { + DEFAULTS + for i in "$@"; do + case $i in + --driver=*) DRIVER=${i#*=} ;; + --app=*) APP=${i#*=} ;; + --clusters=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CLUSTERS=${i#*=}") ;; + --cores=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CORES=${i#*=}") ;; + --warps=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_WARPS=${i#*=}") ;; + --threads=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_THREADS=${i#*=}") ;; + --l2cache) CONFIGS=$(add_option "$CONFIGS" "-DL2_ENABLE") ;; + --l3cache) CONFIGS=$(add_option "$CONFIGS" "-DL3_ENABLE") ;; + --perf=*) CONFIGS=$(add_option "$CONFIGS" "-DPERF_ENABLE"); PERF_CLASS=${i#*=} ;; + --debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;; + --scope) SCOPE=1; ;; + --args=*) HAS_ARGS=1; ARGS=${i#*=} ;; + --rebuild=*) REBUILD=${i#*=} ;; + --log=*) LOGFILE=${i#*=} ;; + --help) show_help; exit 0 ;; + *) show_usage; exit 1 ;; + esac + done + + if [ $REBUILD -eq 3 ]; + then + REBUILD=1 + TEMPBUILD=1 + fi +} + +set_driver_path() { + case $DRIVER in + gpu) DRIVER_PATH="" ;; + simx|rtlsim|opae|xrt) DRIVER_PATH="$ROOT_DIR/runtime/$DRIVER" ;; + *) echo "Invalid driver: $DRIVER"; exit 1 ;; + esac +} + +set_app_path() { + if [ -d "$ROOT_DIR/tests/opencl/$APP" ]; then + APP_PATH="$ROOT_DIR/tests/opencl/$APP" + elif [ -d "$ROOT_DIR/tests/regression/$APP" ]; then + APP_PATH="$ROOT_DIR/tests/regression/$APP" + else + echo "Application folder not found: $APP" + exit 1 + fi +} + +build_driver() { + local cmd_opts="" + [ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=$DEBUG_LEVEL") + [ $SCOPE -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "SCOPE=1") + [ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DESTDIR=\"$TEMPDIR\"") + [ -n "$CONFIGS" ] && cmd_opts=$(add_option "$cmd_opts" "CONFIGS=\"$CONFIGS\"") + + if [ -n "$cmd_opts" ]; then + echo "Running: 
$cmd_opts make -C $DRIVER_PATH > /dev/null" + eval "$cmd_opts make -C $DRIVER_PATH > /dev/null" + else + echo "Running: make -C $DRIVER_PATH > /dev/null" + make -C $DRIVER_PATH > /dev/null + fi +} + +run_app() { + local cmd_opts="" + [ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1") + [ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"") + [ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"") + + if [ $DEBUG -ne 0 ]; then + if [ -n "$cmd_opts" ]; then + echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" + eval "$cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" + else + echo "Running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" + make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 + fi + else + if [ -n "$cmd_opts" ]; then + echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER" + eval "$cmd_opts make -C $APP_PATH run-$DRIVER" + else + echo "Running: make -C $APP_PATH run-$DRIVER" + make -C $APP_PATH run-$DRIVER + fi + fi + status=$? + return $status +} + +main() { + parse_args "$@" + set_driver_path + set_app_path + + # execute on default installed GPU + if [ "$DRIVER" = "gpu" ]; then + run_app + exit $? 
+ fi + + if [ -n "$CONFIGS" ]; then + echo "CONFIGS=$CONFIGS" + fi + + if [ $REBUILD -ne 0 ]; then + BLACKBOX_CACHE=blackbox.$DRIVER.cache + LAST_CONFIGS=$(cat "$BLACKBOX_CACHE" 2>/dev/null || echo "") + + if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; then + make -C $DRIVER_PATH clean-driver > /dev/null + echo "$CONFIGS+$DEBUG+$SCOPE" > "$BLACKBOX_CACHE" + fi + fi + + export VORTEX_PROFILING=$PERF_CLASS + + make -C "$ROOT_DIR/hw" config > /dev/null + make -C "$ROOT_DIR/runtime/stub" > /dev/null + + if [ $TEMPBUILD -eq 1 ]; then + # setup temp directory + TEMPDIR=$(mktemp -d) + mkdir -p "$TEMPDIR" + # build stub driver + echo "running: DESTDIR=$TEMPDIR make -C $ROOT_DIR/runtime/stub" + DESTDIR="$TEMPDIR" make -C $ROOT_DIR/runtime/stub > /dev/null + # register tempdir cleanup on exit + trap "rm -rf $TEMPDIR" EXIT + fi + + build_driver + run_app + status=$? + + if [ $DEBUG -eq 1 ] && [ -f "$APP_PATH/trace.vcd" ]; then + mv -f $APP_PATH/trace.vcd . + fi + + if [ $SCOPE -eq 1 ] && [ -f "$APP_PATH/scope.vcd" ]; then + mv -f $APP_PATH/scope.vcd . 
fi exit $status -fi +} -CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS" - -echo "CONFIGS=$CONFIGS" - -if [ $REBUILD -ne 0 ] -then - BLACKBOX_CACHE=blackbox.$DRIVER.cache - if [ -f "$BLACKBOX_CACHE" ] - then - LAST_CONFIGS=`cat $BLACKBOX_CACHE` - fi - - if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; - then - make -C $DRIVER_PATH clean-driver > /dev/null - echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE - fi -fi - -# export performance monitor class identifier -export VORTEX_PROFILING=$PERF_CLASS - -status=0 - -# ensure config update -make -C $ROOT_DIR/hw config > /dev/null - -# ensure the stub driver is present -make -C $ROOT_DIR/runtime/stub > /dev/null - -if [ $DEBUG -ne 0 ] -then - # running application - if [ $TEMPBUILD -eq 1 ] - then - # setup temp directory - TEMPDIR=$(mktemp -d) - mkdir -p "$TEMPDIR/$DRIVER" - - # driver initialization - if [ $SCOPE -eq 1 ] - then - echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH" - DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - else - echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH" - DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - fi - - # running application - if [ $HAS_ARGS -eq 1 ] - then - echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" - DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 - status=$? - else - echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" - DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 - status=$? 
- fi - - # cleanup temp directory - trap "rm -rf $TEMPDIR" EXIT - else - # driver initialization - if [ $SCOPE -eq 1 ] - then - echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH" - DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - else - echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH" - DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - fi - - # running application - if [ $HAS_ARGS -eq 1 ] - then - echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" - DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 - status=$? - else - echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" - DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 - status=$? - fi - fi - - if [ -f "$APP_PATH/trace.vcd" ] - then - mv -f $APP_PATH/trace.vcd . - fi -else - if [ $TEMPBUILD -eq 1 ] - then - # setup temp directory - TEMPDIR=$(mktemp -d) - mkdir -p "$TEMPDIR/$DRIVER" - - # driver initialization - if [ $SCOPE -eq 1 ] - then - echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH" - DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - else - echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH" - DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - fi - - # running application - if [ $HAS_ARGS -eq 1 ] - then - echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER" - VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER - status=$? - else - echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER" - VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER - status=$? 
- fi - - # cleanup temp directory - trap "rm -rf $TEMPDIR" EXIT - else - - # driver initialization - if [ $SCOPE -eq 1 ] - then - echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH" - SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - else - echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH" - CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null - fi - - # running application - if [ $HAS_ARGS -eq 1 ] - then - echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER" - OPTS=$ARGS make -C $APP_PATH run-$DRIVER - status=$? - else - echo "running: make -C $APP_PATH run-$DRIVER" - make -C $APP_PATH run-$DRIVER - status=$? - fi - fi -fi - -exit $status +main "$@" \ No newline at end of file diff --git a/ci/install_dependencies.sh b/ci/install_dependencies.sh new file mode 100755 index 000000000..44e16400c --- /dev/null +++ b/ci/install_dependencies.sh @@ -0,0 +1,46 @@ +#!/bin/sh + +# Copyright 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +# Function to check if GCC version is less than 11 +check_gcc_version() { + local gcc_version + gcc_version=$(gcc -dumpversion) + if dpkg --compare-versions "$gcc_version" lt 11; then + return 0 # GCC version is less than 11 + else + return 1 # GCC version is 11 or greater + fi +} + +# Update package list +apt-get update -y + +# install system dependencies +apt-get install -y build-essential valgrind libstdc++6 binutils python3 uuid-dev ccache cmake libffi7 + +# Check and install GCC 11 if necessary +if check_gcc_version; then + echo "GCC version is less than 11. Installing GCC 11..." + add-apt-repository -y ppa:ubuntu-toolchain-r/test + apt-get update + apt-get install -y g++-11 gcc-11 + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 +else + echo "GCC version is 11 or greater. No need to install GCC 11." +fi diff --git a/ci/regression.sh.in b/ci/regression.sh.in index 3cd46a463..94ac4651f 100755 --- a/ci/regression.sh.in +++ b/ci/regression.sh.in @@ -19,6 +19,8 @@ set -e # clear blackbox cache rm -f blackbox.*.cache +# HW: add a test "VM Test" to make sure VM feature is enabled + XLEN=${XLEN:=@XLEN@} XSIZE=$((XLEN / 8)) @@ -41,31 +43,23 @@ isa() make -C tests/riscv/isa run-simx make -C tests/riscv/isa run-rtlsim - make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-32f + make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f - make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-32f + make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f - make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-32f + make -C sim/rtlsim clean && 
CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f if [ "$XLEN" == "64" ] then - make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-64d + make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d - make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-64d + make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d - make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-64f + make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f - make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-64f + make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f - make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-64fx + make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64fx fi # clean build @@ -100,10 +94,18 @@ regression() # test global barrier CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2 CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2 + CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tgbar" --cores=2 # test local barrier ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar" + ./ci/blackbox.sh 
--driver=xrt --app=dogfood --args="-n1 -tbar" + + # test temp driver mode for + ./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3 + + # test for matmul + CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1" echo "regression tests done!" } @@ -124,6 +126,22 @@ opencl() echo "opencl tests done!" } +vm(){ + echo "begin vm tests..." + + make -C sim/simx clean && CONFIGS="-DVM_ENABLE" make -C sim/simx + make -C runtime/simx clean && CONFIGS="-DVM_ENABLE" make -C runtime/simx + make -C tests/opencl run-simx + make -C tests/regression run-simx + + make -C sim/simx clean && CONFIGS="-DVM_ENABLE -DVM_ADDR_MODE=BARE" make -C sim/simx + make -C runtime/simx clean && CONFIGS="-DVM_ENABLE -DVM_ADDR_MODE=BARE" make -C runtime/simx + make -C tests/opencl run-simx + make -C tests/regression run-simx + + echo "vm tests done!" +} + cache() { echo "begin cache tests..." @@ -140,27 +158,33 @@ cache() # reduce l1 line size CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr + CONFIGS="-DL1_LINE_SIZE=$XSIZE -DDISABLE_L1" ./ci/blackbox.sh --driver=rtlsim --app=io_addr CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx # test cache ways - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=1 -DDCACHE_NUM_WAYS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx # test cache banking - CONFIGS="-DLMEM_NUM_BANKS=4 
-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx - CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx + CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=simx --app=sgemmx + CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --threads=8 + CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=simx --app=sgemmx --threads=8 + + # replacement policy + CONFIGS="-DDCACHE_REPL_POLICY=0" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_REPL_POLICY=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx + CONFIGS="-DDCACHE_REPL_POLICY=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx # test writeback - CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress - CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress - CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress - CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=0 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh 
--driver=rtlsim --app=mstress + CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=simx --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress + CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress # cache clustering CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2 @@ -235,33 +259,39 @@ config2() # test opaesim ./ci/blackbox.sh --driver=opae --app=printf ./ci/blackbox.sh --driver=opae --app=diverge + ./ci/blackbox.sh --driver=xrt --app=diverge # disable DPI - CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood - CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood + if [ "$XLEN" == "64" ]; then + # need to disable trig on 64-bit due to a bug inside fpnew's sqrt core. 
+ CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-xtrig -xbar -xgbar" + CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-xtrig -xbar -xgbar" + CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-xtrig -xbar -xgbar" + else + CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood + CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood + CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood + fi # custom program startup address make -C tests/regression/dogfood clean-kernel - STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood + STARTUP_ADDR=0x80000000 make -C tests/regression/dogfood ./ci/blackbox.sh --driver=simx --app=dogfood ./ci/blackbox.sh --driver=rtlsim --app=dogfood make -C tests/regression/dogfood clean-kernel # disabling M & F extensions - make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null - make -C tests/riscv/isa run-rtlsim-32i + make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32i make -C sim/rtlsim clean # disabling ZICOND extension CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo - # test AXI bus - AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress - - # test 128-bit MEM block + # test 128-bit memory block CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=xrt --app=mstress - # test XLEN-bit MEM block + # test XLEN-bit memory block CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress @@ -269,11 +299,35 @@ config2() CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress 
--threads=8 CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8 - # test single-bank DRAM - CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress + # test single-bank memory + if [ "$XLEN" == "64" ]; then + CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress + else + CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress + fi - # test 27-bit DRAM address - CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress + # test larger memory address + if [ "$XLEN" == "64" ]; then + CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=xrt --app=mstress + else + CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=xrt --app=mstress + fi + + # test memory banks interleaving + CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=1" ./ci/blackbox.sh --driver=opae --app=mstress + CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress + + # test memory ports + CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress + CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8 + CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress + CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8 + CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8 + 
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8 echo "configuration-2 tests done!" } @@ -299,20 +353,32 @@ debug() test_csv_trace + CONFIGS="-O0" ./ci/blackbox.sh --driver=opae --app=demo --args="-n1" + CONFIGS="-O0" ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1" CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" + CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=xrt --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1" - ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1" echo "debugging tests done!" } +scope() +{ + echo "begin scope tests..." + + SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=opae --app=demo --args="-n1" --scope + SCOPE_DEPTH=128 ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1" --scope + + echo "debugging scope done!" +} + stress() { echo "begin stress tests..." # test verilator reset values CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood - CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache + CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=xrt --app=sgemmx --args="-n128" --l2cache echo "stress tests done!" } @@ -322,15 +388,25 @@ synthesis() echo "begin synthesis tests..." PREFIX=build_base make -C hw/syn/yosys clean - PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis + PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" make -C hw/syn/yosys synthesis echo "synthesis tests done!" } +vector() +{ + echo "begin vector tests..." 
+ + make -C sim/simx clean && CONFIGS="-DEXT_V_ENABLE" make -C sim/simx + TOOLDIR=@TOOLDIR@ XLEN=@XLEN@ VLEN=256 REG_TESTS=1 ./tests/riscv/riscv-vector-tests/run-test.sh + + echo "vector tests done!" +} + show_usage() { echo "Vortex Regression Test" - echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]" + echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [--h|--help]" } declare -a tests=() @@ -359,6 +435,9 @@ while [ "$1" != "" ]; do --cache ) tests+=("cache") ;; + --vm ) + tests+=("vm") + ;; --config1 ) tests+=("config1") ;; @@ -368,12 +447,18 @@ while [ "$1" != "" ]; do --debug ) tests+=("debug") ;; + --scope ) + tests+=("scope") + ;; --stress ) tests+=("stress") ;; --synthesis ) tests+=("synthesis") ;; + --vector ) + tests+=("vector") + ;; --all ) tests=() tests+=("unittest") @@ -382,11 +467,14 @@ while [ "$1" != "" ]; do tests+=("regression") tests+=("opencl") tests+=("cache") + tests+=("vm") tests+=("config1") tests+=("config2") tests+=("debug") + tests+=("scope") tests+=("stress") tests+=("synthesis") + tests+=("vector") ;; -h | --help ) show_usage diff --git a/ci/system_updates.sh b/ci/system_updates.sh deleted file mode 100755 index 43abbe5ab..000000000 --- a/ci/system_updates.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/sh - -# Copyright 2019-2023 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -set -e - -apt-get update -y - -add-apt-repository -y ppa:ubuntu-toolchain-r/test -apt-get update -apt-get install -y g++-11 gcc-11 -update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100 -update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100 - -apt-get install -y build-essential valgrind libstdc++6 binutils python uuid-dev ccache diff --git a/ci/toolchain_env.sh.in b/ci/toolchain_env.sh.in index 9fcfdbb89..9c3387c13 100755 --- a/ci/toolchain_env.sh.in +++ b/ci/toolchain_env.sh.in @@ -1,13 +1,13 @@ #!/bin/sh # Copyright 2019-2023 -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,7 +15,6 @@ # limitations under the License. 
TOOLDIR=${TOOLDIR:=@TOOLDIR@} - export PATH=$TOOLDIR/verilator/bin:$PATH export SV2V_PATH=$TOOLDIR/sv2v diff --git a/ci/toolchain_install.sh.in b/ci/toolchain_install.sh.in index 935568ff0..01ebe889b 100755 --- a/ci/toolchain_install.sh.in +++ b/ci/toolchain_install.sh.in @@ -23,9 +23,9 @@ OSVERSION=${OSVERSION:=@OSVERSION@} riscv32() { case $OSVERSION in - "centos/7") parts=$(eval echo {a..h}) ;; - "ubuntu/focal") parts=$(eval echo {a..k}) ;; - *) parts=$(eval echo {a..j}) ;; + "centos/7") parts=$(eval echo {a..l}) ;; + "ubuntu/bionic") parts=$(eval echo {a..j}) ;; + *) parts=$(eval echo {a..k}) ;; esac rm -f riscv32-gnu-toolchain.tar.bz2.parta* for x in $parts @@ -41,7 +41,7 @@ riscv32() riscv64() { case $OSVERSION in - "centos/7") parts=$(eval echo {a..h}) ;; + "centos/7") parts=$(eval echo {a..l}) ;; *) parts=$(eval echo {a..j}) ;; esac rm -f riscv64-gnu-toolchain.tar.bz2.parta* diff --git a/ci/trace_csv.py b/ci/trace_csv.py index 4a36f5f6a..077f8027e 100755 --- a/ci/trace_csv.py +++ b/ci/trace_csv.py @@ -44,7 +44,8 @@ def load_config(filename): 'num_barriers': int(config_match.group(7)), } return config - return None + print("Error: missing CONFIGS: header") + sys.exit(1) def parse_simx(log_lines): pc_pattern = r"PC=(0x[0-9a-fA-F]+)" @@ -274,6 +275,8 @@ def split_log_file(log_filename): if current_sublog is not None: sublogs.append(current_sublog) + else: + sublogs.append(log_lines) return sublogs diff --git a/ci/travis_run.py b/ci/travis_run.py index 907cf5ce4..70459cbee 100755 --- a/ci/travis_run.py +++ b/ci/travis_run.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2019-2023 # diff --git a/config.mk.in b/config.mk.in index 81339f195..57f77059e 100644 --- a/config.mk.in +++ b/config.mk.in @@ -31,7 +31,4 @@ RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv$(XLEN)-gnu-toolchain RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX) -VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime -VORTEX_KN_PATH ?= 
$(VORTEX_HOME)/kernel - -THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party \ No newline at end of file +THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party diff --git a/configure b/configure index 62975784b..fbcd3f130 100755 --- a/configure +++ b/configure @@ -26,6 +26,8 @@ detect_osversion() { case "$VERSION_CODENAME" in bionic) osversion="ubuntu/bionic";; focal) osversion="ubuntu/focal";; + jammy) osversion="ubuntu/focal";; + noble) osversion="ubuntu/focal";; # Add new versions as needed esac ;; @@ -63,7 +65,7 @@ copy_files() { filename_no_ext="${filename%.in}" dest_file="$dest_dir/$filename_no_ext" mkdir -p "$dest_dir" - sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g" "$file" > "$dest_file" + sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g" "$file" > "$dest_file" # apply permissions to bash scripts read -r firstline < "$dest_file" if [[ "$firstline" =~ ^#!.*bash ]]; then @@ -167,8 +169,8 @@ fi SUBDIRS=("." 
"!ci" "!perf" "hw*" "kernel*" "runtime*" "sim*" "tests*") # Get the directory of the script -SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +SOURCE_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" -THIRD_PARTY_DIR=$SCRIPT_DIR/third_party +THIRD_PARTY_DIR=$SOURCE_DIR/third_party -copy_files "$SCRIPT_DIR" "$CURRENT_DIR" +copy_files "$SOURCE_DIR" "$CURRENT_DIR" diff --git a/docs/altera_fpga_guide.md b/docs/altera_fpga_guide.md deleted file mode 100644 index 61d1ae26e..000000000 --- a/docs/altera_fpga_guide.md +++ /dev/null @@ -1,79 +0,0 @@ -# FPGA Startup and Configuration Guide - -OPAE Environment Setup ----------------------- - - $ source /opt/inteldevstack/init_env_user.sh - $ export OPAE_HOME=/opt/opae/1.1.2 - $ export PATH=$OPAE_HOME/bin:$PATH - $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH - $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH - $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH - -OPAE Build ------------------- - -The FPGA has to following configuration options: -- DEVICE_FAMILY=arria10 | stratix10 -- NUM_CORES=#n - -Command line: - - $ cd hw/syn/altera/opae - $ PREFIX=test1 TARGET=fpga NUM_CORES=4 make - -A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete. -Setting TARGET=ase will build the project for simulation using Intel ASE. 
- - -OPAE Build Configuration ------------------------- - -The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when build the processor.For example, have the following parameters that can be configured: -- `NUM_WARPS`: Number of warps per cores -- `NUM_THREADS`: Number of threads per warps -- `PERF_ENABLE`: enable the use of all profile counters - -You configure the syntesis build from the command line: - - $ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make - -OPAE Build Progress -------------------- - -You could check the last 10 lines in the build log for possible errors until build completion. - - $ tail -n 10 /build.log - -Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs. - - $ ps -u - -If the build fails and you need to restart it, clean up the build folder using the following command: - - $ make clean - -The bitstream file `vortex_afu.gbs` should exist when the build is done: - - $ ls -lsa /synth/vortex_afu.gbs - - -Signing the bitstream and Programming the FPGA ----------------------------------------------- - - $ cd - $ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs - $ fpgasupdate vortex_afu_unsigned_ssl.gbs - -Sample FPGA Run Test --------------------- - -Ensure you have the correct opae runtime for the FPGA target - - $ make -C runtime/opae clean - $ TARGET=FPGA make -C runtime/opae - -Run the following from your Vortex build directory - - $ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128" - diff --git a/docs/contributing.md b/docs/contributing.md index 14e0ccd0c..0250e9f9f 100644 --- a/docs/contributing.md +++ b/docs/contributing.md @@ -1,18 +1,37 @@ -# Contributing to Vortex on Github +# Contributing to Vortex -## Github Details -- There are two main repos, `vortex` (public, this one) and `vortex-dev` (private) -- todo: Most current development is on `vortex` -- If you have a legacy 
version of `vortex`, you can use the releases branch or tags to access the repo at that point in time +## Github +Vortex uses Github to host its git repositories. +There are a lot of ways to use the features on Github for collaboration. +Therefore, this documentation details the standard procedure for contributing to Vortex. +Development of Vortex is consolidated to this repo, `vortex` and any associated forks. +Previously, there was active work done on a private repo named `vortex-dev`. +`vortex-dev` has officially been deprecated and fully merged into this public repo, `vortex`. +If you are returning to this project and have legacy versions of Vortex, you can use the releases branches to access older versions. ## Contribution Process -- You should create a new branch from develop that is clearly named with the feature that you want to add -- Avoid pushing directly to the `master` branch instead you will need to make a Pull Request (PR) -- There should be protections in place that prevent pushing directly to the main branch, but don't rely on it -- When you make a PR it will be tested against the continuous integration (ci) pipeline (see `continuous_integration.md`) -- It is not sufficient to just write some tests, they need to be incorporated into the ci pipeline to make sure they are run -- During a PR, you might receive feedback regarding your changes and you might need to make further commits to your branch +In an effort to keep `vortex` organized, permissions to directly create branches and push code has been limited to admins. +However, contributions are strongly encouraged and keep the project moving forward! Here is the procedure for contributing: +1. Create a fork of `vortex` +2. In your fork, create a branch from `master` that briefly explains the work you are adding (ie: `develop-documentation`) +3. Make your changes on the new branch in your fork. 
You may create as many commits as you need, which might be common if you are making multiple iterations +4. Since you are the owner of your fork, you have full permissions to push commits to your fork +5. When you are satisfied with the changes on your fork, you can open a PR from your fork using the online interface +6. If you recently made a push, you will automatically get a prompt on Github online to create a PR, which you can press +7. Otherwise, you can go to your fork on Github online and manually create a PR (todo) +(todo): how to name and format your PR, what information you should add to the PR, does not need to be too strict if you are attending the weekly meetings* +8. Github uses the following semantics: `base repository` gets the changes from your `head repository` +9. Therefore, you should set the `base repository` to `vortexgpgpu/vortex` and the `base` branch to `master` since the master branch is protected by reviewed PRs. +10. And you should assign the `head repository` to `<your-username>/vortex` (which represents your fork of vortex) and the `compare` branch to the one created in step 2 +11. Now that your intended PR has been specified, you should review the status. Check for merge conflicts, if all your commits are present, and all the modified files make sense +12. You can still make a PR if there are issues in step 11, just make sure the structure is correct according to steps 8-10 +13. Once the PR is made, the CI pipeline will run automatically, testing your changes +14. Remember, a PR is flexible: if you need to make changes to the code, you can go back to your branch of the fork to commit and push any updates +15. As long as the `head repository`'s `compare` branch is the one you edited, the PR will automatically get the most recent changes +16. 
When all merge conflicts are resolved, changes are made, and tests pass you can have an admin merge your PR -## Creating and Adding Tests -see `testing.md` \ No newline at end of file +## What Makes a Good Contribution? +- If you are contributing code changes, then review [testing.md](./testing.md) to ensure your tests are integrated into the [CI pipeline](continuous_integration.md) +- During a PR, you should consider the advice you are provided by your reviewers. Remember you keep adding commits to an open PR! +- If your change aims to fix an issue opened on Github, please tag that issue in the PR itself \ No newline at end of file diff --git a/docs/debugging.md b/docs/debugging.md index 6e2e14890..840e9cdd2 100644 --- a/docs/debugging.md +++ b/docs/debugging.md @@ -33,7 +33,13 @@ The recommended method to enable debugging is to pass the `--debug` flag to `bla // Running demo program on rtlsim in debug mode $ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1 -A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave] (http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files. +A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. 
+By default all library modules under the /libs/ folder are excluded from the trace to reduce the waveform file size. You can change that behavior by either explicitly commenting out `TRACING_OFF`/`TRACING_ON` inside a lib module source (e.g. VX_stream_buffer.sv) or simply enabling a full trace using the following command. + + // Debugging the demo program with rtlsim in full tracing mode + $ CONFIGS="-DTRACING_ALL" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1 + +You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files. ## FPGA Debugging diff --git a/docs/environment_setup.md b/docs/environment_setup.md index a55060ee5..ccd97c55e 100644 --- a/docs/environment_setup.md +++ b/docs/environment_setup.md @@ -1,16 +1,19 @@ # Environment Setup + These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`. - ## Set Up on Your Own System + The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md). - ## Servers for Georgia Tech Students and Collaborators + ### Volvo + Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. 
If you don't already have access, you can get in contact with your mentor to ask about setting your account up. Setup on Volvo: + 1. Connect to Georgia Tech's VPN or ssh into another machine on campus 2. `ssh volvo.cc.gatech.edu` 3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git` @@ -19,9 +22,11 @@ Setup on Volvo: 6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood` ### Nio + Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio. Setup on Nio: + 1. Connect to Georgia Tech's VPN or ssh into another machine on campus 2. `ssh nio.cc.gatech.edu` 3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git` @@ -29,11 +34,12 @@ Setup on Nio: 5. `make -s` in the `vortex` root directory 6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood` - ## Docker (Experimental) + Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported. ### Setup with Docker + 1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git` 2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo. 3. 
Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .` diff --git a/docs/fpga_setup.md b/docs/fpga_setup.md new file mode 100644 index 000000000..d909d8687 --- /dev/null +++ b/docs/fpga_setup.md @@ -0,0 +1,217 @@ +# FPGA Startup and Configuration Guide + +## Gaining Access to FPGAs with CRNCH +If you are associated with Georgia Tech (or related workshops) you can use CRNCH's server to gain remote access to FPGAs. Otherwise, you can skip to the Xilinx or Intel (Altera) synthesis steps below. + +## What is CRNCH? + +**C**enter for **R**esearch into **N**ovel **C**omputing **H**ierarchies + +## What does CRNCH Offer? + +**The Rogues Gallery (RG)**: new concept focused on developing our understanding of next-generation hardware with a focus on unorthodox and uncommon technologies. **RG** will acquire new and unique hardware (ie, the aforementioned “*rogues*”) from vendors, research labs, and startups and make this hardware available to students, faculty, and industry collaborators within a managed data center environment + +## Why are the Rogues Important? + +By exposing students and researchers to this set of unique hardware, we hope to foster cross-cutting discussions about hardware designs that will drive future *performance improvements in computing long after the Moore’s Law era of “cheap transistors” ends*. Specifically, the Rogues Gallery contains FPGAs which can be synthesized into Vortex hardware. + +## How is the Rogues Gallery Funded? + +Rogues Gallery testbed is primarily supported by the National Science Foundation (NSF) under NSF Award Number [#2016701](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2016701&HistoricalAwards=false) + +## Rogues Gallery Documentation + +You can read about RG in more detail on its official documentation [page](https://gt-crnch-rg.readthedocs.io/en/main/index.html#). 
+ +You can listen to a talk about RG [here](https://mediaspace.gatech.edu/media/Jeff%20Young%20-%20Rogues%20Gallery%20-%20CRNCH%20Summit%202021/1_lqlgr0jj) + +[CRNCH Summit 2023](https://github.com/gt-crnch/crnch-summit-2023/tree/main) + +## Request Access for Rogues Gallery + +You should use [this form](https://crnch-rg.cc.gatech.edu/request-rogues-gallery-access/) to request access to RG’s reconfigurable computing (vortex fpga) resources. You should receive an email with your ticket item being created. Once it gets processed, you should get an email confirming that your access has been granted. It might take some time to get processed. + +## How to Access Rogues Gallery? +There are two methods of accessing CRNCH's Rogues Gallery +1) Web-based GUI: [rg-ood.crnch.gatech.edu](http://rg-ood.crnch.gatech.edu/) +2) SSH: `ssh <your-gt-username>@rg-login.crnch.gatech.edu` + + +## Where should I keep my files? +The CRNCH servers have a folder called `USERSCRATCH` which can be found in your home directory: `echo $HOME`. You should keep all your files in this folder since it is available across all the Rogues Gallery Nodes. + +## **What Machines are Available in the Rogues Gallery?** + +A complete list of machines can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/rg-hardware.html). Furthermore, you can find detailed information about the FPGA hardware [here](https://gt-crnch-rg.readthedocs.io/en/main/reconfig/xilinx/xilinx-getting-started.html). + +## Allocate an FPGA Node +Once you’ve connected to the CRNCH login node, you can use the Slurm scheduler to request an interactive job using `salloc`. This [page](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm.html) explains why we use Slurm to request resources. Documentation for `salloc` can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm-examples.html). And here. 
+ + +To request 16 cores and 64GB of RAM for 6 hours on flubber9, an FPGA dev node: +```bash +salloc -p rg-fpga --nodes=1 --ntasks-per-node=16 --mem=64G --nodelist flubber1 --time=06:00:00 +``` +Synthesis for Xilinx Boards +---------------------- +Once you are logged in, you will need to complete some first time configurations. If you are interested in the Intel (Altera) synthesis steps, scroll down below. + +### Source Configuration Scripts +``` +# From any directory +$ source /opt/xilinx/xrt/setup.sh +$ source /tools/reconfig/xilinx/Vitis/2023.1/settings64.sh +``` + +### Check Installed FPGA Platforms +Run `platforminfo -l`, which tells us the correct name of the platform installed on the current fpga node. It should be used for the `PLATFORM` variable below. Otherwise, if there is an error then there was an issue with the previous two commands. + +### Install Vortex Toolchain +The Xilinx synthesis process requires verilator to generate the bitstream. Eventually, you will need the whole toolchain to run the bitstream on the FPGA. Therefore, the Vortex toolchain can be installed as follows. If you complete these steps properly, you should only need to complete them once and you can skip to `Activate Vortex Toolchain` +``` +# Make a build directory from root and configure scripts for your environment +mkdir build && cd build && ../configure --tooldir=$HOME/tools + +# Install the whole prebuilt toolchain +./ci/toolchain_install.sh --all + +# Add environment variables to bashrc +echo "source /vortex/build/ci/toolchain_env.sh" >> ~/.bashrc +``` + +### Activate Vortex Toolchain +``` +# From any directory +source ~/.bashrc + +# Check environment setup +verilator --version +``` + +### Build the FPGA Bitstream +The root directory contains the path `hw/syn/xilinx/xrt` which has the makefile used to generate the Vortex bitstream. 
+ +``` + $ cd hw/syn/xilinx/xrt + $ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=1 make > build_u250_hw_1c.log 2>&1 & +``` +Will run the synthesis under new build directory: BUILD_DIR := "\\_\\_\" +The generated bitstream will be located under /bin/vortex_afu.xclbin + +For long-running jobs, invocation of this makefile can be made of the following form: + +`[CONFIGS=] [PREFIX=] [NUM_CORES=<#>] TARGET=hw|hw_emu PLATFORM= nohup make > 2>&1 &` + +For example: + +```bash +CONFIGS="-DL2_ENABLE -DDCACHE_SIZE=8192" PREFIX=build_4c_u280 NUM_CORES=4 TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202310_1 nohup make > build_u250_hw_4c.log 2>&1 & +``` + +The build is complete when the bitstream file `vortex_afu.xclbin` exists in `hw|hw_emu/bin`. + +### Running a Program on Xilinx FPGA + +The [blackbox.sh](./simulation.md) script within the build directory can be used to run a test with Vortex’s xrt driver using the following command: + +`FPGA_BIN_DIR= TARGET=hw|hw_emu PLATFORM= ./ci/blackbox.sh --driver=xrt --app=` + +For example: + +```FPGA_BIN_DIR= hw/syn/xilinx/xrt/build_4c_u280_xilinx_u280_gen3x16_xdma_1_202211_1_hw/bin TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202211_1 ./ci/blackbox.sh --driver=xrt --app=demo``` + +Synthesis for Intel (Altera) Boards +---------------------- + +### OPAE Environment Setup + + + $ source /opt/inteldevstack/init_env_user.sh + $ export OPAE_HOME=/opt/opae/1.1.2 + $ export PATH=$OPAE_HOME/bin:$PATH + $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH + $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH + $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH + +### OPAE Build + +The FPGA has to following configuration options: +- DEVICE_FAMILY=arria10 | stratix10 +- NUM_CORES=#n + +Command line: + + $ cd hw/syn/altera/opae + $ PREFIX=test1 TARGET=fpga NUM_CORES=4 make + +A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete. 
+Setting TARGET=ase will build the project for simulation using Intel ASE. + + +### OPAE Build Configuration + +The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured: +- `NUM_WARPS`: Number of warps per core +- `NUM_THREADS`: Number of threads per warp +- `PERF_ENABLE`: enable the use of all profile counters + +You configure the synthesis build from the command line: + + $ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make + +### OPAE Build Progress + +You can check the last 10 lines of the build log for possible errors until the build completes. + + $ tail -n 10 <build_dir>/build.log + +Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs. + + $ ps -u + +If the build fails and you need to restart it, clean up the build folder using the following command: + + $ make clean + +The file `vortex_afu.gbs` should exist when the build is done: + + $ ls -lsa <build_dir>/synth/vortex_afu.gbs + + +### Signing the bitstream and Programming the FPGA + + $ cd <build_dir>/synth + $ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs + $ fpgasupdate vortex_afu_unsigned_ssl.gbs + +### Sample FPGA Run Test +Ensure you have the correct opae runtime for the FPGA target + +``` +$ TARGET=FPGA make -C runtime/opae +``` + +Run the [blackbox.sh](./simulation.md) from your Vortex build directory + +``` +$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128" +``` + +### FPGA sample test running OpenCL sgemm kernel + +You can use the `blackbox.sh` script to run the following from your Vortex build directory + + $ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128" + +### Testing Vortex using OPAE with Intel ASE Simulation +Building ASE synthesis + +```$ TARGET=asesim make -C runtime/opae``` + +Building ASE runtime + +```$ TARGET=asesim make -C runtime/opae``` + +Running ASE
simulation + +```$ ASE_LOG=0 ASE_WORKDIR=/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n16"``` diff --git a/docs/index.md b/docs/index.md index 14a45f335..351e41fbb 100644 --- a/docs/index.md +++ b/docs/index.md @@ -2,32 +2,8 @@ ## Table of Contents -- [Codebase Layout](codebase.md) -- [Microarchitecture](microarchitecture.md) -- [Cache Subsystem](cache_subsystem.md) -- [Software](software.md) -- [Simulation](simulation.md) -- [Altera FPGA Setup Guide](altera_fpga_guide.md) -- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md) -- [Debugging](debugging.md) -- [Useful Links](references.md) - -## Installation - -- For the different environments Vortex supports, [read this document](environment_setup.md). -- To install on your own system, [follow this document](install_vortex.md). - -## Quick Start Scenarios - -Running Vortex simulators with different configurations: -- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads - - $ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic - -- Run demo driver test with opae driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads - - $ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo - -- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads - - $ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood +- [Codebase Layout](codebase.md): Summary of repo file tree +- [Microarchitecture](microarchitecture.md): Vortex Pipeline and cache microarchitectural details and reconfigurability +- [Simulation](simulation.md): Details for building and running each simulation driver +- [Contributing](contributing.md): Process for contributing your own features including repo semantics and testing +- [Debugging](debugging.md): Debugging configurations for each Vortex driver diff --git 
a/docs/microarchitecture.md b/docs/microarchitecture.md index 3459abcc4..85fa52fd5 100644 --- a/docs/microarchitecture.md +++ b/docs/microarchitecture.md @@ -77,4 +77,7 @@ Vortex has a 6-stage pipeline: - Sockets - Grouping multiple cores sharing L1 cache - Clusters - - Grouping of sockets sharing L2 cache \ No newline at end of file + - Grouping of sockets sharing L2 cache + +### Vortex Cache Subsystem +More details about the cache subsystem are provided [here](./cache_subsystem.md). \ No newline at end of file diff --git a/docs/simulation.md b/docs/simulation.md index 86ce1f135..4201a64d4 100644 --- a/docs/simulation.md +++ b/docs/simulation.md @@ -6,13 +6,16 @@ ### Cycle-Approximate Simulation -SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simX` folder. +SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simx` folder. The [readme](README.md) has the most detailed instructions for building and running simX. + +- To install on your own system, [follow this document](install_vortex.md). +- For the different Georgia Tech environments Vortex supports, [read this document](environment_setup.md). ### FGPA Simulation -The current target FPGA for simulation is the Arria10 Intel Accelerator Card v1.0. The guide to build the fpga with specific configurations is located [here.](fpga_setup.md) +The guide to build the fpga with specific configurations is located [here.](fpga_setup.md) You can find instructions for both Xilinx and Altera based FPGAs. -### How to Test +### How to Test (using `blackbox.sh`) Running tests under specific drivers (rtlsim,simx,fpga) is done using the script named `blackbox.sh` located in the `ci` folder. 
Running command `./ci/blackbox.sh --help` from the Vortex root directory will display the following command line arguments for `blackbox.sh`: @@ -47,4 +50,20 @@ PERF: core1: instrs=90693, cycles=53108, IPC=1.707709 PERF: core2: instrs=90849, cycles=53107, IPC=1.710678 PERF: core3: instrs=90836, cycles=50347, IPC=1.804199 PERF: instrs=363180, cycles=53108, IPC=6.838518 -``` \ No newline at end of file +``` + +## Additional Quick Start Scenarios + +Running Vortex simulators with different configurations and drivers is supported. For example: + +- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads + + $ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic + +- Run demo driver test with opae driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads + + $ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo + +- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads + + $ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood \ No newline at end of file diff --git a/docs/testing.md b/docs/testing.md index b2ae8fb2c..739193ce3 100644 --- a/docs/testing.md +++ b/docs/testing.md @@ -2,7 +2,7 @@ ## Running a Vortex application -The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree. +The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree. 
It gets copied into the `build` directory with all the environment variables resolved, so you should run it from the `build` directory as follows: You can query the commandline options of the tool using: $ ./ci/blackbox.sh --help @@ -49,4 +49,4 @@ Compile your test: `$ make -C tests/regression/` Run your test: `$ ./ci/blackbox.sh --driver=simx --app= --debug` ## Adding Your Tests to the CI Pipeline -See `continuous_integration.md` \ No newline at end of file +If you are a contributor, then you will need to add tests that integrate into the continuous integration pipeline. Remember, Pull Requests cannot be merged unless new code has tests and existing tests do not regress. Furthermore, if you are contributing a new feature, it is recommended that you add the ability to enable / disable the new feature that you are adding. See more at [contributing.md](contributing.md) and [continuous_integration.md](continuous_integration.md). \ No newline at end of file diff --git a/docs/xilinx_fpga_guide.md b/docs/xilinx_fpga_guide.md deleted file mode 100644 index f2960deb6..000000000 --- a/docs/xilinx_fpga_guide.md +++ /dev/null @@ -1,36 +0,0 @@ -# FPGA Startup and Configuration Guide - -XRT Environment Setup ----------------------- - - $ source /opt/xilinx/Vitis/2023.1/settings64.sh - $ source /opt/xilinx/xrt/setup.sh - - -Check Installed FPGA Platforms ------------------------------- - - $ platforminfo -l - - -Build FPGA image ----------------- - - $ cd hw/syn/xilinx/xrt - $ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make - -Will run the synthesis under new build directory: BUILD_DIR := "\\_\\_\" - -The generated bitstream will be located under /bin/vortex_afu.xclbin - -Sample FPGA Run Test --------------------- - -Ensure you have the correct opae runtime for the FPGA target - - $ make -C runtime/xrt clean - $ TARGET=hw make -C runtime/xrt - -Run the following from your Vortex build directory - - $ TARGET=hw FPGA_BIN_DIR=/bin 
./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128" \ No newline at end of file diff --git a/hw/dpi/util_dpi.cpp b/hw/dpi/util_dpi.cpp index 020816b0b..d804d4885 100644 --- a/hw/dpi/util_dpi.cpp +++ b/hw/dpi/util_dpi.cpp @@ -47,8 +47,6 @@ extern "C" { void dpi_trace(int level, const char* format, ...); void dpi_trace_start(); void dpi_trace_stop(); - - uint64_t dpi_uuid_gen(bool reset, int wid); } bool sim_trace_enabled(); @@ -204,17 +202,3 @@ void dpi_trace_start() { void dpi_trace_stop() { sim_trace_enable(false); } - -/////////////////////////////////////////////////////////////////////////////// - -std::unordered_map g_uuid_gens; - -uint64_t dpi_uuid_gen(bool reset, int wid) { - if (reset) { - g_uuid_gens.clear(); - return 0; - } - uint32_t instr_uuid = g_uuid_gens[wid]++; - uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid; - return uuid; -} \ No newline at end of file diff --git a/hw/dpi/util_dpi.vh b/hw/dpi/util_dpi.vh index 0da62b041..74b095af1 100644 --- a/hw/dpi/util_dpi.vh +++ b/hw/dpi/util_dpi.vh @@ -30,6 +30,4 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve import "DPI-C" function void dpi_trace_start(); import "DPI-C" function void dpi_trace_stop(); -import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid); - `endif diff --git a/hw/rtl/VX_cluster.sv b/hw/rtl/VX_cluster.sv index 714e69dd4..4af05dc62 100644 --- a/hw/rtl/VX_cluster.sv +++ b/hw/rtl/VX_cluster.sv @@ -24,14 +24,14 @@ module VX_cluster import VX_gpu_pkg::*; #( input wire reset, `ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_if, + input sysmem_perf_t sysmem_perf, `endif // DCRs VX_dcr_bus_if.slave dcr_bus_if, // Memory - VX_mem_bus_if.master mem_bus_if, + VX_mem_bus_if.master mem_bus_if [`L2_MEM_PORTS], // Status output wire busy @@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #( `endif `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_tmp_if(); - assign mem_perf_tmp_if.icache = 'x; - assign mem_perf_tmp_if.dcache = 
'x; - assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; - assign mem_perf_tmp_if.lmem = 'x; - assign mem_perf_tmp_if.mem = mem_perf_if.mem; + cache_perf_t l2_perf; + sysmem_perf_t sysmem_perf_tmp; + always @(*) begin + sysmem_perf_tmp = sysmem_perf; + sysmem_perf_tmp.l2cache = l2_perf; + end `endif `ifdef GBAR_ENABLE @@ -56,23 +56,21 @@ module VX_cluster import VX_gpu_pkg::*; #( VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS](); VX_gbar_bus_if gbar_bus_if(); - `RESET_RELAY (gbar_reset, reset); - VX_gbar_arb #( .NUM_REQS (`NUM_SOCKETS), .OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure ) gbar_arb ( .clk (clk), - .reset (gbar_reset), + .reset (reset), .bus_in_if (per_socket_gbar_bus_if), .bus_out_if (gbar_bus_if) ); VX_gbar_unit #( - .INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID)) + .INSTANCE_ID (`SFORMATF(("gbar%0d", CLUSTER_ID))) ) gbar_unit ( .clk (clk), - .reset (gbar_reset), + .reset (reset), .gbar_bus_if (gbar_bus_if) ); @@ -81,18 +79,19 @@ module VX_cluster import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE (`L1_LINE_SIZE), .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) - ) per_socket_mem_bus_if[`NUM_SOCKETS](); + ) per_socket_mem_bus_if[`NUM_SOCKETS * `L1_MEM_PORTS](); `RESET_RELAY (l2_reset, reset); VX_cache_wrap #( - .INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-l2cache", INSTANCE_ID))), .CACHE_SIZE (`L2_CACHE_SIZE), .LINE_SIZE (`L2_LINE_SIZE), .NUM_BANKS (`L2_NUM_BANKS), .NUM_WAYS (`L2_NUM_WAYS), .WORD_SIZE (L2_WORD_SIZE), .NUM_REQS (L2_NUM_REQS), + .MEM_PORTS (`L2_MEM_PORTS), .CRSQ_SIZE (`L2_CRSQ_SIZE), .MSHR_SIZE (`L2_MSHR_SIZE), .MRSQ_SIZE (`L2_MRSQ_SIZE), @@ -100,17 +99,19 @@ module VX_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (L2_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L2_WRITEBACK), - .DIRTY_BYTES (`L2_WRITEBACK), + .DIRTY_BYTES (`L2_DIRTYBYTES), + .REPL_POLICY (`L2_REPL_POLICY), .UUID_WIDTH (`UUID_WIDTH), - .CORE_OUT_BUF (2), - .MEM_OUT_BUF (2), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), + 
.CORE_OUT_BUF (3), + .MEM_OUT_BUF (3), .NC_ENABLE (1), .PASSTHRU (!`L2_ENABLED) ) l2cache ( .clk (clk), .reset (l2_reset), `ifdef PERF_ENABLE - .cache_perf (mem_perf_tmp_if.l2cache), + .cache_perf (l2_perf), `endif .core_bus_if (per_socket_mem_bus_if), .mem_bus_if (mem_bus_if) @@ -118,24 +119,20 @@ module VX_cluster import VX_gpu_pkg::*; #( /////////////////////////////////////////////////////////////////////////// - VX_dcr_bus_if socket_dcr_bus_tmp_if(); - assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END); - assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr; - assign socket_dcr_bus_tmp_if.write_data = dcr_bus_if.write_data; - wire [`NUM_SOCKETS-1:0] per_socket_busy; - VX_dcr_bus_if socket_dcr_bus_if(); - `BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1)); - // Generate all sockets - for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : sockets + for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : g_sockets `RESET_RELAY (socket_reset, reset); + VX_dcr_bus_if socket_dcr_bus_if(); + wire is_base_dcr_addr = (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END); + `BUFFER_DCR_BUS_IF (socket_dcr_bus_if, dcr_bus_if, is_base_dcr_addr, (`NUM_SOCKETS > 1)) + VX_socket #( .SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id), - .INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id)) + .INSTANCE_ID (`SFORMATF(("%s-socket%0d", INSTANCE_ID, socket_id))) ) socket ( `SCOPE_IO_BIND (scope_socket+socket_id) @@ -143,12 +140,12 @@ module VX_cluster import VX_gpu_pkg::*; #( .reset (socket_reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_tmp_if), + .sysmem_perf (sysmem_perf_tmp), `endif .dcr_bus_if (socket_dcr_bus_if), - .mem_bus_if (per_socket_mem_bus_if[socket_id]), + .mem_bus_if (per_socket_mem_bus_if[socket_id * 
`L1_MEM_PORTS +: `L1_MEM_PORTS]), `ifdef GBAR_ENABLE .gbar_bus_if (per_socket_gbar_bus_if[socket_id]), @@ -158,6 +155,6 @@ module VX_cluster import VX_gpu_pkg::*; #( ); end - `BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1)); + `BUFFER_EX(busy, (| per_socket_busy), 1'b1, 1, (`NUM_SOCKETS > 1)); endmodule diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 8d1c280fd..c61b1d5e9 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -31,7 +31,6 @@ `endif /////////////////////////////////////////////////////////////////////////////// - `ifndef EXT_M_DISABLE `define EXT_M_ENABLE `endif @@ -86,6 +85,10 @@ `endif `endif +`ifndef VLEN +`define VLEN 256 +`endif + `ifndef NUM_CLUSTERS `define NUM_CLUSTERS 1 `endif @@ -110,6 +113,24 @@ `define SOCKET_SIZE `MIN(4, `NUM_CORES) `endif +// Size of Tensor Core +`ifndef TC_SIZE +`define TC_SIZE 8 +`endif + +// Number of TCs per Warp +`ifndef TC_NUM +`define TC_NUM 4 +`endif + +`ifndef NUM_TCU_LANES +`define NUM_TCU_LANES `TC_NUM +`endif + +`ifndef NUM_TCU_BLOCKS +`define NUM_TCU_BLOCKS `ISSUE_WIDTH +`endif + `ifdef L2_ENABLE `define L2_ENABLED 1 `else @@ -151,6 +172,28 @@ `define L3_LINE_SIZE `MEM_BLOCK_SIZE `endif +// Platform memory parameters + +`ifndef PLATFORM_MEMORY_NUM_BANKS +`define PLATFORM_MEMORY_NUM_BANKS 2 +`endif + +`ifndef PLATFORM_MEMORY_ADDR_WIDTH +`ifdef XLEN_64 + `define PLATFORM_MEMORY_ADDR_WIDTH 48 +`else + `define PLATFORM_MEMORY_ADDR_WIDTH 32 +`endif +`endif + +`ifndef PLATFORM_MEMORY_DATA_SIZE +`define PLATFORM_MEMORY_DATA_SIZE 64 +`endif + +`ifndef PLATFORM_MEMORY_INTERLEAVE +`define PLATFORM_MEMORY_INTERLEAVE 1 +`endif + `ifdef XLEN_64 `ifndef STACK_BASE_ADDR @@ -169,7 +212,14 @@ `define IO_BASE_ADDR 64'h000000040 `endif -`else +`ifdef VM_ENABLE +`ifndef PAGE_TABLE_BASE_ADDR +`define PAGE_TABLE_BASE_ADDR 64'h0F0000000 +`endif + +`endif + +`else // XLEN_32 `ifndef STACK_BASE_ADDR `define STACK_BASE_ADDR 32'hFFFF0000 @@ -187,6 +237,13 @@ `define IO_BASE_ADDR 32'h00000040 `endif 
+`ifdef VM_ENABLE +`ifndef PAGE_TABLE_BASE_ADDR +`define PAGE_TABLE_BASE_ADDR 32'hF0000000 +`endif + +`endif + `endif `define IO_END_ADDR `USER_BASE_ADDR @@ -214,15 +271,17 @@ `endif `define STACK_SIZE (1 << `STACK_LOG2_SIZE) -`define RESET_DELAY 8 +`define RESET_DELAY 8 `ifndef STALL_TIMEOUT `define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED))) `endif `ifndef SV_DPI +`ifndef DPI_DISABLE `define DPI_DISABLE `endif +`endif `ifndef FPU_FPNEW `ifndef FPU_DSP @@ -251,6 +310,59 @@ `define DEBUG_LEVEL 3 `endif +`ifndef MEM_PAGE_SIZE +`define MEM_PAGE_SIZE (4096) +`endif +`ifndef MEM_PAGE_LOG2_SIZE +`define MEM_PAGE_LOG2_SIZE (12) +`endif + +// Virtual Memory Configuration /////////////////////////////////////////////////////// +`ifdef VM_ENABLE + `ifdef XLEN_32 + `ifndef VM_ADDR_MODE + `define VM_ADDR_MODE SV32 //or BARE + `endif + `ifndef PT_LEVEL + `define PT_LEVEL (2) + `endif + `ifndef PTE_SIZE + `define PTE_SIZE (4) + `endif + `ifndef NUM_PTE_ENTRY + `define NUM_PTE_ENTRY (1024) + `endif + `ifndef PT_SIZE_LIMIT + `define PT_SIZE_LIMIT (1<<23) + `endif + `else + `ifndef VM_ADDR_MODE + `define VM_ADDR_MODE SV39 //or BARE + `endif + `ifndef PT_LEVEL + `define PT_LEVEL (3) + `endif + `ifndef PTE_SIZE + `define PTE_SIZE (8) + `endif + `ifndef NUM_PTE_ENTRY + `define NUM_PTE_ENTRY (512) + `endif + `ifndef PT_SIZE_LIMIT + `define PT_SIZE_LIMIT (1<<25) + `endif + `endif + + `ifndef PT_SIZE + `define PT_SIZE MEM_PAGE_SIZE + `endif + + `ifndef TLB_SIZE + `define TLB_SIZE (32) + `endif + +`endif + // Pipeline Configuration ///////////////////////////////////////////////////// // Issue width @@ -478,7 +590,16 @@ // Number of Associative Ways `ifndef ICACHE_NUM_WAYS -`define ICACHE_NUM_WAYS 1 +`define ICACHE_NUM_WAYS 4 +`endif + +// Replacement Policy +`ifndef ICACHE_REPL_POLICY +`define ICACHE_REPL_POLICY 1 +`endif + +`ifndef ICACHE_MEM_PORTS +`define ICACHE_MEM_PORTS 1 `endif // Dcache Configurable Knobs ////////////////////////////////////////////////// @@ 
-507,7 +628,7 @@ // Number of Banks `ifndef DCACHE_NUM_BANKS -`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4) +`define DCACHE_NUM_BANKS `MIN(DCACHE_NUM_REQS, 16) `endif // Core Response Queue Size @@ -527,12 +648,12 @@ // Memory Response Queue Size `ifndef DCACHE_MRSQ_SIZE -`define DCACHE_MRSQ_SIZE 0 +`define DCACHE_MRSQ_SIZE 4 `endif // Number of Associative Ways `ifndef DCACHE_NUM_WAYS -`define DCACHE_NUM_WAYS 1 +`define DCACHE_NUM_WAYS 4 `endif // Enable Cache Writeback @@ -540,6 +661,25 @@ `define DCACHE_WRITEBACK 0 `endif +// Enable Cache Dirty bytes +`ifndef DCACHE_DIRTYBYTES +`define DCACHE_DIRTYBYTES `DCACHE_WRITEBACK +`endif + +// Replacement Policy +`ifndef DCACHE_REPL_POLICY +`define DCACHE_REPL_POLICY 1 +`endif + +// Number of Memory Ports +`ifndef L1_MEM_PORTS +`ifdef L1_DISABLE +`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS) +`else +`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS) +`endif +`endif + // LMEM Configurable Knobs //////////////////////////////////////////////////// `ifndef LMEM_DISABLE @@ -562,16 +702,12 @@ // Cache Size `ifndef L2_CACHE_SIZE -`ifdef ALTERA_S10 -`define L2_CACHE_SIZE 2097152 -`else `define L2_CACHE_SIZE 1048576 `endif -`endif // Number of Banks `ifndef L2_NUM_BANKS -`define L2_NUM_BANKS `MIN(4, `NUM_SOCKETS) +`define L2_NUM_BANKS `MIN(L2_NUM_REQS, 16) `endif // Core Response Queue Size @@ -591,12 +727,12 @@ // Memory Response Queue Size `ifndef L2_MRSQ_SIZE -`define L2_MRSQ_SIZE 0 +`define L2_MRSQ_SIZE 4 `endif // Number of Associative Ways `ifndef L2_NUM_WAYS -`define L2_NUM_WAYS 2 +`define L2_NUM_WAYS 8 `endif // Enable Cache Writeback @@ -604,20 +740,35 @@ `define L2_WRITEBACK 0 `endif +// Enable Cache Dirty bytes +`ifndef L2_DIRTYBYTES +`define L2_DIRTYBYTES `L2_WRITEBACK +`endif + +// Replacement Policy +`ifndef L2_REPL_POLICY +`define L2_REPL_POLICY 1 +`endif + +// Number of Memory Ports +`ifndef L2_MEM_PORTS +`ifdef L2_ENABLE +`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, 
`PLATFORM_MEMORY_NUM_BANKS) +`else +`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS) +`endif +`endif + // L3cache Configurable Knobs ///////////////////////////////////////////////// // Cache Size `ifndef L3_CACHE_SIZE -`ifdef ALTERA_S10 `define L3_CACHE_SIZE 2097152 -`else -`define L3_CACHE_SIZE 1048576 -`endif `endif // Number of Banks `ifndef L3_NUM_BANKS -`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS) +`define L3_NUM_BANKS `MIN(L3_NUM_REQS, 16) `endif // Core Response Queue Size @@ -637,12 +788,12 @@ // Memory Response Queue Size `ifndef L3_MRSQ_SIZE -`define L3_MRSQ_SIZE 0 +`define L3_MRSQ_SIZE 4 `endif // Number of Associative Ways `ifndef L3_NUM_WAYS -`define L3_NUM_WAYS 4 +`define L3_NUM_WAYS 8 `endif // Enable Cache Writeback @@ -650,6 +801,25 @@ `define L3_WRITEBACK 0 `endif +// Enable Cache Dirty bytes +`ifndef L3_DIRTYBYTES +`define L3_DIRTYBYTES `L3_WRITEBACK +`endif + +// Replacement Policy +`ifndef L3_REPL_POLICY +`define L3_REPL_POLICY 1 +`endif + +// Number of Memory Ports +`ifndef L3_MEM_PORTS +`ifdef L3_ENABLE +`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS) +`else +`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS) +`endif +`endif + // ISA Extensions ///////////////////////////////////////////////////////////// `ifdef EXT_A_ENABLE @@ -682,6 +852,12 @@ `define EXT_M_ENABLED 0 `endif +`ifdef EXT_V_ENABLE + `define EXT_V_ENABLED 1 +`else + `define EXT_V_ENABLED 0 +`endif + `ifdef EXT_ZICOND_ENABLE `define EXT_ZICOND_ENABLED 1 `else @@ -698,7 +874,7 @@ `define ISA_STD_N 13 `define ISA_STD_Q 16 `define ISA_STD_S 18 -`define ISA_STD_U 20 +`define ISA_STD_V 21 `define ISA_EXT_ICACHE 0 `define ISA_EXT_DCACHE 1 @@ -735,7 +911,7 @@ | (0 << 18) /* S - Supervisor mode implemented */ \ | (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \ | (1 << 20) /* U - User mode implemented */ \ - | (0 << 21) /* V - Tentatively reserved for Vector extension */ \ + | (`EXT_V_ENABLED << 21) /* V 
- Tentatively reserved for Vector extension */ \ | (0 << 22) /* W - Reserved */ \ | (1 << 23) /* X - Non-standard extensions present */ \ | (0 << 24) /* Y - Reserved */ \ diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index 686124c16..8e6e4efae 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -50,10 +50,16 @@ `define PERF_CTR_BITS 44 `ifndef NDEBUG +`define UUID_ENABLE +`define UUID_WIDTH 44 +`else +`ifdef SCOPE +`define UUID_ENABLE `define UUID_WIDTH 44 `else `define UUID_WIDTH 1 `endif +`endif `define PC_BITS (`XLEN-1) `define OFFSET_BITS 12 @@ -227,22 +233,19 @@ `define INST_FENCE_D 1'h0 `define INST_FENCE_I 1'h1 -`define INST_FPU_ADD 4'b0000 -`define INST_FPU_SUB 4'b0001 -`define INST_FPU_MUL 4'b0010 -`define INST_FPU_DIV 4'b0011 -`define INST_FPU_SQRT 4'b0100 -`define INST_FPU_CMP 4'b0101 // frm: LE=0, LT=1, EQ=2 -`define INST_FPU_F2F 4'b0110 -`define INST_FPU_MISC 4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7 -`define INST_FPU_F2I 4'b1000 -`define INST_FPU_F2U 4'b1001 -`define INST_FPU_I2F 4'b1010 -`define INST_FPU_U2F 4'b1011 -`define INST_FPU_MADD 4'b1100 -`define INST_FPU_MSUB 4'b1101 -`define INST_FPU_NMSUB 4'b1110 -`define INST_FPU_NMADD 4'b1111 +`define INST_FPU_ADD 4'b0000 // SUB=fmt[1] +`define INST_FPU_MUL 4'b0001 +`define INST_FPU_MADD 4'b0010 // SUB=fmt[1] +`define INST_FPU_NMADD 4'b0011 // SUB=fmt[1] +`define INST_FPU_DIV 4'b0100 +`define INST_FPU_SQRT 4'b0101 +`define INST_FPU_F2I 4'b1000 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1 +`define INST_FPU_F2U 4'b1001 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1 +`define INST_FPU_I2F 4'b1010 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1 +`define INST_FPU_U2F 4'b1011 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1 +`define INST_FPU_CMP 4'b1100 // frm: LE=0, LT=1, EQ=2 +`define INST_FPU_F2F 4'b1101 // fmt[0]: F32=0, F64=1 +`define INST_FPU_MISC 4'b1110 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7 `define INST_FPU_BITS 4 
`define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3) `define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4) @@ -267,14 +270,14 @@ /////////////////////////////////////////////////////////////////////////////// -`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \ - (`CLOG2(mshr_size) + `CLOG2(num_banks)) +`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width) \ + (uuid_width + `CLOG2(mshr_size) + `CLOG2(`CDIV(num_banks, mem_ports))) -`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \ - (`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width) +`define CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width) \ + (`CLOG2(`CDIV(num_reqs, mem_ports)) + `CLOG2(line_size / word_size) + tag_width) -`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \ - (`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1) +`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, uuid_width) \ + (`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width)) + 1) /////////////////////////////////////////////////////////////////////////////// @@ -284,14 +287,14 @@ `define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \ (tag_width + `ARB_SEL_BITS(`UP(num_caches), 1)) -`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \ - `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches) +`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, num_caches, uuid_width) \ + `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), num_caches) -`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ - 
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches) +`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches) \ + `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches) -`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ - `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches) +`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \ + `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches) /////////////////////////////////////////////////////////////////////////////// @@ -303,11 +306,12 @@ `define L1_ENABLE `endif -`define ADDR_TYPE_FLUSH 0 -`define ADDR_TYPE_IO 1 -`define ADDR_TYPE_LOCAL 2 // shoud be last since optional -`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED) +`define MEM_REQ_FLAG_FLUSH 0 +`define MEM_REQ_FLAG_IO 1 +`define MEM_REQ_FLAG_LOCAL 2 // shoud be last since optional +`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED) +`define VX_MEM_PORTS `L3_MEM_PORTS `define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE `define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE)) `define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8) @@ -320,12 +324,23 @@ /////////////////////////////////////////////////////////////////////////////// -`define BUFFER_EX(dst, src, ena, latency) \ +`define NEG_EDGE(dst, src) \ + 
VX_edge_trigger #( \ + .POS (0), \ + .INIT (0) \ + ) __neg_edge`__LINE__ ( \ + .clk (clk), \ + .reset (1'b0), \ + .data_in (src), \ + .data_out (dst) \ + ) + +`define BUFFER_EX(dst, src, ena, resetw, latency) \ VX_pipe_register #( \ .DATAW ($bits(dst)), \ - .RESETW ($bits(dst)), \ + .RESETW (resetw), \ .DEPTH (latency) \ - ) __``dst``__ ( \ + ) __buffer_ex`__LINE__ ( \ .clk (clk), \ .reset (reset), \ .enable (ena), \ @@ -333,13 +348,13 @@ .data_out (dst) \ ) -`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1) +`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1) `define POP_COUNT_EX(out, in, model) \ VX_popcount #( \ .N ($bits(in)), \ .MODEL (model) \ - ) __``out``__ ( \ + ) __pop_count_ex`__LINE__ ( \ .data_in (in), \ .data_out (out) \ ) @@ -359,50 +374,114 @@ assign src.rsp_data = dst.rsp_data; \ assign dst.rsp_ready = src.rsp_ready -`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \ +`define ASSIGN_VX_MEM_BUS_RO_IF(dst, src) \ assign dst.req_valid = src.req_valid; \ - assign dst.req_data.rw = src.req_data.rw; \ - assign dst.req_data.byteen = src.req_data.byteen; \ + assign dst.req_data.rw = 0; \ assign dst.req_data.addr = src.req_data.addr; \ - assign dst.req_data.atype = src.req_data.atype; \ - assign dst.req_data.data = src.req_data.data; \ - if (TD != TS) \ - assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \ - else \ - assign dst.req_data.tag = src.req_data.tag; \ + assign dst.req_data.data = '0; \ + assign dst.req_data.byteen = '1; \ + assign dst.req_data.flags = src.req_data.flags; \ + assign dst.req_data.tag = src.req_data.tag; \ assign src.req_ready = dst.req_ready; \ assign src.rsp_valid = dst.rsp_valid; \ assign src.rsp_data.data = dst.rsp_data.data; \ - assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \ + assign src.rsp_data.tag = dst.rsp_data.tag; \ assign dst.rsp_ready = src.rsp_ready -`define ASSIGN_VX_LSU_MEM_IF(dst, src) \ - assign dst.req_valid = src.req_valid; \ - assign dst.req_data = src.req_data; \ - 
assign src.req_ready = dst.req_ready; \ - assign src.rsp_valid = dst.rsp_valid; \ - assign src.rsp_data = dst.rsp_data; \ - assign dst.rsp_ready = src.rsp_ready - -`define BUFFER_DCR_BUS_IF(dst, src, enable) \ - if (enable) begin \ - reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \ - always @(posedge clk) begin \ - __dst <= {src.write_valid, src.write_addr, src.write_data}; \ +`define ASSIGN_VX_MEM_BUS_IF_EX(dst, src, TD, TS, UUID) \ + assign dst.req_valid = src.req_valid; \ + assign dst.req_data.rw = src.req_data.rw; \ + assign dst.req_data.addr = src.req_data.addr; \ + assign dst.req_data.data = src.req_data.data; \ + assign dst.req_data.byteen = src.req_data.byteen; \ + assign dst.req_data.flags = src.req_data.flags; \ + /* verilator lint_off GENUNNAMED */ \ + if (TD != TS) begin \ + if (UUID != 0) begin \ + if (TD > TS) begin \ + assign dst.req_data.tag = {src.req_data.tag.uuid, {(TD-TS){1'b0}}, src.req_data.tag.value}; \ + end else begin \ + assign dst.req_data.tag = {src.req_data.tag.uuid, src.req_data.tag.value[TD-UUID-1:0]}; \ + end \ + end else begin \ + if (TD > TS) begin \ + assign dst.req_data.tag = {{(TD-TS){1'b0}}, src.req_data.tag}; \ + end else begin \ + assign dst.req_data.tag = src.req_data.tag[TD-1:0]; \ + end \ end \ - assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \ end else begin \ - assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \ - end + assign dst.req_data.tag = src.req_data.tag; \ + end \ + /* verilator lint_on GENUNNAMED */ \ + assign src.req_ready = dst.req_ready; \ + assign src.rsp_valid = dst.rsp_valid; \ + assign src.rsp_data.data = dst.rsp_data.data; \ + /* verilator lint_off GENUNNAMED */ \ + if (TD != TS) begin \ + if (UUID != 0) begin \ + if (TD > TS) begin \ + assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, dst.rsp_data.tag.value[TS-UUID-1:0]}; \ + end else begin \ + assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, 
{(TS-TD){1'b0}}, dst.rsp_data.tag.value}; \ + end \ + end else begin \ + if (TD > TS) begin \ + assign src.rsp_data.tag = dst.rsp_data.tag[TS-1:0]; \ + end else begin \ + assign src.rsp_data.tag = {{(TS-TD){1'b0}}, dst.rsp_data.tag}; \ + end \ + end \ + end else begin \ + assign src.rsp_data.tag = dst.rsp_data.tag; \ + end \ + /* verilator lint_on GENUNNAMED */ \ + assign dst.rsp_ready = src.rsp_ready + +`define INIT_VX_MEM_BUS_IF(itf) \ + assign itf.req_valid = 0; \ + assign itf.req_data = '0; \ + `UNUSED_VAR (itf.req_ready) \ + `UNUSED_VAR (itf.rsp_valid) \ + `UNUSED_VAR (itf.rsp_data) \ + assign itf.rsp_ready = 0; + +`define UNUSED_VX_MEM_BUS_IF(itf) \ + `UNUSED_VAR (itf.req_valid) \ + `UNUSED_VAR (itf.req_data) \ + assign itf.req_ready = 0; \ + assign itf.rsp_valid = 0; \ + assign itf.rsp_data = '0; \ + `UNUSED_VAR (itf.rsp_ready) + + +`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \ + /* verilator lint_off GENUNNAMED */ \ + if (latency != 0) begin \ + VX_pipe_register #( \ + .DATAW (1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH), \ + .DEPTH (latency) \ + ) pipe_reg ( \ + .clk (clk), \ + .reset (1'b0), \ + .enable (1'b1), \ + .data_in ({src.write_valid && ena, src.write_addr, src.write_data}), \ + .data_out ({dst.write_valid, dst.write_addr, dst.write_data}) \ + ); \ + end else begin \ + assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid && ena, src.write_addr, src.write_data}; \ + end \ + /* verilator lint_on GENUNNAMED */ `define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \ + /* verilator lint_off GENUNNAMED */ \ if (count > 1) begin \ wire [count-1:0][width-1:0] __reduce_add_i_field; \ wire [width-1:0] __reduce_add_o_field; \ for (genvar __i = 0; __i < count; ++__i) begin \ assign __reduce_add_i_field[__i] = src[__i].``field; \ end \ - VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \ + VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \ __reduce_add_i_field, \ 
__reduce_add_o_field \ ); \ @@ -421,9 +500,11 @@ end \ end else begin \ assign dst.``field = src[0].``field; \ - end + end \ + /* verilator lint_on GENUNNAMED */ `define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \ + /* verilator lint_off GENUNNAMED */ \ if (block_size != 1) begin \ if (block_size != `NUM_WARPS) begin \ assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \ @@ -432,6 +513,7 @@ end \ end else begin \ assign dst = src; \ - end + end \ + /* verilator lint_on GENUNNAMED */ `endif // VX_DEFINE_VH diff --git a/hw/rtl/VX_gpu_pkg.sv b/hw/rtl/VX_gpu_pkg.sv index 393f2a66f..fe22f0846 100644 --- a/hw/rtl/VX_gpu_pkg.sv +++ b/hw/rtl/VX_gpu_pkg.sv @@ -73,6 +73,17 @@ package VX_gpu_pkg; logic [`PERF_CTR_BITS-1:0] crsp_stalls; } cache_perf_t; + typedef struct packed { + logic [`PERF_CTR_BITS-1:0] reads; + logic [`PERF_CTR_BITS-1:0] writes; + logic [`PERF_CTR_BITS-1:0] bank_stalls; + logic [`PERF_CTR_BITS-1:0] crsp_stalls; + } lmem_perf_t; + + typedef struct packed { + logic [`PERF_CTR_BITS-1:0] misses; + } coalescer_perf_t; + typedef struct packed { logic [`PERF_CTR_BITS-1:0] reads; logic [`PERF_CTR_BITS-1:0] writes; @@ -92,6 +103,26 @@ package VX_gpu_pkg; logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses; } issue_perf_t; + typedef struct packed { + cache_perf_t icache; + cache_perf_t dcache; + cache_perf_t l2cache; + cache_perf_t l3cache; + lmem_perf_t lmem; + coalescer_perf_t coalescer; + mem_perf_t mem; + } sysmem_perf_t; + + typedef struct packed { + sched_perf_t sched; + issue_perf_t issue; + logic [`PERF_CTR_BITS-1:0] ifetches; + logic [`PERF_CTR_BITS-1:0] loads; + logic [`PERF_CTR_BITS-1:0] stores; + logic [`PERF_CTR_BITS-1:0] ifetch_latency; + logic [`PERF_CTR_BITS-1:0] load_latency; + } pipeline_perf_t; + //////////////////////// instruction arguments //////////////////////////// typedef struct packed { @@ -145,6 +176,7 @@ package VX_gpu_pkg; localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + 
`CLOG2(LSU_MEM_BATCHES)); localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS); localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES; + localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS); ////////////////////////// Icache Parameters ////////////////////////////// @@ -166,9 +198,9 @@ package VX_gpu_pkg; // Memory request tag bits `ifdef ICACHE_ENABLE - localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES); + localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, 1, `NUM_ICACHES, `UUID_WIDTH); `else - localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES); + localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, 1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES); `endif ////////////////////////// Dcache Parameters ////////////////////////////// @@ -180,7 +212,7 @@ package VX_gpu_pkg; // Block size in bytes localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE; - // Input request size + // Input request size (using coalesced memory blocks) localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE); localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS; @@ -197,26 +229,27 @@ package VX_gpu_pkg; // Memory request tag bits `ifdef DCACHE_ENABLE - localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES); + localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH); `else - localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, 
DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES); + localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES); `endif /////////////////////////////// L1 Parameters ///////////////////////////// + // arbitrate between icache and dcache localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2)); /////////////////////////////// L2 Parameters ///////////////////////////// - localparam ICACHE_MEM_ARB_IDX = 0; - localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1; + localparam ICACHE_MEM_ARB_IDX = 0; + localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1; // Word size in bytes localparam L2_WORD_SIZE = `L1_LINE_SIZE; // Input request size - localparam L2_NUM_REQS = `NUM_SOCKETS; + localparam L2_NUM_REQS = `NUM_SOCKETS * `L1_MEM_PORTS; // Core request tag bits localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH; @@ -226,9 +259,9 @@ package VX_gpu_pkg; // Memory request tag bits `ifdef L2_ENABLE - localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH); + localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH); `else - localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH); + localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH); `endif /////////////////////////////// L3 Parameters ///////////////////////////// @@ -237,7 +270,7 @@ package VX_gpu_pkg; localparam L3_WORD_SIZE = `L2_LINE_SIZE; // Input request size - localparam L3_NUM_REQS = `NUM_CLUSTERS; + localparam L3_NUM_REQS = `NUM_CLUSTERS * `L2_MEM_PORTS; // Core request tag bits 
localparam L3_TAG_WIDTH = L2_MEM_TAG_WIDTH; @@ -247,9 +280,9 @@ package VX_gpu_pkg; // Memory request tag bits `ifdef L3_ENABLE - localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH); + localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH); `else - localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH); + localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH); `endif /////////////////////////////// Issue parameters ////////////////////////// @@ -308,6 +341,430 @@ package VX_gpu_pkg; `IGNORE_UNUSED_END +////////////////////////////////// Tracing //////////////////////////////////// + +`ifdef SIMULATION + +`ifdef SV_DPI + import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/); +`endif + + task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type); + case (ex_type) + `EX_ALU: `TRACE(level, ("ALU")) + `EX_LSU: `TRACE(level, ("LSU")) + `EX_SFU: `TRACE(level, ("SFU")) + `ifdef EXT_F_ENABLE + `EX_FPU: `TRACE(level, ("FPU")) + `endif + default: `TRACE(level, ("?")) + endcase + endtask + + task trace_ex_op(input int level, + input [`EX_BITS-1:0] ex_type, + input [`INST_OP_BITS-1:0] op_type, + input VX_gpu_pkg::op_args_t op_args + ); + case (ex_type) + `EX_ALU: begin + case (op_args.alu.xtype) + `ALU_TYPE_ARITH: begin + if (op_args.alu.is_w) begin + if (op_args.alu.use_imm) begin + case (`INST_ALU_BITS'(op_type)) + `INST_ALU_ADD: `TRACE(level, ("ADDIW")) + `INST_ALU_SLL: `TRACE(level, ("SLLIW")) + `INST_ALU_SRL: `TRACE(level, ("SRLIW")) + `INST_ALU_SRA: `TRACE(level, ("SRAIW")) + default: `TRACE(level, ("?")) + endcase + end else begin + case (`INST_ALU_BITS'(op_type)) + `INST_ALU_ADD: `TRACE(level, 
("ADDW")) + `INST_ALU_SUB: `TRACE(level, ("SUBW")) + `INST_ALU_SLL: `TRACE(level, ("SLLW")) + `INST_ALU_SRL: `TRACE(level, ("SRLW")) + `INST_ALU_SRA: `TRACE(level, ("SRAW")) + default: `TRACE(level, ("?")) + endcase + end + end else begin + if (op_args.alu.use_imm) begin + case (`INST_ALU_BITS'(op_type)) + `INST_ALU_ADD: `TRACE(level, ("ADDI")) + `INST_ALU_SLL: `TRACE(level, ("SLLI")) + `INST_ALU_SRL: `TRACE(level, ("SRLI")) + `INST_ALU_SRA: `TRACE(level, ("SRAI")) + `INST_ALU_SLT: `TRACE(level, ("SLTI")) + `INST_ALU_SLTU: `TRACE(level, ("SLTIU")) + `INST_ALU_XOR: `TRACE(level, ("XORI")) + `INST_ALU_OR: `TRACE(level, ("ORI")) + `INST_ALU_AND: `TRACE(level, ("ANDI")) + `INST_ALU_LUI: `TRACE(level, ("LUI")) + `INST_ALU_AUIPC: `TRACE(level, ("AUIPC")) + default: `TRACE(level, ("?")) + endcase + end else begin + case (`INST_ALU_BITS'(op_type)) + `INST_ALU_ADD: `TRACE(level, ("ADD")) + `INST_ALU_SUB: `TRACE(level, ("SUB")) + `INST_ALU_SLL: `TRACE(level, ("SLL")) + `INST_ALU_SRL: `TRACE(level, ("SRL")) + `INST_ALU_SRA: `TRACE(level, ("SRA")) + `INST_ALU_SLT: `TRACE(level, ("SLT")) + `INST_ALU_SLTU: `TRACE(level, ("SLTU")) + `INST_ALU_XOR: `TRACE(level, ("XOR")) + `INST_ALU_OR: `TRACE(level, ("OR")) + `INST_ALU_AND: `TRACE(level, ("AND")) + `INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ")) + `INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ")) + default: `TRACE(level, ("?")) + endcase + end + end + end + `ALU_TYPE_BRANCH: begin + case (`INST_BR_BITS'(op_type)) + `INST_BR_EQ: `TRACE(level, ("BEQ")) + `INST_BR_NE: `TRACE(level, ("BNE")) + `INST_BR_LT: `TRACE(level, ("BLT")) + `INST_BR_GE: `TRACE(level, ("BGE")) + `INST_BR_LTU: `TRACE(level, ("BLTU")) + `INST_BR_GEU: `TRACE(level, ("BGEU")) + `INST_BR_JAL: `TRACE(level, ("JAL")) + `INST_BR_JALR: `TRACE(level, ("JALR")) + `INST_BR_ECALL: `TRACE(level, ("ECALL")) + `INST_BR_EBREAK:`TRACE(level, ("EBREAK")) + `INST_BR_URET: `TRACE(level, ("URET")) + `INST_BR_SRET: `TRACE(level, ("SRET")) + `INST_BR_MRET: `TRACE(level, ("MRET")) + default: 
`TRACE(level, ("?")) + endcase + end + `ALU_TYPE_MULDIV: begin + if (op_args.alu.is_w) begin + case (`INST_M_BITS'(op_type)) + `INST_M_MUL: `TRACE(level, ("MULW")) + `INST_M_DIV: `TRACE(level, ("DIVW")) + `INST_M_DIVU: `TRACE(level, ("DIVUW")) + `INST_M_REM: `TRACE(level, ("REMW")) + `INST_M_REMU: `TRACE(level, ("REMUW")) + default: `TRACE(level, ("?")) + endcase + end else begin + case (`INST_M_BITS'(op_type)) + `INST_M_MUL: `TRACE(level, ("MUL")) + `INST_M_MULH: `TRACE(level, ("MULH")) + `INST_M_MULHSU:`TRACE(level, ("MULHSU")) + `INST_M_MULHU: `TRACE(level, ("MULHU")) + `INST_M_DIV: `TRACE(level, ("DIV")) + `INST_M_DIVU: `TRACE(level, ("DIVU")) + `INST_M_REM: `TRACE(level, ("REM")) + `INST_M_REMU: `TRACE(level, ("REMU")) + default: `TRACE(level, ("?")) + endcase + end + end + default: `TRACE(level, ("?")) + endcase + end + `EX_LSU: begin + if (op_args.lsu.is_float) begin + case (`INST_LSU_BITS'(op_type)) + `INST_LSU_LW: `TRACE(level, ("FLW")) + `INST_LSU_LD: `TRACE(level, ("FLD")) + `INST_LSU_SW: `TRACE(level, ("FSW")) + `INST_LSU_SD: `TRACE(level, ("FSD")) + default: `TRACE(level, ("?")) + endcase + end else begin + case (`INST_LSU_BITS'(op_type)) + `INST_LSU_LB: `TRACE(level, ("LB")) + `INST_LSU_LH: `TRACE(level, ("LH")) + `INST_LSU_LW: `TRACE(level, ("LW")) + `INST_LSU_LD: `TRACE(level, ("LD")) + `INST_LSU_LBU:`TRACE(level, ("LBU")) + `INST_LSU_LHU:`TRACE(level, ("LHU")) + `INST_LSU_LWU:`TRACE(level, ("LWU")) + `INST_LSU_SB: `TRACE(level, ("SB")) + `INST_LSU_SH: `TRACE(level, ("SH")) + `INST_LSU_SW: `TRACE(level, ("SW")) + `INST_LSU_SD: `TRACE(level, ("SD")) + `INST_LSU_FENCE:`TRACE(level,("FENCE")) + default: `TRACE(level, ("?")) + endcase + end + end + `EX_SFU: begin + case (`INST_SFU_BITS'(op_type)) + `INST_SFU_TMC: `TRACE(level, ("TMC")) + `INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN")) + `INST_SFU_SPLIT: begin + if (op_args.wctl.is_neg) begin + `TRACE(level, ("SPLIT.N")) + end else begin + `TRACE(level, ("SPLIT")) + end + end + `INST_SFU_JOIN: `TRACE(level, 
("JOIN")) + `INST_SFU_BAR: `TRACE(level, ("BAR")) + `INST_SFU_PRED: begin + if (op_args.wctl.is_neg) begin + `TRACE(level, ("PRED.N")) + end else begin + `TRACE(level, ("PRED")) + end + end + `INST_SFU_CSRRW: begin + if (op_args.csr.use_imm) begin + `TRACE(level, ("CSRRWI")) + end else begin + `TRACE(level, ("CSRRW")) + end + end + `INST_SFU_CSRRS: begin + if (op_args.csr.use_imm) begin + `TRACE(level, ("CSRRSI")) + end else begin + `TRACE(level, ("CSRRS")) + end + end + `INST_SFU_CSRRC: begin + if (op_args.csr.use_imm) begin + `TRACE(level, ("CSRRCI")) + end else begin + `TRACE(level, ("CSRRC")) + end + end + default: `TRACE(level, ("?")) + endcase + end + `ifdef EXT_F_ENABLE + `EX_FPU: begin + case (`INST_FPU_BITS'(op_type)) + `INST_FPU_ADD: begin + if (op_args.fpu.fmt[1]) begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FSUB.D")) + end else begin + `TRACE(level, ("FSUB.S")) + end + end else begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FADD.D")) + end else begin + `TRACE(level, ("FADD.S")) + end + end + end + `INST_FPU_MADD: begin + if (op_args.fpu.fmt[1]) begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FMSUB.D")) + end else begin + `TRACE(level, ("FMSUB.S")) + end + end else begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FMADD.D")) + end else begin + `TRACE(level, ("FMADD.S")) + end + end + end + `INST_FPU_NMADD: begin + if (op_args.fpu.fmt[1]) begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FNMSUB.D")) + end else begin + `TRACE(level, ("FNMSUB.S")) + end + end else begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FNMADD.D")) + end else begin + `TRACE(level, ("FNMADD.S")) + end + end + end + `INST_FPU_MUL: begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FMUL.D")) + end else begin + `TRACE(level, ("FMUL.S")) + end + end + `INST_FPU_DIV: begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FDIV.D")) + end else begin + `TRACE(level, ("FDIV.S")) + end + end + `INST_FPU_SQRT: begin + if (op_args.fpu.fmt[0]) 
begin + `TRACE(level, ("FSQRT.D")) + end else begin + `TRACE(level, ("FSQRT.S")) + end + end + `INST_FPU_CMP: begin + if (op_args.fpu.fmt[0]) begin + case (op_args.fpu.frm[1:0]) + 0: `TRACE(level, ("FLE.D")) + 1: `TRACE(level, ("FLT.D")) + 2: `TRACE(level, ("FEQ.D")) + default: `TRACE(level, ("?")) + endcase + end else begin + case (op_args.fpu.frm[1:0]) + 0: `TRACE(level, ("FLE.S")) + 1: `TRACE(level, ("FLT.S")) + 2: `TRACE(level, ("FEQ.S")) + default: `TRACE(level, ("?")) + endcase + end + end + `INST_FPU_F2F: begin + if (op_args.fpu.fmt[0]) begin + `TRACE(level, ("FCVT.D.S")) + end else begin + `TRACE(level, ("FCVT.S.D")) + end + end + `INST_FPU_F2I: begin + if (op_args.fpu.fmt[0]) begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.L.D")) + end else begin + `TRACE(level, ("FCVT.W.D")) + end + end else begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.L.S")) + end else begin + `TRACE(level, ("FCVT.W.S")) + end + end + end + `INST_FPU_F2U: begin + if (op_args.fpu.fmt[0]) begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.LU.D")) + end else begin + `TRACE(level, ("FCVT.WU.D")) + end + end else begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.LU.S")) + end else begin + `TRACE(level, ("FCVT.WU.S")) + end + end + end + `INST_FPU_I2F: begin + if (op_args.fpu.fmt[0]) begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.D.L")) + end else begin + `TRACE(level, ("FCVT.D.W")) + end + end else begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.S.L")) + end else begin + `TRACE(level, ("FCVT.S.W")) + end + end + end + `INST_FPU_U2F: begin + if (op_args.fpu.fmt[0]) begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.D.LU")) + end else begin + `TRACE(level, ("FCVT.D.WU")) + end + end else begin + if (op_args.fpu.fmt[1]) begin + `TRACE(level, ("FCVT.S.LU")) + end else begin + `TRACE(level, ("FCVT.S.WU")) + end + end + end + `INST_FPU_MISC: begin + if (op_args.fpu.fmt[0]) begin + case (op_args.fpu.frm) + 0: 
`TRACE(level, ("FSGNJ.D")) + 1: `TRACE(level, ("FSGNJN.D")) + 2: `TRACE(level, ("FSGNJX.D")) + 3: `TRACE(level, ("FCLASS.D")) + 4: `TRACE(level, ("FMV.X.D")) + 5: `TRACE(level, ("FMV.D.X")) + 6: `TRACE(level, ("FMIN.D")) + 7: `TRACE(level, ("FMAX.D")) + endcase + end else begin + case (op_args.fpu.frm) + 0: `TRACE(level, ("FSGNJ.S")) + 1: `TRACE(level, ("FSGNJN.S")) + 2: `TRACE(level, ("FSGNJX.S")) + 3: `TRACE(level, ("FCLASS.S")) + 4: `TRACE(level, ("FMV.X.S")) + 5: `TRACE(level, ("FMV.S.X")) + 6: `TRACE(level, ("FMIN.S")) + 7: `TRACE(level, ("FMAX.S")) + endcase + end + end + default: `TRACE(level, ("?")) + endcase + end + `endif + default: `TRACE(level, ("?")) + endcase + endtask + + task trace_op_args(input int level, + input [`EX_BITS-1:0] ex_type, + input [`INST_OP_BITS-1:0] op_type, + input VX_gpu_pkg::op_args_t op_args + ); + case (ex_type) + `EX_ALU: begin + `TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm)) + end + `EX_LSU: begin + `TRACE(level, (", offset=0x%0h", op_args.lsu.offset)) + end + `EX_SFU: begin + if (`INST_SFU_IS_CSR(op_type)) begin + `TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm)) + end + end + `ifdef EXT_F_ENABLE + `EX_FPU: begin + `TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm)) + end + `endif + default:; + endcase + endtask + + task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr); + case (addr) + `VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0")) + `VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1")) + `VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0")) + `VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1")) + `VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS")) + default: `TRACE(level, ("?")) + endcase + endtask + +`endif + endpackage `endif // VX_GPU_PKG_VH diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 
59f5ef0f5..362bf2978 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -22,36 +22,34 @@ /////////////////////////////////////////////////////////////////////////////// -`ifdef VIVADO -`define STRING -`else -`define STRING string -`endif +`ifdef SIMULATION -`ifdef SYNTHESIS -`define TRACING_ON -`define TRACING_OFF -`ifndef NDEBUG - `define DEBUG_BLOCK(x) x -`else - `define DEBUG_BLOCK(x) -`endif -`define IGNORE_UNOPTFLAT_BEGIN -`define IGNORE_UNOPTFLAT_END -`define IGNORE_UNUSED_BEGIN -`define IGNORE_UNUSED_END -`define IGNORE_WARNINGS_BEGIN -`define IGNORE_WARNINGS_END -`define UNUSED_PARAM(x) -`define UNUSED_SPARAM(x) -`define UNUSED_VAR(x) -`define UNUSED_PIN(x) . x () -`define UNUSED_ARG(x) x -`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args -`else -`ifdef VERILATOR +`define STATIC_ASSERT(cond, msg) \ + /* verilator lint_off GENUNNAMED */ \ + if (!(cond)) $error msg; \ + /* verilator lint_on GENUNNAMED */ \ + +`define ERROR(msg) \ + $error msg + +`define ASSERT(cond, msg) \ + assert(cond) else $error msg + +`define RUNTIME_ASSERT(cond, msg) \ + always @(posedge clk) begin \ + if (!reset) begin \ + `ASSERT(cond, msg); \ + end \ + end + +`ifndef TRACING_ALL `define TRACING_ON /* verilator tracing_on */ `define TRACING_OFF /* verilator tracing_off */ +`else +`define TRACING_ON +`define TRACING_OFF +`endif + `ifndef NDEBUG `define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \ x \ @@ -100,74 +98,99 @@ localparam `STRING __``x = x; \ /* verilator lint_on UNUSED */ -`define UNUSED_VAR(x) if (1) begin \ +`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \ + if (1) begin \ /* verilator lint_off UNUSED */ \ - wire [$bits(x)-1:0] __x = x; \ + wire [$bits(x)-1:0] __unused = x; \ /* verilator lint_on UNUSED */ \ - end + end \ + /* verilator lint_on GENUNNAMED */ `define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \ . 
x () \ /* verilator lint_on PINCONNECTEMPTY */ + `define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \ x \ /* verilator lint_on UNUSED */ -`endif `ifdef SV_DPI -`define TRACE(level, args) dpi_trace(level, $sformatf args) +`define TRACE(level, args) dpi_trace(level, $sformatf args); `else -`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args +`define TRACE(level, args) \ + if (level <= `DEBUG_LEVEL) begin \ + $write args; \ + end `endif -`endif +`define SFORMATF(x) $sformatf x -`ifdef SIMULATION - `define STATIC_ASSERT(cond, msg) \ - generate \ - if (!(cond)) $error msg; \ - endgenerate +`else // SYNTHESIS - `define ERROR(msg) \ - $error msg +`define STATIC_ASSERT(cond, msg) +`define ERROR(msg) // +`define ASSERT(cond, msg) // +`define RUNTIME_ASSERT(cond, msg) - `define ASSERT(cond, msg) \ - assert(cond) else $error msg +`define DEBUG_BLOCK(x) +`define TRACE(level, args) +`define SFORMATF(x) "" + +`define TRACING_ON +`define TRACING_OFF + +`define IGNORE_UNOPTFLAT_BEGIN +`define IGNORE_UNOPTFLAT_END +`define IGNORE_UNUSED_BEGIN +`define IGNORE_UNUSED_END +`define IGNORE_WARNINGS_BEGIN +`define IGNORE_WARNINGS_END +`define UNUSED_PARAM(x) +`define UNUSED_SPARAM(x) +`define UNUSED_VAR(x) +`define UNUSED_PIN(x) . 
x () +`define UNUSED_ARG(x) x - `define RUNTIME_ASSERT(cond, msg) \ - always @(posedge clk) begin \ - assert(cond) else $error msg; \ - end -`else - `define STATIC_ASSERT(cond, msg) - `define ERROR(msg) // - `define ASSERT(cond, msg) // - `define RUNTIME_ASSERT(cond, msg) `endif /////////////////////////////////////////////////////////////////////////////// `ifdef QUARTUS `define MAX_FANOUT 8 -`define IF_DATA_SIZE(x) $bits(x.data) +`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256) +`define USE_BLOCK_BRAM (* ramstyle = "block" *) `define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) +`define RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams on" *) `define DISABLE_BRAM (* ramstyle = "logic" *) `define PRESERVE_NET (* preserve *) +`define BLACKBOX_CELL (* black_box *) +`define STRING string `elsif VIVADO `define MAX_FANOUT 8 -`define IF_DATA_SIZE(x) $bits(x.data) +`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256) +`define USE_BLOCK_BRAM (* ram_style = "block" *) `define USE_FAST_BRAM (* ram_style = "distributed" *) `define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *) +`define RW_RAM_CHECK (* rw_addr_collision = "yes" *) `define DISABLE_BRAM (* ram_style = "registers" *) `define PRESERVE_NET (* keep = "true" *) +`define BLACKBOX_CELL (* black_box *) +`define STRING +`ifndef SIMULATION + `define ASYNC_BRAM_PATCH +`endif `else `define MAX_FANOUT 8 -`define IF_DATA_SIZE(x) x.DATA_WIDTH +`define FORCE_BRAM(d,w) (d >= 16 || w >= 128 || (d * w) >= 256) +`define USE_BLOCK_BRAM `define USE_FAST_BRAM `define NO_RW_RAM_CHECK +`define RW_RAM_CHECK `define DISABLE_BRAM `define PRESERVE_NET +`define BLACKBOX_CELL +`define STRING string `endif /////////////////////////////////////////////////////////////////////////////// @@ -192,7 +215,7 @@ `define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? 
(lo) : (x))) -`define UP(x) (((x) != 0) ? (x) : 1) +`define UP(x) (((x) > 0) ? (x) : 1) `define CDIV(n,d) ((n + d - 1) / (d)) @@ -204,23 +227,23 @@ `define SEXT(len, x) {{(len-$bits(x)+1){x[$bits(x)-1]}}, x[$bits(x)-2:0]} `define TRACE_ARRAY1D(lvl, fmt, arr, n) \ - `TRACE(lvl, ("{")); \ + `TRACE(lvl, ("{")) \ for (integer __i = (n-1); __i >= 0; --__i) begin \ - if (__i != (n-1)) `TRACE(lvl, (", ")); \ - `TRACE(lvl, (fmt, arr[__i])); \ + if (__i != (n-1)) `TRACE(lvl, (", ")) \ + `TRACE(lvl, (fmt, arr[__i])) \ end \ - `TRACE(lvl, ("}")); + `TRACE(lvl, ("}")) `define TRACE_ARRAY2D(lvl, fmt, arr, m, n) \ - `TRACE(lvl, ("{")); \ + `TRACE(lvl, ("{")) \ for (integer __i = n-1; __i >= 0; --__i) begin \ - if (__i != (n-1)) `TRACE(lvl, (", ")); \ - `TRACE(lvl, ("{")); \ + if (__i != (n-1)) `TRACE(lvl, (", ")) \ + `TRACE(lvl, ("{")) \ for (integer __j = (m-1); __j >= 0; --__j) begin \ - if (__j != (m-1)) `TRACE(lvl, (", "));\ - `TRACE(lvl, (fmt, arr[__i][__j])); \ + if (__j != (m-1)) `TRACE(lvl, (", "))\ + `TRACE(lvl, (fmt, arr[__i][__j])) \ end \ - `TRACE(lvl, ("}")); \ + `TRACE(lvl, ("}")) \ end \ `TRACE(lvl, ("}")) @@ -239,10 +262,13 @@ `RESET_RELAY_EX (dst, src, 1, 0) // size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2, 5 -> 2 -`define TO_OUT_BUF_SIZE(s) `MIN(s, 2) +`define TO_OUT_BUF_SIZE(s) `MIN(s & 7, 2) // reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 > 3 -`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2)) +`define TO_OUT_BUF_REG(s) (((s & 7) < 2) ? (s & 7) : ((s & 7) - 2)) + +// lut(x): (x & 8) != 0 +`define TO_OUT_BUF_LUTRAM(s) ((s & 8) != 0) `define REPEAT(n,f,s) `_REPEAT_``n(f,s) `define _REPEAT_0(f,s) diff --git a/hw/rtl/VX_scope.vh b/hw/rtl/VX_scope.vh index a74770640..b3d427ede 100644 --- a/hw/rtl/VX_scope.vh +++ b/hw/rtl/VX_scope.vh @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,48 +21,67 @@ input wire scope_bus_in, \ output wire scope_bus_out, -`define SCOPE_IO_SWITCH(__count) \ - wire scope_bus_in_w [__count]; \ - wire scope_bus_out_w [__count]; \ - `RESET_RELAY_EX(scope_reset_w, scope_reset, __count, 4); \ - VX_scope_switch #( \ - .N (__count) \ - ) scope_switch ( \ - .clk (clk), \ - .reset (scope_reset), \ - .req_in (scope_bus_in), \ - .rsp_out (scope_bus_out), \ - .req_out (scope_bus_in_w), \ - .rsp_in (scope_bus_out_w) \ - ); - `define SCOPE_IO_BIND(__i) \ .scope_reset (scope_reset_w[__i]), \ .scope_bus_in (scope_bus_in_w[__i]), \ .scope_bus_out (scope_bus_out_w[__i]), -`define SCOPE_IO_UNUSED() \ - `UNUSED_VAR (scope_reset); \ - `UNUSED_VAR (scope_bus_in); \ - assign scope_bus_out = 0; - -`define SCOPE_IO_UNUSED_W(__i) \ +`define SCOPE_IO_UNUSED(__i) \ `UNUSED_VAR (scope_reset_w[__i]); \ `UNUSED_VAR (scope_bus_in_w[__i]); \ assign scope_bus_out_w[__i] = 0; +`define SCOPE_IO_SWITCH(__count) \ + wire [__count-1:0] scope_bus_in_w; \ + wire [__count-1:0] scope_bus_out_w; \ + wire [__count-1:0] scope_reset_w = {__count{scope_reset}}; \ + VX_scope_switch #( \ + .N (__count) \ + ) scope_switch ( \ + .clk (clk), \ + .reset (scope_reset), \ + .req_in (scope_bus_in), \ + .rsp_out (scope_bus_out), \ + .req_out (scope_bus_in_w), \ + .rsp_in (scope_bus_out_w) \ + ) + +`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __htriggers_w, __probes_w, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \ + VX_scope_tap #( \ + .SCOPE_ID (__id), \ + .XTRIGGERW(__xtriggers_w), \ + .HTRIGGERW(__htriggers_w), \ + .PROBEW (__probes_w), \ + .DEPTH (__depth) \ + ) scope_tap_``__idx ( \ + .clk (clk), \ + .reset (scope_reset_w[__idx]), \ +
.start (__start), \ + .stop (__stop), \ + .xtriggers(__xtriggers), \ + .htriggers(__htriggers), \ + .probes (__probes), \ + .bus_in (scope_bus_in_w[__idx]), \ + .bus_out(scope_bus_out_w[__idx]) \ + ) + +`define SCOPE_TAP(__idx, __id, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \ + `SCOPE_TAP_EX(__idx, __id, $bits(__xtriggers), $bits(__htriggers), $bits(__probes), __xtriggers, __htriggers, __probes, __start, __stop, __depth) + `else `define SCOPE_IO_DECL -`define SCOPE_IO_SWITCH(__count) - `define SCOPE_IO_BIND(__i) -`define SCOPE_IO_UNUSED_W(__i) - `define SCOPE_IO_UNUSED(__i) +`define SCOPE_IO_SWITCH(__count) + +`define SCOPE_TAP(__idx, __id, __xtriggers, __htriggers, __probes, __start, __stop, __depth) + +`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __htriggers_w, __probes_w, __xtriggers, __htriggers, __probes, __start, __stop, __depth) + `endif `endif // VX_SCOPE_VH diff --git a/hw/rtl/VX_socket.sv b/hw/rtl/VX_socket.sv index 694edfe9c..0b963f83d 100644 --- a/hw/rtl/VX_socket.sv +++ b/hw/rtl/VX_socket.sv @@ -24,14 +24,14 @@ module VX_socket import VX_gpu_pkg::*; #( input wire reset, `ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_if, + input sysmem_perf_t sysmem_perf, `endif // DCRs VX_dcr_bus_if.slave dcr_bus_if, // Memory - VX_mem_bus_if.master mem_bus_if, + VX_mem_bus_if.master mem_bus_if [`L1_MEM_PORTS], `ifdef GBAR_ENABLE // Barrier @@ -49,14 +49,12 @@ module VX_socket import VX_gpu_pkg::*; #( `ifdef GBAR_ENABLE VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE](); - `RESET_RELAY (gbar_arb_reset, reset); - VX_gbar_arb #( .NUM_REQS (`SOCKET_SIZE), .OUT_BUF ((`SOCKET_SIZE > 1) ?
2 : 0) ) gbar_arb ( .clk (clk), - .reset (gbar_arb_reset), + .reset (reset), .bus_in_if (per_core_gbar_bus_if), .bus_out_if (gbar_bus_if) ); @@ -65,11 +63,13 @@ module VX_socket import VX_gpu_pkg::*; #( /////////////////////////////////////////////////////////////////////////// `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_tmp_if(); - assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; - assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; - assign mem_perf_tmp_if.lmem = 'x; - assign mem_perf_tmp_if.mem = mem_perf_if.mem; + cache_perf_t icache_perf, dcache_perf; + sysmem_perf_t sysmem_perf_tmp; + always @(*) begin + sysmem_perf_tmp = sysmem_perf; + sysmem_perf_tmp.icache = icache_perf; + sysmem_perf_tmp.dcache = dcache_perf; + end `endif /////////////////////////////////////////////////////////////////////////// @@ -82,12 +82,12 @@ module VX_socket import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE (ICACHE_LINE_SIZE), .TAG_WIDTH (ICACHE_MEM_TAG_WIDTH) - ) icache_mem_bus_if(); + ) icache_mem_bus_if[1](); `RESET_RELAY (icache_reset, reset); VX_cache_cluster #( - .INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-icache", INSTANCE_ID))), .NUM_UNITS (`NUM_ICACHES), .NUM_INPUTS (`SOCKET_SIZE), .TAG_SEL_IDX (0), @@ -97,19 +97,22 @@ module VX_socket import VX_gpu_pkg::*; #( .NUM_WAYS (`ICACHE_NUM_WAYS), .WORD_SIZE (ICACHE_WORD_SIZE), .NUM_REQS (1), + .MEM_PORTS (1), .CRSQ_SIZE (`ICACHE_CRSQ_SIZE), .MSHR_SIZE (`ICACHE_MSHR_SIZE), .MRSQ_SIZE (`ICACHE_MRSQ_SIZE), .MREQ_SIZE (`ICACHE_MREQ_SIZE), .TAG_WIDTH (ICACHE_TAG_WIDTH), + .FLAGS_WIDTH (0), .UUID_WIDTH (`UUID_WIDTH), .WRITE_ENABLE (0), + .REPL_POLICY (`ICACHE_REPL_POLICY), .NC_ENABLE (0), - .CORE_OUT_BUF (2), + .CORE_OUT_BUF (3), .MEM_OUT_BUF (2) ) icache ( `ifdef PERF_ENABLE - .cache_perf (mem_perf_tmp_if.icache), + .cache_perf (icache_perf), `endif .clk (clk), .reset (icache_reset), @@ -127,12 +130,12 @@ module VX_socket import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE 
(DCACHE_LINE_SIZE), .TAG_WIDTH (DCACHE_MEM_TAG_WIDTH) - ) dcache_mem_bus_if(); + ) dcache_mem_bus_if[`L1_MEM_PORTS](); `RESET_RELAY (dcache_reset, reset); VX_cache_cluster #( - .INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-dcache", INSTANCE_ID))), .NUM_UNITS (`NUM_DCACHES), .NUM_INPUTS (`SOCKET_SIZE), .TAG_SEL_IDX (0), @@ -142,21 +145,24 @@ module VX_socket import VX_gpu_pkg::*; #( .NUM_WAYS (`DCACHE_NUM_WAYS), .WORD_SIZE (DCACHE_WORD_SIZE), .NUM_REQS (DCACHE_NUM_REQS), + .MEM_PORTS (`L1_MEM_PORTS), .CRSQ_SIZE (`DCACHE_CRSQ_SIZE), .MSHR_SIZE (`DCACHE_MSHR_SIZE), .MRSQ_SIZE (`DCACHE_MRSQ_SIZE), .MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE), .TAG_WIDTH (DCACHE_TAG_WIDTH), .UUID_WIDTH (`UUID_WIDTH), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`DCACHE_WRITEBACK), - .DIRTY_BYTES (`DCACHE_WRITEBACK), + .DIRTY_BYTES (`DCACHE_DIRTYBYTES), + .REPL_POLICY (`DCACHE_REPL_POLICY), .NC_ENABLE (1), - .CORE_OUT_BUF (2), + .CORE_OUT_BUF (3), .MEM_OUT_BUF (2) ) dcache ( `ifdef PERF_ENABLE - .cache_perf (mem_perf_tmp_if.dcache), + .cache_perf (dcache_perf), `endif .clk (clk), .reset (dcache_reset), @@ -166,51 +172,64 @@ module VX_socket import VX_gpu_pkg::*; #( /////////////////////////////////////////////////////////////////////////// - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_TAG_WIDTH) - ) l1_mem_bus_if[2](); + for (genvar i = 0; i < `L1_MEM_PORTS; ++i) begin : g_mem_bus_if + if (i == 0) begin : g_i0 + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_TAG_WIDTH) + ) l1_mem_bus_if[2](); - VX_mem_bus_if #( - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) - ) l1_mem_arb_bus_if[1](); + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) + ) l1_mem_arb_bus_if[1](); - `ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH); - `ASSIGN_VX_MEM_BUS_IF_X 
(l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH); + `ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, `UUID_WIDTH); + `ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH); - VX_mem_arb #( - .NUM_INPUTS (2), - .DATA_SIZE (`L1_LINE_SIZE), - .TAG_WIDTH (L1_MEM_TAG_WIDTH), - .TAG_SEL_IDX (0), - .ARBITER ("R"), - .REQ_OUT_BUF (2), - .RSP_OUT_BUF (2) - ) mem_arb ( - .clk (clk), - .reset (reset), - .bus_in_if (l1_mem_bus_if), - .bus_out_if (l1_mem_arb_bus_if) - ); + VX_mem_arb #( + .NUM_INPUTS (2), + .NUM_OUTPUTS(1), + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_TAG_WIDTH), + .TAG_SEL_IDX(0), + .ARBITER ("P"), // prioritize the icache + .REQ_OUT_BUF(3), + .RSP_OUT_BUF(3) + ) mem_arb ( + .clk (clk), + .reset (reset), + .bus_in_if (l1_mem_bus_if), + .bus_out_if (l1_mem_arb_bus_if) + ); - `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]); + `ASSIGN_VX_MEM_BUS_IF (mem_bus_if[0], l1_mem_arb_bus_if[0]); + end else begin : g_i + VX_mem_bus_if #( + .DATA_SIZE (`L1_LINE_SIZE), + .TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH) + ) l1_mem_arb_bus_if(); + + `ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH); + `ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], l1_mem_arb_bus_if); + end + end /////////////////////////////////////////////////////////////////////////// wire [`SOCKET_SIZE-1:0] per_core_busy; - VX_dcr_bus_if core_dcr_bus_if(); - `BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1)); - // Generate all cores - for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : cores + for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : g_cores `RESET_RELAY (core_reset, reset); + VX_dcr_bus_if core_dcr_bus_if(); + `BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, 1'b1, (`SOCKET_SIZE > 1)) + VX_core #( .CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + 
core_id), - .INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id)) + .INSTANCE_ID (`SFORMATF(("%s-core%0d", INSTANCE_ID, core_id))) ) core ( `SCOPE_IO_BIND (scope_core + core_id) @@ -218,7 +237,7 @@ module VX_socket import VX_gpu_pkg::*; #( .reset (core_reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_tmp_if), + .sysmem_perf (sysmem_perf_tmp), `endif .dcr_bus_if (core_dcr_bus_if), @@ -235,6 +254,6 @@ module VX_socket import VX_gpu_pkg::*; #( ); end - `BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1)); + `BUFFER_EX(busy, (| per_core_busy), 1'b1, 1, (`SOCKET_SIZE > 1)); endmodule diff --git a/hw/rtl/VX_types.vh b/hw/rtl/VX_types.vh index 927ffae96..455d42ce1 100644 --- a/hw/rtl/VX_types.vh +++ b/hw/rtl/VX_types.vh @@ -166,6 +166,8 @@ `define VX_CSR_MPM_MEM_WRITES_H 12'hB99 `define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency `define VX_CSR_MPM_MEM_LT_H 12'hB9A +`define VX_CSR_MPM_MEM_BANK_ST 12'hB1E // bank conflicts +`define VX_CSR_MPM_MEM_BANK_ST_H 12'hB9E // PERF: lmem `define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads `define VX_CSR_MPM_LMEM_READS_H 12'hB9B @@ -173,6 +175,9 @@ `define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C `define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts `define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D +// PERF: coalescer +`define VX_CSR_MPM_COALESCER_MISS 12'hB1F // coalescer misses +`define VX_CSR_MPM_COALESCER_MISS_H 12'hB9F // Machine Performance-monitoring memory counters (class 3) /////////////////// // @@ -184,6 +189,19 @@ `define VX_CSR_MIMPID 12'hF13 `define VX_CSR_MHARTID 12'hF14 +// Vector CSRs + +`define VX_CSR_VSTART 12'h008 +`define VX_CSR_VXSAT 12'h009 +`define VX_CSR_VXRM 12'h00A +`define VX_CSR_VCSR 12'h00F +`define VX_CSR_VL 12'hC20 +`define VX_CSR_VTYPE 12'hC21 +`define VX_CSR_VLENB 12'hC22 +`define VX_CSR_VCYCLE 12'hC00 +`define VX_CSR_VTIME 12'hC01 +`define VX_CSR_VINSTRET 12'hC02 + // GPGU CSRs `define VX_CSR_THREAD_ID 12'hCC0 @@ -197,4 +215,10 @@ `define VX_CSR_NUM_CORES 12'hFC2 `define 
VX_CSR_LOCAL_MEM_BASE 12'hFC3 +`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size +`define VX_TC_NUM 12'hFC5 +`define VX_TC_SIZE 12'hFC6 + + + `endif // VX_TYPES_VH diff --git a/hw/rtl/Vortex.sv b/hw/rtl/Vortex.sv index 978259101..44d7cb205 100644 --- a/hw/rtl/Vortex.sv +++ b/hw/rtl/Vortex.sv @@ -21,19 +21,19 @@ module Vortex import VX_gpu_pkg::*; ( input wire reset, // Memory request - output wire mem_req_valid, - output wire mem_req_rw, - output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen, - output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr, - output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data, - output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag, - input wire mem_req_ready, + output wire mem_req_valid [`VX_MEM_PORTS], + output wire mem_req_rw [`VX_MEM_PORTS], + output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS], + output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS], + output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS], + output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS], + input wire mem_req_ready [`VX_MEM_PORTS], // Memory response - input wire mem_rsp_valid, - input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data, - input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag, - output wire mem_rsp_ready, + input wire mem_rsp_valid [`VX_MEM_PORTS], + input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS], + input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS], + output wire mem_rsp_ready [`VX_MEM_PORTS], // DCR write request input wire dcr_wr_valid, @@ -50,22 +50,25 @@ module Vortex import VX_gpu_pkg::*; ( `endif `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_if(); - assign mem_perf_if.icache = 'x; - assign mem_perf_if.dcache = 'x; - assign mem_perf_if.l2cache = 'x; - assign mem_perf_if.lmem = 'x; + cache_perf_t l3_perf; + mem_perf_t mem_perf; + sysmem_perf_t sysmem_perf; + always @(*) begin + sysmem_perf = '0; + sysmem_perf.l3cache = l3_perf; + sysmem_perf.mem = mem_perf; + 
end `endif VX_mem_bus_if #( .DATA_SIZE (`L2_LINE_SIZE), .TAG_WIDTH (L2_MEM_TAG_WIDTH) - ) per_cluster_mem_bus_if[`NUM_CLUSTERS](); + ) per_cluster_mem_bus_if[`NUM_CLUSTERS * `L2_MEM_PORTS](); VX_mem_bus_if #( .DATA_SIZE (`L3_LINE_SIZE), .TAG_WIDTH (L3_MEM_TAG_WIDTH) - ) mem_bus_if(); + ) mem_bus_if[`L3_MEM_PORTS](); `RESET_RELAY (l3_reset, reset); @@ -77,6 +80,7 @@ module Vortex import VX_gpu_pkg::*; ( .NUM_WAYS (`L3_NUM_WAYS), .WORD_SIZE (L3_WORD_SIZE), .NUM_REQS (L3_NUM_REQS), + .MEM_PORTS (`L3_MEM_PORTS), .CRSQ_SIZE (`L3_CRSQ_SIZE), .MSHR_SIZE (`L3_MSHR_SIZE), .MRSQ_SIZE (`L3_MRSQ_SIZE), @@ -84,10 +88,12 @@ module Vortex import VX_gpu_pkg::*; ( .TAG_WIDTH (L2_MEM_TAG_WIDTH), .WRITE_ENABLE (1), .WRITEBACK (`L3_WRITEBACK), - .DIRTY_BYTES (`L3_WRITEBACK), + .DIRTY_BYTES (`L3_DIRTYBYTES), + .REPL_POLICY (`L3_REPL_POLICY), .UUID_WIDTH (`UUID_WIDTH), - .CORE_OUT_BUF (2), - .MEM_OUT_BUF (2), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), + .CORE_OUT_BUF (3), + .MEM_OUT_BUF (3), .NC_ENABLE (1), .PASSTHRU (!`L3_ENABLED) ) l3cache ( @@ -95,31 +101,28 @@ module Vortex import VX_gpu_pkg::*; ( .reset (l3_reset), `ifdef PERF_ENABLE - .cache_perf (mem_perf_if.l3cache), + .cache_perf (l3_perf), `endif .core_bus_if (per_cluster_mem_bus_if), .mem_bus_if (mem_bus_if) ); - assign mem_req_valid = mem_bus_if.req_valid; - assign mem_req_rw = mem_bus_if.req_data.rw; - assign mem_req_byteen= mem_bus_if.req_data.byteen; - assign mem_req_addr = mem_bus_if.req_data.addr; - assign mem_req_data = mem_bus_if.req_data.data; - assign mem_req_tag = mem_bus_if.req_data.tag; - assign mem_bus_if.req_ready = mem_req_ready; - `UNUSED_VAR (mem_bus_if.req_data.atype) + for (genvar i = 0; i < `L3_MEM_PORTS; ++i) begin : g_mem_bus_if + assign mem_req_valid[i] = mem_bus_if[i].req_valid; + assign mem_req_rw[i] = mem_bus_if[i].req_data.rw; + assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen; + assign mem_req_addr[i] = mem_bus_if[i].req_data.addr; + assign mem_req_data[i] = mem_bus_if[i].req_data.data; + 
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag; + `UNUSED_VAR (mem_bus_if[i].req_data.flags) + assign mem_bus_if[i].req_ready = mem_req_ready[i]; - assign mem_bus_if.rsp_valid = mem_rsp_valid; - assign mem_bus_if.rsp_data.data = mem_rsp_data; - assign mem_bus_if.rsp_data.tag = mem_rsp_tag; - assign mem_rsp_ready = mem_bus_if.rsp_ready; - - wire mem_req_fire = mem_req_valid && mem_req_ready; - wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - `UNUSED_VAR (mem_req_fire) - `UNUSED_VAR (mem_rsp_fire) + assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i]; + assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i]; + assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i]; + assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready; + end VX_dcr_bus_if dcr_bus_if(); assign dcr_bus_if.write_valid = dcr_wr_valid; @@ -129,16 +132,16 @@ module Vortex import VX_gpu_pkg::*; ( wire [`NUM_CLUSTERS-1:0] per_cluster_busy; // Generate all clusters - for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : clusters + for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : g_clusters `RESET_RELAY (cluster_reset, reset); VX_dcr_bus_if cluster_dcr_bus_if(); - `BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1)); + `BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, 1'b1, (`NUM_CLUSTERS > 1)) VX_cluster #( .CLUSTER_ID (cluster_id), - .INSTANCE_ID ($sformatf("cluster%0d", cluster_id)) + .INSTANCE_ID (`SFORMATF(("cluster%0d", cluster_id))) ) cluster ( `SCOPE_IO_BIND (scope_cluster + cluster_id) @@ -146,59 +149,83 @@ module Vortex import VX_gpu_pkg::*; ( .reset (cluster_reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if), + .sysmem_perf (sysmem_perf), `endif .dcr_bus_if (cluster_dcr_bus_if), - .mem_bus_if (per_cluster_mem_bus_if[cluster_id]), + .mem_bus_if (per_cluster_mem_bus_if[cluster_id * `L2_MEM_PORTS +: `L2_MEM_PORTS]), .busy (per_cluster_busy[cluster_id]) ); end - `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 
1)); + `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, 1, (`NUM_CLUSTERS > 1)); `ifdef PERF_ENABLE + localparam MEM_PORTS_CTR_W = `CLOG2(`VX_MEM_PORTS+1); + + wire [`VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire; + wire [`VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire; + + for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_perf_ctrs + assign mem_req_fire[i] = mem_req_valid[i] & mem_req_ready[i]; + assign mem_rsp_fire[i] = mem_rsp_valid[i] & mem_rsp_ready[i]; + assign mem_rd_req_fire[i] = mem_req_fire[i] & ~mem_req_rw[i]; + assign mem_wr_req_fire[i] = mem_req_fire[i] & mem_req_rw[i]; + end + + wire [MEM_PORTS_CTR_W-1:0] perf_mem_reads_per_cycle; + wire [MEM_PORTS_CTR_W-1:0] perf_mem_writes_per_cycle; + wire [MEM_PORTS_CTR_W-1:0] perf_mem_rsps_per_cycle; + + `POP_COUNT(perf_mem_reads_per_cycle, mem_rd_req_fire); + `POP_COUNT(perf_mem_writes_per_cycle, mem_wr_req_fire); + `POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire); + reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; - mem_perf_t mem_perf; always @(posedge clk) begin if (reset) begin perf_mem_pending_reads <= '0; end else begin perf_mem_pending_reads <= $signed(perf_mem_pending_reads) + - `PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire))); + `PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle))); end end - wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw; - wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw; - always @(posedge clk) begin if (reset) begin mem_perf <= '0; end else begin - mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire); - mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire); + mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(perf_mem_reads_per_cycle); + mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(perf_mem_writes_per_cycle); mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; end end - assign mem_perf_if.mem = 
mem_perf; `endif + // dump device configuration + initial begin + `TRACE(0, ("CONFIGS: num_threads=%0d, num_warps=%0d, num_cores=%0d, num_clusters=%0d, socket_size=%0d, local_mem_base=0x%0h, num_barriers=%0d\n", + `NUM_THREADS, `NUM_WARPS, `NUM_CORES, `NUM_CLUSTERS, `SOCKET_SIZE, `LMEM_BASE_ADDR, `NUM_BARRIERS)) + end + `ifdef DBG_TRACE_MEM - always @(posedge clk) begin - if (mem_req_fire) begin - if (mem_req_rw) - `TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data)); - else - `TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen)); - end - if (mem_rsp_fire) begin - `TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data)); + for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_trace + always @(posedge clk) begin + if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin + if (mem_bus_if[i].req_data.rw) begin + `TRACE(2, ("%t: MEM Wr Req[%0d]: addr=0x%0h, byteen=0x%h data=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid)) + end else begin + `TRACE(2, ("%t: MEM Rd Req[%0d]: addr=0x%0h, byteen=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid)) + end + end + if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin + `TRACE(2, ("%t: MEM Rd Rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", $time, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid)) + end end end `endif diff --git a/hw/rtl/Vortex_axi.sv b/hw/rtl/Vortex_axi.sv index 5d2f5b0a7..1944a31d8 100644 --- a/hw/rtl/Vortex_axi.sv +++ b/hw/rtl/Vortex_axi.sv @@ -82,112 +82,26 @@ 
module Vortex_axi import VX_gpu_pkg::*; #( // Status output wire busy ); - `STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH)) - `STATIC_ASSERT((AXI_ADDR_WIDTH >= `MEM_ADDR_WIDTH), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH)) - //`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH)) + localparam DST_LDATAW = `CLOG2(AXI_DATA_WIDTH); + localparam SRC_LDATAW = `CLOG2(`VX_MEM_DATA_WIDTH); + localparam SUB_LDATAW = DST_LDATAW - SRC_LDATAW; + localparam VX_MEM_TAG_A_WIDTH = `VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0); + localparam VX_MEM_ADDR_A_WIDTH = `VX_MEM_ADDR_WIDTH - SUB_LDATAW; - wire mem_req_valid; - wire mem_req_rw; - wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen; - wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr; - wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data; - wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag; - wire mem_req_ready; + wire mem_req_valid [`VX_MEM_PORTS]; + wire mem_req_rw [`VX_MEM_PORTS]; + wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS]; + wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS]; + wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS]; + wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS]; + wire mem_req_ready [`VX_MEM_PORTS]; - wire mem_rsp_valid; - wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data; - wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag; - wire mem_rsp_ready; + wire mem_rsp_valid [`VX_MEM_PORTS]; + wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS]; + wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS]; + wire mem_rsp_ready [`VX_MEM_PORTS]; - wire [`MEM_ADDR_WIDTH-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS]; - wire [`MEM_ADDR_WIDTH-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS]; - - wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS]; - wire [`VX_MEM_TAG_WIDTH-1:0] 
m_axi_arid_unqual [AXI_NUM_BANKS]; - - wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS]; - wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS]; - - for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin - assign m_axi_awaddr[i] = `MEM_ADDR_WIDTH'(m_axi_awaddr_unqual[i]); - assign m_axi_araddr[i] = `MEM_ADDR_WIDTH'(m_axi_araddr_unqual[i]); - - assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]); - assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]); - - assign m_axi_rid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_rid[i]); - assign m_axi_bid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_bid[i]); - end - - VX_axi_adapter #( - .DATA_WIDTH (`VX_MEM_DATA_WIDTH), - .ADDR_WIDTH (`MEM_ADDR_WIDTH), - .TAG_WIDTH (`VX_MEM_TAG_WIDTH), - .NUM_BANKS (AXI_NUM_BANKS), - .RSP_OUT_BUF((AXI_NUM_BANKS > 1) ? 2 : 0) - ) axi_adapter ( - .clk (clk), - .reset (reset), - - .mem_req_valid (mem_req_valid), - .mem_req_rw (mem_req_rw), - .mem_req_byteen (mem_req_byteen), - .mem_req_addr (mem_req_addr), - .mem_req_data (mem_req_data), - .mem_req_tag (mem_req_tag), - .mem_req_ready (mem_req_ready), - - .mem_rsp_valid (mem_rsp_valid), - .mem_rsp_data (mem_rsp_data), - .mem_rsp_tag (mem_rsp_tag), - .mem_rsp_ready (mem_rsp_ready), - - .m_axi_awvalid (m_axi_awvalid), - .m_axi_awready (m_axi_awready), - .m_axi_awaddr (m_axi_awaddr_unqual), - .m_axi_awid (m_axi_awid_unqual), - .m_axi_awlen (m_axi_awlen), - .m_axi_awsize (m_axi_awsize), - .m_axi_awburst (m_axi_awburst), - .m_axi_awlock (m_axi_awlock), - .m_axi_awcache (m_axi_awcache), - .m_axi_awprot (m_axi_awprot), - .m_axi_awqos (m_axi_awqos), - .m_axi_awregion (m_axi_awregion), - - .m_axi_wvalid (m_axi_wvalid), - .m_axi_wready (m_axi_wready), - .m_axi_wdata (m_axi_wdata), - .m_axi_wstrb (m_axi_wstrb), - .m_axi_wlast (m_axi_wlast), - - .m_axi_bvalid (m_axi_bvalid), - .m_axi_bready (m_axi_bready), - .m_axi_bid (m_axi_bid_unqual), - .m_axi_bresp (m_axi_bresp), - - .m_axi_arvalid (m_axi_arvalid), - .m_axi_arready (m_axi_arready), - 
.m_axi_araddr (m_axi_araddr_unqual), - .m_axi_arid (m_axi_arid_unqual), - .m_axi_arlen (m_axi_arlen), - .m_axi_arsize (m_axi_arsize), - .m_axi_arburst (m_axi_arburst), - .m_axi_arlock (m_axi_arlock), - .m_axi_arcache (m_axi_arcache), - .m_axi_arprot (m_axi_arprot), - .m_axi_arqos (m_axi_arqos), - .m_axi_arregion (m_axi_arregion), - - .m_axi_rvalid (m_axi_rvalid), - .m_axi_rready (m_axi_rready), - .m_axi_rdata (m_axi_rdata), - .m_axi_rlast (m_axi_rlast) , - .m_axi_rid (m_axi_rid_unqual), - .m_axi_rresp (m_axi_rresp) - ); - - `SCOPE_IO_SWITCH (1) + `SCOPE_IO_SWITCH (1); Vortex vortex ( `SCOPE_IO_BIND (0) @@ -215,4 +129,133 @@ module Vortex_axi import VX_gpu_pkg::*; #( .busy (busy) ); + wire mem_req_valid_a [`VX_MEM_PORTS]; + wire mem_req_rw_a [`VX_MEM_PORTS]; + wire [(AXI_DATA_WIDTH/8)-1:0] mem_req_byteen_a [`VX_MEM_PORTS]; + wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a [`VX_MEM_PORTS]; + wire [AXI_DATA_WIDTH-1:0] mem_req_data_a [`VX_MEM_PORTS]; + wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a [`VX_MEM_PORTS]; + wire mem_req_ready_a [`VX_MEM_PORTS]; + + wire mem_rsp_valid_a [`VX_MEM_PORTS]; + wire [AXI_DATA_WIDTH-1:0] mem_rsp_data_a [`VX_MEM_PORTS]; + wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a [`VX_MEM_PORTS]; + wire mem_rsp_ready_a [`VX_MEM_PORTS]; + + // Adjust memory data width to match AXI interface + for (genvar i = 0; i < `VX_MEM_PORTS; i++) begin : g_mem_adapter + VX_mem_data_adapter #( + .SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH), + .DST_DATA_WIDTH (AXI_DATA_WIDTH), + .SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH), + .DST_ADDR_WIDTH (VX_MEM_ADDR_A_WIDTH), + .SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH), + .DST_TAG_WIDTH (VX_MEM_TAG_A_WIDTH), + .REQ_OUT_BUF (0), + .RSP_OUT_BUF (0) + ) mem_data_adapter ( + .clk (clk), + .reset (reset), + + .mem_req_valid_in (mem_req_valid[i]), + .mem_req_addr_in (mem_req_addr[i]), + .mem_req_rw_in (mem_req_rw[i]), + .mem_req_byteen_in (mem_req_byteen[i]), + .mem_req_data_in (mem_req_data[i]), + .mem_req_tag_in (mem_req_tag[i]), + .mem_req_ready_in 
(mem_req_ready[i]), + + .mem_rsp_valid_in (mem_rsp_valid[i]), + .mem_rsp_data_in (mem_rsp_data[i]), + .mem_rsp_tag_in (mem_rsp_tag[i]), + .mem_rsp_ready_in (mem_rsp_ready[i]), + + .mem_req_valid_out (mem_req_valid_a[i]), + .mem_req_addr_out (mem_req_addr_a[i]), + .mem_req_rw_out (mem_req_rw_a[i]), + .mem_req_byteen_out (mem_req_byteen_a[i]), + .mem_req_data_out (mem_req_data_a[i]), + .mem_req_tag_out (mem_req_tag_a[i]), + .mem_req_ready_out (mem_req_ready_a[i]), + + .mem_rsp_valid_out (mem_rsp_valid_a[i]), + .mem_rsp_data_out (mem_rsp_data_a[i]), + .mem_rsp_tag_out (mem_rsp_tag_a[i]), + .mem_rsp_ready_out (mem_rsp_ready_a[i]) + ); + end + + VX_axi_adapter #( + .DATA_WIDTH (AXI_DATA_WIDTH), + .ADDR_WIDTH_IN (VX_MEM_ADDR_A_WIDTH), + .ADDR_WIDTH_OUT (AXI_ADDR_WIDTH), + .TAG_WIDTH_IN (VX_MEM_TAG_A_WIDTH), + .TAG_WIDTH_OUT (AXI_TID_WIDTH), + .NUM_PORTS_IN (`VX_MEM_PORTS), + .NUM_BANKS_OUT (AXI_NUM_BANKS), + .INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE), + .REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0), + .RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 
2 : 0) + ) axi_adapter ( + .clk (clk), + .reset (reset), + + .mem_req_valid (mem_req_valid_a), + .mem_req_rw (mem_req_rw_a), + .mem_req_byteen (mem_req_byteen_a), + .mem_req_addr (mem_req_addr_a), + .mem_req_data (mem_req_data_a), + .mem_req_tag (mem_req_tag_a), + .mem_req_ready (mem_req_ready_a), + + .mem_rsp_valid (mem_rsp_valid_a), + .mem_rsp_data (mem_rsp_data_a), + .mem_rsp_tag (mem_rsp_tag_a), + .mem_rsp_ready (mem_rsp_ready_a), + + .m_axi_awvalid (m_axi_awvalid), + .m_axi_awready (m_axi_awready), + .m_axi_awaddr (m_axi_awaddr), + .m_axi_awid (m_axi_awid), + .m_axi_awlen (m_axi_awlen), + .m_axi_awsize (m_axi_awsize), + .m_axi_awburst (m_axi_awburst), + .m_axi_awlock (m_axi_awlock), + .m_axi_awcache (m_axi_awcache), + .m_axi_awprot (m_axi_awprot), + .m_axi_awqos (m_axi_awqos), + .m_axi_awregion (m_axi_awregion), + + .m_axi_wvalid (m_axi_wvalid), + .m_axi_wready (m_axi_wready), + .m_axi_wdata (m_axi_wdata), + .m_axi_wstrb (m_axi_wstrb), + .m_axi_wlast (m_axi_wlast), + + .m_axi_bvalid (m_axi_bvalid), + .m_axi_bready (m_axi_bready), + .m_axi_bid (m_axi_bid), + .m_axi_bresp (m_axi_bresp), + + .m_axi_arvalid (m_axi_arvalid), + .m_axi_arready (m_axi_arready), + .m_axi_araddr (m_axi_araddr), + .m_axi_arid (m_axi_arid), + .m_axi_arlen (m_axi_arlen), + .m_axi_arsize (m_axi_arsize), + .m_axi_arburst (m_axi_arburst), + .m_axi_arlock (m_axi_arlock), + .m_axi_arcache (m_axi_arcache), + .m_axi_arprot (m_axi_arprot), + .m_axi_arqos (m_axi_arqos), + .m_axi_arregion (m_axi_arregion), + + .m_axi_rvalid (m_axi_rvalid), + .m_axi_rready (m_axi_rready), + .m_axi_rdata (m_axi_rdata), + .m_axi_rlast (m_axi_rlast), + .m_axi_rid (m_axi_rid), + .m_axi_rresp (m_axi_rresp) + ); + endmodule diff --git a/hw/rtl/afu/opae/local_mem_cfg_pkg.sv b/hw/rtl/afu/opae/local_mem_cfg_pkg.sv index ef9fae28a..87b3290c1 100644 --- a/hw/rtl/afu/opae/local_mem_cfg_pkg.sv +++ b/hw/rtl/afu/opae/local_mem_cfg_pkg.sv @@ -28,9 +28,19 @@ // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF 
THE // POSSIBILITY OF SUCH DAMAGE. -//`include "platform_afu_top_config.vh" +`include "VX_define.vh" -`ifdef PLATFORM_PROVIDES_LOCAL_MEMORY +`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH +`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH ((`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS)) - $clog2(`PLATFORM_MEMORY_DATA_SIZE)) +`endif + +`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH +`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH (`PLATFORM_MEMORY_DATA_SIZE * 8) +`endif + +`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH +`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4 +`endif package local_mem_cfg_pkg; @@ -57,5 +67,3 @@ package local_mem_cfg_pkg; typedef logic [LOCAL_MEM_DATA_N_BYTES-1:0] t_local_mem_byte_mask; endpackage // local_mem_cfg_pkg - -`endif // PLATFORM_PROVIDES_LOCAL_MEMORY diff --git a/hw/rtl/afu/opae/vortex_afu.sv b/hw/rtl/afu/opae/vortex_afu.sv index 93f63c48d..4e1de24eb 100644 --- a/hw/rtl/afu/opae/vortex_afu.sv +++ b/hw/rtl/afu/opae/vortex_afu.sv @@ -11,12 +11,13 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+`include "VX_define.vh" + `ifndef NOPAE `include "afu_json_info.vh" `else `include "vortex_afu.vh" `endif -`include "VX_define.vh" module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_gpu_pkg::*; #( parameter NUM_LOCAL_MEM_BANKS = 2 @@ -40,23 +41,31 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ output t_local_mem_burst_cnt avs_burstcount [NUM_LOCAL_MEM_BANKS], input wire avs_readdatavalid [NUM_LOCAL_MEM_BANKS] ); - localparam LMEM_DATA_WIDTH = $bits(t_local_mem_data); localparam LMEM_DATA_SIZE = LMEM_DATA_WIDTH / 8; localparam LMEM_ADDR_WIDTH = $bits(t_local_mem_addr); + + localparam LMEM_BYTE_ADDR_WIDTH = LMEM_ADDR_WIDTH + $clog2(LMEM_DATA_SIZE); + localparam CCI_VX_ADDR_WIDTH = `VX_MEM_ADDR_WIDTH + ($clog2(`VX_MEM_DATA_WIDTH) - $clog2(LMEM_DATA_WIDTH)); + localparam LMEM_BURST_CTRW = $bits(t_local_mem_burst_cnt); + localparam MEM_PORTS_BITS = `CLOG2(`VX_MEM_PORTS); + localparam MEM_PORTS_WIDTH = `UP(MEM_PORTS_BITS); + localparam CCI_DATA_WIDTH = $bits(t_ccip_clData); localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8; localparam CCI_ADDR_WIDTH = $bits(t_ccip_clAddr); + localparam RESET_CTR_WIDTH = `CLOG2(`RESET_DELAY+1); + localparam AVS_RD_QUEUE_SIZE = 32; - localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH; - localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(`VX_MEM_DATA_WIDTH); - localparam _AVS_REQ_TAGW_VX2 = `MAX(_VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX); - localparam _AVS_REQ_TAGW_CCI = CCI_ADDR_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(CCI_DATA_WIDTH); - localparam _AVS_REQ_TAGW_CCI2 = `MAX(CCI_ADDR_WIDTH, _AVS_REQ_TAGW_CCI); - localparam AVS_REQ_TAGW = `MAX(_AVS_REQ_TAGW_VX2, _AVS_REQ_TAGW_CCI2); + localparam VX_AVS_REQ_TAGW = `VX_MEM_TAG_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(`VX_MEM_DATA_WIDTH); + localparam CCI_AVS_REQ_TAGW = CCI_ADDR_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(CCI_DATA_WIDTH); + localparam VX_AVS_REQ_TAGW2 = `MAX(`VX_MEM_TAG_WIDTH, VX_AVS_REQ_TAGW); 
+ localparam CCI_AVS_REQ_TAGW2 = `MAX(CCI_ADDR_WIDTH, CCI_AVS_REQ_TAGW); + localparam CCI_VX_TAG_WIDTH = `MAX(VX_AVS_REQ_TAGW2, CCI_AVS_REQ_TAGW2); + localparam AVS_TAG_WIDTH = CCI_VX_TAG_WIDTH + 1; // adding the arbiter bit localparam CCI_RD_WINDOW_SIZE = 8; localparam CCI_RW_PENDING_SIZE= 256; @@ -64,6 +73,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ localparam AFU_ID_L = 16'h0002; // AFU ID Lower localparam AFU_ID_H = 16'h0004; // AFU ID Higher + localparam CMD_IDLE = 0; localparam CMD_MEM_READ = `AFU_IMAGE_CMD_MEM_READ; localparam CMD_MEM_WRITE = `AFU_IMAGE_CMD_MEM_WRITE; localparam CMD_DCR_WRITE = `AFU_IMAGE_CMD_DCR_WRITE; @@ -78,7 +88,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ localparam COUT_TID_WIDTH = `CLOG2(`VX_MEM_BYTEEN_WIDTH); localparam COUT_QUEUE_DATAW = COUT_TID_WIDTH + 8; - localparam COUT_QUEUE_SIZE = 64; + localparam COUT_QUEUE_SIZE = 1024; localparam MMIO_DEV_CAPS = `AFU_IMAGE_MMIO_DEV_CAPS; localparam MMIO_ISA_CAPS = `AFU_IMAGE_MMIO_ISA_CAPS; @@ -96,7 +106,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ wire [127:0] afu_id = `AFU_ACCEL_UUID; - wire [63:0] dev_caps = {16'b0, + wire [63:0] dev_caps = {8'b0, + 5'(LMEM_BYTE_ADDR_WIDTH-20), + 3'(`CLOG2(NUM_LOCAL_MEM_BANKS)), 8'(`LMEM_ENABLED ? 
`LMEM_LOG_SIZE : 0), 16'(`NUM_CORES * `NUM_CLUSTERS), 8'(`NUM_WARPS), @@ -109,22 +121,22 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ reg [STATE_WIDTH-1:0] state; - // Vortex ports /////////////////////////////////////////////////////////////// + // Vortex ports /////////////////////////////////////////////////////////// - wire vx_mem_req_valid; - wire vx_mem_req_rw; - wire [`VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen; - wire [`VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr; - wire [`VX_MEM_DATA_WIDTH-1:0] vx_mem_req_data; - wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag; - wire vx_mem_req_ready; + wire vx_mem_req_valid [`VX_MEM_PORTS]; + wire vx_mem_req_rw [`VX_MEM_PORTS]; + wire [`VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen [`VX_MEM_PORTS]; + wire [`VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr [`VX_MEM_PORTS]; + wire [`VX_MEM_DATA_WIDTH-1:0] vx_mem_req_data [`VX_MEM_PORTS]; + wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag [`VX_MEM_PORTS]; + wire vx_mem_req_ready [`VX_MEM_PORTS]; - wire vx_mem_rsp_valid; - wire [`VX_MEM_DATA_WIDTH-1:0] vx_mem_rsp_data; - wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag; - wire vx_mem_rsp_ready; + wire vx_mem_rsp_valid [`VX_MEM_PORTS]; + wire [`VX_MEM_DATA_WIDTH-1:0] vx_mem_rsp_data [`VX_MEM_PORTS]; + wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag [`VX_MEM_PORTS]; + wire vx_mem_rsp_ready [`VX_MEM_PORTS]; - // CMD variables ////////////////////////////////////////////////////////////// + // CMD variables ////////////////////////////////////////////////////////// reg [2:0][63:0] cmd_args; @@ -137,16 +149,14 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ wire [`VX_DCR_ADDR_WIDTH-1:0] cmd_dcr_addr = `VX_DCR_ADDR_WIDTH'(cmd_args[0]); wire [`VX_DCR_DATA_WIDTH-1:0] cmd_dcr_data = `VX_DCR_DATA_WIDTH'(cmd_args[1]); - // MMIO controller //////////////////////////////////////////////////////////// + // MMIO controller //////////////////////////////////////////////////////// - t_ccip_c0_ReqMmioHdr 
mmio_hdr; - assign mmio_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr); - `UNUSED_VAR (mmio_hdr) + t_ccip_c0_ReqMmioHdr mmio_req_hdr; + assign mmio_req_hdr = t_ccip_c0_ReqMmioHdr'(cp2af_sRxPort.c0.hdr[$bits(t_ccip_c0_ReqMmioHdr)-1:0]); + `UNUSED_VAR (mmio_req_hdr) - `STATIC_ASSERT(($bits(t_ccip_c0_ReqMmioHdr)-$bits(mmio_hdr.address)) == 12, ("Oops!")) - - t_if_ccip_c2_Tx mmio_tx; - assign af2cp_sTxPort.c2 = mmio_tx; + t_if_ccip_c2_Tx mmio_rsp; + assign af2cp_sTxPort.c2 = mmio_rsp; `ifdef SCOPE @@ -170,41 +180,66 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ if (reset) begin cmd_scope_reading <= 0; cmd_scope_writing <= 0; - scope_bus_in <= 0; + scope_bus_in <= 0; end else begin + scope_bus_in <= 0; if (scope_bus_out) begin cmd_scope_reading <= 1; scope_bus_ctr <= 63; end - scope_bus_in <= 0; if (cp2af_sRxPort.c0.mmioWrValid - && (MMIO_SCOPE_WRITE == mmio_hdr.address)) begin + && (MMIO_SCOPE_WRITE == mmio_req_hdr.address)) begin cmd_scope_wdata <= 64'(cp2af_sRxPort.c0.data); cmd_scope_writing <= 1; scope_bus_ctr <= 63; scope_bus_in <= 1; end - end - if (cmd_scope_writing) begin - scope_bus_in <= 1'(cmd_scope_wdata >> scope_bus_ctr); - scope_bus_ctr <= scope_bus_ctr - 1; - if (scope_bus_ctr == 0) begin - cmd_scope_writing <= 0; + if (cmd_scope_writing) begin + scope_bus_in <= cmd_scope_wdata[scope_bus_ctr]; + scope_bus_ctr <= scope_bus_ctr - 6'd1; + if (scope_bus_ctr == 0) begin + cmd_scope_writing <= 0; + scope_bus_ctr <= 0; + end end - end - if (cmd_scope_reading) begin - cmd_scope_rdata <= {cmd_scope_rdata[62:0], scope_bus_out}; - scope_bus_ctr <= scope_bus_ctr - 1; - if (scope_bus_ctr == 0) begin - cmd_scope_reading <= 0; + if (cmd_scope_reading) begin + cmd_scope_rdata <= {cmd_scope_rdata[62:0], scope_bus_out}; + scope_bus_ctr <= scope_bus_ctr - 6'd1; + if (scope_bus_ctr == 0) begin + cmd_scope_reading <= 0; + scope_bus_ctr <= 0; + end end end end `endif - wire [COUT_QUEUE_DATAW-1:0] cout_q_dout; - wire cout_q_full, 
cout_q_empty; + // Console output queue read ////////////////////////////////////////////// + + wire [`VX_MEM_PORTS-1:0][COUT_QUEUE_DATAW-1:0] cout_q_dout; + wire [`VX_MEM_PORTS-1:0] cout_q_full, cout_q_empty, cout_q_pop; + + reg [MEM_PORTS_WIDTH-1:0] cout_q_id; + + always @(posedge clk) begin + if (reset) begin + cout_q_id <= 0; + end else begin + if (cp2af_sRxPort.c0.mmioRdValid && mmio_req_hdr.address == MMIO_STATUS) begin + cout_q_id <= cout_q_id + 1; + end + end + end + + for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_cout_q_pop + assign cout_q_pop[i] = (cp2af_sRxPort.c0.mmioRdValid && mmio_req_hdr.address == MMIO_STATUS) + && (cout_q_id == i) + && ~cout_q_empty[i]; + end + + wire [COUT_QUEUE_DATAW-1:0] cout_q_dout_s = cout_q_dout[cout_q_id] & {COUT_QUEUE_DATAW{!cout_q_empty[cout_q_id]}}; + wire cout_q_empty_all = & cout_q_empty; `ifdef SIMULATION `ifndef VERILATOR @@ -226,60 +261,23 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ `endif `endif + // MMIO controller //////////////////////////////////////////////////////// + + // Handle MMIO read requests always @(posedge clk) begin if (reset) begin - mmio_tx.mmioRdValid <= 0; - mmio_tx.hdr <= '0; + mmio_rsp.mmioRdValid <= 0; + cout_q_id <= 0; end else begin - mmio_tx.mmioRdValid <= cp2af_sRxPort.c0.mmioRdValid; - mmio_tx.hdr.tid <= mmio_hdr.tid; - end - // serve MMIO write request - if (cp2af_sRxPort.c0.mmioWrValid) begin - case (mmio_hdr.address) - MMIO_CMD_ARG0: begin - cmd_args[0] <= 64'(cp2af_sRxPort.c0.data); - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))); - `endif - end - MMIO_CMD_ARG1: begin - cmd_args[1] <= 64'(cp2af_sRxPort.c0.data); - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))); - `endif - end - MMIO_CMD_ARG2: begin - cmd_args[2] <= 64'(cp2af_sRxPort.c0.data); - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_ARG2: data=%0d\n", $time, 
64'(cp2af_sRxPort.c0.data))); - `endif - end - MMIO_CMD_TYPE: begin - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_CMD_TYPE: data=%0d\n", $time, 64'(cp2af_sRxPort.c0.data))); - `endif - end - `ifdef SCOPE - MMIO_SCOPE_WRITE: begin - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%h\n", $time, cmd_scope_wdata)); - `endif - end - `endif - default: begin - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data))); - `endif - end - endcase + mmio_rsp.mmioRdValid <= cp2af_sRxPort.c0.mmioRdValid; end - // serve MMIO read requests + mmio_rsp.hdr.tid <= mmio_req_hdr.tid; + if (cp2af_sRxPort.c0.mmioRdValid) begin - case (mmio_hdr.address) + case (mmio_req_hdr.address) // AFU header - 16'h0000: mmio_tx.data <= { + 16'h0000: mmio_rsp.data <= { 4'b0001, // Feature type = AFU 8'b0, // reserved 4'b0, // afu minor revision = 0 @@ -289,105 +287,139 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ 4'b0, // afu major revision = 0 12'b0 // feature ID = 0 }; - AFU_ID_L: mmio_tx.data <= afu_id[63:0]; // afu id low - AFU_ID_H: mmio_tx.data <= afu_id[127:64]; // afu id hi - 16'h0006: mmio_tx.data <= 64'h0; // next AFU - 16'h0008: mmio_tx.data <= 64'h0; // reserved + AFU_ID_L: mmio_rsp.data <= afu_id[63:0]; // afu id low + AFU_ID_H: mmio_rsp.data <= afu_id[127:64]; // afu id hi + 16'h0006: mmio_rsp.data <= 64'h0; // next AFU + 16'h0008: mmio_rsp.data <= 64'h0; // reserved MMIO_STATUS: begin - mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)}); + mmio_rsp.data <= 64'({cout_q_dout_s, ~cout_q_empty_all, 8'(state)}); `ifdef DBG_TRACE_AFU - if (state != STATE_WIDTH'(mmio_tx.data)) begin - `TRACE(2, ("%d: MMIO_STATUS: addr=0x%0h, state=%0d\n", $time, mmio_hdr.address, state)); + if (state != STATE_WIDTH'(mmio_rsp.data)) begin + `TRACE(2, ("%t: AFU: MMIO_STATUS: addr=0x%0h, state=%0d\n", $time, mmio_req_hdr.address, state)) end `endif end `ifdef SCOPE 
MMIO_SCOPE_READ: begin - mmio_tx.data <= cmd_scope_rdata; + mmio_rsp.data <= cmd_scope_rdata; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata)); + `TRACE(2, ("%t: AFU: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata)) `endif end `endif MMIO_DEV_CAPS: begin - mmio_tx.data <= dev_caps; + mmio_rsp.data <= dev_caps; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps)); + `TRACE(2, ("%t: AFU: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps)) `endif end MMIO_ISA_CAPS: begin - mmio_tx.data <= isa_caps; + mmio_rsp.data <= isa_caps; `ifdef DBG_TRACE_AFU - if (state != STATE_WIDTH'(mmio_tx.data)) begin - `TRACE(2, ("%d: MMIO_ISA_CAPS: data=%0d\n", $time, isa_caps)); + if (state != STATE_WIDTH'(mmio_rsp.data)) begin + `TRACE(2, ("%t: AFU: MMIO_ISA_CAPS: data=%0d\n", $time, isa_caps)) end `endif end default: begin - mmio_tx.data <= 64'h0; + mmio_rsp.data <= 64'h0; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: Unknown MMIO Rd: addr=0x%0h\n", $time, mmio_hdr.address)); + `TRACE(2, ("%t: AFU: Unknown MMIO Rd: addr=0x%0h\n", $time, mmio_req_hdr.address)) `endif end endcase end end - // COMMAND FSM //////////////////////////////////////////////////////////////// + // Handle MMIO write requests + always @(posedge clk) begin + if (cp2af_sRxPort.c0.mmioWrValid) begin + case (mmio_req_hdr.address) + MMIO_CMD_ARG0: begin + cmd_args[0] <= 64'(cp2af_sRxPort.c0.data); + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))) + `endif + end + MMIO_CMD_ARG1: begin + cmd_args[1] <= 64'(cp2af_sRxPort.c0.data); + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))) + `endif + end + MMIO_CMD_ARG2: begin + cmd_args[2] <= 64'(cp2af_sRxPort.c0.data); + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: MMIO_CMD_ARG2: data=%0d\n", $time, 64'(cp2af_sRxPort.c0.data))) + `endif + end + MMIO_CMD_TYPE: begin + `ifdef DBG_TRACE_AFU 
+ `TRACE(2, ("%t: AFU: MMIO_CMD_TYPE: data=%0d\n", $time, 64'(cp2af_sRxPort.c0.data))) + `endif + end + `ifdef SCOPE + MMIO_SCOPE_WRITE: begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: MMIO_SCOPE_WRITE: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data))) + `endif + end + `endif + default: begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_req_hdr.address, 64'(cp2af_sRxPort.c0.data))) + `endif + end + endcase + end + end + + // COMMAND FSM //////////////////////////////////////////////////////////// wire cmd_mem_rd_done; reg cmd_mem_wr_done; + reg [RESET_CTR_WIDTH-1:0] vx_reset_ctr; reg vx_busy_wait; - reg vx_running; + reg vx_reset = 1; // asserted at initialization wire vx_busy; - reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr; - always @(posedge clk) begin - if (state == STATE_RUN) begin - vx_reset_ctr <= vx_reset_ctr + $bits(vx_reset_ctr)'(1); - end else begin - vx_reset_ctr <= '0; - end - end - - wire is_mmio_wr_cmd = cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_hdr.address); - wire [CMD_TYPE_WIDTH-1:0] cmd_type = is_mmio_wr_cmd ? - CMD_TYPE_WIDTH'(cp2af_sRxPort.c0.data) : CMD_TYPE_WIDTH'(0); + wire is_mmio_wr_cmd = cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_req_hdr.address); + wire [CMD_TYPE_WIDTH-1:0] cmd_type = is_mmio_wr_cmd ? 
CMD_TYPE_WIDTH'(cp2af_sRxPort.c0.data) : CMD_TYPE_WIDTH'(CMD_IDLE); always @(posedge clk) begin if (reset) begin - state <= STATE_IDLE; - vx_busy_wait <= 0; - vx_running <= 0; + state <= STATE_IDLE; + vx_reset <= 1; end else begin case (state) STATE_IDLE: begin case (cmd_type) CMD_MEM_READ: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: STATE MEM_READ: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size)); + `TRACE(2, ("%t: AFU: Goto STATE MEM_READ: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size)) `endif state <= STATE_MEM_READ; end CMD_MEM_WRITE: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: STATE MEM_WRITE: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size)); + `TRACE(2, ("%t: AFU: Goto STATE MEM_WRITE: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size)) `endif state <= STATE_MEM_WRITE; end CMD_DCR_WRITE: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: STATE DCR_WRITE: addr=0x%0h data=%0d\n", $time, cmd_dcr_addr, cmd_dcr_data)); + `TRACE(2, ("%t: AFU: Goto STATE DCR_WRITE: addr=0x%0h data=%0d\n", $time, cmd_dcr_addr, cmd_dcr_data)) `endif state <= STATE_DCR_WRITE; end CMD_RUN: begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: STATE RUN\n", $time)); + `TRACE(2, ("%t: AFU: Goto STATE RUN\n", $time)) `endif state <= STATE_RUN; - vx_running <= 0; + vx_reset_ctr <= RESET_CTR_WIDTH'(`RESET_DELAY-1); + vx_reset <= 1; end default: begin state <= state; @@ -398,58 +430,60 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ if (cmd_mem_rd_done) begin state <= STATE_IDLE; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: STATE IDLE\n", $time)); + `TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time)) `endif end end STATE_MEM_WRITE: begin if (cmd_mem_wr_done) begin state <= STATE_IDLE; - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: STATE IDLE\n", $time)); - `endif end end STATE_DCR_WRITE: begin state <= STATE_IDLE; `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: 
STATE IDLE\n", $time)); + `TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time)) `endif end STATE_RUN: begin - if (vx_running) begin - if (vx_busy_wait) begin - // wait until the gpu goes busy - if (vx_busy) begin - vx_busy_wait <= 0; - end - end else begin - // wait until the gpu is not busy - if (~vx_busy) begin - state <= STATE_IDLE; - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: AFU: End execution\n", $time)); - `TRACE(2, ("%d: STATE IDLE\n", $time)); - `endif - end - end + if (vx_reset) begin + // wait until the reset network is ready + if (vx_reset_ctr == RESET_CTR_WIDTH'(0)) begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: Begin execution\n", $time)) + `endif + vx_busy_wait <= 1; + vx_reset <= 0; + end end else begin - // wait until the reset sequence is complete - if (vx_reset_ctr == (`RESET_DELAY-1)) begin - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: AFU: Begin execution\n", $time)); - `endif - vx_running <= 1; - vx_busy_wait <= 1; - end + if (vx_busy_wait) begin + // wait until processor goes busy + if (vx_busy) begin + vx_busy_wait <= 0; + end + end else begin + // wait until the processor is not busy + if (~vx_busy) begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: End execution\n", $time)) + `TRACE(2, ("%t: AFU: Goto STATE IDLE\n", $time)) + `endif + state <= STATE_IDLE; + end + end end end default:; endcase + + // ensure reset network initialization + if (vx_reset_ctr != RESET_CTR_WIDTH'(0)) begin + vx_reset_ctr <= vx_reset_ctr - RESET_CTR_WIDTH'(1); + end end end - // AVS Controller ///////////////////////////////////////////////////////////// + // AVS Controller ///////////////////////////////////////////////////////// wire cci_mem_rd_req_valid; wire cci_mem_wr_req_valid; @@ -467,28 +501,80 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ wire [CCI_ADDR_WIDTH-1:0] cci_mem_rsp_tag; wire cci_mem_rsp_ready; - //-- + // adjust VX memory interface to be compatible with CCI VX_mem_bus_if #( .DATA_SIZE (LMEM_DATA_SIZE), - 
.ADDR_WIDTH (LMEM_ADDR_WIDTH), - .TAG_WIDTH (AVS_REQ_TAGW) - ) cci_vx_mem_bus_if[2](); + .ADDR_WIDTH (CCI_VX_ADDR_WIDTH), + .TAG_WIDTH (CCI_VX_TAG_WIDTH) + ) vx_mem_bus_if[`VX_MEM_PORTS](); - `RESET_RELAY (cci_adapter_reset, reset); + wire [`VX_MEM_PORTS-1:0] vx_mem_req_valid_qual; + wire [`VX_MEM_PORTS-1:0] vx_mem_req_ready_qual; - VX_mem_adapter #( + for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_vx_mem_adapter + VX_mem_data_adapter #( + .SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH), + .DST_DATA_WIDTH (LMEM_DATA_WIDTH), + .SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH), + .DST_ADDR_WIDTH (CCI_VX_ADDR_WIDTH), + .SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH), + .DST_TAG_WIDTH (CCI_VX_TAG_WIDTH), + .REQ_OUT_BUF (0), + .RSP_OUT_BUF (2) + ) vx_mem_data_adapter ( + .clk (clk), + .reset (reset), + + .mem_req_valid_in (vx_mem_req_valid_qual[i]), + .mem_req_addr_in (vx_mem_req_addr[i]), + .mem_req_rw_in (vx_mem_req_rw[i]), + .mem_req_byteen_in (vx_mem_req_byteen[i]), + .mem_req_data_in (vx_mem_req_data[i]), + .mem_req_tag_in (vx_mem_req_tag[i]), + .mem_req_ready_in (vx_mem_req_ready_qual[i]), + + .mem_rsp_valid_in (vx_mem_rsp_valid[i]), + .mem_rsp_data_in (vx_mem_rsp_data[i]), + .mem_rsp_tag_in (vx_mem_rsp_tag[i]), + .mem_rsp_ready_in (vx_mem_rsp_ready[i]), + + .mem_req_valid_out (vx_mem_bus_if[i].req_valid), + .mem_req_addr_out (vx_mem_bus_if[i].req_data.addr), + .mem_req_rw_out (vx_mem_bus_if[i].req_data.rw), + .mem_req_byteen_out (vx_mem_bus_if[i].req_data.byteen), + .mem_req_data_out (vx_mem_bus_if[i].req_data.data), + .mem_req_tag_out (vx_mem_bus_if[i].req_data.tag), + .mem_req_ready_out (vx_mem_bus_if[i].req_ready), + + .mem_rsp_valid_out (vx_mem_bus_if[i].rsp_valid), + .mem_rsp_data_out (vx_mem_bus_if[i].rsp_data.data), + .mem_rsp_tag_out (vx_mem_bus_if[i].rsp_data.tag), + .mem_rsp_ready_out (vx_mem_bus_if[i].rsp_ready) + ); + assign vx_mem_bus_if[i].req_data.flags = '0; + end + + // adjust CCI memory interface to be compatible with VX + + VX_mem_bus_if #( + .DATA_SIZE (LMEM_DATA_SIZE), + 
.ADDR_WIDTH (CCI_VX_ADDR_WIDTH), + .TAG_WIDTH (CCI_VX_TAG_WIDTH) + ) cci_vx_mem_arb_in_if[2](); + + VX_mem_data_adapter #( .SRC_DATA_WIDTH (CCI_DATA_WIDTH), .DST_DATA_WIDTH (LMEM_DATA_WIDTH), .SRC_ADDR_WIDTH (CCI_ADDR_WIDTH), - .DST_ADDR_WIDTH (LMEM_ADDR_WIDTH), + .DST_ADDR_WIDTH (CCI_VX_ADDR_WIDTH), .SRC_TAG_WIDTH (CCI_ADDR_WIDTH), - .DST_TAG_WIDTH (AVS_REQ_TAGW), + .DST_TAG_WIDTH (CCI_VX_TAG_WIDTH), .REQ_OUT_BUF (0), .RSP_OUT_BUF (0) - ) cci_mem_adapter ( + ) cci_mem_data_adapter ( .clk (clk), - .reset (cci_adapter_reset), + .reset (reset), .mem_req_valid_in (cci_mem_req_valid), .mem_req_addr_in (cci_mem_req_addr), @@ -503,129 +589,125 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ .mem_rsp_tag_in (cci_mem_rsp_tag), .mem_rsp_ready_in (cci_mem_rsp_ready), - .mem_req_valid_out (cci_vx_mem_bus_if[1].req_valid), - .mem_req_addr_out (cci_vx_mem_bus_if[1].req_data.addr), - .mem_req_rw_out (cci_vx_mem_bus_if[1].req_data.rw), - .mem_req_byteen_out (cci_vx_mem_bus_if[1].req_data.byteen), - .mem_req_data_out (cci_vx_mem_bus_if[1].req_data.data), - .mem_req_tag_out (cci_vx_mem_bus_if[1].req_data.tag), - .mem_req_ready_out (cci_vx_mem_bus_if[1].req_ready), + .mem_req_valid_out (cci_vx_mem_arb_in_if[1].req_valid), + .mem_req_addr_out (cci_vx_mem_arb_in_if[1].req_data.addr), + .mem_req_rw_out (cci_vx_mem_arb_in_if[1].req_data.rw), + .mem_req_byteen_out (cci_vx_mem_arb_in_if[1].req_data.byteen), + .mem_req_data_out (cci_vx_mem_arb_in_if[1].req_data.data), + .mem_req_tag_out (cci_vx_mem_arb_in_if[1].req_data.tag), + .mem_req_ready_out (cci_vx_mem_arb_in_if[1].req_ready), - .mem_rsp_valid_out (cci_vx_mem_bus_if[1].rsp_valid), - .mem_rsp_data_out (cci_vx_mem_bus_if[1].rsp_data.data), - .mem_rsp_tag_out (cci_vx_mem_bus_if[1].rsp_data.tag), - .mem_rsp_ready_out (cci_vx_mem_bus_if[1].rsp_ready) + .mem_rsp_valid_out (cci_vx_mem_arb_in_if[1].rsp_valid), + .mem_rsp_data_out (cci_vx_mem_arb_in_if[1].rsp_data.data), + .mem_rsp_tag_out 
(cci_vx_mem_arb_in_if[1].rsp_data.tag), + .mem_rsp_ready_out (cci_vx_mem_arb_in_if[1].rsp_ready) ); + assign cci_vx_mem_arb_in_if[1].req_data.flags = '0; - assign cci_vx_mem_bus_if[1].req_data.atype = '0; - `UNUSED_VAR (cci_vx_mem_bus_if[1].req_data.atype) + // arbitrate between CCI and VX memory interfaces - //-- + `ASSIGN_VX_MEM_BUS_IF(cci_vx_mem_arb_in_if[0], vx_mem_bus_if[0]); - wire vx_mem_is_cout; - wire vx_mem_req_valid_qual; - wire vx_mem_req_ready_qual; - - assign vx_mem_req_valid_qual = vx_mem_req_valid && ~vx_mem_is_cout; - - `RESET_RELAY (vx_adapter_reset, reset); - - VX_mem_adapter #( - .SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH), - .DST_DATA_WIDTH (LMEM_DATA_WIDTH), - .SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH), - .DST_ADDR_WIDTH (LMEM_ADDR_WIDTH), - .SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH), - .DST_TAG_WIDTH (AVS_REQ_TAGW), - .REQ_OUT_BUF (0), - .RSP_OUT_BUF (2) - ) vx_mem_adapter ( - .clk (clk), - .reset (vx_adapter_reset), - - .mem_req_valid_in (vx_mem_req_valid_qual), - .mem_req_addr_in (vx_mem_req_addr), - .mem_req_rw_in (vx_mem_req_rw), - .mem_req_byteen_in (vx_mem_req_byteen), - .mem_req_data_in (vx_mem_req_data), - .mem_req_tag_in (vx_mem_req_tag), - .mem_req_ready_in (vx_mem_req_ready_qual), - - .mem_rsp_valid_in (vx_mem_rsp_valid), - .mem_rsp_data_in (vx_mem_rsp_data), - .mem_rsp_tag_in (vx_mem_rsp_tag), - .mem_rsp_ready_in (vx_mem_rsp_ready), - - .mem_req_valid_out (cci_vx_mem_bus_if[0].req_valid), - .mem_req_addr_out (cci_vx_mem_bus_if[0].req_data.addr), - .mem_req_rw_out (cci_vx_mem_bus_if[0].req_data.rw), - .mem_req_byteen_out (cci_vx_mem_bus_if[0].req_data.byteen), - .mem_req_data_out (cci_vx_mem_bus_if[0].req_data.data), - .mem_req_tag_out (cci_vx_mem_bus_if[0].req_data.tag), - .mem_req_ready_out (cci_vx_mem_bus_if[0].req_ready), - - .mem_rsp_valid_out (cci_vx_mem_bus_if[0].rsp_valid), - .mem_rsp_data_out (cci_vx_mem_bus_if[0].rsp_data.data), - .mem_rsp_tag_out (cci_vx_mem_bus_if[0].rsp_data.tag), - .mem_rsp_ready_out (cci_vx_mem_bus_if[0].rsp_ready) - ); 
- - assign cci_vx_mem_bus_if[0].req_data.atype = '0; - `UNUSED_VAR (cci_vx_mem_bus_if[0].req_data.atype) - - //-- VX_mem_bus_if #( .DATA_SIZE (LMEM_DATA_SIZE), - .ADDR_WIDTH (LMEM_ADDR_WIDTH), - .TAG_WIDTH (AVS_REQ_TAGW+1) - ) mem_bus_if[1](); + .ADDR_WIDTH (CCI_VX_ADDR_WIDTH), + .TAG_WIDTH (AVS_TAG_WIDTH) + ) cci_vx_mem_arb_out_if[1](); VX_mem_arb #( .NUM_INPUTS (2), + .NUM_OUTPUTS (1), .DATA_SIZE (LMEM_DATA_SIZE), - .ADDR_WIDTH (LMEM_ADDR_WIDTH), - .TAG_WIDTH (AVS_REQ_TAGW), + .ADDR_WIDTH (CCI_VX_ADDR_WIDTH), + .TAG_WIDTH (CCI_VX_TAG_WIDTH), .ARBITER ("P"), // prioritize VX requests .REQ_OUT_BUF (0), .RSP_OUT_BUF (0) ) mem_arb ( .clk (clk), .reset (reset), - .bus_in_if (cci_vx_mem_bus_if), - .bus_out_if (mem_bus_if) + .bus_in_if (cci_vx_mem_arb_in_if), + .bus_out_if (cci_vx_mem_arb_out_if) ); + `UNUSED_VAR (cci_vx_mem_arb_out_if[0].req_data.flags) - //-- + // final merged memory interface + wire mem_req_valid [`VX_MEM_PORTS]; + wire mem_req_rw [`VX_MEM_PORTS]; + wire [CCI_VX_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS]; + wire [LMEM_DATA_SIZE-1:0] mem_req_byteen [`VX_MEM_PORTS]; + wire [LMEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS]; + wire [AVS_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS]; + wire mem_req_ready [`VX_MEM_PORTS]; - `RESET_RELAY (avs_adapter_reset, reset); + wire mem_rsp_valid [`VX_MEM_PORTS]; + wire [LMEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS]; + wire [AVS_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS]; + wire mem_rsp_ready [`VX_MEM_PORTS]; + for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_mem_bus_if + if (i == 0) begin : g_i0 + // assign port0 to CCI/VX arbiter + assign mem_req_valid[i] = cci_vx_mem_arb_out_if[i].req_valid; + assign mem_req_rw[i] = cci_vx_mem_arb_out_if[i].req_data.rw; + assign mem_req_addr[i] = cci_vx_mem_arb_out_if[i].req_data.addr; + assign mem_req_byteen[i]= cci_vx_mem_arb_out_if[i].req_data.byteen; + assign mem_req_data[i] = cci_vx_mem_arb_out_if[i].req_data.data; + assign mem_req_tag[i] = 
cci_vx_mem_arb_out_if[i].req_data.tag; + assign cci_vx_mem_arb_out_if[i].req_ready = mem_req_ready[i]; + + assign cci_vx_mem_arb_out_if[i].rsp_valid = mem_rsp_valid[i]; + assign cci_vx_mem_arb_out_if[i].rsp_data.data = mem_rsp_data[i]; + assign cci_vx_mem_arb_out_if[i].rsp_data.tag = mem_rsp_tag[i]; + assign mem_rsp_ready[i] = cci_vx_mem_arb_out_if[i].rsp_ready; + end else begin : g_i + // assign other ports to VX memory bus + assign mem_req_valid[i] = vx_mem_bus_if[i].req_valid; + assign mem_req_rw[i] = vx_mem_bus_if[i].req_data.rw; + assign mem_req_addr[i] = vx_mem_bus_if[i].req_data.addr; + assign mem_req_byteen[i]= vx_mem_bus_if[i].req_data.byteen; + assign mem_req_data[i] = vx_mem_bus_if[i].req_data.data; + assign mem_req_tag[i] = AVS_TAG_WIDTH'(vx_mem_bus_if[i].req_data.tag); + assign vx_mem_bus_if[i].req_ready = mem_req_ready[i]; + + assign vx_mem_bus_if[i].rsp_valid = mem_rsp_valid[i]; + assign vx_mem_bus_if[i].rsp_data.data = mem_rsp_data[i]; + assign vx_mem_bus_if[i].rsp_data.tag = CCI_VX_TAG_WIDTH'(mem_rsp_tag[i]); + assign mem_rsp_ready[i] = vx_mem_bus_if[i].rsp_ready; + end + end + + // convert merged memory interface to AVS VX_avs_adapter #( .DATA_WIDTH (LMEM_DATA_WIDTH), - .ADDR_WIDTH (LMEM_ADDR_WIDTH), + .ADDR_WIDTH_IN (CCI_VX_ADDR_WIDTH), + .ADDR_WIDTH_OUT(LMEM_ADDR_WIDTH), .BURST_WIDTH (LMEM_BURST_CTRW), - .NUM_BANKS (NUM_LOCAL_MEM_BANKS), - .TAG_WIDTH (AVS_REQ_TAGW + 1), + .NUM_PORTS_IN (`VX_MEM_PORTS), + .NUM_BANKS_OUT (NUM_LOCAL_MEM_BANKS), + .TAG_WIDTH (AVS_TAG_WIDTH), .RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE), - .REQ_OUT_BUF (2), - .RSP_OUT_BUF (0) + .INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE), + .REQ_OUT_BUF (2), // always needed due to CCI/VX arbiter + .RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || NUM_LOCAL_MEM_BANKS > 1) ? 
2 : 0) ) avs_adapter ( .clk (clk), - .reset (avs_adapter_reset), + .reset (reset), // Memory request - .mem_req_valid (mem_bus_if[0].req_valid), - .mem_req_rw (mem_bus_if[0].req_data.rw), - .mem_req_byteen (mem_bus_if[0].req_data.byteen), - .mem_req_addr (mem_bus_if[0].req_data.addr), - .mem_req_data (mem_bus_if[0].req_data.data), - .mem_req_tag (mem_bus_if[0].req_data.tag), - .mem_req_ready (mem_bus_if[0].req_ready), + .mem_req_valid (mem_req_valid), + .mem_req_rw (mem_req_rw), + .mem_req_byteen (mem_req_byteen), + .mem_req_addr (mem_req_addr), + .mem_req_data (mem_req_data), + .mem_req_tag (mem_req_tag), + .mem_req_ready (mem_req_ready), // Memory response - .mem_rsp_valid (mem_bus_if[0].rsp_valid), - .mem_rsp_data (mem_bus_if[0].rsp_data.data), - .mem_rsp_tag (mem_bus_if[0].rsp_data.tag), - .mem_rsp_ready (mem_bus_if[0].rsp_ready), + .mem_rsp_valid (mem_rsp_valid), + .mem_rsp_data (mem_rsp_data), + .mem_rsp_tag (mem_rsp_tag), + .mem_rsp_ready (mem_rsp_ready), // AVS bus .avs_writedata (avs_writedata), @@ -639,10 +721,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ .avs_readdatavalid(avs_readdatavalid) ); - assign mem_bus_if[0].req_data.atype = '0; - `UNUSED_VAR (mem_bus_if[0].req_data.atype) - - // CCI-P Read Request /////////////////////////////////////////////////////////// + // CCI-P Read Request ///////////////////////////////////////////////////// reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr; wire [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr; @@ -748,7 +827,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_rd_req_addr <= cci_rd_req_addr + 1; cci_rd_req_ctr <= cci_rd_req_ctr + $bits(cci_rd_req_ctr)'(1); `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Rd Req: addr=0x%0h, tag=0x%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads)); + `TRACE(2, ("%t: AFU: CCI Rd Req: addr=0x%0h, tag=0x%0h, rem=%0d, pending=%0d\n", $time, 
cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads)) `endif end @@ -758,13 +837,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE); end `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data)); + `TRACE(2, ("%t: AFU: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data)) `endif end if (cci_rdq_pop) begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads)); + `TRACE(2, ("%t: AFU: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads)) `endif end @@ -811,7 +890,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ end ) - // CCI-P Write Request ////////////////////////////////////////////////////////// + // CCI-P Write Request //////////////////////////////////////////////////// reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_ctr; reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_addr; @@ -858,14 +937,11 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ `UNUSED_VAR (cci_pending_writes) - assign cci_mem_rd_req_valid = (STATE_MEM_READ == state) - && ~cci_mem_rd_req_done; + assign cci_mem_rd_req_valid = (STATE_MEM_READ == state) && ~cci_mem_rd_req_done; - assign cci_mem_rsp_ready = ~cp2af_sRxPort.c1TxAlmFull - && ~cci_pending_writes_full; + assign cci_mem_rsp_ready = ~cp2af_sRxPort.c1TxAlmFull && ~cci_pending_writes_full; - assign cmd_mem_rd_done = cci_wr_req_done - && cci_pending_writes_empty; + assign cmd_mem_rd_done = cci_wr_req_done && cci_pending_writes_empty; // Send write requests to CCI always @(posedge clk) begin @@ -902,13 +978,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_wr_req_done <= 1; end `ifdef DBG_TRACE_AFU - 
`TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data)); + `TRACE(2, ("%t: AFU: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data)) `endif end if (cci_wr_rsp_fire) begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes)); + `TRACE(2, ("%t: AFU: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes)) `endif end end @@ -924,19 +1000,19 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ assign cci_mem_req_data = cci_rdq_dout[CCI_RD_QUEUE_DATAW-1:CCI_ADDR_WIDTH]; assign cci_mem_req_tag = cci_mem_req_rw ? cci_mem_wr_req_ctr : cci_mem_rd_req_ctr; - // Vortex /////////////////////////////////////////////////////////////////// + // Vortex ///////////////////////////////////////////////////////////////// - wire vx_dcr_wr_valid = (STATE_DCR_WRITE == state); - wire [`VX_DCR_ADDR_WIDTH-1:0] vx_dcr_wr_addr = cmd_dcr_addr; - wire [`VX_DCR_DATA_WIDTH-1:0] vx_dcr_wr_data = cmd_dcr_data; + wire vx_dcr_wr_valid = (STATE_DCR_WRITE == state); + wire [`VX_DCR_ADDR_WIDTH-1:0] vx_dcr_wr_addr = cmd_dcr_addr; + wire [`VX_DCR_DATA_WIDTH-1:0] vx_dcr_wr_data = cmd_dcr_data; - `SCOPE_IO_SWITCH (2) + `SCOPE_IO_SWITCH (2); Vortex vortex ( `SCOPE_IO_BIND (1) .clk (clk), - .reset (reset || ~vx_running), + .reset (vx_reset), // Memory request .mem_req_valid (vx_mem_req_valid), @@ -962,106 +1038,107 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ .busy (vx_busy) ); - // COUT HANDLING ////////////////////////////////////////////////////////////// + // COUT HANDLING ////////////////////////////////////////////////////////// - wire [COUT_TID_WIDTH-1:0] cout_tid; + for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_cout - VX_onehot_encoder #( - .N (`VX_MEM_BYTEEN_WIDTH) - ) cout_tid_enc 
( - .data_in (vx_mem_req_byteen), - .data_out (cout_tid), - `UNUSED_PIN (valid_out) - ); + wire [COUT_TID_WIDTH-1:0] cout_tid; - wire [`VX_MEM_ADDR_WIDTH-1:0] io_cout_addr_b = `VX_MEM_ADDR_WIDTH'(`IO_COUT_ADDR >> `CLOG2(`MEM_BLOCK_SIZE)); + VX_onehot_encoder #( + .N (`VX_MEM_BYTEEN_WIDTH) + ) cout_tid_enc ( + .data_in (vx_mem_req_byteen[i]), + .data_out (cout_tid), + `UNUSED_PIN (valid_out) + ); - assign vx_mem_is_cout = (vx_mem_req_addr == io_cout_addr_b); + wire [`VX_MEM_BYTEEN_WIDTH-1:0][7:0] vx_mem_req_data_m = vx_mem_req_data[i]; - assign vx_mem_req_ready = vx_mem_is_cout ? ~cout_q_full : vx_mem_req_ready_qual; + wire [7:0] cout_char = vx_mem_req_data_m[cout_tid]; - wire [`VX_MEM_BYTEEN_WIDTH-1:0][7:0] vx_mem_req_data_m = vx_mem_req_data; + wire [`VX_MEM_ADDR_WIDTH-1:0] io_cout_addr_b = `VX_MEM_ADDR_WIDTH'(`IO_COUT_ADDR >> `CLOG2(`MEM_BLOCK_SIZE)); - wire [7:0] cout_char = vx_mem_req_data_m[cout_tid]; + wire vx_mem_is_cout = (vx_mem_req_addr[i] == io_cout_addr_b); - wire cout_q_push = vx_mem_req_valid && vx_mem_is_cout && ~cout_q_full; + assign vx_mem_req_valid_qual[i] = vx_mem_req_valid[i] && ~vx_mem_is_cout; + assign vx_mem_req_ready[i] = vx_mem_is_cout ? 
~cout_q_full[i] : vx_mem_req_ready_qual[i]; - wire cout_q_pop = cp2af_sRxPort.c0.mmioRdValid - && (mmio_hdr.address == MMIO_STATUS) - && ~cout_q_empty; + wire cout_q_push = vx_mem_req_valid[i] && vx_mem_is_cout && ~cout_q_full[i]; - VX_fifo_queue #( - .DATAW (COUT_QUEUE_DATAW), - .DEPTH (COUT_QUEUE_SIZE) - ) cout_queue ( - .clk (clk), - .reset (reset), - .push (cout_q_push), - .pop (cout_q_pop), - .data_in ({cout_tid, cout_char}), - .data_out (cout_q_dout), - .empty (cout_q_empty), - .full (cout_q_full), - `UNUSED_PIN (alm_empty), - `UNUSED_PIN (alm_full), - `UNUSED_PIN (size) - ); + VX_fifo_queue #( + .DATAW (COUT_QUEUE_DATAW), + .DEPTH (COUT_QUEUE_SIZE) + ) cout_queue ( + .clk (clk), + .reset (reset), + .push (cout_q_push), + .pop (cout_q_pop[i]), + .data_in ({cout_tid, cout_char}), + .data_out (cout_q_dout[i]), + .empty (cout_q_empty[i]), + .full (cout_q_full[i]), + `UNUSED_PIN (alm_empty), + `UNUSED_PIN (alm_full), + `UNUSED_PIN (size) + ); + end - // SCOPE ////////////////////////////////////////////////////////////////////// + // SCOPE ////////////////////////////////////////////////////////////////// `ifdef DBG_SCOPE_AFU - wire mem_req_fire = mem_bus_if[0].req_valid && mem_bus_if[0].req_ready; - wire mem_rsp_fire = mem_bus_if[0].rsp_valid && mem_bus_if[0].rsp_ready; - wire avs_write_fire = avs_write[0] && ~avs_waitrequest[0]; - wire avs_read_fire = avs_read[0] && ~avs_waitrequest[0]; - wire [$bits(t_local_mem_addr)-1:0] mem_bus_if_addr = mem_bus_if[0].req_data.addr; - reg [STATE_WIDTH-1:0] state_prev; always @(posedge clk) begin state_prev <= state; end - wire state_changed = (state != state_prev); - - VX_scope_tap #( - .SCOPE_ID (0), - .TRIGGERW (24), - .PROBEW (431) - ) scope_tap ( - .clk(clk), - .reset(scope_reset_w[0]), - .start(1'b0), - .stop(1'b0), - .triggers({ - reset, - state_changed, - mem_req_fire, - mem_rsp_fire, - avs_write_fire, - avs_read_fire, + wire state_changed = (state != state_prev); + wire vx_mem_req_fire = vx_mem_req_valid[0] && 
vx_mem_req_ready[0]; + wire vx_mem_rsp_fire = vx_mem_rsp_valid[0] && vx_mem_rsp_ready[0]; + wire avs_req_fire = (avs_write[0] || avs_read[0]) && ~avs_waitrequest[0]; + wire reset_negedge; + `NEG_EDGE (reset_negedge, reset); + `SCOPE_TAP (0, 0, { + vx_reset, + vx_busy, + vx_mem_req_valid[0], + vx_mem_req_ready[0], + vx_mem_rsp_valid[0], + vx_mem_rsp_ready[0], + avs_read[0], + avs_write[0], avs_waitrequest[0], - avs_readdatavalid[0], - cp2af_sRxPort.c0.mmioRdValid, - cp2af_sRxPort.c0.mmioWrValid, cp2af_sRxPort.c0.rspValid, cp2af_sRxPort.c1.rspValid, af2cp_sTxPort.c0.valid, af2cp_sTxPort.c1.valid, cp2af_sRxPort.c0TxAlmFull, - cp2af_sRxPort.c1TxAlmFull, - af2cp_sTxPort.c2.mmioRdValid, - cci_wr_req_fire, - cci_wr_rsp_fire, + cp2af_sRxPort.c1TxAlmFull + },{ + state_changed, + vx_dcr_wr_valid, // ack-free + avs_readdatavalid[0], // ack-free + cp2af_sRxPort.c0.mmioRdValid, // ack-free + cp2af_sRxPort.c0.mmioWrValid, // ack-free + af2cp_sTxPort.c2.mmioRdValid, // ack-free + cp2af_sRxPort.c0.rspValid, // ack-free + cp2af_sRxPort.c1.rspValid, // ack-free cci_rd_req_fire, - cci_rd_rsp_fire, - cci_pending_reads_full, - cci_pending_writes_empty, - cci_pending_writes_full - }), - .probes({ + cci_wr_req_fire, + avs_req_fire, + vx_mem_req_fire, + vx_mem_rsp_fire + },{ cmd_type, state, - mmio_hdr.address, - mmio_hdr.length, + vx_mem_req_rw[0], + vx_mem_req_byteen[0], + vx_mem_req_addr[0], + vx_mem_req_data[0], + vx_mem_req_tag[0], + vx_mem_rsp_data[0], + vx_mem_rsp_tag[0], + vx_dcr_wr_addr, + vx_dcr_wr_data, + mmio_req_hdr.address, cp2af_sRxPort.c0.hdr.mdata, af2cp_sTxPort.c0.hdr.address, af2cp_sTxPort.c0.hdr.mdata, @@ -1073,29 +1150,27 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_ cci_mem_wr_req_ctr, cci_rd_req_ctr, cci_rd_rsp_ctr, - cci_wr_req_ctr, - mem_bus_if_addr - }), - .bus_in(scope_bus_in_w[0]), - .bus_out(scope_bus_out_w[0]) - ); + cci_wr_req_ctr + }, + reset_negedge, 1'b0, 4096 + ); `else - `SCOPE_IO_UNUSED_W(0) + `SCOPE_IO_UNUSED(0) 
`endif - /////////////////////////////////////////////////////////////////////////////// + /////////////////////////////////////////////////////////////////////////// `ifdef DBG_TRACE_AFU always @(posedge clk) begin for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin if (avs_write[i] && ~avs_waitrequest[i]) begin - `TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i])); + `TRACE(2, ("%t: AVS Wr Req[%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i])) end if (avs_read[i] && ~avs_waitrequest[i]) begin - `TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i])); + `TRACE(2, ("%t: AVS Rd Req[%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i])) end if (avs_readdatavalid[i]) begin - `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h\n", $time, i, avs_readdata[i])); + `TRACE(2, ("%t: AVS Rd Rsp[%0d]: data=0x%h\n", $time, i, avs_readdata[i])) end end end diff --git a/hw/rtl/afu/opae/vortex_afu.vh b/hw/rtl/afu/opae/vortex_afu.vh index 6aa532983..31f09ae90 100644 --- a/hw/rtl/afu/opae/vortex_afu.vh +++ b/hw/rtl/afu/opae/vortex_afu.vh @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -17,9 +17,9 @@ `define AFU_ACCEL_NAME "vortex_afu" `define AFU_ACCEL_UUID 128'h35F9452B_25C2_434C_93D5_6F8C60DB361C -`define AFU_IMAGE_CMD_MEM_READ 1 +`define AFU_IMAGE_CMD_MEM_READ 1 `define AFU_IMAGE_CMD_MEM_WRITE 2 -`define AFU_IMAGE_CMD_RUN 3 +`define AFU_IMAGE_CMD_RUN 3 `define AFU_IMAGE_CMD_DCR_WRITE 4 `define AFU_IMAGE_CMD_MAX_VALUE 4 diff --git a/hw/rtl/afu/xrt/VX_afu_ctrl.sv b/hw/rtl/afu/xrt/VX_afu_ctrl.sv index 687b55a8c..0aa7ffe04 100644 --- a/hw/rtl/afu/xrt/VX_afu_ctrl.sv +++ b/hw/rtl/afu/xrt/VX_afu_ctrl.sv @@ -14,22 +14,20 @@ `include "vortex_afu.vh" module VX_afu_ctrl #( - parameter AXI_ADDR_WIDTH = 8, - parameter AXI_DATA_WIDTH = 32, - parameter AXI_NUM_BANKS = 1 + parameter S_AXI_ADDR_WIDTH = 8, + parameter S_AXI_DATA_WIDTH = 32 ) ( // axi4 lite slave signals input wire clk, input wire reset, - input wire clk_en, input wire s_axi_awvalid, - input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr, + input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_awaddr, output wire s_axi_awready, input wire s_axi_wvalid, - input wire [AXI_DATA_WIDTH-1:0] s_axi_wdata, - input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb, + input wire [S_AXI_DATA_WIDTH-1:0] s_axi_wdata, + input wire [S_AXI_DATA_WIDTH/8-1:0]s_axi_wstrb, output wire s_axi_wready, output wire s_axi_bvalid, @@ -37,11 +35,11 @@ module VX_afu_ctrl #( input wire s_axi_bready, input wire s_axi_arvalid, - input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr, + input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_araddr, output wire s_axi_arready, output wire s_axi_rvalid, - output wire [AXI_DATA_WIDTH-1:0] s_axi_rdata, + output wire [S_AXI_DATA_WIDTH-1:0] s_axi_rdata, output wire [1:0] s_axi_rresp, input wire s_axi_rready, @@ -52,13 +50,13 @@ module VX_afu_ctrl #( input wire ap_idle, output wire interrupt, + output wire ap_ctrl_read, + `ifdef SCOPE input wire scope_bus_in, output wire scope_bus_out, `endif - output wire [63:0] mem_base [AXI_NUM_BANKS], - output wire dcr_wr_valid, output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr, output wire 
[`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data @@ -110,39 +108,38 @@ module VX_afu_ctrl #( ADDR_DEV_0 = 8'h10, ADDR_DEV_1 = 8'h14, - //ADDR_DEV_CTRL = 8'h18, - ADDR_ISA_0 = 8'h1C, - ADDR_ISA_1 = 8'h20, - //ADDR_ISA_CTRL = 8'h24, + ADDR_ISA_0 = 8'h18, + ADDR_ISA_1 = 8'h1C, - ADDR_DCR_0 = 8'h28, - ADDR_DCR_1 = 8'h2C, - //ADDR_DCR_CTRL = 8'h30, + ADDR_DCR_0 = 8'h20, + ADDR_DCR_1 = 8'h24, `ifdef SCOPE - ADDR_SCP_0 = 8'h34, - ADDR_SCP_1 = 8'h38, - //ADDR_SCP_CTRL = 8'h3C, + ADDR_SCP_0 = 8'h28, + ADDR_SCP_1 = 8'h2C, `endif - ADDR_MEM_0 = 8'h40, - ADDR_MEM_1 = 8'h44, - //ADDR_MEM_CTRL = 8'h48, - ADDR_BITS = 8; localparam - WSTATE_IDLE = 2'd0, + WSTATE_ADDR = 2'd0, WSTATE_DATA = 2'd1, - WSTATE_RESP = 2'd2; + WSTATE_RESP = 2'd2, + WSTATE_WIDTH = 2; localparam - RSTATE_IDLE = 2'd0, - RSTATE_DATA = 2'd1; + RSTATE_ADDR = 2'd0, + RSTATE_DATA = 2'd1, + RSTATE_RESP = 2'd2, + RSTATE_WIDTH = 2; + + localparam MEMORY_BANK_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - `CLOG2(`PLATFORM_MEMORY_NUM_BANKS); // device caps - wire [63:0] dev_caps = {16'b0, + wire [63:0] dev_caps = {8'b0, + 5'(MEMORY_BANK_ADDR_WIDTH-20), + 3'(`CLOG2(`PLATFORM_MEMORY_NUM_BANKS)), 8'(`LMEM_ENABLED ? 
`LMEM_LOG_SIZE : 0), 16'(`NUM_CORES * `NUM_CLUSTERS), 8'(`NUM_WARPS), @@ -153,16 +150,18 @@ module VX_afu_ctrl #( 2'(`CLOG2(`XLEN)-4), 30'(`MISA_STD)}; - reg [1:0] wstate; + reg [WSTATE_WIDTH-1:0] wstate; reg [ADDR_BITS-1:0] waddr; wire [31:0] wmask; wire s_axi_aw_fire; wire s_axi_w_fire; + wire s_axi_b_fire; - reg [1:0] rstate; + logic [RSTATE_WIDTH-1:0] rstate; reg [31:0] rdata; - wire [ADDR_BITS-1:0] raddr; + reg [ADDR_BITS-1:0] raddr; wire s_axi_ar_fire; + wire s_axi_r_fire; reg ap_reset_r; reg ap_start_r; @@ -170,20 +169,23 @@ module VX_afu_ctrl #( reg gie_r; reg [1:0] ier_r; reg [1:0] isr_r; - reg [63:0] mem_r [AXI_NUM_BANKS]; reg [31:0] dcra_r; reg [31:0] dcrv_r; reg dcr_wr_valid_r; + logic wready_stall; + logic rvalid_stall; + `ifdef SCOPE - reg [63:0] scope_bus_wdata; - reg [63:0] scope_bus_rdata; + reg [63:0] scope_bus_wdata, scope_bus_rdata; reg [5:0] scope_bus_ctr; - reg cmd_scope_reading; - reg cmd_scope_writing; + reg cmd_scope_writing, cmd_scope_reading; reg scope_bus_out_r; + reg scope_rdata_valid; + + reg is_scope_waddr, is_scope_raddr; always @(posedge clk) begin if (reset) begin @@ -191,18 +193,33 @@ module VX_afu_ctrl #( cmd_scope_writing <= 0; scope_bus_ctr <= '0; scope_bus_out_r <= 0; - end else if (clk_en) begin + is_scope_waddr <= 0; + is_scope_raddr <= 0; + scope_bus_rdata <= '0; + scope_rdata_valid <= 0; + end else begin + scope_bus_out_r <= 0; + if (s_axi_aw_fire) begin + is_scope_waddr <= (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_0) + || (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_1); + end + if (s_axi_ar_fire) begin + is_scope_raddr <= (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_0) + || (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_1); + end if (s_axi_w_fire && waddr == ADDR_SCP_0) begin scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask); end if (s_axi_w_fire && waddr == ADDR_SCP_1) begin scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask); cmd_scope_writing <= 1; + scope_rdata_valid <= 
0; scope_bus_out_r <= 1; scope_bus_ctr <= 63; end if (scope_bus_in) begin cmd_scope_reading <= 1; + scope_bus_rdata <= '0; scope_bus_ctr <= 63; end if (cmd_scope_reading) begin @@ -210,13 +227,16 @@ module VX_afu_ctrl #( scope_bus_ctr <= scope_bus_ctr - 1; if (scope_bus_ctr == 0) begin cmd_scope_reading <= 0; + scope_rdata_valid <= 1; + scope_bus_ctr <= 0; end end if (cmd_scope_writing) begin - scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr); + scope_bus_out_r <= scope_bus_wdata[scope_bus_ctr]; scope_bus_ctr <= scope_bus_ctr - 1; if (scope_bus_ctr == 0) begin cmd_scope_writing <= 0; + scope_bus_ctr <= 0; end end end @@ -224,41 +244,50 @@ module VX_afu_ctrl #( assign scope_bus_out = scope_bus_out_r; + assign wready_stall = is_scope_waddr && cmd_scope_writing; + assign rvalid_stall = is_scope_raddr && ~scope_rdata_valid; + +`else + + assign wready_stall = 0; + assign rvalid_stall = 0; + `endif - // AXI Write + // AXI Write Request + assign s_axi_awready = (wstate == WSTATE_ADDR); + assign s_axi_wready = (wstate == WSTATE_DATA) && ~wready_stall; - assign s_axi_awready = (wstate == WSTATE_IDLE); - assign s_axi_wready = (wstate == WSTATE_DATA); + // AXI Write Response assign s_axi_bvalid = (wstate == WSTATE_RESP); assign s_axi_bresp = 2'b00; // OKAY - assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready; - assign s_axi_w_fire = s_axi_wvalid && s_axi_wready; - - for (genvar i = 0; i < 4; ++i) begin + for (genvar i = 0; i < 4; ++i) begin : g_wmask assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}}; end + assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready; + assign s_axi_w_fire = s_axi_wvalid && s_axi_wready; + assign s_axi_b_fire = s_axi_bvalid && s_axi_bready; + // wstate always @(posedge clk) begin if (reset) begin - wstate <= WSTATE_IDLE; - end else if (clk_en) begin + wstate <= WSTATE_ADDR; + end else begin case (wstate) - WSTATE_IDLE: wstate <= s_axi_awvalid ? WSTATE_DATA : WSTATE_IDLE; - WSTATE_DATA: wstate <= s_axi_wvalid ? 
WSTATE_RESP : WSTATE_DATA; - WSTATE_RESP: wstate <= s_axi_bready ? WSTATE_IDLE : WSTATE_RESP; - default: wstate <= WSTATE_IDLE; + WSTATE_ADDR: wstate <= s_axi_aw_fire ? WSTATE_DATA : WSTATE_ADDR; + WSTATE_DATA: wstate <= s_axi_w_fire ? WSTATE_RESP : WSTATE_DATA; + WSTATE_RESP: wstate <= s_axi_b_fire ? WSTATE_ADDR : WSTATE_RESP; + default: wstate <= WSTATE_ADDR; endcase end end // waddr always @(posedge clk) begin - if (clk_en) begin - if (s_axi_aw_fire) - waddr <= s_axi_awaddr[ADDR_BITS-1:0]; + if (s_axi_aw_fire) begin + waddr <= s_axi_awaddr[ADDR_BITS-1:0]; end end @@ -276,16 +305,13 @@ module VX_afu_ctrl #( dcra_r <= '0; dcrv_r <= '0; dcr_wr_valid_r <= 0; + end else begin + dcr_wr_valid_r <= 0; + ap_reset_r <= 0; - for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin - mem_r[i] <= '0; - end - end else if (clk_en) begin if (ap_ready) ap_start_r <= auto_restart_r; - dcr_wr_valid_r <= 0; - if (s_axi_w_fire) begin case (waddr) ADDR_AP_CTRL: begin @@ -317,16 +343,7 @@ module VX_afu_ctrl #( dcrv_r <= (s_axi_wdata & wmask) | (dcrv_r & ~wmask); dcr_wr_valid_r <= 1; end - default: begin - for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin - if (waddr == (ADDR_MEM_0 + 8'(i) * 8'd12)) begin - mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask); - end - if (waddr == (ADDR_MEM_1 + 8'(i) * 8'd12)) begin - mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask); - end - end - end + default:; endcase if (ier_r[0] & ap_done) @@ -337,82 +354,87 @@ module VX_afu_ctrl #( end end - // AXI Read + // AXI Read Request + assign s_axi_arready = (rstate == RSTATE_ADDR); - assign s_axi_arready = (rstate == RSTATE_IDLE); - assign s_axi_rvalid = (rstate == RSTATE_DATA); + // AXI Read Response + assign s_axi_rvalid = (rstate == RSTATE_RESP); assign s_axi_rdata = rdata; assign s_axi_rresp = 2'b00; // OKAY assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready; - assign raddr = s_axi_araddr[ADDR_BITS-1:0]; + assign s_axi_r_fire = s_axi_rvalid && s_axi_rready; // rstate 
always @(posedge clk) begin if (reset) begin - rstate <= RSTATE_IDLE; - end else if (clk_en) begin + rstate <= RSTATE_ADDR; + end else begin case (rstate) - RSTATE_IDLE: rstate <= s_axi_arvalid ? RSTATE_DATA : RSTATE_IDLE; - RSTATE_DATA: rstate <= (s_axi_rready & s_axi_rvalid) ? RSTATE_IDLE : RSTATE_DATA; - default: rstate <= RSTATE_IDLE; + RSTATE_ADDR: rstate <= s_axi_ar_fire ? RSTATE_DATA : RSTATE_ADDR; + RSTATE_DATA: rstate <= rvalid_stall ? RSTATE_DATA : RSTATE_RESP; + RSTATE_RESP: rstate <= s_axi_r_fire ? RSTATE_ADDR : RSTATE_RESP; + default: rstate <= RSTATE_ADDR; endcase end end + // raddr + always @(posedge clk) begin + if (s_axi_ar_fire) begin + raddr <= s_axi_araddr[ADDR_BITS-1:0]; + end + end + // rdata always @(posedge clk) begin - if (clk_en) begin - if (s_axi_ar_fire) begin - rdata <= '0; - case (raddr) - ADDR_AP_CTRL: begin - rdata[0] <= ap_start_r; - rdata[1] <= ap_done; - rdata[2] <= ap_idle; - rdata[3] <= ap_ready; - rdata[7] <= auto_restart_r; - end - ADDR_GIE: begin - rdata <= 32'(gie_r); - end - ADDR_IER: begin - rdata <= 32'(ier_r); - end - ADDR_ISR: begin - rdata <= 32'(isr_r); - end - ADDR_DEV_0: begin - rdata <= dev_caps[31:0]; - end - ADDR_DEV_1: begin - rdata <= dev_caps[63:32]; - end - ADDR_ISA_0: begin - rdata <= isa_caps[31:0]; - end - ADDR_ISA_1: begin - rdata <= isa_caps[63:32]; - end - `ifdef SCOPE - ADDR_SCP_0: begin - rdata <= scope_bus_rdata[31:0]; - end - ADDR_SCP_1: begin - rdata <= scope_bus_rdata[63:32]; - end - `endif - default:; - endcase + rdata <= '0; + case (raddr) + ADDR_AP_CTRL: begin + rdata[0] <= ap_start_r; + rdata[1] <= ap_done; + rdata[2] <= ap_idle; + rdata[3] <= ap_ready; + rdata[7] <= auto_restart_r; end - end + ADDR_GIE: begin + rdata <= 32'(gie_r); + end + ADDR_IER: begin + rdata <= 32'(ier_r); + end + ADDR_ISR: begin + rdata <= 32'(isr_r); + end + ADDR_DEV_0: begin + rdata <= dev_caps[31:0]; + end + ADDR_DEV_1: begin + rdata <= dev_caps[63:32]; + end + ADDR_ISA_0: begin + rdata <= isa_caps[31:0]; + end + 
ADDR_ISA_1: begin + rdata <= isa_caps[63:32]; + end + `ifdef SCOPE + ADDR_SCP_0: begin + rdata <= scope_bus_rdata[31:0]; + end + ADDR_SCP_1: begin + rdata <= scope_bus_rdata[63:32]; + end + `endif + default:; + endcase end assign ap_reset = ap_reset_r; assign ap_start = ap_start_r; assign interrupt = gie_r & (| isr_r); - assign mem_base = mem_r; + assign ap_ctrl_read = s_axi_r_fire && (raddr == ADDR_AP_CTRL); assign dcr_wr_valid = dcr_wr_valid_r; assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r); diff --git a/hw/rtl/afu/xrt/VX_afu_wrap.sv b/hw/rtl/afu/xrt/VX_afu_wrap.sv index a844802e9..c3c51f795 100644 --- a/hw/rtl/afu/xrt/VX_afu_wrap.sv +++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv @@ -10,68 +10,93 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// +// Reference: https://www.xilinx.com/developer/articles/porting-rtl-designs-to-vitis-rtl-kernels.html `include "vortex_afu.vh" module VX_afu_wrap #( - parameter C_S_AXI_CTRL_ADDR_WIDTH = 8, - parameter C_S_AXI_CTRL_DATA_WIDTH = 32, - parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH, - parameter C_M_AXI_MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, - parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH + parameter C_S_AXI_CTRL_ADDR_WIDTH = 8, + parameter C_S_AXI_CTRL_DATA_WIDTH = 32, + parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH, + parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_SIZE * 8, + parameter C_M_AXI_MEM_ADDR_WIDTH = 64, +`ifdef PLATFORM_MERGED_MEMORY_INTERFACE + parameter C_M_AXI_MEM_NUM_BANKS = 1 +`else + parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS +`endif ) ( // System signals - input wire ap_clk, - input wire ap_rst_n, + input wire clk, + input wire reset, // AXI4 master interface - `REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA), - +`ifdef PLATFORM_MERGED_MEMORY_INTERFACE + `REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA), +`else + `REPEAT 
(`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA), +`endif // AXI4-Lite slave interface input wire s_axi_ctrl_awvalid, output wire s_axi_ctrl_awready, input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr, + input wire s_axi_ctrl_wvalid, output wire s_axi_ctrl_wready, input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata, input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb, + input wire s_axi_ctrl_arvalid, output wire s_axi_ctrl_arready, input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr, + output wire s_axi_ctrl_rvalid, input wire s_axi_ctrl_rready, output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata, output wire [1:0] s_axi_ctrl_rresp, + output wire s_axi_ctrl_bvalid, input wire s_axi_ctrl_bready, output wire [1:0] s_axi_ctrl_bresp, output wire interrupt ); - localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS; + localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH; - localparam STATE_IDLE = 0; - localparam STATE_RUN = 1; + typedef enum logic [1:0] { + STATE_IDLE = 0, + STATE_INIT = 1, + STATE_RUN = 2, + STATE_DONE = 3 + } state_e; + + localparam PENDING_WR_SIZEW = 12; // max outstanding requests size + localparam NUM_MEM_BANKS_SIZEW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1); wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS]; wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS]; + wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS]; + wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a 
[C_M_AXI_MEM_NUM_BANKS]; wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS]; + wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS]; wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS]; + wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS]; wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS]; wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS]; @@ -80,30 +105,31 @@ module VX_afu_wrap #( wire [1:0] m_axi_mem_rresp_a [C_M_AXI_MEM_NUM_BANKS]; // convert memory interface to array - `REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON); - - wire reset = ~ap_rst_n; +`ifdef PLATFORM_MERGED_MEMORY_INTERFACE + `REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON); +`else + `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON); +`endif reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr; - reg [15:0] vx_pending_writes; - reg vx_busy_wait; - reg vx_running; - + reg [PENDING_WR_SIZEW-1:0] vx_pending_writes; + reg vx_reset = 1; // asserted at initialization wire vx_busy; - wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS]; - wire dcr_wr_valid; wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr; wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data; - reg state; + state_e state; wire ap_reset; wire ap_start; - wire ap_idle = ~vx_running; - wire ap_done = ~(state == STATE_RUN || vx_pending_writes != 0); - wire ap_ready = 1'b1; + wire ap_ctrl_read; + wire ap_idle = (state == STATE_IDLE); + wire ap_done = (state == STATE_DONE) && (vx_pending_writes == '0); + wire ap_ready = ap_done; + + wire ap_done_ack = ap_done && ap_ctrl_read; `ifdef SCOPE wire scope_bus_in; @@ -111,108 +137,129 @@ module VX_afu_wrap #( wire scope_reset = reset; `endif - always @(posedge ap_clk) begin + always @(posedge clk) begin if (reset || ap_reset) begin - state <= STATE_IDLE; - vx_busy_wait <= 0; - 
vx_running <= 0; + state <= STATE_IDLE; + vx_reset <= 1; end else begin case (state) STATE_IDLE: begin if (ap_start) begin `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: STATE RUN\n", $time)); + `TRACE(2, ("%t: AFU: Begin initialization\n", $time)) `endif - state <= STATE_RUN; - vx_running <= 0; + state <= STATE_INIT; + vx_reset_ctr <= (`RESET_DELAY-1); + vx_reset <= 1; + end + end + STATE_INIT: begin + if (vx_reset) begin + // wait for reset to complete + if (vx_reset_ctr == 0) begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: Initialization completed\n", $time)) + `endif + vx_reset <= 0; + end + end else begin + // wait until processor goes busy + if (vx_busy) begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: Begin execution\n", $time)) + `endif + state <= STATE_RUN; + end end end STATE_RUN: begin - if (vx_running) begin - if (vx_busy_wait) begin - // wait until processor goes busy - if (vx_busy) begin - vx_busy_wait <= 0; - end - end else begin - // wait until the processor is not busy - if (~vx_busy) begin - state <= STATE_IDLE; - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: AFU: End execution\n", $time)); - `TRACE(2, ("%d: STATE IDLE\n", $time)); - `endif - end - end - end else begin - // wait until the reset sequence is complete - if (vx_reset_ctr == (`RESET_DELAY-1)) begin - `ifdef DBG_TRACE_AFU - `TRACE(2, ("%d: AFU: Begin execution\n", $time)); - `endif - vx_running <= 1; - vx_busy_wait <= 1; - end + // wait until the processor is not busy + if (~vx_busy) begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: Execution completed\n", $time)) + `endif + state <= STATE_DONE; + end + end + STATE_DONE: begin + // wait for host's done acknowledgement + if (ap_done_ack) begin + `ifdef DBG_TRACE_AFU + `TRACE(2, ("%t: AFU: Processor idle\n", $time)) + `endif + state <= STATE_IDLE; end end endcase + + // ensure reset network initialization + if (vx_reset_ctr != '0) begin + vx_reset_ctr <= vx_reset_ctr - 1; + end end end - reg m_axi_mem_wfire; - reg m_axi_mem_bfire; + wire 
[C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire; + wire [NUM_MEM_BANKS_SIZEW-1:0] cur_wr_reqs, cur_wr_rsps; - always @(*) begin - m_axi_mem_wfire = 0; - m_axi_mem_bfire = 0; - for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin - m_axi_mem_wfire |= m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]; - m_axi_mem_bfire |= m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]; - end + for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire + VX_axi_write_ack axi_write_ack ( + .clk (clk), + .reset (reset), + .awvalid(m_axi_mem_awvalid_a[i]), + .awready(m_axi_mem_awready_a[i]), + .wvalid (m_axi_mem_wvalid_a[i]), + .wready (m_axi_mem_wready_a[i]), + .tx_ack (m_axi_wr_req_fire[i]), + `UNUSED_PIN (aw_ack), + `UNUSED_PIN (w_ack), + `UNUSED_PIN (tx_rdy) + ); end - always @(posedge ap_clk) begin - if (reset || ap_reset) begin + for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_rsp_fire + assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]; + end + + `POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire); + `POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire); + + wire signed [NUM_MEM_BANKS_SIZEW:0] reqs_sub = (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_reqs) - + (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_rsps); + + always @(posedge clk) begin + if (reset) begin vx_pending_writes <= '0; end else begin - if (m_axi_mem_wfire && ~m_axi_mem_bfire) - vx_pending_writes <= vx_pending_writes + 1; - if (~m_axi_mem_wfire && m_axi_mem_bfire) - vx_pending_writes <= vx_pending_writes - 1; - end - end - - always @(posedge ap_clk) begin - if (state == STATE_RUN) begin - vx_reset_ctr <= vx_reset_ctr + 1; - end else begin - vx_reset_ctr <= '0; + vx_pending_writes <= vx_pending_writes + PENDING_WR_SIZEW'(reqs_sub); end end VX_afu_ctrl #( - .AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH), - .AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH), - .AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS) + .S_AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH), + .S_AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH) ) 
afu_ctrl ( - .clk (ap_clk), - .reset (reset || ap_reset), - .clk_en (1'b1), + .clk (clk), + .reset (reset), .s_axi_awvalid (s_axi_ctrl_awvalid), .s_axi_awready (s_axi_ctrl_awready), .s_axi_awaddr (s_axi_ctrl_awaddr), + .s_axi_wvalid (s_axi_ctrl_wvalid), .s_axi_wready (s_axi_ctrl_wready), .s_axi_wdata (s_axi_ctrl_wdata), .s_axi_wstrb (s_axi_ctrl_wstrb), + .s_axi_arvalid (s_axi_ctrl_arvalid), .s_axi_arready (s_axi_ctrl_arready), .s_axi_araddr (s_axi_ctrl_araddr), + .s_axi_rvalid (s_axi_ctrl_rvalid), .s_axi_rready (s_axi_ctrl_rready), .s_axi_rdata (s_axi_ctrl_rdata), .s_axi_rresp (s_axi_ctrl_rresp), + .s_axi_bvalid (s_axi_ctrl_bvalid), .s_axi_bready (s_axi_ctrl_bready), .s_axi_bresp (s_axi_ctrl_bresp), @@ -224,42 +271,42 @@ module VX_afu_wrap #( .ap_idle (ap_idle), .interrupt (interrupt), + .ap_ctrl_read (ap_ctrl_read), + `ifdef SCOPE .scope_bus_in (scope_bus_out), .scope_bus_out (scope_bus_in), `endif - .mem_base (mem_base), - .dcr_wr_valid (dcr_wr_valid), .dcr_wr_addr (dcr_wr_addr), .dcr_wr_data (dcr_wr_data) ); - wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS]; - wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS]; + wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS]; + wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS]; - for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin - assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]); - assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]); + for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing + assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET); + assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET); end - `SCOPE_IO_SWITCH (2) + 
`SCOPE_IO_SWITCH (2); Vortex_axi #( .AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH), - .AXI_ADDR_WIDTH (`MEM_ADDR_WIDTH), + .AXI_ADDR_WIDTH (M_AXI_MEM_ADDR_WIDTH), .AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH), .AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS) ) vortex_axi ( `SCOPE_IO_BIND (1) - .clk (ap_clk), - .reset (reset || ap_reset || ~vx_running), + .clk (clk), + .reset (vx_reset), .m_axi_awvalid (m_axi_mem_awvalid_a), .m_axi_awready (m_axi_mem_awready_a), - .m_axi_awaddr (m_axi_mem_awaddr_w), + .m_axi_awaddr (m_axi_mem_awaddr_u), .m_axi_awid (m_axi_mem_awid_a), .m_axi_awlen (m_axi_mem_awlen_a), `UNUSED_PIN (m_axi_awsize), @@ -283,7 +330,7 @@ module VX_afu_wrap #( .m_axi_arvalid (m_axi_mem_arvalid_a), .m_axi_arready (m_axi_mem_arready_a), - .m_axi_araddr (m_axi_mem_araddr_w), + .m_axi_araddr (m_axi_mem_araddr_u), .m_axi_arid (m_axi_mem_arid_a), .m_axi_arlen (m_axi_mem_arlen_a), `UNUSED_PIN (m_axi_arsize), @@ -310,38 +357,79 @@ module VX_afu_wrap #( // SCOPE ////////////////////////////////////////////////////////////////////// +`ifdef SCOPE `ifdef DBG_SCOPE_AFU - `define TRIGGERS { \ - reset, \ - ap_start, \ - ap_done, \ - ap_idle, \ - interrupt, \ - vx_busy_wait, \ - vx_busy, \ - vx_running \ - } - - `define PROBES { \ - vx_pending_writes \ - } - - VX_scope_tap #( - .SCOPE_ID (0), - .TRIGGERW ($bits(`TRIGGERS)), - .PROBEW ($bits(`PROBES)) - ) scope_tap ( - .clk (clk), - .reset (scope_reset_w[0]), - .start (1'b0), - .stop (1'b0), - .triggers (`TRIGGERS), - .probes (`PROBES), - .bus_in (scope_bus_in_w[0]), - .bus_out (scope_bus_out_w[0]) - ); + wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0]; + wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0]; + wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0]; + wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0]; + wire reset_negedge; + `NEG_EDGE (reset_negedge, reset); + `SCOPE_TAP (0, 0, { + ap_reset, + ap_start, + ap_done, + ap_idle, + interrupt, + 
vx_reset, + vx_busy, + state, + m_axi_mem_awvalid_a[0], + m_axi_mem_awready_a[0], + m_axi_mem_wvalid_a[0], + m_axi_mem_wready_a[0], + m_axi_mem_bvalid_a[0], + m_axi_mem_bready_a[0], + m_axi_mem_arvalid_a[0], + m_axi_mem_arready_a[0], + m_axi_mem_rvalid_a[0], + m_axi_mem_rready_a[0] + }, { + dcr_wr_valid, + m_axi_mem_awfire_0, + m_axi_mem_arfire_0, + m_axi_mem_wfire_0, + m_axi_mem_bfire_0 + }, { + dcr_wr_addr, + dcr_wr_data, + vx_pending_writes, + m_axi_mem_awaddr_u[0], + m_axi_mem_awid_a[0], + m_axi_mem_bid_a[0], + m_axi_mem_araddr_u[0], + m_axi_mem_arid_a[0], + m_axi_mem_rid_a[0] + }, + reset_negedge, 1'b0, 4096 + ); `else - `SCOPE_IO_UNUSED_W(0) + `SCOPE_IO_UNUSED(0) +`endif +`endif + +`ifdef CHIPSCOPE +`ifdef DBG_SCOPE_AFU + ila_afu ila_afu_inst ( + .clk (clk), + .probe0 ({ + ap_reset, + ap_start, + ap_done, + ap_idle, + state, + interrupt + }), + .probe1 ({ + vx_pending_writes, + vx_busy, + vx_reset, + dcr_wr_valid, + dcr_wr_addr, + dcr_wr_data + }) + ); +`endif `endif `ifdef SIMULATION @@ -352,7 +440,7 @@ module VX_afu_wrap #( initial begin $assertoff(0, vortex_axi); end - always @(posedge ap_clk) begin + always @(posedge clk) begin if (reset) begin assert_delay_ctr <= '0; assert_enabled <= 0; @@ -371,19 +459,22 @@ module VX_afu_wrap #( `endif `ifdef DBG_TRACE_AFU - always @(posedge ap_clk) begin + always @(posedge clk) begin for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin - `TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i])); + `TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i])) end if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin - `TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i])); + `TRACE(2, ("%t: AXI Wr Req [%0d]: strb=0x%h, data=0x%h\n", $time, i, m_axi_mem_wstrb_a[i], m_axi_mem_wdata_a[i])) + end + if 
(m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]) begin + `TRACE(2, ("%t: AXI Wr Rsp [%0d]: id=0x%0h\n", $time, i, m_axi_mem_bid_a[i])) end if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin - `TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i])); + `TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i])) end if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin - `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i])); + `TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, id=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i])) end end end diff --git a/hw/rtl/afu/xrt/vortex_afu.v b/hw/rtl/afu/xrt/vortex_afu.v index 2c31900cb..bfae1125a 100644 --- a/hw/rtl/afu/xrt/vortex_afu.v +++ b/hw/rtl/afu/xrt/vortex_afu.v @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,37 +16,50 @@ module vortex_afu #( parameter C_S_AXI_CTRL_ADDR_WIDTH = 8, parameter C_S_AXI_CTRL_DATA_WIDTH = 32, - parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH, + parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH, + parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8), parameter C_M_AXI_MEM_ADDR_WIDTH = 64, - parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH +`ifdef PLATFORM_MERGED_MEMORY_INTERFACE + parameter C_M_AXI_MEM_NUM_BANKS = 1 +`else + parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS +`endif ) ( // System signals input wire ap_clk, input wire ap_rst_n, - + // AXI4 master interface - `REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA), +`ifdef PLATFORM_MERGED_MEMORY_INTERFACE + `REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA), +`else + `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA), +`endif // AXI4-Lite slave interface input wire s_axi_ctrl_awvalid, output wire s_axi_ctrl_awready, input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr, + input wire s_axi_ctrl_wvalid, output wire s_axi_ctrl_wready, input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata, input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb, - input wire s_axi_ctrl_arvalid, + + input wire s_axi_ctrl_arvalid, output wire s_axi_ctrl_arready, input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr, + output wire s_axi_ctrl_rvalid, - input wire s_axi_ctrl_rready, + input wire s_axi_ctrl_rready, output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata, output wire [1:0] s_axi_ctrl_rresp, - output wire s_axi_ctrl_bvalid, + + output wire s_axi_ctrl_bvalid, input wire s_axi_ctrl_bready, output wire [1:0] s_axi_ctrl_bresp, - - output wire interrupt + + output wire interrupt ); VX_afu_wrap #( @@ -54,32 +67,39 @@ module vortex_afu #( .C_S_AXI_CTRL_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH), .C_M_AXI_MEM_ID_WIDTH (C_M_AXI_MEM_ID_WIDTH), .C_M_AXI_MEM_ADDR_WIDTH (C_M_AXI_MEM_ADDR_WIDTH), - .C_M_AXI_MEM_DATA_WIDTH 
(C_M_AXI_MEM_DATA_WIDTH) + .C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH), + .C_M_AXI_MEM_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS) ) afu_wrap ( - .ap_clk (ap_clk), - .ap_rst_n (ap_rst_n), - - `REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA), - + .clk (ap_clk), + .reset (~ap_rst_n), + `ifdef PLATFORM_MERGED_MEMORY_INTERFACE + `REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA), + `else + `REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA), + `endif .s_axi_ctrl_awvalid (s_axi_ctrl_awvalid), .s_axi_ctrl_awready (s_axi_ctrl_awready), - .s_axi_ctrl_awaddr (s_axi_ctrl_awaddr), + .s_axi_ctrl_awaddr (s_axi_ctrl_awaddr), + .s_axi_ctrl_wvalid (s_axi_ctrl_wvalid), .s_axi_ctrl_wready (s_axi_ctrl_wready), .s_axi_ctrl_wdata (s_axi_ctrl_wdata), .s_axi_ctrl_wstrb (s_axi_ctrl_wstrb), + .s_axi_ctrl_arvalid (s_axi_ctrl_arvalid), .s_axi_ctrl_arready (s_axi_ctrl_arready), .s_axi_ctrl_araddr (s_axi_ctrl_araddr), + .s_axi_ctrl_rvalid (s_axi_ctrl_rvalid), .s_axi_ctrl_rready (s_axi_ctrl_rready), .s_axi_ctrl_rdata (s_axi_ctrl_rdata), .s_axi_ctrl_rresp (s_axi_ctrl_rresp), + .s_axi_ctrl_bvalid (s_axi_ctrl_bvalid), .s_axi_ctrl_bready (s_axi_ctrl_bready), .s_axi_ctrl_bresp (s_axi_ctrl_bresp), .interrupt (interrupt) ); - + endmodule diff --git a/hw/rtl/afu/xrt/vortex_afu.vh b/hw/rtl/afu/xrt/vortex_afu.vh index 3616b0794..c66ede2b7 100644 --- a/hw/rtl/afu/xrt/vortex_afu.vh +++ b/hw/rtl/afu/xrt/vortex_afu.vh @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -14,12 +14,12 @@ `ifndef VORTEX_AFU_VH `define VORTEX_AFU_VH -`ifndef M_AXI_MEM_NUM_BANKS -`define M_AXI_MEM_NUM_BANKS 1 +`ifndef PLATFORM_MEMORY_OFFSET +`define PLATFORM_MEMORY_OFFSET 0 `endif -`ifndef M_AXI_MEM_ID_WIDTH -`define M_AXI_MEM_ID_WIDTH 32 +`ifndef PLATFORM_MEMORY_ID_WIDTH +`define PLATFORM_MEMORY_ID_WIDTH 32 `endif `define GEN_AXI_MEM(i) \ diff --git a/hw/rtl/cache/VX_bank_flush.sv b/hw/rtl/cache/VX_bank_flush.sv index 6c02c1e13..e50f8ef44 100644 --- a/hw/rtl/cache/VX_bank_flush.sv +++ b/hw/rtl/cache/VX_bank_flush.sv @@ -33,7 +33,7 @@ module VX_bank_flush #( output wire flush_init, output wire flush_valid, output wire [`CS_LINE_SEL_BITS-1:0] flush_line, - output wire [NUM_WAYS-1:0] flush_way, + output wire [`CS_WAY_SEL_WIDTH-1:0] flush_way, input wire flush_ready, input wire mshr_empty, input wire bank_empty @@ -48,20 +48,21 @@ module VX_bank_flush #( localparam STATE_WAIT2 = 4; localparam STATE_DONE = 5; - reg [2:0] state_r, state_n; + reg [2:0] state, state_n; - reg [CTR_WIDTH-1:0] counter_r; + reg [CTR_WIDTH-1:0] counter; always @(*) begin - state_n = state_r; - case (state_r) - STATE_IDLE: begin + state_n = state; + case (state) + //STATE_IDLE: + default : begin if (flush_begin) begin state_n = STATE_WAIT1; end end STATE_INIT: begin - if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin + if (counter == ((2 ** `CS_LINE_SEL_BITS)-1)) begin state_n = STATE_IDLE; end end @@ -72,7 +73,7 @@ module VX_bank_flush #( end end STATE_FLUSH: begin - if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin + if (counter == ((2 ** CTR_WIDTH)-1) && flush_ready) begin state_n = (BANK_ID == 0) ? 
STATE_DONE : STATE_WAIT2; end end @@ -93,35 +94,30 @@ module VX_bank_flush #( always @(posedge clk) begin if (reset) begin - state_r <= STATE_INIT; - counter_r <= '0; + state <= STATE_INIT; + counter <= '0; end else begin - state_r <= state_n; - if (state_r != STATE_IDLE) begin - if ((state_r == STATE_INIT) - || ((state_r == STATE_FLUSH) && flush_ready)) begin - counter_r <= counter_r + CTR_WIDTH'(1); + state <= state_n; + if (state != STATE_IDLE) begin + if ((state == STATE_INIT) + || ((state == STATE_FLUSH) && flush_ready)) begin + counter <= counter + CTR_WIDTH'(1); end end else begin - counter_r <= '0; + counter <= '0; end end end - assign flush_end = (state_r == STATE_DONE); - assign flush_init = (state_r == STATE_INIT); - assign flush_valid = (state_r == STATE_FLUSH); - assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0]; + assign flush_end = (state == STATE_DONE); + assign flush_init = (state == STATE_INIT); + assign flush_valid = (state == STATE_FLUSH); + assign flush_line = counter[`CS_LINE_SEL_BITS-1:0]; - if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin - reg [NUM_WAYS-1:0] flush_way_r; - always @(*) begin - flush_way_r = '0; - flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1; - end - assign flush_way = flush_way_r; - end else begin - assign flush_way = {NUM_WAYS{1'b1}}; + if (WRITEBACK && (NUM_WAYS > 1)) begin : g_flush_way + assign flush_way = counter[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]; + end else begin : g_flush_way_all + assign flush_way = '0; end endmodule diff --git a/hw/rtl/cache/VX_cache.sv b/hw/rtl/cache/VX_cache.sv index ae0747690..fb08c879b 100644 --- a/hw/rtl/cache/VX_cache.sv +++ b/hw/rtl/cache/VX_cache.sv @@ -19,23 +19,26 @@ module VX_cache import VX_gpu_pkg::*; #( // Number of Word requests per cycle parameter NUM_REQS = 4, + // Number of memory ports + parameter MEM_PORTS = 1, + // Size of cache in bytes - parameter CACHE_SIZE = 4096, + parameter CACHE_SIZE = 32768, // Size of line inside a bank in bytes parameter 
LINE_SIZE = 64, // Number of banks - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 4, // Number of associative ways - parameter NUM_WAYS = 1, + parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = `XLEN/8, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 4, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 8, + parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 4, // Memory Request Queue Size parameter MREQ_SIZE = 4, @@ -48,17 +51,23 @@ module VX_cache import VX_gpu_pkg::*; #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_FIFO, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, + // core request flags + parameter FLAGS_WIDTH = 0, + // Core response output register - parameter CORE_OUT_BUF = 0, + parameter CORE_OUT_BUF = 3, // Memory request output register - parameter MEM_OUT_BUF = 0 + parameter MEM_OUT_BUF = 3 ) ( // PERF `ifdef PERF_ENABLE @@ -69,34 +78,37 @@ module VX_cache import VX_gpu_pkg::*; #( input wire reset, VX_mem_bus_if.slave core_bus_if [NUM_REQS], - VX_mem_bus_if.master mem_bus_if + VX_mem_bus_if.master mem_bus_if [MEM_PORTS] ); `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2")) `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable")) `STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback")) - - // In writeback mode, memory fill response may issue a new memory request to handle evicted blocks. - // We need to ensure that the memory request queue never fills up to avoid deadlock. 
- `STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE")) + `STATIC_ASSERT(NUM_BANKS >= MEM_PORTS, ("invalid parameter: number of banks must be greater or equal to number of memory ports")) localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS); localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS); localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE); - localparam MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS; + localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH); localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE; localparam WORD_WIDTH = WORD_SIZE * 8; localparam WORD_SEL_BITS = `CLOG2(WORDS_PER_LINE); localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS); localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS); - localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1; + localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH); localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH; + localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH; + localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)); + localparam MEM_RSP_DATAW = `CS_LINE_WIDTH + MEM_TAG_WIDTH; + localparam MEM_PORTS_SEL_BITS = `CLOG2(MEM_PORTS); + localparam MEM_PORTS_SEL_WIDTH = `UP(MEM_PORTS_SEL_BITS); + localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS)); + localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS); - localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1); - localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1); - - localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0; + localparam REQ_XBAR_BUF = (NUM_REQS > 2) ? 
2 : 0; + localparam CORE_RSP_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1); + localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1); `ifdef PERF_ENABLE wire [NUM_BANKS-1:0] perf_read_miss_per_bank; @@ -110,6 +122,7 @@ module VX_cache import VX_gpu_pkg::*; #( ) core_bus2_if[NUM_REQS](); wire [NUM_BANKS-1:0] per_bank_flush_begin; + wire [`UP(UUID_WIDTH)-1:0] flush_uuid; wire [NUM_BANKS-1:0] per_bank_flush_end; wire [NUM_BANKS-1:0] per_bank_core_req_fire; @@ -117,7 +130,9 @@ module VX_cache import VX_gpu_pkg::*; #( VX_cache_flush #( .NUM_REQS (NUM_REQS), .NUM_BANKS (NUM_BANKS), - .BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency + .UUID_WIDTH(UUID_WIDTH), + .TAG_WIDTH (TAG_WIDTH), + .BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency ) flush_unit ( .clk (clk), .reset (reset), @@ -125,92 +140,101 @@ module VX_cache import VX_gpu_pkg::*; #( .core_bus_out_if (core_bus2_if), .bank_req_fire (per_bank_core_req_fire), .flush_begin (per_bank_flush_begin), + .flush_uuid (flush_uuid), .flush_end (per_bank_flush_end) ); - /////////////////////////////////////////////////////////////////////////// + // Memory response gather ///////////////////////////////////////////////// - // Core response buffering - wire [NUM_REQS-1:0] core_rsp_valid_s; - wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s; - wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s; - wire [NUM_REQS-1:0] core_rsp_ready_s; + VX_mem_bus_if #( + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_WIDTH) + ) mem_bus_tmp_if[MEM_PORTS](); - `RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT); - - for (genvar i = 0; i < NUM_REQS; ++i) begin + wire [MEM_PORTS-1:0] mem_rsp_queue_valid; + wire [MEM_PORTS-1:0][MEM_RSP_DATAW-1:0] mem_rsp_queue_data; + wire [MEM_PORTS-1:0] mem_rsp_queue_ready; + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue VX_elastic_buffer #( - .DATAW (`CS_WORD_WIDTH + TAG_WIDTH), - .SIZE (CORE_REQ_BUF_ENABLE ? 
`TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0), - .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) - ) core_rsp_buf ( - .clk (clk), - .reset (core_rsp_reset[i]), - .valid_in (core_rsp_valid_s[i]), - .ready_in (core_rsp_ready_s[i]), - .data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}), - .data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}), - .valid_out (core_bus2_if[i].rsp_valid), - .ready_out (core_bus2_if[i].rsp_ready) + .DATAW (MEM_RSP_DATAW), + .SIZE (MRSQ_SIZE), + .OUT_REG (MRSQ_SIZE > 2) + ) mem_rsp_queue ( + .clk (clk), + .reset (reset), + .valid_in (mem_bus_tmp_if[i].rsp_valid), + .data_in (mem_bus_tmp_if[i].rsp_data), + .ready_in (mem_bus_tmp_if[i].rsp_ready), + .valid_out (mem_rsp_queue_valid[i]), + .data_out (mem_rsp_queue_data[i]), + .ready_out (mem_rsp_queue_ready[i]) ); end - /////////////////////////////////////////////////////////////////////////// + wire [MEM_PORTS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] mem_rsp_queue_data_s; + wire [MEM_PORTS-1:0][BANK_SEL_WIDTH-1:0] mem_rsp_queue_sel; - // Memory request buffering - wire mem_req_valid_s; - wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s; - wire mem_req_rw_s; - wire [LINE_SIZE-1:0] mem_req_byteen_s; - wire [`CS_LINE_WIDTH-1:0] mem_req_data_s; - wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s; - wire mem_req_flush_s; - wire mem_req_ready_s; + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_data_s + wire [BANK_MEM_TAG_WIDTH-1:0] mem_rsp_tag_s = mem_rsp_queue_data[i][MEM_TAG_WIDTH-1:MEM_ARB_SEL_BITS]; + wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s = mem_rsp_queue_data[i][MEM_RSP_DATAW-1:MEM_TAG_WIDTH]; + assign mem_rsp_queue_data_s[i] = {mem_rsp_data_s, mem_rsp_tag_s}; + end - wire mem_bus_if_flush; + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_sel + if (NUM_BANKS > 1) begin : g_multibanks + if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel + VX_bits_concat #( + .L (MEM_ARB_SEL_BITS), + .R (MEM_PORTS_SEL_BITS) + ) mem_rsp_sel_concat ( + .left_in 
(mem_rsp_queue_data[i][MEM_ARB_SEL_BITS-1:0]), + .right_in (MEM_PORTS_SEL_WIDTH'(i)), + .data_out (mem_rsp_queue_sel[i]) + ); + end else begin : g_no_arb_sel + assign mem_rsp_queue_sel[i] = MEM_PORTS_SEL_WIDTH'(i); + end + end else begin : g_singlebank + assign mem_rsp_queue_sel[i] = 0; + end + end - VX_elastic_buffer #( - .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1), - .SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), - .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) - ) mem_req_buf ( + wire [NUM_BANKS-1:0] per_bank_mem_rsp_valid; + wire [NUM_BANKS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] per_bank_mem_rsp_pdata; + wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready; + + VX_stream_omega #( + .NUM_INPUTS (MEM_PORTS), + .NUM_OUTPUTS (NUM_BANKS), + .DATAW (MEM_RSP_DATAW-MEM_ARB_SEL_BITS), + .ARBITER ("R"), + .OUT_BUF (3) + ) mem_rsp_xbar ( .clk (clk), .reset (reset), - .valid_in (mem_req_valid_s), - .ready_in (mem_req_ready_s), - .data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}), - .data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}), - .valid_out (mem_bus_if.req_valid), - .ready_out (mem_bus_if.req_ready) + .valid_in (mem_rsp_queue_valid), + .data_in (mem_rsp_queue_data_s), + .sel_in (mem_rsp_queue_sel), + .ready_in (mem_rsp_queue_ready), + .valid_out (per_bank_mem_rsp_valid), + .data_out (per_bank_mem_rsp_pdata), + `UNUSED_PIN (sel_out), + .ready_out (per_bank_mem_rsp_ready), + `UNUSED_PIN (collisions) ); - assign mem_bus_if.req_data.atype = mem_bus_if_flush ? 
`ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0; + wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_rsp_data; + wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_rsp_tag; - /////////////////////////////////////////////////////////////////////////// + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_rsp_data + assign { + per_bank_mem_rsp_data[i], + per_bank_mem_rsp_tag[i] + } = per_bank_mem_rsp_pdata[i]; + end - // Memory response buffering - wire mem_rsp_valid_s; - wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s; - wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s; - wire mem_rsp_ready_s; - - VX_elastic_buffer #( - .DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH), - .SIZE (MRSQ_SIZE), - .OUT_REG (MRSQ_SIZE > 2) - ) mem_rsp_queue ( - .clk (clk), - .reset (reset), - .valid_in (mem_bus_if.rsp_valid), - .ready_in (mem_bus_if.rsp_ready), - .data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}), - .data_out ({mem_rsp_tag_s, mem_rsp_data_s}), - .valid_out (mem_rsp_valid_s), - .ready_out (mem_rsp_ready_s) - ); - - /////////////////////////////////////////////////////////////////////////// + // Core requests dispatch ///////////////////////////////////////////////// wire [NUM_BANKS-1:0] per_bank_core_req_valid; wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr; @@ -220,7 +244,7 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data; wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag; wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx; - wire [NUM_BANKS-1:0] per_bank_core_req_flush; + wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_core_req_flags; wire [NUM_BANKS-1:0] per_bank_core_req_ready; wire [NUM_BANKS-1:0] per_bank_core_rsp_valid; @@ -230,33 +254,21 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0] per_bank_core_rsp_ready; wire [NUM_BANKS-1:0] per_bank_mem_req_valid; - wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr; + wire 
[NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_mem_req_addr; wire [NUM_BANKS-1:0] per_bank_mem_req_rw; wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen; wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data; - wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id; - wire [NUM_BANKS-1:0] per_bank_mem_req_flush; + wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_req_tag; + wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags; wire [NUM_BANKS-1:0] per_bank_mem_req_ready; - wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready; - - assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready; - - if (NUM_BANKS == 1) begin - assign mem_rsp_ready_s = per_bank_mem_rsp_ready; - end else begin - assign mem_rsp_ready_s = per_bank_mem_rsp_ready[`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s)]; - end - - // Bank requests dispatch - wire [NUM_REQS-1:0] core_req_valid; wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr; wire [NUM_REQS-1:0] core_req_rw; wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen; wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data; wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag; - wire [NUM_REQS-1:0] core_req_flush; + wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags; wire [NUM_REQS-1:0] core_req_ready; wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr; @@ -266,35 +278,38 @@ module VX_cache import VX_gpu_pkg::*; #( wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in; wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req assign core_req_valid[i] = core_bus2_if[i].req_valid; assign core_req_rw[i] = core_bus2_if[i].req_data.rw; assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen; assign core_req_addr[i] = core_bus2_if[i].req_data.addr; assign core_req_data[i] = core_bus2_if[i].req_data.data; assign core_req_tag[i] = core_bus2_if[i].req_data.tag; - assign 
core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH]; + assign core_req_flags[i] = `UP(FLAGS_WIDTH)'(core_bus2_if[i].req_data.flags); assign core_bus2_if[i].req_ready = core_req_ready[i]; end - for (genvar i = 0; i < NUM_REQS; ++i) begin - if (WORDS_PER_LINE > 1) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_wsel + if (WORDS_PER_LINE > 1) begin : g_wsel assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS]; - end else begin + end else begin : g_no_wsel assign core_req_wsel[i] = '0; end + end + + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_line_addr assign core_req_line_addr[i] = core_req_addr[i][(BANK_SEL_BITS + WORD_SEL_BITS) +: LINE_ADDR_WIDTH]; end - if (NUM_BANKS > 1) begin - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_bid + if (NUM_BANKS > 1) begin : g_multibanks assign core_req_bid[i] = core_req_addr[i][WORD_SEL_BITS +: BANK_SEL_BITS]; + end else begin : g_singlebank + assign core_req_bid[i] = '0; end - end else begin - assign core_req_bid = '0; end - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_data_in assign core_req_data_in[i] = { core_req_line_addr[i], core_req_rw[i], @@ -302,26 +317,26 @@ module VX_cache import VX_gpu_pkg::*; #( core_req_byteen[i], core_req_data[i], core_req_tag[i], - core_req_flush[i] + core_req_flags[i] }; end + assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready; + `ifdef PERF_ENABLE wire [`PERF_CTR_BITS-1:0] perf_collisions; `endif - `RESET_RELAY (req_xbar_reset, reset); - VX_stream_xbar #( .NUM_INPUTS (NUM_REQS), .NUM_OUTPUTS (NUM_BANKS), .DATAW (CORE_REQ_DATAW), .PERF_CTR_BITS (`PERF_CTR_BITS), - .ARBITER ("F"), + .ARBITER ("R"), .OUT_BUF (REQ_XBAR_BUF) ) req_xbar ( .clk (clk), - .reset (req_xbar_reset), + .reset (reset), `ifdef PERF_ENABLE .collisions(perf_collisions), `else @@ -337,7 +352,7 @@ module VX_cache import VX_gpu_pkg::*; 
#( .ready_out (per_bank_core_req_ready) ); - for (genvar i = 0; i < NUM_BANKS; ++i) begin + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_req_data_out assign { per_bank_core_req_addr[i], per_bank_core_req_rw[i], @@ -345,50 +360,42 @@ module VX_cache import VX_gpu_pkg::*; #( per_bank_core_req_byteen[i], per_bank_core_req_data[i], per_bank_core_req_tag[i], - per_bank_core_req_flush[i] + per_bank_core_req_flags[i] } = core_req_data_out[i]; end - // Banks access - for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks - wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr; - wire curr_bank_mem_rsp_valid; - - if (NUM_BANKS == 1) begin - assign curr_bank_mem_rsp_valid = mem_rsp_valid_s; - end else begin - assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == bank_id); - end - - `RESET_RELAY (bank_reset, reset); + // Banks access /////////////////////////////////////////////////////////// + for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : g_banks VX_cache_bank #( .BANK_ID (bank_id), - .INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)), + .INSTANCE_ID (`SFORMATF(("%s-bank%0d", INSTANCE_ID, bank_id))), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS), + .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), + .REPL_POLICY (REPL_POLICY), .CRSQ_SIZE (CRSQ_SIZE), .MSHR_SIZE (MSHR_SIZE), .MREQ_SIZE (MREQ_SIZE), - .WRITE_ENABLE (WRITE_ENABLE), - .DIRTY_BYTES (DIRTY_BYTES), - .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), - .CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF), - .MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF) + .FLAGS_WIDTH (FLAGS_WIDTH), + .CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)), + .MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 
0 : `TO_OUT_BUF_REG(MEM_OUT_BUF)) ) bank ( .clk (clk), - .reset (bank_reset), + .reset (reset), `ifdef PERF_ENABLE - .perf_read_misses (perf_read_miss_per_bank[bank_id]), - .perf_write_misses (perf_write_miss_per_bank[bank_id]), - .perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]), + .perf_read_miss (perf_read_miss_per_bank[bank_id]), + .perf_write_miss (perf_write_miss_per_bank[bank_id]), + .perf_mshr_stall (perf_mshr_stall_per_bank[bank_id]), `endif // Core request @@ -400,7 +407,7 @@ module VX_cache import VX_gpu_pkg::*; #( .core_req_data (per_bank_core_req_data[bank_id]), .core_req_tag (per_bank_core_req_tag[bank_id]), .core_req_idx (per_bank_core_req_idx[bank_id]), - .core_req_flush (per_bank_core_req_flush[bank_id]), + .core_req_flags (per_bank_core_req_flags[bank_id]), .core_req_ready (per_bank_core_req_ready[bank_id]), // Core response @@ -412,50 +419,49 @@ module VX_cache import VX_gpu_pkg::*; #( // Memory request .mem_req_valid (per_bank_mem_req_valid[bank_id]), - .mem_req_addr (curr_bank_mem_req_addr), + .mem_req_addr (per_bank_mem_req_addr[bank_id]), .mem_req_rw (per_bank_mem_req_rw[bank_id]), .mem_req_byteen (per_bank_mem_req_byteen[bank_id]), .mem_req_data (per_bank_mem_req_data[bank_id]), - .mem_req_id (per_bank_mem_req_id[bank_id]), - .mem_req_flush (per_bank_mem_req_flush[bank_id]), + .mem_req_tag (per_bank_mem_req_tag[bank_id]), + .mem_req_flags (per_bank_mem_req_flags[bank_id]), .mem_req_ready (per_bank_mem_req_ready[bank_id]), // Memory response - .mem_rsp_valid (curr_bank_mem_rsp_valid), - .mem_rsp_data (mem_rsp_data_s), - .mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)), + .mem_rsp_valid (per_bank_mem_rsp_valid[bank_id]), + .mem_rsp_data (per_bank_mem_rsp_data[bank_id]), + .mem_rsp_tag (per_bank_mem_rsp_tag[bank_id]), .mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]), + // Flush request .flush_begin (per_bank_flush_begin[bank_id]), + .flush_uuid (flush_uuid), .flush_end (per_bank_flush_end[bank_id]) ); - - if (NUM_BANKS == 1) begin - 
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr; - end else begin - assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id); - end end - // Bank responses gather + // Core responses gather ////////////////////////////////////////////////// wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in; wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out; - for (genvar i = 0; i < NUM_BANKS; ++i) begin + wire [NUM_REQS-1:0] core_rsp_valid_s; + wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s; + wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s; + wire [NUM_REQS-1:0] core_rsp_ready_s; + + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_rsp_data_in assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]}; end - `RESET_RELAY (rsp_xbar_reset, reset); - VX_stream_xbar #( .NUM_INPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_REQS), .DATAW (CORE_RSP_DATAW), - .ARBITER ("F") + .ARBITER ("R") ) rsp_xbar ( .clk (clk), - .reset (rsp_xbar_reset), + .reset (reset), `UNUSED_PIN (collisions), .valid_in (per_bank_core_rsp_valid), .data_in (core_rsp_data_in), @@ -467,113 +473,170 @@ module VX_cache import VX_gpu_pkg::*; #( `UNUSED_PIN (sel_out) ); - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_data_s assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i]; end - /////////////////////////////////////////////////////////////////////////// + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf + VX_elastic_buffer #( + .DATAW (`CS_WORD_WIDTH + TAG_WIDTH), + .SIZE (CORE_RSP_BUF_ENABLE ? 
`TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0), + .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) + ) core_rsp_buf ( + .clk (clk), + .reset (reset), + .valid_in (core_rsp_valid_s[i]), + .ready_in (core_rsp_ready_s[i]), + .data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}), + .data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}), + .valid_out (core_bus2_if[i].rsp_valid), + .ready_out (core_bus2_if[i].rsp_ready) + ); + end - wire mem_req_valid_p; - wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p; - wire mem_req_rw_p; - wire [LINE_SIZE-1:0] mem_req_byteen_p; - wire [`CS_LINE_WIDTH-1:0] mem_req_data_p; - wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p; - wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p; - wire mem_req_flush_p; - wire mem_req_ready_p; + // Memory request arbitration ///////////////////////////////////////////// - // Memory request arbitration - - wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in; - - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign data_in[i] = { - per_bank_mem_req_addr[i], + wire [NUM_BANKS-1:0][MEM_REQ_DATAW-1:0] per_bank_mem_req_pdata; + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_req_pdata + assign per_bank_mem_req_pdata[i] = { per_bank_mem_req_rw[i], - per_bank_mem_req_byteen[i], + per_bank_mem_req_addr[i], per_bank_mem_req_data[i], - per_bank_mem_req_id[i], - per_bank_mem_req_flush[i] + per_bank_mem_req_byteen[i], + per_bank_mem_req_flags[i], + per_bank_mem_req_tag[i] }; end + wire [MEM_PORTS-1:0] mem_req_valid; + wire [MEM_PORTS-1:0][MEM_REQ_DATAW-1:0] mem_req_pdata; + wire [MEM_PORTS-1:0] mem_req_ready; + wire [MEM_PORTS-1:0][MEM_ARB_SEL_WIDTH-1:0] mem_req_sel_out; + VX_stream_arb #( .NUM_INPUTS (NUM_BANKS), - .DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1), - .ARBITER ("F") + .NUM_OUTPUTS(MEM_PORTS), + .DATAW (MEM_REQ_DATAW), + .ARBITER ("R") ) mem_req_arb ( .clk (clk), .reset (reset), .valid_in (per_bank_mem_req_valid), + .data_in 
(per_bank_mem_req_pdata), .ready_in (per_bank_mem_req_ready), - .data_in (data_in), - .data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}), - .valid_out (mem_req_valid_p), - .ready_out (mem_req_ready_p), - `UNUSED_PIN (sel_out) + .valid_out (mem_req_valid), + .data_out (mem_req_pdata), + .ready_out (mem_req_ready), + .sel_out (mem_req_sel_out) ); - if (NUM_BANKS > 1) begin - wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr_p); - assign mem_req_tag_p = MEM_TAG_WIDTH'({mem_req_bank_id, mem_req_id_p}); - end else begin - assign mem_req_tag_p = MEM_TAG_WIDTH'(mem_req_id_p); - end + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_req_buf + wire mem_req_rw; + wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr; + wire [`CS_LINE_WIDTH-1:0] mem_req_data; + wire [LINE_SIZE-1:0] mem_req_byteen; + wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags; + wire [BANK_MEM_TAG_WIDTH-1:0] mem_req_tag; - // Memory request multi-port handling + assign { + mem_req_rw, + mem_req_addr, + mem_req_data, + mem_req_byteen, + mem_req_flags, + mem_req_tag + } = mem_req_pdata[i]; - assign mem_req_valid_s = mem_req_valid_p; - assign mem_req_addr_s = mem_req_addr_p; - assign mem_req_tag_s = mem_req_tag_p; - assign mem_req_flush_s = mem_req_flush_p; - assign mem_req_ready_p = mem_req_ready_s; + wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_w; + wire [MEM_TAG_WIDTH-1:0] mem_req_tag_w; + wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags_w; - if (WRITE_ENABLE != 0) begin - assign mem_req_rw_s = mem_req_rw_p; - assign mem_req_byteen_s = mem_req_byteen_p; - assign mem_req_data_s = mem_req_data_p; - end else begin - `UNUSED_VAR (mem_req_byteen_p) - `UNUSED_VAR (mem_req_data_p) - `UNUSED_VAR (mem_req_rw_p) + if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks + if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel + wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id; + VX_bits_concat #( + .L (MEM_ARB_SEL_BITS), + .R (MEM_PORTS_SEL_BITS) + ) 
bank_id_concat ( + .left_in (mem_req_sel_out[i]), + .right_in (MEM_PORTS_SEL_WIDTH'(i)), + .data_out (mem_req_bank_id) + ); + assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, mem_req_bank_id}); + assign mem_req_tag_w = {mem_req_tag, mem_req_sel_out[i]}; + end else begin : g_no_arb_sel + `UNUSED_VAR (mem_req_sel_out) + assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, MEM_PORTS_SEL_WIDTH'(i)}); + assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag); + end + end else begin : g_mem_req_tag + `UNUSED_VAR (mem_req_sel_out) + assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'(mem_req_addr); + assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag); + end - assign mem_req_rw_s = 0; - assign mem_req_byteen_s = {LINE_SIZE{1'b1}}; - assign mem_req_data_s = '0; + VX_elastic_buffer #( + .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)), + .SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), + .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) + ) mem_req_buf ( + .clk (clk), + .reset (reset), + .valid_in (mem_req_valid[i]), + .ready_in (mem_req_ready[i]), + .data_in ({mem_req_rw, mem_req_byteen, mem_req_addr_w, mem_req_data, mem_req_tag_w, mem_req_flags}), + .data_out ({mem_bus_tmp_if[i].req_data.rw, mem_bus_tmp_if[i].req_data.byteen, mem_bus_tmp_if[i].req_data.addr, mem_bus_tmp_if[i].req_data.data, mem_bus_tmp_if[i].req_data.tag, mem_req_flags_w}), + .valid_out (mem_bus_tmp_if[i].req_valid), + .ready_out (mem_bus_tmp_if[i].req_ready) + ); + + if (FLAGS_WIDTH != 0) begin : g_mem_req_flags + assign mem_bus_tmp_if[i].req_data.flags = mem_req_flags_w; + end else begin : g_no_mem_req_flags + assign mem_bus_tmp_if[i].req_data.flags = '0; + `UNUSED_VAR (mem_req_flags_w) + end + + if (WRITE_ENABLE) begin : g_mem_bus_if + `ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]); + end else begin : g_mem_bus_if_ro + `ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]); + end end `ifdef PERF_ENABLE - // per cycle: core_reads, 
core_writes - wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; - wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; - - wire [NUM_REQS-1:0] perf_core_reads_per_req; - wire [NUM_REQS-1:0] perf_core_writes_per_req; - - // per cycle: read misses, write misses, msrq stalls, pipeline stalls - wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; - wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; - wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; - wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; + wire [NUM_REQS-1:0] perf_core_reads_per_req; + wire [NUM_REQS-1:0] perf_core_writes_per_req; + wire [NUM_REQS-1:0] perf_crsp_stall_per_req; + wire [MEM_PORTS-1:0] perf_mem_stall_per_port; `BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw); `BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw); + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req + assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready; + end + + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port + assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready; + end + + // per cycle: read misses, write misses, msrq stalls, pipeline stalls + wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; + wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; + wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; + wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; + wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle; + `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req); `POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank); `POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank); 
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank); - - wire [NUM_REQS-1:0] perf_crsp_stall_per_req; - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready; - end - `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req); - - wire perf_mem_stall_per_cycle = mem_bus_if.req_valid && ~mem_bus_if.req_ready; + `POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port); reg [`PERF_CTR_BITS-1:0] perf_core_reads; reg [`PERF_CTR_BITS-1:0] perf_core_writes; diff --git a/hw/rtl/cache/VX_cache_bank.sv b/hw/rtl/cache/VX_cache_bank.sv index dbbb4aba3..2a5db12c4 100644 --- a/hw/rtl/cache/VX_cache_bank.sv +++ b/hw/rtl/cache/VX_cache_bank.sv @@ -47,19 +47,26 @@ module VX_cache_bank #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_FIFO, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, - // Core response output buffer - parameter CORE_OUT_BUF = 0, + // core request flags + parameter FLAGS_WIDTH = 0, - // Memory request output buffer - parameter MEM_OUT_BUF = 0, + // Core response output register + parameter CORE_OUT_REG = 0, + + // Memory request output register + parameter MEM_OUT_REG = 0, parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE), + parameter MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH, parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS), parameter WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS) ) ( @@ -67,9 +74,9 @@ module VX_cache_bank #( input wire reset, `ifdef PERF_ENABLE - output wire perf_read_misses, - output wire perf_write_misses, - output wire perf_mshr_stalls, + output wire perf_read_miss, + output wire perf_write_miss, + output wire perf_mshr_stall, `endif // Core Request @@ -81,7 +88,7 @@ module VX_cache_bank #( input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written input wire [TAG_WIDTH-1:0] core_req_tag, // 
identifier of the request (request id) input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array - input wire core_req_flush, // flush enable + input wire [`UP(FLAGS_WIDTH)-1:0] core_req_flags, output wire core_req_ready, // Core Response @@ -97,18 +104,19 @@ module VX_cache_bank #( output wire mem_req_rw, output wire [LINE_SIZE-1:0] mem_req_byteen, output wire [`CS_LINE_WIDTH-1:0] mem_req_data, - output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr - output wire mem_req_flush, + output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, + output wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags, input wire mem_req_ready, // Memory response input wire mem_rsp_valid, input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data, - input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id, + input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, output wire mem_rsp_ready, // flush input wire flush_begin, + input wire [`UP(UUID_WIDTH)-1:0] flush_uuid, output wire flush_end ); @@ -136,43 +144,45 @@ module VX_cache_bank #( wire [MSHR_ADDR_WIDTH-1:0] replay_id; wire replay_ready; - wire is_init_st0, is_init_st1; + + wire valid_sel, valid_st0, valid_st1; + wire is_init_st0; + wire is_creq_st0, is_creq_st1; + wire is_fill_st0, is_fill_st1; wire is_flush_st0, is_flush_st1; - wire [NUM_WAYS-1:0] flush_way_st0; + wire [`CS_WAY_SEL_WIDTH-1:0] flush_way_st0, evict_way_st0; + wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_st0, way_idx_st1; wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1; - wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1; + wire [`CS_LINE_SEL_BITS-1:0] line_idx_st0, line_idx_st1; + wire [`CS_TAG_SEL_BITS-1:0] line_tag_st0, line_tag_st1; + wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0, evict_tag_st1; wire rw_sel, rw_st0, rw_st1; - wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1; + wire [WORD_SEL_WIDTH-1:0] word_idx_sel, word_idx_st0, word_idx_st1; wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1; wire [REQ_SEL_WIDTH-1:0] req_idx_sel, 
req_idx_st0, req_idx_st1; wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1; - wire [`CS_WORD_WIDTH-1:0] read_data_st1; + wire [`CS_WORD_WIDTH-1:0] write_word_st0, write_word_st1; wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1; - wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1; - wire valid_sel, valid_st0, valid_st1; - wire is_creq_st0, is_creq_st1; - wire is_fill_st0, is_fill_st1; + wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1; + wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0; + wire is_dirty_st0, is_dirty_st1; wire is_replay_st0, is_replay_st1; - wire creq_flush_sel, creq_flush_st0, creq_flush_st1; - wire evict_dirty_st0, evict_dirty_st1; - wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1; - wire [NUM_WAYS-1:0] tag_matches_st0; - wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; - wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1; + wire is_hit_st0, is_hit_st1; + wire [`UP(FLAGS_WIDTH)-1:0] flags_sel, flags_st0, flags_st1; wire mshr_pending_st0, mshr_pending_st1; + wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1; wire mshr_empty; wire flush_valid; wire init_valid; wire [`CS_LINE_SEL_BITS-1:0] flush_sel; - wire [NUM_WAYS-1:0] flush_way; + wire [`CS_WAY_SEL_WIDTH-1:0] flush_way; wire flush_ready; // ensure we have no pending memory request in the bank wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty; - // flush unit VX_bank_flush #( .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), @@ -194,11 +204,7 @@ module VX_cache_bank #( .bank_empty (no_pending_req) ); - wire rdw_hazard1_sel; - wire rdw_hazard2_sel; - reg rdw_hazard3_st1; - - wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1; + wire pipe_stall = crsp_queue_stall; // inputs arbitration: // mshr replay has highest priority to maximize utilization since there is no miss. 
@@ -217,216 +223,217 @@ module VX_cache_bank #( wire creq_enable = creq_grant && core_req_valid; assign replay_ready = replay_grant - && ~rdw_hazard1_sel + && ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) // needed for writethrough && ~pipe_stall; assign mem_rsp_ready = fill_grant - && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions - && ~rdw_hazard2_sel + && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback && ~pipe_stall; assign flush_ready = flush_grant - && (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions - && ~rdw_hazard2_sel + && ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback && ~pipe_stall; assign core_req_ready = creq_grant - && ~mreq_queue_alm_full - && ~mshr_alm_full + && ~mreq_queue_alm_full // needed for fill requests + && ~mshr_alm_full // needed for mshr allocation && ~pipe_stall; wire init_fire = init_valid; wire replay_fire = replay_valid && replay_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - wire flush_fire = flush_valid && flush_ready; + wire flush_fire = flush_valid && flush_ready; wire core_req_fire = core_req_valid && core_req_ready; + wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0]; + + wire [TAG_WIDTH-1:0] mem_rsp_tag_s; + if (TAG_WIDTH > MEM_TAG_WIDTH) begin : g_mem_rsp_tag_s_pad + assign mem_rsp_tag_s = {mem_rsp_tag, (TAG_WIDTH-MEM_TAG_WIDTH)'(1'b0)}; + end else begin : g_mem_rsp_tag_s_cut + assign mem_rsp_tag_s = mem_rsp_tag[MEM_TAG_WIDTH-1 -: TAG_WIDTH]; + `UNUSED_VAR (mem_rsp_tag) + end + + wire [TAG_WIDTH-1:0] flush_tag; + if (UUID_WIDTH != 0) begin : g_flush_tag_uuid + assign flush_tag = {flush_uuid, (TAG_WIDTH-UUID_WIDTH)'(1'b0)}; + end else begin : g_flush_tag_0 + `UNUSED_VAR (flush_uuid) + assign flush_tag = '0; + end + assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire; assign rw_sel = replay_valid ? replay_rw : core_req_rw; assign byteen_sel = replay_valid ? 
replay_byteen : core_req_byteen; - assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel; - assign req_idx_sel = replay_valid ? replay_idx : core_req_idx; - assign tag_sel = replay_valid ? replay_tag : core_req_tag; - assign creq_flush_sel = core_req_valid && core_req_flush; - assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) : (replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr)); + assign word_idx_sel= replay_valid ? replay_wsel : core_req_wsel; + assign req_idx_sel = replay_valid ? replay_idx : core_req_idx; + assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) : + (replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : core_req_tag)); + assign flags_sel = core_req_valid ? core_req_flags : '0; - if (WRITE_ENABLE) begin - assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data); - end else begin - assign data_sel[`CS_WORD_WIDTH-1:0] = mem_rsp_data[`CS_WORD_WIDTH-1:0]; + if (WRITE_ENABLE) begin : g_data_sel + for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i + if (i < `CS_WORD_WIDTH) begin : g_lo + assign data_sel[i] = replay_valid ? replay_data[i] : (mem_rsp_valid ? 
mem_rsp_data[i] : core_req_data[i]); + end else begin : g_hi + assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel + end + end + end else begin : g_data_sel_ro + assign data_sel = mem_rsp_data; `UNUSED_VAR (core_req_data) `UNUSED_VAR (replay_data) end - for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin - assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel + + if (UUID_WIDTH != 0) begin : g_req_uuid_sel + assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH]; + end else begin : g_req_uuid_sel_0 + assign req_uuid_sel = '0; end - if (UUID_WIDTH != 0) begin - assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin - assign req_uuid_sel = 0; - end + wire is_init_sel = init_valid; + wire is_creq_sel = creq_enable || replay_enable; + wire is_fill_sel = fill_enable; + wire is_flush_sel = flush_enable; + wire is_replay_sel = replay_enable; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH), .RESETW (1) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}), - .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0}) + .data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, flags_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, 
word_idx_sel, req_idx_sel, tag_sel, replay_id}), + .data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, flags_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0}) ); - if (UUID_WIDTH != 0) begin + if (UUID_WIDTH != 0) begin : g_req_uuid_st0 assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin - assign req_uuid_st0 = 0; + end else begin : g_req_uuid_st0_0 + assign req_uuid_st0 = '0; end - wire do_init_st0 = valid_st0 && is_init_st0; - wire do_flush_st0 = valid_st0 && is_flush_st0; - wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0; - wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0; - wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0; - wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0; - wire do_fill_st0 = valid_st0 && is_fill_st0; - wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0; - wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0; - wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0; + wire is_read_st0 = is_creq_st0 && ~rw_st0; + wire is_write_st0 = is_creq_st0 && rw_st0; - wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0]; + wire do_init_st0 = valid_st0 && is_init_st0; + wire do_flush_st0 = valid_st0 && is_flush_st0; + wire do_read_st0 = valid_st0 && is_read_st0; + wire do_write_st0 = valid_st0 && is_write_st0; + wire do_fill_st0 = valid_st0 && is_fill_st0; - assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0]; + wire is_read_st1 = is_creq_st1 && ~rw_st1; + wire is_write_st1 = is_creq_st1 && rw_st1; - wire [NUM_WAYS-1:0] evict_way_st0; - wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0; + wire do_read_st1 = valid_st1 && is_read_st1; + wire do_write_st1 = valid_st1 && is_write_st1; + + assign line_idx_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0]; + assign line_tag_st0 = `CS_LINE_ADDR_TAG(addr_st0); + + assign write_word_st0 = data_st0[`CS_WORD_WIDTH-1:0]; + + wire 
do_lookup_st0 = do_read_st0 || do_write_st0; + wire do_lookup_st1 = do_read_st1 || do_write_st1; + + wire [`CS_WAY_SEL_WIDTH-1:0] victim_way_st0; + wire [NUM_WAYS-1:0] tag_matches_st0; + + VX_cache_repl #( + .CACHE_SIZE (CACHE_SIZE), + .LINE_SIZE (LINE_SIZE), + .NUM_BANKS (NUM_BANKS), + .NUM_WAYS (NUM_WAYS), + .REPL_POLICY (REPL_POLICY) + ) cache_repl ( + .clk (clk), + .reset (reset), + .stall (pipe_stall), + .init (do_init_st0), + .lookup_valid(do_lookup_st1 && ~pipe_stall), + .lookup_hit (is_hit_st1), + .lookup_line(line_idx_st1), + .lookup_way (way_idx_st1), + .repl_valid (do_fill_st0 && ~pipe_stall), + .repl_line (line_idx_st0), + .repl_way (victim_way_st0) + ); + + assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0; VX_cache_tags #( - .INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)), - .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), - .WRITEBACK (WRITEBACK), - .UUID_WIDTH (UUID_WIDTH) + .WRITEBACK (WRITEBACK) ) cache_tags ( .clk (clk), .reset (reset), - - .req_uuid (req_uuid_st0), - - .stall (pipe_stall), - - // init/flush/fill/write/lookup + // inputs .init (do_init_st0), - .flush (do_flush_st0), - .fill (do_fill_st0), - .write (do_cache_wr_st0), - .lookup (do_lookup_st0), - .line_addr (addr_st0), - .way_sel (flush_way_st0), - .tag_matches(tag_matches_st0), - - // replacement - .evict_dirty(evict_dirty_st0), + .flush (do_flush_st0 && ~pipe_stall), + .fill (do_fill_st0 && ~pipe_stall), + .read (do_read_st0 && ~pipe_stall), + .write (do_write_st0 && ~pipe_stall), + .line_idx (line_idx_st0), + .line_tag (line_tag_st0), .evict_way (evict_way_st0), + // outputs + .tag_matches(tag_matches_st0), + .evict_dirty(is_dirty_st0), .evict_tag (evict_tag_st0) ); - wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0; + wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0; + VX_onehot_encoder #( + .N (NUM_WAYS) + ) way_idx_enc ( + .data_in (tag_matches_st0), + .data_out (hit_idx_st0), + 
`UNUSED_PIN (valid_out) + ); - wire is_flush2_st0 = WRITEBACK && is_flush_st0; + assign way_idx_st0 = is_creq_st0 ? hit_idx_st0 : evict_way_st0; + assign is_hit_st0 = (| tag_matches_st0); - assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0; - - assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0; - - assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0; + wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; + assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0; VX_pipe_register #( - .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1), + .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1), .RESETW (1) ) pipe_reg1 ( .clk (clk), .reset (reset), .enable (~pipe_stall), - .data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}), - .data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1}) + .data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, flags_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, data_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0}), + .data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, 
is_dirty_st1, is_hit_st1, rw_st1, flags_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, data_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1}) ); - // we have a tag hit - wire is_hit_st1 = (| way_sel_st1); - - if (UUID_WIDTH != 0) begin + if (UUID_WIDTH != 0) begin : g_req_uuid_st1 assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin - assign req_uuid_st1 = 0; + end else begin : g_req_uuid_st1_0 + assign req_uuid_st1 = '0; end - wire is_read_st1 = is_creq_st1 && ~rw_st1; - wire is_write_st1 = is_creq_st1 && rw_st1; - - wire do_init_st1 = valid_st1 && is_init_st1; - wire do_fill_st1 = valid_st1 && is_fill_st1; - wire do_flush_st1 = valid_st1 && is_flush_st1; - - wire do_creq_rd_st1 = valid_st1 && is_read_st1; - wire do_creq_wr_st1 = valid_st1 && is_write_st1; - wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1; - wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1; - - wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1; - wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1; - - wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1; - wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1; - - wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1; - wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1; - - assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0]; - - `UNUSED_VAR (do_write_miss_st1) + assign addr_st1 = {line_tag_st1, line_idx_st1}; // ensure mshr replay always get a hit - `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay")); + `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1 && ~is_hit_st1), ("%t: missed mshr replay", $time)) - // both tag and data stores use BRAM with no read-during-write protection. - // we ned to stall the pipeline to prevent read-after-write hazards. 
- assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill - assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceeding write - always @(posedge clk) begin - // stall reads following writes to same line address - rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1) - && ~rdw_hazard3_st1; // release pipeline stall - end + assign write_word_st1 = data_st1[`CS_WORD_WIDTH-1:0]; + `UNUSED_VAR (data_st1) - wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}}; - wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1; - wire [LINE_SIZE-1:0] write_byteen_st1; - - wire [`CS_LINE_WIDTH-1:0] dirty_data_st1; - wire [LINE_SIZE-1:0] dirty_byteen_st1; - - if (`CS_WORDS_PER_LINE > 1) begin - reg [LINE_SIZE-1:0] write_byteen_r; - always @(*) begin - write_byteen_r = '0; - write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1; - end - assign write_byteen_st1 = write_byteen_r; - end else begin - assign write_byteen_st1 = byteen_st1; - end + wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_st1; + wire [LINE_SIZE-1:0] evict_byteen_st1; VX_cache_data #( - .INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)), - .BANK_ID (BANK_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), @@ -434,56 +441,57 @@ module VX_cache_bank #( .WORD_SIZE (WORD_SIZE), .WRITE_ENABLE (WRITE_ENABLE), .WRITEBACK (WRITEBACK), - .DIRTY_BYTES (DIRTY_BYTES), - .UUID_WIDTH (UUID_WIDTH) + .DIRTY_BYTES (DIRTY_BYTES) ) cache_data ( .clk (clk), .reset (reset), - - .req_uuid (req_uuid_st1), - - .stall (pipe_stall), - - .init (do_init_st1), - .read (do_cache_rd_st1), - .fill (do_fill_st1), - .flush (do_flush_st1), - .write (do_cache_wr_st1), - .way_sel (way_sel_st1), - .line_addr (addr_st1), - .wsel (wsel_st1), - .fill_data (fill_data_st1), - .write_data (write_data_st1), - .write_byteen(write_byteen_st1), + // inputs + .init (do_init_st0), + .fill 
(do_fill_st0 && ~pipe_stall), + .flush (do_flush_st0 && ~pipe_stall), + .read (do_read_st0 && ~pipe_stall), + .write (do_write_st0 && ~pipe_stall), + .evict_way (evict_way_st0), + .tag_matches(tag_matches_st0), + .line_idx (line_idx_st0), + .fill_data (data_st0), + .write_word (write_word_st0), + .word_idx (word_idx_st0), + .write_byteen(byteen_st0), + .way_idx_r (way_idx_st1), + // outputs .read_data (read_data_st1), - .dirty_data (dirty_data_st1), - .dirty_byteen(dirty_byteen_st1) + .evict_byteen(evict_byteen_st1) ); - wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0; - wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0; - wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall; - wire mshr_lookup_st0 = mshr_allocate_st0; - wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall; + // only allocate MSHR entries for non-replay core requests + wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~is_replay_st0; + wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~is_replay_st1; // release allocated mshr entry if we had a hit wire mshr_release_st1; - if (WRITEBACK) begin + if (WRITEBACK) begin : g_mshr_release assign mshr_release_st1 = is_hit_st1; - end else begin - // we need to keep missed write requests in MSHR if there is already a pending entry to the same address - // this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content - // this can happen when writes are sent late, when the fill was already in flight. + end else begin : g_mshr_release_ro + // we need to keep missed write requests in MSHR if there is already a pending entry to the same address. + // this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content. + // this can happen when writes are sent to memory late, when a related fill was already in flight. 
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1); end + wire mshr_release_fire = mshr_finalize_st1 && mshr_release_st1 && ~pipe_stall; + + wire [1:0] mshr_dequeue; + `POP_COUNT(mshr_dequeue, {replay_fire, mshr_release_fire}); + VX_pending_size #( - .SIZE (MSHR_SIZE) + .SIZE (MSHR_SIZE), + .DECRW (2) ) mshr_pending_size ( .clk (clk), .reset (reset), .incr (core_req_fire), - .decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)), + .decr (mshr_dequeue), .empty (mshr_empty), `UNUSED_PIN (alm_empty), .full (mshr_alm_full), @@ -492,11 +500,12 @@ module VX_cache_bank #( ); VX_cache_mshr #( - .INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-mshr", INSTANCE_ID))), .BANK_ID (BANK_ID), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .MSHR_SIZE (MSHR_SIZE), + .WRITEBACK (WRITEBACK), .UUID_WIDTH (UUID_WIDTH), .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH) ) cache_mshr ( @@ -504,7 +513,7 @@ module VX_cache_bank #( .reset (reset), .deq_req_uuid (req_uuid_sel), - .lkp_req_uuid (req_uuid_st0), + .alc_req_uuid (req_uuid_st0), .fin_req_uuid (req_uuid_st1), // memory fill @@ -521,37 +530,23 @@ module VX_cache_bank #( .dequeue_ready (replay_ready), // allocate - .allocate_valid (mshr_allocate_st0), + .allocate_valid (mshr_allocate_st0 && ~pipe_stall), .allocate_addr (addr_st0), .allocate_rw (rw_st0), - .allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}), + .allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0}), .allocate_id (mshr_alloc_id_st0), - .allocate_prev (mshr_prev_st0), + .allocate_pending(mshr_pending_st0), + .allocate_previd(mshr_previd_st0), `UNUSED_PIN (allocate_ready), - // lookup - .lookup_valid (mshr_lookup_st0), - .lookup_addr (addr_st0), - .lookup_pending (mshr_lookup_pending_st0), - .lookup_rw (mshr_lookup_rw_st0), - // finalize - .finalize_valid (mshr_finalize_st1), - .finalize_release(mshr_release_st1), - 
.finalize_pending(mshr_pending_st1), + .finalize_valid (mshr_finalize_st1 && ~pipe_stall), + .finalize_is_release(mshr_release_st1), + .finalize_is_pending(mshr_pending_st1), .finalize_id (mshr_id_st1), - .finalize_prev (mshr_prev_st1) + .finalize_previd(mshr_previd_st1) ); - // check if there are pending requests to same line in the MSHR - wire [MSHR_SIZE-1:0] lookup_matches; - for (genvar i = 0; i < MSHR_SIZE; ++i) begin - assign lookup_matches[i] = mshr_lookup_pending_st0[i] - && (i != mshr_alloc_id_st0) // exclude current mshr id - && (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough - end - assign mshr_pending_st0 = (| lookup_matches); - // schedule core response wire crsp_queue_valid, crsp_queue_ready; @@ -559,19 +554,19 @@ module VX_cache_bank #( wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx; wire [TAG_WIDTH-1:0] crsp_queue_tag; - assign crsp_queue_valid = do_cache_rd_st1; + assign crsp_queue_valid = do_read_st1 && is_hit_st1; assign crsp_queue_idx = req_idx_st1; - assign crsp_queue_data = read_data_st1; + assign crsp_queue_data = read_data_st1[word_idx_st1]; assign crsp_queue_tag = tag_st1; VX_elastic_buffer #( .DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH), .SIZE (CRSQ_SIZE), - .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) + .OUT_REG (CORE_OUT_REG) ) core_rsp_queue ( .clk (clk), .reset (reset), - .valid_in (crsp_queue_valid && ~rdw_hazard3_st1), + .valid_in (crsp_queue_valid), .ready_in (crsp_queue_ready), .data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}), .data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}), @@ -587,59 +582,93 @@ module VX_cache_bank #( wire [`CS_LINE_WIDTH-1:0] mreq_queue_data; wire [LINE_SIZE-1:0] mreq_queue_byteen; wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr; - wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id; + wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag; wire mreq_queue_rw; - wire mreq_queue_flush; + wire [`UP(FLAGS_WIDTH)-1:0] mreq_queue_flags; - wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1; 
+ wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK); wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1; - wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1; + wire do_writeback_st1 = do_fill_or_flush_st1 && is_dirty_st1; + wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_st1 = {evict_tag_st1, line_idx_st1}; - if (WRITEBACK) begin - if (DIRTY_BYTES) begin - // ensure dirty bytes match the tag info - wire has_dirty_bytes = (| dirty_byteen_st1); - `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID))); + if (WRITE_ENABLE) begin : g_mreq_queue + if (WRITEBACK) begin : g_wb + if (DIRTY_BYTES) begin : g_dirty_bytes + // ensure dirty bytes match the tag info + wire has_dirty_bytes = (| evict_byteen_st1); + `RUNTIME_ASSERT (~do_fill_or_flush_st1 || (is_dirty_st1 == has_dirty_bytes), ("%t: missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", $time, is_dirty_st1, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID))) + end + // issue a fill request on a read/write miss + // issue a writeback on a dirty line eviction + assign mreq_queue_push = ((do_lookup_st1 && ~is_hit_st1 && ~mshr_pending_st1) + || do_writeback_st1) + && ~pipe_stall; + assign mreq_queue_addr = is_fill_or_flush_st1 ? evict_addr_st1 : addr_st1; + assign mreq_queue_rw = is_fill_or_flush_st1; + assign mreq_queue_data = read_data_st1; + assign mreq_queue_byteen = is_fill_or_flush_st1 ? 
evict_byteen_st1 : '1; + `UNUSED_VAR (write_word_st1) + `UNUSED_VAR (byteen_st1) + end else begin : g_wt + wire [LINE_SIZE-1:0] line_byteen; + VX_demux #( + .DATAW (WORD_SIZE), + .N (`CS_WORDS_PER_LINE) + ) byteen_demux ( + .sel_in (word_idx_st1), + .data_in (byteen_st1), + .data_out (line_byteen) + ); + // issue a fill request on a read miss + // issue a memory write on a write request + assign mreq_queue_push = ((do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) + || do_write_st1) + && ~pipe_stall; + assign mreq_queue_addr = addr_st1; + assign mreq_queue_rw = rw_st1; + assign mreq_queue_data = {`CS_WORDS_PER_LINE{write_word_st1}}; + assign mreq_queue_byteen = rw_st1 ? line_byteen : '1; + `UNUSED_VAR (is_fill_or_flush_st1) + `UNUSED_VAR (do_writeback_st1) + `UNUSED_VAR (evict_addr_st1) + `UNUSED_VAR (evict_byteen_st1) end - assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1) - || do_writeback_st1) - && ~rdw_hazard3_st1; - end else begin + end else begin : g_mreq_queue_ro + // issue a fill request on a read miss + assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1) + && ~pipe_stall; + assign mreq_queue_addr = addr_st1; + assign mreq_queue_rw = 0; + assign mreq_queue_data = '0; + assign mreq_queue_byteen = '1; `UNUSED_VAR (do_writeback_st1) - assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1) - || do_creq_wr_st1) - && ~rdw_hazard3_st1; + `UNUSED_VAR (evict_addr_st1) + `UNUSED_VAR (evict_byteen_st1) + `UNUSED_VAR (write_word_st1) + `UNUSED_VAR (byteen_st1) + end + + if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid + assign mreq_queue_tag = {req_uuid_st1, mshr_id_st1}; + end else begin : g_mreq_queue_tag + assign mreq_queue_tag = mshr_id_st1; end assign mreq_queue_pop = mem_req_valid && mem_req_ready; - assign mreq_queue_addr = addr_st1; - assign mreq_queue_id = mshr_id_st1; - assign mreq_queue_flush = creq_flush_st1; - - if (WRITE_ENABLE) begin - assign mreq_queue_rw = WRITEBACK ? 
is_fill_or_flush_st1 : rw_st1; - assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1; - assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1; - end else begin - assign mreq_queue_rw = 0; - assign mreq_queue_data = 0; - assign mreq_queue_byteen = 0; - `UNUSED_VAR (dirty_data_st1) - `UNUSED_VAR (dirty_byteen_st1) - end + assign mreq_queue_flags = flags_st1; VX_fifo_queue #( - .DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1), + .DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)), .DEPTH (MREQ_SIZE), - .ALM_FULL (MREQ_SIZE-PIPELINE_STAGES), - .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) + .ALM_FULL (MREQ_SIZE - PIPELINE_STAGES), + .OUT_REG (MEM_OUT_REG) ) mem_req_queue ( .clk (clk), .reset (reset), .push (mreq_queue_push), .pop (mreq_queue_pop), - .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}), - .data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}), + .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_flags}), + .data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_flags}), .empty (mreq_queue_empty), .alm_full (mreq_queue_alm_full), `UNUSED_PIN (full), @@ -649,44 +678,101 @@ module VX_cache_bank #( assign mem_req_valid = ~mreq_queue_empty; + `UNUSED_VAR (do_lookup_st0) + /////////////////////////////////////////////////////////////////////////////// `ifdef PERF_ENABLE - assign perf_read_misses = do_read_miss_st1; - assign perf_write_misses = do_write_miss_st1; - assign perf_mshr_stalls = mshr_alm_full; + assign perf_read_miss = do_read_st1 && ~is_hit_st1; + assign perf_write_miss = do_write_st1 && ~is_hit_st1; + assign perf_mshr_stall = mshr_alm_full; `endif `ifdef DBG_TRACE_CACHE wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready; wire input_stall = 
(replay_valid || mem_rsp_valid || core_req_valid || flush_valid) && ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire); + + wire [`XLEN-1:0] mem_rsp_full_addr = `CS_BANK_TO_FULL_ADDR(mem_rsp_addr, BANK_ID); + wire [`XLEN-1:0] replay_full_addr = `CS_BANK_TO_FULL_ADDR(replay_addr, BANK_ID); + wire [`XLEN-1:0] core_req_full_addr = `CS_BANK_TO_FULL_ADDR(core_req_addr, BANK_ID); + wire [`XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(addr_st0, BANK_ID); + wire [`XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID); + wire [`XLEN-1:0] mreq_queue_full_addr = `CS_BANK_TO_FULL_ADDR(mreq_queue_addr, BANK_ID); + always @(posedge clk) begin if (input_stall || pipe_stall) begin - `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1)); + `TRACE(4, ("%t: *** %s stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, + crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full)) end if (mem_rsp_fire) begin - `TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data)); + `TRACE(2, ("%t: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + mem_rsp_full_addr, mem_rsp_id, mem_rsp_data, req_uuid_sel)) end if (replay_fire) begin - `TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel)); + `TRACE(2, ("%t: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, + replay_full_addr, replay_tag, replay_idx, req_uuid_sel)) end if (core_req_fire) begin - if (core_req_rw) - `TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), 
core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)); - else - `TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel)); + if (core_req_rw) begin + `TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + core_req_full_addr, core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)) + end else begin + `TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, + core_req_full_addr, core_req_tag, core_req_idx, req_uuid_sel)) + end + end + if (do_init_st0) begin + `TRACE(3, ("%t: %s tags-init: addr=0x%0h, line=%0d\n", $time, INSTANCE_ID, full_addr_st0, line_idx_st0)) + end + if (do_fill_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s tags-fill: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID, + full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0)) + end + if (do_flush_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s tags-flush: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID, + full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0)) + end + if (do_lookup_st0 && ~pipe_stall) begin + if (is_hit_st0) begin + `TRACE(3, ("%t: %s tags-hit: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, + full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) + end else begin + `TRACE(3, ("%t: %s tags-miss: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, + full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0)) + end + end + if (do_fill_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-fill: addr=0x%0h, way=%0d, line=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + full_addr_st0, way_idx_st0, line_idx_st0, data_st0, req_uuid_st0)) + end 
+ if (do_flush_st0 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-flush: addr=0x%0h, way=%0d, line=%0d (#%0d)\n", $time, INSTANCE_ID, + full_addr_st0, way_idx_st0, line_idx_st0, req_uuid_st0)) + end + if (do_read_st1 && is_hit_st1 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, crsp_queue_data, req_uuid_st1)) + end + if (do_write_st1 && is_hit_st1 && ~pipe_stall) begin + `TRACE(3, ("%t: %s data-write: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, byteen_st1, write_word_st1, req_uuid_st1)) end if (crsp_queue_fire) begin - `TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)); + `TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + full_addr_st1, crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1)) end if (mreq_queue_push) begin - if (do_creq_wr_st1 && !WRITEBACK) - `TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1)); - else if (do_writeback_st1) - `TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data)); - else - `TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1)); + if (!WRITEBACK && do_write_st1) begin + `TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + mreq_queue_full_addr, 
mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) + end else if (WRITEBACK && do_writeback_st1) begin + `TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, + mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1)) + end else begin + `TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, + mreq_queue_full_addr, mshr_id_st1, req_uuid_st1)) + end end end `endif diff --git a/hw/rtl/cache/VX_cache_bypass.sv b/hw/rtl/cache/VX_cache_bypass.sv index 379d33e8a..7e78db71a 100644 --- a/hw/rtl/cache/VX_cache_bypass.sv +++ b/hw/rtl/cache/VX_cache_bypass.sv @@ -15,10 +15,10 @@ module VX_cache_bypass #( parameter NUM_REQS = 1, + parameter MEM_PORTS = 1, parameter TAG_SEL_IDX = 0, - parameter PASSTHRU = 0, - parameter NC_ENABLE = 0, + parameter CACHE_ENABLE = 0, parameter WORD_SIZE = 1, parameter LINE_SIZE = 1, @@ -29,14 +29,11 @@ module VX_cache_bypass #( parameter MEM_ADDR_WIDTH = 1, parameter MEM_TAG_IN_WIDTH = 1, - parameter MEM_TAG_OUT_WIDTH = 1, parameter UUID_WIDTH = 0, parameter CORE_OUT_BUF = 0, - parameter MEM_OUT_BUF = 0, - - parameter CORE_DATA_WIDTH = WORD_SIZE * 8 + parameter MEM_OUT_BUF = 0 ) ( input wire clk, input wire reset, @@ -48,304 +45,222 @@ module VX_cache_bypass #( VX_mem_bus_if.master core_bus_out_if [NUM_REQS], // Memory request in - VX_mem_bus_if.slave mem_bus_in_if, + VX_mem_bus_if.slave mem_bus_in_if [MEM_PORTS], // Memory request out - VX_mem_bus_if.master mem_bus_out_if + VX_mem_bus_if.master mem_bus_out_if [MEM_PORTS] ); - localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1); + localparam DIRECT_PASSTHRU = !CACHE_ENABLE && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == MEM_PORTS); + localparam CORE_DATA_WIDTH = WORD_SIZE * 8; + localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE; + localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE); - localparam REQ_SEL_BITS = `CLOG2(NUM_REQS); - localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + 
`ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH; - - localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE; - localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE); - - localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH; - localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS; - localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS; + localparam CORE_TAG_ID_WIDTH = CORE_TAG_WIDTH - UUID_WIDTH; + localparam MEM_TAG_ID_WIDTH = `CLOG2(`CDIV(NUM_REQS, MEM_PORTS)) + CORE_TAG_ID_WIDTH; + localparam MEM_TAG_NC1_WIDTH = UUID_WIDTH + MEM_TAG_ID_WIDTH; + localparam MEM_TAG_NC2_WIDTH = MEM_TAG_NC1_WIDTH + WSEL_BITS; + localparam MEM_TAG_OUT_WIDTH = CACHE_ENABLE ? `MAX(MEM_TAG_IN_WIDTH, MEM_TAG_NC2_WIDTH) : MEM_TAG_NC2_WIDTH; `STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter")) - // handle core requests /////////////////////////////////////////////////// + // handle non-cacheable core request switch /////////////////////////////// + + VX_mem_bus_if #( + .DATA_SIZE (WORD_SIZE), + .TAG_WIDTH (CORE_TAG_WIDTH) + ) core_bus_nc_switch_if[(CACHE_ENABLE ? 
2 : 1) * NUM_REQS](); - wire core_req_nc_valid; - wire [NUM_REQS-1:0] core_req_nc_valids; - wire [NUM_REQS-1:0] core_req_nc_idxs; - wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx; wire [NUM_REQS-1:0] core_req_nc_sel; - wire core_req_nc_ready; - for (genvar i = 0; i < NUM_REQS; ++i) begin - if (PASSTHRU != 0) begin - assign core_req_nc_idxs[i] = 1'b1; - end else if (NC_ENABLE) begin - assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO]; - end else begin - assign core_req_nc_idxs[i] = 1'b0; + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_is_nc + if (CACHE_ENABLE) begin : g_cache + assign core_req_nc_sel[i] = ~core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO]; + end else begin : g_no_cache + assign core_req_nc_sel[i] = 1'b0; end - assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i]; end - VX_generic_arbiter #( - .NUM_REQS (NUM_REQS), - .TYPE (PASSTHRU ? "R" : "P") - ) core_req_nc_arb ( - .clk (clk), - .reset (reset), - .requests (core_req_nc_valids), - .grant_index (core_req_nc_idx), - .grant_onehot (core_req_nc_sel), - .grant_valid (core_req_nc_valid), - .grant_ready (core_req_nc_ready) + VX_mem_switch #( + .NUM_INPUTS (NUM_REQS), + .NUM_OUTPUTS ((CACHE_ENABLE ? 2 : 1) * NUM_REQS), + .DATA_SIZE (WORD_SIZE), + .TAG_WIDTH (CORE_TAG_WIDTH), + .ARBITER ("R"), + .REQ_OUT_BUF (0), + .RSP_OUT_BUF (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF)) + ) core_bus_nc_switch ( + .clk (clk), + .reset (reset), + .bus_sel (core_req_nc_sel), + .bus_in_if (core_bus_in_if), + .bus_out_if(core_bus_nc_switch_if) ); - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i]; - assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data; - assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? 
(core_req_nc_ready && core_req_nc_sel[i]) - : core_bus_out_if[i].req_ready; + VX_mem_bus_if #( + .DATA_SIZE (WORD_SIZE), + .TAG_WIDTH (CORE_TAG_WIDTH) + ) core_bus_in_nc_if[NUM_REQS](); + + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_nc_switch_if + + assign core_bus_in_nc_if[i].req_valid = core_bus_nc_switch_if[0 * NUM_REQS + i].req_valid; + assign core_bus_in_nc_if[i].req_data = core_bus_nc_switch_if[0 * NUM_REQS + i].req_data; + assign core_bus_nc_switch_if[0 * NUM_REQS + i].req_ready = core_bus_in_nc_if[i].req_ready; + + assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_valid = core_bus_in_nc_if[i].rsp_valid; + assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_data = core_bus_in_nc_if[i].rsp_data; + assign core_bus_in_nc_if[i].rsp_ready = core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_ready; + + if (CACHE_ENABLE) begin : g_cache + assign core_bus_out_if[i].req_valid = core_bus_nc_switch_if[1 * NUM_REQS + i].req_valid; + assign core_bus_out_if[i].req_data = core_bus_nc_switch_if[1 * NUM_REQS + i].req_data; + assign core_bus_nc_switch_if[1 * NUM_REQS + i].req_ready = core_bus_out_if[i].req_ready; + + assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_valid = core_bus_out_if[i].rsp_valid; + assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_data = core_bus_out_if[i].rsp_data; + assign core_bus_out_if[i].rsp_ready = core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_ready; + end else begin : g_no_cache + `INIT_VX_MEM_BUS_IF (core_bus_out_if[i]) + end end // handle memory requests ///////////////////////////////////////////////// - wire mem_req_out_valid; - wire mem_req_out_rw; - wire [LINE_SIZE-1:0] mem_req_out_byteen; - wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr; - wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype; - wire [`CS_LINE_WIDTH-1:0] mem_req_out_data; - wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag; - wire mem_req_out_ready; + VX_mem_bus_if #( + .DATA_SIZE (WORD_SIZE), + .TAG_WIDTH (MEM_TAG_NC1_WIDTH) + ) core_bus_nc_arb_if[MEM_PORTS](); - wire 
core_req_nc_sel_rw; - wire [WORD_SIZE-1:0] core_req_nc_sel_byteen; - wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr; - wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype; - wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data; - wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag; + VX_mem_arb #( + .NUM_INPUTS (NUM_REQS), + .NUM_OUTPUTS(MEM_PORTS), + .DATA_SIZE (WORD_SIZE), + .TAG_WIDTH (CORE_TAG_WIDTH), + .TAG_SEL_IDX(TAG_SEL_IDX), + .ARBITER (CACHE_ENABLE ? "P" : "R"), + .REQ_OUT_BUF(0), + .RSP_OUT_BUF(0) + ) core_bus_nc_arb ( + .clk (clk), + .reset (reset), + .bus_in_if (core_bus_in_nc_if), + .bus_out_if (core_bus_nc_arb_if) + ); - wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in; - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_req_nc_mux_in[i] = { - core_bus_in_if[i].req_data.rw, - core_bus_in_if[i].req_data.byteen, - core_bus_in_if[i].req_data.addr, - core_bus_in_if[i].req_data.atype, - core_bus_in_if[i].req_data.data, - core_bus_in_if[i].req_data.tag - }; - end + VX_mem_bus_if #( + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_NC2_WIDTH) + ) mem_bus_out_nc_if[MEM_PORTS](); - assign { - core_req_nc_sel_rw, - core_req_nc_sel_byteen, - core_req_nc_sel_addr, - core_req_nc_sel_atype, - core_req_nc_sel_data, - core_req_nc_sel_tag - } = core_req_nc_mux_in[core_req_nc_idx]; + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_nc + wire core_req_nc_arb_rw; + wire [WORD_SIZE-1:0] core_req_nc_arb_byteen; + wire [CORE_ADDR_WIDTH-1:0] core_req_nc_arb_addr; + wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_arb_flags; + wire [CORE_DATA_WIDTH-1:0] core_req_nc_arb_data; + wire [MEM_TAG_NC1_WIDTH-1:0] core_req_nc_arb_tag; - assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready; + assign { + core_req_nc_arb_rw, + core_req_nc_arb_addr, + core_req_nc_arb_data, + core_req_nc_arb_byteen, + core_req_nc_arb_flags, + core_req_nc_arb_tag + } = core_bus_nc_arb_if[i].req_data; - assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid; - 
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw; - assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH]; - assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype; + logic [MEM_ADDR_WIDTH-1:0] core_req_nc_arb_addr_w; + logic [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] core_req_nc_arb_byteen_w; + logic [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] core_req_nc_arb_data_w; + logic [CORE_DATA_WIDTH-1:0] core_rsp_nc_arb_data_w; + wire [MEM_TAG_NC2_WIDTH-1:0] core_req_nc_arb_tag_w; + wire [MEM_TAG_NC1_WIDTH-1:0] core_rsp_nc_arb_tag_w; - wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass; - - wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0]; - - if (WORDS_PER_LINE > 1) begin - reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r; - reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r; - - wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0]; - - always @(*) begin - mem_req_byteen_in_r = '0; - mem_req_byteen_in_r[req_wsel] = core_req_nc_sel_byteen; - - mem_req_data_in_r = 'x; - mem_req_data_in_r[req_wsel] = core_req_nc_sel_data; - end - - assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r; - assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r; - if (NUM_REQS > 1) begin - assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id}); - end else begin - assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id}); - end - end else begin - assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen; - assign mem_req_out_data = mem_bus_in_if.req_valid ? 
mem_bus_in_if.req_data.data : core_req_nc_sel_data; - if (NUM_REQS > 1) begin - assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id}); - end else begin - assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id}); - end - end - - wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass; - - if (UUID_WIDTH != 0) begin - assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass}; - end else begin - assign mem_req_tag_bypass = mem_req_tag_id_bypass; - end - - if (PASSTHRU != 0) begin - assign mem_req_out_tag = mem_req_tag_bypass; - `UNUSED_VAR (mem_bus_in_if.req_data.tag) - end else begin - if (NC_ENABLE) begin + if (WORDS_PER_LINE > 1) begin : g_multi_word_line + wire [WSEL_BITS-1:0] rsp_wsel; + wire [WSEL_BITS-1:0] req_wsel = core_req_nc_arb_addr[WSEL_BITS-1:0]; + always @(*) begin + core_req_nc_arb_byteen_w = '0; + core_req_nc_arb_byteen_w[req_wsel] = core_req_nc_arb_byteen; + core_req_nc_arb_data_w = 'x; + core_req_nc_arb_data_w[req_wsel] = core_req_nc_arb_data; + end VX_bits_insert #( - .N (MEM_TAG_OUT_WIDTH-1), - .S (1), + .N (MEM_TAG_NC1_WIDTH), + .S (WSEL_BITS), .POS (TAG_SEL_IDX) - ) mem_req_tag_in_nc_insert ( - .data_in (mem_bus_in_if.req_valid ? 
(MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)), - .ins_in (~mem_bus_in_if.req_valid), - .data_out (mem_req_out_tag) + ) wsel_insert ( + .data_in (core_req_nc_arb_tag), + .ins_in (req_wsel), + .data_out (core_req_nc_arb_tag_w) ); - end else begin - assign mem_req_out_tag = mem_bus_in_if.req_data.tag; + VX_bits_remove #( + .N (MEM_TAG_NC2_WIDTH), + .S (WSEL_BITS), + .POS (TAG_SEL_IDX) + ) wsel_remove ( + .data_in (mem_bus_out_nc_if[i].rsp_data.tag), + .sel_out (rsp_wsel), + .data_out (core_rsp_nc_arb_tag_w) + ); + assign core_req_nc_arb_addr_w = core_req_nc_arb_addr[WSEL_BITS +: MEM_ADDR_WIDTH]; + assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH]; + end else begin : g_single_word_line + assign core_req_nc_arb_addr_w = core_req_nc_arb_addr; + assign core_req_nc_arb_byteen_w = core_req_nc_arb_byteen; + assign core_req_nc_arb_data_w = core_req_nc_arb_data; + assign core_req_nc_arb_tag_w = MEM_TAG_NC2_WIDTH'(core_req_nc_arb_tag); + + assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data; + assign core_rsp_nc_arb_tag_w = MEM_TAG_NC1_WIDTH'(mem_bus_out_nc_if[i].rsp_data.tag); + end + + assign mem_bus_out_nc_if[i].req_valid = core_bus_nc_arb_if[i].req_valid; + assign mem_bus_out_nc_if[i].req_data = { + core_req_nc_arb_rw, + core_req_nc_arb_addr_w, + core_req_nc_arb_data_w, + core_req_nc_arb_byteen_w, + core_req_nc_arb_flags, + core_req_nc_arb_tag_w + }; + assign core_bus_nc_arb_if[i].req_ready = mem_bus_out_nc_if[i].req_ready; + + assign core_bus_nc_arb_if[i].rsp_valid = mem_bus_out_nc_if[i].rsp_valid; + assign core_bus_nc_arb_if[i].rsp_data = { + core_rsp_nc_arb_data_w, + core_rsp_nc_arb_tag_w + }; + assign mem_bus_out_nc_if[i].rsp_ready = core_bus_nc_arb_if[i].rsp_ready; + end + + VX_mem_bus_if #( + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_OUT_WIDTH) + ) mem_bus_out_src_if[(CACHE_ENABLE ? 
2 : 1) * MEM_PORTS](); + + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_src + `ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[0 * MEM_PORTS + i], mem_bus_out_nc_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_NC2_WIDTH, UUID_WIDTH); + if (CACHE_ENABLE) begin : g_cache + `ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[1 * MEM_PORTS + i], mem_bus_in_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_IN_WIDTH, UUID_WIDTH); + end else begin : g_no_cache + `UNUSED_VX_MEM_BUS_IF(mem_bus_in_if[i]) end end - assign mem_bus_in_if.req_ready = mem_req_out_ready; - - VX_elastic_buffer #( - .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH), - .SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0), - .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) - ) mem_req_buf ( - .clk (clk), - .reset (reset), - .valid_in (mem_req_out_valid), - .ready_in (mem_req_out_ready), - .data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}), - .data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}), - .valid_out (mem_bus_out_if.req_valid), - .ready_out (mem_bus_out_if.req_ready) + VX_mem_arb #( + .NUM_INPUTS ((CACHE_ENABLE ? 2 : 1) * MEM_PORTS), + .NUM_OUTPUTS(MEM_PORTS), + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_OUT_WIDTH), + .ARBITER ("R"), + .REQ_OUT_BUF(DIRECT_PASSTHRU ? 
0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)), + .RSP_OUT_BUF(0) + ) mem_bus_out_arb ( + .clk (clk), + .reset (reset), + .bus_in_if (mem_bus_out_src_if), + .bus_out_if (mem_bus_out_if) ); - // handle core responses ////////////////////////////////////////////////// - - wire [NUM_REQS-1:0] core_rsp_in_valid; - wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data; - wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag; - wire [NUM_REQS-1:0] core_rsp_in_ready; - - wire is_mem_rsp_nc; - if (PASSTHRU != 0) begin - assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid; - end else begin - if (NC_ENABLE) begin - assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX]; - end else begin - assign is_mem_rsp_nc = 1'b0; - end - end - - wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc; - - VX_bits_remove #( - .N (MEM_TAG_OUT_WIDTH), - .S (NC_ENABLE), - .POS (TAG_SEL_IDX) - ) mem_rsp_tag_in_nc_remove ( - .data_in (mem_bus_out_if.rsp_data.tag), - .data_out (mem_rsp_tag_id_nc) - ); - - wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx; - if (NUM_REQS > 1) begin - assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS]; - end else begin - assign rsp_idx = 1'b0; - end - - reg [NUM_REQS-1:0] rsp_nc_valid_r; - always @(*) begin - rsp_nc_valid_r = '0; - rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc; - end - - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || rsp_nc_valid_r[i]; - assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i]; - end - - if (WORDS_PER_LINE > 1) begin - wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS]; - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? 
- core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH]; - end - end else begin - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data; - end - end - - wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2; - if (UUID_WIDTH != 0) begin - assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]}; - end else begin - assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]; - end - - for (genvar i = 0; i < NUM_REQS; ++i) begin - if (PASSTHRU) begin - assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2; - end else if (NC_ENABLE) begin - assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2; - end else begin - assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag; - end - end - - for (genvar i = 0; i < NUM_REQS; ++i) begin - VX_elastic_buffer #( - .DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH), - .SIZE ((!DIRECT_PASSTHRU) ? 
`TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0), - .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF)) - ) core_rsp_buf ( - .clk (clk), - .reset (reset), - .valid_in (core_rsp_in_valid[i]), - .ready_in (core_rsp_in_ready[i]), - .data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}), - .data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}), - .valid_out (core_bus_in_if[i].rsp_valid), - .ready_out (core_bus_in_if[i].rsp_ready) - ); - end - - // handle memory responses //////////////////////////////////////////////// - - if (PASSTHRU != 0) begin - assign mem_bus_in_if.rsp_valid = 1'b0; - assign mem_bus_in_if.rsp_data.data = '0; - assign mem_bus_in_if.rsp_data.tag = '0; - end else if (NC_ENABLE) begin - assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX]; - assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data; - assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0]; - end else begin - assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid; - assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data; - assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc; - end - - wire [NUM_REQS-1:0] core_rsp_out_valid; - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid; - end - - assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? 
(~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready; - endmodule diff --git a/hw/rtl/cache/VX_cache_cluster.sv b/hw/rtl/cache/VX_cache_cluster.sv index 939768b63..dbe0aeb08 100644 --- a/hw/rtl/cache/VX_cache_cluster.sv +++ b/hw/rtl/cache/VX_cache_cluster.sv @@ -23,23 +23,26 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( // Number of requests per cycle parameter NUM_REQS = 4, + // Number of memory ports + parameter MEM_PORTS = 1, + // Size of cache in bytes - parameter CACHE_SIZE = 16384, + parameter CACHE_SIZE = 32768, // Size of line inside a bank in bytes parameter LINE_SIZE = 64, // Number of banks - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 4, // Number of associative ways parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 4, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 8, + parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 4, // Memory Request Queue Size parameter MREQ_SIZE = 4, @@ -52,20 +55,26 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_FIFO, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, + // core request flags + parameter FLAGS_WIDTH = 0, + // enable bypass for non-cacheable addresses parameter NC_ENABLE = 0, // Core response output buffer - parameter CORE_OUT_BUF = 0, + parameter CORE_OUT_BUF = 3, // Memory request output buffer - parameter MEM_OUT_BUF = 0 + parameter MEM_OUT_BUF = 3 ) ( input wire clk, input wire reset, @@ -76,14 +85,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( `endif VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS], - VX_mem_bus_if.master mem_bus_if + VX_mem_bus_if.master mem_bus_if 
[MEM_PORTS] ); localparam NUM_CACHES = `UP(NUM_UNITS); localparam PASSTHRU = (NUM_UNITS == 0); localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES); - localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : - (NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : - `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)); + + localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH); + localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH); + localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1; + localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH); `STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter")) @@ -95,16 +106,14 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE (LINE_SIZE), .TAG_WIDTH (MEM_TAG_WIDTH) - ) cache_mem_bus_if[NUM_CACHES](); + ) cache_mem_bus_if[NUM_CACHES * MEM_PORTS](); VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), .TAG_WIDTH (ARB_TAG_WIDTH) ) arb_core_bus_if[NUM_CACHES * NUM_REQS](); - `RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT); - - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_arb VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), .TAG_WIDTH (TAG_WIDTH) @@ -115,7 +124,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .TAG_WIDTH (ARB_TAG_WIDTH) ) arb_core_bus_tmp_if[NUM_CACHES](); - for (genvar j = 0; j < NUM_INPUTS; ++j) begin + for (genvar j = 0; j < NUM_INPUTS; ++j) begin : g_core_bus_tmp_if `ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]); end @@ -127,40 +136,40 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .TAG_SEL_IDX (TAG_SEL_IDX), .ARBITER ("R"), .REQ_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 
2 : 0), - .RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0) - ) cache_arb ( + .RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? CORE_OUT_BUF : 0) + ) core_arb ( .clk (clk), - .reset (cache_arb_reset[i]), + .reset (reset), .bus_in_if (core_bus_tmp_if), .bus_out_if (arb_core_bus_tmp_if) ); - for (genvar k = 0; k < NUM_CACHES; ++k) begin + for (genvar k = 0; k < NUM_CACHES; ++k) begin : g_arb_core_bus_if `ASSIGN_VX_MEM_BUS_IF (arb_core_bus_if[k * NUM_REQS + i], arb_core_bus_tmp_if[k]); end end - for (genvar i = 0; i < NUM_CACHES; ++i) begin : caches - - `RESET_RELAY (cache_reset, reset); - + for (genvar i = 0; i < NUM_CACHES; ++i) begin : g_cache_wrap VX_cache_wrap #( - .INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)), + .INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, i))), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), .NUM_BANKS (NUM_BANKS), .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS), + .MEM_PORTS (MEM_PORTS), + .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), + .REPL_POLICY (REPL_POLICY), .CRSQ_SIZE (CRSQ_SIZE), .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), - .WRITE_ENABLE (WRITE_ENABLE), - .WRITEBACK (WRITEBACK), - .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (ARB_TAG_WIDTH), + .FLAGS_WIDTH (FLAGS_WIDTH), .TAG_SEL_IDX (TAG_SEL_IDX), .CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF), .MEM_OUT_BUF ((NUM_CACHES > 1) ? 
2 : MEM_OUT_BUF), @@ -171,32 +180,48 @@ module VX_cache_cluster import VX_gpu_pkg::*; #( .cache_perf (perf_cache_unit[i]), `endif .clk (clk), - .reset (cache_reset), + .reset (reset), .core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]), - .mem_bus_if (cache_mem_bus_if[i]) + .mem_bus_if (cache_mem_bus_if[i * MEM_PORTS +: MEM_PORTS]) ); end - VX_mem_bus_if #( - .DATA_SIZE (LINE_SIZE), - .TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)) - ) mem_bus_tmp_if[1](); + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if + VX_mem_bus_if #( + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_WIDTH) + ) arb_core_bus_tmp_if[NUM_CACHES](); - VX_mem_arb #( - .NUM_INPUTS (NUM_CACHES), - .DATA_SIZE (LINE_SIZE), - .TAG_WIDTH (MEM_TAG_WIDTH), - .TAG_SEL_IDX (TAG_SEL_IDX), - .ARBITER ("R"), - .REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0), - .RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0) - ) mem_arb ( - .clk (clk), - .reset (reset), - .bus_in_if (cache_mem_bus_if), - .bus_out_if (mem_bus_tmp_if) - ); + VX_mem_bus_if #( + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)) + ) mem_bus_tmp_if[1](); - `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]); + for (genvar j = 0; j < NUM_CACHES; ++j) begin : g_arb_core_bus_tmp_if + `ASSIGN_VX_MEM_BUS_IF (arb_core_bus_tmp_if[j], cache_mem_bus_if[j * MEM_PORTS + i]); + end + + VX_mem_arb #( + .NUM_INPUTS (NUM_CACHES), + .NUM_OUTPUTS (1), + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_WIDTH), + .TAG_SEL_IDX (TAG_SEL_IDX), + .ARBITER ("R"), + .REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0), + .RSP_OUT_BUF ((NUM_CACHES > 1) ? 
2 : 0) + ) mem_arb ( + .clk (clk), + .reset (reset), + .bus_in_if (arb_core_bus_tmp_if), + .bus_out_if (mem_bus_tmp_if) + ); + + if (WRITE_ENABLE) begin : g_we + `ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[0]); + end else begin : g_ro + `ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[0]); + end + end endmodule diff --git a/hw/rtl/cache/VX_cache_data.sv b/hw/rtl/cache/VX_cache_data.sv index a114e1689..2165a36a6 100644 --- a/hw/rtl/cache/VX_cache_data.sv +++ b/hw/rtl/cache/VX_cache_data.sv @@ -14,8 +14,6 @@ `include "VX_cache_define.vh" module VX_cache_data #( - parameter `STRING INSTANCE_ID= "", - parameter BANK_ID = 0, // Size of cache in bytes parameter CACHE_SIZE = 1024, // Size of line inside a bank in bytes @@ -31,169 +29,116 @@ module VX_cache_data #( // Enable cache writeback parameter WRITEBACK = 0, // Enable dirty bytes on writeback - parameter DIRTY_BYTES = 0, - // Request debug identifier - parameter UUID_WIDTH = 0 + parameter DIRTY_BYTES = 0 ) ( input wire clk, input wire reset, - -`IGNORE_UNUSED_BEGIN - input wire[`UP(UUID_WIDTH)-1:0] req_uuid, -`IGNORE_UNUSED_END - - input wire stall, - + // inputs input wire init, - input wire read, input wire fill, input wire flush, + input wire read, input wire write, - input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, - input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel, + input wire [`CS_LINE_SEL_BITS-1:0] line_idx, + input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way, + input wire [NUM_WAYS-1:0] tag_matches, input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data, - input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data, - input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen, - input wire [NUM_WAYS-1:0] way_sel, - output wire [`CS_WORD_WIDTH-1:0] read_data, - output wire [`CS_LINE_WIDTH-1:0] dirty_data, - output wire [LINE_SIZE-1:0] dirty_byteen + input wire [`CS_WORD_WIDTH-1:0] write_word, + input wire [WORD_SIZE-1:0] write_byteen, + input wire [`UP(`CS_WORD_SEL_BITS)-1:0] 
word_idx, + input wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_r, + // outputs + output wire [`CS_LINE_WIDTH-1:0] read_data, + output wire [LINE_SIZE-1:0] evict_byteen ); - `UNUSED_SPARAM (INSTANCE_ID) - `UNUSED_PARAM (BANK_ID) `UNUSED_PARAM (WORD_SIZE) - `UNUSED_VAR (stall) - `UNUSED_VAR (line_addr) - `UNUSED_VAR (init) - `UNUSED_VAR (read) - `UNUSED_VAR (flush) - localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1; + wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask; + for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_write_mask + wire word_en = (`CS_WORDS_PER_LINE == 1) || (word_idx == i); + assign write_mask[i] = write_byteen & {WORD_SIZE{word_en}}; + end - wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; + if (DIRTY_BYTES != 0) begin : g_dirty_bytes - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata; - wire [`LOG2UP(NUM_WAYS)-1:0] way_idx; + wire [NUM_WAYS-1:0][LINE_SIZE-1:0] byteen_rdata; - if (WRITEBACK) begin - if (DIRTY_BYTES) begin - wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata; - wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata; - - for (genvar i = 0; i < NUM_WAYS; ++i) begin - wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]); - assign bs_wdata[i] = init ? '0 : (way_sel[i] ? 
wdata : bs_rdata[i]); - end + for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_byteen_store + wire [LINE_SIZE-1:0] byteen_wdata = {LINE_SIZE{write}}; // only asserted on writes + wire [LINE_SIZE-1:0] byteen_wren = {LINE_SIZE{init || fill || flush}} | write_mask; + wire byteen_write = ((fill || flush) && ((NUM_WAYS == 1) || (evict_way == i))) + || (write && tag_matches[i]) + || init; + wire byteen_read = fill || flush; VX_sp_ram #( - .DATAW (LINE_SIZE * NUM_WAYS), - .SIZE (`CS_LINES_PER_BANK) + .DATAW (LINE_SIZE), + .WRENW (LINE_SIZE), + .SIZE (`CS_LINES_PER_BANK), + .OUT_REG (1), + .RDW_MODE ("R") ) byteen_store ( .clk (clk), .reset (reset), - .read (write || fill || flush), - .write (init || write || fill || flush), - .wren (1'b1), - .addr (line_sel), - .wdata (bs_wdata), - .rdata (bs_rdata) + .read (byteen_read), + .write (byteen_write), + .wren (byteen_wren), + .addr (line_idx), + .wdata (byteen_wdata), + .rdata (byteen_rdata[i]) ); - - assign dirty_byteen = bs_rdata[way_idx]; - end else begin - assign dirty_byteen = {LINE_SIZE{1'b1}}; end - wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata; - for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin - for (genvar j = 0; j < NUM_WAYS; ++j) begin - assign flipped_rdata[j][i] = line_rdata[i][j]; - end - end - assign dirty_data = flipped_rdata[way_idx]; - end else begin - assign dirty_byteen = '0; - assign dirty_data = '0; + assign evict_byteen = byteen_rdata[way_idx_r]; + + end else begin : g_no_dirty_bytes + `UNUSED_VAR (init) + `UNUSED_VAR (flush) + assign evict_byteen = '1; // update whole line end - // order the data layout to perform ways multiplexing last. - // this allows converting way index to binary in parallel with BRAM readaccess and way selection. 
+ wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_rdata; - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata; - wire [BYTEENW-1:0] line_wren; + for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_data_store - if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin - wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w; - for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin - for (genvar j = 0; j < NUM_WAYS; ++j) begin - assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i]; - assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i]) - & {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}}; - end + localparam WRENW = WRITE_ENABLE ? LINE_SIZE : 1; + + wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_wdata; + wire [WRENW-1:0] line_wren; + + if (WRITE_ENABLE) begin : g_wren + assign line_wdata = fill ? fill_data : {`CS_WORDS_PER_LINE{write_word}}; + assign line_wren = {LINE_SIZE{fill}} | write_mask; + end else begin : g_no_wren + `UNUSED_VAR (write_word) + `UNUSED_VAR (write_mask) + assign line_wdata = fill_data; + assign line_wren = 1'b1; end - assign line_wren = wren_w; - end else begin - `UNUSED_VAR (write) - `UNUSED_VAR (write_byteen) - `UNUSED_VAR (write_data) - assign line_wdata = fill_data; - assign line_wren = fill; + + wire line_write = (fill && ((NUM_WAYS == 1) || (evict_way == i))) + || (write && tag_matches[i] && WRITE_ENABLE); + + wire line_read = read || ((fill || flush) && WRITEBACK); + + VX_sp_ram #( + .DATAW (`CS_LINE_WIDTH), + .SIZE (`CS_LINES_PER_BANK), + .WRENW (WRENW), + .OUT_REG (1), + .RDW_MODE ("R") + ) data_store ( + .clk (clk), + .reset (reset), + .read (line_read), + .write (line_write), + .wren (line_wren), + .addr (line_idx), + .wdata (line_wdata), + .rdata (line_rdata[i]) + ); end - VX_onehot_encoder #( - .N (NUM_WAYS) - ) way_enc ( - .data_in (way_sel), - .data_out (way_idx), - `UNUSED_PIN (valid_out) - ); - - wire line_read = (read && 
~stall) - || (WRITEBACK && (fill || flush)); - - wire line_write = write || fill; - - VX_sp_ram #( - .DATAW (`CS_LINE_WIDTH * NUM_WAYS), - .SIZE (`CS_LINES_PER_BANK), - .WRENW (BYTEENW), - .NO_RWCHECK (1), - .RW_ASSERT (1) - ) data_store ( - .clk (clk), - .reset (reset), - .read (line_read), - .write (line_write), - .wren (line_wren), - .addr (line_sel), - .wdata (line_wdata), - .rdata (line_rdata) - ); - - wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata; - if (`CS_WORDS_PER_LINE > 1) begin - assign per_way_rdata = line_rdata[wsel]; - end else begin - `UNUSED_VAR (wsel) - assign per_way_rdata = line_rdata; - end - assign read_data = per_way_rdata[way_idx]; - -`ifdef DBG_TRACE_CACHE - always @(posedge clk) begin - if (fill && ~stall) begin - `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data)); - end - if (flush && ~stall) begin - `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data)); - end - if (read && ~stall) begin - `TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid)); - end - if (write && ~stall) begin - `TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid)); - end - end -`endif + assign read_data = line_rdata[way_idx_r]; endmodule diff --git a/hw/rtl/cache/VX_cache_define.vh b/hw/rtl/cache/VX_cache_define.vh index e6d7da167..fb1751b0d 100644 --- a/hw/rtl/cache/VX_cache_define.vh +++ b/hw/rtl/cache/VX_cache_define.vh @@ -22,6 +22,7 @@ `define CS_LINE_WIDTH (8 * LINE_SIZE) `define 
CS_BANK_SIZE (CACHE_SIZE / NUM_BANKS) `define CS_WAY_SEL_BITS `CLOG2(NUM_WAYS) +`define CS_WAY_SEL_WIDTH `UP(`CS_WAY_SEL_BITS) `define CS_LINES_PER_BANK (`CS_BANK_SIZE / (LINE_SIZE * NUM_WAYS)) `define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE) @@ -54,12 +55,7 @@ /////////////////////////////////////////////////////////////////////////////// -`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)} -`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS] -`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0] -`define CS_MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `CS_BANK_SEL_BITS] - -`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))} +`define CS_BANK_TO_FULL_ADDR(x, b) {x, (`XLEN-$bits(x))'(b << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))} `define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)} /////////////////////////////////////////////////////////////////////////////// @@ -74,4 +70,10 @@ `PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \ `PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1)) +/////////////////////////////////////////////////////////////////////////////// + +`define CS_REPL_RANDOM 0 +`define CS_REPL_FIFO 1 +`define CS_REPL_PLRU 2 + `endif // VX_CACHE_DEFINE_VH diff --git a/hw/rtl/cache/VX_cache_flush.sv b/hw/rtl/cache/VX_cache_flush.sv index 7a33565fc..57546dbc9 100644 --- a/hw/rtl/cache/VX_cache_flush.sv +++ b/hw/rtl/cache/VX_cache_flush.sv @@ -18,6 +18,10 @@ module VX_cache_flush #( parameter NUM_REQS = 4, // Number of banks parameter NUM_BANKS = 1, + // Request debug identifier + parameter UUID_WIDTH = 0, + // core request tag size + parameter TAG_WIDTH = UUID_WIDTH + 1, // Bank select latency parameter BANK_SEL_LATENCY = 1 ) ( @@ -27,8 +31,11 @@ module VX_cache_flush #( VX_mem_bus_if.master core_bus_out_if [NUM_REQS], input wire [NUM_BANKS-1:0] bank_req_fire, output wire [NUM_BANKS-1:0] flush_begin, + output wire [`UP(UUID_WIDTH)-1:0] 
flush_uuid, input wire [NUM_BANKS-1:0] flush_end ); + `UNUSED_PARAM (TAG_WIDTH) + localparam STATE_IDLE = 0; localparam STATE_WAIT1 = 1; localparam STATE_FLUSH = 2; @@ -41,13 +48,13 @@ module VX_cache_flush #( wire no_inflight_reqs; - if (BANK_SEL_LATENCY != 0) begin + if (BANK_SEL_LATENCY != 0) begin : g_bank_sel_latency localparam NUM_REQS_W = `CLOG2(NUM_REQS+1); localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1); wire [NUM_REQS-1:0] core_bus_out_fire; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_fire assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready; end @@ -74,7 +81,7 @@ module VX_cache_flush #( `UNUSED_PIN (size) ); - end else begin + end else begin : g_no_bank_sel_latency assign no_inflight_reqs = 0; `UNUSED_VAR (bank_req_fire) end @@ -82,28 +89,38 @@ module VX_cache_flush #( reg [NUM_BANKS-1:0] flush_done, flush_done_n; wire [NUM_REQS-1:0] flush_req_mask; - for (genvar i = 0; i < NUM_REQS; ++i) begin - assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.atype[`ADDR_TYPE_FLUSH]; + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_flush_req_mask + assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH]; end wire flush_req_enable = (| flush_req_mask); reg [NUM_REQS-1:0] lock_released, lock_released_n; + reg [`UP(UUID_WIDTH)-1:0] flush_uuid_r, flush_uuid_n; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_req wire input_enable = ~flush_req_enable || lock_released[i]; assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable; assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data; assign core_bus_in_if[i].req_ready = core_bus_out_if[i].req_ready && input_enable; end - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_rsp assign 
core_bus_in_if[i].rsp_valid = core_bus_out_if[i].rsp_valid; assign core_bus_in_if[i].rsp_data = core_bus_out_if[i].rsp_data; assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready; end + reg [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] core_bus_out_uuid; wire [NUM_REQS-1:0] core_bus_out_ready; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_uuid + if (UUID_WIDTH != 0) begin : g_uuid + assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag.uuid; + end else begin : g_no_uuid + assign core_bus_out_uuid[i] = 0; + end + end + + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_ready assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready; end @@ -111,10 +128,17 @@ module VX_cache_flush #( state_n = state; flush_done_n = flush_done; lock_released_n = lock_released; + flush_uuid_n = flush_uuid_r; case (state) - STATE_IDLE: begin + //STATE_IDLE: + default: begin if (flush_req_enable) begin state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH; + for (integer i = NUM_REQS-1; i >= 0; --i) begin + if (flush_req_mask[i]) begin + flush_uuid_n = core_bus_out_uuid[i]; + end + end end end STATE_WAIT1: begin @@ -158,8 +182,10 @@ module VX_cache_flush #( flush_done <= flush_done_n; lock_released <= lock_released_n; end + flush_uuid_r <= flush_uuid_n; end assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}}; + assign flush_uuid = flush_uuid_r; endmodule diff --git a/hw/rtl/cache/VX_cache_mshr.sv b/hw/rtl/cache/VX_cache_mshr.sv index 4f8163269..b8256278d 100644 --- a/hw/rtl/cache/VX_cache_mshr.sv +++ b/hw/rtl/cache/VX_cache_mshr.sv @@ -24,36 +24,23 @@ // arrival and are dequeued in the same order. // Each entry has a next pointer to the next entry pending for the same cache line. 
// -// During the fill operation, the MSHR will release the MSHR entry at fill_id +// During the fill request, the MSHR will dequeue the MSHR entry at the fill_id location // which represents the first request in the pending list that initiated the memory fill. // -// The dequeue operation directly follows the fill operation and will release +// The dequeue response directly follows the fill request and will release // all the subsequent entries linked to fill_id (pending the same cache line). // -// During the allocation operation, the MSHR will allocate the next free slot +// During the allocation request, the MSHR will allocate the next free slot // for the incoming core request. We return the allocated slot id as well as // the slot id of the previous entry for the same cache line. This is used to -// link the new entry to the pending list during finalization. +// link the new entry to the pending list. // -// The lookup operation is used to find all pending entries for a given cache line. -// This is used to by the cache bank to determine if a cache miss is already pending -// and therefore avoid issuing a memory fill request. -// -// The finalize operation is used to release the allocated MSHR entry if we had a hit. -// If we had a miss and finalize_pending is true, we link the allocated entry to -// its corresponding pending list (via finalize_prev). +// The finalize request is used to persist or release the currently allocated MSHR entry +// if we had a cache miss or a hit, respectively. // // Warning: This MSHR implementation is strongly coupled with the bank pipeline // and as such changes to either module requires careful evaluation. // -// This architecture implements three pipeline stages: -// - Arbitration: cache bank arbitration before entering pipeline. -// fill and dequeue operations are executed at this stage. -// - stage 0: cache bank tag access stage. -// allocate and lookup operations are executed at this stage.
-// - stage 1: cache bank tdatag access stage. -// finalize operation is executed at this stage. -// module VX_cache_mshr #( parameter `STRING INSTANCE_ID= "", @@ -68,6 +55,9 @@ module VX_cache_mshr #( parameter UUID_WIDTH = 0, // MSHR parameters parameter DATA_WIDTH = 1, + // Enable cache writeback + parameter WRITEBACK = 0, + parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE) ) ( input wire clk, @@ -75,7 +65,7 @@ module VX_cache_mshr #( `IGNORE_UNUSED_BEGIN input wire[`UP(UUID_WIDTH)-1:0] deq_req_uuid, - input wire[`UP(UUID_WIDTH)-1:0] lkp_req_uuid, + input wire[`UP(UUID_WIDTH)-1:0] alc_req_uuid, input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid, `IGNORE_UNUSED_END @@ -98,26 +88,21 @@ module VX_cache_mshr #( input wire allocate_rw, input wire [DATA_WIDTH-1:0] allocate_data, output wire [MSHR_ADDR_WIDTH-1:0] allocate_id, - output wire [MSHR_ADDR_WIDTH-1:0] allocate_prev, + output wire allocate_pending, + output wire [MSHR_ADDR_WIDTH-1:0] allocate_previd, output wire allocate_ready, - // lookup - input wire lookup_valid, - input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr, - output wire [MSHR_SIZE-1:0] lookup_pending, - output wire [MSHR_SIZE-1:0] lookup_rw, - // finalize input wire finalize_valid, - input wire finalize_release, - input wire finalize_pending, - input wire [MSHR_ADDR_WIDTH-1:0] finalize_id, - input wire [MSHR_ADDR_WIDTH-1:0] finalize_prev + input wire finalize_is_release, + input wire finalize_is_pending, + input wire [MSHR_ADDR_WIDTH-1:0] finalize_previd, + input wire [MSHR_ADDR_WIDTH-1:0] finalize_id ); `UNUSED_PARAM (BANK_ID) - reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0]; - reg [MSHR_ADDR_WIDTH-1:0] next_index [MSHR_SIZE-1:0]; + reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [0:MSHR_SIZE-1]; + reg [MSHR_ADDR_WIDTH-1:0] next_index [0:MSHR_SIZE-1]; reg [MSHR_SIZE-1:0] valid_table, valid_table_n; reg [MSHR_SIZE-1:0] next_table, next_table_x, next_table_n; @@ -135,8 +120,8 @@ module VX_cache_mshr #( wire dequeue_fire = dequeue_valid && dequeue_ready; 
wire [MSHR_SIZE-1:0] addr_matches; - for (genvar i = 0; i < MSHR_SIZE; ++i) begin - assign addr_matches[i] = valid_table[i] && (addr_table[i] == lookup_addr); + for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_addr_matches + assign addr_matches[i] = valid_table[i] && (addr_table[i] == allocate_addr); end VX_lzc #( @@ -148,11 +133,13 @@ module VX_cache_mshr #( .valid_out (allocate_rdy_n) ); - VX_onehot_encoder #( + // find matching tail-entry + VX_priority_encoder #( .N (MSHR_SIZE) ) prev_sel ( .data_in (addr_matches & ~next_table_x), - .data_out (prev_idx), + .index_out (prev_idx), + `UNUSED_PIN (onehot_out), `UNUSED_PIN (valid_out) ); @@ -171,17 +158,22 @@ module VX_cache_mshr #( valid_table_n[dequeue_id] = 0; if (next_table[dequeue_id]) begin dequeue_id_n = next_index[dequeue_id]; + end else if (finalize_valid && finalize_is_pending && (finalize_previd == dequeue_id)) begin + dequeue_id_n = finalize_id; end else begin dequeue_val_n = 0; end end if (finalize_valid) begin - if (finalize_release) begin + if (finalize_is_release) begin valid_table_n[finalize_id] = 0; end - if (finalize_pending) begin - next_table_x[finalize_prev] = 1; + // warning: This code allows 'finalize_is_pending' to be asserted regardless of hit/miss + // to reduce the its propagation delay into the MSHR. this is safe because wrong updates + // to 'next_table_n' will be cleared during 'allocate_fire' below. 
+ if (finalize_is_pending) begin + next_table_x[finalize_previd] = 1; end end @@ -204,12 +196,12 @@ module VX_cache_mshr #( end if (allocate_fire) begin - addr_table[allocate_id] <= allocate_addr; + addr_table[allocate_id] <= allocate_addr; write_table[allocate_id] <= allocate_rw; end - if (finalize_valid && finalize_pending) begin - next_index[finalize_prev] <= finalize_id; + if (finalize_valid && finalize_is_pending) begin + next_index[finalize_previd] <= finalize_id; end dequeue_id_r <= dequeue_id_n; @@ -217,20 +209,21 @@ module VX_cache_mshr #( next_table <= next_table_n; end - `RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid)) + `RUNTIME_ASSERT(~(allocate_fire && valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, + `CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, alc_req_uuid)) - `RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid)) + `RUNTIME_ASSERT(~(finalize_valid && ~valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, + `CS_BANK_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid)) - `RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id)) + `RUNTIME_ASSERT(~(fill_valid && ~valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, + `CS_BANK_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id)) VX_dp_ram #( - .DATAW (DATA_WIDTH), - .SIZE (MSHR_SIZE), - .LUTRAM (1) - ) entries ( + .DATAW (DATA_WIDTH), + .SIZE 
(MSHR_SIZE), + .RDW_MODE ("R"), + .RADDR_REG (1) + ) mshr_store ( .clk (clk), .reset (reset), .read (1'b1), @@ -245,19 +238,20 @@ module VX_cache_mshr #( assign fill_addr = addr_table[fill_id]; assign allocate_ready = allocate_rdy; - assign allocate_id = allocate_id_r; - assign allocate_prev = prev_idx; + assign allocate_id = allocate_id_r; + assign allocate_previd = prev_idx; - assign dequeue_valid = dequeue_val; - assign dequeue_addr = addr_table[dequeue_id_r]; - assign dequeue_rw = write_table[dequeue_id_r]; - assign dequeue_id = dequeue_id_r; + if (WRITEBACK) begin : g_pending_wb + assign allocate_pending = |addr_matches; + end else begin : g_pending_wt + // exclude write requests if writethrough + assign allocate_pending = |(addr_matches & ~write_table); + end - // return pending entries for the given cache line - assign lookup_pending = addr_matches; - assign lookup_rw = write_table; - - `UNUSED_VAR (lookup_valid) + assign dequeue_valid = dequeue_val; + assign dequeue_addr = addr_table[dequeue_id_r]; + assign dequeue_rw = write_table[dequeue_id_r]; + assign dequeue_id = dequeue_id_r; `ifdef DBG_TRACE_CACHE reg show_table; @@ -265,37 +259,42 @@ module VX_cache_mshr #( if (reset) begin show_table <= 0; end else begin - show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire; + show_table <= allocate_fire || finalize_valid || fill_valid || dequeue_fire; + end + if (allocate_fire) begin + `TRACE(3, ("%t: %s allocate: addr=0x%0h, id=%0d, pending=%b, prev=%0d (#%0d)\n", $time, INSTANCE_ID, + `CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id, allocate_pending, prev_idx, alc_req_uuid)) + end + if (finalize_valid && finalize_is_release) begin + `TRACE(3, ("%t: %s release: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid)) + end + if (finalize_valid && finalize_is_pending) begin + `TRACE(3, ("%t: %s finalize: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid)) + end + if (fill_valid) begin + 
`TRACE(3, ("%t: %s fill: addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, + `CS_BANK_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id)) + end + if (dequeue_fire) begin + `TRACE(3, ("%t: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, + `CS_BANK_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid)) end - if (allocate_fire) - `TRACE(3, ("%d: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid)); - if (lookup_valid) - `TRACE(3, ("%d: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid)); - if (finalize_valid) - `TRACE(3, ("%d: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, - finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid)); - if (fill_valid) - `TRACE(3, ("%d: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id)); - if (dequeue_fire) - `TRACE(3, ("%d: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, - `CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid)); if (show_table) begin - `TRACE(3, ("%d: %s table", $time, INSTANCE_ID)); + `TRACE(3, ("%t: %s table", $time, INSTANCE_ID)) for (integer i = 0; i < MSHR_SIZE; ++i) begin if (valid_table[i]) begin - `TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID))); - if (write_table[i]) - `TRACE(3, ("(w)")); - else - `TRACE(3, ("(r)")); - if (next_table[i]) - `TRACE(3, ("->%0d", next_index[i])); + `TRACE(3, (" %0d=0x%0h", i, `CS_BANK_TO_FULL_ADDR(addr_table[i], BANK_ID))) + if (write_table[i]) begin + `TRACE(3, ("(w)")) + end else begin + `TRACE(3, ("(r)")) + end + if (next_table[i]) begin + `TRACE(3, ("->%0d", next_index[i])) + end end end - `TRACE(3, ("\n")); + `TRACE(3, ("\n")) end 
end `endif diff --git a/hw/rtl/cache/VX_cache_repl.sv b/hw/rtl/cache/VX_cache_repl.sv new file mode 100644 index 000000000..1007bd06a --- /dev/null +++ b/hw/rtl/cache/VX_cache_repl.sv @@ -0,0 +1,210 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_cache_define.vh" + +// Fast PLRU encoder and decoder utility +// Adapted from BaseJump STL: http://bjump.org/data_out.html + +module plru_decoder #( + parameter NUM_WAYS = 1, + parameter WAY_IDX_BITS = $clog2(NUM_WAYS), + parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS) +) ( + input wire [WAY_IDX_WIDTH-1:0] way_idx, + output wire [`UP(NUM_WAYS-1)-1:0] lru_data, + output wire [`UP(NUM_WAYS-1)-1:0] lru_mask +); + if (NUM_WAYS > 1) begin : g_dec + wire [`UP(NUM_WAYS-1)-1:0] data; + `IGNORE_UNOPTFLAT_BEGIN + wire [`UP(NUM_WAYS-1)-1:0] mask; + `IGNORE_UNOPTFLAT_END + for (genvar i = 0; i < NUM_WAYS-1; ++i) begin : g_i + if (i == 0) begin : g_i_0 + assign mask[i] = 1'b1; + end else if (i % 2 == 1) begin : g_i_odd + assign mask[i] = mask[(i-1)/2] & ~way_idx[WAY_IDX_BITS-$clog2(i+2)+1]; + end else begin : g_i_even + assign mask[i] = mask[(i-2)/2] & way_idx[WAY_IDX_BITS-$clog2(i+2)+1]; + end + assign data[i] = ~way_idx[WAY_IDX_BITS-$clog2(i+2)]; + end + assign lru_data = data; + assign lru_mask = mask; + end else begin : g_no_dec + `UNUSED_VAR (way_idx) + assign lru_data = '0; + assign lru_mask = '0; + end + +endmodule + +module plru_encoder #( + parameter NUM_WAYS = 1, + parameter 
WAY_IDX_BITS = $clog2(NUM_WAYS), + parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS) +) ( + input wire [`UP(NUM_WAYS-1)-1:0] lru_in, + output wire [WAY_IDX_WIDTH-1:0] way_idx +); + if (NUM_WAYS > 1) begin : g_enc + wire [WAY_IDX_BITS-1:0] tmp; + for (genvar i = 0; i < WAY_IDX_BITS; ++i) begin : g_i + if (i == 0) begin : g_i_0 + assign tmp[WAY_IDX_WIDTH-1] = lru_in[0]; + end else begin : g_i_n + VX_mux #( + .N (2**i) + ) mux ( + .data_in (lru_in[((2**i)-1)+:(2**i)]), + .sel_in (tmp[WAY_IDX_BITS-1-:i]), + .data_out (tmp[WAY_IDX_BITS-1-i]) + ); + end + end + assign way_idx = tmp; + end else begin : g_no_enc + `UNUSED_VAR (lru_in) + assign way_idx = '0; + end + +endmodule + +module VX_cache_repl #( + parameter CACHE_SIZE = 1024, + // Size of line inside a bank in bytes + parameter LINE_SIZE = 64, + // Number of banks + parameter NUM_BANKS = 1, + // Number of associative ways + parameter NUM_WAYS = 1, + // replacement policy + parameter REPL_POLICY = `CS_REPL_FIFO +) ( + input wire clk, + input wire reset, + input wire stall, + input wire init, + input wire lookup_valid, + input wire lookup_hit, + input wire [`CS_LINE_SEL_BITS-1:0] lookup_line, + input wire [`CS_WAY_SEL_WIDTH-1:0] lookup_way, + input wire repl_valid, + input wire [`CS_LINE_SEL_BITS-1:0] repl_line, + output wire [`CS_WAY_SEL_WIDTH-1:0] repl_way +); + localparam WAY_SEL_WIDTH = `CS_WAY_SEL_WIDTH; + `UNUSED_VAR (reset) + `UNUSED_VAR (init) + `UNUSED_VAR (stall) + + if (NUM_WAYS > 1) begin : g_enable + if (REPL_POLICY == `CS_REPL_PLRU) begin : g_plru + // Pseudo Least Recently Used replacement policy + localparam LRU_WIDTH = `UP(NUM_WAYS-1); + + wire [LRU_WIDTH-1:0] plru_rdata; + wire [LRU_WIDTH-1:0] plru_wdata; + wire [LRU_WIDTH-1:0] plru_wmask; + + VX_dp_ram #( + .DATAW (LRU_WIDTH), + .SIZE (`CS_LINES_PER_BANK), + .WRENW (LRU_WIDTH), + .RDW_MODE ("R"), + .RADDR_REG (1) + ) plru_store ( + .clk (clk), + .reset (1'b0), + .read (repl_valid), + .write (init || (lookup_valid && lookup_hit)), + .wren (init ? 
'1 : plru_wmask), + .waddr (lookup_line), + .raddr (repl_line), + .wdata (init ? '0 : plru_wdata), + .rdata (plru_rdata) + ); + + plru_decoder #( + .NUM_WAYS (NUM_WAYS) + ) plru_dec ( + .way_idx (lookup_way), + .lru_data (plru_wdata), + .lru_mask (plru_wmask) + ); + + plru_encoder #( + .NUM_WAYS (NUM_WAYS) + ) plru_enc ( + .lru_in (plru_rdata), + .way_idx (repl_way) + ); + + end else if (REPL_POLICY == `CS_REPL_FIFO) begin : g_fifo + // Fifo replacement policy + `UNUSED_VAR (lookup_valid) + `UNUSED_VAR (lookup_hit) + `UNUSED_VAR (lookup_line) + `UNUSED_VAR (lookup_way) + + wire [WAY_SEL_WIDTH-1:0] fifo_rdata; + wire [WAY_SEL_WIDTH-1:0] fifo_wdata = fifo_rdata + 1; + + VX_sp_ram #( + .DATAW (WAY_SEL_WIDTH), + .SIZE (`CS_LINES_PER_BANK), + .RDW_MODE ("R"), + .RADDR_REG (1) + ) fifo_store ( + .clk (clk), + .reset (1'b0), + .read (repl_valid), + .write (init || repl_valid), + .wren (1'b1), + .addr (repl_line), + .wdata (init ? '0 : fifo_wdata), + .rdata (fifo_rdata) + ); + + assign repl_way = fifo_rdata; + end else begin : g_random + // Random replacement policy + `UNUSED_VAR (lookup_valid) + `UNUSED_VAR (lookup_hit) + `UNUSED_VAR (lookup_line) + `UNUSED_VAR (lookup_way) + `UNUSED_VAR (repl_valid) + `UNUSED_VAR (repl_line) + reg [WAY_SEL_WIDTH-1:0] victim_idx; + always @(posedge clk) begin + if (reset) begin + victim_idx <= 0; + end else if (~stall) begin + victim_idx <= victim_idx + 1; + end + end + assign repl_way = victim_idx; + end + end else begin : g_disable + `UNUSED_VAR (clk) + `UNUSED_VAR (lookup_valid) + `UNUSED_VAR (lookup_hit) + `UNUSED_VAR (lookup_line) + `UNUSED_VAR (lookup_way) + `UNUSED_VAR (repl_valid) + `UNUSED_VAR (repl_line) + assign repl_way = 1'b0; + end + +endmodule diff --git a/hw/rtl/cache/VX_cache_tags.sv b/hw/rtl/cache/VX_cache_tags.sv index 7fef69be6..66b9bc689 100644 --- a/hw/rtl/cache/VX_cache_tags.sv +++ b/hw/rtl/cache/VX_cache_tags.sv @@ -14,8 +14,6 @@ `include "VX_cache_define.vh" module VX_cache_tags #( - parameter `STRING INSTANCE_ID 
= "", - parameter BANK_ID = 0, // Size of cache in bytes parameter CACHE_SIZE = 1024, // Size of line inside a bank in bytes @@ -27,96 +25,61 @@ module VX_cache_tags #( // Size of a word in bytes parameter WORD_SIZE = 1, // Enable cache writeback - parameter WRITEBACK = 0, - // Request debug identifier - parameter UUID_WIDTH = 0 + parameter WRITEBACK = 0 ) ( input wire clk, input wire reset, -`IGNORE_UNUSED_BEGIN - input wire [`UP(UUID_WIDTH)-1:0] req_uuid, -`IGNORE_UNUSED_END - - input wire stall, - - // init/fill/lookup + // inputs input wire init, input wire flush, input wire fill, + input wire read, input wire write, - input wire lookup, - input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, - input wire [NUM_WAYS-1:0] way_sel, - output wire [NUM_WAYS-1:0] tag_matches, + input wire [`CS_LINE_SEL_BITS-1:0] line_idx, + input wire [`CS_TAG_SEL_BITS-1:0] line_tag, + input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way, - // eviction + // outputs + output wire [NUM_WAYS-1:0] tag_matches, output wire evict_dirty, - output wire [NUM_WAYS-1:0] evict_way, output wire [`CS_TAG_SEL_BITS-1:0] evict_tag ); - `UNUSED_SPARAM (INSTANCE_ID) - `UNUSED_PARAM (BANK_ID) - `UNUSED_VAR (lookup) - - // valid, dirty, tag - localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS; - - wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; - wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr); + // valid, dirty, tag + localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS; wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag; wire [NUM_WAYS-1:0] read_valid; wire [NUM_WAYS-1:0] read_dirty; + `UNUSED_VAR (read) - if (NUM_WAYS > 1) begin - reg [NUM_WAYS-1:0] evict_way_r; - // cyclic assignment of replacement way - always @(posedge clk) begin - if (reset) begin - evict_way_r <= 1; - end else if (~stall) begin // holding the value on stalls prevents filling different slots twice - evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]}; - end - end - - assign 
evict_way = fill ? evict_way_r : way_sel; - - VX_onehot_mux #( - .DATAW (`CS_TAG_SEL_BITS), - .N (NUM_WAYS) - ) evict_tag_sel ( - .data_in (read_tag), - .sel_in (evict_way), - .data_out (evict_tag) - ); - end else begin - `UNUSED_VAR (stall) - assign evict_way = 1'b1; - assign evict_tag = read_tag; + if (WRITEBACK) begin : g_evict_tag_wb + assign evict_dirty = read_dirty[evict_way]; + assign evict_tag = read_tag[evict_way]; + end else begin : g_evict_tag_wt + `UNUSED_VAR (read_dirty) + assign evict_dirty = 1'b0; + assign evict_tag = '0; end - // fill and flush need to also read in writeback mode - wire fill_s = fill && (!WRITEBACK || ~stall); - wire flush_s = flush && (!WRITEBACK || ~stall); + for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_store + wire way_en = (NUM_WAYS == 1) || (evict_way == i); + wire do_init = init; // init all ways + wire do_fill = fill && way_en; + wire do_flush = flush && (!WRITEBACK || way_en); // flush the whole line in writethrough mode + wire do_write = WRITEBACK && write && tag_matches[i]; // only write on tag hit - for (genvar i = 0; i < NUM_WAYS; ++i) begin - - wire do_fill = fill_s && evict_way[i]; - wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode - wire do_write = WRITEBACK && write && tag_matches[i]; - - wire line_read = (WRITEBACK && (fill_s || flush_s)); - wire line_write = init || do_fill || do_flush || do_write; - wire line_valid = ~(init || flush); + wire line_read = read || write || (WRITEBACK && (fill || flush)); + wire line_write = do_init || do_fill || do_flush || do_write; + wire line_valid = fill || write; wire [TAG_WIDTH-1:0] line_wdata; wire [TAG_WIDTH-1:0] line_rdata; - if (WRITEBACK) begin + if (WRITEBACK) begin : g_wdata assign line_wdata = {line_valid, write, line_tag}; assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata; - end else begin + end else begin : g_wdata assign line_wdata = {line_valid, line_tag}; assign {read_valid[i], read_tag[i]} = 
line_rdata; assign read_dirty[i] = 1'b0; @@ -125,52 +88,22 @@ module VX_cache_tags #( VX_sp_ram #( .DATAW (TAG_WIDTH), .SIZE (`CS_LINES_PER_BANK), - .NO_RWCHECK (1), - .RW_ASSERT (1) + .RDW_MODE ("W"), + .RADDR_REG (1) ) tag_store ( .clk (clk), .reset (reset), .read (line_read), .write (line_write), .wren (1'b1), - .addr (line_sel), + .addr (line_idx), .wdata (line_wdata), .rdata (line_rdata) ); end - for (genvar i = 0; i < NUM_WAYS; ++i) begin + for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_matches assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]); end - assign evict_dirty = | (read_dirty & evict_way); - -`ifdef DBG_TRACE_CACHE - wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel}; - always @(posedge clk) begin - if (fill && ~stall) begin - `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID))); - end - if (init) begin - `TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel)); - end - if (flush && ~stall) begin - `TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty)); - end - if (lookup && ~stall) begin - if (tag_matches != 0) begin - if (write) - `TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); - else - `TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid)); - end else begin - if (write) - `TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, 
tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); - else - `TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); - end - end - end -`endif - endmodule diff --git a/hw/rtl/cache/VX_cache_top.sv b/hw/rtl/cache/VX_cache_top.sv index 0959701aa..ae8cc3fc6 100644 --- a/hw/rtl/cache/VX_cache_top.sv +++ b/hw/rtl/cache/VX_cache_top.sv @@ -19,8 +19,11 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Number of Word requests per cycle parameter NUM_REQS = 4, + // Number of memory ports + parameter MEM_PORTS = 1, + // Size of cache in bytes - parameter CACHE_SIZE = 16384, + parameter CACHE_SIZE = 65536, // Size of line inside a bank in bytes parameter LINE_SIZE = 64, // Number of banks @@ -28,39 +31,39 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Number of associative ways parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 8, // Miss Reserv Queue Knob parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 8, // Memory Request Queue Size - parameter MREQ_SIZE = 4, + parameter MREQ_SIZE = 8, // Enable cache writeable parameter WRITE_ENABLE = 1, // Enable cache writeback - parameter WRITEBACK = 0, + parameter WRITEBACK = 1, // Enable dirty bytes on writeback - parameter DIRTY_BYTES = 0, + parameter DIRTY_BYTES = 1, // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size - parameter TAG_WIDTH = 16, + parameter TAG_WIDTH = 32, // Core response output buffer - parameter CORE_OUT_BUF = 2, + parameter CORE_OUT_BUF = 3, // Memory request output buffer - parameter MEM_OUT_BUF = 2, + parameter MEM_OUT_BUF = 3, - parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS) + parameter 
MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH) ) ( input wire clk, input wire reset, @@ -71,35 +74,35 @@ module VX_cache_top import VX_gpu_pkg::*; #( `endif // Core request - input wire [NUM_REQS-1:0] core_req_valid, - input wire [NUM_REQS-1:0] core_req_rw, - input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, - input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr, - input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype, - input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data, - input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag, - output wire [NUM_REQS-1:0] core_req_ready, + input wire core_req_valid [NUM_REQS], + input wire core_req_rw [NUM_REQS], + input wire[WORD_SIZE-1:0] core_req_byteen [NUM_REQS], + input wire[`CS_WORD_ADDR_WIDTH-1:0] core_req_addr [NUM_REQS], + input wire[`MEM_REQ_FLAGS_WIDTH-1:0] core_req_flags [NUM_REQS], + input wire[`CS_WORD_WIDTH-1:0] core_req_data [NUM_REQS], + input wire[TAG_WIDTH-1:0] core_req_tag [NUM_REQS], + output wire core_req_ready [NUM_REQS], // Core response - output wire [NUM_REQS-1:0] core_rsp_valid, - output wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data, - output wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag, - input wire [NUM_REQS-1:0] core_rsp_ready, + output wire core_rsp_valid [NUM_REQS], + output wire[`CS_WORD_WIDTH-1:0] core_rsp_data [NUM_REQS], + output wire[TAG_WIDTH-1:0] core_rsp_tag [NUM_REQS], + input wire core_rsp_ready [NUM_REQS], // Memory request - output wire mem_req_valid, - output wire mem_req_rw, - output wire [LINE_SIZE-1:0] mem_req_byteen, - output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr, - output wire [`CS_LINE_WIDTH-1:0] mem_req_data, - output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, - input wire mem_req_ready, + output wire mem_req_valid [MEM_PORTS], + output wire mem_req_rw [MEM_PORTS], + output wire [LINE_SIZE-1:0] mem_req_byteen [MEM_PORTS], + output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr [MEM_PORTS], + output 
wire [`CS_LINE_WIDTH-1:0] mem_req_data [MEM_PORTS], + output wire [MEM_TAG_WIDTH-1:0] mem_req_tag [MEM_PORTS], + input wire mem_req_ready [MEM_PORTS], // Memory response - input wire mem_rsp_valid, - input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data, - input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag, - output wire mem_rsp_ready + input wire mem_rsp_valid [MEM_PORTS], + input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data [MEM_PORTS], + input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag [MEM_PORTS], + output wire mem_rsp_ready [MEM_PORTS] ); VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), @@ -109,7 +112,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE (LINE_SIZE), .TAG_WIDTH (MEM_TAG_WIDTH) - ) mem_bus_if(); + ) mem_bus_if[MEM_PORTS](); // Core request for (genvar i = 0; i < NUM_REQS; ++i) begin @@ -117,7 +120,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( assign core_bus_if[i].req_data.rw = core_req_rw[i]; assign core_bus_if[i].req_data.byteen = core_req_byteen[i]; assign core_bus_if[i].req_data.addr = core_req_addr[i]; - assign core_bus_if[i].req_data.atype = core_req_atype[i]; + assign core_bus_if[i].req_data.flags = core_req_flags[i]; assign core_bus_if[i].req_data.data = core_req_data[i]; assign core_bus_if[i].req_data.tag = core_req_tag[i]; assign core_req_ready[i] = core_bus_if[i].req_ready; @@ -125,29 +128,32 @@ module VX_cache_top import VX_gpu_pkg::*; #( // Core response for (genvar i = 0; i < NUM_REQS; ++i) begin - assign core_rsp_valid[i] = core_bus_if[i].rsp_valid; + assign core_rsp_valid[i]= core_bus_if[i].rsp_valid; assign core_rsp_data[i] = core_bus_if[i].rsp_data.data; - assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag; + assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag; assign core_bus_if[i].rsp_ready = core_rsp_ready[i]; end // Memory request - assign mem_req_valid = mem_bus_if.req_valid; - assign mem_req_rw = mem_bus_if.req_data.rw; - assign mem_req_byteen = mem_bus_if.req_data.byteen; - assign mem_req_addr = mem_bus_if.req_data.addr; - 
assign mem_req_data = mem_bus_if.req_data.data; - assign mem_req_tag = mem_bus_if.req_data.tag; - assign mem_bus_if.req_ready = mem_req_ready; - `UNUSED_VAR (mem_bus_if.req_data.atype) + for (genvar i = 0; i < MEM_PORTS; ++i) begin + assign mem_req_valid[i] = mem_bus_if[i].req_valid; + assign mem_req_rw[i] = mem_bus_if[i].req_data.rw; + assign mem_req_byteen[i]= mem_bus_if[i].req_data.byteen; + assign mem_req_addr[i] = mem_bus_if[i].req_data.addr; + assign mem_req_data[i] = mem_bus_if[i].req_data.data; + assign mem_req_tag[i] = mem_bus_if[i].req_data.tag; + assign mem_bus_if[i].req_ready = mem_req_ready[i]; + end // Memory response - assign mem_bus_if.rsp_valid = mem_rsp_valid; - assign mem_bus_if.rsp_data.data = mem_rsp_data; - assign mem_bus_if.rsp_data.tag = mem_rsp_tag; - assign mem_rsp_ready = mem_bus_if.rsp_ready; + for (genvar i = 0; i < MEM_PORTS; ++i) begin + assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i]; + assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i]; + assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i]; + assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready; + end - VX_cache #( + VX_cache_wrap #( .INSTANCE_ID (INSTANCE_ID), .CACHE_SIZE (CACHE_SIZE), .LINE_SIZE (LINE_SIZE), @@ -155,6 +161,7 @@ module VX_cache_top import VX_gpu_pkg::*; #( .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS), + .MEM_PORTS (MEM_PORTS), .CRSQ_SIZE (CRSQ_SIZE), .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), diff --git a/hw/rtl/cache/VX_cache_wrap.sv b/hw/rtl/cache/VX_cache_wrap.sv index 37940297f..e86d30c3e 100644 --- a/hw/rtl/cache/VX_cache_wrap.sv +++ b/hw/rtl/cache/VX_cache_wrap.sv @@ -21,24 +21,26 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( // Number of Word requests per cycle parameter NUM_REQS = 4, + // Number of memory ports + parameter MEM_PORTS = 1, // Size of cache in bytes parameter CACHE_SIZE = 4096, // Size of line inside a bank in bytes parameter LINE_SIZE = 64, // Number of banks - parameter NUM_BANKS = 1, + parameter NUM_BANKS = 4, 
// Number of associative ways - parameter NUM_WAYS = 1, + parameter NUM_WAYS = 4, // Size of a word in bytes - parameter WORD_SIZE = 4, + parameter WORD_SIZE = 16, // Core Response Queue Size - parameter CRSQ_SIZE = 2, + parameter CRSQ_SIZE = 4, // Miss Reserv Queue Knob - parameter MSHR_SIZE = 8, + parameter MSHR_SIZE = 16, // Memory Response Queue Size - parameter MRSQ_SIZE = 0, + parameter MRSQ_SIZE = 4, // Memory Request Queue Size parameter MREQ_SIZE = 4, @@ -51,12 +53,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( // Enable dirty bytes on writeback parameter DIRTY_BYTES = 0, + // Replacement policy + parameter REPL_POLICY = `CS_REPL_FIFO, + // Request debug identifier parameter UUID_WIDTH = 0, // core request tag size parameter TAG_WIDTH = UUID_WIDTH + 1, + // core request flags + parameter FLAGS_WIDTH = 0, + // enable bypass for non-cacheable addresses parameter NC_ENABLE = 0, @@ -64,10 +72,10 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( parameter PASSTHRU = 0, // Core response output buffer - parameter CORE_OUT_BUF = 0, + parameter CORE_OUT_BUF = 3, // Memory request output buffer - parameter MEM_OUT_BUF = 0 + parameter MEM_OUT_BUF = 3 ) ( input wire clk, @@ -79,19 +87,16 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( `endif VX_mem_bus_if.slave core_bus_if [NUM_REQS], - VX_mem_bus_if.master mem_bus_if + VX_mem_bus_if.master mem_bus_if [MEM_PORTS] ); `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter")) - localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE); - localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS; - - localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) : - (NC_ENABLE ? 
`CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) : - `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)); - - localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU); + localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH); + localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, TAG_WIDTH); + localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1; + localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH); + localparam BYPASS_ENABLE = (NC_ENABLE || PASSTHRU); VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), @@ -101,18 +106,21 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( VX_mem_bus_if #( .DATA_SIZE (LINE_SIZE), .TAG_WIDTH (CACHE_MEM_TAG_WIDTH) - ) mem_bus_cache_if(); + ) mem_bus_cache_if[MEM_PORTS](); - if (NC_OR_BYPASS) begin + VX_mem_bus_if #( + .DATA_SIZE (LINE_SIZE), + .TAG_WIDTH (MEM_TAG_WIDTH) + ) mem_bus_tmp_if[MEM_PORTS](); - `RESET_RELAY (nc_bypass_reset, reset); + if (BYPASS_ENABLE) begin : g_bypass VX_cache_bypass #( .NUM_REQS (NUM_REQS), + .MEM_PORTS (MEM_PORTS), .TAG_SEL_IDX (TAG_SEL_IDX), - .PASSTHRU (PASSTHRU), - .NC_ENABLE (PASSTHRU ? 
0 : NC_ENABLE), + .CACHE_ENABLE (!PASSTHRU), .WORD_SIZE (WORD_SIZE), .LINE_SIZE (LINE_SIZE), @@ -122,7 +130,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH), .MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH), - .MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH), .UUID_WIDTH (UUID_WIDTH), @@ -130,51 +137,35 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .MEM_OUT_BUF (MEM_OUT_BUF) ) cache_bypass ( .clk (clk), - .reset (nc_bypass_reset), + .reset (reset), .core_bus_in_if (core_bus_if), .core_bus_out_if(core_bus_cache_if), .mem_bus_in_if (mem_bus_cache_if), - .mem_bus_out_if (mem_bus_if) + .mem_bus_out_if (mem_bus_tmp_if) ); - end else begin + end else begin : g_no_bypass - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if `ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]); end - `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if); + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_tmp_if + `ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if[i], mem_bus_cache_if[i]); + end end - if (PASSTHRU != 0) begin - - for (genvar i = 0; i < NUM_REQS; ++i) begin - `UNUSED_VAR (core_bus_cache_if[i].req_valid) - `UNUSED_VAR (core_bus_cache_if[i].req_data) - assign core_bus_cache_if[i].req_ready = 0; - - assign core_bus_cache_if[i].rsp_valid = 0; - assign core_bus_cache_if[i].rsp_data = '0; - `UNUSED_VAR (core_bus_cache_if[i].rsp_ready) + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if + if (WRITE_ENABLE) begin : g_we + `ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]); + end else begin : g_ro + `ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]); end + end - assign mem_bus_cache_if.req_valid = 0; - assign mem_bus_cache_if.req_data = '0; - `UNUSED_VAR (mem_bus_cache_if.req_ready) - - `UNUSED_VAR (mem_bus_cache_if.rsp_valid) - `UNUSED_VAR (mem_bus_cache_if.rsp_data) - assign mem_bus_cache_if.rsp_ready = 0; - - `ifdef PERF_ENABLE - assign cache_perf = '0; - `endif - - end else 
begin - - `RESET_RELAY (cache_reset, reset); + if (PASSTHRU == 0) begin : g_cache VX_cache #( .INSTANCE_ID (INSTANCE_ID), @@ -184,20 +175,23 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .NUM_WAYS (NUM_WAYS), .WORD_SIZE (WORD_SIZE), .NUM_REQS (NUM_REQS), + .MEM_PORTS (MEM_PORTS), + .WRITE_ENABLE (WRITE_ENABLE), + .WRITEBACK (WRITEBACK), + .DIRTY_BYTES (DIRTY_BYTES), + .REPL_POLICY (REPL_POLICY), .CRSQ_SIZE (CRSQ_SIZE), .MSHR_SIZE (MSHR_SIZE), .MRSQ_SIZE (MRSQ_SIZE), .MREQ_SIZE (MREQ_SIZE), - .WRITE_ENABLE (WRITE_ENABLE), - .WRITEBACK (WRITEBACK), - .DIRTY_BYTES (DIRTY_BYTES), .UUID_WIDTH (UUID_WIDTH), .TAG_WIDTH (TAG_WIDTH), - .CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF), - .MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF) + .FLAGS_WIDTH (FLAGS_WIDTH), + .CORE_OUT_BUF (BYPASS_ENABLE ? 1 : CORE_OUT_BUF), + .MEM_OUT_BUF (BYPASS_ENABLE ? 1 : MEM_OUT_BUF) ) cache ( .clk (clk), - .reset (cache_reset), + .reset (reset), `ifdef PERF_ENABLE .cache_perf (cache_perf), `endif @@ -205,64 +199,105 @@ module VX_cache_wrap import VX_gpu_pkg::*; #( .mem_bus_if (mem_bus_cache_if) ); + end else begin : g_passthru + + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if + `UNUSED_VX_MEM_BUS_IF (core_bus_cache_if[i]) + end + + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_cache_if + `INIT_VX_MEM_BUS_IF (mem_bus_cache_if[i]) + end + + `ifdef PERF_ENABLE + wire [NUM_REQS-1:0] perf_core_reads_per_req; + wire [NUM_REQS-1:0] perf_core_writes_per_req; + wire [NUM_REQS-1:0] perf_crsp_stall_per_req; + wire [MEM_PORTS-1:0] perf_mem_stall_per_port; + + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req + assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw; + assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw; + assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready; + end + + for 
(genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port + assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready; + end + + // per cycle: read misses, write misses, msrq stalls, pipeline stalls + wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; + wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; + wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle; + + `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req); + `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req); + `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req); + `POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port); + + reg [`PERF_CTR_BITS-1:0] perf_core_reads; + reg [`PERF_CTR_BITS-1:0] perf_core_writes; + reg [`PERF_CTR_BITS-1:0] perf_mem_stalls; + reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls; + + always @(posedge clk) begin + if (reset) begin + perf_core_reads <= '0; + perf_core_writes <= '0; + perf_mem_stalls <= '0; + perf_crsp_stalls <= '0; + end else begin + perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle); + perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle); + perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle); + perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle); + end + end + + assign cache_perf.reads = perf_core_reads; + assign cache_perf.writes = perf_core_writes; + assign cache_perf.read_misses = '0; + assign cache_perf.write_misses = '0; + assign cache_perf.bank_stalls = '0; + assign cache_perf.mshr_stalls = '0; + assign cache_perf.mem_stalls = perf_mem_stalls; + assign cache_perf.crsp_stalls = perf_crsp_stalls; + `endif + end `ifdef DBG_TRACE_CACHE - - for (genvar i = 0; i < NUM_REQS; ++i) begin - wire [`UP(UUID_WIDTH)-1:0] core_req_uuid; - wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid; - - if (UUID_WIDTH != 0) begin - 
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH]; - assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin - assign core_req_uuid = 0; - assign core_rsp_uuid = 0; - end - - wire core_req_fire = core_bus_if[i].req_valid && core_bus_if[i].req_ready; - wire core_rsp_fire = core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready; - + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace_core always @(posedge clk) begin - if (core_req_fire) begin - if (core_bus_if[i].req_data.rw) - `TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)); - else - `TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid)); + if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin + if (core_bus_if[i].req_data.rw) begin + `TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid)) + end else begin + `TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid)) + end end - if (core_rsp_fire) begin - `TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)); + if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin + `TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h 
(#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid)) end end end - wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid; - wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid; - - if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin - assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH]; - assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin - assign mem_req_uuid = 0; - assign mem_rsp_uuid = 0; - end - - wire mem_req_fire = mem_bus_if.req_valid && mem_bus_if.req_ready; - wire mem_rsp_fire = mem_bus_if.rsp_valid && mem_bus_if.rsp_ready; - - always @(posedge clk) begin - if (mem_req_fire) begin - if (mem_bus_if.req_data.rw) - `TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", - $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid)); - else - `TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", - $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid)); - end - if (mem_rsp_fire) begin - `TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", - $time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid)); + for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_trace_mem + always @(posedge clk) begin + if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin + if (mem_bus_if[i].req_data.rw) begin + `TRACE(2, ("%t: %s mem-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", + $time, INSTANCE_ID, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.uuid)) + end else begin + `TRACE(2, ("%t: %s mem-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, 
`TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid)) + end + end + if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin + `TRACE(2, ("%t: %s mem-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid)) + end end end `endif diff --git a/hw/rtl/core/VX_alu_int.sv b/hw/rtl/core/VX_alu_int.sv index 47bfcc6bf..8e43d8f3f 100644 --- a/hw/rtl/core/VX_alu_int.sv +++ b/hw/rtl/core/VX_alu_int.sv @@ -71,19 +71,19 @@ module VX_alu_int #( wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2; wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_add_result assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i]; assign add_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] + alu_in2_imm[i][31:0])); end - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_sub_result wire [`XLEN:0] sub_in1 = {is_signed & alu_in1[i][`XLEN-1], alu_in1[i]}; wire [`XLEN:0] sub_in2 = {is_signed & alu_in2_br[i][`XLEN-1], alu_in2_br[i]}; assign sub_result[i] = sub_in1 - sub_in2; assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0])); end - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_shr_result wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]}; always @(*) begin case (alu_op[1:0]) @@ -102,7 +102,7 @@ module VX_alu_int #( assign shr_result_w[i] = `XLEN'($signed(shr_res_w)); end - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_msc_result 
always @(*) begin case (alu_op[1:0]) 2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND @@ -114,7 +114,7 @@ module VX_alu_int #( assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW end - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_alu_result wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]}); wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result; always @(*) begin @@ -141,9 +141,9 @@ module VX_alu_int #( assign cbr_dest = add_result[0][1 +: `PC_BITS]; - if (LANE_BITS != 0) begin + if (LANE_BITS != 0) begin : g_tid assign tid = execute_if.data.tid[0 +: LANE_BITS]; - end else begin + end else begin : g_tid_0 assign tid = 0; end @@ -185,7 +185,7 @@ module VX_alu_int #( .data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest}) ); - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? 
{(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i]; end @@ -194,8 +194,8 @@ module VX_alu_int #( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (br_enable) begin - `TRACE(1, ("%d: %s-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n", - $time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid)); + `TRACE(2, ("%t: %s branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n", + $time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid)) end end `endif diff --git a/hw/rtl/core/VX_alu_muldiv.sv b/hw/rtl/core/VX_alu_muldiv.sv index 3beb035f4..d374013bc 100644 --- a/hw/rtl/core/VX_alu_muldiv.sv +++ b/hw/rtl/core/VX_alu_muldiv.sv @@ -68,7 +68,7 @@ module VX_alu_muldiv #( wire mul_fire_in = mul_valid_in && mul_ready_in; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_tmp reg [`XLEN-1:0] mul_resultl, mul_resulth; wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i]; wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i]; @@ -103,7 +103,7 @@ module VX_alu_muldiv #( wire [NUM_LANES-1:0][`XLEN:0] mul_in1; wire [NUM_LANES-1:0][`XLEN:0] mul_in2; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_in assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]}; assign mul_in2[i] = is_alu_w ? 
{{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]}; end @@ -149,7 +149,7 @@ module VX_alu_muldiv #( `else - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_multiplier wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]}; wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]}; @@ -184,7 +184,7 @@ module VX_alu_muldiv #( `endif - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_out `ifdef XLEN_64 assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] : (is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) : @@ -219,7 +219,7 @@ module VX_alu_muldiv #( wire [NUM_LANES-1:0][`XLEN-1:0] div_in1; wire [NUM_LANES-1:0][`XLEN-1:0] div_in2; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_in `ifdef XLEN_64 assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i]; assign div_in2[i] = is_alu_w ? 
{{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i]; @@ -234,7 +234,7 @@ module VX_alu_muldiv #( wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in; wire div_fire_in = div_valid_in && div_ready_in; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_in reg [`XLEN-1:0] div_quotient, div_remainder; always @(*) begin dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder); @@ -306,7 +306,7 @@ module VX_alu_muldiv #( assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_out `ifdef XLEN_64 assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) : (is_div_w_out ? 
`XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]); @@ -324,8 +324,8 @@ module VX_alu_muldiv #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)), - .ARBITER ("F"), - .OUT_BUF (1) + .ARBITER ("P"), + .OUT_BUF (2) ) rsp_buf ( .clk (clk), .reset (reset), diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index 86bcaf05e..e87221709 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -30,20 +30,24 @@ module VX_alu_unit #( `UNUSED_SPARAM (INSTANCE_ID) localparam BLOCK_SIZE = `NUM_ALU_BLOCKS; localparam NUM_LANES = `NUM_ALU_LANES; - localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); - localparam PID_WIDTH = `UP(PID_BITS); - localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; - localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED; localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); + localparam PE_COUNT = 1 + `EXT_M_ENABLED; + localparam PE_SEL_BITS = `CLOG2(PE_COUNT); + localparam PE_IDX_INT = 0; + localparam PE_IDX_MDV = PE_IDX_INT + `EXT_M_ENABLED; VX_execute_if #( .NUM_LANES (NUM_LANES) ) per_block_execute_if[BLOCK_SIZE](); + VX_commit_if #( + .NUM_LANES (NUM_LANES) + ) per_block_commit_if[BLOCK_SIZE](); + VX_dispatch_unit #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_BUF (PARTIAL_BW ? 1 : 0) + .OUT_BUF (PARTIAL_BW ? 
3 : 0) ) dispatch_unit ( .clk (clk), .reset (reset), @@ -51,103 +55,62 @@ module VX_alu_unit #( .execute_if (per_block_execute_if) ); - VX_commit_if #( - .NUM_LANES (NUM_LANES) - ) per_block_commit_if[BLOCK_SIZE](); - - for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin - - `RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1)); - - wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV); + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_alus VX_execute_if #( .NUM_LANES (NUM_LANES) - ) int_execute_if(); + ) pe_execute_if[PE_COUNT](); - VX_commit_if #( + VX_commit_if#( .NUM_LANES (NUM_LANES) - ) int_commit_if(); + ) pe_commit_if[PE_COUNT](); - assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op; - assign int_execute_if.data = per_block_execute_if[block_idx].data; + reg [`UP(PE_SEL_BITS)-1:0] pe_select; + always @(*) begin + pe_select = PE_IDX_INT; + if (`EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV)) + pe_select = PE_IDX_MDV; + end + + VX_pe_switch #( + .PE_COUNT (PE_COUNT), + .NUM_LANES (NUM_LANES), + .ARBITER ("R"), + .REQ_OUT_BUF (0), + .RSP_OUT_BUF (PARTIAL_BW ? 
1 : 3) + ) pe_switch ( + .clk (clk), + .reset (reset), + .pe_sel (pe_select), + .execute_in_if (per_block_execute_if[block_idx]), + .commit_out_if (per_block_commit_if[block_idx]), + .execute_out_if (pe_execute_if), + .commit_in_if (pe_commit_if) + ); VX_alu_int #( - .INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)), + .INSTANCE_ID (`SFORMATF(("%s-int%0d", INSTANCE_ID, block_idx))), .BLOCK_IDX (block_idx), .NUM_LANES (NUM_LANES) ) alu_int ( .clk (clk), - .reset (block_reset), - .execute_if (int_execute_if), + .reset (reset), + .execute_if (pe_execute_if[PE_IDX_INT]), .branch_ctl_if (branch_ctl_if[block_idx]), - .commit_if (int_commit_if) + .commit_if (pe_commit_if[PE_IDX_INT]) ); `ifdef EXT_M_ENABLE - - VX_execute_if #( - .NUM_LANES (NUM_LANES) - ) muldiv_execute_if(); - - VX_commit_if #( - .NUM_LANES (NUM_LANES) - ) muldiv_commit_if(); - - assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op; - assign muldiv_execute_if.data = per_block_execute_if[block_idx].data; - VX_alu_muldiv #( - .INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)), + .INSTANCE_ID (`SFORMATF(("%s-muldiv%0d", INSTANCE_ID, block_idx))), .NUM_LANES (NUM_LANES) ) muldiv_unit ( .clk (clk), - .reset (block_reset), - .execute_if (muldiv_execute_if), - .commit_if (muldiv_commit_if) + .reset (reset), + .execute_if (pe_execute_if[PE_IDX_MDV]), + .commit_if (pe_commit_if[PE_IDX_MDV]) ); - `endif - - assign per_block_execute_if[block_idx].ready = - `ifdef EXT_M_ENABLE - is_muldiv_op ? muldiv_execute_if.ready : - `endif - int_execute_if.ready; - - // send response - - VX_stream_arb #( - .NUM_INPUTS (RSP_ARB_SIZE), - .DATAW (RSP_ARB_DATAW), - .OUT_BUF (PARTIAL_BW ? 
1 : 3), - .ARBITER ("F") - ) rsp_arb ( - .clk (clk), - .reset (block_reset), - .valid_in ({ - `ifdef EXT_M_ENABLE - muldiv_commit_if.valid, - `endif - int_commit_if.valid - }), - .ready_in ({ - `ifdef EXT_M_ENABLE - muldiv_commit_if.ready, - `endif - int_commit_if.ready - }), - .data_in ({ - `ifdef EXT_M_ENABLE - muldiv_commit_if.data, - `endif - int_commit_if.data - }), - .data_out (per_block_commit_if[block_idx].data), - .valid_out (per_block_commit_if[block_idx].valid), - .ready_out (per_block_commit_if[block_idx].ready), - `UNUSED_PIN (sel_out) - ); end VX_gather_unit #( diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv index d78c2ec89..d53331928 100644 --- a/hw/rtl/core/VX_commit.sv +++ b/hw/rtl/core/VX_commit.sv @@ -13,7 +13,7 @@ `include "VX_define.vh" -module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #( +module VX_commit import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "" ) ( input wire clk, @@ -41,28 +41,26 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask; wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop; - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_arbs wire [`NUM_EX_UNITS-1:0] valid_in; wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in; wire [`NUM_EX_UNITS-1:0] ready_in; - for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin + for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_data_in assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid; assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data; assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j]; end - `RESET_RELAY (arb_reset, reset); - VX_stream_arb #( .NUM_INPUTS (`NUM_EX_UNITS), .DATAW (DATAW), - .ARBITER ("R"), + .ARBITER ("P"), .OUT_BUF (1) ) commit_arb ( .clk (clk), - .reset (arb_reset), + .reset (reset), .valid_in (valid_in), .ready_in (ready_in), .data_in (data_in), @@ -86,7 +84,7 @@ module VX_commit import VX_gpu_pkg::*, 
VX_trace_pkg::*; #( assign commit_fire_any = (| per_issue_commit_fire); - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_size wire [COMMIT_SIZEW-1:0] count; `POP_COUNT(count, per_issue_commit_tmask[i]); assign commit_size[i] = count; @@ -103,7 +101,7 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #( .data_out ({commit_fire_any_r, commit_size_r}) ); - VX_reduce #( + VX_reduce_tree #( .DATAW_IN (COMMIT_SIZEW), .DATAW_OUT (COMMIT_ALL_SIZEW), .N (`ISSUE_WIDTH), @@ -162,7 +160,7 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #( // Writeback - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback assign writeback_if[i].valid = commit_arb_if[i].valid && commit_arb_if[i].data.wb; assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid; assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid); @@ -176,15 +174,15 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #( end `ifdef DBG_TRACE_PIPELINE - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin - for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_trace + for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_j always @(posedge clk) begin if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin - `TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0})); + `TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0})) trace_ex_type(1, j); - `TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * 
`ISSUE_WIDTH + i].data.eop)); - `TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS); - `TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid)); + `TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop)) + `TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS) + `TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid)) end end end diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index 4c82db812..34bbcfb48 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #( input wire reset, `ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_if, + input sysmem_perf_t sysmem_perf, `endif VX_dcr_bus_if.slave dcr_bus_if, @@ -65,44 +65,37 @@ module VX_core import VX_gpu_pkg::*; #( ) lsu_mem_if[`NUM_LSU_BLOCKS](); `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_tmp_if(); - VX_pipeline_perf_if pipeline_perf_if(); - - assign mem_perf_tmp_if.icache = mem_perf_if.icache; - assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; - assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; - assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; - assign mem_perf_tmp_if.mem = mem_perf_if.mem; + lmem_perf_t lmem_perf; + coalescer_perf_t coalescer_perf; + pipeline_perf_t pipeline_perf; + sysmem_perf_t sysmem_perf_tmp; + always @(*) begin + sysmem_perf_tmp = sysmem_perf; + sysmem_perf_tmp.lmem = lmem_perf; + sysmem_perf_tmp.coalescer = coalescer_perf; + end `endif - `RESET_RELAY (dcr_data_reset, reset); - `RESET_RELAY (schedule_reset, reset); - `RESET_RELAY (fetch_reset, reset); - `RESET_RELAY (decode_reset, reset); - `RESET_RELAY (issue_reset, reset); - `RESET_RELAY (execute_reset, reset); - `RESET_RELAY (commit_reset, reset); - 
base_dcrs_t base_dcrs; VX_dcr_data dcr_data ( .clk (clk), - .reset (dcr_data_reset), + .reset (reset), .dcr_bus_if (dcr_bus_if), .base_dcrs (base_dcrs) ); - `SCOPE_IO_SWITCH (3) + `SCOPE_IO_SWITCH (3); VX_schedule #( - .INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-schedule", INSTANCE_ID))), .CORE_ID (CORE_ID) ) schedule ( .clk (clk), - .reset (schedule_reset), + .reset (reset), `ifdef PERF_ENABLE - .sched_perf (pipeline_perf_if.sched), + .sched_perf (pipeline_perf.sched), `endif .base_dcrs (base_dcrs), @@ -123,36 +116,36 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_fetch #( - .INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-fetch", INSTANCE_ID))) ) fetch ( `SCOPE_IO_BIND (0) .clk (clk), - .reset (fetch_reset), + .reset (reset), .icache_bus_if (icache_bus_if), .schedule_if (schedule_if), .fetch_if (fetch_if) ); VX_decode #( - .INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-decode", INSTANCE_ID))) ) decode ( .clk (clk), - .reset (decode_reset), + .reset (reset), .fetch_if (fetch_if), .decode_if (decode_if), .decode_sched_if(decode_sched_if) ); VX_issue #( - .INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-issue", INSTANCE_ID))) ) issue ( `SCOPE_IO_BIND (1) .clk (clk), - .reset (issue_reset), + .reset (reset), `ifdef PERF_ENABLE - .issue_perf (pipeline_perf_if.issue), + .issue_perf (pipeline_perf.issue), `endif .decode_if (decode_if), @@ -161,17 +154,17 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_execute #( - .INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-execute", INSTANCE_ID))), .CORE_ID (CORE_ID) ) execute ( `SCOPE_IO_BIND (2) .clk (clk), - .reset (execute_reset), + .reset (reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_tmp_if), - .pipeline_perf_if(pipeline_perf_if), + .sysmem_perf (sysmem_perf_tmp), + .pipeline_perf (pipeline_perf), `endif .base_dcrs (base_dcrs), @@ -189,10 
+182,10 @@ module VX_core import VX_gpu_pkg::*; #( ); VX_commit #( - .INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-commit", INSTANCE_ID))) ) commit ( .clk (clk), - .reset (commit_reset), + .reset (reset), .commit_if (commit_if), @@ -202,134 +195,19 @@ module VX_core import VX_gpu_pkg::*; #( .commit_sched_if(commit_sched_if) ); - VX_lsu_mem_if #( - .NUM_LANES (`NUM_LSU_LANES), - .DATA_SIZE (LSU_WORD_SIZE), - .TAG_WIDTH (LSU_TAG_WIDTH) - ) lsu_dcache_if[`NUM_LSU_BLOCKS](); - -`ifdef LMEM_ENABLE - - `RESET_RELAY (lmem_unit_reset, reset); - - VX_lmem_unit #( + VX_mem_unit #( .INSTANCE_ID (INSTANCE_ID) - ) lmem_unit ( - .clk (clk), - .reset (lmem_unit_reset), + ) mem_unit ( + .clk (clk), + .reset (reset), `ifdef PERF_ENABLE - .cache_perf (mem_perf_tmp_if.lmem), + .lmem_perf (lmem_perf), + .coalescer_perf(coalescer_perf), `endif - .lsu_mem_in_if (lsu_mem_if), - .lsu_mem_out_if (lsu_dcache_if) + .lsu_mem_if (lsu_mem_if), + .dcache_bus_if (dcache_bus_if) ); -`else - - for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin - `ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]); - end - -`endif - - for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin - - VX_lsu_mem_if #( - .NUM_LANES (DCACHE_CHANNELS), - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_TAG_WIDTH) - ) dcache_coalesced_if(); - - if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin - - `RESET_RELAY (mem_coalescer_reset, reset); - - VX_mem_coalescer #( - .INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)), - .NUM_REQS (`NUM_LSU_LANES), - .DATA_IN_SIZE (LSU_WORD_SIZE), - .DATA_OUT_SIZE (DCACHE_WORD_SIZE), - .ADDR_WIDTH (LSU_ADDR_WIDTH), - .ATYPE_WIDTH (`ADDR_TYPE_WIDTH), - .TAG_WIDTH (LSU_TAG_WIDTH), - .UUID_WIDTH (`UUID_WIDTH), - .QUEUE_SIZE (`LSUQ_OUT_SIZE) - ) mem_coalescer ( - .clk (clk), - .reset (mem_coalescer_reset), - - // Input request - .in_req_valid (lsu_dcache_if[i].req_valid), - .in_req_mask (lsu_dcache_if[i].req_data.mask), - .in_req_rw 
(lsu_dcache_if[i].req_data.rw), - .in_req_byteen (lsu_dcache_if[i].req_data.byteen), - .in_req_addr (lsu_dcache_if[i].req_data.addr), - .in_req_atype (lsu_dcache_if[i].req_data.atype), - .in_req_data (lsu_dcache_if[i].req_data.data), - .in_req_tag (lsu_dcache_if[i].req_data.tag), - .in_req_ready (lsu_dcache_if[i].req_ready), - - // Input response - .in_rsp_valid (lsu_dcache_if[i].rsp_valid), - .in_rsp_mask (lsu_dcache_if[i].rsp_data.mask), - .in_rsp_data (lsu_dcache_if[i].rsp_data.data), - .in_rsp_tag (lsu_dcache_if[i].rsp_data.tag), - .in_rsp_ready (lsu_dcache_if[i].rsp_ready), - - // Output request - .out_req_valid (dcache_coalesced_if.req_valid), - .out_req_mask (dcache_coalesced_if.req_data.mask), - .out_req_rw (dcache_coalesced_if.req_data.rw), - .out_req_byteen (dcache_coalesced_if.req_data.byteen), - .out_req_addr (dcache_coalesced_if.req_data.addr), - .out_req_atype (dcache_coalesced_if.req_data.atype), - .out_req_data (dcache_coalesced_if.req_data.data), - .out_req_tag (dcache_coalesced_if.req_data.tag), - .out_req_ready (dcache_coalesced_if.req_ready), - - // Output response - .out_rsp_valid (dcache_coalesced_if.rsp_valid), - .out_rsp_mask (dcache_coalesced_if.rsp_data.mask), - .out_rsp_data (dcache_coalesced_if.rsp_data.data), - .out_rsp_tag (dcache_coalesced_if.rsp_data.tag), - .out_rsp_ready (dcache_coalesced_if.rsp_ready) - ); - - end else begin - - `ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if, lsu_dcache_if[i]); - - end - - VX_mem_bus_if #( - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_TAG_WIDTH) - ) dcache_bus_tmp_if[DCACHE_CHANNELS](); - - `RESET_RELAY (lsu_adapter_reset, reset); - - VX_lsu_adapter #( - .NUM_LANES (DCACHE_CHANNELS), - .DATA_SIZE (DCACHE_WORD_SIZE), - .TAG_WIDTH (DCACHE_TAG_WIDTH), - .TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH), - .ARBITER ("P"), - .REQ_OUT_BUF (0), - .RSP_OUT_BUF (0) - ) lsu_adapter ( - .clk (clk), - .reset (lsu_adapter_reset), - .lsu_mem_if (dcache_coalesced_if), - .mem_bus_if (dcache_bus_tmp_if) - ); - - 
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin - `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]); - end - - end - - `ifdef PERF_ENABLE wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; @@ -353,8 +231,8 @@ module VX_core import VX_gpu_pkg::*; #( wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r; wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire; - for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin - for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_perf_dcache + for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_j assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw; assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw; assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready; @@ -400,12 +278,11 @@ module VX_core import VX_gpu_pkg::*; #( end end - assign pipeline_perf_if.ifetches = perf_ifetches; - assign pipeline_perf_if.loads = perf_loads; - assign pipeline_perf_if.stores = perf_stores; - assign pipeline_perf_if.load_latency = perf_dcache_lat; - assign pipeline_perf_if.ifetch_latency = perf_icache_lat; - assign pipeline_perf_if.load_latency = perf_dcache_lat; + assign pipeline_perf.ifetches = perf_ifetches; + assign pipeline_perf.loads = perf_loads; + assign pipeline_perf.stores = perf_stores; + assign pipeline_perf.ifetch_latency = perf_icache_lat; + assign pipeline_perf.load_latency = perf_dcache_lat; `endif diff --git a/hw/rtl/core/VX_core_top.sv b/hw/rtl/core/VX_core_top.sv index 420ae7b67..ca0bd4cf6 100644 --- a/hw/rtl/core/VX_core_top.sv +++ b/hw/rtl/core/VX_core_top.sv @@ -32,7 +32,7 @@ module VX_core_top import 
VX_gpu_pkg::*; #( output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw, output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen, output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr, - output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype, + output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] dcache_req_flags, output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data, output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag, input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready, @@ -96,7 +96,7 @@ module VX_core_top import VX_gpu_pkg::*; #( assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw; assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen; assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr; - assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype; + assign dcache_req_flags[i] = dcache_bus_if[i].req_data.flags; assign dcache_req_data[i] = dcache_bus_if[i].req_data.data; assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag; assign dcache_bus_if[i].req_ready = dcache_req_ready[i]; @@ -119,7 +119,7 @@ module VX_core_top import VX_gpu_pkg::*; #( assign icache_req_data = icache_bus_if.req_data.data; assign icache_req_tag = icache_bus_if.req_data.tag; assign icache_bus_if.req_ready = icache_req_ready; - `UNUSED_VAR (icache_bus_if.req_data.atype) + `UNUSED_VAR (icache_bus_if.req_data.flags) assign icache_bus_if.rsp_valid = icache_rsp_valid; assign icache_bus_if.rsp_data.tag = icache_rsp_tag; @@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #( assign icache_rsp_ready = icache_bus_if.rsp_ready; `ifdef PERF_ENABLE - VX_mem_perf_if mem_perf_if(); - assign mem_perf_if.icache = '0; - assign mem_perf_if.dcache = '0; - assign mem_perf_if.l2cache = '0; - assign mem_perf_if.l3cache = '0; - assign mem_perf_if.lmem = '0; - assign mem_perf_if.mem = '0; + sysmem_perf_t mem_perf; + assign mem_perf.icache = '0; + assign mem_perf.dcache = '0; + assign 
mem_perf.l2cache = '0; + assign mem_perf.l3cache = '0; + assign mem_perf.lmem = '0; + assign mem_perf.mem = '0; `endif `ifdef SCOPE @@ -144,7 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #( `endif VX_core #( - .INSTANCE_ID ($sformatf("core")), + .INSTANCE_ID (`SFORMATF(("core"))), .CORE_ID (CORE_ID) ) core ( `SCOPE_IO_BIND (0) @@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #( .reset (reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if), + .sysmem_perf (sysmem_perf), `endif .dcr_bus_if (dcr_bus_if), diff --git a/hw/rtl/core/VX_csr_data.sv b/hw/rtl/core/VX_csr_data.sv index a2b0741ad..9ba72a353 100644 --- a/hw/rtl/core/VX_csr_data.sv +++ b/hw/rtl/core/VX_csr_data.sv @@ -41,8 +41,8 @@ import VX_fpu_pkg::*; input base_dcrs_t base_dcrs, `ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_if, - VX_pipeline_perf_if.slave pipeline_perf_if, + input sysmem_perf_t sysmem_perf, + input pipeline_perf_t pipeline_perf, `endif VX_commit_csr_if.slave commit_csr_if, @@ -83,7 +83,7 @@ import VX_fpu_pkg::*; wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid; fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags; - for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin + for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_write assign fpu_write_enable[i] = fpu_csr_if[i].write_enable; assign fpu_write_wid[i] = fpu_csr_if[i].write_wid; assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags; @@ -107,7 +107,7 @@ import VX_fpu_pkg::*; end end - for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin + for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_csr_read_frm assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]; end @@ -155,41 +155,41 @@ import VX_fpu_pkg::*; // CSRs read ////////////////////////////////////////////////////////////// - reg [`XLEN-1:0] read_data_ro_r; - reg [`XLEN-1:0] read_data_rw_r; - reg read_addr_valid_r; + reg [`XLEN-1:0] read_data_ro_w; + reg [`XLEN-1:0] read_data_rw_w; + reg 
read_addr_valid_w; always @(*) begin - read_data_ro_r = '0; - read_data_rw_r = '0; - read_addr_valid_r = 1; + read_data_ro_w = '0; + read_data_rw_w = '0; + read_addr_valid_w = 1; case (read_addr) - `VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID); - `VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID); - `VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID); - `VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)}); + `VX_CSR_MVENDORID : read_data_ro_w = `XLEN'(`VENDOR_ID); + `VX_CSR_MARCHID : read_data_ro_w = `XLEN'(`ARCHITECTURE_ID); + `VX_CSR_MIMPID : read_data_ro_w = `XLEN'(`IMPLEMENTATION_ID); + `VX_CSR_MISA : read_data_ro_w = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)}); `ifdef EXT_F_ENABLE - `VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]); - `VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]); - `VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]); + `VX_CSR_FFLAGS : read_data_rw_w = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]); + `VX_CSR_FRM : read_data_rw_w = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]); + `VX_CSR_FCSR : read_data_rw_w = `XLEN'(fcsr[read_wid]); `endif - `VX_CSR_MSCRATCH : read_data_rw_r = mscratch; + `VX_CSR_MSCRATCH : read_data_rw_w = mscratch; - `VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid); - `VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID); - `VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]); - `VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps); - `VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS); - `VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS); - `VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS); - `VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR); + `VX_CSR_WARP_ID : read_data_ro_w = `XLEN'(read_wid); + `VX_CSR_CORE_ID : read_data_ro_w = `XLEN'(CORE_ID); + `VX_CSR_ACTIVE_THREADS: 
read_data_ro_w = `XLEN'(thread_masks[read_wid]); + `VX_CSR_ACTIVE_WARPS: read_data_ro_w = `XLEN'(active_warps); + `VX_CSR_NUM_THREADS: read_data_ro_w = `XLEN'(`NUM_THREADS); + `VX_CSR_NUM_WARPS : read_data_ro_w = `XLEN'(`NUM_WARPS); + `VX_CSR_NUM_CORES : read_data_ro_w = `XLEN'(`NUM_CORES * `NUM_CLUSTERS); + `VX_CSR_LOCAL_MEM_BASE: read_data_ro_w = `XLEN'(`LMEM_BASE_ADDR); - `CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles); + `CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_w, cycles); - `VX_CSR_MPM_RESERVED : read_data_ro_r = 'x; - `VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x; + `VX_CSR_MPM_RESERVED : read_data_ro_w = 'x; + `VX_CSR_MPM_RESERVED_H : read_data_ro_w = 'x; - `CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret); + `CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_w, commit_csr_if.instret); `VX_CSR_SATP, `VX_CSR_MSTATUS, @@ -200,77 +200,79 @@ import VX_fpu_pkg::*; `VX_CSR_MTVEC, `VX_CSR_MEPC, `VX_CSR_PMPCFG0, - `VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0); + `VX_CSR_PMPADDR0 : read_data_ro_w = `XLEN'(0); default: begin - read_addr_valid_r = 0; + read_addr_valid_w = 0; if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32)) || (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin - read_addr_valid_r = 1; + read_addr_valid_w = 1; `ifdef PERF_ENABLE case (base_dcrs.mpm_class) `VX_DCR_MPM_CLASS_CORE: begin case (read_addr) // PERF: pipeline - `CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles); - `CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls); - `CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls); - `CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls); - `CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls); - `CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]); + `CSR_READ_64(`VX_CSR_MPM_SCHED_ID, 
read_data_ro_w, pipeline_perf.sched.idles); + `CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls); + `CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls); + `CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls); + `CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls); + `CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]); `ifdef EXT_F_ENABLE - `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]); + `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]); `else - `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0)); + `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0)); `endif - `CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]); - `CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]); - `CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]); - `CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]); + `CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]); + `CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]); + `CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]); + `CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]); // PERF: memory - `CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches); - `CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads); - `CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores); - `CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency); - `CSR_READ_64(`VX_CSR_MPM_LOAD_LT, 
read_data_ro_r, pipeline_perf_if.load_latency); + `CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches); + `CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads); + `CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores); + `CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency); + `CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency); default:; endcase end `VX_DCR_MPM_CLASS_MEM: begin case (read_addr) // PERF: icache - `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads); - `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses); - `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls); + `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads); + `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses); + `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls); // PERF: dcache - `CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads); - `CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes); - `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses); - `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses); - `CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls); - `CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls); + `CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads); + `CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes); + `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses); + `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses); + 
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls); + `CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls); // PERF: lmem - `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads); - `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes); - `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls); + `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads); + `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes); + `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls); // PERF: l2cache - `CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads); - `CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes); - `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses); - `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses); - `CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls); - `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls); + `CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads); + `CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes); + `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses); + `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses); + `CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls); + `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls); // PERF: l3cache - `CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads); - `CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, 
mem_perf_if.l3cache.writes); - `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses); - `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses); - `CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls); - `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls); + `CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads); + `CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes); + `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses); + `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses); + `CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls); + `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls); // PERF: memory - `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads); - `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes); - `CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency); + `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads); + `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes); + `CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency); + // PERF: coalescer + `CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses); default:; endcase end @@ -282,16 +284,16 @@ import VX_fpu_pkg::*; endcase end - assign read_data_ro = read_data_ro_r; - assign read_data_rw = read_data_rw_r; + assign read_data_ro = read_data_ro_w; + assign read_data_rw = read_data_rw_w; `UNUSED_VAR (base_dcrs) - `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid)) + `RUNTIME_ASSERT(~read_enable || read_addr_valid_w, 
("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid)) `ifdef PERF_ENABLE - `UNUSED_VAR (mem_perf_if.icache); - `UNUSED_VAR (mem_perf_if.lmem); + `UNUSED_VAR (sysmem_perf.icache); + `UNUSED_VAR (sysmem_perf.lmem); `endif endmodule diff --git a/hw/rtl/core/VX_csr_unit.sv b/hw/rtl/core/VX_csr_unit.sv index 999c9c416..24969f5ec 100644 --- a/hw/rtl/core/VX_csr_unit.sv +++ b/hw/rtl/core/VX_csr_unit.sv @@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #( input base_dcrs_t base_dcrs, `ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_if, - VX_pipeline_perf_if.slave pipeline_perf_if, + input sysmem_perf_t sysmem_perf, + input pipeline_perf_t pipeline_perf, `endif `ifdef EXT_F_ENABLE @@ -66,7 +66,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #( wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data; `UNUSED_VAR (rs1_data) - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_rs1_data assign rs1_data[i] = execute_if.data.rs1_data[i]; end @@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #( .base_dcrs (base_dcrs), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if), - .pipeline_perf_if(pipeline_perf_if), + .sysmem_perf (sysmem_perf), + .pipeline_perf (pipeline_perf), `endif .commit_csr_if (commit_csr_if), @@ -113,12 +113,15 @@ module VX_csr_unit import VX_gpu_pkg::*; #( wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid; - for (genvar i = 0; i < NUM_LANES; ++i) begin - if (PID_BITS != 0) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_wtid + if (PID_BITS != 0) begin : g_pid assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i); - end else begin + end else begin : g_no_pid assign wtid[i] = `XLEN'(i); end + end + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_gtid assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i]; end diff --git a/hw/rtl/core/VX_dcr_data.sv b/hw/rtl/core/VX_dcr_data.sv index 4ac137547..6a13e034a 100644 --- 
a/hw/rtl/core/VX_dcr_data.sv +++ b/hw/rtl/core/VX_dcr_data.sv @@ -13,7 +13,7 @@ `include "VX_define.vh" -module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; ( +module VX_dcr_data import VX_gpu_pkg::*; ( input wire clk, input wire reset, @@ -50,9 +50,9 @@ module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; ( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (dcr_bus_if.write_valid) begin - `TRACE(1, ("%d: base-dcr: state=", $time)); + `TRACE(2, ("%t: base-dcr: state=", $time)) trace_base_dcr(1, dcr_bus_if.write_addr); - `TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data)); + `TRACE(2, (", data=0x%h\n", dcr_bus_if.write_data)) end end `endif diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 9660859ce..70bb181a1 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -15,19 +15,19 @@ `ifdef EXT_F_ENABLE `define USED_IREG(x) \ - x``_r = {1'b0, ``x}; \ + x``_v = {1'b0, ``x}; \ use_``x = 1 `define USED_FREG(x) \ - x``_r = {1'b1, ``x}; \ + x``_v = {1'b1, ``x}; \ use_``x = 1 `else `define USED_IREG(x) \ - x``_r = ``x; \ + x``_v = ``x; \ use_``x = 1 `endif -module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( +module VX_decode import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "" ) ( input wire clk, @@ -50,7 +50,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( reg [`EX_BITS-1:0] ex_type; reg [`INST_OP_BITS-1:0] op_type; op_args_t op_args; - reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r; + reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v; reg use_rd, use_rs1, use_rs2, use_rs3; reg is_wstall; @@ -152,13 +152,13 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( always @(*) begin - ex_type = '0; + ex_type = 'x; op_type = 'x; op_args = 'x; - rd_r = '0; - rs1_r = '0; - rs2_r = '0; - rs3_r = '0; + rd_v = '0; + rs1_v = '0; + rs2_v = '0; + rs3_v = '0; use_rd = 0; use_rs1 = 0; use_rs2 = 0; @@ -376,14 +376,16 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( `USED_IREG (rs2); end 
`ifdef EXT_F_ENABLE - `INST_FMADD, - `INST_FMSUB, - `INST_FNMSUB, - `INST_FNMADD: begin + `INST_FMADD, // 7'b1000011 + `INST_FMSUB, // 7'b1000111 + `INST_FNMSUB, // 7'b1001011 + `INST_FNMADD: // 7'b1001111 + begin ex_type = `EX_FPU; - op_type = `INST_OP_BITS'({2'b11, opcode[3:2]}); + op_type = `INST_OP_BITS'({2'b00, 1'b1, opcode[3]}); op_args.fpu.frm = func3; op_args.fpu.fmt[0] = func2[0]; // float / double + op_args.fpu.fmt[1] = opcode[3] ^ opcode[2]; // SUB use_rd = 1; `USED_FREG (rd); `USED_FREG (rs1); @@ -399,9 +401,10 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( case (func5) 5'b00000, // FADD 5'b00001, // FSUB - 5'b00010, // FMUL - 5'b00011: begin // FDIV - op_type = `INST_OP_BITS'(func5[1:0]); + 5'b00010: // FMUL + begin + op_type = `INST_OP_BITS'({2'b00, 1'b0, func5[1]}); + op_args.fpu.fmt[1] = func5[0]; // SUB `USED_FREG (rd); `USED_FREG (rs1); `USED_FREG (rs2); @@ -430,6 +433,13 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( `USED_FREG (rs1); end `endif + 5'b00011: begin + // FDIV + op_type = `INST_OP_BITS'(`INST_FPU_DIV); + `USED_FREG (rd); + `USED_FREG (rs1); + `USED_FREG (rs2); + end 5'b01011: begin // FSQRT op_type = `INST_OP_BITS'(`INST_FPU_SQRT); @@ -527,7 +537,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( end // disable write to integer register r0 - wire wb = use_rd && (rd_r != 0); + wire wb = use_rd && (rd_v != 0); VX_elastic_buffer #( .DATAW (DATAW), @@ -537,7 +547,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( .reset (reset), .valid_in (fetch_if.valid), .ready_in (fetch_if.ready), - .data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}), + .data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v}), .data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, 
decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}), .valid_out (decode_if.valid), .ready_out (decode_if.ready) @@ -547,9 +557,10 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire fetch_fire = fetch_if.valid && fetch_if.ready; - assign decode_sched_if.valid = fetch_fire; - assign decode_sched_if.wid = fetch_if.data.wid; - assign decode_sched_if.is_wstall = is_wstall; + assign decode_sched_if.valid = fetch_fire; + assign decode_sched_if.wid = fetch_if.data.wid; + assign decode_sched_if.unlock = ~is_wstall; + `ifndef L1_ENABLE assign fetch_if.ibuf_pop = decode_if.ibuf_pop; `endif @@ -557,14 +568,14 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #( `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (decode_if.valid && decode_if.ready) begin - `TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr)); + `TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr)) trace_ex_type(1, decode_if.data.ex_type); - `TRACE(1, (", op=")); + `TRACE(1, (", op=")) trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args); `TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b", - decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3)); + decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3)) trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args); - `TRACE(1, (" (#%0d)\n", decode_if.data.uuid)); + `TRACE(1, (" (#%0d)\n", decode_if.data.uuid)) end end `endif diff --git a/hw/rtl/core/VX_dispatch.sv 
b/hw/rtl/core/VX_dispatch.sv index 8ea3a6125..1c24fe46d 100644 --- a/hw/rtl/core/VX_dispatch.sv +++ b/hw/rtl/core/VX_dispatch.sv @@ -33,7 +33,7 @@ module VX_dispatch import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH; wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids; - for (genvar i = 0; i < `NUM_THREADS; ++i) begin + for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_tids assign tids[i] = `NT_WIDTH'(i); end @@ -50,23 +50,19 @@ module VX_dispatch import VX_gpu_pkg::*; #( `UNUSED_PIN (valid_out) ); - wire [`NUM_EX_UNITS-1:0] operands_reset; - assign operands_if.ready = operands_reset[operands_if.data.ex_type]; - - for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin - - `RESET_RELAY (buffer_reset, reset); + wire [`NUM_EX_UNITS-1:0] operands_ready_in; + assign operands_if.ready = operands_ready_in[operands_if.data.ex_type]; + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_buffers VX_elastic_buffer #( .DATAW (DATAW), .SIZE (2), - .OUT_REG (2), // 2-cycle EB for area reduction - .LUTRAM (1) + .OUT_REG (1) ) buffer ( .clk (clk), - .reset (buffer_reset), + .reset (reset), .valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))), - .ready_in (operands_reset[i]), + .ready_in (operands_ready_in[i]), .data_in ({ operands_if.data.uuid, operands_if.data.wis, @@ -92,7 +88,7 @@ module VX_dispatch import VX_gpu_pkg::*; #( wire operands_if_stall = operands_if.valid && ~operands_if.ready; - for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_stalls always @(posedge clk) begin if (reset) begin perf_stalls_r[i] <= '0; diff --git a/hw/rtl/core/VX_dispatch_unit.sv b/hw/rtl/core/VX_dispatch_unit.sv index 618ea1221..5d37d0578 100644 --- a/hw/rtl/core/VX_dispatch_unit.sv +++ b/hw/rtl/core/VX_dispatch_unit.sv @@ -49,13 +49,12 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire 
[`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data; wire [`ISSUE_WIDTH-1:0] dispatch_ready; - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_dispatch_data assign dispatch_valid[i] = dispatch_if[i].valid; assign dispatch_data[i] = dispatch_if[i].data; assign dispatch_if[i].ready = dispatch_ready[i]; end - wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices; wire [BLOCK_SIZE-1:0] block_ready; wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask; wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs; @@ -66,30 +65,53 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire batch_done = (& block_done); + // batch select logic + logic [BATCH_COUNT_W-1:0] batch_idx; - if (BATCH_COUNT != 1) begin + + if (BATCH_COUNT != 1) begin : g_batch_idx + wire [BATCH_COUNT_W-1:0] batch_idx_n; + wire [BATCH_COUNT-1:0] valid_batches; + for (genvar i = 0; i < BATCH_COUNT; ++i) begin : g_valid_batches + assign valid_batches[i] = | dispatch_valid[i * BLOCK_SIZE +: BLOCK_SIZE]; + end + + VX_generic_arbiter #( + .NUM_REQS (BATCH_COUNT), + .TYPE ("P") + ) batch_sel ( + .clk (clk), + .reset (reset), + .requests (valid_batches), + .grant_index (batch_idx_n), + `UNUSED_PIN (grant_onehot), + `UNUSED_PIN (grant_valid), + .grant_ready (batch_done) + ); + always @(posedge clk) begin if (reset) begin batch_idx <= '0; - end else begin - batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done); + end else if (batch_done) begin + batch_idx <= batch_idx_n; end end - end else begin + end else begin : g_batch_idx_0 assign batch_idx = 0; `UNUSED_VAR (batch_done) end - for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin + wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices; + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_issue_indices + assign issue_indices[block_idx] = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx); + end - wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + 
ISSUE_W'(block_idx); - assign issue_indices[block_idx] = issue_idx; - - `RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1)); + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_blocks + wire [ISSUE_W-1:0] issue_idx = issue_indices[block_idx]; wire valid_p, ready_p; - if (`NUM_THREADS != NUM_LANES) begin + if (`NUM_THREADS > NUM_LANES) begin : g_partial_threads reg [NUM_PACKETS-1:0] sent_mask_p; wire [PID_WIDTH-1:0] start_p_n, start_p, end_p; wire dispatch_valid_r; @@ -102,7 +124,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire fire_eop = fire_p && is_last_p; always @(posedge clk) begin - if (block_reset) begin + if (reset) begin sent_mask_p <= '0; is_first_p <= 1; end else begin @@ -124,8 +146,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; - for (genvar i = 0; i < NUM_PACKETS; ++i) begin - for (genvar j = 0; j < NUM_LANES; ++j) begin + for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_per_packet_data + for (genvar j = 0; j < NUM_LANES; ++j) begin : g_j localparam k = i * NUM_LANES + j; assign per_packet_tmask[i][j] = dispatch_tmask[k]; assign per_packet_regs[i][0][j] = dispatch_rs1_data[k]; @@ -135,10 +157,12 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( end wire [NUM_PACKETS-1:0] packet_valids; - wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids; - - for (genvar i = 0; i < NUM_PACKETS; ++i) begin + for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_valids assign packet_valids[i] = (| per_packet_tmask[i]); + end + + wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids; + for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_ids assign packet_ids[i] = PID_WIDTH'(i); end @@ -187,13 +211,13 @@ module VX_dispatch_unit import 
VX_gpu_pkg::*; #( assign block_pid[block_idx] = start_p; assign block_sop[block_idx] = is_first_p; assign block_eop[block_idx] = is_last_p; - if (FANOUT_ENABLE) begin + if (FANOUT_ENABLE) begin : g_block_ready_fanout assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable; - end else begin + end else begin : g_block_ready assign block_ready[block_idx] = ready_p && block_enable; end - assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop; - end else begin + assign block_done[block_idx] = fire_eop || ~dispatch_valid[issue_idx]; + end else begin : g_full_threads assign valid_p = dispatch_valid[issue_idx]; assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS]; assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; @@ -203,29 +227,31 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #( assign block_sop[block_idx] = 1'b1; assign block_eop[block_idx] = 1'b1; assign block_ready[block_idx] = ready_p; - assign block_done[block_idx] = ~valid_p || ready_p; + assign block_done[block_idx] = ready_p || ~valid_p; end wire [ISSUE_ISW_W-1:0] isw; - if (BATCH_COUNT != 1) begin - if (BLOCK_SIZE != 1) begin + if (BATCH_COUNT != 1) begin : g_isw_batch + if (BLOCK_SIZE != 1) begin : g_block assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)}; - end else begin + end else begin : g_no_block assign isw = batch_idx; end - end else begin + end else begin : g_isw assign isw = block_idx; end wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); + logic [OUT_DATAW-1:0] execute_data, execute_data_w; + VX_elastic_buffer #( .DATAW (OUT_DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) buf_out ( .clk (clk), - .reset (block_reset), + .reset (reset), .valid_in (valid_p), .ready_in (ready_p), .data_in ({ @@ -239,17 +265,27 @@ module VX_dispatch_unit import 
VX_gpu_pkg::*; #( block_pid[block_idx], block_sop[block_idx], block_eop[block_idx]}), - .data_out (execute_if[block_idx].data), + .data_out (execute_data), .valid_out (execute_if[block_idx].valid), .ready_out (execute_if[block_idx].ready) ); + + if (`NUM_THREADS != NUM_LANES) begin : g_execute_data_w_partial + assign execute_data_w = execute_data; + end else begin : g_execute_data_w_full + always @(*) begin + execute_data_w = execute_data; + execute_data_w[2:0] = {1'b0, 1'b1, 1'b1}; // default pid, sop, and eop + end + end + assign execute_if[block_idx].data = execute_data_w; end reg [`ISSUE_WIDTH-1:0] ready_in; always @(*) begin ready_in = 0; - for (integer i = 0; i < BLOCK_SIZE; ++i) begin - ready_in[issue_indices[i]] = block_ready[i] && block_eop[i]; + for (integer block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin + ready_in[issue_indices[block_idx]] = block_ready[block_idx] && block_eop[block_idx]; end end assign dispatch_ready = ready_in; diff --git a/hw/rtl/core/VX_execute.sv b/hw/rtl/core/VX_execute.sv index ded25918c..b2e38909a 100644 --- a/hw/rtl/core/VX_execute.sv +++ b/hw/rtl/core/VX_execute.sv @@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #( input wire reset, `ifdef PERF_ENABLE - VX_mem_perf_if.slave mem_perf_if, - VX_pipeline_perf_if.slave pipeline_perf_if, + input sysmem_perf_t sysmem_perf, + input pipeline_perf_t pipeline_perf, `endif input base_dcrs_t base_dcrs, @@ -51,41 +51,35 @@ module VX_execute import VX_gpu_pkg::*; #( VX_fpu_csr_if fpu_csr_if[`NUM_FPU_BLOCKS](); `endif - `RESET_RELAY (alu_reset, reset); - `RESET_RELAY (lsu_reset, reset); - `RESET_RELAY (sfu_reset, reset); - VX_alu_unit #( - .INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-alu", INSTANCE_ID))) ) alu_unit ( .clk (clk), - .reset (alu_reset), + .reset (reset), .dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), .commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), .branch_ctl_if (branch_ctl_if) ); - 
`SCOPE_IO_SWITCH (1) + `SCOPE_IO_SWITCH (1); VX_lsu_unit #( - .INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-lsu", INSTANCE_ID))) ) lsu_unit ( `SCOPE_IO_BIND (0) .clk (clk), - .reset (lsu_reset), + .reset (reset), .dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), .commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), .lsu_mem_if (lsu_mem_if) ); `ifdef EXT_F_ENABLE - `RESET_RELAY (fpu_reset, reset); - VX_fpu_unit #( - .INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-fpu", INSTANCE_ID))) ) fpu_unit ( .clk (clk), - .reset (fpu_reset), + .reset (reset), .dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), .commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), .fpu_csr_if (fpu_csr_if) @@ -93,14 +87,14 @@ module VX_execute import VX_gpu_pkg::*; #( `endif VX_sfu_unit #( - .INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-sfu", INSTANCE_ID))), .CORE_ID (CORE_ID) ) sfu_unit ( .clk (clk), - .reset (sfu_reset), + .reset (reset), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if), - .pipeline_perf_if (pipeline_perf_if), + .sysmem_perf (sysmem_perf), + .pipeline_perf (pipeline_perf), `endif .base_dcrs (base_dcrs), .dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]), diff --git a/hw/rtl/core/VX_fetch.sv b/hw/rtl/core/VX_fetch.sv index 043a87939..d441f06e5 100644 --- a/hw/rtl/core/VX_fetch.sv +++ b/hw/rtl/core/VX_fetch.sv @@ -51,8 +51,9 @@ module VX_fetch import VX_gpu_pkg::*; #( wire [`NUM_THREADS-1:0] rsp_tmask; VX_dp_ram #( - .DATAW (`PC_BITS + `NUM_THREADS), - .SIZE (`NUM_WARPS), + .DATAW (`PC_BITS + `NUM_THREADS), + .SIZE (`NUM_WARPS), + .RDW_MODE ("R"), .LUTRAM (1) ) tag_store ( .clk (clk), @@ -71,7 +72,7 @@ module VX_fetch import VX_gpu_pkg::*; #( // This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests. 
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus. wire [`NUM_WARPS-1:0] pending_ibuf_full; - for (genvar i = 0; i < `NUM_WARPS; ++i) begin + for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_reads VX_pending_size #( .SIZE (`IBUF_SIZE) ) pending_reads ( @@ -116,9 +117,9 @@ module VX_fetch import VX_gpu_pkg::*; #( .ready_out (icache_bus_if.req_ready) ); - assign icache_bus_if.req_data.atype = '0; + assign icache_bus_if.req_data.flags = '0; assign icache_bus_if.req_data.rw = 0; - assign icache_bus_if.req_data.byteen = 4'b1111; + assign icache_bus_if.req_data.byteen = '1; assign icache_bus_if.req_data.data = '0; // Icache Response @@ -131,47 +132,59 @@ module VX_fetch import VX_gpu_pkg::*; #( assign fetch_if.data.uuid = rsp_uuid; assign icache_bus_if.rsp_ready = fetch_if.ready; +`ifdef SCOPE `ifdef DBG_SCOPE_FETCH + `SCOPE_IO_SWITCH (1); wire schedule_fire = schedule_if.valid && schedule_if.ready; - wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; - VX_scope_tap #( - .SCOPE_ID (1), - .TRIGGERW (4), - .PROBEW (`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + - ICACHE_TAG_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH + - (ICACHE_WORD_SIZE*8) + ICACHE_TAG_WIDTH) - ) scope_tap ( - .clk (clk), - .reset (scope_reset), - .start (1'b0), - .stop (1'b0), - .triggers ({ - reset, + wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready; + wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; + wire reset_negedge; + `NEG_EDGE (reset_negedge, reset); + `SCOPE_TAP_EX (0, 1, 6, 3, ( + `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + + `UUID_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH + + `UUID_WIDTH + (ICACHE_WORD_SIZE * 8) + ), { + schedule_if.valid, + schedule_if.ready, + icache_bus_if.req_valid, + icache_bus_if.req_ready, + icache_bus_if.rsp_valid, + icache_bus_if.rsp_ready + }, { schedule_fire, - icache_req_fire, - 
icache_rsp_fire - }), - .probes ({ + icache_bus_req_fire, + icache_bus_rsp_fire + },{ schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, - icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, - icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag - }), - .bus_in (scope_bus_in), - .bus_out (scope_bus_out) + icache_bus_if.req_data.tag.uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, + icache_bus_if.rsp_data.tag.uuid, icache_bus_if.rsp_data.data + }, + reset_negedge, 1'b0, 4096 ); `else - `SCOPE_IO_UNUSED() + `SCOPE_IO_UNUSED(0) +`endif +`endif + +`ifdef CHIPSCOPE +`ifdef DBG_SCOPE_FETCH + ila_fetch ila_fetch_inst ( + .clk (clk), + .probe0 ({schedule_if.valid, schedule_if.data, schedule_if.ready}), + .probe1 ({icache_bus_if.req_valid, icache_bus_if.req_data, icache_bus_if.req_ready}), + .probe2 ({icache_bus_if.rsp_valid, icache_bus_if.rsp_data, icache_bus_if.rsp_ready}) + ); +`endif `endif `ifdef DBG_TRACE_MEM - wire schedule_fire = schedule_if.valid && schedule_if.ready; - wire fetch_fire = fetch_if.valid && fetch_if.ready; always @(posedge clk) begin - if (schedule_fire) begin - `TRACE(1, ("%d: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid)); + if (schedule_if.valid && schedule_if.ready) begin + `TRACE(1, ("%t: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid)) end - if (fetch_fire) begin - `TRACE(1, ("%d: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid)); + if (fetch_if.valid && fetch_if.ready) begin + `TRACE(1, ("%t: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, 
fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid)) end end `endif diff --git a/hw/rtl/core/VX_fpu_unit.sv b/hw/rtl/core/VX_fpu_unit.sv index 496b24e29..1565f3728 100644 --- a/hw/rtl/core/VX_fpu_unit.sv +++ b/hw/rtl/core/VX_fpu_unit.sv @@ -41,7 +41,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( VX_dispatch_unit #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_BUF (PARTIAL_BW ? 1 : 0) + .OUT_BUF (PARTIAL_BW ? 3 : 0) ) dispatch_unit ( .clk (clk), .reset (reset), @@ -53,12 +53,10 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .NUM_LANES (NUM_LANES) ) per_block_commit_if[BLOCK_SIZE](); - for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_fpus `UNUSED_VAR (per_block_execute_if[block_idx].data.tid) `UNUSED_VAR (per_block_execute_if[block_idx].data.wb) - `RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1)); - // Store request info wire fpu_req_valid, fpu_req_ready; wire fpu_rsp_valid, fpu_rsp_ready; @@ -71,9 +69,9 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( wire [NUM_LANES-1:0] fpu_rsp_tmask; wire [`PC_BITS-1:0] fpu_rsp_PC; wire [`NR_BITS-1:0] fpu_rsp_rd; - wire [PID_WIDTH-1:0] fpu_rsp_pid; - wire fpu_rsp_sop; - wire fpu_rsp_eop; + wire [PID_WIDTH-1:0] fpu_rsp_pid, fpu_rsp_pid_u; + wire fpu_rsp_sop, fpu_rsp_sop_u; + wire fpu_rsp_eop, fpu_rsp_eop_u; wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag; wire mdata_full; @@ -89,17 +87,30 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .SIZE (`FPUQ_SIZE) ) tag_store ( .clk (clk), - .reset (block_reset), + .reset (reset), .acquire_en (execute_fire), .write_addr (fpu_req_tag), .write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, 
per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}), - .read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}), + .read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid_u, fpu_rsp_sop_u, fpu_rsp_eop_u}), .read_addr (fpu_rsp_tag), .release_en (fpu_rsp_fire), .full (mdata_full), `UNUSED_PIN (empty) ); + if (PID_BITS != 0) begin : g_fpu_rsp_pid + assign fpu_rsp_pid = fpu_rsp_pid_u; + assign fpu_rsp_sop = fpu_rsp_sop_u; + assign fpu_rsp_eop = fpu_rsp_eop_u; + end else begin : g_no_fpu_rsp_pid + `UNUSED_VAR (fpu_rsp_pid_u) + `UNUSED_VAR (fpu_rsp_sop_u) + `UNUSED_VAR (fpu_rsp_eop_u) + assign fpu_rsp_pid = 0; + assign fpu_rsp_sop = 1; + assign fpu_rsp_eop = 1; + end + // resolve dynamic FRM from CSR wire [`INST_FRM_BITS-1:0] fpu_req_frm; `ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS) @@ -119,7 +130,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 1 : 3) ) fpu_dpi ( .clk (clk), - .reset (block_reset), + .reset (reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -148,7 +159,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 1 : 3) ) fpu_fpnew ( .clk (clk), - .reset (block_reset), + .reset (reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -177,7 +188,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .OUT_BUF (PARTIAL_BW ? 
1 : 3) ) fpu_dsp ( .clk (clk), - .reset (block_reset), + .reset (reset), .valid_in (fpu_req_valid), .mask_in (per_block_execute_if[block_idx].data.tmask), @@ -200,27 +211,38 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( `endif - // handle FPU response - + // handle CSR update fflags_t fpu_rsp_fflags_q; - if (PID_BITS != 0) begin + if (PID_BITS != 0) begin : g_pid fflags_t fpu_rsp_fflags_r; always @(posedge clk) begin - if (block_reset) begin + if (reset) begin fpu_rsp_fflags_r <= '0; end else if (fpu_rsp_fire) begin fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags); end end assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags; - end else begin + end else begin : g_no_pid assign fpu_rsp_fflags_q = fpu_rsp_fflags; end - assign fpu_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags; - `ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS) - assign fpu_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q; + VX_fpu_csr_if fpu_csr_tmp_if(); + assign fpu_csr_tmp_if.write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags; + `ASSIGN_BLOCKED_WID (fpu_csr_tmp_if.write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS) + assign fpu_csr_tmp_if.write_fflags = fpu_rsp_fflags_q; + + VX_pipe_register #( + .DATAW (1 + `NW_WIDTH + $bits(fflags_t)), + .RESETW (1) + ) fpu_csr_reg ( + .clk (clk), + .reset (reset), + .enable (1'b1), + .data_in ({fpu_csr_tmp_if.write_enable, fpu_csr_tmp_if.write_wid, fpu_csr_tmp_if.write_fflags}), + .data_out ({fpu_csr_if[block_idx].write_enable, fpu_csr_if[block_idx].write_wid, fpu_csr_if[block_idx].write_fflags}) + ); // send response @@ -229,7 +251,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( .SIZE (0) ) rsp_buf ( .clk (clk), - .reset (block_reset), + .reset (reset), .valid_in (fpu_rsp_valid), .ready_in (fpu_rsp_ready), .data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, 
fpu_rsp_eop}), diff --git a/hw/rtl/core/VX_gather_unit.sv b/hw/rtl/core/VX_gather_unit.sv index 293495eba..284d5c167 100644 --- a/hw/rtl/core/VX_gather_unit.sv +++ b/hw/rtl/core/VX_gather_unit.sv @@ -41,17 +41,17 @@ module VX_gather_unit import VX_gpu_pkg::*; #( wire [BLOCK_SIZE-1:0] commit_in_ready; wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw; - for (genvar i = 0; i < BLOCK_SIZE; ++i) begin + for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in assign commit_in_valid[i] = commit_in_if[i].valid; assign commit_in_data[i] = commit_in_if[i].data; assign commit_in_if[i].ready = commit_in_ready[i]; - if (BLOCK_SIZE != `ISSUE_WIDTH) begin - if (BLOCK_SIZE != 1) begin + if (BLOCK_SIZE != `ISSUE_WIDTH) begin : g_commit_in_isw_partial + if (BLOCK_SIZE != 1) begin : g_block assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; - end else begin + end else begin : g_no_block assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W]; end - end else begin + end else begin : g_commit_in_isw_full assign commit_in_isw[i] = BLOCK_SIZE_W'(i); end end @@ -70,11 +70,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #( commit_out_data[commit_in_isw[i]] = commit_in_data[i]; end end - for (genvar i = 0; i < BLOCK_SIZE; ++i) begin + + for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in_ready assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]]; end - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin: g_out_bufs VX_commit_if #( .NUM_LANES (NUM_LANES) ) commit_tmp_if(); @@ -94,31 +95,31 @@ module VX_gather_unit import VX_gpu_pkg::*; #( .ready_out (commit_tmp_if.ready) ); - logic [`NUM_THREADS-1:0] commit_tmask_r; - logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_r; - if (PID_BITS != 0) begin + logic [`NUM_THREADS-1:0] commit_tmask_w; + logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_w; + if (PID_BITS != 0) begin : 
g_commit_data_with_pid always @(*) begin - commit_tmask_r = '0; - commit_data_r = 'x; + commit_tmask_w = '0; + commit_data_w = 'x; for (integer j = 0; j < NUM_LANES; ++j) begin - commit_tmask_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j]; - commit_data_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j]; + commit_tmask_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j]; + commit_data_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j]; end end - end else begin - assign commit_tmask_r = commit_tmp_if.data.tmask; - assign commit_data_r = commit_tmp_if.data.data; + end else begin : g_commit_data_no_pid + assign commit_tmask_w = commit_tmp_if.data.tmask; + assign commit_data_w = commit_tmp_if.data.data; end assign commit_out_if[i].valid = commit_tmp_if.valid; assign commit_out_if[i].data = { commit_tmp_if.data.uuid, commit_tmp_if.data.wid, - commit_tmask_r, + commit_tmask_w, commit_tmp_if.data.PC, commit_tmp_if.data.wb, commit_tmp_if.data.rd, - commit_data_r, + commit_data_w, 1'b0, // PID commit_tmp_if.data.sop, commit_tmp_if.data.eop diff --git a/hw/rtl/core/VX_ibuffer.sv b/hw/rtl/core/VX_ibuffer.sv index e8edf64c7..abb261b7e 100644 --- a/hw/rtl/core/VX_ibuffer.sv +++ b/hw/rtl/core/VX_ibuffer.sv @@ -35,11 +35,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #( wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in; assign decode_if.ready = ibuf_ready_in[decode_if.data.wid]; - for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin + for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_instr_bufs VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`IBUF_SIZE), - .OUT_REG (2) // 2-cycle EB for area reduction + .OUT_REG (1) ) instr_buf ( .clk (clk), .reset (reset), diff --git a/hw/rtl/core/VX_ipdom_stack.sv b/hw/rtl/core/VX_ipdom_stack.sv index 0ec05cbae..6bec14504 100644 --- a/hw/rtl/core/VX_ipdom_stack.sv +++ b/hw/rtl/core/VX_ipdom_stack.sv @@ -16,7 +16,6 @@ module VX_ipdom_stack #( parameter WIDTH = 1, parameter 
DEPTH = 1, - parameter OUT_REG = 0, parameter ADDRW = `LOG2UP(DEPTH) ) ( input wire clk, @@ -31,76 +30,63 @@ module VX_ipdom_stack #( output wire empty, output wire full ); - reg slot_set [DEPTH-1:0]; - - reg [ADDRW-1:0] rd_ptr, wr_ptr; + reg [ADDRW-1:0] rd_ptr, rd_ptr_n, wr_ptr; reg empty_r, full_r; wire [WIDTH-1:0] d0, d1; - wire d_set_n = slot_set[rd_ptr]; + wire d_set_r; + + always @(*) begin + rd_ptr_n = rd_ptr; + if (push) begin + rd_ptr_n = wr_ptr; + end else if (pop) begin + rd_ptr_n = rd_ptr - ADDRW'(d_set_r); + end + end always @(posedge clk) begin if (reset) begin - rd_ptr <= '0; wr_ptr <= '0; empty_r <= 1; full_r <= 0; + rd_ptr <= '0; end else begin - `ASSERT(~push || ~full, ("runtime error: writing to a full stack!")); - `ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!")); - `ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!")); + `ASSERT(~push || ~full, ("%t: runtime error: writing to a full stack!", $time)); + `ASSERT(~pop || ~empty, ("%t: runtime error: reading an empty stack!", $time)); + `ASSERT(~push || ~pop, ("%t: runtime error: push and pop in same cycle not supported!", $time)); if (push) begin - rd_ptr <= wr_ptr; wr_ptr <= wr_ptr + ADDRW'(1); empty_r <= 0; full_r <= (ADDRW'(DEPTH-1) == wr_ptr); end else if (pop) begin - wr_ptr <= wr_ptr - ADDRW'(d_set_n); - rd_ptr <= rd_ptr - ADDRW'(d_set_n); - empty_r <= (rd_ptr == 0) && (d_set_n == 1); + wr_ptr <= wr_ptr - ADDRW'(d_set_r); + empty_r <= (rd_ptr == 0) && d_set_r; full_r <= 0; end + rd_ptr <= rd_ptr_n; end end + wire [WIDTH * 2:0] qout = push ? {1'b0, q1, q0} : {1'b1, d1, d0}; + VX_dp_ram #( - .DATAW (WIDTH * 2), - .SIZE (DEPTH), - .OUT_REG (OUT_REG ? 1 : 0), - .LUTRAM (OUT_REG ? 
0 : 1) - ) store ( + .DATAW (1 + WIDTH * 2), + .SIZE (DEPTH), + .OUT_REG (1), + .RDW_MODE ("R") + ) ipdom_store ( .clk (clk), .reset (reset), .read (1'b1), - .write (push), + .write (push || pop), .wren (1'b1), - .waddr (wr_ptr), - .wdata ({q1, q0}), - .raddr (rd_ptr), - .rdata ({d1, d0}) - ); - - always @(posedge clk) begin - if (push) begin - slot_set[wr_ptr] <= 0; - end else if (pop) begin - slot_set[rd_ptr] <= 1; - end - end - - wire d_set_r; - - VX_pipe_register #( - .DATAW (1), - .DEPTH (OUT_REG) - ) pipe_reg ( - .clk (clk), - .reset (reset), - .enable (1'b1), - .data_in (d_set_n), - .data_out (d_set_r) + .waddr (push ? wr_ptr : rd_ptr), + .wdata (qout), + .raddr (rd_ptr_n), + .rdata ({d_set_r, d1, d0}) ); assign d = d_set_r ? d0 : d1; diff --git a/hw/rtl/core/VX_issue.sv b/hw/rtl/core/VX_issue.sv index 1480e6649..924d1a67d 100644 --- a/hw/rtl/core/VX_issue.sv +++ b/hw/rtl/core/VX_issue.sv @@ -29,16 +29,17 @@ module VX_issue import VX_gpu_pkg::*; #( VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH] ); + `STATIC_ASSERT ((`ISSUE_WIDTH <= `NUM_WARPS), ("invalid parameter")) `ifdef PERF_ENABLE issue_perf_t per_issue_perf [`ISSUE_WIDTH]; `PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2)) `PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2)) `PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2)) - for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_issue_perf_units_uses `PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2)) end - for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin + for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_issue_perf_sfu_uses `PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, 
`ISSUE_WIDTH, (`ISSUE_WIDTH > 2)) end `endif @@ -49,9 +50,9 @@ module VX_issue import VX_gpu_pkg::*; #( wire [`ISSUE_WIDTH-1:0] decode_ready_in; assign decode_if.ready = decode_ready_in[decode_isw]; - `SCOPE_IO_SWITCH (`ISSUE_WIDTH) + `SCOPE_IO_SWITCH (`ISSUE_WIDTH); - for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : issue_slices + for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : g_slices VX_decode_if #( .NUM_WARPS (PER_ISSUE_WARPS) ) per_issue_decode_if(); @@ -76,15 +77,13 @@ module VX_issue import VX_gpu_pkg::*; #( assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop; `endif - `RESET_RELAY (slice_reset, reset); - VX_issue_slice #( - .INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)), + .INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, issue_id))), .ISSUE_ID (issue_id) ) issue_slice ( `SCOPE_IO_BIND(issue_id) .clk (clk), - .reset (slice_reset), + .reset (reset), `ifdef PERF_ENABLE .issue_perf (per_issue_perf[issue_id]), `endif @@ -94,7 +93,7 @@ module VX_issue import VX_gpu_pkg::*; #( ); // Assign transposed dispatch_if - for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin + for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin : g_dispatch_if `ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]); end end diff --git a/hw/rtl/core/VX_issue_slice.sv b/hw/rtl/core/VX_issue_slice.sv index 03b91b5fe..29c610880 100644 --- a/hw/rtl/core/VX_issue_slice.sv +++ b/hw/rtl/core/VX_issue_slice.sv @@ -13,7 +13,7 @@ `include "VX_define.vh" -module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( +module VX_issue_slice import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", parameter ISSUE_ID = 0 ) ( @@ -36,16 +36,11 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( VX_scoreboard_if scoreboard_if(); VX_operands_if operands_if(); - `RESET_RELAY (ibuf_reset, reset); - `RESET_RELAY (scoreboard_reset, 
reset); - `RESET_RELAY (operands_reset, reset); - `RESET_RELAY (dispatch_reset, reset); - VX_ibuffer #( - .INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-ibuffer", INSTANCE_ID))) ) ibuffer ( .clk (clk), - .reset (ibuf_reset), + .reset (reset), `ifdef PERF_ENABLE .perf_stalls (issue_perf.ibf_stalls), `endif @@ -54,10 +49,10 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( ); VX_scoreboard #( - .INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-scoreboard", INSTANCE_ID))) ) scoreboard ( .clk (clk), - .reset (scoreboard_reset), + .reset (reset), `ifdef PERF_ENABLE .perf_stalls (issue_perf.scb_stalls), .perf_units_uses(issue_perf.units_uses), @@ -69,10 +64,10 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( ); VX_operands #( - .INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-operands", INSTANCE_ID))) ) operands ( .clk (clk), - .reset (operands_reset), + .reset (reset), `ifdef PERF_ENABLE .perf_stalls (issue_perf.opd_stalls), `endif @@ -82,10 +77,10 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( ); VX_dispatch #( - .INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID)) + .INSTANCE_ID (`SFORMATF(("%s-dispatch", INSTANCE_ID))) ) dispatch ( .clk (clk), - .reset (dispatch_reset), + .reset (reset), `ifdef PERF_ENABLE `UNUSED_PIN (perf_stalls), `endif @@ -93,65 +88,90 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( .dispatch_if (dispatch_if) ); +`ifdef SCOPE `ifdef DBG_SCOPE_ISSUE - wire operands_if_fire = operands_if.valid && operands_if.ready; - wire operands_if_not_ready = ~operands_if.ready; - wire writeback_if_valid = writeback_if.valid; - VX_scope_tap #( - .SCOPE_ID (2), - .TRIGGERW (4), - .PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS + - 1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) + - `UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1) - ) scope_tap ( - .clk (clk), 
- .reset (scope_reset), - .start (1'b0), - .stop (1'b0), - .triggers ({ - reset, - operands_if_fire, - operands_if_not_ready, - writeback_if_valid - }), - .probes ({ + `SCOPE_IO_SWITCH (1); + wire decode_fire = decode_if.valid && decode_if.ready; + wire operands_fire = operands_if.valid && operands_if.ready; + wire reset_negedge; + `NEG_EDGE (reset_negedge, reset); + `SCOPE_TAP_EX (0, 2, 4, 3, ( + `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 + + `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS + (3 * `XLEN) + + `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + ), { + decode_if.valid, + decode_if.ready, + operands_if.valid, + operands_if.ready + }, { + decode_fire, + operands_fire, + writeback_if.valid // ack-free + }, { + decode_if.data.uuid, + decode_if.data.wid, + decode_if.data.tmask, + decode_if.data.PC, + decode_if.data.ex_type, + decode_if.data.op_type, + decode_if.data.wb, + decode_if.data.rd, + decode_if.data.rs1, + decode_if.data.rs2, + decode_if.data.rs3, operands_if.data.uuid, + operands_if.data.wis, operands_if.data.tmask, + operands_if.data.PC, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.wb, operands_if.data.rd, - operands_if.data.rs1_data, - operands_if.data.rs2_data, - operands_if.data.rs3_data, + operands_if.data.rs1_data[0], + operands_if.data.rs2_data[0], + operands_if.data.rs3_data[0], writeback_if.data.uuid, + writeback_if.data.wis, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.data, writeback_if.data.eop - }), - .bus_in (scope_bus_in), - .bus_out (scope_bus_out) + }, + reset_negedge, 1'b0, 4096 ); `else - `SCOPE_IO_UNUSED() + `SCOPE_IO_UNUSED(0) +`endif +`endif + +`ifdef CHIPSCOPE +`ifdef DBG_SCOPE_ISSUE + ila_issue ila_issue_inst ( + .clk (clk), + .probe0 ({decode_if.valid, decode_if.data, decode_if.ready}), + .probe1 ({scoreboard_if.valid, scoreboard_if.data, 
scoreboard_if.ready}), + .probe2 ({operands_if.valid, operands_if.data, operands_if.ready}), + .probe3 ({writeback_if.valid, writeback_if.data}) + ); +`endif `endif `ifdef DBG_TRACE_PIPELINE always @(posedge clk) begin if (operands_if.valid && operands_if.ready) begin - `TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0})); + `TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0})) trace_ex_type(1, operands_if.data.ex_type); - `TRACE(1, (", op=")); + `TRACE(1, (", op=")) trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args); - `TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd)); - `TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS); - `TRACE(1, (", rs2_data=")); - `TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS); - `TRACE(1, (", rs3_data=")); - `TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS); + `TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd)) + `TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS) + `TRACE(1, (", rs2_data=")) + `TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS) + `TRACE(1, (", rs3_data=")) + `TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS) trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args); - `TRACE(1, (" (#%0d)\n", operands_if.data.uuid)); + `TRACE(1, (" (#%0d)\n", operands_if.data.uuid)) end end `endif diff --git a/hw/rtl/core/VX_issue_top.sv b/hw/rtl/core/VX_issue_top.sv index 0166cf770..2d81ee044 100644 --- a/hw/rtl/core/VX_issue_top.sv +++ b/hw/rtl/core/VX_issue_top.sv @@ -80,7 +80,7 @@ module VX_issue_top import VX_gpu_pkg::*; #( assign decode_if.data.rs3 = 
decode_rs3; assign decode_ready = decode_if.ready; - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback_if assign writeback_if[i].valid = writeback_valid[i]; assign writeback_if[i].data.uuid = writeback_uuid[i]; assign writeback_if[i].data.wis = writeback_wis[i]; @@ -92,7 +92,7 @@ module VX_issue_top import VX_gpu_pkg::*; #( assign writeback_if[i].data.eop = writeback_eop[i]; end - for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin + for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin : g_dispatch_if assign dispatch_valid[i] = dispatch_if[i].valid; assign dispatch_uuid[i] = dispatch_if[i].data.uuid; assign dispatch_wis[i] = dispatch_if[i].data.wis; @@ -113,6 +113,13 @@ module VX_issue_top import VX_gpu_pkg::*; #( issue_perf_t issue_perf = '0; `endif +`ifdef SCOPE + wire [0:0] scope_reset_w = 1'b0; + wire [0:0] scope_bus_in_w = 1'b0; + wire [0:0] scope_bus_out_w; + `UNUSED_VAR (scope_bus_out_w) +`endif + VX_issue #( .INSTANCE_ID (INSTANCE_ID) ) issue ( diff --git a/hw/rtl/core/VX_lmem_unit.sv b/hw/rtl/core/VX_lmem_unit.sv deleted file mode 100644 index accb7a586..000000000 --- a/hw/rtl/core/VX_lmem_unit.sv +++ /dev/null @@ -1,201 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -`include "VX_define.vh" - -module VX_lmem_unit import VX_gpu_pkg::*; #( - parameter `STRING INSTANCE_ID = "" -) ( - input wire clk, - input wire reset, - -`ifdef PERF_ENABLE - output cache_perf_t cache_perf, -`endif - - VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS], - VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS] -); - `STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter")) - `STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter")) - - localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `ADDR_TYPE_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH; - localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH; - localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE); - - VX_lsu_mem_if #( - .NUM_LANES (`NUM_LSU_LANES), - .DATA_SIZE (LSU_WORD_SIZE), - .TAG_WIDTH (LSU_TAG_WIDTH) - ) lsu_switch_if[`NUM_LSU_BLOCKS](); - - `RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1); - - for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin - - wire [`NUM_LSU_LANES-1:0] is_addr_local_mask; - for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin - assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL]; - end - - wire is_addr_global = | (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask); - wire is_addr_local = | (lsu_mem_in_if[i].req_data.mask & is_addr_local_mask); - - wire req_global_ready; - wire req_local_ready; - - VX_elastic_buffer #( - .DATAW (REQ_DATAW), - .SIZE (2), - .OUT_REG (1) - ) req_global_buf ( - .clk (clk), - .reset (block_reset[i]), - .valid_in (lsu_mem_in_if[i].req_valid && is_addr_global), - .data_in ({ - lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask, - lsu_mem_in_if[i].req_data.rw, - lsu_mem_in_if[i].req_data.byteen, - lsu_mem_in_if[i].req_data.addr, - lsu_mem_in_if[i].req_data.atype, - lsu_mem_in_if[i].req_data.data, - lsu_mem_in_if[i].req_data.tag - }), - 
.ready_in (req_global_ready), - .valid_out (lsu_mem_out_if[i].req_valid), - .data_out ({ - lsu_mem_out_if[i].req_data.mask, - lsu_mem_out_if[i].req_data.rw, - lsu_mem_out_if[i].req_data.byteen, - lsu_mem_out_if[i].req_data.addr, - lsu_mem_out_if[i].req_data.atype, - lsu_mem_out_if[i].req_data.data, - lsu_mem_out_if[i].req_data.tag - }), - .ready_out (lsu_mem_out_if[i].req_ready) - ); - - VX_elastic_buffer #( - .DATAW (REQ_DATAW), - .SIZE (0), - .OUT_REG (0) - ) req_local_buf ( - .clk (clk), - .reset (block_reset[i]), - .valid_in (lsu_mem_in_if[i].req_valid && is_addr_local), - .data_in ({ - lsu_mem_in_if[i].req_data.mask & is_addr_local_mask, - lsu_mem_in_if[i].req_data.rw, - lsu_mem_in_if[i].req_data.byteen, - lsu_mem_in_if[i].req_data.addr, - lsu_mem_in_if[i].req_data.atype, - lsu_mem_in_if[i].req_data.data, - lsu_mem_in_if[i].req_data.tag - }), - .ready_in (req_local_ready), - .valid_out (lsu_switch_if[i].req_valid), - .data_out ({ - lsu_switch_if[i].req_data.mask, - lsu_switch_if[i].req_data.rw, - lsu_switch_if[i].req_data.byteen, - lsu_switch_if[i].req_data.addr, - lsu_switch_if[i].req_data.atype, - lsu_switch_if[i].req_data.data, - lsu_switch_if[i].req_data.tag - }), - .ready_out (lsu_switch_if[i].req_ready) - ); - - assign lsu_mem_in_if[i].req_ready = (req_global_ready && is_addr_global) - || (req_local_ready && is_addr_local); - - VX_stream_arb #( - .NUM_INPUTS (2), - .DATAW (RSP_DATAW), - .ARBITER ("R"), - .OUT_BUF (1) - ) rsp_arb ( - .clk (clk), - .reset (block_reset[i]), - .valid_in ({ - lsu_switch_if[i].rsp_valid, - lsu_mem_out_if[i].rsp_valid - }), - .ready_in ({ - lsu_switch_if[i].rsp_ready, - lsu_mem_out_if[i].rsp_ready - }), - .data_in ({ - lsu_switch_if[i].rsp_data, - lsu_mem_out_if[i].rsp_data - }), - .data_out (lsu_mem_in_if[i].rsp_data), - .valid_out (lsu_mem_in_if[i].rsp_valid), - .ready_out (lsu_mem_in_if[i].rsp_ready), - `UNUSED_PIN (sel_out) - ); - end - - VX_mem_bus_if #( - .DATA_SIZE (LSU_WORD_SIZE), - .TAG_WIDTH (LSU_TAG_WIDTH) - ) 
lmem_bus_if[LSU_NUM_REQS](); - - for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin - VX_mem_bus_if #( - .DATA_SIZE (LSU_WORD_SIZE), - .TAG_WIDTH (LSU_TAG_WIDTH) - ) lmem_bus_tmp_if[`NUM_LSU_LANES](); - - VX_lsu_adapter #( - .NUM_LANES (`NUM_LSU_LANES), - .DATA_SIZE (LSU_WORD_SIZE), - .TAG_WIDTH (LSU_TAG_WIDTH), - .TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH), - .ARBITER ("P"), - .REQ_OUT_BUF (3), - .RSP_OUT_BUF (0) - ) lsu_adapter ( - .clk (clk), - .reset (block_reset[i]), - .lsu_mem_if (lsu_switch_if[i]), - .mem_bus_if (lmem_bus_tmp_if) - ); - - for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin - `ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]); - end - end - - `RESET_RELAY (lmem_reset, reset); - - VX_local_mem #( - .INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)), - .SIZE (1 << `LMEM_LOG_SIZE), - .NUM_REQS (LSU_NUM_REQS), - .NUM_BANKS (`LMEM_NUM_BANKS), - .WORD_SIZE (LSU_WORD_SIZE), - .ADDR_WIDTH (LMEM_ADDR_WIDTH), - .UUID_WIDTH (`UUID_WIDTH), - .TAG_WIDTH (LSU_TAG_WIDTH), - .OUT_BUF (3) - ) local_mem ( - .clk (clk), - .reset (lmem_reset), - `ifdef PERF_ENABLE - .cache_perf (cache_perf), - `endif - .mem_bus_if (lmem_bus_if) - ); - -endmodule diff --git a/hw/rtl/core/VX_lsu_slice.sv b/hw/rtl/core/VX_lsu_slice.sv index 8c685fca2..7c5b11017 100644 --- a/hw/rtl/core/VX_lsu_slice.sv +++ b/hw/rtl/core/VX_lsu_slice.sv @@ -13,7 +13,7 @@ `include "VX_define.vh" -module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( +module VX_lsu_slice import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "" ) ( `SCOPE_IO_DECL @@ -59,25 +59,25 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire req_is_fence, rsp_is_fence; wire [NUM_LANES-1:0][`XLEN-1:0] full_addr; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_full_addr assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset); end // address type calculation - wire 
[NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype; - for (genvar i = 0; i < NUM_LANES; ++i) begin + wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags; + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_flags wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW]; // is I/O address wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT); wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT); - assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence; - assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end); + assign mem_req_flags[i][`MEM_REQ_FLAG_FLUSH] = req_is_fence; + assign mem_req_flags[i][`MEM_REQ_FLAG_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end); `ifdef LMEM_ENABLE // is local memory address wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT); wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT); - assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end); + assign mem_req_flags[i][`MEM_REQ_FLAG_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end); `endif end @@ -102,8 +102,6 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire mem_req_fire = mem_req_valid && mem_req_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; - `UNUSED_VAR (mem_req_fire) - `UNUSED_VAR (mem_rsp_fire) wire mem_rsp_sop_pkt, mem_rsp_eop_pkt; wire no_rsp_buf_valid, no_rsp_buf_ready; @@ -151,49 +149,49 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_addr assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0]; assign mem_req_addr[i] = 
full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT]; end // byte enable formatting - for (genvar i = 0; i < NUM_LANES; ++i) begin - reg [LSU_WORD_SIZE-1:0] mem_req_byteen_r; + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_byteen_w + reg [LSU_WORD_SIZE-1:0] mem_req_byteen_w; always @(*) begin - mem_req_byteen_r = '0; + mem_req_byteen_w = '0; case (`INST_LSU_WSIZE(execute_if.data.op_type)) 0: begin // 8-bit - mem_req_byteen_r[req_align[i]] = 1'b1; + mem_req_byteen_w[req_align[i]] = 1'b1; end 1: begin // 16 bit - mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1; - mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1; + mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1; + mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1; end `ifdef XLEN_64 2: begin // 32 bit - mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1; - mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1; - mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1; - mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1; + mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1; + mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1; + mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1; + mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1; end `endif // 3: 64 bit - default : mem_req_byteen_r = {LSU_WORD_SIZE{1'b1}}; + default : mem_req_byteen_w = {LSU_WORD_SIZE{1'b1}}; endcase end - assign mem_req_byteen[i] = mem_req_byteen_r; + assign mem_req_byteen[i] = mem_req_byteen_w; end // memory misalignment not supported! 
- for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_missalign wire lsu_req_fire = execute_if.valid && execute_if.ready; `RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0), - ("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)", - execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid)); + ("%t: misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)", + $time, execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid)) end // store data formatting - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_data always @(*) begin mem_req_data[i] = execute_if.data.rs2_data[i]; case (req_align[i]) @@ -215,7 +213,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr; - if (PID_BITS != 0) begin + if (PID_BITS != 0) begin : g_pids reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr; reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop; @@ -271,10 +269,10 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr]; assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1); - `RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!")) - `RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!")) + `RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("%t: allocator full!", $time)) + `RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("%t: oops! 
broken sop request!", $time)) `UNUSED_VAR (mem_rsp_sop) - end else begin + end else begin : g_no_pids assign pkt_waddr = 0; assign mem_rsp_sop_pkt = mem_rsp_sop; assign mem_rsp_eop_pkt = mem_rsp_eop; @@ -300,7 +298,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire [NUM_LANES-1:0] lsu_mem_req_mask; wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen; wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr; - wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype; + wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_mem_req_flags; wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data; wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag; wire lsu_mem_req_ready; @@ -311,16 +309,14 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag; wire lsu_mem_rsp_ready; - `RESET_RELAY (mem_scheduler_reset, reset); - VX_mem_scheduler #( - .INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-memsched", INSTANCE_ID))), .CORE_REQS (NUM_LANES), .MEM_CHANNELS(NUM_LANES), .WORD_SIZE (LSU_WORD_SIZE), .LINE_SIZE (LSU_WORD_SIZE), .ADDR_WIDTH (LSU_ADDR_WIDTH), - .ATYPE_WIDTH (`ADDR_TYPE_WIDTH), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), .TAG_WIDTH (TAG_WIDTH), .CORE_QUEUE_SIZE (`LSUQ_IN_SIZE), .MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE), @@ -330,7 +326,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( .CORE_OUT_BUF(0) ) mem_scheduler ( .clk (clk), - .reset (mem_scheduler_reset), + .reset (reset), // Input request .core_req_valid (mem_req_valid), @@ -338,12 +334,12 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( .core_req_mask (mem_req_mask), .core_req_byteen(mem_req_byteen), .core_req_addr (mem_req_addr), - .core_req_atype (mem_req_atype), + .core_req_flags (mem_req_flags), .core_req_data (mem_req_data), .core_req_tag (mem_req_tag), .core_req_ready (mem_req_ready), `UNUSED_PIN (core_req_empty), - `UNUSED_PIN (core_req_sent), + `UNUSED_PIN 
(core_req_wr_notify), // Output response .core_rsp_valid (mem_rsp_valid), @@ -360,7 +356,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( .mem_req_mask (lsu_mem_req_mask), .mem_req_byteen (lsu_mem_req_byteen), .mem_req_addr (lsu_mem_req_addr), - .mem_req_atype (lsu_mem_req_atype), + .mem_req_flags (lsu_mem_req_flags), .mem_req_data (lsu_mem_req_data), .mem_req_tag (lsu_mem_req_tag), .mem_req_ready (lsu_mem_req_ready), @@ -378,7 +374,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( assign lsu_mem_if.req_data.rw = lsu_mem_req_rw; assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen; assign lsu_mem_if.req_data.addr = lsu_mem_req_addr; - assign lsu_mem_if.req_data.atype = lsu_mem_req_atype; + assign lsu_mem_if.req_data.flags = lsu_mem_req_flags; assign lsu_mem_if.req_data.data = lsu_mem_req_data; assign lsu_mem_if.req_data.tag = lsu_mem_req_tag; assign lsu_mem_req_ready = lsu_mem_if.req_ready; @@ -426,7 +422,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( `endif `endif - for (genvar i = 0; i < NUM_LANES; i++) begin + for (genvar i = 0; i < NUM_LANES; i++) begin : g_rsp_data `ifdef XLEN_64 wire [63:0] rsp_data64 = mem_rsp_data[i]; wire [31:0] rsp_data32 = (rsp_align[i][2] ? 
mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]); @@ -483,6 +479,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( .valid_out (commit_no_rsp_if.valid), .ready_out (commit_no_rsp_if.ready) ); + assign commit_no_rsp_if.data.rd = '0; assign commit_no_rsp_if.data.wb = 1'b0; assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization @@ -507,51 +504,74 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #( `ifdef DBG_TRACE_MEM always @(posedge clk) begin if (execute_if.valid && fence_lock) begin - `TRACE(1, ("%d: *** %s fence wait\n", $time, INSTANCE_ID)); + `TRACE(2, ("%t: *** %s fence wait\n", $time, INSTANCE_ID)) end if (mem_req_fire) begin if (mem_req_rw) begin - `TRACE(1, ("%d: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)); - `TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES); - `TRACE(1, (", atype=")); - `TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES); - `TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen)); - `TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES); - `TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid)); + `TRACE(2, ("%t: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES) + `TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES) + `TRACE(2, (", byteen=0x%0h, data=", mem_req_byteen)) + `TRACE_ARRAY1D(2, "0x%0h", mem_req_data, NUM_LANES) + `TRACE(2, (", sop=%b, eop=%b, tag=0x%0h (#%0d)\n", execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid)) end else begin - `TRACE(1, ("%d: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)); - `TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES); - `TRACE(1, (", atype=")); - `TRACE_ARRAY1D(1, "%b", mem_req_atype, 
NUM_LANES); - `TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid)); + `TRACE(2, ("%t: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES) + `TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES) + `TRACE(2, (", byteen=0x%0h, rd=%0d, sop=%b, eop=%b, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid)) end end if (mem_rsp_fire) begin - `TRACE(1, ("%d: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=", - $time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop)); - `TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES); - `TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid)); + `TRACE(2, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=", + $time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop)) + `TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data, NUM_LANES) + `TRACE(2, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid)) end end `endif +`ifdef SCOPE `ifdef DBG_SCOPE_LSU - VX_scope_tap #( - .SCOPE_ID (3), - .TRIGGERW (3), - .PROBEW (1 + NUM_LANES*(`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE*8) + `UUID_WIDTH + NUM_LANES*LSU_WORD_SIZE*8 + `UUID_WIDTH) - ) scope_tap ( - .clk (clk), - .reset (scope_reset), - .start (1'b0), - .stop (1'b0), - .triggers({reset, mem_req_fire, mem_rsp_fire}), - .probes ({mem_req_rw, full_addr, mem_req_byteen, mem_req_data, execute_if.data.uuid, rsp_data, rsp_uuid}), - .bus_in (scope_bus_in), - .bus_out(scope_bus_out) + `SCOPE_IO_SWITCH (1); + wire reset_negedge; + `NEG_EDGE (reset_negedge, reset); + `SCOPE_TAP_EX (0, 3, 4, 2, ( + 1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + 
`UUID_WIDTH + ), { + mem_req_valid, + mem_req_ready, + mem_rsp_valid, + mem_rsp_ready + }, { + mem_req_fire, + mem_rsp_fire + }, { + mem_req_rw, + full_addr, + mem_req_byteen, + mem_req_data, + execute_if.data.uuid, + rsp_data, + rsp_uuid + }, + reset_negedge, 1'b0, 4096 ); `else - `SCOPE_IO_UNUSED() + `SCOPE_IO_UNUSED(0) +`endif +`endif + +`ifdef CHIPSCOPE +`ifdef DBG_SCOPE_LSU + ila_lsu ila_lsu_inst ( + .clk (clk), + .probe0 ({execute_if.valid, execute_if.data, execute_if.ready}), + .probe1 ({lsu_mem_if.req_valid, lsu_mem_if.req_data, lsu_mem_if.req_ready}), + .probe2 ({lsu_mem_if.rsp_valid, lsu_mem_if.rsp_data, lsu_mem_if.rsp_ready}) + ); +`endif `endif endmodule diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index d40f5fcfb..7a64a849b 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -31,9 +31,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( localparam BLOCK_SIZE = `NUM_LSU_BLOCKS; localparam NUM_LANES = `NUM_LSU_LANES; -`ifdef SCOPE `SCOPE_IO_SWITCH (BLOCK_SIZE); -`endif VX_execute_if #( .NUM_LANES (NUM_LANES) @@ -42,7 +40,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( VX_dispatch_unit #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_BUF (1) + .OUT_BUF (3) ) dispatch_unit ( .clk (clk), .reset (reset), @@ -54,16 +52,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( .NUM_LANES (NUM_LANES) ) per_block_commit_if[BLOCK_SIZE](); - for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : lsu_slices - - `RESET_RELAY (slice_reset, reset); - + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_slices VX_lsu_slice #( - .INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx)) + .INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, block_idx))) ) lsu_slice( `SCOPE_IO_BIND (block_idx) .clk (clk), - .reset (slice_reset), + .reset (reset), .execute_if (per_block_execute_if[block_idx]), .commit_if (per_block_commit_if[block_idx]), .lsu_mem_if (lsu_mem_if[block_idx]) diff --git 
a/hw/rtl/core/VX_mem_unit.sv b/hw/rtl/core/VX_mem_unit.sv new file mode 100644 index 000000000..655aef787 --- /dev/null +++ b/hw/rtl/core/VX_mem_unit.sv @@ -0,0 +1,260 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_define.vh" + +module VX_mem_unit import VX_gpu_pkg::*; #( + parameter `STRING INSTANCE_ID = "" +) ( + input wire clk, + input wire reset, + +`ifdef PERF_ENABLE + output lmem_perf_t lmem_perf, + output coalescer_perf_t coalescer_perf, +`endif + + VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS], + VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS] +); + VX_lsu_mem_if #( + .NUM_LANES (`NUM_LSU_LANES), + .DATA_SIZE (LSU_WORD_SIZE), + .TAG_WIDTH (LSU_TAG_WIDTH) + ) lsu_dcache_if[`NUM_LSU_BLOCKS](); + +`ifdef LMEM_ENABLE + + `STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter")) + `STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter")) + + localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE); + + VX_lsu_mem_if #( + .NUM_LANES (`NUM_LSU_LANES), + .DATA_SIZE (LSU_WORD_SIZE), + .TAG_WIDTH (LSU_TAG_WIDTH) + ) lsu_lmem_if[`NUM_LSU_BLOCKS](); + + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_switches + VX_lmem_switch #( + .REQ0_OUT_BUF (1), + .REQ1_OUT_BUF (0), + .RSP_OUT_BUF (1), + .ARBITER ("P") + ) lmem_switch ( + .clk (clk), + .reset (reset), + .lsu_in_if (lsu_mem_if[i]), + .global_out_if(lsu_dcache_if[i]), + 
.local_out_if (lsu_lmem_if[i]) + ); + end + + VX_lsu_mem_if #( + .NUM_LANES (`NUM_LSU_LANES), + .DATA_SIZE (LSU_WORD_SIZE), + .TAG_WIDTH (LMEM_TAG_WIDTH) + ) lmem_arb_if[1](); + + VX_lsu_mem_arb #( + .NUM_INPUTS (`NUM_LSU_BLOCKS), + .NUM_OUTPUTS(1), + .NUM_LANES (`NUM_LSU_LANES), + .DATA_SIZE (LSU_WORD_SIZE), + .TAG_WIDTH (LSU_TAG_WIDTH), + .TAG_SEL_IDX(0), + .ARBITER ("R"), + .REQ_OUT_BUF(0), + .RSP_OUT_BUF(2) + ) lmem_arb ( + .clk (clk), + .reset (reset), + .bus_in_if (lsu_lmem_if), + .bus_out_if (lmem_arb_if) + ); + + VX_mem_bus_if #( + .DATA_SIZE (LSU_WORD_SIZE), + .TAG_WIDTH (LMEM_TAG_WIDTH) + ) lmem_adapt_if[`NUM_LSU_LANES](); + + VX_lsu_adapter #( + .NUM_LANES (`NUM_LSU_LANES), + .DATA_SIZE (LSU_WORD_SIZE), + .TAG_WIDTH (LMEM_TAG_WIDTH), + .TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH), + .ARBITER ("P"), + .REQ_OUT_BUF (3), + .RSP_OUT_BUF (0) + ) lmem_adapter ( + .clk (clk), + .reset (reset), + .lsu_mem_if (lmem_arb_if[0]), + .mem_bus_if (lmem_adapt_if) + ); + + VX_local_mem #( + .INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))), + .SIZE (1 << `LMEM_LOG_SIZE), + .NUM_REQS (`NUM_LSU_LANES), + .NUM_BANKS (`LMEM_NUM_BANKS), + .WORD_SIZE (LSU_WORD_SIZE), + .ADDR_WIDTH (LMEM_ADDR_WIDTH), + .UUID_WIDTH (`UUID_WIDTH), + .TAG_WIDTH (LMEM_TAG_WIDTH), + .OUT_BUF (3) + ) local_mem ( + .clk (clk), + .reset (reset), + `ifdef PERF_ENABLE + .lmem_perf (lmem_perf), + `endif + .mem_bus_if (lmem_adapt_if) + ); + +`else + +`ifdef PERF_ENABLE + assign lmem_perf = '0; +`endif + + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if + `ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]); + end + +`endif + + VX_lsu_mem_if #( + .NUM_LANES (DCACHE_CHANNELS), + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_TAG_WIDTH) + ) dcache_coalesced_if[`NUM_LSU_BLOCKS](); + +`ifdef PERF_ENABLE + wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses; + wire [`PERF_CTR_BITS-1:0] coalescer_misses; + VX_reduce_tree #( + .DATAW_IN (`PERF_CTR_BITS), + 
.DATAW_OUT (`PERF_CTR_BITS), + .N (`NUM_LSU_BLOCKS), + .OP ("+") + ) coalescer_reduce ( + .data_in (per_block_coalescer_misses), + .data_out (coalescer_misses) + ); + `BUFFER(coalescer_perf.misses, coalescer_misses); +`endif + + if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled + + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers + VX_mem_coalescer #( + .INSTANCE_ID (`SFORMATF(("%s-coalescer%0d", INSTANCE_ID, i))), + .NUM_REQS (`NUM_LSU_LANES), + .DATA_IN_SIZE (LSU_WORD_SIZE), + .DATA_OUT_SIZE (DCACHE_WORD_SIZE), + .ADDR_WIDTH (LSU_ADDR_WIDTH), + .FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH), + .TAG_WIDTH (LSU_TAG_WIDTH), + .UUID_WIDTH (`UUID_WIDTH), + .QUEUE_SIZE (`LSUQ_OUT_SIZE), + .PERF_CTR_BITS (`PERF_CTR_BITS) + ) mem_coalescer ( + .clk (clk), + .reset (reset), + + `ifdef PERF_ENABLE + .misses (per_block_coalescer_misses[i]), + `else + `UNUSED_PIN (misses), + `endif + + // Input request + .in_req_valid (lsu_dcache_if[i].req_valid), + .in_req_mask (lsu_dcache_if[i].req_data.mask), + .in_req_rw (lsu_dcache_if[i].req_data.rw), + .in_req_byteen (lsu_dcache_if[i].req_data.byteen), + .in_req_addr (lsu_dcache_if[i].req_data.addr), + .in_req_flags (lsu_dcache_if[i].req_data.flags), + .in_req_data (lsu_dcache_if[i].req_data.data), + .in_req_tag (lsu_dcache_if[i].req_data.tag), + .in_req_ready (lsu_dcache_if[i].req_ready), + + // Input response + .in_rsp_valid (lsu_dcache_if[i].rsp_valid), + .in_rsp_mask (lsu_dcache_if[i].rsp_data.mask), + .in_rsp_data (lsu_dcache_if[i].rsp_data.data), + .in_rsp_tag (lsu_dcache_if[i].rsp_data.tag), + .in_rsp_ready (lsu_dcache_if[i].rsp_ready), + + // Output request + .out_req_valid (dcache_coalesced_if[i].req_valid), + .out_req_mask (dcache_coalesced_if[i].req_data.mask), + .out_req_rw (dcache_coalesced_if[i].req_data.rw), + .out_req_byteen (dcache_coalesced_if[i].req_data.byteen), + .out_req_addr (dcache_coalesced_if[i].req_data.addr), + .out_req_flags (dcache_coalesced_if[i].req_data.flags), + 
.out_req_data (dcache_coalesced_if[i].req_data.data), + .out_req_tag (dcache_coalesced_if[i].req_data.tag), + .out_req_ready (dcache_coalesced_if[i].req_ready), + + // Output response + .out_rsp_valid (dcache_coalesced_if[i].rsp_valid), + .out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask), + .out_rsp_data (dcache_coalesced_if[i].rsp_data.data), + .out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag), + .out_rsp_ready (dcache_coalesced_if[i].rsp_ready) + ); + end + + end else begin : g_passthru + + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if + `ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]); + `ifdef PERF_ENABLE + assign per_block_coalescer_misses[i] = '0; + `endif + end + + end + + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_adapters + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_TAG_WIDTH) + ) dcache_bus_tmp_if[DCACHE_CHANNELS](); + + VX_lsu_adapter #( + .NUM_LANES (DCACHE_CHANNELS), + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_TAG_WIDTH), + .TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH), + .ARBITER ("P"), + .REQ_OUT_BUF (0), + .RSP_OUT_BUF (0) + ) dcache_adapter ( + .clk (clk), + .reset (reset), + .lsu_mem_if (dcache_coalesced_if[i]), + .mem_bus_if (dcache_bus_tmp_if) + ); + + for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin : g_dcache_bus_if + `ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]); + end + + end + +endmodule diff --git a/hw/rtl/core/VX_mem_unit_top.sv b/hw/rtl/core/VX_mem_unit_top.sv new file mode 100644 index 000000000..17786a09b --- /dev/null +++ b/hw/rtl/core/VX_mem_unit_top.sv @@ -0,0 +1,127 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_define.vh" + +module VX_mem_unit_top import VX_gpu_pkg::*; #( + parameter `STRING INSTANCE_ID = "", + parameter LSU_WORD_WIDTH = LSU_WORD_SIZE * 8 +) ( + // Clock + input wire clk, + input wire reset, + + // LSU memory request + input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_valid, + input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_rw, + input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_req_mask, + input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_req_byteen, + input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_req_addr, + input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_req_flags, + input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_req_data, + input wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_req_tag, + output wire [`NUM_LSU_BLOCKS-1:0] lsu_req_ready, + + // LSU memory response + output wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_valid, + output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_rsp_mask, + output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_rsp_data, + output wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_rsp_tag, + input wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_ready, + + // Memory request + output wire [DCACHE_NUM_REQS-1:0] mem_req_valid, + output wire [DCACHE_NUM_REQS-1:0] mem_req_rw, + output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen, + output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] mem_req_addr, + output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] 
mem_req_flags, + output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_req_data, + output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_req_tag, + input wire [DCACHE_NUM_REQS-1:0] mem_req_ready, + + // Memory response + input wire [DCACHE_NUM_REQS-1:0] mem_rsp_valid, + input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_rsp_data, + input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_rsp_tag, + output wire [DCACHE_NUM_REQS-1:0] mem_rsp_ready +); + VX_lsu_mem_if #( + .NUM_LANES (`NUM_LSU_LANES), + .DATA_SIZE (LSU_WORD_SIZE), + .TAG_WIDTH (LSU_TAG_WIDTH) + ) lsu_mem_if[`NUM_LSU_BLOCKS](); + + // LSU memory request + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_mem_req + assign lsu_mem_if[i].req_valid = lsu_req_valid[i]; + assign lsu_mem_if[i].req_data.rw = lsu_req_rw[i]; + assign lsu_mem_if[i].req_data.mask = lsu_req_mask[i]; + assign lsu_mem_if[i].req_data.byteen = lsu_req_byteen[i]; + assign lsu_mem_if[i].req_data.addr = lsu_req_addr[i]; + assign lsu_mem_if[i].req_data.flags = lsu_req_flags[i]; + assign lsu_mem_if[i].req_data.data = lsu_req_data[i]; + assign lsu_mem_if[i].req_data.tag = lsu_req_tag[i]; + assign lsu_req_ready[i] = lsu_mem_if[i].req_ready; + end + + // LSU memory response + for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_rsp + assign lsu_rsp_valid[i] = lsu_mem_if[i].rsp_valid; + assign lsu_rsp_mask[i] = lsu_mem_if[i].rsp_data.mask; + assign lsu_rsp_data[i] = lsu_mem_if[i].rsp_data.data; + assign lsu_rsp_tag[i] = lsu_mem_if[i].rsp_data.tag; + assign lsu_mem_if[i].rsp_ready = lsu_rsp_ready[i]; + end + + VX_mem_bus_if #( + .DATA_SIZE (DCACHE_WORD_SIZE), + .TAG_WIDTH (DCACHE_TAG_WIDTH) + ) mem_bus_if[DCACHE_NUM_REQS](); + + // memory request + for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_req + assign mem_req_valid[i] = mem_bus_if[i].req_valid; + assign mem_req_rw[i] = mem_bus_if[i].req_data.rw; + assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen; + assign mem_req_addr[i] = 
mem_bus_if[i].req_data.addr; + assign mem_req_flags[i] = mem_bus_if[i].req_data.flags; + assign mem_req_data[i] = mem_bus_if[i].req_data.data; + assign mem_req_tag[i] = mem_bus_if[i].req_data.tag; + assign mem_bus_if[i].req_ready = mem_req_ready[i]; + end + + // memory response + for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_bus_rsp + assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i]; + assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i]; + assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i]; + assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready; + end + +`ifdef PERF_ENABLE + cache_perf_t lmem_perf = '0; +`endif + + VX_mem_unit #( + .INSTANCE_ID (INSTANCE_ID) + ) mem_unit ( + .clk (clk), + .reset (reset), + `ifdef PERF_ENABLE + .lmem_perf (lmem_perf), + `endif + .lsu_mem_if (lsu_mem_if), + .dcache_bus_if (mem_bus_if) + ); + +endmodule diff --git a/hw/rtl/core/VX_operands.sv b/hw/rtl/core/VX_operands.sv index e3df0c1fa..2353becca 100644 --- a/hw/rtl/core/VX_operands.sv +++ b/hw/rtl/core/VX_operands.sv @@ -23,7 +23,7 @@ module VX_operands import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", parameter NUM_BANKS = 4, - parameter OUT_BUF = 4 // using 2-cycle EB for area reduction + parameter OUT_BUF = 3 ) ( input wire clk, input wire reset, @@ -37,15 +37,15 @@ module VX_operands import VX_gpu_pkg::*; #( VX_operands_if.master operands_if ); `UNUSED_SPARAM (INSTANCE_ID) - localparam NUM_SRC_REGS = 3; - localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS); + localparam NUM_SRC_OPDS = 3; + localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS); localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS); localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS); localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS; localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH; localparam REGS_DATAW = `XLEN * `NUM_THREADS; - localparam DATAW = META_DATAW + NUM_SRC_REGS * 
REGS_DATAW; + localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW; localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS); localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS; localparam XLEN_SIZE = `XLEN / 8; @@ -53,87 +53,80 @@ module VX_operands import VX_gpu_pkg::*; #( `UNUSED_VAR (writeback_if.data.sop) - wire [NUM_SRC_REGS-1:0] src_valid; - wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready; - wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data; - wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx; + wire [NUM_SRC_OPDS-1:0] src_valid; + wire [NUM_SRC_OPDS-1:0] req_valid_in, req_ready_in; + wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_data_in; + wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx; wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready; wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2; wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1; - wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2; + wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st2; wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2; + wire pipe_ready_in; wire pipe_valid_st1, pipe_ready_st1; wire pipe_valid_st2, pipe_ready_st2; wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2; - reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n; - wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2; + reg [NUM_SRC_OPDS-1:0][(`NUM_THREADS * `XLEN)-1:0] src_data_st2, src_data_m_st2; - reg [NUM_SRC_REGS-1:0] data_fetched_n; - wire [NUM_SRC_REGS-1:0] data_fetched_st1; + reg [NUM_SRC_OPDS-1:0] data_fetched_st1; reg has_collision_n; wire has_collision_st1; - wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3, - scoreboard_if.data.rs2, - scoreboard_if.data.rs1}; + wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds; + assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, 
scoreboard_if.data.rs1}; - for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin - if (ISSUE_WIS != 0) begin - assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis}; - end else begin - assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS]; + for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_data_in + if (ISSUE_WIS != 0) begin : g_wis + assign req_data_in[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis}; + end else begin : g_no_wis + assign req_data_in[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS]; end - if (NUM_BANKS != 1) begin - assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0]; - end else begin + end + + for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_bank_idx + if (NUM_BANKS != 1) begin : g_multibanks + assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0]; + end else begin : g_singlebank assign req_bank_idx[i] = '0; end end - for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin - assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i]; + for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_src_valid + assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i]; end - assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid; + assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid; VX_stream_xbar #( - .NUM_INPUTS (NUM_SRC_REGS), + .NUM_INPUTS (NUM_SRC_OPDS), .NUM_OUTPUTS (NUM_BANKS), .DATAW (PER_BANK_ADDRW), .ARBITER ("P"), // use priority arbiter - .PERF_CTR_BITS(`PERF_CTR_BITS), .OUT_BUF (0) // no output buffering ) req_xbar ( .clk (clk), .reset (reset), `UNUSED_PIN(collisions), - .valid_in (req_in_valid), - .data_in (req_in_data), + .valid_in (req_valid_in), + .data_in (req_data_in), .sel_in (req_bank_idx), - .ready_in (req_in_ready), + .ready_in (req_ready_in), .valid_out (gpr_rd_valid), .data_out (gpr_rd_addr), .sel_out (gpr_rd_req_idx), .ready_out (gpr_rd_ready) ); - wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1; - - assign gpr_rd_ready 
= {NUM_BANKS{pipe_in_ready}}; - - assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n; - - wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1; - wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2; + assign gpr_rd_ready = {NUM_BANKS{pipe_ready_in}}; always @(*) begin has_collision_n = 0; - for (integer i = 0; i < NUM_SRC_REGS; ++i) begin - for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin + for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin + for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin has_collision_n |= src_valid[i] && src_valid[j+i] && (req_bank_idx[i] == req_bank_idx[j+i]); @@ -141,14 +134,7 @@ module VX_operands import VX_gpu_pkg::*; #( end end - always @(*) begin - data_fetched_n = data_fetched_st1; - if (scoreboard_if.ready) begin - data_fetched_n = '0; - end else begin - data_fetched_n = data_fetched_st1 | req_in_ready; - end - end + wire [NUM_SRC_OPDS-1:0] req_fire_in = req_valid_in & req_ready_in; assign pipe_data = { scoreboard_if.data.wis, @@ -162,61 +148,74 @@ module VX_operands import VX_gpu_pkg::*; #( scoreboard_if.data.uuid }; - VX_pipe_register #( - .DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)), - .RESETW (1 + NUM_SRC_REGS) + assign scoreboard_if.ready = pipe_ready_in && ~has_collision_n; + + wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1; + wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2; + + VX_pipe_buffer #( + .DATAW (NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)) ) pipe_reg1 ( .clk (clk), .reset (reset), - .enable (pipe_in_ready), - .data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}), - .data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}) + .valid_in (scoreboard_if.valid), + .ready_in (pipe_ready_in), + .data_in ({gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, 
gpr_rd_req_idx}), + .data_out ({gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}), + .valid_out(pipe_valid_st1), + .ready_out(pipe_ready_st1) ); - assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2; - - assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n; + always @(posedge clk) begin + if (reset || scoreboard_if.ready) begin + data_fetched_st1 <= 0; + end else begin + data_fetched_st1 <= data_fetched_st1 | req_fire_in; + end + end wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1; - `RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW - - VX_pipe_register #( - .DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH), - .RESETW (1 + NUM_SRC_REGS * REGS_DATAW) + VX_pipe_buffer #( + .DATAW (NUM_BANKS * (1 + REQ_SEL_WIDTH) + META_DATAW) ) pipe_reg2 ( .clk (clk), - .reset (pipe2_reset), - .enable (pipe_ready_st1), - .data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}), - .data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2}) + .reset (reset), + .valid_in (pipe_valid2_st1), + .ready_in (pipe_ready_st1), + .data_in ({gpr_rd_valid_st1, gpr_rd_req_idx_st1, pipe_data_st1}), + .data_out ({gpr_rd_valid_st2, gpr_rd_req_idx_st2, pipe_data_st2}), + .valid_out(pipe_valid_st2), + .ready_out(pipe_ready_st2) ); always @(*) begin - src_data_n = src_data_st2; + src_data_m_st2 = src_data_st2; for (integer b = 0; b < NUM_BANKS; ++b) begin if (gpr_rd_valid_st2[b]) begin - src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b]; + src_data_m_st2[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b]; end end end + always @(posedge clk) begin + if (reset || pipe_fire_st2) begin + src_data_st2 <= 0; + end else begin + src_data_st2 <= src_data_m_st2; + end + end + VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), - .OUT_REG 
(`TO_OUT_BUF_REG(OUT_BUF)), - .LUTRAM (1) + .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), .reset (reset), .valid_in (pipe_valid_st2), .ready_in (pipe_ready_st2), - .data_in ({ - pipe_data_st2, - src_data_n[0], - src_data_n[1], - src_data_n[2] - }), + .data_in ({pipe_data_st2, src_data_m_st2}), .data_out ({ operands_if.data.wis, operands_if.data.tmask, @@ -227,51 +226,39 @@ module VX_operands import VX_gpu_pkg::*; #( operands_if.data.op_args, operands_if.data.rd, operands_if.data.uuid, - operands_if.data.rs1_data, + operands_if.data.rs3_data, operands_if.data.rs2_data, - operands_if.data.rs3_data + operands_if.data.rs1_data }), .valid_out (operands_if.valid), .ready_out (operands_if.ready) ); wire [PER_BANK_ADDRW-1:0] gpr_wr_addr; - if (ISSUE_WIS != 0) begin + if (ISSUE_WIS != 0) begin : g_gpr_wr_addr assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis}; - end else begin + end else begin : g_gpr_wr_addr_no_wis assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS]; end wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx; - if (NUM_BANKS != 1) begin + if (NUM_BANKS != 1) begin : g_gpr_wr_bank_idx assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0]; - end else begin + end else begin : g_gpr_wr_bank_idx_0 assign gpr_wr_bank_idx = '0; end - `ifdef GPR_RESET - reg wr_enabled = 0; - always @(posedge clk) begin - if (reset) begin - wr_enabled <= 1; - end - end - `else - wire wr_enabled = 1; - `endif - - for (genvar b = 0; b < NUM_BANKS; ++b) begin + for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rams wire gpr_wr_enabled; - if (BANK_SEL_BITS != 0) begin - assign gpr_wr_enabled = wr_enabled - && writeback_if.valid + if (BANK_SEL_BITS != 0) begin : g_gpr_wr_enabled_multibanks + assign gpr_wr_enabled = writeback_if.valid && (gpr_wr_bank_idx == BANK_SEL_BITS'(b)); - end else begin - assign gpr_wr_enabled = wr_enabled && writeback_if.valid; + end else begin : g_gpr_wr_enabled + assign gpr_wr_enabled = 
writeback_if.valid; end wire [BYTEENW-1:0] wren; - for (genvar i = 0; i < `NUM_THREADS; ++i) begin + for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_wren assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}}; end @@ -282,7 +269,8 @@ module VX_operands import VX_gpu_pkg::*; #( `ifdef GPR_RESET .RESET_RAM (1), `endif - .NO_RWCHECK (1) + .OUT_REG (1), + .RDW_MODE ("R") ) gpr_ram ( .clk (clk), .reset (reset), @@ -292,7 +280,7 @@ module VX_operands import VX_gpu_pkg::*; #( .waddr (gpr_wr_addr), .wdata (writeback_if.data.data), .raddr (gpr_rd_addr_st1[b]), - .rdata (gpr_rd_data_st1[b]) + .rdata (gpr_rd_data_st2[b]) ); end @@ -302,7 +290,7 @@ module VX_operands import VX_gpu_pkg::*; #( if (reset) begin collisions_r <= '0; end else begin - collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_in_ready && has_collision_n); + collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_ready_in && has_collision_n); end end assign perf_stalls = collisions_r; diff --git a/hw/rtl/core/VX_pe_switch.sv b/hw/rtl/core/VX_pe_switch.sv new file mode 100644 index 000000000..377715e1d --- /dev/null +++ b/hw/rtl/core/VX_pe_switch.sv @@ -0,0 +1,93 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_define.vh" + +module VX_pe_switch import VX_gpu_pkg::*; #( + parameter PE_COUNT = 0, + parameter NUM_LANES = 0, + parameter REQ_OUT_BUF = 0, + parameter RSP_OUT_BUF = 0, + parameter `STRING ARBITER = "R", + parameter PE_SEL_BITS = `CLOG2(PE_COUNT) +) ( + input wire clk, + input wire reset, + input wire [`UP(PE_SEL_BITS)-1:0] pe_sel, + VX_execute_if.slave execute_in_if, + VX_commit_if.master commit_out_if, + VX_execute_if.master execute_out_if[PE_COUNT], + VX_commit_if .slave commit_in_if[PE_COUNT] +); + localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); + localparam PID_WIDTH = `UP(PID_BITS); + localparam REQ_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `INST_ALU_BITS + $bits(op_args_t) + 1 + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1; + localparam RSP_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; + + wire [PE_COUNT-1:0] pe_req_valid; + wire [PE_COUNT-1:0][REQ_DATAW-1:0] pe_req_data; + wire [PE_COUNT-1:0] pe_req_ready; + + VX_stream_switch #( + .DATAW (REQ_DATAW), + .NUM_INPUTS (1), + .NUM_OUTPUTS (PE_COUNT), + .OUT_BUF (REQ_OUT_BUF) + ) req_switch ( + .clk (clk), + .reset (reset), + .sel_in (pe_sel), + .valid_in (execute_in_if.valid), + .ready_in (execute_in_if.ready), + .data_in (execute_in_if.data), + .data_out (pe_req_data), + .valid_out (pe_req_valid), + .ready_out (pe_req_ready) + ); + + for (genvar i = 0; i < PE_COUNT; ++i) begin : g_execute_out_if + assign execute_out_if[i].valid = pe_req_valid[i]; + assign execute_out_if[i].data = pe_req_data[i]; + assign pe_req_ready[i] = execute_out_if[i].ready; + end + + /////////////////////////////////////////////////////////////////////////// + + wire [PE_COUNT-1:0] pe_rsp_valid; + wire [PE_COUNT-1:0][RSP_DATAW-1:0] pe_rsp_data; + wire [PE_COUNT-1:0] pe_rsp_ready; + + for (genvar i = 0; i < PE_COUNT; ++i) begin : g_commit_in_if + assign pe_rsp_valid[i] = commit_in_if[i].valid; + assign 
pe_rsp_data[i] = commit_in_if[i].data; + assign commit_in_if[i].ready = pe_rsp_ready[i]; + end + + VX_stream_arb #( + .NUM_INPUTS (PE_COUNT), + .DATAW (RSP_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (RSP_OUT_BUF) + ) rsp_arb ( + .clk (clk), + .reset (reset), + .valid_in (pe_rsp_valid), + .ready_in (pe_rsp_ready), + .data_in (pe_rsp_data), + .data_out (commit_out_if.data), + .valid_out (commit_out_if.valid), + .ready_out (commit_out_if.ready), + `UNUSED_PIN (sel_out) + ); + +endmodule diff --git a/hw/rtl/core/VX_schedule.sv b/hw/rtl/core/VX_schedule.sv index 71a74c6ac..fee7c21a5 100644 --- a/hw/rtl/core/VX_schedule.sv +++ b/hw/rtl/core/VX_schedule.sv @@ -68,8 +68,6 @@ module VX_schedule import VX_gpu_pkg::*; #( reg [`PERF_CTR_BITS-1:0] cycles; - reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs; - wire schedule_fire = schedule_valid && schedule_ready; wire schedule_if_fire = schedule_if.valid && schedule_if.ready; @@ -78,7 +76,7 @@ module VX_schedule import VX_gpu_pkg::*; #( wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid; wire [`NUM_ALU_BLOCKS-1:0] branch_taken; wire [`NUM_ALU_BLOCKS-1:0][`PC_BITS-1:0] branch_dest; - for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin + for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin : g_branch_init assign branch_valid[i] = branch_ctl_if[i].valid; assign branch_wid[i] = branch_ctl_if[i].wid; assign branch_taken[i] = branch_ctl_if[i].taken; @@ -113,6 +111,16 @@ module VX_schedule import VX_gpu_pkg::*; #( barrier_stalls_n= barrier_stalls; warp_pcs_n = warp_pcs; + // decode unlock + if (decode_sched_if.valid && decode_sched_if.unlock) begin + stalled_warps_n[decode_sched_if.wid] = 0; + end + + // CSR unlock + if (sched_csr_if.unlock_warp) begin + stalled_warps_n[sched_csr_if.unlock_wid] = 0; + end + // wspawn handling if (wspawn.valid && is_single_warp) begin active_warps_n |= wspawn.wmask; @@ -170,10 +178,11 @@ module VX_schedule import VX_gpu_pkg::*; #( stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp end end + `ifdef 
GBAR_ENABLE - if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin + if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter - barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask + barrier_masks_n[gbar_bus_if.rsp_data.id] = '0; // reset barrier mask stalled_warps_n = '0; // unlock all warps end `endif @@ -188,16 +197,6 @@ module VX_schedule import VX_gpu_pkg::*; #( end end - // decode unlock - if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin - stalled_warps_n[decode_sched_if.wid] = 0; - end - - // CSR unlock - if (sched_csr_if.unlock_warp) begin - stalled_warps_n[sched_csr_if.unlock_wid] = 0; - end - // stall the warp until decode stage if (schedule_fire) begin stalled_warps_n[schedule_wid] = 1; @@ -223,7 +222,6 @@ module VX_schedule import VX_gpu_pkg::*; #( active_warps <= '0; thread_masks <= '0; barrier_stalls <= '0; - issued_instrs <= '0; cycles <= '0; wspawn.valid <= 0; @@ -268,10 +266,6 @@ module VX_schedule import VX_gpu_pkg::*; #( end `endif - if (schedule_if_fire) begin - issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1); - end - if (busy) begin cycles <= cycles + 1; end @@ -281,21 +275,19 @@ module VX_schedule import VX_gpu_pkg::*; #( // barrier handling `ifdef GBAR_ENABLE - assign gbar_bus_if.req_valid = gbar_req_valid; - assign gbar_bus_if.req_id = gbar_req_id; - assign gbar_bus_if.req_size_m1 = gbar_req_size_m1; - assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES); + assign gbar_bus_if.req_valid = gbar_req_valid; + assign gbar_bus_if.req_data.id = gbar_req_id; + assign gbar_bus_if.req_data.size_m1 = gbar_req_size_m1; + assign gbar_bus_if.req_data.core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES); `endif // split/join handling - `RESET_RELAY (split_join_reset, reset); - VX_split_join #( - .INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID)) + .INSTANCE_ID 
(`SFORMATF(("%s-splitjoin", INSTANCE_ID))) ) split_join ( .clk (clk), - .reset (split_join_reset), + .reset (reset), .valid (warp_ctl_if.valid), .wid (warp_ctl_if.wid), .split (warp_ctl_if.split), @@ -324,7 +316,7 @@ module VX_schedule import VX_gpu_pkg::*; #( ); wire [`NUM_WARPS-1:0][(`NUM_THREADS + `PC_BITS)-1:0] schedule_data; - for (genvar i = 0; i < `NUM_WARPS; ++i) begin + for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_schedule_data assign schedule_data[i] = {thread_masks[i], warp_pcs[i]}; end @@ -333,67 +325,50 @@ module VX_schedule import VX_gpu_pkg::*; #( schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-5:0] }; -`ifndef NDEBUG - localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS); - reg [`UUID_WIDTH-1:0] instr_uuid; - wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid); -`ifdef SV_DPI - always @(posedge clk) begin - if (reset) begin - instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 32'd0)); - end else if (schedule_fire) begin - instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid))); - end - end + wire [`UUID_WIDTH-1:0] instr_uuid; +`ifdef UUID_ENABLE + VX_uuid_gen #( + .CORE_ID (CORE_ID), + .UUID_WIDTH (`UUID_WIDTH) + ) uuid_gen ( + .clk (clk), + .reset (reset), + .incr (schedule_fire), + .wid (schedule_wid), + .uuid (instr_uuid) + ); `else - wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)}; - always @(*) begin - instr_uuid = `UUID_WIDTH'(w_uuid); - end -`endif -`else - wire [`UUID_WIDTH-1:0] instr_uuid = '0; + assign instr_uuid = '0; `endif VX_elastic_buffer #( - .DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH) + .DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH + `UUID_WIDTH), + .SIZE (2), // need to buffer out ready_in + .OUT_REG (1) // should be registered for BRAM acces in fetch unit ) out_buf ( .clk (clk), .reset (reset), .valid_in (schedule_valid), .ready_in (schedule_ready), - .data_in ({schedule_tmask, schedule_pc, schedule_wid}), - .data_out ({schedule_if.data.tmask, 
schedule_if.data.PC, schedule_if.data.wid}), + .data_in ({schedule_tmask, schedule_pc, schedule_wid, instr_uuid}), + .data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.uuid}), .valid_out (schedule_if.valid), .ready_out (schedule_if.ready) ); - assign schedule_if.data.uuid = instr_uuid; - // Track pending instructions per warp - reg [`NUM_WARPS-1:0] per_warp_incr; - always @(*) begin - per_warp_incr = 0; - if (schedule_if_fire) begin - per_warp_incr[schedule_if.data.wid] = 1; - end - end - wire [`NUM_WARPS-1:0] pending_warp_empty; wire [`NUM_WARPS-1:0] pending_warp_alm_empty; - `RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT); - - for (genvar i = 0; i < `NUM_WARPS; ++i) begin - + for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_sizes VX_pending_size #( .SIZE (4096), .ALM_EMPTY (1) ) counter ( .clk (clk), - .reset (pending_instr_reset[i]), - .incr (per_warp_incr[i]), + .reset (reset), + .incr (schedule_if_fire && (schedule_if.data.wid == `NW_WIDTH'(i))), .decr (commit_sched_if.committed_warps[i]), .empty (pending_warp_empty[i]), .alm_empty (pending_warp_alm_empty[i]), @@ -407,7 +382,7 @@ module VX_schedule import VX_gpu_pkg::*; #( wire no_pending_instr = (& pending_warp_empty); - `BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1); + `BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1, 1); // export CSRs assign sched_csr_if.cycles = cycles; @@ -422,7 +397,7 @@ module VX_schedule import VX_gpu_pkg::*; #( timeout_ctr <= '0; timeout_enable <= 0; end else begin - if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin + if (decode_sched_if.valid && decode_sched_if.unlock) begin timeout_enable <= 1; end if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv index 9b3a146c6..6899752e8 100644 --- a/hw/rtl/core/VX_scoreboard.sv +++ b/hw/rtl/core/VX_scoreboard.sv @@ 
-30,6 +30,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #( VX_scoreboard_if.master scoreboard_if ); `UNUSED_SPARAM (INSTANCE_ID) + localparam NUM_SRC_OPDS = 3; + localparam NUM_OPDS = NUM_SRC_OPDS + 1; localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1; VX_ibuffer_if staging_if [PER_ISSUE_WARPS](); @@ -42,7 +44,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle; wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r; - VX_reduce #( + VX_reduce_tree #( .DATAW_IN (`NUM_EX_UNITS), .N (PER_ISSUE_WARPS), .OP ("|") @@ -51,7 +53,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( .data_out (perf_units_per_cycle) ); - VX_reduce #( + VX_reduce_tree #( .DATAW_IN (`NUM_SFU_UNITS), .N (PER_ISSUE_WARPS), .OP ("|") @@ -60,17 +62,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #( .data_out (perf_sfu_per_cycle) ); - `BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); - `BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); + `BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); + `BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT)); wire [PER_ISSUE_WARPS-1:0] stg_valid_in; - for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin + for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stg_valid_in assign stg_valid_in[w] = staging_if[w].valid; end wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready)); - always @(posedge clk) begin + always @(posedge clk) begin : g_perf_stalls if (reset) begin perf_stalls <= '0; end else begin @@ -78,7 +80,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end end - for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin + for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_units_uses 
always @(posedge clk) begin if (reset) begin perf_units_uses[i] <= '0; @@ -88,7 +90,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end end - for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin + for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_perf_sfu_uses always @(posedge clk) begin if (reset) begin perf_sfu_uses[i] <= '0; @@ -99,10 +101,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end `endif - for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin - VX_elastic_buffer #( - .DATAW (DATAW), - .SIZE (1) + for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stanging_bufs + VX_pipe_buffer #( + .DATAW (DATAW) ) stanging_buf ( .clk (clk), .reset (reset), @@ -115,10 +116,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #( ); end - for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin + for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard reg [`NUM_REGS-1:0] inuse_regs; - reg [3:0] operands_busy, operands_busy_n; + reg [NUM_OPDS-1:0] operands_busy, operands_busy_n; wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready; @@ -128,6 +129,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #( && (writeback_if.data.wis == ISSUE_WIS_W'(w)) && writeback_if.data.eop; + wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds; + assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd}; + assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd}; + `ifdef PERF_ENABLE reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units; reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu; @@ -135,86 +140,36 @@ module VX_scoreboard import VX_gpu_pkg::*; #( always @(*) begin perf_inuse_units_per_cycle[w] = '0; perf_inuse_sfu_per_cycle[w] = '0; - if (staging_if[w].valid) begin - if (operands_busy[0]) begin - perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1; - if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin - 
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1; - end - end - if (operands_busy[1]) begin - perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1; - if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin - perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1; - end - end - if (operands_busy[2]) begin - perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1; - if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin - perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1; - end - end - if (operands_busy[3]) begin - perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1; - if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin - perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1; + for (integer i = 0; i < NUM_OPDS; ++i) begin + if (staging_if[w].valid && operands_busy[i]) begin + perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1; + if (inuse_units[stg_opds[i]] == `EX_SFU) begin + perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1; end end end end `endif - always @(*) begin - operands_busy_n = operands_busy; - if (ibuffer_fire) begin - operands_busy_n = { - inuse_regs[ibuffer_if[w].data.rs3], - inuse_regs[ibuffer_if[w].data.rs2], - inuse_regs[ibuffer_if[w].data.rs1], - inuse_regs[ibuffer_if[w].data.rd] - }; - end - if (writeback_fire) begin + for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n + always @(*) begin + operands_busy_n[i] = operands_busy[i]; if (ibuffer_fire) begin - if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin - operands_busy_n[0] = 0; - end - if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin - operands_busy_n[1] = 0; - end - if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin - operands_busy_n[2] = 0; - end - if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin - operands_busy_n[3] = 0; - end - end else begin - if (writeback_if.data.rd == staging_if[w].data.rd) begin - 
operands_busy_n[0] = 0; - end - if (writeback_if.data.rd == staging_if[w].data.rs1) begin - operands_busy_n[1] = 0; - end - if (writeback_if.data.rd == staging_if[w].data.rs2) begin - operands_busy_n[2] = 0; - end - if (writeback_if.data.rd == staging_if[w].data.rs3) begin - operands_busy_n[3] = 0; + operands_busy_n[i] = inuse_regs[ibuf_opds[i]]; + if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin + operands_busy_n[i] = 1; end end - end - if (staging_fire && staging_if[w].data.wb) begin - if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin - operands_busy_n[0] = 1; - end - if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin - operands_busy_n[1] = 1; - end - if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin - operands_busy_n[2] = 1; - end - if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin - operands_busy_n[3] = 1; + if (writeback_fire) begin + if (ibuffer_fire) begin + if (writeback_if.data.rd == ibuf_opds[i]) begin + operands_busy_n[i] = 0; + end + end else begin + if (writeback_if.data.rd == stg_opds[i]) begin + operands_busy_n[i] = 0; + end + end end end end @@ -230,8 +185,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #( inuse_regs[staging_if[w].data.rd] <= 1; end end + operands_busy <= operands_busy_n; operands_ready[w] <= ~(| operands_busy_n); + `ifdef PERF_ENABLE if (staging_fire && staging_if[w].data.wb) begin inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type; @@ -251,9 +208,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #( end else begin if (staging_if[w].valid && ~staging_if[w].ready) begin `ifdef DBG_TRACE_PIPELINE - `TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", + `TRACE(4, ("%t: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n", $time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr, - operands_busy, staging_if[w].data.uuid)); + operands_busy, 
staging_if[w].data.uuid)) `endif timeout_ctr <= timeout_ctr + 1; end else if (ibuffer_fire) begin @@ -265,11 +222,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #( `RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT), ("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)", $time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr, - operands_busy, staging_if[w].data.uuid)); + operands_busy, staging_if[w].data.uuid)) `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0, ("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)", - $time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid)); + $time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid)) `endif end @@ -278,23 +235,20 @@ module VX_scoreboard import VX_gpu_pkg::*; #( wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in; wire [PER_ISSUE_WARPS-1:0] arb_ready_in; - for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin + for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_arb_data_in assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w]; assign arb_data_in[w] = staging_if[w].data; assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w]; end - `RESET_RELAY (arb_reset, reset); - VX_stream_arb #( .NUM_INPUTS (PER_ISSUE_WARPS), .DATAW (DATAW), - .ARBITER ("F"), - .LUTRAM (1), - .OUT_BUF (4) // using 2-cycle EB for area reduction + .ARBITER ("C"), + .OUT_BUF (3) ) out_arb ( .clk (clk), - .reset (arb_reset), + .reset (reset), .valid_in (arb_valid_in), .ready_in (arb_ready_in), .data_in (arb_data_in), diff --git a/hw/rtl/core/VX_sfu_unit.sv b/hw/rtl/core/VX_sfu_unit.sv index 5ef4211d0..d91141a7b 100644 --- a/hw/rtl/core/VX_sfu_unit.sv +++ b/hw/rtl/core/VX_sfu_unit.sv @@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( input wire reset, `ifdef PERF_ENABLE - 
VX_mem_perf_if.slave mem_perf_if, - VX_pipeline_perf_if.slave pipeline_perf_if, + input sysmem_perf_t sysmem_perf, + input pipeline_perf_t pipeline_perf, `endif input base_dcrs_t base_dcrs, @@ -41,24 +41,25 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( VX_warp_ctl_if.master warp_ctl_if ); `UNUSED_SPARAM (INSTANCE_ID) - localparam BLOCK_SIZE = 1; - localparam NUM_LANES = `NUM_SFU_LANES; - localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); - localparam PID_WIDTH = `UP(PID_BITS); - - localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `PC_BITS + PID_WIDTH + 1 + 1; - localparam RSP_ARB_SIZE = 1 + 1; - localparam RSP_ARB_IDX_WCTL = 0; - localparam RSP_ARB_IDX_CSRS = 1; + localparam BLOCK_SIZE = 1; + localparam NUM_LANES = `NUM_SFU_LANES; + localparam PE_COUNT = 2; + localparam PE_SEL_BITS = `CLOG2(PE_COUNT); + localparam PE_IDX_WCTL = 0; + localparam PE_IDX_CSRS = 1; VX_execute_if #( .NUM_LANES (NUM_LANES) ) per_block_execute_if[BLOCK_SIZE](); + VX_commit_if #( + .NUM_LANES (NUM_LANES) + ) per_block_commit_if[BLOCK_SIZE](); + VX_dispatch_unit #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), - .OUT_BUF (1) + .OUT_BUF (3) ) dispatch_unit ( .clk (clk), .reset (reset), @@ -66,65 +67,62 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( .execute_if (per_block_execute_if) ); - wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in; - wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in; - wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in; - - // Warp control block VX_execute_if #( .NUM_LANES (NUM_LANES) - ) wctl_execute_if(); + ) pe_execute_if[PE_COUNT](); + VX_commit_if#( .NUM_LANES (NUM_LANES) - ) wctl_commit_if(); + ) pe_commit_if[PE_COUNT](); - assign wctl_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_WCTL(per_block_execute_if[0].data.op_type); - assign wctl_execute_if.data = per_block_execute_if[0].data; + reg [PE_SEL_BITS-1:0] pe_select; + always @(*) begin + pe_select = PE_IDX_WCTL; + if 
(`INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type)) + pe_select = PE_IDX_CSRS; + end - `RESET_RELAY (wctl_reset, reset); + VX_pe_switch #( + .PE_COUNT (PE_COUNT), + .NUM_LANES (NUM_LANES), + .ARBITER ("R"), + .REQ_OUT_BUF(0), + .RSP_OUT_BUF(3) + ) pe_switch ( + .clk (clk), + .reset (reset), + .pe_sel (pe_select), + .execute_in_if (per_block_execute_if[0]), + .commit_out_if (per_block_commit_if[0]), + .execute_out_if (pe_execute_if), + .commit_in_if (pe_commit_if) + ); VX_wctl_unit #( - .INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-wctl", INSTANCE_ID))), .NUM_LANES (NUM_LANES) ) wctl_unit ( .clk (clk), - .reset (wctl_reset), - .execute_if (wctl_execute_if), + .reset (reset), + .execute_if (pe_execute_if[PE_IDX_WCTL]), .warp_ctl_if(warp_ctl_if), - .commit_if (wctl_commit_if) + .commit_if (pe_commit_if[PE_IDX_WCTL]) ); - assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid; - assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.data; - assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL]; - - // CSR unit - VX_execute_if #( - .NUM_LANES (NUM_LANES) - ) csr_execute_if(); - VX_commit_if #( - .NUM_LANES (NUM_LANES) - ) csr_commit_if(); - - assign csr_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type); - assign csr_execute_if.data = per_block_execute_if[0].data; - - `RESET_RELAY (csr_reset, reset); - VX_csr_unit #( - .INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-csr", INSTANCE_ID))), .CORE_ID (CORE_ID), .NUM_LANES (NUM_LANES) ) csr_unit ( .clk (clk), - .reset (csr_reset), + .reset (reset), .base_dcrs (base_dcrs), - .execute_if (csr_execute_if), + .execute_if (pe_execute_if[PE_IDX_CSRS]), `ifdef PERF_ENABLE - .mem_perf_if (mem_perf_if), - .pipeline_perf_if(pipeline_perf_if), + .sysmem_perf (sysmem_perf), + .pipeline_perf (pipeline_perf), `endif `ifdef EXT_F_ENABLE @@ -133,47 +131,7 @@ module VX_sfu_unit import 
VX_gpu_pkg::*; #( .sched_csr_if (sched_csr_if), .commit_csr_if (commit_csr_if), - .commit_if (csr_commit_if) - ); - - assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid; - assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data; - assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS]; - - // can accept new request? - - reg sfu_req_ready; - always @(*) begin - case (per_block_execute_if[0].data.op_type) - `INST_SFU_CSRRW, - `INST_SFU_CSRRS, - `INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready; - default: sfu_req_ready = wctl_execute_if.ready; - endcase - end - assign per_block_execute_if[0].ready = sfu_req_ready; - - // response arbitration - - VX_commit_if #( - .NUM_LANES (NUM_LANES) - ) arb_commit_if[BLOCK_SIZE](); - - VX_stream_arb #( - .NUM_INPUTS (RSP_ARB_SIZE), - .DATAW (RSP_ARB_DATAW), - .ARBITER ("R"), - .OUT_BUF (3) - ) rsp_arb ( - .clk (clk), - .reset (reset), - .valid_in (rsp_arb_valid_in), - .ready_in (rsp_arb_ready_in), - .data_in (rsp_arb_data_in), - .data_out (arb_commit_if[0].data), - .valid_out (arb_commit_if[0].valid), - .ready_out (arb_commit_if[0].ready), - `UNUSED_PIN (sel_out) + .commit_if (pe_commit_if[PE_IDX_CSRS]) ); VX_gather_unit #( @@ -181,9 +139,9 @@ module VX_sfu_unit import VX_gpu_pkg::*; #( .NUM_LANES (NUM_LANES), .OUT_BUF (3) ) gather_unit ( - .clk (clk), - .reset (reset), - .commit_in_if (arb_commit_if), + .clk (clk), + .reset (reset), + .commit_in_if (per_block_commit_if), .commit_out_if (commit_if) ); diff --git a/hw/rtl/core/VX_split_join.sv b/hw/rtl/core/VX_split_join.sv index 7f887e602..c3f1f73f3 100644 --- a/hw/rtl/core/VX_split_join.sv +++ b/hw/rtl/core/VX_split_join.sv @@ -45,16 +45,13 @@ module VX_split_join import VX_gpu_pkg::*; #( wire ipdom_push = valid && split.valid && split.is_dvg; wire ipdom_pop = valid && sjoin.valid && sjoin_is_dvg; - for (genvar i = 0; i < `NUM_WARPS; ++i) begin - - `RESET_RELAY (ipdom_reset, reset); - + for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_ipdom_stacks 
VX_ipdom_stack #( .WIDTH (`NUM_THREADS+`PC_BITS), .DEPTH (`DV_STACK_SIZE) ) ipdom_stack ( .clk (clk), - .reset (ipdom_reset), + .reset (reset), .q0 (ipdom_q0), .q1 (ipdom_q1), .d (ipdom_data[i]), diff --git a/hw/rtl/core/VX_trace_pkg.sv b/hw/rtl/core/VX_trace_pkg.sv deleted file mode 100644 index b4eae96fe..000000000 --- a/hw/rtl/core/VX_trace_pkg.sv +++ /dev/null @@ -1,399 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -`ifndef VX_TRACE_PKG_VH -`define VX_TRACE_PKG_VH - -`include "VX_define.vh" - -package VX_trace_pkg; - -`ifdef SIMULATION - -`ifdef SV_DPI - import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/); -`endif - - import VX_gpu_pkg::*; - - task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type); - case (ex_type) - `EX_ALU: `TRACE(level, ("ALU")); - `EX_LSU: `TRACE(level, ("LSU")); - `EX_FPU: `TRACE(level, ("FPU")); - `EX_SFU: `TRACE(level, ("SFU")); - default: `TRACE(level, ("?")); - endcase - endtask - - task trace_ex_op(input int level, - input [`EX_BITS-1:0] ex_type, - input [`INST_OP_BITS-1:0] op_type, - input VX_gpu_pkg::op_args_t op_args - ); - case (ex_type) - `EX_ALU: begin - case (op_args.alu.xtype) - `ALU_TYPE_ARITH: begin - if (op_args.alu.is_w) begin - if (op_args.alu.use_imm) begin - case (`INST_ALU_BITS'(op_type)) - `INST_ALU_ADD: `TRACE(level, ("ADDIW")); - `INST_ALU_SLL: `TRACE(level, ("SLLIW")); - `INST_ALU_SRL: `TRACE(level, ("SRLIW")); - 
`INST_ALU_SRA: `TRACE(level, ("SRAIW")); - default: `TRACE(level, ("?")); - endcase - end else begin - case (`INST_ALU_BITS'(op_type)) - `INST_ALU_ADD: `TRACE(level, ("ADDW")); - `INST_ALU_SUB: `TRACE(level, ("SUBW")); - `INST_ALU_SLL: `TRACE(level, ("SLLW")); - `INST_ALU_SRL: `TRACE(level, ("SRLW")); - `INST_ALU_SRA: `TRACE(level, ("SRAW")); - default: `TRACE(level, ("?")); - endcase - end - end else begin - if (op_args.alu.use_imm) begin - case (`INST_ALU_BITS'(op_type)) - `INST_ALU_ADD: `TRACE(level, ("ADDI")); - `INST_ALU_SLL: `TRACE(level, ("SLLI")); - `INST_ALU_SRL: `TRACE(level, ("SRLI")); - `INST_ALU_SRA: `TRACE(level, ("SRAI")); - `INST_ALU_SLT: `TRACE(level, ("SLTI")); - `INST_ALU_SLTU: `TRACE(level, ("SLTIU")); - `INST_ALU_XOR: `TRACE(level, ("XORI")); - `INST_ALU_OR: `TRACE(level, ("ORI")); - `INST_ALU_AND: `TRACE(level, ("ANDI")); - `INST_ALU_LUI: `TRACE(level, ("LUI")); - `INST_ALU_AUIPC: `TRACE(level, ("AUIPC")); - default: `TRACE(level, ("?")); - endcase - end else begin - case (`INST_ALU_BITS'(op_type)) - `INST_ALU_ADD: `TRACE(level, ("ADD")); - `INST_ALU_SUB: `TRACE(level, ("SUB")); - `INST_ALU_SLL: `TRACE(level, ("SLL")); - `INST_ALU_SRL: `TRACE(level, ("SRL")); - `INST_ALU_SRA: `TRACE(level, ("SRA")); - `INST_ALU_SLT: `TRACE(level, ("SLT")); - `INST_ALU_SLTU: `TRACE(level, ("SLTU")); - `INST_ALU_XOR: `TRACE(level, ("XOR")); - `INST_ALU_OR: `TRACE(level, ("OR")); - `INST_ALU_AND: `TRACE(level, ("AND")); - `INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ")); - `INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ")); - default: `TRACE(level, ("?")); - endcase - end - end - end - `ALU_TYPE_BRANCH: begin - case (`INST_BR_BITS'(op_type)) - `INST_BR_EQ: `TRACE(level, ("BEQ")); - `INST_BR_NE: `TRACE(level, ("BNE")); - `INST_BR_LT: `TRACE(level, ("BLT")); - `INST_BR_GE: `TRACE(level, ("BGE")); - `INST_BR_LTU: `TRACE(level, ("BLTU")); - `INST_BR_GEU: `TRACE(level, ("BGEU")); - `INST_BR_JAL: `TRACE(level, ("JAL")); - `INST_BR_JALR: `TRACE(level, ("JALR")); - `INST_BR_ECALL: 
`TRACE(level, ("ECALL")); - `INST_BR_EBREAK:`TRACE(level, ("EBREAK")); - `INST_BR_URET: `TRACE(level, ("URET")); - `INST_BR_SRET: `TRACE(level, ("SRET")); - `INST_BR_MRET: `TRACE(level, ("MRET")); - default: `TRACE(level, ("?")); - endcase - end - `ALU_TYPE_MULDIV: begin - if (op_args.alu.is_w) begin - case (`INST_M_BITS'(op_type)) - `INST_M_MUL: `TRACE(level, ("MULW")); - `INST_M_DIV: `TRACE(level, ("DIVW")); - `INST_M_DIVU: `TRACE(level, ("DIVUW")); - `INST_M_REM: `TRACE(level, ("REMW")); - `INST_M_REMU: `TRACE(level, ("REMUW")); - default: `TRACE(level, ("?")); - endcase - end else begin - case (`INST_M_BITS'(op_type)) - `INST_M_MUL: `TRACE(level, ("MUL")); - `INST_M_MULH: `TRACE(level, ("MULH")); - `INST_M_MULHSU:`TRACE(level, ("MULHSU")); - `INST_M_MULHU: `TRACE(level, ("MULHU")); - `INST_M_DIV: `TRACE(level, ("DIV")); - `INST_M_DIVU: `TRACE(level, ("DIVU")); - `INST_M_REM: `TRACE(level, ("REM")); - `INST_M_REMU: `TRACE(level, ("REMU")); - default: `TRACE(level, ("?")); - endcase - end - end - default: `TRACE(level, ("?")); - endcase - end - `EX_LSU: begin - if (op_args.lsu.is_float) begin - case (`INST_LSU_BITS'(op_type)) - `INST_LSU_LW: `TRACE(level, ("FLW")); - `INST_LSU_LD: `TRACE(level, ("FLD")); - `INST_LSU_SW: `TRACE(level, ("FSW")); - `INST_LSU_SD: `TRACE(level, ("FSD")); - default: `TRACE(level, ("?")); - endcase - end else begin - case (`INST_LSU_BITS'(op_type)) - `INST_LSU_LB: `TRACE(level, ("LB")); - `INST_LSU_LH: `TRACE(level, ("LH")); - `INST_LSU_LW: `TRACE(level, ("LW")); - `INST_LSU_LD: `TRACE(level, ("LD")); - `INST_LSU_LBU:`TRACE(level, ("LBU")); - `INST_LSU_LHU:`TRACE(level, ("LHU")); - `INST_LSU_LWU:`TRACE(level, ("LWU")); - `INST_LSU_SB: `TRACE(level, ("SB")); - `INST_LSU_SH: `TRACE(level, ("SH")); - `INST_LSU_SW: `TRACE(level, ("SW")); - `INST_LSU_SD: `TRACE(level, ("SD")); - `INST_LSU_FENCE:`TRACE(level,("FENCE")); - default: `TRACE(level, ("?")); - endcase - end - end - `EX_FPU: begin - case (`INST_FPU_BITS'(op_type)) - `INST_FPU_ADD: 
begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FADD.D")); - else - `TRACE(level, ("FADD.S")); - end - `INST_FPU_SUB: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FSUB.D")); - else - `TRACE(level, ("FSUB.S")); - end - `INST_FPU_MUL: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FMUL.D")); - else - `TRACE(level, ("FMUL.S")); - end - `INST_FPU_DIV: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FDIV.D")); - else - `TRACE(level, ("FDIV.S")); - end - `INST_FPU_SQRT: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FSQRT.D")); - else - `TRACE(level, ("FSQRT.S")); - end - `INST_FPU_MADD: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FMADD.D")); - else - `TRACE(level, ("FMADD.S")); - end - `INST_FPU_MSUB: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FMSUB.D")); - else - `TRACE(level, ("FMSUB.S")); - end - `INST_FPU_NMADD: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FNMADD.D")); - else - `TRACE(level, ("FNMADD.S")); - end - `INST_FPU_NMSUB: begin - if (op_args.fpu.fmt[0]) - `TRACE(level, ("FNMSUB.D")); - else - `TRACE(level, ("FNMSUB.S")); - end - `INST_FPU_CMP: begin - if (op_args.fpu.fmt[0]) begin - case (op_args.fpu.frm[1:0]) - 0: `TRACE(level, ("FLE.D")); - 1: `TRACE(level, ("FLT.D")); - 2: `TRACE(level, ("FEQ.D")); - default: `TRACE(level, ("?")); - endcase - end else begin - case (op_args.fpu.frm[1:0]) - 0: `TRACE(level, ("FLE.S")); - 1: `TRACE(level, ("FLT.S")); - 2: `TRACE(level, ("FEQ.S")); - default: `TRACE(level, ("?")); - endcase - end - end - `INST_FPU_F2F: begin - if (op_args.fpu.fmt[0]) begin - `TRACE(level, ("FCVT.D.S")); - end else begin - `TRACE(level, ("FCVT.S.D")); - end - end - `INST_FPU_F2I: begin - if (op_args.fpu.fmt[0]) begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.L.D")); - end else begin - `TRACE(level, ("FCVT.W.D")); - end - end else begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.L.S")); - end else begin - `TRACE(level, ("FCVT.W.S")); - end - end - end - `INST_FPU_F2U: begin - if 
(op_args.fpu.fmt[0]) begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.LU.D")); - end else begin - `TRACE(level, ("FCVT.WU.D")); - end - end else begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.LU.S")); - end else begin - `TRACE(level, ("FCVT.WU.S")); - end - end - end - `INST_FPU_I2F: begin - if (op_args.fpu.fmt[0]) begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.D.L")); - end else begin - `TRACE(level, ("FCVT.D.W")); - end - end else begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.S.L")); - end else begin - `TRACE(level, ("FCVT.S.W")); - end - end - end - `INST_FPU_U2F: begin - if (op_args.fpu.fmt[0]) begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.D.LU")); - end else begin - `TRACE(level, ("FCVT.D.WU")); - end - end else begin - if (op_args.fpu.fmt[1]) begin - `TRACE(level, ("FCVT.S.LU")); - end else begin - `TRACE(level, ("FCVT.S.WU")); - end - end - end - `INST_FPU_MISC: begin - if (op_args.fpu.fmt[0]) begin - case (op_args.fpu.frm) - 0: `TRACE(level, ("FSGNJ.D")); - 1: `TRACE(level, ("FSGNJN.D")); - 2: `TRACE(level, ("FSGNJX.D")); - 3: `TRACE(level, ("FCLASS.D")); - 4: `TRACE(level, ("FMV.X.D")); - 5: `TRACE(level, ("FMV.D.X")); - 6: `TRACE(level, ("FMIN.D")); - 7: `TRACE(level, ("FMAX.D")); - endcase - end else begin - case (op_args.fpu.frm) - 0: `TRACE(level, ("FSGNJ.S")); - 1: `TRACE(level, ("FSGNJN.S")); - 2: `TRACE(level, ("FSGNJX.S")); - 3: `TRACE(level, ("FCLASS.S")); - 4: `TRACE(level, ("FMV.X.S")); - 5: `TRACE(level, ("FMV.S.X")); - 6: `TRACE(level, ("FMIN.S")); - 7: `TRACE(level, ("FMAX.S")); - endcase - end - end - default: `TRACE(level, ("?")); - endcase - end - `EX_SFU: begin - case (`INST_SFU_BITS'(op_type)) - `INST_SFU_TMC: `TRACE(level, ("TMC")); - `INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN")); - `INST_SFU_SPLIT: begin if (op_args.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end - `INST_SFU_JOIN: `TRACE(level, ("JOIN")); - `INST_SFU_BAR: `TRACE(level, 
("BAR")); - `INST_SFU_PRED: begin if (op_args.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end - `INST_SFU_CSRRW: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end - `INST_SFU_CSRRS: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end - `INST_SFU_CSRRC: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end - default: `TRACE(level, ("?")); - endcase - end - default: `TRACE(level, ("?")); - endcase - endtask - - task trace_op_args(input int level, - input [`EX_BITS-1:0] ex_type, - input [`INST_OP_BITS-1:0] op_type, - input VX_gpu_pkg::op_args_t op_args - ); - case (ex_type) - `EX_ALU: begin - `TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm)); - end - `EX_LSU: begin - `TRACE(level, (", offset=0x%0h", op_args.lsu.offset)); - end - `EX_FPU: begin - `TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm)); - end - `EX_SFU: begin - if (`INST_SFU_IS_CSR(op_type)) begin - `TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm)); - end - end - default:; - endcase - endtask - - task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr); - case (addr) - `VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0")); - `VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1")); - `VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0")); - `VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1")); - `VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS")); - default: `TRACE(level, ("?")); - endcase - endtask - -`endif - -endpackage - -`endif // VX_TRACE_PKG_VH diff --git a/hw/rtl/core/VX_uuid_gen.sv b/hw/rtl/core/VX_uuid_gen.sv new file mode 100644 index 000000000..cbde9091d --- /dev/null +++ b/hw/rtl/core/VX_uuid_gen.sv @@ -0,0 +1,44 @@ +// Copyright © 2019-2023 +// +// Licensed under 
the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_define.vh" + +module VX_uuid_gen import VX_gpu_pkg::*; #( + parameter CORE_ID = 0, + parameter UUID_WIDTH = 48 +) ( + input wire clk, + input wire reset, + input wire incr, + input wire [`NW_WIDTH-1:0] wid, + output wire [UUID_WIDTH-1:0] uuid +); + localparam GNW_WIDTH = UUID_WIDTH - 32; + reg [31:0] uuid_cntrs [0:`NUM_WARPS-1]; + reg [`NUM_WARPS-1:0] has_uuid_cntrs; + + always @(posedge clk) begin + if (reset) begin + has_uuid_cntrs <= '0; + end else if (incr) begin + has_uuid_cntrs[wid] <= 1; + end + if (incr) begin + uuid_cntrs[wid] <= has_uuid_cntrs[wid] ? (uuid_cntrs[wid] + 1) : 1; + end + end + + wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid); + assign uuid = {g_wid, (has_uuid_cntrs[wid] ? 
uuid_cntrs[wid] : 0)}; + +endmodule diff --git a/hw/rtl/core/VX_wctl_unit.sv b/hw/rtl/core/VX_wctl_unit.sv index 132f679d4..bb85b70c9 100644 --- a/hw/rtl/core/VX_wctl_unit.sv +++ b/hw/rtl/core/VX_wctl_unit.sv @@ -50,9 +50,9 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR); wire [`UP(LANE_BITS)-1:0] tid; - if (LANE_BITS != 0) begin + if (LANE_BITS != 0) begin : g_tid assign tid = execute_if.data.tid[0 +: LANE_BITS]; - end else begin + end else begin : g_no_tid assign tid = 0; end @@ -63,7 +63,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( wire not_pred = execute_if.data.op_args.wctl.is_neg; wire [NUM_LANES-1:0] taken; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_taken assign taken[i] = (execute_if.data.rs1_data[i][0] ^ not_pred); end @@ -131,7 +131,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( // wspawn wire [`NUM_WARPS-1:0] wspawn_wmask; - for (genvar i = 0; i < `NUM_WARPS; ++i) begin + for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_wspawn_wmask assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid); end assign wspawn.valid = is_wspawn; @@ -162,7 +162,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #( assign warp_ctl_if.sjoin = sjoin_r; assign warp_ctl_if.barrier = barrier_r; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit_if assign commit_if.data.data[i] = `XLEN'(dvstack_ptr); end diff --git a/hw/rtl/fpu/VX_fcvt_unit.sv b/hw/rtl/fpu/VX_fcvt_unit.sv index b5b7b1690..5756a25ed 100644 --- a/hw/rtl/fpu/VX_fcvt_unit.sv +++ b/hw/rtl/fpu/VX_fcvt_unit.sv @@ -1,17 +1,17 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -// Modified port of cast module from fpnew Libray +// Modified port of cast module from fpnew Libray // reference: https://github.com/pulp-platform/fpnew `include "VX_fpu_define.vh" @@ -22,7 +22,8 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( parameter LATENCY = 1, parameter INT_WIDTH = 32, parameter MAN_BITS = 23, - parameter EXP_BITS = 8 + parameter EXP_BITS = 8, + parameter OUT_REG = 0 ) ( input wire clk, input wire reset, @@ -35,10 +36,10 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( input wire is_signed, input wire [31:0] dataa, - output wire [31:0] result, + output wire [31:0] result, output wire [`FP_FLAGS_BITS-1:0] fflags -); +); // Constants localparam EXP_BIAS = 2**(EXP_BITS-1)-1; @@ -55,11 +56,11 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( localparam FMT_SHIFT_COMPENSATION = S_MAN_WIDTH - 1 - MAN_BITS; localparam NUM_FP_STICKY = 2 * S_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R localparam NUM_INT_STICKY = 2 * S_MAN_WIDTH - INT_WIDTH; // removed int and R - + // Input processing - - fclass_t fclass; - VX_fp_classifier #( + + fclass_t fclass; + VX_fp_classifier #( .EXP_BITS (EXP_BITS), .MAN_BITS (MAN_BITS) ) fp_classifier ( @@ -69,9 +70,9 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( ); wire [S_MAN_WIDTH-1:0] input_mant; - wire [S_EXP_WIDTH-1:0] input_exp; + wire [S_EXP_WIDTH-1:0] input_exp; wire input_sign; - + wire i2f_sign = dataa[INT_WIDTH-1]; wire f2i_sign = dataa[INT_WIDTH-1] && is_signed; wire [S_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? 
(-dataa) : dataa; @@ -81,7 +82,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( assign input_sign = is_itof ? f2i_sign : i2f_sign; // Pipeline stage0 - + wire is_itof_s0; wire is_signed_s0; wire [2:0] rnd_mode_s0; @@ -92,7 +93,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( VX_pipe_register #( .DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH), - .DEPTH (LATENCY > 2) + .DEPTH (LATENCY > 1) ) pipe_reg0 ( .clk (clk), .reset (reset), @@ -100,7 +101,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( .data_in ({is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}), .data_out ({is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0}) ); - + // Normalization wire [LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount @@ -113,12 +114,12 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( .data_out (renorm_shamt_s0), .valid_out (mant_is_nonzero_s0) ); - + wire mant_is_zero_s0 = ~mant_is_nonzero_s0; - wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa + wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa wire [S_EXP_WIDTH-1:0] input_exp_n_s0; // unbiased true exponent - + // Realign input mantissa, append zeroes if destination is wider assign input_mant_n_s0 = encoded_mant_s0 << renorm_shamt_s0; @@ -140,7 +141,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( VX_pipe_register #( .DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH), - .DEPTH (LATENCY > 1) + .DEPTH (LATENCY > 2) ) pipe_reg1 ( .clk (clk), .reset (reset), @@ -169,30 +170,30 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( wire of_before_round_s1 = overflow; // Pipeline stage2 - + wire is_itof_s2; wire is_signed_s2; wire [2:0] rnd_mode_s2; - fclass_t fclass_s2; + fclass_t fclass_s2; wire mant_is_zero_s2; wire input_sign_s2; wire [2*S_MAN_WIDTH:0] destination_mant_s2; wire [EXP_BITS-1:0] final_exp_s2; wire of_before_round_s2; - + VX_pipe_register 
#( .DATAW (1 + 1 + `INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1), - .DEPTH (LATENCY > 3) + .DEPTH (LATENCY > 0) ) pipe_reg2 ( .clk (clk), .reset (reset), .enable (enable), .data_in ({is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}), .data_out ({is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2}) - ); - + ); + // Rouding and classification - + wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2; @@ -237,20 +238,20 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( wire is_itof_s3; wire is_signed_s3; - fclass_t fclass_s3; + fclass_t fclass_s3; wire mant_is_zero_s3; wire input_sign_s3; wire rounded_sign_s3; wire [INT_WIDTH-1:0] rounded_abs_s3; - wire of_before_round_s3; + wire of_before_round_s3; wire f2i_round_has_sticky_s3; wire i2f_round_has_sticky_s3; - `UNUSED_VAR (fclass_s3) + `UNUSED_VAR (fclass_s3) VX_pipe_register #( .DATAW (1 + 1 + $bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1), - .DEPTH (LATENCY > 4) + .DEPTH (LATENCY > 3) ) pipe_reg3 ( .clk (clk), .reset (reset), @@ -258,7 +259,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( .data_in ({is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}), .data_out ({is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3}) ); - + // Assemble regular result, nan box short ones. Int zeroes need to be detected wire [INT_WIDTH-1:0] fmt_result_s3 = mant_is_zero_s3 ? 
0 : {rounded_sign_s3, rounded_abs_s3[EXP_BITS+MAN_BITS-1:0]}; @@ -278,18 +279,18 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( f2i_special_result_s3[INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1 f2i_special_result_s3[INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31 end - end + end // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned) - wire f2i_result_is_special_s3 = fclass_s3.is_nan + wire f2i_result_is_special_s3 = fclass_s3.is_nan | fclass_s3.is_inf | of_before_round_s3 | (input_sign_s3 & ~is_signed_s3 & ~rounded_int_res_zero_s3); - + fflags_t f2i_special_status_s3; fflags_t i2f_status_s3, f2i_status_s3; fflags_t tmp_fflags_s3; - + // All integer special cases are invalid assign f2i_special_status_s3 = {1'b1, 4'h0}; @@ -306,7 +307,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #( VX_pipe_register #( .DATAW (32 + `FP_FLAGS_BITS), - .DEPTH (LATENCY > 0) + .DEPTH (OUT_REG) ) pipe_reg4 ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fncp_unit.sv b/hw/rtl/fpu/VX_fncp_unit.sv index a0876dcd7..27836fcbc 100644 --- a/hw/rtl/fpu/VX_fncp_unit.sv +++ b/hw/rtl/fpu/VX_fncp_unit.sv @@ -1,17 +1,17 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-// Modified port of noncomp module from fpnew Libray +// Modified port of noncomp module from fpnew Libray // reference: https://github.com/pulp-platform/fpnew `include "VX_fpu_define.vh" @@ -19,9 +19,10 @@ `ifdef FPU_DSP module VX_fncp_unit import VX_fpu_pkg::*; #( - parameter LATENCY = 2, + parameter LATENCY = 1, parameter EXP_BITS = 8, - parameter MAN_BITS = 23 + parameter MAN_BITS = 23, + parameter OUT_REG = 0 ) ( input wire clk, input wire reset, @@ -33,10 +34,10 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( input wire [31:0] dataa, input wire [31:0] datab, - output wire [31:0] result, + output wire [31:0] result, output wire [`FP_FLAGS_BITS-1:0] fflags -); +); localparam NEG_INF = 32'h00000001, NEG_NORM = 32'h00000002, NEG_SUBNORM = 32'h00000004, @@ -55,15 +56,15 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( wire a_smaller, ab_equal; // Setup - assign a_sign = dataa[31]; + assign a_sign = dataa[31]; assign a_exponent = dataa[30:23]; assign a_mantissa = dataa[22:0]; - assign b_sign = datab[31]; + assign b_sign = datab[31]; assign b_exponent = datab[30:23]; assign b_mantissa = datab[22:0]; - VX_fp_classifier #( + VX_fp_classifier #( .EXP_BITS (EXP_BITS), .MAN_BITS (MAN_BITS) ) fp_class_a ( @@ -72,7 +73,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( .clss_o (a_fclass) ); - VX_fp_classifier #( + VX_fp_classifier #( .EXP_BITS (EXP_BITS), .MAN_BITS (MAN_BITS) ) fp_class_b ( @@ -82,7 +83,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( ); assign a_smaller = (dataa < datab) ^ (a_sign || b_sign); - assign ab_equal = (dataa == datab) + assign ab_equal = (dataa == datab) || (a_fclass.is_zero && b_fclass.is_zero); // +0 == -0 // Pipeline stage0 @@ -101,54 +102,54 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( VX_pipe_register #( .DATAW (4 + 2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1), - .DEPTH (LATENCY > 1) + .DEPTH (LATENCY > 0) ) pipe_reg0 ( .clk (clk), .reset (reset), .enable (enable), .data_in ({op_mod, dataa, datab, a_sign, b_sign, a_exponent, 
a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}), .data_out ({op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0}) - ); + ); // FCLASS reg [31:0] fclass_mask_s0; // generate a 10-bit mask for integer reg - always @(*) begin + always @(*) begin if (a_fclass_s0.is_normal) begin fclass_mask_s0 = a_sign_s0 ? NEG_NORM : POS_NORM; - end + end else if (a_fclass_s0.is_inf) begin fclass_mask_s0 = a_sign_s0 ? NEG_INF : POS_INF; - end + end else if (a_fclass_s0.is_zero) begin fclass_mask_s0 = a_sign_s0 ? NEG_ZERO : POS_ZERO; - end + end else if (a_fclass_s0.is_subnormal) begin fclass_mask_s0 = a_sign_s0 ? NEG_SUBNORM : POS_SUBNORM; - end + end else if (a_fclass_s0.is_nan) begin fclass_mask_s0 = {22'h0, a_fclass_s0.is_quiet, a_fclass_s0.is_signaling, 8'h0}; - end - else begin + end + else begin fclass_mask_s0 = QUT_NAN; end end - // Min/Max + // Min/Max reg [31:0] fminmax_res_s0; always @(*) begin if (a_fclass_s0.is_nan && b_fclass_s0.is_nan) fminmax_res_s0 = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN - else if (a_fclass_s0.is_nan) + else if (a_fclass_s0.is_nan) fminmax_res_s0 = datab_s0; - else if (b_fclass_s0.is_nan) + else if (b_fclass_s0.is_nan) fminmax_res_s0 = dataa_s0; - else begin + else begin // FMIN, FMAX fminmax_res_s0 = (op_mod_s0[0] ^ a_smaller_s0) ? 
dataa_s0 : datab_s0; end end - // Sign injection + // Sign injection reg [31:0] fsgnj_res_s0; // result of sign injection always @(*) begin case (op_mod_s0[1:0]) @@ -158,12 +159,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( endcase end - // Comparison + // Comparison reg fcmp_res_s0; // result of comparison reg fcmp_fflags_NV_s0; // comparison fflags always @(*) begin case (op_mod_s0[1:0]) - 0: begin // LE + 0: begin // LE if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin fcmp_res_s0 = 0; fcmp_fflags_NV_s0 = 1; @@ -179,12 +180,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( end else begin fcmp_res_s0 = (a_smaller_s0 & ~ab_equal_s0); fcmp_fflags_NV_s0 = 0; - end + end end 2: begin // EQ if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin fcmp_res_s0 = 0; - fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling; + fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling; end else begin fcmp_res_s0 = ab_equal_s0; fcmp_fflags_NV_s0 = 0; @@ -192,7 +193,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( end default: begin fcmp_res_s0 = 'x; - fcmp_fflags_NV_s0 = 'x; + fcmp_fflags_NV_s0 = 'x; end endcase end @@ -216,7 +217,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( // FMV result_s0 = dataa_s0; fflags_NV_s0 = 0; - end + end 6,7: begin // MIN/MAX result_s0 = fminmax_res_s0; @@ -229,7 +230,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #( VX_pipe_register #( .DATAW (32 + 1), - .DEPTH (LATENCY > 0) + .DEPTH (OUT_REG) ) pipe_reg1 ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_fpu_cvt.sv b/hw/rtl/fpu/VX_fpu_cvt.sv index 37a2ab419..2d0d52753 100644 --- a/hw/rtl/fpu/VX_fpu_cvt.sv +++ b/hw/rtl/fpu/VX_fpu_cvt.sv @@ -46,56 +46,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out ); - `UNUSED_VAR (frm) + localparam DATAW = 32 + `INST_FRM_BITS + 1 + 1; + + wire [NUM_LANES-1:0][DATAW-1:0] data_in; wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; 
fflags_t [NUM_LANES-1:0] fflags_out; wire pe_enable; - wire [NUM_PES-1:0][31:0] pe_data_in; + wire [NUM_PES-1:0][DATAW-1:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in + assign data_in[i][0 +: 32] = dataa[i]; + assign data_in[i][32 +: `INST_FRM_BITS] = frm; + assign data_in[i][32 + `INST_FRM_BITS +: 1] = is_itof; + assign data_in[i][32 + `INST_FRM_BITS + 1 +: 1] = is_signed; + end + VX_pe_serializer #( .NUM_LANES (NUM_LANES), .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FCVT), - .DATA_IN_WIDTH(32), - .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), + .DATA_IN_WIDTH (DATAW), + .DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) + .OUT_BUF (2) ) pe_serializer ( .clk (clk), .reset (reset), .valid_in (valid_in), - .data_in (dataa), + .data_in (data_in), .tag_in ({mask_in, tag_in}), .ready_in (ready_in), .pe_enable (pe_enable), - .pe_data_in (pe_data_in), - .pe_data_out(pe_data_out), + .pe_data_out(pe_data_in), + .pe_data_in (pe_data_out), .valid_out (valid_out), .data_out (data_out), .tag_out ({mask_out, tag_out}), .ready_out (ready_out) ); - for (genvar i = 0; i < NUM_LANES; ++i) begin + `UNUSED_VAR (pe_data_in) + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result assign result[i] = data_out[i][0 +: 32]; assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS]; end - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fcvt_units VX_fcvt_unit #( - .LATENCY (`LATENCY_FCVT) + .LATENCY (`LATENCY_FCVT), + .OUT_REG (1) ) fcvt_unit ( .clk (clk), .reset (reset), .enable (pe_enable), - .frm (frm), - .is_itof (is_itof), - .is_signed (is_signed), + .frm (pe_data_in[0][32 +: `INST_FRM_BITS]), + .is_itof (pe_data_in[0][32 + `INST_FRM_BITS +: 1]), + .is_signed (pe_data_in[0][32 + `INST_FRM_BITS + 1 +: 1]), .dataa (pe_data_in[i][0 +: 32]), .result (pe_data_out[i][0 +: 32]), .fflags 
(pe_data_out[i][32 +: `FP_FLAGS_BITS]) diff --git a/hw/rtl/fpu/VX_fpu_div.sv b/hw/rtl/fpu/VX_fpu_div.sv index 81fc8f022..2238307a6 100644 --- a/hw/rtl/fpu/VX_fpu_div.sv +++ b/hw/rtl/fpu/VX_fpu_div.sv @@ -44,31 +44,33 @@ module VX_fpu_div import VX_fpu_pkg::*; #( output wire valid_out, input wire ready_out ); - `UNUSED_VAR (frm) + localparam DATAW = 2 * 32 + `INST_FRM_BITS; + + wire [NUM_LANES-1:0][DATAW-1:0] data_in; - wire [NUM_LANES-1:0][2*32-1:0] data_in; wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out; wire pe_enable; - wire [NUM_PES-1:0][2*32-1:0] pe_data_in; + wire [NUM_PES-1:0][DATAW-1:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in assign data_in[i][0 +: 32] = dataa[i]; assign data_in[i][32 +: 32] = datab[i]; + assign data_in[i][64 +: `INST_FRM_BITS] = frm; end VX_pe_serializer #( .NUM_LANES (NUM_LANES), .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FDIV), - .DATA_IN_WIDTH(2*32), - .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), + .DATA_IN_WIDTH (DATAW), + .DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 
1 : 0) + .OUT_BUF (2) ) pe_serializer ( .clk (clk), .reset (reset), @@ -77,15 +79,17 @@ module VX_fpu_div import VX_fpu_pkg::*; #( .tag_in ({mask_in, tag_in}), .ready_in (ready_in), .pe_enable (pe_enable), - .pe_data_in (pe_data_in), - .pe_data_out(pe_data_out), + .pe_data_out(pe_data_in), + .pe_data_in (pe_data_out), .valid_out (valid_out), .data_out (data_out), .tag_out ({mask_out, tag_out}), .ready_out (ready_out) ); - for (genvar i = 0; i < NUM_LANES; ++i) begin + `UNUSED_VAR (pe_data_in) + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result assign result[i] = data_out[i][0 +: 32]; assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS]; end @@ -94,7 +98,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( `ifdef QUARTUS - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs acl_fdiv fdiv ( .clk (clk), .areset (1'b0), @@ -112,7 +116,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( `elsif VIVADO - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs wire [3:0] tuser; xil_fdiv fdiv ( .aclk (clk), @@ -134,7 +138,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #( `else - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs reg [63:0] r; `UNUSED_VAR (r) fflags_t f; @@ -143,9 +147,9 @@ module VX_fpu_div import VX_fpu_pkg::*; #( dpi_fdiv ( pe_enable, int'(0), - {32'hffffffff, pe_data_in[i][0 +: 32]}, - {32'hffffffff, pe_data_in[i][32 +: 32]}, - frm, + {32'hffffffff, pe_data_in[i][0 +: 32]}, // a + {32'hffffffff, pe_data_in[i][32 +: 32]}, // b + pe_data_in[0][64 +: `INST_FRM_BITS], // frm r, f ); diff --git a/hw/rtl/fpu/VX_fpu_dpi.sv b/hw/rtl/fpu/VX_fpu_dpi.sv index 781b5b88e..e900e105c 100644 --- a/hw/rtl/fpu/VX_fpu_dpi.sv +++ b/hw/rtl/fpu/VX_fpu_dpi.sv @@ -76,7 +76,6 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub; reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, 
is_ftou, is_f2f; - reg dst_fmt, int_fmt; reg [NUM_LANES-1:0][63:0] operands [3]; @@ -88,7 +87,8 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( end end - `UNUSED_VAR (fmt) + wire f_fmt = fmt[0]; + wire i_fmt = fmt[1]; always @(*) begin is_fadd = 0; @@ -106,25 +106,11 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( is_ftou = 0; is_f2f = 0; - dst_fmt = 0; - int_fmt = 0; - - `ifdef FLEN_64 - dst_fmt = fmt[0]; - `endif - - `ifdef XLEN_64 - int_fmt = fmt[1]; - `endif - case (op_type) - `INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end - `INST_FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end + `INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = ~i_fmt; is_fsub = i_fmt; end + `INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = ~i_fmt; is_fmsub = i_fmt; end + `INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = ~i_fmt; is_fnmsub = i_fmt; end `INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end - `INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end - `INST_FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end - `INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end - `INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end `INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end `INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end `INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end @@ -138,7 +124,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( end generate - begin : fma + begin : g_fma reg [NUM_LANES-1:0][`XLEN-1:0] result_fma; reg [NUM_LANES-1:0][63:0] result_fadd; @@ -164,13 +150,13 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( always @(*) begin for (integer i = 0; i < NUM_LANES; ++i) begin - dpi_fadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]); - dpi_fsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]); - dpi_fmul (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], 
fflags_fmul[i]); - dpi_fmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]); - dpi_fmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]); - dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]); - dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]); + dpi_fadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]); + dpi_fsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]); + dpi_fmul (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]); + dpi_fmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]); + dpi_fmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]); + dpi_fnmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]); + dpi_fnmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]); result_fma[i] = is_fadd ? result_fadd[i][`XLEN-1:0] : is_fsub ? 
result_fsub[i][`XLEN-1:0] : @@ -214,7 +200,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( endgenerate generate - begin : fdiv + begin : g_fdiv reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r; reg [NUM_LANES-1:0][63:0] result_fdiv; @@ -226,7 +212,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( always @(*) begin for (integer i = 0; i < NUM_LANES; ++i) begin - dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]); + dpi_fdiv (fdiv_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]); result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0]; end end @@ -253,7 +239,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( endgenerate generate - begin : fsqrt + begin : g_fsqrt reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r; reg [NUM_LANES-1:0][63:0] result_fsqrt; @@ -265,7 +251,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( always @(*) begin for (integer i = 0; i < NUM_LANES; ++i) begin - dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]); + dpi_fsqrt (fsqrt_fire, int'(f_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]); result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0]; end end @@ -292,7 +278,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( endgenerate generate - begin : fcvt + begin : g_fcvt reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt; reg [NUM_LANES-1:0][63:0] result_itof; @@ -313,11 +299,11 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( always @(*) begin for (integer i = 0; i < NUM_LANES; ++i) begin - dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]); - dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]); - dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]); - dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]); - dpi_f2f (fcvt_fire, int'(dst_fmt), 
operands[0][i], result_f2f[i]); + dpi_itof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]); + dpi_utof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]); + dpi_ftoi (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]); + dpi_ftou (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]); + dpi_f2f (fcvt_fire, int'(f_fmt), operands[0][i], result_f2f[i]); result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] : is_utof ? result_utof[i][`XLEN-1:0] : @@ -356,7 +342,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( endgenerate generate - begin : fncp + begin : g_fncp reg [NUM_LANES-1:0][`XLEN-1:0] result_fncp; reg [NUM_LANES-1:0][63:0] result_fclss; @@ -384,17 +370,17 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( always @(*) begin for (integer i = 0; i < NUM_LANES; ++i) begin - dpi_fclss (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]); - dpi_fle (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]); - dpi_flt (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]); - dpi_feq (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]); - dpi_fmin (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]); - dpi_fmax (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]); - dpi_fsgnj (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]); - dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]); - dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]); - result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension - result_fmvf[i] = dst_fmt ? 
operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing + dpi_fclss (fncp_fire, int'(f_fmt), operands[0][i], result_fclss[i]); + dpi_fle (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]); + dpi_flt (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]); + dpi_feq (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]); + dpi_fmin (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]); + dpi_fmax (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]); + dpi_fsgnj (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnj[i]); + dpi_fsgnjn (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]); + dpi_fsgnjx (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]); + result_fmvx[i] = f_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension + result_fmvf[i] = f_fmt ? 
operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing end end @@ -444,7 +430,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( VX_stream_arb #( .NUM_INPUTS (2), .DATAW (RSP_DATAW), - .ARBITER ("R"), + .ARBITER ("P"), .OUT_BUF (0) ) div_sqrt_arb ( .clk (clk), @@ -463,14 +449,14 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #( wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out; - for (genvar i = 0; i < NUM_FPC; ++i) begin + for (genvar i = 0; i < NUM_FPC; ++i) begin : g_per_core_data_out assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]}; end VX_stream_arb #( .NUM_INPUTS (NUM_FPC), .DATAW (RSP_DATAW), - .ARBITER ("F"), + .ARBITER ("R"), .OUT_BUF (OUT_BUF) ) rsp_arb ( .clk (clk), diff --git a/hw/rtl/fpu/VX_fpu_dsp.sv b/hw/rtl/fpu/VX_fpu_dsp.sv index ad398dcd7..af75c8a75 100644 --- a/hw/rtl/fpu/VX_fpu_dsp.sv +++ b/hw/rtl/fpu/VX_fpu_dsp.sv @@ -51,68 +51,39 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #( localparam FPU_DIVSQRT = 1; localparam FPU_CVT = 2; localparam FPU_NCP = 3; - localparam NUM_FPC = 4; - localparam FPC_BITS = `LOG2UP(NUM_FPC); + localparam NUM_FPCORES = 4; + localparam FPCORES_BITS = `LOG2UP(NUM_FPCORES); + localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + 3 * (NUM_LANES * 32); localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH; `UNUSED_VAR (fmt) - wire [NUM_FPC-1:0] per_core_ready_in; - wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result; - wire [NUM_FPC-1:0][TAG_WIDTH-1:0] per_core_tag_out; - wire [NUM_FPC-1:0] per_core_ready_out; - wire [NUM_FPC-1:0] per_core_valid_out; - wire [NUM_FPC-1:0] per_core_has_fflags; - fflags_t [NUM_FPC-1:0] per_core_fflags; + wire [NUM_FPCORES-1:0] per_core_valid_in; + wire [NUM_FPCORES-1:0][REQ_DATAW-1:0] per_core_data_in; + wire [NUM_FPCORES-1:0] per_core_ready_in; - wire div_ready_in, sqrt_ready_in; - wire [NUM_LANES-1:0][31:0] div_result, sqrt_result; - wire 
[TAG_WIDTH-1:0] div_tag_out, sqrt_tag_out; - wire div_ready_out, sqrt_ready_out; - wire div_valid_out, sqrt_valid_out; - wire div_has_fflags, sqrt_has_fflags; - fflags_t div_fflags, sqrt_fflags; + wire [NUM_FPCORES-1:0][NUM_LANES-1:0] per_core_mask_in; + wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_in; + wire [NUM_FPCORES-1:0][`INST_FPU_BITS-1:0] per_core_op_type; + wire [NUM_FPCORES-1:0][`INST_FMT_BITS-1:0] per_core_fmt; + wire [NUM_FPCORES-1:0][`INST_FRM_BITS-1:0] per_core_frm; + wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_dataa; + wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datab; + wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datac; - reg [FPC_BITS-1:0] core_select; - reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed; - - always @(*) begin - is_madd = 0; - is_sub = 0; - is_neg = 0; - is_div = 0; - is_itof = 0; - is_signed = 0; - case (op_type) - `INST_FPU_ADD: begin core_select = FPU_FMA; end - `INST_FPU_SUB: begin core_select = FPU_FMA; is_sub = 1; end - `INST_FPU_MUL: begin core_select = FPU_FMA; is_neg = 1; end - `INST_FPU_MADD: begin core_select = FPU_FMA; is_madd = 1; end - `INST_FPU_MSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; end - `INST_FPU_NMADD: begin core_select = FPU_FMA; is_madd = 1; is_neg = 1; end - `INST_FPU_NMSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; is_neg = 1; end - `INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end - `INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end - `INST_FPU_F2I: begin core_select = FPU_CVT; is_signed = 1; end - `INST_FPU_F2U: begin core_select = FPU_CVT; end - `INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end - `INST_FPU_U2F: begin core_select = FPU_CVT; is_itof = 1; end - default: begin core_select = FPU_NCP; end - endcase - end - - `RESET_RELAY (fma_reset, reset); - `RESET_RELAY (div_reset, reset); - `RESET_RELAY (sqrt_reset, reset); - `RESET_RELAY (cvt_reset, reset); - `RESET_RELAY (ncp_reset, reset); + wire 
[NUM_FPCORES-1:0] per_core_valid_out; + wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_result; + wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_out; + wire [NUM_FPCORES-1:0] per_core_has_fflags; + fflags_t [NUM_FPCORES-1:0] per_core_fflags; + wire [NUM_FPCORES-1:0] per_core_ready_out; wire [NUM_LANES-1:0][31:0] dataa_s; wire [NUM_LANES-1:0][31:0] datab_s; wire [NUM_LANES-1:0][31:0] datac_s; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data assign dataa_s[i] = dataa[i][31:0]; assign datab_s[i] = datab[i][31:0]; assign datac_s[i] = datac[i][31:0]; @@ -122,23 +93,60 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #( `UNUSED_VAR (datab) `UNUSED_VAR (datac) + // Decode fpu core type + wire [FPCORES_BITS-1:0] core_select = op_type[3:2]; + + VX_stream_switch #( + .DATAW (REQ_DATAW), + .NUM_OUTPUTS (NUM_FPCORES) + ) req_switch ( + .clk (clk), + .reset (reset), + .sel_in (core_select), + .valid_in (valid_in), + .ready_in (ready_in), + .data_in ({mask_in, tag_in, fmt, frm, dataa_s, datab_s, datac_s, op_type}), + .data_out (per_core_data_in), + .valid_out (per_core_valid_in), + .ready_out (per_core_ready_in) + ); + + for (genvar i = 0; i < NUM_FPCORES; ++i) begin : g_per_core_data_in + assign { + per_core_mask_in[i], + per_core_tag_in[i], + per_core_fmt[i], + per_core_frm[i], + per_core_dataa[i], + per_core_datab[i], + per_core_datac[i], + per_core_op_type[i] + } = per_core_data_in[i]; + end + + // FMA core /////////////////////////////////////////////////////////////// + + wire is_madd = per_core_op_type[FPU_FMA][1]; + wire is_neg = per_core_op_type[FPU_FMA][0]; + wire is_sub = per_core_fmt[FPU_FMA][1]; + VX_fpu_fma #( .NUM_LANES (NUM_LANES), .TAG_WIDTH (TAG_WIDTH) ) fpu_fma ( .clk (clk), - .reset (fma_reset), - .valid_in (valid_in && (core_select == FPU_FMA)), + .reset (reset), + .valid_in (per_core_valid_in[FPU_FMA]), .ready_in (per_core_ready_in[FPU_FMA]), - .mask_in (mask_in), - .tag_in (tag_in), - .frm (frm), + 
.mask_in (per_core_mask_in[FPU_FMA]), + .tag_in (per_core_tag_in[FPU_FMA]), + .frm (per_core_frm[FPU_FMA]), .is_madd (is_madd), .is_sub (is_sub), .is_neg (is_neg), - .dataa (dataa_s), - .datab (datab_s), - .datac (datac_s), + .dataa (per_core_dataa[FPU_FMA]), + .datab (per_core_datab[FPU_FMA]), + .datac (per_core_datac[FPU_FMA]), .has_fflags (per_core_has_fflags[FPU_FMA]), .fflags (per_core_fflags[FPU_FMA]), .result (per_core_result[FPU_FMA]), @@ -147,25 +155,99 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #( .valid_out (per_core_valid_out[FPU_FMA]) ); + // Div/Sqrt cores ///////////////////////////////////////////////////////// + + wire [1:0] div_sqrt_valid_in; + wire [1:0][REQ_DATAW-1:0] div_sqrt_data_in; + wire [1:0] div_sqrt_ready_in; + + wire [1:0][NUM_LANES-1:0] div_sqrt_mask_in; + wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_in; + wire [1:0][`INST_FPU_BITS-1:0] div_sqrt_op_type; + wire [1:0][`INST_FMT_BITS-1:0] div_sqrt_fmt; + wire [1:0][`INST_FRM_BITS-1:0] div_sqrt_frm; + wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_dataa; + wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datab; + wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datac; + + wire [1:0] div_sqrt_valid_out; + wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_result; + wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_out; + wire [1:0] div_sqrt_has_fflags; + fflags_t [1:0] div_sqrt_fflags; + wire [1:0] div_sqrt_ready_out; + + wire div_sqrt_valid_tmp_in; + wire [REQ_DATAW-1:0] div_sqrt_data_tmp_in; + wire div_sqrt_ready_tmp_in; + + VX_elastic_buffer #( + .DATAW (REQ_DATAW) + ) div_sqrt_req_buffer ( + .clk (clk), + .reset (reset), + .valid_in (per_core_valid_in[FPU_DIVSQRT]), + .ready_in (per_core_ready_in[FPU_DIVSQRT]), + .data_in (per_core_data_in[FPU_DIVSQRT]), + .data_out (div_sqrt_data_tmp_in), + .valid_out (div_sqrt_valid_tmp_in), + .ready_out (div_sqrt_ready_tmp_in) + ); + + wire is_sqrt = div_sqrt_data_tmp_in[0]; // op_type[0] + + VX_stream_switch #( + .DATAW (REQ_DATAW), + .NUM_OUTPUTS (2) + ) div_sqrt_req_switch ( + .clk (clk), + 
.reset (reset), + .sel_in (is_sqrt), + .valid_in (div_sqrt_valid_tmp_in), + .ready_in (div_sqrt_ready_tmp_in), + .data_in (div_sqrt_data_tmp_in), + .data_out (div_sqrt_data_in), + .valid_out (div_sqrt_valid_in), + .ready_out (div_sqrt_ready_in) + ); + + for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_data_in + assign { + div_sqrt_mask_in[i], + div_sqrt_tag_in[i], + div_sqrt_fmt[i], + div_sqrt_frm[i], + div_sqrt_dataa[i], + div_sqrt_datab[i], + div_sqrt_datac[i], + div_sqrt_op_type[i] + } = div_sqrt_data_in[i]; + end + + `UNUSED_VAR (div_sqrt_op_type) + `UNUSED_VAR (div_sqrt_fmt) + `UNUSED_VAR (div_sqrt_datab) + `UNUSED_VAR (div_sqrt_datac) + VX_fpu_div #( .NUM_LANES (NUM_LANES), .TAG_WIDTH (TAG_WIDTH) ) fpu_div ( .clk (clk), - .reset (div_reset), - .valid_in (valid_in && (core_select == FPU_DIVSQRT) && is_div), - .ready_in (div_ready_in), - .mask_in (mask_in), - .tag_in (tag_in), - .frm (frm), - .dataa (dataa_s), - .datab (datab_s), - .has_fflags (div_has_fflags), - .fflags (div_fflags), - .result (div_result), - .tag_out (div_tag_out), - .valid_out (div_valid_out), - .ready_out (div_ready_out) + .reset (reset), + .valid_in (div_sqrt_valid_in[0]), + .ready_in (div_sqrt_ready_in[0]), + .mask_in (div_sqrt_mask_in[0]), + .tag_in (div_sqrt_tag_in[0]), + .frm (div_sqrt_frm[0]), + .dataa (div_sqrt_dataa[0]), + .datab (div_sqrt_datab[0]), + .has_fflags (div_sqrt_has_fflags[0]), + .fflags (div_sqrt_fflags[0]), + .result (div_sqrt_result[0]), + .tag_out (div_sqrt_tag_out[0]), + .valid_out (div_sqrt_valid_out[0]), + .ready_out (div_sqrt_ready_out[0]) ); VX_fpu_sqrt #( @@ -173,92 +255,42 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #( .TAG_WIDTH (TAG_WIDTH) ) fpu_sqrt ( .clk (clk), - .reset (sqrt_reset), - .valid_in (valid_in && (core_select == FPU_DIVSQRT) && ~is_div), - .ready_in (sqrt_ready_in), - .mask_in (mask_in), - .tag_in (tag_in), - .frm (frm), - .dataa (dataa_s), - .has_fflags (sqrt_has_fflags), - .fflags (sqrt_fflags), - .result (sqrt_result), - .tag_out 
(sqrt_tag_out), - .valid_out (sqrt_valid_out), - .ready_out (sqrt_ready_out) + .reset (reset), + .valid_in (div_sqrt_valid_in[1]), + .ready_in (div_sqrt_ready_in[1]), + .mask_in (div_sqrt_mask_in[1]), + .tag_in (div_sqrt_tag_in[1]), + .frm (div_sqrt_frm[1]), + .dataa (div_sqrt_dataa[1]), + .has_fflags (div_sqrt_has_fflags[1]), + .fflags (div_sqrt_fflags[1]), + .result (div_sqrt_result[1]), + .tag_out (div_sqrt_tag_out[1]), + .valid_out (div_sqrt_valid_out[1]), + .ready_out (div_sqrt_ready_out[1]) ); - wire cvt_ret_int_in = ~is_itof; - wire cvt_ret_int_out; - - VX_fpu_cvt #( - .NUM_LANES (NUM_LANES), - .TAG_WIDTH (TAG_WIDTH+1) - ) fpu_cvt ( - .clk (clk), - .reset (cvt_reset), - .valid_in (valid_in && (core_select == FPU_CVT)), - .ready_in (per_core_ready_in[FPU_CVT]), - .mask_in (mask_in), - .tag_in ({cvt_ret_int_in, tag_in}), - .frm (frm), - .is_itof (is_itof), - .is_signed (is_signed), - .dataa (dataa_s), - .has_fflags (per_core_has_fflags[FPU_CVT]), - .fflags (per_core_fflags[FPU_CVT]), - .result (per_core_result[FPU_CVT]), - .tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}), - .valid_out (per_core_valid_out[FPU_CVT]), - .ready_out (per_core_ready_out[FPU_CVT]) - ); - - wire ncp_ret_int_in = (op_type == `INST_FPU_CMP) - || `INST_FPU_IS_CLASS(op_type, frm) - || `INST_FPU_IS_MVXW(op_type, frm); - wire ncp_ret_int_out; - - wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(op_type, frm); - wire ncp_ret_sext_out; - - VX_fpu_ncp #( - .NUM_LANES (NUM_LANES), - .TAG_WIDTH (TAG_WIDTH+2) - ) fpu_ncp ( - .clk (clk), - .reset (ncp_reset), - .valid_in (valid_in && (core_select == FPU_NCP)), - .ready_in (per_core_ready_in[FPU_NCP]), - .mask_in (mask_in), - .tag_in ({ncp_ret_sext_in, ncp_ret_int_in, tag_in}), - .op_type (op_type), - .frm (frm), - .dataa (dataa_s), - .datab (datab_s), - .result (per_core_result[FPU_NCP]), - .has_fflags (per_core_has_fflags[FPU_NCP]), - .fflags (per_core_fflags[FPU_NCP]), - .tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}), - 
.valid_out (per_core_valid_out[FPU_NCP]), - .ready_out (per_core_ready_out[FPU_NCP]) - ); - - /////////////////////////////////////////////////////////////////////////// - - assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in; + wire [1:0][RSP_DATAW-1:0] div_sqrt_arb_data_in; + for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_arb_data_in + assign div_sqrt_arb_data_in[i] = { + div_sqrt_result[i], + div_sqrt_has_fflags[i], + div_sqrt_fflags[i], + div_sqrt_tag_out[i] + }; + end VX_stream_arb #( .NUM_INPUTS (2), .DATAW (RSP_DATAW), - .ARBITER ("R"), + .ARBITER ("P"), .OUT_BUF (0) - ) div_sqrt_arb ( + ) div_sqrt_rsp_arb ( .clk (clk), .reset (reset), - .valid_in ({sqrt_valid_out, div_valid_out}), - .ready_in ({sqrt_ready_out, div_ready_out}), - .data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out}, - {div_result, div_has_fflags, div_fflags, div_tag_out}}), + .valid_in (div_sqrt_valid_out), + .ready_in (div_sqrt_ready_out), + .data_in (div_sqrt_arb_data_in), .data_out ({ per_core_result[FPU_DIVSQRT], per_core_has_fflags[FPU_DIVSQRT], @@ -270,12 +302,73 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #( `UNUSED_PIN (sel_out) ); + // CVT core /////////////////////////////////////////////////////////////// + + wire is_itof = per_core_op_type[FPU_CVT][1]; + wire is_signed = ~per_core_op_type[FPU_CVT][0]; + wire cvt_ret_int_in = ~is_itof; + wire cvt_ret_int_out; + + VX_fpu_cvt #( + .NUM_LANES (NUM_LANES), + .TAG_WIDTH (1+TAG_WIDTH) + ) fpu_cvt ( + .clk (clk), + .reset (reset), + .valid_in (per_core_valid_in[FPU_CVT]), + .ready_in (per_core_ready_in[FPU_CVT]), + .mask_in (per_core_mask_in[FPU_CVT]), + .tag_in ({cvt_ret_int_in, per_core_tag_in[FPU_CVT]}), + .frm (per_core_frm[FPU_CVT]), + .is_itof (is_itof), + .is_signed (is_signed), + .dataa (per_core_dataa[FPU_CVT]), + .has_fflags (per_core_has_fflags[FPU_CVT]), + .fflags (per_core_fflags[FPU_CVT]), + .result (per_core_result[FPU_CVT]), + .tag_out ({cvt_ret_int_out, 
per_core_tag_out[FPU_CVT]}), + .valid_out (per_core_valid_out[FPU_CVT]), + .ready_out (per_core_ready_out[FPU_CVT]) + ); + + // NCP core /////////////////////////////////////////////////////////////// + + wire ncp_ret_int_in = (per_core_op_type[FPU_NCP] == `INST_FPU_CMP) + || `INST_FPU_IS_CLASS(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]) + || `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]); + wire ncp_ret_int_out; + + wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]); + wire ncp_ret_sext_out; + + VX_fpu_ncp #( + .NUM_LANES (NUM_LANES), + .TAG_WIDTH (TAG_WIDTH+2) + ) fpu_ncp ( + .clk (clk), + .reset (reset), + .valid_in (per_core_valid_in[FPU_NCP]), + .ready_in (per_core_ready_in[FPU_NCP]), + .mask_in (per_core_mask_in[FPU_NCP]), + .tag_in ({ncp_ret_sext_in, ncp_ret_int_in, per_core_tag_in[FPU_NCP]}), + .op_type (per_core_op_type[FPU_NCP]), + .frm (per_core_frm[FPU_NCP]), + .dataa (per_core_dataa[FPU_NCP]), + .datab (per_core_datab[FPU_NCP]), + .result (per_core_result[FPU_NCP]), + .has_fflags (per_core_has_fflags[FPU_NCP]), + .fflags (per_core_fflags[FPU_NCP]), + .tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}), + .valid_out (per_core_valid_out[FPU_NCP]), + .ready_out (per_core_ready_out[FPU_NCP]) + ); + /////////////////////////////////////////////////////////////////////////// - reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out; + reg [NUM_FPCORES-1:0][RSP_DATAW+2-1:0] per_core_data_out; always @(*) begin - for (integer i = 0; i < NUM_FPC; ++i) begin + for (integer i = 0; i < NUM_FPCORES; ++i) begin per_core_data_out[i][RSP_DATAW+1:2] = { per_core_result[i], per_core_has_fflags[i], @@ -294,9 +387,9 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #( `UNUSED_VAR (op_ret_int_out) VX_stream_arb #( - .NUM_INPUTS (NUM_FPC), + .NUM_INPUTS (NUM_FPCORES), .DATAW (RSP_DATAW + 2), - .ARBITER ("F"), + .ARBITER ("R"), .OUT_BUF (OUT_BUF) ) rsp_arb ( .clk (clk), @@ -310,25 +403,22 @@ 
module VX_fpu_dsp import VX_fpu_pkg::*; #( `UNUSED_PIN (sel_out) ); - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result `ifdef FPU_RV64F - reg [`XLEN-1:0] result_r; + reg [`XLEN-1:0] result_w; always @(*) begin case (op_ret_int_out) - 2'b11: result_r = `XLEN'($signed(result_s[i])); - 2'b01: result_r = {32'h00000000, result_s[i]}; - default: result_r = {32'hffffffff, result_s[i]}; + 2'b11: result_w = `XLEN'($signed(result_s[i])); + 2'b01: result_w = {32'h00000000, result_s[i]}; + default: result_w = {32'hffffffff, result_s[i]}; endcase end - assign result[i] = result_r; + assign result[i] = result_w; `else assign result[i] = result_s[i]; `endif end - // can accept new request? - assign ready_in = per_core_ready_in[core_select]; - endmodule `endif diff --git a/hw/rtl/fpu/VX_fpu_fma.sv b/hw/rtl/fpu/VX_fpu_fma.sv index 3522d8a1e..e793ff55b 100644 --- a/hw/rtl/fpu/VX_fpu_fma.sv +++ b/hw/rtl/fpu/VX_fpu_fma.sv @@ -49,26 +49,27 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out ); - `UNUSED_VAR (frm) + localparam DATAW = 3 * 32 + `INST_FRM_BITS; + + wire [NUM_LANES-1:0][DATAW-1:0] data_in; - wire [NUM_LANES-1:0][3*32-1:0] data_in; wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out; wire pe_enable; - wire [NUM_PES-1:0][3*32-1:0] pe_data_in; + wire [NUM_PES-1:0][DATAW-1:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; reg [NUM_LANES-1:0][31:0] a, b, c; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_select always @(*) begin if (is_madd) begin // MADD / MSUB / NMADD / NMSUB - a[i] = is_neg ? {~dataa[i][31], dataa[i][30:0]} : dataa[i]; + a[i] = {is_neg ^ dataa[i][31], dataa[i][30:0]}; b[i] = datab[i]; - c[i] = (is_neg ^ is_sub) ? 
{~datac[i][31], datac[i][30:0]} : datac[i]; + c[i] = {is_neg ^ is_sub ^ datac[i][31], datac[i][30:0]}; end else begin if (is_neg) begin // MUL @@ -77,29 +78,30 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( c[i] = '0; end else begin // ADD / SUB - a[i] = 32'h3f800000; // 1.0f - b[i] = dataa[i]; - c[i] = is_sub ? {~datab[i][31], datab[i][30:0]} : datab[i]; + a[i] = dataa[i]; + b[i] = 32'h3f800000; // 1.0f + c[i] = {is_sub ^ datab[i][31], datab[i][30:0]}; end end end end - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in assign data_in[i][0 +: 32] = a[i]; assign data_in[i][32 +: 32] = b[i]; assign data_in[i][64 +: 32] = c[i]; + assign data_in[i][96 +: `INST_FRM_BITS] = frm; end VX_pe_serializer #( .NUM_LANES (NUM_LANES), .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FMA), - .DATA_IN_WIDTH(3*32), - .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), + .DATA_IN_WIDTH (DATAW), + .DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), - .PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0), // must be registered for DSPs - .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 
1 : 0) + .PE_REG (0), + .OUT_BUF (2) ) pe_serializer ( .clk (clk), .reset (reset), @@ -108,15 +110,17 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( .tag_in ({mask_in, tag_in}), .ready_in (ready_in), .pe_enable (pe_enable), - .pe_data_in (pe_data_in), - .pe_data_out(pe_data_out), + .pe_data_out(pe_data_in), + .pe_data_in (pe_data_out), .valid_out (valid_out), .data_out (data_out), .tag_out ({mask_out, tag_out}), .ready_out (ready_out) ); - for (genvar i = 0; i < NUM_LANES; ++i) begin + `UNUSED_VAR (pe_data_in) + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result assign result[i] = data_out[i][0 +: 32]; assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS]; end @@ -125,7 +129,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( `ifdef QUARTUS - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas acl_fmadd fmadd ( .clk (clk), .areset (1'b0), @@ -143,7 +147,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( `elsif VIVADO - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas wire [2:0] tuser; xil_fma fma ( @@ -168,7 +172,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( `else - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas reg [63:0] r; `UNUSED_VAR (r) fflags_t f; @@ -177,10 +181,10 @@ module VX_fpu_fma import VX_fpu_pkg::*; #( dpi_fmadd ( pe_enable, int'(0), - {32'hffffffff, pe_data_in[i][0 +: 32]}, - {32'hffffffff, pe_data_in[i][32 +: 32]}, - {32'hffffffff, pe_data_in[i][64 +: 32]}, - frm, + {32'hffffffff, pe_data_in[i][0 +: 32]}, // a + {32'hffffffff, pe_data_in[i][32 +: 32]}, // b + {32'hffffffff, pe_data_in[i][64 +: 32]}, // c + pe_data_in[0][96 +: `INST_FRM_BITS], // frm r, f ); diff --git a/hw/rtl/fpu/VX_fpu_fpnew.sv b/hw/rtl/fpu/VX_fpu_fpnew.sv index 9ee7f1a2c..596a86513 100644 --- a/hw/rtl/fpu/VX_fpu_fpnew.sv +++ b/hw/rtl/fpu/VX_fpu_fpnew.sv @@ -90,7 +90,7 @@ module VX_fpu_fpnew reg [TAG_WIDTH-1:0] fpu_tag_in, fpu_tag_out; 
- reg [2:0][NUM_LANES-1:0][`XLEN-1:0] fpu_operands; + logic [2:0][NUM_LANES-1:0][`XLEN-1:0] fpu_operands; wire [NUM_LANES-1:0][`XLEN-1:0] fpu_result; fpnew_pkg::status_t fpu_status; @@ -134,20 +134,13 @@ module VX_fpu_fpnew fpu_op = fpnew_pkg::ADD; fpu_operands[1] = dataa; fpu_operands[2] = datab; - end - `INST_FPU_SUB: begin - fpu_op = fpnew_pkg::ADD; - fpu_operands[1] = dataa; - fpu_operands[2] = datab; - fpu_op_mod = 1; + fpu_op_mod = fmt[1]; // FADD or FSUB end `INST_FPU_MUL: begin fpu_op = fpnew_pkg::MUL; end + `INST_FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = fmt[1]; end + `INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = ~fmt[1]; end `INST_FPU_DIV: begin fpu_op = fpnew_pkg::DIV; end `INST_FPU_SQRT: begin fpu_op = fpnew_pkg::SQRT; end - `INST_FPU_MADD: begin fpu_op = fpnew_pkg::FMADD; end - `INST_FPU_MSUB: begin fpu_op = fpnew_pkg::FMADD; fpu_op_mod = 1; end - `INST_FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end - `INST_FPU_NMSUB: begin fpu_op = fpnew_pkg::FNMSUB; end `ifdef FLEN_64 `INST_FPU_F2F: begin fpu_op = fpnew_pkg::F2F; fpu_src_fmt = fmt[0] ? 
fpnew_pkg::FP32 : fpnew_pkg::FP64; end `endif @@ -169,7 +162,7 @@ module VX_fpu_fpnew end `UNUSED_VAR (mask_in) - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_fpnew_coreses wire [(TAG_WIDTH+1)-1:0] fpu_tag; wire fpu_valid_out_uq; wire fpu_ready_in_uq; @@ -183,8 +176,7 @@ module VX_fpu_fpnew .Features (FPU_FEATURES), .Implementation (FPU_IMPLEMENTATION), .TagType (logic[(TAG_WIDTH+1)-1:0]), - .TrueSIMDClass (1), - .EnableSIMDMask (1) + .DivSqrtSel (fpnew_pkg::PULP) ) fpnew_core ( .clk_i (clk), .rst_ni (~reset), @@ -196,11 +188,11 @@ module VX_fpu_fpnew .dst_fmt_i (fpu_dst_fmt), .int_fmt_i (fpu_int_fmt), .vectorial_op_i (1'b0), - .simd_mask_i (mask_in[i]), + .simd_mask_i (1'b1), .tag_i ({fpu_tag_in, fpu_has_fflags}), .in_valid_i (fpu_valid_in), .in_ready_o (fpu_ready_in_uq), - .flush_i (reset), + .flush_i (1'b0), .result_o (fpu_result[i]), .status_o (fpu_status_uq), .tag_o (fpu_tag), @@ -209,7 +201,7 @@ module VX_fpu_fpnew `UNUSED_PIN (busy_o) ); - if (i == 0) begin + if (i == 0) begin : g_output_0 assign {fpu_tag_out, fpu_has_fflags_out} = fpu_tag; assign fpu_valid_out = fpu_valid_out_uq; assign fpu_ready_in = fpu_ready_in_uq; diff --git a/hw/rtl/fpu/VX_fpu_ncp.sv b/hw/rtl/fpu/VX_fpu_ncp.sv index 34b822d89..21162dd6c 100644 --- a/hw/rtl/fpu/VX_fpu_ncp.sv +++ b/hw/rtl/fpu/VX_fpu_ncp.sv @@ -45,31 +45,34 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out ); - `UNUSED_VAR (frm) + localparam DATAW = 2 * 32 + `INST_FRM_BITS + `INST_FPU_BITS; + + wire [NUM_LANES-1:0][DATAW-1:0] data_in; - wire [NUM_LANES-1:0][2*32-1:0] data_in; wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; fflags_t [NUM_LANES-1:0] fflags_out; wire pe_enable; - wire [NUM_PES-1:0][2*32-1:0] pe_data_in; + wire [NUM_PES-1:0][DATAW-1:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < 
NUM_LANES; ++i) begin : g_data_in assign data_in[i][0 +: 32] = dataa[i]; assign data_in[i][32 +: 32] = datab[i]; + assign data_in[i][64 +: `INST_FRM_BITS] = frm; + assign data_in[i][64 + `INST_FRM_BITS +: `INST_FPU_BITS] = op_type; end VX_pe_serializer #( .NUM_LANES (NUM_LANES), .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FNCP), - .DATA_IN_WIDTH(2*32), - .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), + .DATA_IN_WIDTH (DATAW), + .DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) + .OUT_BUF (2) ) pe_serializer ( .clk (clk), .reset (reset), @@ -78,28 +81,31 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #( .tag_in ({mask_in, tag_in}), .ready_in (ready_in), .pe_enable (pe_enable), - .pe_data_in (pe_data_in), - .pe_data_out(pe_data_out), + .pe_data_out(pe_data_in), + .pe_data_in (pe_data_out), .valid_out (valid_out), .data_out (data_out), .tag_out ({mask_out, tag_out}), .ready_out (ready_out) ); - for (genvar i = 0; i < NUM_LANES; ++i) begin + `UNUSED_VAR (pe_data_in) + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result assign result[i] = data_out[i][0 +: 32]; assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS]; end - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fncp_units VX_fncp_unit #( - .LATENCY (`LATENCY_FNCP) + .LATENCY (`LATENCY_FNCP), + .OUT_REG (1) ) fncp_unit ( .clk (clk), .reset (reset), .enable (pe_enable), - .frm (frm), - .op_type (op_type), + .frm (pe_data_in[0][64 +: `INST_FRM_BITS]), + .op_type (pe_data_in[0][64 + `INST_FRM_BITS +: `INST_FPU_BITS]), .dataa (pe_data_in[i][0 +: 32]), .datab (pe_data_in[i][32 +: 32]), .result (pe_data_out[i][0 +: 32]), diff --git a/hw/rtl/fpu/VX_fpu_sqrt.sv b/hw/rtl/fpu/VX_fpu_sqrt.sv index a6e6dda9a..172a42e6f 100644 --- a/hw/rtl/fpu/VX_fpu_sqrt.sv +++ b/hw/rtl/fpu/VX_fpu_sqrt.sv @@ -43,43 +43,51 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( input wire ready_out, output wire valid_out ); + 
localparam DATAW = 32 + `INST_FRM_BITS; - `UNUSED_VAR (frm) + wire [NUM_LANES-1:0][DATAW-1:0] data_in; wire [NUM_LANES-1:0] mask_out; wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out; wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out; wire pe_enable; - wire [NUM_PES-1:0][31:0] pe_data_in; + wire [NUM_PES-1:0][DATAW-1:0] pe_data_in; wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out; + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in + assign data_in[i][0 +: 32] = dataa[i]; + assign data_in[i][32 +: `INST_FRM_BITS] = frm; + end + VX_pe_serializer #( .NUM_LANES (NUM_LANES), .NUM_PES (NUM_PES), .LATENCY (`LATENCY_FSQRT), - .DATA_IN_WIDTH(32), - .DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32), + .DATA_IN_WIDTH (DATAW), + .DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32), .TAG_WIDTH (NUM_LANES + TAG_WIDTH), .PE_REG (0), - .OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0) + .OUT_BUF (2) ) pe_serializer ( .clk (clk), .reset (reset), .valid_in (valid_in), - .data_in (dataa), + .data_in (data_in), .tag_in ({mask_in, tag_in}), .ready_in (ready_in), .pe_enable (pe_enable), - .pe_data_in (pe_data_in), - .pe_data_out(pe_data_out), + .pe_data_out(pe_data_in), + .pe_data_in (pe_data_out), .valid_out (valid_out), .data_out (data_out), .tag_out ({mask_out, tag_out}), .ready_out (ready_out) ); - for (genvar i = 0; i < NUM_LANES; ++i) begin + `UNUSED_VAR (pe_data_in) + + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result assign result[i] = data_out[i][0 +: 32]; assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS]; end @@ -88,12 +96,12 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `ifdef QUARTUS - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fsqrts acl_fsqrt fsqrt ( .clk (clk), .areset (1'b0), .en (pe_enable), - .a (pe_data_in[i]), + .a (pe_data_in[i][0 +: 32]), .q (pe_data_out[i][0 +: 32]) ); assign pe_data_out[i][32 +: `FP_FLAGS_BITS] = 'x; @@ -105,14 +113,14 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `elsif VIVADO - for 
(genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fsqrts wire tuser; xil_fsqrt fsqrt ( .aclk (clk), .aclken (pe_enable), .s_axis_a_tvalid (1'b1), - .s_axis_a_tdata (pe_data_in[i]), + .s_axis_a_tdata (pe_data_in[i][0 +: 32]), `UNUSED_PIN (m_axis_result_tvalid), .m_axis_result_tdata (pe_data_out[i][0 +: 32]), .m_axis_result_tuser (tuser) @@ -126,7 +134,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( `else - for (genvar i = 0; i < NUM_PES; ++i) begin + for (genvar i = 0; i < NUM_PES; ++i) begin : g_fsqrts reg [63:0] r; `UNUSED_VAR (r) fflags_t f; @@ -135,8 +143,8 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #( dpi_fsqrt ( pe_enable, int'(0), - {32'hffffffff, pe_data_in[i]}, - frm, + {32'hffffffff, pe_data_in[i][0 +: 32]}, // a + pe_data_in[0][32 +: `INST_FRM_BITS], // frm r, f ); diff --git a/hw/rtl/interfaces/VX_decode_sched_if.sv b/hw/rtl/interfaces/VX_decode_sched_if.sv index b82aafb55..1f47c30e9 100644 --- a/hw/rtl/interfaces/VX_decode_sched_if.sv +++ b/hw/rtl/interfaces/VX_decode_sched_if.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,18 +16,18 @@ interface VX_decode_sched_if (); wire valid; - wire is_wstall; + wire unlock; wire [`NW_WIDTH-1:0] wid; modport master ( output valid, - output is_wstall, + output unlock, output wid ); modport slave ( input valid, - input is_wstall, + input unlock, input wid ); diff --git a/hw/rtl/interfaces/VX_pipeline_perf_if.sv b/hw/rtl/interfaces/VX_pipeline_perf_if.sv deleted file mode 100644 index 840630353..000000000 --- a/hw/rtl/interfaces/VX_pipeline_perf_if.sv +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -`include "VX_define.vh" - -interface VX_pipeline_perf_if import VX_gpu_pkg::*; (); - sched_perf_t sched; - issue_perf_t issue; - - wire [`PERF_CTR_BITS-1:0] ifetches; - wire [`PERF_CTR_BITS-1:0] loads; - wire [`PERF_CTR_BITS-1:0] stores; - wire [`PERF_CTR_BITS-1:0] ifetch_latency; - wire [`PERF_CTR_BITS-1:0] load_latency; - - modport master ( - output sched, - output issue, - output ifetches, - output loads, - output stores, - output ifetch_latency, - output load_latency - ); - - modport slave ( - input sched, - input issue, - input ifetches, - input loads, - input stores, - input ifetch_latency, - input load_latency - ); - -endinterface diff --git a/hw/rtl/libs/VX_async_ram_patch.sv b/hw/rtl/libs/VX_async_ram_patch.sv new file mode 100644 index 000000000..dd4d2b42d --- /dev/null +++ b/hw/rtl/libs/VX_async_ram_patch.sv @@ -0,0 +1,277 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_platform.vh" + +`define RAM_INITIALIZATION \ + if (INIT_ENABLE != 0) begin : g_init \ + if (INIT_FILE != "") begin : g_file \ + initial $readmemh(INIT_FILE, ram); \ + end else begin : g_value \ + initial begin \ + for (integer i = 0; i < SIZE; ++i) begin : g_i \ + ram[i] = INIT_VALUE; \ + end \ + end \ + end \ + end + +`define SYNC_RAM_WF_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [ADDRW-1:0] raddr_r; \ + always @(posedge clk) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + if (__re) begin \ + raddr_r <= __ra; \ + end \ + end \ + assign __d = ram[raddr_r] + +`define SYNC_RAM_WF_WREN_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [ADDRW-1:0] raddr_r; \ + always @(posedge clk) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + if (__re) begin \ + raddr_r <= __ra; \ + end \ + end \ + assign __d = ram[raddr_r] + +`define SYNC_RAM_RF_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [DATAW-1:0] rdata_r; \ + always @(posedge clk) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + if (__re) begin \ + rdata_r <= ram[__ra]; \ + end \ + end \ + assign __d = rdata_r + +`define SYNC_RAM_RF_WREN_BLOCK(__d, __re, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + reg [DATAW-1:0] rdata_r; \ + always @(posedge clk) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + if (__re) begin \ + rdata_r <= ram[__ra]; \ + end \ + end \ + assign __d = rdata_r + +`define ASYNC_RAM_BLOCK(__d, __we, 
__ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + always @(posedge clk) begin \ + if (__we) begin \ + ram[__wa] <= wdata; \ + end \ + end \ + assign __d = ram[__ra] + +`define ASYNC_RAM_BLOCK_WREN(__d, __we, __ra, __wa) \ + `RAM_ATTRIBUTES reg [DATAW-1:0] ram [0:SIZE-1]; \ + `RAM_INITIALIZATION \ + always @(posedge clk) begin \ + if (__we) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[__wa][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end \ + end \ + assign __d = ram[__ra] + +`TRACING_OFF +module VX_async_ram_patch #( + parameter DATAW = 1, + parameter SIZE = 1, + parameter WRENW = 1, + parameter DUAL_PORT = 0, + parameter FORCE_BRAM = 0, + parameter RADDR_REG = 0, // read address registered hint + parameter RADDR_RESET = 0, // read address has reset + parameter WRITE_FIRST = 0, + parameter INIT_ENABLE = 0, + parameter INIT_FILE = "", + parameter [DATAW-1:0] INIT_VALUE = 0, + parameter ADDRW = `LOG2UP(SIZE) +) ( + input wire clk, + input wire reset, + input wire read, + input wire write, + input wire [WRENW-1:0] wren, + input wire [ADDRW-1:0] waddr, + input wire [DATAW-1:0] wdata, + input wire [ADDRW-1:0] raddr, + output wire [DATAW-1:0] rdata +); + localparam WSELW = DATAW / WRENW; + + `UNUSED_VAR (reset) + + (* keep = "true" *) wire [ADDRW-1:0] raddr_w, raddr_s; + (* keep = "true" *) wire read_s; + assign raddr_w = raddr; + + wire raddr_reset_w; + if (RADDR_RESET) begin : g_raddr_reset + (* keep = "true" *) wire raddr_reset; + assign raddr_reset = 0; + assign raddr_reset_w = raddr_reset; + end else begin : g_no_raddr_reset + assign raddr_reset_w = 0; + end + + VX_placeholder #( + .I (ADDRW + 1), + .O (ADDRW + 1) + ) placeholder1 ( + .in ({raddr_w, raddr_reset_w}), + .out ({raddr_s, read_s}) + ); + + wire [DATAW-1:0] rdata_s; + + if (1) begin : g_sync_ram + if (WRENW != 1) begin : g_wren + if (FORCE_BRAM) begin : g_bram + if (WRITE_FIRST) begin : g_write_first + 
`define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_WF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_RF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end + end else begin : g_lutram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES + `SYNC_RAM_WF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES + `SYNC_RAM_RF_WREN_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end + end + end else begin : g_no_wren + if (FORCE_BRAM) begin : g_bram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_WF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `USE_BLOCK_BRAM + `SYNC_RAM_RF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end + end else begin : g_lutram + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES + `SYNC_RAM_WF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES + `SYNC_RAM_RF_BLOCK(rdata_s, read_s, write, raddr_s, waddr); + `undef RAM_ATTRIBUTES + end + end + end + end + + if (RADDR_REG) begin : g_raddr_reg + assign rdata = rdata_s; + end else begin : g_async_ram + (* keep = "true" *) wire is_raddr_reg; + VX_placeholder #( + .O (1) + ) placeholder2 ( + .in (1'b0), + .out (is_raddr_reg) + ); + wire [DATAW-1:0] rdata_a; + if (DUAL_PORT) begin : g_dp + if (WRENW != 1) begin : g_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, raddr, waddr); + 
`undef RAM_ATTRIBUTES + end + end else begin : g_no_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, raddr, waddr); + `undef RAM_ATTRIBUTES + end + end + end else begin : g_sp + if (WRENW != 1) begin : g_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK_WREN(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end + end else begin : g_no_wren + if (WRITE_FIRST) begin : g_write_first + `define RAM_ATTRIBUTES `RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end else begin : g_read_first + `define RAM_ATTRIBUTES `NO_RW_RAM_CHECK + `ASYNC_RAM_BLOCK(rdata_a, write, waddr, waddr); + `undef RAM_ATTRIBUTES + end + end + end + assign rdata = is_raddr_reg ? 
rdata_s : rdata_a; + end + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_avs_adapter.sv b/hw/rtl/libs/VX_avs_adapter.sv index 35d329c7b..48810db3b 100644 --- a/hw/rtl/libs/VX_avs_adapter.sv +++ b/hw/rtl/libs/VX_avs_adapter.sv @@ -16,11 +16,15 @@ `TRACING_OFF module VX_avs_adapter #( parameter DATA_WIDTH = 1, - parameter ADDR_WIDTH = 1, + parameter ADDR_WIDTH_IN = 1, + parameter ADDR_WIDTH_OUT= 32, parameter BURST_WIDTH = 1, - parameter NUM_BANKS = 1, + parameter NUM_PORTS_IN = 1, + parameter NUM_BANKS_OUT = 1, parameter TAG_WIDTH = 1, parameter RD_QUEUE_SIZE = 1, + parameter INTERLEAVE = 0, + parameter ARBITER = "R", parameter REQ_OUT_BUF = 0, parameter RSP_OUT_BUF = 0 ) ( @@ -28,190 +32,244 @@ module VX_avs_adapter #( input wire reset, // Memory request - input wire mem_req_valid, - input wire mem_req_rw, - input wire [DATA_WIDTH/8-1:0] mem_req_byteen, - input wire [ADDR_WIDTH-1:0] mem_req_addr, - input wire [DATA_WIDTH-1:0] mem_req_data, - input wire [TAG_WIDTH-1:0] mem_req_tag, - output wire mem_req_ready, + input wire mem_req_valid [NUM_PORTS_IN], + input wire mem_req_rw [NUM_PORTS_IN], + input wire [DATA_WIDTH/8-1:0] mem_req_byteen [NUM_PORTS_IN], + input wire [ADDR_WIDTH_IN-1:0] mem_req_addr [NUM_PORTS_IN], + input wire [DATA_WIDTH-1:0] mem_req_data [NUM_PORTS_IN], + input wire [TAG_WIDTH-1:0] mem_req_tag [NUM_PORTS_IN], + output wire mem_req_ready [NUM_PORTS_IN], // Memory response - output wire mem_rsp_valid, - output wire [DATA_WIDTH-1:0] mem_rsp_data, - output wire [TAG_WIDTH-1:0] mem_rsp_tag, - input wire mem_rsp_ready, + output wire mem_rsp_valid [NUM_PORTS_IN], + output wire [DATA_WIDTH-1:0] mem_rsp_data [NUM_PORTS_IN], + output wire [TAG_WIDTH-1:0] mem_rsp_tag [NUM_PORTS_IN], + input wire mem_rsp_ready [NUM_PORTS_IN], // AVS bus - output wire [DATA_WIDTH-1:0] avs_writedata [NUM_BANKS], - input wire [DATA_WIDTH-1:0] avs_readdata [NUM_BANKS], - output wire [ADDR_WIDTH-1:0] avs_address [NUM_BANKS], - input wire avs_waitrequest [NUM_BANKS], - output 
wire avs_write [NUM_BANKS], - output wire avs_read [NUM_BANKS], - output wire [DATA_WIDTH/8-1:0] avs_byteenable [NUM_BANKS], - output wire [BURST_WIDTH-1:0] avs_burstcount [NUM_BANKS], - input wire avs_readdatavalid [NUM_BANKS] + output wire [DATA_WIDTH-1:0] avs_writedata [NUM_BANKS_OUT], + input wire [DATA_WIDTH-1:0] avs_readdata [NUM_BANKS_OUT], + output wire [ADDR_WIDTH_OUT-1:0] avs_address [NUM_BANKS_OUT], + input wire avs_waitrequest [NUM_BANKS_OUT], + output wire avs_write [NUM_BANKS_OUT], + output wire avs_read [NUM_BANKS_OUT], + output wire [DATA_WIDTH/8-1:0] avs_byteenable [NUM_BANKS_OUT], + output wire [BURST_WIDTH-1:0] avs_burstcount [NUM_BANKS_OUT], + input wire avs_readdatavalid [NUM_BANKS_OUT] ); - localparam DATA_SIZE = DATA_WIDTH/8; - localparam RD_QUEUE_ADDR_WIDTH = `CLOG2(RD_QUEUE_SIZE+1); - localparam BANK_ADDRW = `LOG2UP(NUM_BANKS); - localparam LOG2_NUM_BANKS = `CLOG2(NUM_BANKS); - localparam BANK_OFFSETW = ADDR_WIDTH - LOG2_NUM_BANKS; + localparam DATA_SIZE = DATA_WIDTH/8; + localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS_OUT); + localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); + localparam DST_ADDR_WDITH = ADDR_WIDTH_OUT + BANK_SEL_BITS; // convert output addresss to input space + localparam BANK_ADDR_WIDTH = DST_ADDR_WDITH - BANK_SEL_BITS; + localparam NUM_PORTS_IN_BITS = `CLOG2(NUM_PORTS_IN); + localparam NUM_PORTS_IN_WIDTH = `UP(NUM_PORTS_IN_BITS); + localparam REQ_QUEUE_DATAW = TAG_WIDTH + NUM_PORTS_IN_BITS; + localparam REQ_XBAR_DATAW = 1 + BANK_ADDR_WIDTH + DATA_WIDTH + DATA_SIZE + TAG_WIDTH; + localparam RSP_XBAR_DATAW = DATA_WIDTH + TAG_WIDTH; - // Requests handling ////////////////////////////////////////////////////// + `STATIC_ASSERT ((DST_ADDR_WDITH >= ADDR_WIDTH_IN), ("invalid address width: current=%0d, expected=%0d", DST_ADDR_WDITH, ADDR_WIDTH_IN)) - wire [NUM_BANKS-1:0] req_queue_push, req_queue_pop; - wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] req_queue_tag_out; - wire [NUM_BANKS-1:0] req_queue_going_full; - wire 
[NUM_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size; - wire [BANK_ADDRW-1:0] req_bank_sel; - wire [BANK_OFFSETW-1:0] req_bank_off; - wire [NUM_BANKS-1:0] bank_req_ready; + // Bank selection - if (NUM_BANKS > 1) begin - assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0]; - end else begin - assign req_bank_sel = '0; + wire [NUM_PORTS_IN-1:0][BANK_SEL_WIDTH-1:0] req_bank_sel; + wire [NUM_PORTS_IN-1:0][BANK_ADDR_WIDTH-1:0] req_bank_addr; + + if (NUM_BANKS_OUT > 1) begin : g_bank_sel + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_i + wire [DST_ADDR_WDITH-1:0] mem_req_addr_dst = DST_ADDR_WDITH'(mem_req_addr[i]); + if (INTERLEAVE) begin : g_interleave + assign req_bank_sel[i] = mem_req_addr_dst[BANK_SEL_BITS-1:0]; + assign req_bank_addr[i] = mem_req_addr_dst[BANK_SEL_BITS +: BANK_ADDR_WIDTH]; + end else begin : g_no_interleave + assign req_bank_sel[i] = mem_req_addr_dst[BANK_ADDR_WIDTH +: BANK_SEL_BITS]; + assign req_bank_addr[i] = mem_req_addr_dst[BANK_ADDR_WIDTH-1:0]; + end + end + end else begin : g_no_bank_sel + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_i + assign req_bank_sel[i] = '0; + assign req_bank_addr[i] = DST_ADDR_WDITH'(mem_req_addr[i]); + end end - assign req_bank_off = mem_req_addr[ADDR_WIDTH-1:LOG2_NUM_BANKS]; + // Requests handling - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign req_queue_push[i] = mem_req_valid && ~mem_req_rw && bank_req_ready[i] && (req_bank_sel == i); + wire [NUM_PORTS_IN-1:0] req_xbar_valid_in; + wire [NUM_PORTS_IN-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_in; + wire [NUM_PORTS_IN-1:0] req_xbar_ready_in; + + wire [NUM_BANKS_OUT-1:0] req_xbar_valid_out; + wire [NUM_BANKS_OUT-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_out; + wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] req_xbar_sel_out; + wire [NUM_BANKS_OUT-1:0] req_xbar_ready_out; + + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_req_xbar_data_in + assign req_xbar_valid_in[i] = mem_req_valid[i]; + assign req_xbar_data_in[i] = {mem_req_rw[i], 
req_bank_addr[i], mem_req_byteen[i], mem_req_data[i], mem_req_tag[i]}; + assign mem_req_ready[i] = req_xbar_ready_in[i]; end - `RESET_RELAY_EX (bank_reset, reset, NUM_BANKS, 1); + VX_stream_xbar #( + .NUM_INPUTS (NUM_PORTS_IN), + .NUM_OUTPUTS(NUM_BANKS_OUT), + .DATAW (REQ_XBAR_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (REQ_OUT_BUF) + ) req_xbar ( + .clk (clk), + .reset (reset), + .sel_in (req_bank_sel), + .valid_in (req_xbar_valid_in), + .data_in (req_xbar_data_in), + .ready_in (req_xbar_ready_in), + .valid_out (req_xbar_valid_out), + .data_out (req_xbar_data_out), + .ready_out (req_xbar_ready_out), + .sel_out (req_xbar_sel_out), + `UNUSED_PIN (collisions) + ); - for (genvar i = 0; i < NUM_BANKS; ++i) begin + wire [NUM_BANKS_OUT-1:0][REQ_QUEUE_DATAW-1:0] rd_req_queue_data_out; + wire [NUM_BANKS_OUT-1:0] rd_req_queue_pop; + + for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_req_xbar_data_out + + wire ready_out; + wire rw_out; + wire [BANK_ADDR_WIDTH-1:0] addr_out; + wire [TAG_WIDTH-1:0] tag_out; + wire [DATA_WIDTH-1:0] data_out; + wire [DATA_SIZE-1:0] byteen_out; + wire valid_out; + + assign {rw_out, addr_out, byteen_out, data_out, tag_out} = req_xbar_data_out[i]; + + wire rd_req_queue_going_full; + wire rd_req_queue_push; + + // stall pipeline if the request queue is needed and going full + wire rd_req_queue_ready = rw_out || ~rd_req_queue_going_full; + assign valid_out = req_xbar_valid_out[i] && rd_req_queue_ready; + assign ready_out = ~avs_waitrequest[i] && rd_req_queue_ready; + assign rd_req_queue_push = valid_out && ready_out && ~rw_out; VX_pending_size #( .SIZE (RD_QUEUE_SIZE) ) pending_size ( .clk (clk), - .reset (bank_reset[i]), - .incr (req_queue_push[i]), - .decr (req_queue_pop[i]), + .reset (reset), + .incr (rd_req_queue_push), + .decr (rd_req_queue_pop[i]), `UNUSED_PIN (empty), `UNUSED_PIN (alm_empty), - .full (req_queue_going_full[i]), + .full (rd_req_queue_going_full), `UNUSED_PIN (alm_full), - .size (req_queue_size[i]) + `UNUSED_PIN (size) ); - 
`UNUSED_VAR (req_queue_size) + + wire [REQ_QUEUE_DATAW-1:0] rd_req_queue_data_in; + if (NUM_PORTS_IN > 1) begin : g_input_sel + assign rd_req_queue_data_in = {tag_out, req_xbar_sel_out[i]}; + end else begin : g_no_input_sel + `UNUSED_VAR (req_xbar_sel_out[i]) + assign rd_req_queue_data_in = tag_out; + end VX_fifo_queue #( - .DATAW (TAG_WIDTH), + .DATAW (REQ_QUEUE_DATAW), .DEPTH (RD_QUEUE_SIZE) ) rd_req_queue ( .clk (clk), - .reset (bank_reset[i]), - .push (req_queue_push[i]), - .pop (req_queue_pop[i]), - .data_in (mem_req_tag), - .data_out (req_queue_tag_out[i]), + .reset (reset), + .push (rd_req_queue_push), + .pop (rd_req_queue_pop[i]), + .data_in (rd_req_queue_data_in), + .data_out (rd_req_queue_data_out[i]), `UNUSED_PIN (empty), `UNUSED_PIN (full), `UNUSED_PIN (alm_empty), `UNUSED_PIN (alm_full), `UNUSED_PIN (size) ); - end - - for (genvar i = 0; i < NUM_BANKS; ++i) begin - wire valid_out; - wire rw_out; - wire [DATA_SIZE-1:0] byteen_out; - wire [BANK_OFFSETW-1:0] addr_out; - wire [DATA_WIDTH-1:0] data_out; - wire ready_out; - - wire valid_out_w = mem_req_valid && ~req_queue_going_full[i] && (req_bank_sel == i); - wire ready_out_w; - - VX_elastic_buffer #( - .DATAW (1 + DATA_SIZE + BANK_OFFSETW + DATA_WIDTH), - .SIZE (`TO_OUT_BUF_SIZE(REQ_OUT_BUF)), - .OUT_REG (`TO_OUT_BUF_REG(REQ_OUT_BUF)) - ) req_out_buf ( - .clk (clk), - .reset (bank_reset[i]), - .valid_in (valid_out_w), - .ready_in (ready_out_w), - .data_in ({mem_req_rw, mem_req_byteen, req_bank_off, mem_req_data}), - .data_out ({rw_out, byteen_out, addr_out, data_out}), - .valid_out (valid_out), - .ready_out (ready_out) - ); assign avs_read[i] = valid_out && ~rw_out; assign avs_write[i] = valid_out && rw_out; - assign avs_address[i] = ADDR_WIDTH'(addr_out); + assign avs_address[i] = ADDR_WIDTH_OUT'(addr_out); assign avs_byteenable[i] = byteen_out; assign avs_writedata[i] = data_out; assign avs_burstcount[i] = BURST_WIDTH'(1); - assign ready_out = ~avs_waitrequest[i]; - - assign bank_req_ready[i] = 
ready_out_w && ~req_queue_going_full[i]; + assign req_xbar_ready_out[i] = ready_out; end - if (NUM_BANKS > 1) begin - assign mem_req_ready = bank_req_ready[req_bank_sel]; - end else begin - assign mem_req_ready = bank_req_ready; - end + // Responses handling - // Responses handling ///////////////////////////////////////////////////// + wire [NUM_BANKS_OUT-1:0] rsp_xbar_valid_in; + wire [NUM_BANKS_OUT-1:0][RSP_XBAR_DATAW-1:0] rsp_xbar_data_in; + wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] rsp_xbar_sel_in; + wire [NUM_BANKS_OUT-1:0] rsp_xbar_ready_in; - wire [NUM_BANKS-1:0] rsp_arb_valid_in; - wire [NUM_BANKS-1:0][DATA_WIDTH+TAG_WIDTH-1:0] rsp_arb_data_in; - wire [NUM_BANKS-1:0] rsp_arb_ready_in; + wire [NUM_PORTS_IN-1:0] rsp_xbar_valid_out; + wire [NUM_PORTS_IN-1:0][RSP_XBAR_DATAW-1:0] rsp_xbar_data_out; + wire [NUM_PORTS_IN-1:0] rsp_xbar_ready_out; - wire [NUM_BANKS-1:0][DATA_WIDTH-1:0] rsp_queue_data_out; - wire [NUM_BANKS-1:0] rsp_queue_empty; + for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_rsp_xbar_data_in - for (genvar i = 0; i < NUM_BANKS; ++i) begin + wire [DATA_WIDTH-1:0] rsp_queue_data_out; + wire rsp_queue_empty; VX_fifo_queue #( .DATAW (DATA_WIDTH), .DEPTH (RD_QUEUE_SIZE) - ) rd_rsp_queue ( + ) rsp_queue ( .clk (clk), - .reset (bank_reset[i]), + .reset (reset), .push (avs_readdatavalid[i]), - .pop (req_queue_pop[i]), + .pop (rd_req_queue_pop[i]), .data_in (avs_readdata[i]), - .data_out (rsp_queue_data_out[i]), - .empty (rsp_queue_empty[i]), + .data_out (rsp_queue_data_out), + .empty (rsp_queue_empty), `UNUSED_PIN (full), `UNUSED_PIN (alm_empty), `UNUSED_PIN (alm_full), `UNUSED_PIN (size) ); + + assign rsp_xbar_valid_in[i] = ~rsp_queue_empty; + assign rsp_xbar_data_in[i] = {rsp_queue_data_out, rd_req_queue_data_out[i][NUM_PORTS_IN_BITS +: TAG_WIDTH]}; + if (NUM_PORTS_IN > 1) begin : g_input_sel + assign rsp_xbar_sel_in[i] = rd_req_queue_data_out[i][0 +: NUM_PORTS_IN_BITS]; + end else begin : g_no_input_sel + assign rsp_xbar_sel_in[i] = 0; + 
end + assign rd_req_queue_pop[i] = rsp_xbar_valid_in[i] && rsp_xbar_ready_in[i]; end - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign rsp_arb_valid_in[i] = !rsp_queue_empty[i]; - assign rsp_arb_data_in[i] = {rsp_queue_data_out[i], req_queue_tag_out[i]}; - assign req_queue_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i]; - end - - VX_stream_arb #( - .NUM_INPUTS (NUM_BANKS), - .DATAW (DATA_WIDTH + TAG_WIDTH), - .ARBITER ("F"), + VX_stream_xbar #( + .NUM_INPUTS (NUM_BANKS_OUT), + .NUM_OUTPUTS(NUM_PORTS_IN), + .DATAW (RSP_XBAR_DATAW), + .ARBITER (ARBITER), .OUT_BUF (RSP_OUT_BUF) - ) rsp_arb ( + ) rsp_xbar ( .clk (clk), .reset (reset), - .valid_in (rsp_arb_valid_in), - .data_in (rsp_arb_data_in), - .ready_in (rsp_arb_ready_in), - .data_out ({mem_rsp_data, mem_rsp_tag}), - .valid_out (mem_rsp_valid), - .ready_out (mem_rsp_ready), + .valid_in (rsp_xbar_valid_in), + .data_in (rsp_xbar_data_in), + .ready_in (rsp_xbar_ready_in), + .sel_in (rsp_xbar_sel_in), + .data_out (rsp_xbar_data_out), + .valid_out (rsp_xbar_valid_out), + .ready_out (rsp_xbar_ready_out), + `UNUSED_PIN (collisions), `UNUSED_PIN (sel_out) ); + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_rsp_xbar_data_out + assign mem_rsp_valid[i] = rsp_xbar_valid_out[i]; + assign {mem_rsp_data[i], mem_rsp_tag[i]} = rsp_xbar_data_out[i]; + assign rsp_xbar_ready_out[i] = mem_rsp_ready[i]; + end + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_axi_adapter.sv b/hw/rtl/libs/VX_axi_adapter.sv index 7fffb9be2..8ea658511 100644 --- a/hw/rtl/libs/VX_axi_adapter.sv +++ b/hw/rtl/libs/VX_axi_adapter.sv @@ -16,170 +16,286 @@ `TRACING_OFF module VX_axi_adapter #( parameter DATA_WIDTH = 512, - parameter ADDR_WIDTH = 32, - parameter TAG_WIDTH = 8, - parameter NUM_BANKS = 1, - parameter AVS_ADDR_WIDTH = (ADDR_WIDTH - `CLOG2(DATA_WIDTH/8)), - parameter RSP_OUT_BUF = 0 -) ( + parameter ADDR_WIDTH_IN = 26, // word-addressable + parameter ADDR_WIDTH_OUT = 32, // byte-addressable + parameter TAG_WIDTH_IN = 8, + parameter 
TAG_WIDTH_OUT = 8, + parameter NUM_PORTS_IN = 1, + parameter NUM_BANKS_OUT = 1, + parameter INTERLEAVE = 0, + parameter TAG_BUFFER_SIZE= 16, + parameter ARBITER = "R", + parameter REQ_OUT_BUF = 0, + parameter RSP_OUT_BUF = 0, + parameter DATA_SIZE = DATA_WIDTH/8 + ) ( input wire clk, input wire reset, // Vortex request - input wire mem_req_valid, - input wire mem_req_rw, - input wire [DATA_WIDTH/8-1:0] mem_req_byteen, - input wire [AVS_ADDR_WIDTH-1:0] mem_req_addr, - input wire [DATA_WIDTH-1:0] mem_req_data, - input wire [TAG_WIDTH-1:0] mem_req_tag, - output wire mem_req_ready, + input wire mem_req_valid [NUM_PORTS_IN], + input wire mem_req_rw [NUM_PORTS_IN], + input wire [DATA_SIZE-1:0] mem_req_byteen [NUM_PORTS_IN], + input wire [ADDR_WIDTH_IN-1:0] mem_req_addr [NUM_PORTS_IN], + input wire [DATA_WIDTH-1:0] mem_req_data [NUM_PORTS_IN], + input wire [TAG_WIDTH_IN-1:0] mem_req_tag [NUM_PORTS_IN], + output wire mem_req_ready [NUM_PORTS_IN], // Vortex response - output wire mem_rsp_valid, - output wire [DATA_WIDTH-1:0] mem_rsp_data, - output wire [TAG_WIDTH-1:0] mem_rsp_tag, - input wire mem_rsp_ready, + output wire mem_rsp_valid [NUM_PORTS_IN], + output wire [DATA_WIDTH-1:0] mem_rsp_data [NUM_PORTS_IN], + output wire [TAG_WIDTH_IN-1:0] mem_rsp_tag [NUM_PORTS_IN], + input wire mem_rsp_ready [NUM_PORTS_IN], // AXI write request address channel - output wire m_axi_awvalid [NUM_BANKS], - input wire m_axi_awready [NUM_BANKS], - output wire [ADDR_WIDTH-1:0] m_axi_awaddr [NUM_BANKS], - output wire [TAG_WIDTH-1:0] m_axi_awid [NUM_BANKS], - output wire [7:0] m_axi_awlen [NUM_BANKS], - output wire [2:0] m_axi_awsize [NUM_BANKS], - output wire [1:0] m_axi_awburst [NUM_BANKS], - output wire [1:0] m_axi_awlock [NUM_BANKS], - output wire [3:0] m_axi_awcache [NUM_BANKS], - output wire [2:0] m_axi_awprot [NUM_BANKS], - output wire [3:0] m_axi_awqos [NUM_BANKS], - output wire [3:0] m_axi_awregion [NUM_BANKS], + output wire m_axi_awvalid [NUM_BANKS_OUT], + input wire m_axi_awready 
[NUM_BANKS_OUT], + output wire [ADDR_WIDTH_OUT-1:0] m_axi_awaddr [NUM_BANKS_OUT], + output wire [TAG_WIDTH_OUT-1:0] m_axi_awid [NUM_BANKS_OUT], + output wire [7:0] m_axi_awlen [NUM_BANKS_OUT], + output wire [2:0] m_axi_awsize [NUM_BANKS_OUT], + output wire [1:0] m_axi_awburst [NUM_BANKS_OUT], + output wire [1:0] m_axi_awlock [NUM_BANKS_OUT], + output wire [3:0] m_axi_awcache [NUM_BANKS_OUT], + output wire [2:0] m_axi_awprot [NUM_BANKS_OUT], + output wire [3:0] m_axi_awqos [NUM_BANKS_OUT], + output wire [3:0] m_axi_awregion [NUM_BANKS_OUT], // AXI write request data channel - output wire m_axi_wvalid [NUM_BANKS], - input wire m_axi_wready [NUM_BANKS], - output wire [DATA_WIDTH-1:0] m_axi_wdata [NUM_BANKS], - output wire [DATA_WIDTH/8-1:0] m_axi_wstrb [NUM_BANKS], - output wire m_axi_wlast [NUM_BANKS], + output wire m_axi_wvalid [NUM_BANKS_OUT], + input wire m_axi_wready [NUM_BANKS_OUT], + output wire [DATA_WIDTH-1:0] m_axi_wdata [NUM_BANKS_OUT], + output wire [DATA_SIZE-1:0] m_axi_wstrb [NUM_BANKS_OUT], + output wire m_axi_wlast [NUM_BANKS_OUT], // AXI write response channel - input wire m_axi_bvalid [NUM_BANKS], - output wire m_axi_bready [NUM_BANKS], - input wire [TAG_WIDTH-1:0] m_axi_bid [NUM_BANKS], - input wire [1:0] m_axi_bresp [NUM_BANKS], + input wire m_axi_bvalid [NUM_BANKS_OUT], + output wire m_axi_bready [NUM_BANKS_OUT], + input wire [TAG_WIDTH_OUT-1:0] m_axi_bid [NUM_BANKS_OUT], + input wire [1:0] m_axi_bresp [NUM_BANKS_OUT], // AXI read address channel - output wire m_axi_arvalid [NUM_BANKS], - input wire m_axi_arready [NUM_BANKS], - output wire [ADDR_WIDTH-1:0] m_axi_araddr [NUM_BANKS], - output wire [TAG_WIDTH-1:0] m_axi_arid [NUM_BANKS], - output wire [7:0] m_axi_arlen [NUM_BANKS], - output wire [2:0] m_axi_arsize [NUM_BANKS], - output wire [1:0] m_axi_arburst [NUM_BANKS], - output wire [1:0] m_axi_arlock [NUM_BANKS], - output wire [3:0] m_axi_arcache [NUM_BANKS], - output wire [2:0] m_axi_arprot [NUM_BANKS], - output wire [3:0] m_axi_arqos 
[NUM_BANKS], - output wire [3:0] m_axi_arregion [NUM_BANKS], + output wire m_axi_arvalid [NUM_BANKS_OUT], + input wire m_axi_arready [NUM_BANKS_OUT], + output wire [ADDR_WIDTH_OUT-1:0] m_axi_araddr [NUM_BANKS_OUT], + output wire [TAG_WIDTH_OUT-1:0] m_axi_arid [NUM_BANKS_OUT], + output wire [7:0] m_axi_arlen [NUM_BANKS_OUT], + output wire [2:0] m_axi_arsize [NUM_BANKS_OUT], + output wire [1:0] m_axi_arburst [NUM_BANKS_OUT], + output wire [1:0] m_axi_arlock [NUM_BANKS_OUT], + output wire [3:0] m_axi_arcache [NUM_BANKS_OUT], + output wire [2:0] m_axi_arprot [NUM_BANKS_OUT], + output wire [3:0] m_axi_arqos [NUM_BANKS_OUT], + output wire [3:0] m_axi_arregion [NUM_BANKS_OUT], // AXI read response channel - input wire m_axi_rvalid [NUM_BANKS], - output wire m_axi_rready [NUM_BANKS], - input wire [DATA_WIDTH-1:0] m_axi_rdata [NUM_BANKS], - input wire m_axi_rlast [NUM_BANKS], - input wire [TAG_WIDTH-1:0] m_axi_rid [NUM_BANKS], - input wire [1:0] m_axi_rresp [NUM_BANKS] + input wire m_axi_rvalid [NUM_BANKS_OUT], + output wire m_axi_rready [NUM_BANKS_OUT], + input wire [DATA_WIDTH-1:0] m_axi_rdata [NUM_BANKS_OUT], + input wire m_axi_rlast [NUM_BANKS_OUT], + input wire [TAG_WIDTH_OUT-1:0] m_axi_rid [NUM_BANKS_OUT], + input wire [1:0] m_axi_rresp [NUM_BANKS_OUT] ); - localparam AXSIZE = `CLOG2(DATA_WIDTH/8); - localparam BANK_ADDRW = `LOG2UP(NUM_BANKS); - localparam LOG2_NUM_BANKS = `CLOG2(NUM_BANKS); + localparam LOG2_DATA_SIZE = `CLOG2(DATA_SIZE); + localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS_OUT); + localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); + localparam DST_ADDR_WDITH = (ADDR_WIDTH_OUT - LOG2_DATA_SIZE) + BANK_SEL_BITS; // convert byte-addressable output addresss to block-addressable input space + localparam BANK_ADDR_WIDTH = DST_ADDR_WDITH - BANK_SEL_BITS; + localparam NUM_PORTS_IN_BITS = `CLOG2(NUM_PORTS_IN); + localparam NUM_PORTS_IN_WIDTH = `UP(NUM_PORTS_IN_BITS); + localparam TAG_BUFFER_ADDRW = `CLOG2(TAG_BUFFER_SIZE); + localparam NEEDED_TAG_WIDTH = TAG_WIDTH_IN 
+ NUM_PORTS_IN_BITS; + localparam READ_TAG_WIDTH = (NEEDED_TAG_WIDTH > TAG_WIDTH_OUT) ? TAG_BUFFER_ADDRW : TAG_WIDTH_IN; + localparam READ_FULL_TAG_WIDTH = READ_TAG_WIDTH + NUM_PORTS_IN_BITS; + localparam WRITE_TAG_WIDTH = `MIN(TAG_WIDTH_IN, TAG_WIDTH_OUT); + localparam DST_TAG_WIDTH = `MAX(READ_FULL_TAG_WIDTH, WRITE_TAG_WIDTH); + localparam XBAR_TAG_WIDTH = `MAX(READ_TAG_WIDTH, WRITE_TAG_WIDTH); + localparam REQ_XBAR_DATAW = 1 + BANK_ADDR_WIDTH + DATA_SIZE + DATA_WIDTH + XBAR_TAG_WIDTH; + localparam RSP_XBAR_DATAW = DATA_WIDTH + READ_TAG_WIDTH; - wire [BANK_ADDRW-1:0] req_bank_sel; + `STATIC_ASSERT ((DST_ADDR_WDITH >= ADDR_WIDTH_IN), ("invalid address width: current=%0d, expected=%0d", DST_ADDR_WDITH, ADDR_WIDTH_IN)) + `STATIC_ASSERT ((TAG_WIDTH_OUT >= DST_TAG_WIDTH), ("invalid output tag width: current=%0d, expected=%0d", TAG_WIDTH_OUT, DST_TAG_WIDTH)) - if (NUM_BANKS > 1) begin - assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0]; - end else begin - assign req_bank_sel = '0; - end + // Bank selection - wire mem_req_fire = mem_req_valid && mem_req_ready; + wire [NUM_PORTS_IN-1:0][BANK_SEL_WIDTH-1:0] req_bank_sel; + wire [NUM_PORTS_IN-1:0][BANK_ADDR_WIDTH-1:0] req_bank_addr; - reg [NUM_BANKS-1:0] m_axi_aw_ack; - reg [NUM_BANKS-1:0] m_axi_w_ack; - - for (genvar i = 0; i < NUM_BANKS; ++i) begin - wire m_axi_aw_fire = m_axi_awvalid[i] && m_axi_awready[i]; - wire m_axi_w_fire = m_axi_wvalid[i] && m_axi_wready[i]; - always @(posedge clk) begin - if (reset) begin - m_axi_aw_ack[i] <= 0; - m_axi_w_ack[i] <= 0; - end else begin - if (mem_req_fire && (req_bank_sel == i)) begin - m_axi_aw_ack[i] <= 0; - m_axi_w_ack[i] <= 0; - end else begin - if (m_axi_aw_fire) - m_axi_aw_ack[i] <= 1; - if (m_axi_w_fire) - m_axi_w_ack[i] <= 1; - end + if (NUM_BANKS_OUT > 1) begin : g_bank_sel + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_i + wire [DST_ADDR_WDITH-1:0] mem_req_addr_dst = DST_ADDR_WDITH'(mem_req_addr[i]); + if (INTERLEAVE) begin : g_interleave + assign req_bank_sel[i] 
= mem_req_addr_dst[BANK_SEL_BITS-1:0]; + assign req_bank_addr[i] = mem_req_addr_dst[BANK_SEL_BITS +: BANK_ADDR_WIDTH]; + end else begin : g_no_interleave + assign req_bank_sel[i] = mem_req_addr_dst[BANK_ADDR_WIDTH +: BANK_SEL_BITS]; + assign req_bank_addr[i] = mem_req_addr_dst[BANK_ADDR_WIDTH-1:0]; end end + end else begin : g_no_bank_sel + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_i + assign req_bank_sel[i] = '0; + assign req_bank_addr[i] = DST_ADDR_WDITH'(mem_req_addr[i]); + end end - wire axi_write_ready [NUM_BANKS]; + // Tag handling logic - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign axi_write_ready[i] = (m_axi_awready[i] || m_axi_aw_ack[i]) - && (m_axi_wready[i] || m_axi_w_ack[i]); + wire [NUM_PORTS_IN-1:0] mem_rd_req_tag_ready; + wire [NUM_PORTS_IN-1:0][READ_TAG_WIDTH-1:0] mem_rd_req_tag; + wire [NUM_PORTS_IN-1:0][READ_TAG_WIDTH-1:0] mem_rd_rsp_tag; + + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_tag_buf + if (NEEDED_TAG_WIDTH > TAG_WIDTH_OUT) begin : g_enabled + wire [TAG_BUFFER_ADDRW-1:0] tbuf_waddr, tbuf_raddr; + wire tbuf_full; + VX_index_buffer #( + .DATAW (TAG_WIDTH_IN), + .SIZE (TAG_BUFFER_SIZE) + ) tag_buf ( + .clk (clk), + .reset (reset), + .acquire_en (mem_req_valid[i] && ~mem_req_rw[i] && mem_req_ready[i]), + .write_addr (tbuf_waddr), + .write_data (mem_req_tag[i]), + .read_data (mem_rsp_tag[i]), + .read_addr (tbuf_raddr), + .release_en (mem_rsp_valid[i] && mem_rsp_ready[i]), + .full (tbuf_full), + `UNUSED_PIN (empty) + ); + assign mem_rd_req_tag_ready[i] = ~tbuf_full; + assign mem_rd_req_tag[i] = tbuf_waddr; + assign tbuf_raddr = mem_rd_rsp_tag[i]; + end else begin : g_none + assign mem_rd_req_tag_ready[i] = 1; + assign mem_rd_req_tag[i] = mem_req_tag[i]; + assign mem_rsp_tag[i] = mem_rd_rsp_tag[i]; + end end - // Vortex request ack - if (NUM_BANKS > 1) begin - assign mem_req_ready = mem_req_rw ? axi_write_ready[req_bank_sel] : m_axi_arready[req_bank_sel]; - end else begin - assign mem_req_ready = mem_req_rw ? 
axi_write_ready[0] : m_axi_arready[0]; + // AXI request handling + + wire [NUM_PORTS_IN-1:0] req_xbar_valid_in; + wire [NUM_PORTS_IN-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_in; + wire [NUM_PORTS_IN-1:0] req_xbar_ready_in; + + wire [NUM_BANKS_OUT-1:0] req_xbar_valid_out; + wire [NUM_BANKS_OUT-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_out; + wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] req_xbar_sel_out; + wire [NUM_BANKS_OUT-1:0] req_xbar_ready_out; + + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_req_xbar_data_in + wire tag_ready = mem_req_rw[i] || mem_rd_req_tag_ready[i]; + wire [XBAR_TAG_WIDTH-1:0] tag_value = mem_req_rw[i] ? XBAR_TAG_WIDTH'(mem_req_tag[i]) : XBAR_TAG_WIDTH'(mem_rd_req_tag[i]); + assign req_xbar_valid_in[i] = mem_req_valid[i] && tag_ready; + assign req_xbar_data_in[i] = {mem_req_rw[i], req_bank_addr[i], mem_req_byteen[i], mem_req_data[i], tag_value}; + assign mem_req_ready[i] = req_xbar_ready_in[i] && tag_ready; end - // AXI write request address channel - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign m_axi_awvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_aw_ack[i]; - assign m_axi_awaddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE; - assign m_axi_awid[i] = mem_req_tag; + VX_stream_xbar #( + .NUM_INPUTS (NUM_PORTS_IN), + .NUM_OUTPUTS(NUM_BANKS_OUT), + .DATAW (REQ_XBAR_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (REQ_OUT_BUF) + ) req_xbar ( + .clk (clk), + .reset (reset), + .sel_in (req_bank_sel), + .valid_in (req_xbar_valid_in), + .data_in (req_xbar_data_in), + .ready_in (req_xbar_ready_in), + .valid_out (req_xbar_valid_out), + .data_out (req_xbar_data_out), + .ready_out (req_xbar_ready_out), + .sel_out (req_xbar_sel_out), + `UNUSED_PIN (collisions) + ); + + for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_reqs + + wire xbar_rw_out; + wire [BANK_ADDR_WIDTH-1:0] xbar_addr_out; + wire [XBAR_TAG_WIDTH-1:0] xbar_tag_out; + wire [DATA_WIDTH-1:0] xbar_data_out; + wire [DATA_SIZE-1:0] 
xbar_byteen_out; + assign { + xbar_rw_out, + xbar_addr_out, + xbar_byteen_out, + xbar_data_out, + xbar_tag_out + } = req_xbar_data_out[i]; + + // AXI request handshake + + wire m_axi_aw_ack, m_axi_w_ack, axi_write_ready; + + VX_axi_write_ack axi_write_ack ( + .clk (clk), + .reset (reset), + .awvalid(m_axi_awvalid[i]), + .awready(m_axi_awready[i]), + .wvalid (m_axi_wvalid[i]), + .wready (m_axi_wready[i]), + .aw_ack (m_axi_aw_ack), + .w_ack (m_axi_w_ack), + .tx_rdy (axi_write_ready), + `UNUSED_PIN (tx_ack) + ); + + assign req_xbar_ready_out[i] = xbar_rw_out ? axi_write_ready : m_axi_arready[i]; + + // AXI write address channel + + assign m_axi_awvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_aw_ack; + + if (INTERLEAVE) begin : g_m_axi_awaddr_i + assign m_axi_awaddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE); + end else begin : g_m_axi_awaddr_ni + assign m_axi_awaddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE)); + end + + assign m_axi_awid[i] = TAG_WIDTH_OUT'(xbar_tag_out); assign m_axi_awlen[i] = 8'b00000000; - assign m_axi_awsize[i] = 3'(AXSIZE); + assign m_axi_awsize[i] = 3'(LOG2_DATA_SIZE); assign m_axi_awburst[i] = 2'b00; assign m_axi_awlock[i] = 2'b00; assign m_axi_awcache[i] = 4'b0000; assign m_axi_awprot[i] = 3'b000; assign m_axi_awqos[i] = 4'b0000; assign m_axi_awregion[i]= 4'b0000; - end - // AXI write request data channel - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign m_axi_wvalid[i] = mem_req_valid && mem_req_rw && (req_bank_sel == i) && ~m_axi_w_ack[i]; - assign m_axi_wdata[i] = mem_req_data; - assign m_axi_wstrb[i] = mem_req_byteen; - assign m_axi_wlast[i] = 1'b1; - end + // AXI write data channel - // AXI write response channel (ignore) - for (genvar i = 0; i < NUM_BANKS; ++i) begin - `UNUSED_VAR (m_axi_bvalid[i]) - `UNUSED_VAR (m_axi_bid[i]) - `UNUSED_VAR (m_axi_bresp[i]) - assign
m_axi_bready[i] = 1'b1; - `RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time)); - end + assign m_axi_wvalid[i] = req_xbar_valid_out[i] && xbar_rw_out && ~m_axi_w_ack; + assign m_axi_wstrb[i] = xbar_byteen_out; + assign m_axi_wdata[i] = xbar_data_out; + assign m_axi_wlast[i] = 1'b1; - // AXI read request channel - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign m_axi_arvalid[i] = mem_req_valid && ~mem_req_rw && (req_bank_sel == i); - assign m_axi_araddr[i] = (ADDR_WIDTH'(mem_req_addr) >> LOG2_NUM_BANKS) << AXSIZE; - assign m_axi_arid[i] = mem_req_tag; + // AXI read address channel + + wire [READ_FULL_TAG_WIDTH-1:0] xbar_tag_r_out; + if (NUM_PORTS_IN > 1) begin : g_xbar_tag_r_out + assign xbar_tag_r_out = READ_FULL_TAG_WIDTH'({xbar_tag_out, req_xbar_sel_out[i]}); + end else begin : g_no_input_sel + `UNUSED_VAR (req_xbar_sel_out) + assign xbar_tag_r_out = READ_TAG_WIDTH'(xbar_tag_out); + end + + assign m_axi_arvalid[i] = req_xbar_valid_out[i] && ~xbar_rw_out; + + // convert address to byte-addressable space + if (INTERLEAVE) begin : g_m_axi_araddr_i + assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << (BANK_SEL_BITS + LOG2_DATA_SIZE)) | (ADDR_WIDTH_OUT'(i) << LOG2_DATA_SIZE); + end else begin : g_m_axi_araddr_ni + assign m_axi_araddr[i] = (ADDR_WIDTH_OUT'(xbar_addr_out) << LOG2_DATA_SIZE) | (ADDR_WIDTH_OUT'(i) << (BANK_ADDR_WIDTH + LOG2_DATA_SIZE)); + end + assign m_axi_arid[i] = TAG_WIDTH_OUT'(xbar_tag_r_out); assign m_axi_arlen[i] = 8'b00000000; - assign m_axi_arsize[i] = 3'(AXSIZE); + assign m_axi_arsize[i] = 3'(LOG2_DATA_SIZE); assign m_axi_arburst[i] = 2'b00; assign m_axi_arlock[i] = 2'b00; assign m_axi_arcache[i] = 4'b0000; @@ -188,38 +304,65 @@ module VX_axi_adapter #( assign m_axi_arregion[i]= 4'b0000; end + // AXI write response channel (ignore) + + for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_axi_write_rsp + `UNUSED_VAR (m_axi_bvalid[i]) + `UNUSED_VAR (m_axi_bid[i]) + `UNUSED_VAR 
(m_axi_bresp[i]) + assign m_axi_bready[i] = 1'b1; + `RUNTIME_ASSERT(~m_axi_bvalid[i] || m_axi_bresp[i] == 0, ("%t: *** AXI response error", $time)) + end + // AXI read response channel - wire [NUM_BANKS-1:0] rsp_arb_valid_in; - wire [NUM_BANKS-1:0][DATA_WIDTH+TAG_WIDTH-1:0] rsp_arb_data_in; - wire [NUM_BANKS-1:0] rsp_arb_ready_in; + wire [NUM_BANKS_OUT-1:0] rsp_xbar_valid_in; + wire [NUM_BANKS_OUT-1:0][RSP_XBAR_DATAW-1:0] rsp_xbar_data_in; + wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] rsp_xbar_sel_in; + wire [NUM_BANKS_OUT-1:0] rsp_xbar_ready_in; - `UNUSED_VAR (m_axi_rlast) - - for (genvar i = 0; i < NUM_BANKS; ++i) begin - assign rsp_arb_valid_in[i] = m_axi_rvalid[i]; - assign rsp_arb_data_in[i] = {m_axi_rdata[i], m_axi_rid[i]}; - assign m_axi_rready[i] = rsp_arb_ready_in[i]; - `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rlast[i] == 1, ("%t: *** AXI response error", $time)); - `RUNTIME_ASSERT(~m_axi_rvalid[i] || m_axi_rresp[i] == 0, ("%t: *** AXI response error", $time)); + for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_rsp_xbar_data_in + assign rsp_xbar_valid_in[i] = m_axi_rvalid[i]; + assign rsp_xbar_data_in[i] = {m_axi_rdata[i], m_axi_rid[i][NUM_PORTS_IN_BITS +: READ_TAG_WIDTH]}; + if (NUM_PORTS_IN > 1) begin : g_input_sel + assign rsp_xbar_sel_in[i] = m_axi_rid[i][0 +: NUM_PORTS_IN_BITS]; + end else begin : g_no_input_sel + assign rsp_xbar_sel_in[i] = 0; + end + assign m_axi_rready[i] = rsp_xbar_ready_in[i]; + `RUNTIME_ASSERT(~(m_axi_rvalid[i] && m_axi_rlast[i] == 0), ("%t: *** AXI response error", $time)) + `RUNTIME_ASSERT(~(m_axi_rvalid[i] && m_axi_rresp[i] != 0), ("%t: *** AXI response error", $time)) end - - VX_stream_arb #( - .NUM_INPUTS (NUM_BANKS), - .DATAW (DATA_WIDTH + TAG_WIDTH), - .ARBITER ("F"), + + wire [NUM_PORTS_IN-1:0] rsp_xbar_valid_out; + wire [NUM_PORTS_IN-1:0][DATA_WIDTH+READ_TAG_WIDTH-1:0] rsp_xbar_data_out; + wire [NUM_PORTS_IN-1:0] rsp_xbar_ready_out; + + VX_stream_xbar #( + .NUM_INPUTS (NUM_BANKS_OUT), + 
.NUM_OUTPUTS(NUM_PORTS_IN), + .DATAW (RSP_XBAR_DATAW), + .ARBITER (ARBITER), .OUT_BUF (RSP_OUT_BUF) - ) rsp_arb ( + ) rsp_xbar ( .clk (clk), .reset (reset), - .valid_in (rsp_arb_valid_in), - .data_in (rsp_arb_data_in), - .ready_in (rsp_arb_ready_in), - .data_out ({mem_rsp_data, mem_rsp_tag}), - .valid_out (mem_rsp_valid), - .ready_out (mem_rsp_ready), + .valid_in (rsp_xbar_valid_in), + .data_in (rsp_xbar_data_in), + .ready_in (rsp_xbar_ready_in), + .sel_in (rsp_xbar_sel_in), + .data_out (rsp_xbar_data_out), + .valid_out (rsp_xbar_valid_out), + .ready_out (rsp_xbar_ready_out), + `UNUSED_PIN (collisions), `UNUSED_PIN (sel_out) ); + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_rsp_xbar_data_out + assign mem_rsp_valid[i] = rsp_xbar_valid_out[i]; + assign {mem_rsp_data[i], mem_rd_rsp_tag[i]} = rsp_xbar_data_out[i]; + assign rsp_xbar_ready_out[i] = mem_rsp_ready[i]; + end + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_axi_write_ack.sv b/hw/rtl/libs/VX_axi_write_ack.sv new file mode 100644 index 000000000..faf4fc858 --- /dev/null +++ b/hw/rtl/libs/VX_axi_write_ack.sv @@ -0,0 +1,60 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_platform.vh" + +`TRACING_OFF +module VX_axi_write_ack ( + input wire clk, + input wire reset, + input wire awvalid, + input wire awready, + input wire wvalid, + input wire wready, + output wire aw_ack, + output wire w_ack, + output wire tx_ack, + output wire tx_rdy +); + reg aw_fired; + reg w_fired; + + wire aw_fire = awvalid && awready; + wire w_fire = wvalid && wready; + + always @(posedge clk) begin + if (reset) begin + aw_fired <= 0; + w_fired <= 0; + end else begin + if (aw_fire) begin + aw_fired <= 1; + end + if (w_fire) begin + w_fired <= 1; + end + if (tx_ack) begin + aw_fired <= 0; + w_fired <= 0; + end + end + end + + assign aw_ack = aw_fired; + assign w_ack = w_fired; + + assign tx_ack = (aw_fire || aw_fired) && (w_fire || w_fired); + assign tx_rdy = (awready || aw_fired) && (wready || w_fired); + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_bits_concat.sv b/hw/rtl/libs/VX_bits_concat.sv new file mode 100644 index 000000000..cb3cec430 --- /dev/null +++ b/hw/rtl/libs/VX_bits_concat.sv @@ -0,0 +1,36 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_platform.vh" + +`TRACING_OFF +module VX_bits_concat #( + parameter L = 1, + parameter R = 1 +) ( + input wire [`UP(L)-1:0] left_in, + input wire [`UP(R)-1:0] right_in, + output wire [(L+R)-1:0] data_out +); + if (L == 0) begin : g_right_only + `UNUSED_VAR (left_in) + assign data_out = right_in; + end else if (R == 0) begin : g_left_only + `UNUSED_VAR (right_in) + assign data_out = left_in; + end else begin : g_concat + assign data_out = {left_in, right_in}; + end + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_bits_insert.sv b/hw/rtl/libs/VX_bits_insert.sv index f0f00a2b5..dee8141bb 100644 --- a/hw/rtl/libs/VX_bits_insert.sv +++ b/hw/rtl/libs/VX_bits_insert.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,19 +19,19 @@ module VX_bits_insert #( parameter S = 1, parameter POS = 0 ) ( - input wire [N-1:0] data_in, - input wire [`UP(S)-1:0] ins_in, + input wire [N-1:0] data_in, + input wire [`UP(S)-1:0] ins_in, output wire [N+S-1:0] data_out -); - if (S == 0) begin +); + if (S == 0) begin : g_passthru `UNUSED_VAR (ins_in) assign data_out = data_in; - end else begin - if (POS == 0) begin + end else begin : g_insert + if (POS == 0) begin : g_pos_0 assign data_out = {data_in, ins_in}; - end else if (POS == N) begin + end else if (POS == N) begin : g_pos_N assign data_out = {ins_in, data_in}; - end else begin + end else begin : g_pos assign data_out = {data_in[N-1:POS], ins_in, data_in[POS-1:0]}; end end diff --git a/hw/rtl/libs/VX_bits_remove.sv b/hw/rtl/libs/VX_bits_remove.sv index bc2f60a70..fae7d470c 100644 --- a/hw/rtl/libs/VX_bits_remove.sv +++ b/hw/rtl/libs/VX_bits_remove.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,17 +19,24 @@ module VX_bits_remove #( parameter S = 1, parameter POS = 0 ) ( - input wire [N-1:0] data_in, + input wire [N-1:0] data_in, + output wire [`UP(S)-1:0] sel_out, output wire [N-S-1:0] data_out ); `STATIC_ASSERT (((0 == S) || ((POS + S) <= N)), ("invalid parameter")) - - if (POS == 0 || S == 0) begin + + if (S == 0) begin : g_passthru + assign sel_out = 0; + assign data_out = data_in; + end else if (POS == 0) begin : g_pos_0 + assign sel_out = data_in[0 +: S]; assign data_out = data_in[N-1:S]; - end else if ((POS + S) < N) begin - assign data_out = {data_in[N-1:(POS+S)], data_in[POS-1:0]}; - end else begin + end else if ((POS + S) == N) begin : g_pos_N + assign sel_out = data_in[POS +: S]; assign data_out = data_in[POS-1:0]; + end else begin : g_pos + assign sel_out = data_in[POS +: S]; + assign data_out = {data_in[N-1:(POS+S)], data_in[POS-1:0]}; end `UNUSED_VAR (data_in) diff --git a/hw/rtl/libs/VX_bypass_buffer.sv b/hw/rtl/libs/VX_bypass_buffer.sv index 4eefce440..7378a4fdd 100644 --- a/hw/rtl/libs/VX_bypass_buffer.sv +++ b/hw/rtl/libs/VX_bypass_buffer.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -25,30 +25,33 @@ module VX_bypass_buffer #( parameter DATAW = 1, parameter PASSTHRU = 0 -) ( +) ( input wire clk, input wire reset, input wire valid_in, - output wire ready_in, + output wire ready_in, input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out, input wire ready_out, output wire valid_out -); - if (PASSTHRU != 0) begin +); + if (PASSTHRU != 0) begin : g_passthru + `UNUSED_VAR (clk) `UNUSED_VAR (reset) assign ready_in = ready_out; - assign valid_out = valid_in; + assign valid_out = valid_in; assign data_out = data_in; - end else begin + + end else begin : g_buffer + reg [DATAW-1:0] buffer; reg has_data; always @(posedge clk) begin if (reset) begin has_data <= 0; - end else begin + end else begin if (ready_out) begin has_data <= 0; end else if (~has_data) begin @@ -63,6 +66,7 @@ module VX_bypass_buffer #( assign ready_in = ready_out || ~has_data; assign data_out = has_data ? buffer : data_in; assign valid_out = valid_in || has_data; + end endmodule diff --git a/hw/rtl/libs/VX_cyclic_arbiter.sv b/hw/rtl/libs/VX_cyclic_arbiter.sv index c4a42da14..283c0aa4f 100644 --- a/hw/rtl/libs/VX_cyclic_arbiter.sv +++ b/hw/rtl/libs/VX_cyclic_arbiter.sv @@ -26,42 +26,58 @@ module VX_cyclic_arbiter #( output wire grant_valid, input wire grant_ready ); - if (NUM_REQS == 1) begin + if (NUM_REQS == 1) begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) + `UNUSED_VAR (grant_ready) assign grant_index = '0; assign grant_onehot = requests; assign grant_valid = requests[0]; - end else begin + end else begin : g_arbiter localparam IS_POW2 = (1 << LOG_NUM_REQS) == NUM_REQS; + wire [LOG_NUM_REQS-1:0] grant_index_um; + wire [NUM_REQS-1:0] grant_onehot_w, grant_onehot_um; reg [LOG_NUM_REQS-1:0] grant_index_r; always @(posedge clk) begin if (reset) begin grant_index_r <= '0; - end else begin - if (!IS_POW2 && grant_index_r == LOG_NUM_REQS'(NUM_REQS-1)) begin + end else if (grant_valid && grant_ready) begin + if (!IS_POW2 && grant_index == LOG_NUM_REQS'(NUM_REQS-1)) 
begin grant_index_r <= '0; - end else if (~grant_valid || grant_ready) begin - grant_index_r <= grant_index_r + LOG_NUM_REQS'(1); + end else begin + grant_index_r <= grant_index + LOG_NUM_REQS'(1); end end end - reg [NUM_REQS-1:0] grant_onehot_r; - always @(*) begin - grant_onehot_r = '0; - grant_onehot_r[grant_index_r] = 1'b1; - end + VX_priority_encoder #( + .N (NUM_REQS) + ) priority_encoder ( + .data_in (requests), + .onehot_out (grant_onehot_um), + .index_out (grant_index_um), + .valid_out (grant_valid) + ); - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; - assign grant_valid = requests[grant_index_r]; + VX_demux #( + .DATAW (1), + .N (NUM_REQS) + ) grant_decoder ( + .sel_in (grant_index), + .data_in (1'b1), + .data_out (grant_onehot_w) + ); + + wire is_hit = requests[grant_index_r]; + + assign grant_index = is_hit ? grant_index_r : grant_index_um; + assign grant_onehot = is_hit ? grant_onehot_w : grant_onehot_um; end diff --git a/hw/rtl/libs/VX_demux.sv b/hw/rtl/libs/VX_demux.sv new file mode 100644 index 000000000..6a1ddc853 --- /dev/null +++ b/hw/rtl/libs/VX_demux.sv @@ -0,0 +1,47 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_platform.vh" + +// Demultiplexer: for N > 1, the data_out slot selected by sel_in carries data_in and every other slot is zero; for N <= 1, data_in passes straight through +// NOTE(review): this header previously read "Fast encoder using parallel prefix computation, adapted from BaseJump STL (http://bjump.org/data_out.html)"; that does not match this module, confirm whether the attribution applies + +`TRACING_OFF +module VX_demux #( + parameter DATAW = 1, + parameter N = 0, + parameter MODEL = 0, + parameter LN = `LOG2UP(N) +) ( + input wire [LN-1:0] sel_in, + input wire [DATAW-1:0] data_in, + output wire [N-1:0][DATAW-1:0] data_out +); + if (N > 1) begin : g_demux + logic [N-1:0][DATAW-1:0] shift; + if (MODEL == 1) begin : g_model1 + always @(*) begin + shift = '0; + shift[sel_in] = {DATAW{1'b1}}; + end + end else begin : g_model0 + assign shift = ((N*DATAW)'({DATAW{1'b1}})) << (sel_in * DATAW); + end + assign data_out = {N{data_in}} & shift; + end else begin : g_passthru + `UNUSED_VAR (sel_in) + assign data_out = data_in; + end + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_divider.sv b/hw/rtl/libs/VX_divider.sv index 551940da1..b8424843d 100644 --- a/hw/rtl/libs/VX_divider.sv +++ b/hw/rtl/libs/VX_divider.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,7 +24,7 @@ module VX_divider #( parameter LATENCY = 0 ) ( input wire clk, - input wire enable, + input wire enable, input wire [N_WIDTH-1:0] numer, input wire [D_WIDTH-1:0] denom, output wire [Q_WIDTH-1:0] quotient, @@ -37,7 +37,7 @@ module VX_divider #( wire [D_WIDTH-1:0] remainder_unqual; lpm_divide divide ( - .clock (clk), + .clock (clk), .clken (enable), .numer (numer), .denom (denom), @@ -47,7 +47,7 @@ module VX_divider #( defparam divide.lpm_type = "LPM_DIVIDE", - divide.lpm_widthn = N_WIDTH, + divide.lpm_widthn = N_WIDTH, divide.lpm_widthd = D_WIDTH, divide.lpm_nrepresentation = N_SIGNED ? "SIGNED" : "UNSIGNED", divide.lpm_drepresentation = D_SIGNED ? "SIGNED" : "UNSIGNED", @@ -62,36 +62,36 @@ module VX_divider #( reg [N_WIDTH-1:0] quotient_unqual; reg [D_WIDTH-1:0] remainder_unqual; - always @(*) begin + always @(*) begin begin if (N_SIGNED && D_SIGNED) begin quotient_unqual = $signed(numer) / $signed(denom); remainder_unqual = $signed(numer) % $signed(denom); - end + end else if (N_SIGNED && !D_SIGNED) begin quotient_unqual = $signed(numer) / denom; remainder_unqual = $signed(numer) % denom; - end + end else if (!N_SIGNED && D_SIGNED) begin quotient_unqual = numer / $signed(denom); remainder_unqual = numer % $signed(denom); - end + end else begin quotient_unqual = numer / denom; - remainder_unqual = numer % denom; + remainder_unqual = numer % denom; end end end - if (LATENCY == 0) begin + if (LATENCY == 0) begin : g_comb assign quotient = quotient_unqual [Q_WIDTH-1:0]; assign remainder = remainder_unqual [R_WIDTH-1:0]; - end else begin + end else begin : g_pipe reg [N_WIDTH-1:0] quotient_pipe [LATENCY-1:0]; reg [D_WIDTH-1:0] remainder_pipe [LATENCY-1:0]; - for (genvar i = 0; i < LATENCY; ++i) begin - always @(posedge clk) begin + for (genvar i = 0; i < LATENCY; ++i) begin : g_reg + always @(posedge clk) begin if (enable) begin quotient_pipe[i] <= (0 == i) ? quotient_unqual : quotient_pipe[i-1]; remainder_pipe[i] <= (0 == i) ? 
remainder_unqual : remainder_pipe[i-1]; @@ -101,7 +101,7 @@ module VX_divider #( assign quotient = quotient_pipe[LATENCY-1][Q_WIDTH-1:0]; assign remainder = remainder_pipe[LATENCY-1][R_WIDTH-1:0]; - end + end `endif diff --git a/hw/rtl/libs/VX_dp_ram.sv b/hw/rtl/libs/VX_dp_ram.sv index 6683eaecc..87ba9999b 100644 --- a/hw/rtl/libs/VX_dp_ram.sv +++ b/hw/rtl/libs/VX_dp_ram.sv @@ -13,18 +13,68 @@ `include "VX_platform.vh" +`define RAM_INITIALIZATION \ + if (INIT_ENABLE != 0) begin : g_init \ + if (INIT_FILE != "") begin : g_file \ + initial $readmemh(INIT_FILE, ram); \ + end else begin : g_value \ + initial begin \ + for (integer i = 0; i < SIZE; ++i) begin : g_i \ + ram[i] = INIT_VALUE; \ + end \ + end \ + end \ + end + +`ifdef SIMULATION + `define RAM_RESET_BLOCK if (RESET_RAM && reset) begin \ + for (integer i = 0; i < SIZE; ++i) begin \ + ram[i] <= DATAW'(INIT_VALUE); \ + end \ + end else +`else + `define RAM_RESET_BLOCK +`endif + +`define RAM_WRITE_ALL `RAM_RESET_BLOCK \ + if (write) begin \ + ram[waddr] <= wdata; \ + end + +`ifdef QUARTUS + `define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; + `define RAM_WRITE_WREN `RAM_RESET_BLOCK \ + if (write) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[waddr][i] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end +`else + `define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1]; + `define RAM_WRITE_WREN `RAM_RESET_BLOCK \ + if (write) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end +`endif + `TRACING_OFF module VX_dp_ram #( parameter DATAW = 1, parameter SIZE = 1, - parameter ADDR_MIN = 0, parameter WRENW = 1, parameter OUT_REG = 0, - parameter NO_RWCHECK = 0, parameter LUTRAM = 0, - parameter RW_ASSERT = 0, + parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first + parameter RADDR_REG = 0, // read address registered hint + parameter RADDR_RESET = 
0, // read address has reset + parameter RDW_ASSERT = 0, parameter RESET_RAM = 0, - parameter READ_ENABLE = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", parameter [DATAW-1:0] INIT_VALUE = 0, @@ -41,206 +91,271 @@ module VX_dp_ram #( output wire [DATAW-1:0] rdata ); localparam WSELW = DATAW / WRENW; - `STATIC_ASSERT((WRENW * WSELW == DATAW), ("invalid parameter")) + `UNUSED_PARAM (LUTRAM) + `UNUSED_PARAM (RADDR_REG) + `UNUSED_PARAM (RADDR_RESET) -`define RAM_INITIALIZATION \ - if (INIT_ENABLE != 0) begin \ - if (INIT_FILE != "") begin \ - initial $readmemh(INIT_FILE, ram); \ - end else begin \ - initial \ - for (integer i = 0; i < SIZE; ++i) \ - ram[i] = INIT_VALUE; \ - end \ - end - - `UNUSED_PARAM (RW_ASSERT) - `UNUSED_VAR (read) - - if (WRENW > 1) begin - `RUNTIME_ASSERT(~write || (| wren), ("invalid write enable mask")); - end - - wire [DATAW-1:0] rdata_w; + `STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter")) + `STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W"), ("invalid parameter")) + `UNUSED_PARAM (RDW_ASSERT) `ifdef SYNTHESIS - if (WRENW > 1) begin - `ifdef QUARTUS - if (LUTRAM != 0) begin - `USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; + localparam FORCE_BRAM = !LUTRAM && `FORCE_BRAM(SIZE, DATAW); + if (OUT_REG) begin : g_sync + if (FORCE_BRAM) begin : g_bram + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + if (read) begin + raddr_r <= raddr; + end end + assign rdata = ram[raddr_r]; + end else begin : g_no_wren + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + `RAM_WRITE_ALL 
+ if (read) begin + raddr_r <= raddr; + end + end + assign rdata = ram[raddr_r]; + end + end else if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + if (read) begin + rdata_r <= ram[raddr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + if (read) begin + rdata_r <= ram[raddr]; + end + end + assign rdata = rdata_r; end end - assign rdata_w = ram[raddr]; - end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; + end else begin : g_auto + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RW_RAM_CHECK `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + if (read) begin + raddr_r <= raddr; end end - end - assign rdata_w = ram[raddr]; - end else begin - reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i] <= wdata[i * WSELW +: WSELW]; + assign rdata = ram[raddr_r]; + end else begin : g_no_wren + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + if (read) begin + raddr_r <= raddr; end end + assign rdata = ram[raddr_r]; + end + end else if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) 
begin + `RAM_WRITE_WREN + if (read) begin + rdata_r <= ram[raddr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + if (read) begin + rdata_r <= ram[raddr]; + end + end + assign rdata = rdata_r; end - assign rdata_w = ram[raddr]; end end - `else - // default synthesis - if (LUTRAM != 0) begin - `USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; + end else begin : g_async + `UNUSED_VAR (read) + if (FORCE_BRAM) begin : g_bram + `ifdef ASYNC_BRAM_PATCH + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (1), + .FORCE_BRAM (FORCE_BRAM), + .RADDR_REG (RADDR_REG), + .RADDR_RESET(RADDR_RESET), + .WRITE_FIRST(RDW_MODE == "W"), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + .INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + .waddr (waddr), + .wdata (wdata), + .raddr (raddr), + .rdata (rdata) + ); + `else + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_WREN end + assign rdata = ram[raddr]; + end else begin : g_no_wren + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL + end + assign rdata = ram[raddr]; + end + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_WREN + end + assign rdata = ram[raddr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM 
reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL + end + assign rdata = ram[raddr]; end end - assign rdata_w = ram[raddr]; - end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + `endif + end else begin : g_auto + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RW_RAM_CHECK `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_WREN end - end - assign rdata_w = ram[raddr]; - end else begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - for (integer i = 0; i < WRENW; ++i) begin - if (wren[i]) - ram[waddr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; - end + assign rdata = ram[raddr]; + end else begin : g_no_wren + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL end + assign rdata = ram[raddr]; end - assign rdata_w = ram[raddr]; - end - end - `endif - end else begin - // (WRENW == 1) - if (LUTRAM != 0) begin - `USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; - end - end - assign rdata_w = ram[raddr]; - end else begin - if (NO_RWCHECK != 0) begin - `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_WREN end - end - assign rdata_w = ram[raddr]; - end else begin - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; - `RAM_INITIALIZATION - 
always @(posedge clk) begin - if (write) begin - ram[waddr] <= wdata; + assign rdata = ram[raddr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL end + assign rdata = ram[raddr]; end - assign rdata_w = ram[raddr]; end end end `else // simulation - reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1]; + reg [DATAW-1:0] ram [0:SIZE-1]; `RAM_INITIALIZATION - wire [DATAW-1:0] ram_n; - for (genvar i = 0; i < WRENW; ++i) begin - assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW]; - end - - reg [DATAW-1:0] prev_data; - reg [ADDRW-1:0] prev_waddr; - reg prev_write; - always @(posedge clk) begin - if (RESET_RAM && reset) begin - for (integer i = 0; i < SIZE; ++i) begin - ram[i] <= DATAW'(INIT_VALUE); - end - end else begin - if (write) begin - ram[waddr] <= ram_n; - end - end - if (reset) begin - prev_write <= 0; - prev_data <= '0; - prev_waddr <= '0; - end else begin - prev_write <= write; - prev_data <= ram[waddr]; - prev_waddr <= waddr; - end + `RAM_WRITE_WREN end - if (LUTRAM || !NO_RWCHECK) begin - `UNUSED_VAR (prev_write) - `UNUSED_VAR (prev_data) - `UNUSED_VAR (prev_waddr) - assign rdata_w = ram[raddr]; - end else begin - assign rdata_w = (prev_write && (prev_waddr == raddr)) ? 
prev_data : ram[raddr]; - if (RW_ASSERT) begin - `RUNTIME_ASSERT(~read || (rdata_w == ram[raddr]), ("read after write hazard")); + if (OUT_REG) begin : g_sync + if (RDW_MODE == "W") begin : g_write_first + reg [ADDRW-1:0] raddr_r; + always @(posedge clk) begin + if (read) begin + raddr_r <= raddr; + end + end + assign rdata = ram[raddr_r]; + end else if (RDW_MODE == "R") begin : g_read_first + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read) begin + rdata_r <= ram[raddr]; + end + end + assign rdata = rdata_r; + end + end else begin : g_async + `UNUSED_VAR (read) + if (RDW_MODE == "W") begin : g_write_first + assign rdata = ram[raddr]; + end else begin : g_read_first + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + + always @(posedge clk) begin + if (reset) begin + prev_write <= 0; + prev_data <= '0; + prev_waddr <= '0; + end else begin + prev_write <= write; + prev_data <= ram[waddr]; + prev_waddr <= waddr; + end + end + + assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr]; + if (RDW_ASSERT) begin : g_rw_asert + `RUNTIME_ASSERT(~read || (rdata == ram[raddr]), ("%t: read after write hazard", $time)) + end end end `endif - if (OUT_REG != 0) begin - reg [DATAW-1:0] rdata_r; - always @(posedge clk) begin - if (READ_ENABLE && reset) begin - rdata_r <= '0; - end else if (!READ_ENABLE || read) begin - rdata_r <= rdata_w; - end - end - assign rdata = rdata_r; - end else begin - assign rdata = rdata_w; - end - endmodule `TRACING_ON diff --git a/hw/rtl/mem/VX_mem_perf_if.sv b/hw/rtl/libs/VX_edge_trigger.sv similarity index 50% rename from hw/rtl/mem/VX_mem_perf_if.sv rename to hw/rtl/libs/VX_edge_trigger.sv index 8d5ab7693..9e876985c 100644 --- a/hw/rtl/mem/VX_mem_perf_if.sv +++ b/hw/rtl/libs/VX_edge_trigger.sv @@ -1,43 +1,43 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -`include "VX_define.vh" +`include "VX_platform.vh" -interface VX_mem_perf_if import VX_gpu_pkg::*; (); +`TRACING_OFF +module VX_edge_trigger #( + parameter POS = 0, + parameter INIT = 0 +) ( + input wire clk, + input wire reset, + input wire data_in, + output wire data_out +); + reg prev; - cache_perf_t icache; - cache_perf_t dcache; - cache_perf_t l2cache; - cache_perf_t l3cache; - cache_perf_t lmem; - mem_perf_t mem; + always @(posedge clk) begin + if (reset) begin + prev <= INIT; + end else begin + prev <= data_in; + end + end - modport master ( - output icache, - output dcache, - output l2cache, - output l3cache, - output lmem, - output mem - ); + if (POS != 0) begin : g_pos + assign data_out = data_in & ~prev; + end else begin : g_neg + assign data_out = ~data_in & prev; + end - modport slave ( - input icache, - input dcache, - input l2cache, - input l3cache, - input lmem, - input mem - ); - -endinterface +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_elastic_buffer.sv b/hw/rtl/libs/VX_elastic_buffer.sv index 9213572d3..c90aa0616 100644 --- a/hw/rtl/libs/VX_elastic_buffer.sv +++ b/hw/rtl/libs/VX_elastic_buffer.sv @@ -18,8 +18,7 @@ module VX_elastic_buffer #( parameter DATAW = 1, parameter SIZE = 1, parameter OUT_REG = 0, - parameter LUTRAM = 0, - parameter MAX_FANOUT = 0 + parameter LUTRAM = 0 ) ( input wire clk, input wire reset, @@ -32,7 +31,7 @@ module VX_elastic_buffer #( input wire ready_out, output wire valid_out ); - if (SIZE == 0) begin + if (SIZE == 0) begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) @@ -41,47 +40,11 @@ module 
VX_elastic_buffer #( assign data_out = data_in; assign ready_in = ready_out; - end else if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin - - localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT); - localparam N_DATAW = DATAW / NUM_SLICES; - - for (genvar i = 0; i < NUM_SLICES; ++i) begin - - localparam S_DATAW = (i == NUM_SLICES-1) ? (DATAW - i * N_DATAW) : N_DATAW; - - wire valid_out_t, ready_in_t; - `UNUSED_VAR (valid_out_t) - `UNUSED_VAR (ready_in_t) - - `RESET_RELAY (slice_reset, reset); - - VX_elastic_buffer #( - .DATAW (S_DATAW), - .SIZE (SIZE), - .OUT_REG (OUT_REG), - .LUTRAM (LUTRAM) - ) buffer_slice ( - .clk (clk), - .reset (slice_reset), - .valid_in (valid_in), - .data_in (data_in[i * N_DATAW +: S_DATAW]), - .ready_in (ready_in_t), - .valid_out (valid_out_t), - .data_out (data_out[i * N_DATAW +: S_DATAW]), - .ready_out (ready_out) - ); - - if (i == 0) begin - assign ready_in = ready_in_t; - assign valid_out = valid_out_t; - end - end - - end else if (SIZE == 1) begin + end else if (SIZE == 1) begin : g_eb1 VX_pipe_buffer #( - .DATAW (DATAW) + .DATAW (DATAW), + .DEPTH (`MAX(OUT_REG, 1)) ) pipe_buffer ( .clk (clk), .reset (reset), @@ -93,32 +56,51 @@ module VX_elastic_buffer #( .ready_out (ready_out) ); - end else if (SIZE == 2 && LUTRAM == 0) begin + end else if (SIZE == 2 && LUTRAM == 0) begin : g_eb2 - VX_skid_buffer #( + wire valid_out_t; + wire [DATAW-1:0] data_out_t; + wire ready_out_t; + + VX_stream_buffer #( .DATAW (DATAW), - .HALF_BW (OUT_REG == 2), - .OUT_REG (OUT_REG) - ) skid_buffer ( + .OUT_REG (OUT_REG == 1) + ) stream_buffer ( .clk (clk), .reset (reset), .valid_in (valid_in), .data_in (data_in), .ready_in (ready_in), + .valid_out (valid_out_t), + .data_out (data_out_t), + .ready_out (ready_out_t) + ); + + VX_pipe_buffer #( + .DATAW (DATAW), + .DEPTH ((OUT_REG > 1) ? 
(OUT_REG-1) : 0) + ) out_buf ( + .clk (clk), + .reset (reset), + .valid_in (valid_out_t), + .data_in (data_out_t), + .ready_in (ready_out_t), .valid_out (valid_out), .data_out (data_out), .ready_out (ready_out) ); - end else begin + end else begin : g_ebN wire empty, full; wire [DATAW-1:0] data_out_t; wire ready_out_t; + wire valid_out_t = ~empty; + wire push = valid_in && ready_in; - wire pop = ~empty && ready_out_t; + wire pop = valid_out_t && ready_out_t; VX_fifo_queue #( .DATAW (DATAW), @@ -143,11 +125,11 @@ module VX_elastic_buffer #( VX_pipe_buffer #( .DATAW (DATAW), - .DEPTH ((OUT_REG > 0) ? (OUT_REG-1) : 0) + .DEPTH ((OUT_REG > 1) ? (OUT_REG-1) : 0) ) out_buf ( .clk (clk), .reset (reset), - .valid_in (~empty), + .valid_in (valid_out_t), .data_in (data_out_t), .ready_in (ready_out_t), .valid_out (valid_out), diff --git a/hw/rtl/libs/VX_fair_arbiter.sv b/hw/rtl/libs/VX_fair_arbiter.sv deleted file mode 100644 index 82bcfc5c6..000000000 --- a/hw/rtl/libs/VX_fair_arbiter.sv +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -`include "VX_platform.vh" - -`TRACING_OFF -module VX_fair_arbiter #( - parameter NUM_REQS = 1, - parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS) -) ( - input wire clk, - input wire reset, - input wire [NUM_REQS-1:0] requests, - output wire [LOG_NUM_REQS-1:0] grant_index, - output wire [NUM_REQS-1:0] grant_onehot, - output wire grant_valid, - input wire grant_ready -); - if (NUM_REQS == 1) begin - - `UNUSED_VAR (clk) - `UNUSED_VAR (reset) - `UNUSED_VAR (grant_ready) - - assign grant_index = '0; - assign grant_onehot = requests; - assign grant_valid = requests[0]; - - end else begin - - reg [NUM_REQS-1:0] requests_r; - - wire [NUM_REQS-1:0] requests_sel = requests_r & requests; - wire [NUM_REQS-1:0] requests_qual = (| requests_sel) ? requests_sel : requests; - - always @(posedge clk) begin - if (reset) begin - requests_r <= '0; - end else if (grant_ready) begin - requests_r <= requests_qual & ~grant_onehot; - end - end - - VX_priority_arbiter #( - .NUM_REQS (NUM_REQS) - ) priority_arbiter ( - .requests (requests_qual), - .grant_index (grant_index), - .grant_onehot (grant_onehot), - .grant_valid (grant_valid) - ); - - end - -endmodule -`TRACING_ON diff --git a/hw/rtl/libs/VX_fifo_queue.sv b/hw/rtl/libs/VX_fifo_queue.sv index ea00d67c7..615484abc 100644 --- a/hw/rtl/libs/VX_fifo_queue.sv +++ b/hw/rtl/libs/VX_fifo_queue.sv @@ -15,12 +15,12 @@ `TRACING_OFF module VX_fifo_queue #( - parameter DATAW = 1, - parameter DEPTH = 2, + parameter DATAW = 32, + parameter DEPTH = 32, parameter ALM_FULL = (DEPTH - 1), parameter ALM_EMPTY = 1, parameter OUT_REG = 0, - parameter LUTRAM = 1, + parameter LUTRAM = 0, parameter SIZEW = `CLOG2(DEPTH+1) ) ( input wire clk, @@ -36,225 +36,98 @@ module VX_fifo_queue #( output wire [SIZEW-1:0] size ); - localparam ADDRW = `CLOG2(DEPTH); - `STATIC_ASSERT(ALM_FULL > 0, ("alm_full must be greater than 0!")) `STATIC_ASSERT(ALM_FULL < DEPTH, ("alm_full must be smaller than size!")) `STATIC_ASSERT(ALM_EMPTY > 0, ("alm_empty must be greater than 0!")) 
`STATIC_ASSERT(ALM_EMPTY < DEPTH, ("alm_empty must be smaller than size!")) - `STATIC_ASSERT(`IS_POW2(DEPTH), ("size must be a power of 2!")) + `STATIC_ASSERT(`IS_POW2(DEPTH), ("depth must be a power of 2!")) - if (DEPTH == 1) begin + VX_pending_size #( + .SIZE (DEPTH), + .ALM_EMPTY (ALM_EMPTY), + .ALM_FULL (ALM_FULL) + ) pending_size ( + .clk (clk), + .reset (reset), + .incr (push), + .decr (pop), + .empty (empty), + .full (full), + .alm_empty(alm_empty), + .alm_full(alm_full), + .size (size) + ); + + if (DEPTH == 1) begin : g_depth_1 + `UNUSED_PARAM (OUT_REG) + `UNUSED_PARAM (LUTRAM) reg [DATAW-1:0] head_r; - reg size_r; + + always @(posedge clk) begin + if (push) begin + head_r <= data_in; + end + end + + assign data_out = head_r; + + end else begin : g_depth_n + + localparam ADDRW = `CLOG2(DEPTH); + + wire [DATAW-1:0] data_out_w; + reg [ADDRW-1:0] rd_ptr_r; + reg [ADDRW-1:0] wr_ptr_r; always @(posedge clk) begin if (reset) begin - head_r <= '0; - size_r <= '0; + wr_ptr_r <= '0; + rd_ptr_r <= (OUT_REG != 0) ? 1 : 0; end else begin - `ASSERT(~push || ~full, ("runtime error: writing to a full queue")); - `ASSERT(~pop || ~empty, ("runtime error: reading an empty queue")); - if (push) begin - if (~pop) begin - size_r <= 1; - end + wr_ptr_r <= wr_ptr_r + ADDRW'(push); + rd_ptr_r <= rd_ptr_r + ADDRW'(pop); + end + end + + VX_dp_ram #( + .DATAW (DATAW), + .SIZE (DEPTH), + .LUTRAM (LUTRAM), + .RDW_MODE ("W"), + .RADDR_REG (1), + .RADDR_RESET (1) + ) dp_ram ( + .clk (clk), + .reset (reset), + .read (1'b1), + .write (push), + .wren (1'b1), + .raddr (rd_ptr_r), + .waddr (wr_ptr_r), + .wdata (data_in), + .rdata (data_out_w) + ); + + if (OUT_REG != 0) begin : g_out_reg + reg [DATAW-1:0] data_out_r; + wire going_empty = (ALM_EMPTY == 1) ? 
alm_empty : (size[ADDRW-1:0] == ADDRW'(1)); + wire bypass = push && (empty || (going_empty && pop)); + always @(posedge clk) begin + if (bypass) begin + data_out_r <= data_in; end else if (pop) begin - size_r <= '0; - end - if (push) begin - head_r <= data_in; + data_out_r <= data_out_w; end end + assign data_out = data_out_r; + end else begin : g_no_out_reg + assign data_out = data_out_w; end - - assign data_out = head_r; - assign empty = (size_r == 0); - assign alm_empty = 1'b1; - assign full = (size_r != 0); - assign alm_full = 1'b1; - assign size = size_r; - - end else begin - - reg empty_r, alm_empty_r; - reg full_r, alm_full_r; - reg [ADDRW-1:0] used_r; - wire [ADDRW-1:0] used_n; - - always @(posedge clk) begin - if (reset) begin - empty_r <= 1; - alm_empty_r <= 1; - full_r <= 0; - alm_full_r <= 0; - used_r <= '0; - end else begin - `ASSERT(~(push && ~pop) || ~full, ("runtime error: incrementing full queue")); - `ASSERT(~(pop && ~push) || ~empty, ("runtime error: decrementing empty queue")); - if (push) begin - if (~pop) begin - empty_r <= 0; - if (used_r == ADDRW'(ALM_EMPTY)) - alm_empty_r <= 0; - if (used_r == ADDRW'(DEPTH-1)) - full_r <= 1; - if (used_r == ADDRW'(ALM_FULL-1)) - alm_full_r <= 1; - end - end else if (pop) begin - full_r <= 0; - if (used_r == ADDRW'(ALM_FULL)) - alm_full_r <= 0; - if (used_r == ADDRW'(1)) - empty_r <= 1; - if (used_r == ADDRW'(ALM_EMPTY+1)) - alm_empty_r <= 1; - end - used_r <= used_n; - end - end - - if (DEPTH == 2 && LUTRAM == 0) begin - - assign used_n = used_r ^ (push ^ pop); - - if (0 == OUT_REG) begin - - reg [1:0][DATAW-1:0] shift_reg; - - always @(posedge clk) begin - if (push) begin - shift_reg[1] <= shift_reg[0]; - shift_reg[0] <= data_in; - end - end - - assign data_out = shift_reg[!used_r[0]]; - - end else begin - - reg [DATAW-1:0] data_out_r; - reg [DATAW-1:0] buffer; - - always @(posedge clk) begin - if (push) begin - buffer <= data_in; - end - if (push && (empty_r || (used_r && pop))) begin - data_out_r <= 
data_in; - end else if (pop) begin - data_out_r <= buffer; - end - end - - assign data_out = data_out_r; - - end - - end else begin - - assign used_n = $signed(used_r) + ADDRW'($signed(2'(push) - 2'(pop))); - - if (0 == OUT_REG) begin - - reg [ADDRW-1:0] rd_ptr_r; - reg [ADDRW-1:0] wr_ptr_r; - - always @(posedge clk) begin - if (reset) begin - rd_ptr_r <= '0; - wr_ptr_r <= '0; - end else begin - wr_ptr_r <= wr_ptr_r + ADDRW'(push); - rd_ptr_r <= rd_ptr_r + ADDRW'(pop); - end - end - - VX_dp_ram #( - .DATAW (DATAW), - .SIZE (DEPTH), - .LUTRAM (LUTRAM) - ) dp_ram ( - .clk (clk), - .reset (reset), - .read (1'b1), - .write (push), - .wren (1'b1), - .waddr (wr_ptr_r), - .wdata (data_in), - .raddr (rd_ptr_r), - .rdata (data_out) - ); - - end else begin - - wire [DATAW-1:0] dout; - reg [DATAW-1:0] dout_r; - reg [ADDRW-1:0] wr_ptr_r; - reg [ADDRW-1:0] rd_ptr_r; - reg [ADDRW-1:0] rd_ptr_n_r; - - always @(posedge clk) begin - if (reset) begin - wr_ptr_r <= '0; - rd_ptr_r <= '0; - rd_ptr_n_r <= 1; - end else begin - wr_ptr_r <= wr_ptr_r + ADDRW'(push); - if (pop) begin - rd_ptr_r <= rd_ptr_n_r; - if (DEPTH > 2) begin - rd_ptr_n_r <= rd_ptr_r + ADDRW'(2); - end else begin // (DEPTH == 2); - rd_ptr_n_r <= ~rd_ptr_n_r; - end - end - end - end - - wire going_empty; - if (ALM_EMPTY == 1) begin - assign going_empty = alm_empty_r; - end else begin - assign going_empty = (used_r == ADDRW'(1)); - end - - VX_dp_ram #( - .DATAW (DATAW), - .SIZE (DEPTH), - .LUTRAM (LUTRAM) - ) dp_ram ( - .clk (clk), - .reset (reset), - .read (1'b1), - .write (push), - .wren (1'b1), - .waddr (wr_ptr_r), - .wdata (data_in), - .raddr (rd_ptr_n_r), - .rdata (dout) - ); - - always @(posedge clk) begin - if (push && (empty_r || (going_empty && pop))) begin - dout_r <= data_in; - end else if (pop) begin - dout_r <= dout; - end - end - - assign data_out = dout_r; - end - end - - assign empty = empty_r; - assign alm_empty = alm_empty_r; - assign full = full_r; - assign alm_full = alm_full_r; - assign size = 
{full_r, used_r}; end + `RUNTIME_ASSERT(~(push && ~pop) || ~full, ("%t: runtime error: incrementing full queue", $time)) + `RUNTIME_ASSERT(~(pop && ~push) || ~empty, ("%t: runtime error: decrementing empty queue", $time)) + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_find_first.sv b/hw/rtl/libs/VX_find_first.sv index 18f345855..2a1714e18 100644 --- a/hw/rtl/libs/VX_find_first.sv +++ b/hw/rtl/libs/VX_find_first.sv @@ -28,27 +28,29 @@ module VX_find_first #( localparam TL = (1 << LOGN) - 1; localparam TN = (1 << (LOGN+1)) - 1; -`IGNORE_WARNINGS_BEGIN - wire [TN-1:0] s_n; - wire [TN-1:0][DATAW-1:0] d_n; -`IGNORE_WARNINGS_END +`IGNORE_UNOPTFLAT_BEGIN + wire s_n [TN]; + wire [DATAW-1:0] d_n [TN]; +`IGNORE_UNOPTFLAT_END - for (genvar i = 0; i < N; ++i) begin + for (genvar i = 0; i < N; ++i) begin : g_reverse assign s_n[TL+i] = REVERSE ? valid_in[N-1-i] : valid_in[i]; assign d_n[TL+i] = REVERSE ? data_in[N-1-i] : data_in[i]; end - if (TL < (TN-N)) begin - for (genvar i = TL+N; i < TN; ++i) begin + if (TL < (TN-N)) begin : g_fill + for (genvar i = TL+N; i < TN; ++i) begin : g_i assign s_n[i] = 0; assign d_n[i] = '0; end end - for (genvar j = 0; j < LOGN; ++j) begin - for (genvar i = 0; i < (2**j); ++i) begin - assign s_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] | s_n[2**(j+1)-1+i*2+1]; - assign d_n[2**j-1+i] = s_n[2**(j+1)-1+i*2] ? d_n[2**(j+1)-1+i*2] : d_n[2**(j+1)-1+i*2+1]; + for (genvar j = 0; j < LOGN; ++j) begin : g_scan + localparam I = 1 << j; + for (genvar i = 0; i < I; ++i) begin : g_i + localparam K = I+i-1; + assign s_n[K] = s_n[2*K+1] | s_n[2*K+2]; + assign d_n[K] = s_n[2*K+1] ? 
d_n[2*K+1] : d_n[2*K+2]; end end diff --git a/hw/rtl/libs/VX_generic_arbiter.sv b/hw/rtl/libs/VX_generic_arbiter.sv index a1f7be4a0..2b0d086db 100644 --- a/hw/rtl/libs/VX_generic_arbiter.sv +++ b/hw/rtl/libs/VX_generic_arbiter.sv @@ -16,7 +16,7 @@ `TRACING_OFF module VX_generic_arbiter #( parameter NUM_REQS = 1, - parameter `STRING TYPE = "P", + parameter `STRING TYPE = "P", // P: priority, R: round-robin, M: matrix, C: cyclic parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS) ) ( input wire clk, @@ -27,7 +27,9 @@ module VX_generic_arbiter #( output wire grant_valid, input wire grant_ready ); - if (TYPE == "P") begin + `STATIC_ASSERT((TYPE == "P" || TYPE == "R" || TYPE == "M" || TYPE == "C"), ("invalid parameter")) + + if (TYPE == "P") begin : g_priority `UNUSED_VAR (clk) `UNUSED_VAR (reset) @@ -42,7 +44,7 @@ module VX_generic_arbiter #( .grant_onehot (grant_onehot) ); - end else if (TYPE == "R") begin + end else if (TYPE == "R") begin : g_round_robin VX_rr_arbiter #( .NUM_REQS (NUM_REQS) @@ -56,21 +58,7 @@ module VX_generic_arbiter #( .grant_ready (grant_ready) ); - end else if (TYPE == "F") begin - - VX_fair_arbiter #( - .NUM_REQS (NUM_REQS) - ) fair_arbiter ( - .clk (clk), - .reset (reset), - .requests (requests), - .grant_valid (grant_valid), - .grant_index (grant_index), - .grant_onehot (grant_onehot), - .grant_ready (grant_ready) - ); - - end else if (TYPE == "M") begin + end else if (TYPE == "M") begin : g_matrix VX_matrix_arbiter #( .NUM_REQS (NUM_REQS) @@ -84,7 +72,7 @@ module VX_generic_arbiter #( .grant_ready (grant_ready) ); - end else if (TYPE == "C") begin + end else if (TYPE == "C") begin : g_cyclic VX_cyclic_arbiter #( .NUM_REQS (NUM_REQS) @@ -98,11 +86,9 @@ module VX_generic_arbiter #( .grant_ready (grant_ready) ); - end else begin - - `ERROR(("invalid parameter")); - end + `RUNTIME_ASSERT (((~(| requests) != 1) || (grant_valid && (requests[grant_index] != 0) && (grant_onehot == (NUM_REQS'(1) << grant_index)))), ("%t: invalid arbiter grant!", $time)) + 
endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_index_buffer.sv b/hw/rtl/libs/VX_index_buffer.sv index 4e8439818..8d0320c5d 100644 --- a/hw/rtl/libs/VX_index_buffer.sv +++ b/hw/rtl/libs/VX_index_buffer.sv @@ -15,10 +15,10 @@ `TRACING_OFF module VX_index_buffer #( - parameter DATAW = 1, - parameter SIZE = 1, - parameter LUTRAM = 1, - parameter ADDRW = `LOG2UP(SIZE) + parameter DATAW = 1, + parameter SIZE = 1, + parameter LUTRAM = 0, + parameter ADDRW = `LOG2UP(SIZE) ) ( input wire clk, input wire reset, @@ -49,9 +49,10 @@ module VX_index_buffer #( ); VX_dp_ram #( - .DATAW (DATAW), - .SIZE (SIZE), - .LUTRAM (LUTRAM) + .DATAW (DATAW), + .SIZE (SIZE), + .LUTRAM (LUTRAM), + .RDW_MODE ("W") ) data_table ( .clk (clk), .reset (reset), diff --git a/hw/rtl/libs/VX_index_queue.sv b/hw/rtl/libs/VX_index_queue.sv index 23ec6ed83..e73db0ff9 100644 --- a/hw/rtl/libs/VX_index_queue.sv +++ b/hw/rtl/libs/VX_index_queue.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -20,9 +20,9 @@ module VX_index_queue #( ) ( input wire clk, input wire reset, - input wire [DATAW-1:0] write_data, + input wire [DATAW-1:0] write_data, output wire [`LOG2UP(SIZE)-1:0] write_addr, - input wire push, + input wire push, input wire pop, output wire full, output wire empty, @@ -30,33 +30,33 @@ module VX_index_queue #( output wire [DATAW-1:0] read_data ); reg [DATAW-1:0] entries [SIZE-1:0]; - reg [SIZE-1:0] valid; + reg [SIZE-1:0] valid; reg [`LOG2UP(SIZE):0] rd_ptr, wr_ptr; wire [`LOG2UP(SIZE)-1:0] rd_a, wr_a; wire enqueue, dequeue; assign rd_a = rd_ptr[`LOG2UP(SIZE)-1:0]; - assign wr_a = wr_ptr[`LOG2UP(SIZE)-1:0]; + assign wr_a = wr_ptr[`LOG2UP(SIZE)-1:0]; assign empty = (wr_ptr == rd_ptr); assign full = (wr_a == rd_a) && (wr_ptr[`LOG2UP(SIZE)] != rd_ptr[`LOG2UP(SIZE)]); - assign enqueue = push; + assign enqueue = push; assign dequeue = !empty && !valid[rd_a]; // auto-remove when head is invalid - `RUNTIME_ASSERT(!push || !full, ("%t: *** invalid inputs", $time)); - + `RUNTIME_ASSERT(!push || !full, ("%t: *** invalid inputs", $time)) + always @(posedge clk) begin if (reset) begin rd_ptr <= '0; wr_ptr <= '0; - valid <= '0; + valid <= '0; end else begin if (enqueue) begin valid[wr_a] <= 1; wr_ptr <= wr_ptr + 1; - end + end if (dequeue) begin rd_ptr <= rd_ptr + 1; end @@ -67,7 +67,7 @@ module VX_index_queue #( if (enqueue) begin entries[wr_a] <= write_data; - end + end end assign write_addr = wr_a; diff --git a/hw/rtl/libs/VX_lzc.sv b/hw/rtl/libs/VX_lzc.sv index 2589bf5a7..af2cb650d 100644 --- a/hw/rtl/libs/VX_lzc.sv +++ b/hw/rtl/libs/VX_lzc.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -23,18 +23,18 @@ module VX_lzc #( output wire [LOGN-1:0] data_out, output wire valid_out ); - if (N == 1) begin + if (N == 1) begin : g_passthru `UNUSED_PARAM (REVERSE) assign data_out = '0; assign valid_out = data_in; - end else begin + end else begin : g_lzc wire [N-1:0][LOGN-1:0] indices; - for (genvar i = 0; i < N; ++i) begin + for (genvar i = 0; i < N; ++i) begin : g_indices assign indices[i] = REVERSE ? LOGN'(i) : LOGN'(N-1-i); end @@ -42,7 +42,7 @@ module VX_lzc #( .N (N), .DATAW (LOGN), .REVERSE (!REVERSE) - ) find_first ( + ) find_first ( .data_in (indices), .valid_in (data_in), .data_out (data_out), @@ -50,6 +50,6 @@ module VX_lzc #( ); end - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_matrix_arbiter.sv b/hw/rtl/libs/VX_matrix_arbiter.sv index 23f9ea2a0..b6b88e47a 100644 --- a/hw/rtl/libs/VX_matrix_arbiter.sv +++ b/hw/rtl/libs/VX_matrix_arbiter.sv @@ -26,7 +26,7 @@ module VX_matrix_arbiter #( output wire grant_valid, input wire grant_ready ); - if (NUM_REQS == 1) begin + if (NUM_REQS == 1) begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) @@ -36,59 +36,49 @@ module VX_matrix_arbiter #( assign grant_onehot = requests; assign grant_valid = requests[0]; - end else begin + end else begin : g_arbiter - reg [NUM_REQS-1:1] state [NUM_REQS-1:0]; + reg [NUM_REQS-1:1] state [NUM_REQS-1:0]; wire [NUM_REQS-1:0] pri [NUM_REQS-1:0]; - wire [NUM_REQS-1:0] grant_unqual; + wire [NUM_REQS-1:0] grant; - for (genvar i = 0; i < NUM_REQS; ++i) begin - for (genvar j = 0; j < NUM_REQS; ++j) begin - if (j > i) begin - assign pri[j][i] = requests[i] && state[i][j]; - end - else if (j < i) begin - assign pri[j][i] = requests[i] && !state[j][i]; - end - else 
begin - assign pri[j][i] = 0; + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_pri_r + for (genvar c = 0; c < NUM_REQS; ++c) begin : g_pri_c + if (r > c) begin : g_row + assign pri[r][c] = requests[c] && state[c][r]; + end else if (r < c) begin : g_col + assign pri[r][c] = requests[c] && !state[r][c]; + end else begin : g_equal + assign pri[r][c] = 0; end end - assign grant_unqual[i] = requests[i] && !(| pri[i]); end - for (genvar i = 0; i < NUM_REQS; ++i) begin - for (genvar j = i + 1; j < NUM_REQS; ++j) begin + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_grant + assign grant[r] = requests[r] && ~(| pri[r]); + end + + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_state_r + for (genvar c = r + 1; c < NUM_REQS; ++c) begin : g_state_c always @(posedge clk) begin if (reset) begin - state[i][j] <= '0; - end else begin - state[i][j] <= (state[i][j] || grant_unqual[j]) && !grant_unqual[i]; + state[r][c] <= '0; + end else if (grant_ready) begin + state[r][c] <= (state[r][c] || grant[c]) && ~grant[r]; end end end end - reg [NUM_REQS-1:0] grant_unqual_prev; - always @(posedge clk) begin - if (reset) begin - grant_unqual_prev <= '0; - end else if (grant_ready) begin - grant_unqual_prev <= grant_unqual; - end - end - assign grant_onehot = grant_ready ? grant_unqual : grant_unqual_prev; + assign grant_onehot = grant; VX_onehot_encoder #( .N (NUM_REQS) ) encoder ( - .data_in (grant_unqual), - .data_out (grant_index), - `UNUSED_PIN (valid_out) + .data_in (grant_onehot), + .data_out (grant_index), + .valid_out (grant_valid) ); - - assign grant_valid = (| requests); - end endmodule diff --git a/hw/rtl/libs/VX_mem_bank_adapter.sv b/hw/rtl/libs/VX_mem_bank_adapter.sv new file mode 100644 index 000000000..252a37ab0 --- /dev/null +++ b/hw/rtl/libs/VX_mem_bank_adapter.sv @@ -0,0 +1,261 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_platform.vh" + +`TRACING_OFF +module VX_mem_bank_adapter #( + parameter DATA_WIDTH = 512, + parameter ADDR_WIDTH_IN = 26, // word-addressable + parameter ADDR_WIDTH_OUT = 32, // byte-addressable + parameter TAG_WIDTH_IN = 8, + parameter TAG_WIDTH_OUT = 8, + parameter NUM_PORTS_IN = 1, + parameter NUM_BANKS_OUT = 1, + parameter INTERLEAVE = 0, + parameter TAG_BUFFER_SIZE= 32, + parameter ARBITER = "R", + parameter REQ_OUT_BUF = 1, + parameter RSP_OUT_BUF = 1, + parameter DATA_SIZE = DATA_WIDTH/8 + ) ( + input wire clk, + input wire reset, + + // Input request + input wire mem_req_valid_in [NUM_PORTS_IN], + input wire mem_req_rw_in [NUM_PORTS_IN], + input wire [DATA_SIZE-1:0] mem_req_byteen_in [NUM_PORTS_IN], + input wire [ADDR_WIDTH_IN-1:0] mem_req_addr_in [NUM_PORTS_IN], + input wire [DATA_WIDTH-1:0] mem_req_data_in [NUM_PORTS_IN], + input wire [TAG_WIDTH_IN-1:0] mem_req_tag_in [NUM_PORTS_IN], + output wire mem_req_ready_in [NUM_PORTS_IN], + + // Input response + output wire mem_rsp_valid_in [NUM_PORTS_IN], + output wire [DATA_WIDTH-1:0] mem_rsp_data_in [NUM_PORTS_IN], + output wire [TAG_WIDTH_IN-1:0] mem_rsp_tag_in [NUM_PORTS_IN], + input wire mem_rsp_ready_in [NUM_PORTS_IN], + + // Output request + output wire mem_req_valid_out [NUM_BANKS_OUT], + output wire mem_req_rw_out [NUM_BANKS_OUT], + output wire [DATA_SIZE-1:0] mem_req_byteen_out [NUM_BANKS_OUT], + output wire [ADDR_WIDTH_OUT-1:0] mem_req_addr_out [NUM_BANKS_OUT], + output wire [DATA_WIDTH-1:0] mem_req_data_out [NUM_BANKS_OUT], + output wire [TAG_WIDTH_OUT-1:0] mem_req_tag_out 
[NUM_BANKS_OUT], + input wire mem_req_ready_out [NUM_BANKS_OUT], + + // Output response + input wire mem_rsp_valid_out [NUM_BANKS_OUT], + input wire [DATA_WIDTH-1:0] mem_rsp_data_out [NUM_BANKS_OUT], + input wire [TAG_WIDTH_OUT-1:0] mem_rsp_tag_out [NUM_BANKS_OUT], + output wire mem_rsp_ready_out [NUM_BANKS_OUT] +); + localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS_OUT); + localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); + localparam DST_ADDR_WDITH = ADDR_WIDTH_OUT + BANK_SEL_BITS; // convert output address to input space + localparam BANK_ADDR_WIDTH = DST_ADDR_WDITH - BANK_SEL_BITS; + localparam NUM_PORTS_IN_BITS = `CLOG2(NUM_PORTS_IN); + localparam NUM_PORTS_IN_WIDTH = `UP(NUM_PORTS_IN_BITS); + localparam TAG_BUFFER_ADDRW = `CLOG2(TAG_BUFFER_SIZE); + localparam NEEDED_TAG_WIDTH = TAG_WIDTH_IN + NUM_PORTS_IN_BITS; + localparam READ_TAG_WIDTH = (NEEDED_TAG_WIDTH > TAG_WIDTH_OUT) ? TAG_BUFFER_ADDRW : TAG_WIDTH_IN; + localparam WRITE_TAG_WIDTH = TAG_WIDTH_IN; + localparam XBAR_TAG_WIDTH = `MAX(READ_TAG_WIDTH, WRITE_TAG_WIDTH); + localparam DST_TAG_WIDTH = XBAR_TAG_WIDTH + NUM_PORTS_IN_BITS; + localparam REQ_XBAR_DATAW = 1 + BANK_ADDR_WIDTH + DATA_SIZE + DATA_WIDTH + XBAR_TAG_WIDTH; + localparam RSP_XBAR_DATAW = DATA_WIDTH + READ_TAG_WIDTH; + + `STATIC_ASSERT ((DST_ADDR_WDITH >= ADDR_WIDTH_IN), ("invalid address width: current=%0d, expected=%0d", DST_ADDR_WDITH, ADDR_WIDTH_IN)) + `STATIC_ASSERT ((TAG_WIDTH_OUT >= DST_TAG_WIDTH), ("invalid output tag width: current=%0d, expected=%0d", TAG_WIDTH_OUT, DST_TAG_WIDTH)) + + // Bank selection + + wire [NUM_PORTS_IN-1:0][BANK_SEL_WIDTH-1:0] req_bank_sel; + wire [NUM_PORTS_IN-1:0][BANK_ADDR_WIDTH-1:0] req_bank_addr; + + if (NUM_BANKS_OUT > 1) begin : g_bank_sel + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_i + wire [DST_ADDR_WDITH-1:0] mem_req_addr_dst = DST_ADDR_WDITH'(mem_req_addr_in[i]); + if (INTERLEAVE) begin : g_interleave + assign req_bank_sel[i] = mem_req_addr_dst[BANK_SEL_BITS-1:0]; + assign req_bank_addr[i] =
mem_req_addr_dst[BANK_SEL_BITS +: BANK_ADDR_WIDTH]; + end else begin : g_no_interleave + assign req_bank_sel[i] = mem_req_addr_dst[BANK_ADDR_WIDTH +: BANK_SEL_BITS]; + assign req_bank_addr[i] = mem_req_addr_dst[BANK_ADDR_WIDTH-1:0]; + end + end + end else begin : g_no_bank_sel + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_i + assign req_bank_sel[i] = '0; + assign req_bank_addr[i] = DST_ADDR_WDITH'(mem_req_addr_in[i]); + end + end + + // Tag handling logic + + wire [NUM_PORTS_IN-1:0] mem_rd_req_tag_in_ready; + wire [NUM_PORTS_IN-1:0][READ_TAG_WIDTH-1:0] mem_rd_req_tag_in; + wire [NUM_PORTS_IN-1:0][READ_TAG_WIDTH-1:0] mem_rd_rsp_tag_in; + + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_tag_buf + if (NEEDED_TAG_WIDTH > TAG_WIDTH_OUT) begin : g_enabled + wire [TAG_BUFFER_ADDRW-1:0] tbuf_waddr, tbuf_raddr; + wire tbuf_full; + VX_index_buffer #( + .DATAW (TAG_WIDTH_IN), + .SIZE (TAG_BUFFER_SIZE) + ) tag_buf ( + .clk (clk), + .reset (reset), + .acquire_en (mem_req_valid_in[i] && ~mem_req_rw_in[i] && mem_req_ready_in[i]), + .write_addr (tbuf_waddr), + .write_data (mem_req_tag_in[i]), + .read_data (mem_rsp_tag_in[i]), + .read_addr (tbuf_raddr), + .release_en (mem_rsp_valid_in[i] && mem_rsp_ready_in[i]), + .full (tbuf_full), + `UNUSED_PIN (empty) + ); + assign mem_rd_req_tag_in_ready[i] = ~tbuf_full; + assign mem_rd_req_tag_in[i] = tbuf_waddr; + assign tbuf_raddr = mem_rd_rsp_tag_in[i]; + end else begin : g_none + assign mem_rd_req_tag_in_ready[i] = 1; + assign mem_rd_req_tag_in[i] = mem_req_tag_in[i]; + assign mem_rsp_tag_in[i] = mem_rd_rsp_tag_in[i]; + end + end + + // Requests handling + + wire [NUM_PORTS_IN-1:0] req_xbar_valid_in; + wire [NUM_PORTS_IN-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_in; + wire [NUM_PORTS_IN-1:0] req_xbar_ready_in; + + wire [NUM_BANKS_OUT-1:0] req_xbar_valid_out; + wire [NUM_BANKS_OUT-1:0][REQ_XBAR_DATAW-1:0] req_xbar_data_out; + wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] req_xbar_sel_out; + wire [NUM_BANKS_OUT-1:0] 
req_xbar_ready_out; + + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_req_xbar_data_in + wire tag_ready = mem_req_rw_in[i] || mem_rd_req_tag_in_ready[i]; + wire [XBAR_TAG_WIDTH-1:0] tag_value = mem_req_rw_in[i] ? XBAR_TAG_WIDTH'(mem_req_tag_in[i]) : XBAR_TAG_WIDTH'(mem_rd_req_tag_in[i]); + assign req_xbar_valid_in[i] = mem_req_valid_in[i] && tag_ready; + assign req_xbar_data_in[i] = {mem_req_rw_in[i], req_bank_addr[i], mem_req_byteen_in[i], mem_req_data_in[i], tag_value}; + assign mem_req_ready_in[i] = req_xbar_ready_in[i] && tag_ready; + end + + VX_stream_xbar #( + .NUM_INPUTS (NUM_PORTS_IN), + .NUM_OUTPUTS(NUM_BANKS_OUT), + .DATAW (REQ_XBAR_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (REQ_OUT_BUF) + ) req_xbar ( + .clk (clk), + .reset (reset), + .sel_in (req_bank_sel), + .valid_in (req_xbar_valid_in), + .data_in (req_xbar_data_in), + .ready_in (req_xbar_ready_in), + .valid_out (req_xbar_valid_out), + .data_out (req_xbar_data_out), + .ready_out (req_xbar_ready_out), + .sel_out (req_xbar_sel_out), + `UNUSED_PIN (collisions) + ); + + for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_req_xbar_data_out + + wire xbar_rw_out; + wire [BANK_ADDR_WIDTH-1:0] xbar_addr_out; + wire [XBAR_TAG_WIDTH-1:0] xbar_tag_out; + wire [DATA_WIDTH-1:0] xbar_data_out; + wire [DATA_SIZE-1:0] xbar_byteen_out; + + assign { + xbar_rw_out, + xbar_addr_out, + xbar_byteen_out, + xbar_data_out, + xbar_tag_out + } = req_xbar_data_out[i]; + + assign mem_req_valid_out[i] = req_xbar_valid_out[i]; + assign mem_req_rw_out[i] = xbar_rw_out; + assign mem_req_addr_out[i] = ADDR_WIDTH_OUT'(xbar_addr_out); + assign mem_req_byteen_out[i] = xbar_byteen_out; + assign mem_req_data_out[i] = xbar_data_out; + + if (NUM_PORTS_IN > 1) begin : g_input_sel + assign mem_req_tag_out[i] = TAG_WIDTH_OUT'({xbar_tag_out, req_xbar_sel_out[i]}); + end else begin : g_no_input_sel + `UNUSED_VAR (req_xbar_sel_out[i]) + assign mem_req_tag_out[i] = TAG_WIDTH_OUT'(xbar_tag_out); + end + + assign req_xbar_ready_out[i] = 
mem_req_ready_out[i]; + end + + // Responses handling + + wire [NUM_BANKS_OUT-1:0] rsp_xbar_valid_in; + wire [NUM_BANKS_OUT-1:0][RSP_XBAR_DATAW-1:0] rsp_xbar_data_in; + wire [NUM_BANKS_OUT-1:0][NUM_PORTS_IN_WIDTH-1:0] rsp_xbar_sel_in; + wire [NUM_BANKS_OUT-1:0] rsp_xbar_ready_in; + + for (genvar i = 0; i < NUM_BANKS_OUT; ++i) begin : g_rsp_xbar_data_in + assign rsp_xbar_valid_in[i] = mem_rsp_valid_out[i]; + assign rsp_xbar_data_in[i] = {mem_rsp_data_out[i], mem_rsp_tag_out[i][NUM_PORTS_IN_BITS +: READ_TAG_WIDTH]}; + if (NUM_PORTS_IN > 1) begin : g_input_sel + assign rsp_xbar_sel_in[i] = mem_rsp_tag_out[i][0 +: NUM_PORTS_IN_BITS]; + end else begin : g_no_input_sel + assign rsp_xbar_sel_in[i] = 0; + end + assign mem_rsp_ready_out[i] = rsp_xbar_ready_in[i]; + end + + wire [NUM_PORTS_IN-1:0] rsp_xbar_valid_out; + wire [NUM_PORTS_IN-1:0][DATA_WIDTH+READ_TAG_WIDTH-1:0] rsp_xbar_data_out; + wire [NUM_PORTS_IN-1:0] rsp_xbar_ready_out; + + VX_stream_xbar #( + .NUM_INPUTS (NUM_BANKS_OUT), + .NUM_OUTPUTS(NUM_PORTS_IN), + .DATAW (RSP_XBAR_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (RSP_OUT_BUF) + ) rsp_xbar ( + .clk (clk), + .reset (reset), + .valid_in (rsp_xbar_valid_in), + .data_in (rsp_xbar_data_in), + .ready_in (rsp_xbar_ready_in), + .sel_in (rsp_xbar_sel_in), + .data_out (rsp_xbar_data_out), + .valid_out (rsp_xbar_valid_out), + .ready_out (rsp_xbar_ready_out), + `UNUSED_PIN (collisions), + `UNUSED_PIN (sel_out) + ); + + for (genvar i = 0; i < NUM_PORTS_IN; ++i) begin : g_rsp_xbar_data_out + assign mem_rsp_valid_in[i] = rsp_xbar_valid_out[i]; + assign {mem_rsp_data_in[i], mem_rd_rsp_tag_in[i]} = rsp_xbar_data_out[i]; + assign rsp_xbar_ready_out[i] = mem_rsp_ready_in[i]; + end + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_mem_coalescer.sv b/hw/rtl/libs/VX_mem_coalescer.sv index d1ffde09a..956b3cafe 100644 --- a/hw/rtl/libs/VX_mem_coalescer.sv +++ b/hw/rtl/libs/VX_mem_coalescer.sv @@ -18,12 +18,13 @@ module VX_mem_coalescer #( parameter `STRING INSTANCE_ID = "", 
parameter NUM_REQS = 1, parameter ADDR_WIDTH = 32, - parameter ATYPE_WIDTH = 1, + parameter FLAGS_WIDTH = 0, parameter DATA_IN_SIZE = 4, parameter DATA_OUT_SIZE = 64, parameter TAG_WIDTH = 8, parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID parameter QUEUE_SIZE = 8, + parameter PERF_CTR_BITS = `CLOG2(NUM_REQS+1), parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8, parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8, @@ -37,13 +38,15 @@ module VX_mem_coalescer #( input wire clk, input wire reset, + output wire [PERF_CTR_BITS-1:0] misses, + // Input request input wire in_req_valid, input wire in_req_rw, input wire [NUM_REQS-1:0] in_req_mask, input wire [NUM_REQS-1:0][DATA_IN_SIZE-1:0] in_req_byteen, input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] in_req_addr, - input wire [NUM_REQS-1:0][ATYPE_WIDTH-1:0] in_req_atype, + input wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] in_req_flags, input wire [NUM_REQS-1:0][DATA_IN_WIDTH-1:0] in_req_data, input wire [TAG_WIDTH-1:0] in_req_tag, output wire in_req_ready, @@ -61,7 +64,7 @@ module VX_mem_coalescer #( output wire [OUT_REQS-1:0] out_req_mask, output wire [OUT_REQS-1:0][DATA_OUT_SIZE-1:0] out_req_byteen, output wire [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr, - output wire [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype, + output wire [OUT_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] out_req_flags, output wire [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0] out_req_data, output wire [OUT_TAG_WIDTH-1:0] out_req_tag, input wire out_req_ready, @@ -74,18 +77,18 @@ module VX_mem_coalescer #( output wire out_rsp_ready ); `UNUSED_SPARAM (INSTANCE_ID) + `STATIC_ASSERT ((NUM_REQS > 1), ("invalid parameter")) `STATIC_ASSERT (`IS_DIVISBLE(NUM_REQS * DATA_IN_WIDTH, DATA_OUT_WIDTH), ("invalid parameter")) `STATIC_ASSERT ((NUM_REQS * DATA_IN_WIDTH >= DATA_OUT_WIDTH), ("invalid parameter")) - `RUNTIME_ASSERT ((~in_req_valid || in_req_mask != 0), ("invalid request mask")); - `RUNTIME_ASSERT ((~out_rsp_valid || out_rsp_mask != 0), ("invalid request 
mask")); + `RUNTIME_ASSERT ((~in_req_valid || in_req_mask != 0), ("%t: invalid request mask", $time)) + `RUNTIME_ASSERT ((~out_rsp_valid || out_rsp_mask != 0), ("%t: invalid request mask", $time)) localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH; - localparam NUM_REQS_W = `LOG2UP(NUM_REQS); // tag + mask + offest localparam IBUF_DATA_WIDTH = TAG_ID_WIDTH + NUM_REQS + (NUM_REQS * DATA_RATIO_W); - localparam STATE_SETUP = 0; - localparam STATE_SEND = 1; + localparam STATE_WAIT = 0; + localparam STATE_SEND = 1; logic state_r, state_n; @@ -93,7 +96,7 @@ module VX_mem_coalescer #( logic out_req_rw_r, out_req_rw_n; logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n; logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n; - logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n; + logic [OUT_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] out_req_flags_r, out_req_flags_n; logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n; logic [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n; logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n; @@ -111,95 +114,94 @@ module VX_mem_coalescer #( logic [OUT_REQS-1:0] batch_valid_r, batch_valid_n; logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] seed_addr_r, seed_addr_n; - logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] seed_atype_r, seed_atype_n; + logic [OUT_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] seed_flags_r, seed_flags_n; logic [NUM_REQS-1:0] addr_matches_r, addr_matches_n; - logic [NUM_REQS-1:0] processed_mask_r, processed_mask_n; + logic [NUM_REQS-1:0] req_rem_mask_r, req_rem_mask_n; - wire [OUT_REQS-1:0][NUM_REQS_W-1:0] seed_idx; - - wire [NUM_REQS-1:0][OUT_ADDR_WIDTH-1:0] in_addr_base; wire [NUM_REQS-1:0][DATA_RATIO_W-1:0] in_addr_offset; - for (genvar i = 0; i < NUM_REQS; i++) begin - assign in_addr_base[i] = in_req_addr[i][ADDR_WIDTH-1:DATA_RATIO_W]; + for (genvar i = 0; i < NUM_REQS; i++) begin : g_in_addr_offset assign in_addr_offset[i] = 
in_req_addr[i][DATA_RATIO_W-1:0]; end - for (genvar i = 0; i < OUT_REQS; ++i) begin - wire [DATA_RATIO-1:0] batch_mask = in_req_mask[i * DATA_RATIO +: DATA_RATIO] & ~processed_mask_r[i * DATA_RATIO +: DATA_RATIO]; + for (genvar i = 0; i < OUT_REQS; ++i) begin : g_seed_gen + wire [DATA_RATIO-1:0] batch_mask; wire [DATA_RATIO_W-1:0] batch_idx; + + assign batch_mask = in_req_mask[i * DATA_RATIO +: DATA_RATIO] & req_rem_mask_r[i * DATA_RATIO +: DATA_RATIO]; + VX_priority_encoder #( .N (DATA_RATIO) ) priority_encoder ( - .data_in (batch_mask), - .index (batch_idx), - `UNUSED_PIN (onehot), - .valid_out (batch_valid_n[i]) + .data_in (batch_mask), + .index_out (batch_idx), + `UNUSED_PIN (onehot_out), + .valid_out (batch_valid_n[i]) ); - if (OUT_REQS > 1) begin - assign seed_idx[i] = {(NUM_REQS_W-DATA_RATIO_W)'(i), batch_idx}; - end else begin - assign seed_idx[i] = batch_idx; + + wire [DATA_RATIO-1:0][OUT_ADDR_WIDTH-1:0] addr_base; + for (genvar j = 0; j < DATA_RATIO; ++j) begin : g_addr_base + assign addr_base[j] = in_req_addr[DATA_RATIO * i + j][ADDR_WIDTH-1:DATA_RATIO_W]; end - end - for (genvar i = 0; i < OUT_REQS; ++i) begin - assign seed_addr_n[i] = in_addr_base[seed_idx[i]]; - assign seed_atype_n[i] = in_req_atype[seed_idx[i]]; - end + wire [DATA_RATIO-1:0][`UP(FLAGS_WIDTH)-1:0] req_flags; + for (genvar j = 0; j < DATA_RATIO; ++j) begin : g_req_flags + assign req_flags[j] = in_req_flags[DATA_RATIO * i + j]; + end - for (genvar i = 0; i < OUT_REQS; ++i) begin - for (genvar j = 0; j < DATA_RATIO; ++j) begin - assign addr_matches_n[i * DATA_RATIO + j] = (in_addr_base[i * DATA_RATIO + j] == seed_addr_n[i]); + assign seed_addr_n[i] = addr_base[batch_idx]; + assign seed_flags_n[i] = req_flags[batch_idx]; + + for (genvar j = 0; j < DATA_RATIO; ++j) begin : g_addr_matches_n + assign addr_matches_n[i * DATA_RATIO + j] = (addr_base[j] == seed_addr_n[i]); end end wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches_r; - reg 
[OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] req_byteen_merged; - reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] req_data_merged; + wire [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] req_byteen_merged; + wire [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] req_data_merged; - always @(*) begin - req_byteen_merged = '0; - req_data_merged = 'x; - for (integer i = 0; i < OUT_REQS; ++i) begin + for (genvar i = 0; i < OUT_REQS; ++i) begin : g_data_merged + reg [DATA_RATIO-1:0][DATA_IN_SIZE-1:0] byteen_merged; + reg [DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] data_merged; + always @(*) begin + byteen_merged = '0; + data_merged = 'x; for (integer j = 0; j < DATA_RATIO; ++j) begin - if (current_pmask[i * DATA_RATIO + j]) begin - for (integer k = 0; k < DATA_IN_SIZE; ++k) begin - if (in_req_byteen[DATA_RATIO * i + j][k]) begin - req_byteen_merged[i][in_addr_offset[DATA_RATIO * i + j]][k] = 1'b1; - req_data_merged[i][in_addr_offset[DATA_RATIO * i + j]][k * 8 +: 8] = in_req_data[DATA_RATIO * i + j][k * 8 +: 8]; - end + for (integer k = 0; k < DATA_IN_SIZE; ++k) begin + // perform byte-level merge since each thread may have different bytes enabled + if (current_pmask[i * DATA_RATIO + j] && in_req_byteen[DATA_RATIO * i + j][k]) begin + byteen_merged[in_addr_offset[DATA_RATIO * i + j]][k] = 1'b1; + data_merged[in_addr_offset[DATA_RATIO * i + j]][k * 8 +: 8] = in_req_data[DATA_RATIO * i + j][k * 8 +: 8]; end end end end + assign req_byteen_merged[i] = byteen_merged; + assign req_data_merged[i] = data_merged; end - wire [OUT_REQS * DATA_RATIO - 1:0] pending_mask; - for (genvar i = 0; i < OUT_REQS * DATA_RATIO; ++i) begin - assign pending_mask[i] = in_req_mask[i] && ~addr_matches_r[i] && ~processed_mask_r[i]; - end - wire batch_completed = ~(| pending_mask); + wire is_last_batch = ~(| (in_req_mask & ~addr_matches_r & req_rem_mask_r)); + + wire out_req_fire = out_req_valid && out_req_ready; always @(*) begin state_n = state_r; - out_req_valid_n = out_req_valid_r; 
out_req_mask_n = out_req_mask_r; out_req_rw_n = out_req_rw_r; out_req_addr_n = out_req_addr_r; - out_req_atype_n = out_req_atype_r; + out_req_flags_n = out_req_flags_r; out_req_byteen_n = out_req_byteen_r; out_req_data_n = out_req_data_r; out_req_tag_n = out_req_tag_r; - - processed_mask_n = processed_mask_r; + req_rem_mask_n = req_rem_mask_r; in_req_ready_n = 0; case (state_r) - STATE_SETUP: begin + STATE_WAIT: begin // wait for pending outgoing request to submit - if (out_req_valid && out_req_ready) begin + if (out_req_fire) begin out_req_valid_n = 0; end if (in_req_valid && ~out_req_valid_n && ~ibuf_full) begin @@ -207,37 +209,31 @@ module VX_mem_coalescer #( end end default/*STATE_SEND*/: begin + state_n = STATE_WAIT; out_req_valid_n = 1; out_req_mask_n = batch_valid_r; out_req_rw_n = in_req_rw; out_req_addr_n = seed_addr_r; - out_req_atype_n = seed_atype_r; + out_req_flags_n = seed_flags_r; out_req_byteen_n= req_byteen_merged; out_req_data_n = req_data_merged; out_req_tag_n = {in_req_tag[TAG_WIDTH-1 -: UUID_WIDTH], ibuf_waddr}; - - in_req_ready_n = batch_completed; - - if (batch_completed) begin - processed_mask_n = '0; - end else begin - processed_mask_n = processed_mask_r | current_pmask; - end - - state_n = STATE_SETUP; + req_rem_mask_n = is_last_batch ? 
'1 : (req_rem_mask_r & ~current_pmask); + in_req_ready_n = is_last_batch; end endcase end VX_pipe_register #( - .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + ATYPE_WIDTH + OUT_ADDR_WIDTH + ATYPE_WIDTH + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH), - .RESETW (1 + NUM_REQS + 1) + .DATAW (1 + NUM_REQS + 1 + 1 + NUM_REQS + OUT_REQS * (1 + 1 + OUT_ADDR_WIDTH + `UP(FLAGS_WIDTH) + OUT_ADDR_WIDTH + `UP(FLAGS_WIDTH) + DATA_OUT_SIZE + DATA_OUT_WIDTH) + OUT_TAG_WIDTH), + .RESETW (1 + NUM_REQS + 1), + .INIT_VALUE ({1'b0, {NUM_REQS{1'b1}}, 1'b0}) ) pipe_reg ( .clk (clk), .reset (reset), .enable (1'b1), - .data_in ({state_n, processed_mask_n, out_req_valid_n, out_req_rw_n, addr_matches_n, batch_valid_n, out_req_mask_n, seed_addr_n, seed_atype_n, out_req_addr_n, out_req_atype_n, out_req_byteen_n, out_req_data_n, out_req_tag_n}), - .data_out ({state_r, processed_mask_r, out_req_valid_r, out_req_rw_r, addr_matches_r, batch_valid_r, out_req_mask_r, seed_addr_r, seed_atype_r, out_req_addr_r, out_req_atype_r, out_req_byteen_r, out_req_data_r, out_req_tag_r}) + .data_in ({state_n, req_rem_mask_n, out_req_valid_n, out_req_rw_n, addr_matches_n, batch_valid_n, out_req_mask_n, seed_addr_n, seed_flags_n, out_req_addr_n, out_req_flags_n, out_req_byteen_n, out_req_data_n, out_req_tag_n}), + .data_out ({state_r, req_rem_mask_r, out_req_valid_r, out_req_rw_r, addr_matches_r, batch_valid_r, out_req_mask_r, seed_addr_r, seed_flags_r, out_req_addr_r, out_req_flags_r, out_req_byteen_r, out_req_data_r, out_req_tag_r}) ); wire out_rsp_fire = out_rsp_valid && out_rsp_ready; @@ -278,7 +274,12 @@ module VX_mem_coalescer #( assign out_req_mask = out_req_mask_r; assign out_req_byteen = out_req_byteen_r; assign out_req_addr = out_req_addr_r; - assign out_req_atype = out_req_atype_r; + if (FLAGS_WIDTH != 0) begin : g_out_req_flags + assign out_req_flags = out_req_flags_r; + end else begin : g_out_req_flags_0 + `UNUSED_VAR (out_req_flags_r) + assign out_req_flags = '0; + 
end assign out_req_data = out_req_data_r; assign out_req_tag = out_req_tag_r; @@ -306,30 +307,55 @@ module VX_mem_coalescer #( assign {ibuf_dout_tag, ibuf_dout_pmask, ibuf_dout_offset} = ibuf_dout; wire [NUM_REQS-1:0][DATA_IN_WIDTH-1:0] in_rsp_data_n; - wire [NUM_REQS-1:0] in_rsp_mask_n; - - for (genvar i = 0; i < OUT_REQS; ++i) begin - for (genvar j = 0; j < DATA_RATIO; ++j) begin - assign in_rsp_mask_n[i * DATA_RATIO + j] = out_rsp_mask[i] && ibuf_dout_pmask[i * DATA_RATIO + j]; + for (genvar i = 0; i < OUT_REQS; ++i) begin : g_in_rsp_data_n + for (genvar j = 0; j < DATA_RATIO; ++j) begin : g_j assign in_rsp_data_n[i * DATA_RATIO + j] = out_rsp_data[i][ibuf_dout_offset[i * DATA_RATIO + j] * DATA_IN_WIDTH +: DATA_IN_WIDTH]; end end + wire [NUM_REQS-1:0] in_rsp_mask_n; + for (genvar i = 0; i < OUT_REQS; ++i) begin : g_in_rsp_mask_n + for (genvar j = 0; j < DATA_RATIO; ++j) begin : g_j + assign in_rsp_mask_n[i * DATA_RATIO + j] = out_rsp_mask[i] && ibuf_dout_pmask[i * DATA_RATIO + j]; + end + end + assign in_rsp_valid = out_rsp_valid; assign in_rsp_mask = in_rsp_mask_n; assign in_rsp_data = in_rsp_data_n; assign in_rsp_tag = {out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout_tag}; assign out_rsp_ready = in_rsp_ready; + // compute coalescing misses + // misses are partial transfers (not fully coalesced) + + reg [PERF_CTR_BITS-1:0] misses_r; + + wire partial_transfer = (out_req_fire && req_rem_mask_r != '1); + + always @(posedge clk) begin + if (reset) begin + misses_r <= '0; + end else begin + misses_r <= misses_r + PERF_CTR_BITS'(partial_transfer); + end + end + + assign misses = misses_r; + `ifdef DBG_TRACE_MEM wire [`UP(UUID_WIDTH)-1:0] out_req_uuid; wire [`UP(UUID_WIDTH)-1:0] out_rsp_uuid; - if (UUID_WIDTH != 0) begin + if (UUID_WIDTH != 0) begin : g_out_req_uuid assign out_req_uuid = out_req_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH]; - assign out_rsp_uuid = out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin + end else begin : g_out_req_uuid_0 assign
out_req_uuid = '0; + end + + if (UUID_WIDTH != 0) begin : g_out_rsp_uuid + assign out_rsp_uuid = out_rsp_tag[OUT_TAG_WIDTH-1 -: UUID_WIDTH]; + end else begin : g_out_rsp_uuid_0 assign out_rsp_uuid = '0; end @@ -343,38 +369,33 @@ module VX_mem_coalescer #( end end - wire out_req_fire = out_req_valid && out_req_ready; - always @(posedge clk) begin if (out_req_fire) begin if (out_req_rw) begin - `TRACE(1, ("%d: %s-out-req-wr: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)); - `TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS); - `TRACE(1, (", atype=")); - `TRACE_ARRAY1D(1, "%b", out_req_atype, OUT_REQS); - `TRACE(1, (", byteen=")); - `TRACE_ARRAY1D(1, "0x%h", out_req_byteen, OUT_REQS); - `TRACE(1, (", data=")); - `TRACE_ARRAY1D(1, "0x%0h", out_req_data, OUT_REQS); + `TRACE(2, ("%t: %s out-req-wr: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", out_req_addr, OUT_REQS) + `TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", out_req_flags, OUT_REQS) + `TRACE(2, (", byteen=")) + `TRACE_ARRAY1D(2, "0x%h", out_req_byteen, OUT_REQS) + `TRACE(2, (", data=")) + `TRACE_ARRAY1D(2, "0x%0h", out_req_data, OUT_REQS) end else begin - `TRACE(1, ("%d: %s-out-req-rd: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)); - `TRACE_ARRAY1D(1, "0x%h", out_req_addr, OUT_REQS); - `TRACE(1, (", atype=")); - `TRACE_ARRAY1D(1, "%b", out_req_atype, OUT_REQS); - end - `TRACE(1, (", offset=")); - `TRACE_ARRAY1D(1, "%0d", out_req_offset, NUM_REQS); - `TRACE(1, (", pmask=%b, tag=0x%0h (#%0d)\n", out_req_pmask, out_req_tag, out_req_uuid)); - if ($countones(out_req_pmask) > 1) begin - `TRACE(1, ("%t: *** %s: coalesced=%d (#%0d)\n", $time, INSTANCE_ID, $countones(out_req_pmask), out_req_uuid)); + `TRACE(2, ("%d: %s out-req-rd: valid=%b, addr=", $time, INSTANCE_ID, out_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", out_req_addr, OUT_REQS) + `TRACE(2, (", flags=")) + `TRACE_ARRAY1D(2, "%b", out_req_flags, OUT_REQS) end + `TRACE(2, (", offset=")) + `TRACE_ARRAY1D(2, "%0d", 
out_req_offset, NUM_REQS) + `TRACE(2, (", pmask=%b, coalesced=%0d, tag=0x%0h (#%0d)\n", out_req_pmask, $countones(out_req_pmask), out_req_tag, out_req_uuid)) end if (out_rsp_fire) begin - `TRACE(1, ("%d: %s-out-rsp: valid=%b, data=", $time, INSTANCE_ID, out_rsp_mask)); - `TRACE_ARRAY1D(1, "0x%0h", out_rsp_data, OUT_REQS); - `TRACE(1, (", offset=")); - `TRACE_ARRAY1D(1, "%0d", ibuf_dout_offset, NUM_REQS); - `TRACE(1, (", eop=%b, pmask=%b, tag=0x%0h (#%0d)\n", out_rsp_eop, ibuf_dout_pmask, out_rsp_tag, out_rsp_uuid)); + `TRACE(2, ("%t: %s out-rsp: valid=%b, data=", $time, INSTANCE_ID, out_rsp_mask)) + `TRACE_ARRAY1D(2, "0x%0h", out_rsp_data, OUT_REQS) + `TRACE(2, (", offset=")) + `TRACE_ARRAY1D(2, "%0d", ibuf_dout_offset, NUM_REQS) + `TRACE(2, (", eop=%b, pmask=%b, tag=0x%0h (#%0d)\n", out_rsp_eop, ibuf_dout_pmask, out_rsp_tag, out_rsp_uuid)) end end `endif diff --git a/hw/rtl/libs/VX_mem_adapter.sv b/hw/rtl/libs/VX_mem_data_adapter.sv similarity index 84% rename from hw/rtl/libs/VX_mem_adapter.sv rename to hw/rtl/libs/VX_mem_data_adapter.sv index 263df0159..653c81e6c 100644 --- a/hw/rtl/libs/VX_mem_adapter.sv +++ b/hw/rtl/libs/VX_mem_data_adapter.sv @@ -14,7 +14,7 @@ `include "VX_platform.vh" `TRACING_OFF -module VX_mem_adapter #( +module VX_mem_data_adapter #( parameter SRC_DATA_WIDTH = 1, parameter SRC_ADDR_WIDTH = 1, parameter DST_DATA_WIDTH = 1, @@ -53,14 +53,16 @@ module VX_mem_adapter #( input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_out, output wire mem_rsp_ready_out ); - `STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!")) - localparam DST_DATA_SIZE = (DST_DATA_WIDTH / 8); localparam DST_LDATAW = `CLOG2(DST_DATA_WIDTH); localparam SRC_LDATAW = `CLOG2(SRC_DATA_WIDTH); localparam D = `ABS(DST_LDATAW - SRC_LDATAW); localparam P = 2**D; + localparam EXPECTED_TAG_WIDTH = SRC_TAG_WIDTH + ((DST_LDATAW > SRC_LDATAW) ? 
D : 0); + + `STATIC_ASSERT(DST_TAG_WIDTH >= EXPECTED_TAG_WIDTH, ("invalid DST_TAG_WIDTH parameter, current=%0d, expected=%0d", DST_TAG_WIDTH, EXPECTED_TAG_WIDTH)) + wire mem_req_valid_out_w; wire [DST_ADDR_WIDTH-1:0] mem_req_addr_out_w; wire mem_req_rw_out_w; @@ -74,9 +76,10 @@ module VX_mem_adapter #( wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_in_w; wire mem_rsp_ready_in_w; + `UNUSED_VAR (mem_req_tag_in) `UNUSED_VAR (mem_rsp_tag_out) - if (DST_LDATAW > SRC_LDATAW) begin + if (DST_LDATAW > SRC_LDATAW) begin : g_wider_dst_data `UNUSED_VAR (clk) `UNUSED_VAR (reset) @@ -88,28 +91,44 @@ module VX_mem_adapter #( wire [P-1:0][SRC_DATA_WIDTH-1:0] mem_rsp_data_out_w = mem_rsp_data_out; - if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH - D)) begin + if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH - D)) begin : g_mem_req_addr_out_w_src `UNUSED_VAR (mem_req_addr_in_qual) assign mem_req_addr_out_w = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0]; - end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH - D)) begin + end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH - D)) begin : g_mem_req_addr_out_w_dst assign mem_req_addr_out_w = DST_ADDR_WIDTH'(mem_req_addr_in_qual); - end else begin + end else begin : g_mem_req_addr_out_w assign mem_req_addr_out_w = mem_req_addr_in_qual; end + VX_demux #( + .DATAW (SRC_DATA_WIDTH/8), + .N (P) + ) req_be_demux ( + .sel_in (req_idx), + .data_in (mem_req_byteen_in), + .data_out (mem_req_byteen_out_w) + ); + + VX_demux #( + .DATAW (SRC_DATA_WIDTH), + .N (P) + ) req_data_demux ( + .sel_in (req_idx), + .data_in (mem_req_data_in), + .data_out (mem_req_data_out_w) + ); + assign mem_req_valid_out_w = mem_req_valid_in; assign mem_req_rw_out_w = mem_req_rw_in; - assign mem_req_byteen_out_w = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3)); - assign mem_req_data_out_w = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW); assign mem_req_tag_out_w = DST_TAG_WIDTH'({mem_req_tag_in, req_idx}); assign mem_req_ready_in = mem_req_ready_out_w; 
assign mem_rsp_valid_in_w = mem_rsp_valid_out; assign mem_rsp_data_in_w = mem_rsp_data_out_w[rsp_idx]; - assign mem_rsp_tag_in_w = SRC_TAG_WIDTH'(mem_rsp_tag_out[SRC_TAG_WIDTH+D-1:D]); + assign mem_rsp_tag_in_w = SRC_TAG_WIDTH'(mem_rsp_tag_out[DST_TAG_WIDTH-1:D]); assign mem_rsp_ready_out = mem_rsp_ready_in_w; - end else if (DST_LDATAW < SRC_LDATAW) begin + end else if (DST_LDATAW < SRC_LDATAW) begin : g_wider_src_data reg [D-1:0] req_ctr, rsp_ctr; @@ -153,16 +172,16 @@ module VX_mem_adapter #( end assign mem_rsp_tag_in_x = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_out; `RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_x == mem_rsp_tag_out), - ("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_x, mem_rsp_tag_out)) + ("%t: *** out-of-order memory reponse! cur=0x%0h, expected=0x%0h", $time, mem_rsp_tag_in_x, mem_rsp_tag_out)) wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr}; - if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin + if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin : g_mem_req_addr_out_w_src `UNUSED_VAR (mem_req_addr_in_qual) assign mem_req_addr_out_w = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0]; - end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH + D)) begin + end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH + D)) begin : g_mem_req_addr_out_w_dst assign mem_req_addr_out_w = DST_ADDR_WIDTH'(mem_req_addr_in_qual); - end else begin + end else begin : g_mem_req_addr_out_w assign mem_req_addr_out_w = mem_req_addr_in_qual; end @@ -178,17 +197,17 @@ module VX_mem_adapter #( assign mem_rsp_tag_in_w = SRC_TAG_WIDTH'(mem_rsp_tag_out); assign mem_rsp_ready_out = mem_rsp_ready_in_w; - end else begin + end else begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) - if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin + if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin : g_mem_req_addr_out_w_src `UNUSED_VAR (mem_req_addr_in) assign mem_req_addr_out_w = mem_req_addr_in[DST_ADDR_WIDTH-1:0]; - end else if (DST_ADDR_WIDTH > 
SRC_ADDR_WIDTH) begin + end else if (DST_ADDR_WIDTH > SRC_ADDR_WIDTH) begin : g_mem_req_addr_out_w_dst assign mem_req_addr_out_w = DST_ADDR_WIDTH'(mem_req_addr_in); - end else begin + end else begin : g_mem_req_addr_out_w assign mem_req_addr_out_w = mem_req_addr_in; end diff --git a/hw/rtl/libs/VX_mem_scheduler.sv b/hw/rtl/libs/VX_mem_scheduler.sv index aa3ef9b2f..db9aecdd6 100644 --- a/hw/rtl/libs/VX_mem_scheduler.sv +++ b/hw/rtl/libs/VX_mem_scheduler.sv @@ -21,7 +21,7 @@ module VX_mem_scheduler #( parameter WORD_SIZE = 4, parameter LINE_SIZE = WORD_SIZE, parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE), - parameter ATYPE_WIDTH = 1, + parameter FLAGS_WIDTH = 0, parameter TAG_WIDTH = 8, parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID parameter CORE_QUEUE_SIZE= 8, @@ -32,7 +32,7 @@ module VX_mem_scheduler #( parameter WORD_WIDTH = WORD_SIZE * 8, parameter LINE_WIDTH = LINE_SIZE * 8, - parameter COALESCE_ENABLE = (LINE_SIZE != WORD_SIZE), + parameter COALESCE_ENABLE = (CORE_REQS > 1) && (LINE_SIZE != WORD_SIZE), parameter PER_LINE_REQS = LINE_SIZE / WORD_SIZE, parameter MERGED_REQS = CORE_REQS / PER_LINE_REQS, parameter MEM_BATCHES = `CDIV(MERGED_REQS, MEM_CHANNELS), @@ -50,12 +50,12 @@ module VX_mem_scheduler #( input wire [CORE_REQS-1:0] core_req_mask, input wire [CORE_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, input wire [CORE_REQS-1:0][ADDR_WIDTH-1:0] core_req_addr, - input wire [CORE_REQS-1:0][ATYPE_WIDTH-1:0] core_req_atype, + input wire [CORE_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags, input wire [CORE_REQS-1:0][WORD_WIDTH-1:0] core_req_data, input wire [TAG_WIDTH-1:0] core_req_tag, output wire core_req_ready, output wire core_req_empty, - output wire core_req_sent, + output wire core_req_wr_notify, // Core response output wire core_rsp_valid, @@ -72,7 +72,7 @@ module VX_mem_scheduler #( output wire [MEM_CHANNELS-1:0] mem_req_mask, output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen, output wire 
[MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr, - output wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype, + output wire [MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] mem_req_flags, output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data, output wire [MEM_TAG_WIDTH-1:0] mem_req_tag, input wire mem_req_ready, @@ -94,10 +94,10 @@ module VX_mem_scheduler #( localparam CORE_BATCHES = COALESCE_ENABLE ? 1 : MEM_BATCHES; localparam CORE_BATCH_BITS = `CLOG2(CORE_BATCHES); + `STATIC_ASSERT ((MEM_CHANNELS <= CORE_REQS), ("invalid parameter")) `STATIC_ASSERT (`IS_DIVISBLE(CORE_REQS * WORD_SIZE, LINE_SIZE), ("invalid parameter")) `STATIC_ASSERT ((TAG_WIDTH >= UUID_WIDTH), ("invalid parameter")) - `STATIC_ASSERT ((0 == RSP_PARTIAL) || (1 == RSP_PARTIAL), ("invalid parameter")) - `RUNTIME_ASSERT((~core_req_valid || core_req_mask != 0), ("invalid request mask")); + `RUNTIME_ASSERT((~core_req_valid || core_req_mask != 0), ("%t: invalid request mask", $time)) wire ibuf_push; wire ibuf_pop; @@ -113,7 +113,7 @@ module VX_mem_scheduler #( wire reqq_rw; wire [CORE_REQS-1:0][WORD_SIZE-1:0] reqq_byteen; wire [CORE_REQS-1:0][ADDR_WIDTH-1:0] reqq_addr; - wire [CORE_REQS-1:0][ATYPE_WIDTH-1:0] reqq_atype; + wire [CORE_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] reqq_flags; wire [CORE_REQS-1:0][WORD_WIDTH-1:0] reqq_data; wire [REQQ_TAG_WIDTH-1:0] reqq_tag; wire reqq_ready; @@ -123,7 +123,7 @@ module VX_mem_scheduler #( wire reqq_rw_s; wire [MERGED_REQS-1:0][LINE_SIZE-1:0] reqq_byteen_s; wire [MERGED_REQS-1:0][MEM_ADDR_WIDTH-1:0] reqq_addr_s; - wire [MERGED_REQS-1:0][ATYPE_WIDTH-1:0] reqq_atype_s; + wire [MERGED_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] reqq_flags_s; wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] reqq_data_s; wire [MERGED_TAG_WIDTH-1:0] reqq_tag_s; wire reqq_ready_s; @@ -133,7 +133,7 @@ module VX_mem_scheduler #( wire mem_req_rw_s; wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s; wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_s; - wire [MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] 
mem_req_atype_s; + wire [MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] mem_req_flags_s; wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s; wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s; wire mem_req_ready_s; @@ -161,14 +161,14 @@ module VX_mem_scheduler #( wire reqq_ready_in; wire [REQQ_TAG_WIDTH-1:0] reqq_tag_u; - if (UUID_WIDTH != 0) begin + if (UUID_WIDTH != 0) begin : g_reqq_tag_u_uuid assign reqq_tag_u = {core_req_tag[TAG_WIDTH-1 -: UUID_WIDTH], ibuf_waddr}; - end else begin + end else begin : g_reqq_tag_u assign reqq_tag_u = ibuf_waddr; end VX_elastic_buffer #( - .DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + ATYPE_WIDTH + WORD_WIDTH) + REQQ_TAG_WIDTH), + .DATAW (1 + CORE_REQS * (1 + WORD_SIZE + ADDR_WIDTH + `UP(FLAGS_WIDTH) + WORD_WIDTH) + REQQ_TAG_WIDTH), .SIZE (CORE_QUEUE_SIZE), .OUT_REG (1) ) req_queue ( @@ -176,8 +176,8 @@ module VX_mem_scheduler #( .reset (reset), .valid_in (reqq_valid_in), .ready_in (reqq_ready_in), - .data_in ({core_req_rw, core_req_mask, core_req_byteen, core_req_addr, core_req_atype, core_req_data, reqq_tag_u}), - .data_out ({reqq_rw, reqq_mask, reqq_byteen, reqq_addr, reqq_atype, reqq_data, reqq_tag}), + .data_in ({core_req_rw, core_req_mask, core_req_byteen, core_req_addr, core_req_flags, core_req_data, reqq_tag_u}), + .data_out ({reqq_rw, reqq_mask, reqq_byteen, reqq_addr, reqq_flags, reqq_data, reqq_tag}), .valid_out(reqq_valid), .ready_out(reqq_ready) ); @@ -188,8 +188,8 @@ module VX_mem_scheduler #( // no pending requests assign core_req_empty = !reqq_valid && ibuf_empty; - // notify request submisison - assign core_req_sent = reqq_valid && reqq_ready; + // notify write request submission + assign core_req_wr_notify = reqq_valid && reqq_ready && reqq_rw; // Index buffer /////////////////////////////////////////////////////////// @@ -221,23 +221,23 @@ module VX_mem_scheduler #( // Handle memory coalescing /////////////////////////////////////////////// - if (COALESCE_ENABLE) begin - - `RESET_RELAY (coalescer_reset, reset); + if
(COALESCE_ENABLE) begin : g_coalescer VX_mem_coalescer #( - .INSTANCE_ID ($sformatf("%s-coalescer", INSTANCE_ID)), + .INSTANCE_ID (`SFORMATF(("%s-coalescer", INSTANCE_ID))), .NUM_REQS (CORE_REQS), .DATA_IN_SIZE (WORD_SIZE), .DATA_OUT_SIZE (LINE_SIZE), .ADDR_WIDTH (ADDR_WIDTH), - .ATYPE_WIDTH (ATYPE_WIDTH), + .FLAGS_WIDTH (FLAGS_WIDTH), .TAG_WIDTH (REQQ_TAG_WIDTH), .UUID_WIDTH (UUID_WIDTH), .QUEUE_SIZE (MEM_QUEUE_SIZE) ) coalescer ( - .clk (clk), - .reset (coalescer_reset), + .clk (clk), + .reset (reset), + + `UNUSED_PIN (misses), // Input request .in_req_valid (reqq_valid), @@ -245,7 +245,7 @@ module VX_mem_scheduler #( .in_req_rw (reqq_rw), .in_req_byteen (reqq_byteen), .in_req_addr (reqq_addr), - .in_req_atype (reqq_atype), + .in_req_flags (reqq_flags), .in_req_data (reqq_data), .in_req_tag (reqq_tag), .in_req_ready (reqq_ready), @@ -263,7 +263,7 @@ module VX_mem_scheduler #( .out_req_rw (reqq_rw_s), .out_req_byteen (reqq_byteen_s), .out_req_addr (reqq_addr_s), - .out_req_atype (reqq_atype_s), + .out_req_flags (reqq_flags_s), .out_req_data (reqq_data_s), .out_req_tag (reqq_tag_s), .out_req_ready (reqq_ready_s), @@ -276,14 +276,13 @@ module VX_mem_scheduler #( .out_rsp_ready (mem_rsp_ready) ); - end else begin - + end else begin : g_no_coalescer assign reqq_valid_s = reqq_valid; assign reqq_mask_s = reqq_mask; assign reqq_rw_s = reqq_rw; assign reqq_byteen_s= reqq_byteen; assign reqq_addr_s = reqq_addr; - assign reqq_atype_s = reqq_atype; + assign reqq_flags_s = reqq_flags; assign reqq_data_s = reqq_data; assign reqq_tag_s = reqq_tag; assign reqq_ready = reqq_ready_s; @@ -301,25 +300,25 @@ module VX_mem_scheduler #( wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_mask_b; wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_b; wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_b; - wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][ATYPE_WIDTH-1:0] mem_req_atype_b; + wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] 
mem_req_flags_b; wire [MEM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_b; wire [BATCH_SEL_WIDTH-1:0] req_batch_idx; - for (genvar i = 0; i < MEM_BATCHES; ++i) begin - for (genvar j = 0; j < MEM_CHANNELS; ++j) begin + for (genvar i = 0; i < MEM_BATCHES; ++i) begin : g_mem_req_data_b + for (genvar j = 0; j < MEM_CHANNELS; ++j) begin : g_j localparam r = i * MEM_CHANNELS + j; - if (r < MERGED_REQS) begin + if (r < MERGED_REQS) begin : g_valid assign mem_req_mask_b[i][j] = reqq_mask_s[r]; assign mem_req_byteen_b[i][j] = reqq_byteen_s[r]; assign mem_req_addr_b[i][j] = reqq_addr_s[r]; - assign mem_req_atype_b[i][j] = reqq_atype_s[r]; + assign mem_req_flags_b[i][j] = reqq_flags_s[r]; assign mem_req_data_b[i][j] = reqq_data_s[r]; - end else begin + end else begin : g_padding assign mem_req_mask_b[i][j] = 0; assign mem_req_byteen_b[i][j] = '0; assign mem_req_addr_b[i][j] = '0; - assign mem_req_atype_b[i][j] = '0; + assign mem_req_flags_b[i][j] = '0; assign mem_req_data_b[i][j] = '0; end end @@ -329,10 +328,10 @@ module VX_mem_scheduler #( assign mem_req_rw_s = reqq_rw_s; assign mem_req_byteen_s = mem_req_byteen_b[req_batch_idx]; assign mem_req_addr_s = mem_req_addr_b[req_batch_idx]; - assign mem_req_atype_s = mem_req_atype_b[req_batch_idx]; + assign mem_req_flags_s = mem_req_flags_b[req_batch_idx]; assign mem_req_data_s = mem_req_data_b[req_batch_idx]; - if (MEM_BATCHES != 1) begin + if (MEM_BATCHES != 1) begin : g_batch reg [MEM_BATCH_BITS-1:0] req_batch_idx_r; wire is_degenerate_batch = ~(| mem_req_mask_s); @@ -357,7 +356,7 @@ module VX_mem_scheduler #( wire [MEM_BATCHES-1:0][MEM_BATCH_BITS-1:0] req_batch_idxs; wire [MEM_BATCH_BITS-1:0] req_batch_idx_last; - for (genvar i = 0; i < MEM_BATCHES; ++i) begin + for (genvar i = 0; i < MEM_BATCHES; ++i) begin : g_req_batch assign req_batch_valids[i] = (| mem_req_mask_b[i]); assign req_batch_idxs[i] = MEM_BATCH_BITS'(i); end @@ -378,7 +377,7 @@ module VX_mem_scheduler #( assign req_sent_all = mem_req_ready_b && 
(req_batch_idx_r == req_batch_idx_last); assign mem_req_tag_s = {reqq_tag_s, req_batch_idx}; - end else begin + end else begin : g_no_batch assign mem_req_valid_s = reqq_valid_s; assign req_batch_idx = '0; @@ -389,8 +388,10 @@ module VX_mem_scheduler #( assign reqq_ready_s = req_sent_all; + wire [MEM_CHANNELS-1:0][`UP(FLAGS_WIDTH)-1:0] mem_req_flags_u; + VX_elastic_buffer #( - .DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + ATYPE_WIDTH + LINE_WIDTH) + MEM_TAG_WIDTH), + .DATAW (MEM_CHANNELS + 1 + MEM_CHANNELS * (LINE_SIZE + MEM_ADDR_WIDTH + `UP(FLAGS_WIDTH) + LINE_WIDTH) + MEM_TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF)) ) mem_req_buf ( @@ -398,115 +399,137 @@ module VX_mem_scheduler #( .reset (reset), .valid_in (mem_req_valid_s), .ready_in (mem_req_ready_s), - .data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_atype_s, mem_req_data_s, mem_req_tag_s}), - .data_out ({mem_req_mask, mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_atype, mem_req_data, mem_req_tag}), + .data_in ({mem_req_mask_s, mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_flags_s, mem_req_data_s, mem_req_tag_s}), + .data_out ({mem_req_mask, mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_flags_u, mem_req_data, mem_req_tag}), .valid_out (mem_req_valid), .ready_out (mem_req_ready) ); + if (FLAGS_WIDTH != 0) begin : g_mem_req_flags + assign mem_req_flags = mem_req_flags_u; + end else begin : g_mem_req_flags_0 + `UNUSED_VAR (mem_req_flags_u) + assign mem_req_flags = '0; + end + // Handle memory responses //////////////////////////////////////////////// - reg [CORE_QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask; - wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask; wire [BATCH_SEL_WIDTH-1:0] rsp_batch_idx; - - if (CORE_BATCHES > 1) begin + if (CORE_BATCHES > 1) begin : g_rsp_batch_idx assign rsp_batch_idx = mem_rsp_tag_s[CORE_BATCH_BITS-1:0]; - end else begin + end else begin : g_rsp_batch_idx_0 
assign rsp_batch_idx = '0; end - for (genvar r = 0; r < CORE_REQS; ++r) begin - localparam i = r / CORE_CHANNELS; - localparam j = r % CORE_CHANNELS; - assign curr_mask[r] = (BATCH_SEL_WIDTH'(i) == rsp_batch_idx) && mem_rsp_mask_s[j]; - end - - assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr] & ~curr_mask; - - wire rsp_complete = ~(| rsp_rem_mask_n); - - wire mem_rsp_fire_s = mem_rsp_valid_s && mem_rsp_ready_s; - - always @(posedge clk) begin - if (ibuf_push) begin - rsp_rem_mask[ibuf_waddr] <= core_req_mask; - end - if (mem_rsp_fire_s) begin - rsp_rem_mask[ibuf_raddr] <= rsp_rem_mask_n; - end - end - - if (RSP_PARTIAL == 1) begin - - reg [CORE_QUEUE_SIZE-1:0] rsp_sop_r; - - always @(posedge clk) begin - if (ibuf_push) begin - rsp_sop_r[ibuf_waddr] <= 1; - end - if (mem_rsp_fire_s) begin - rsp_sop_r[ibuf_raddr] <= 0; - end - end + if (CORE_REQS == 1) begin : g_rsp_1 + `UNUSED_VAR (rsp_batch_idx) assign crsp_valid = mem_rsp_valid_s; - assign crsp_mask = curr_mask; - assign crsp_sop = rsp_sop_r[ibuf_raddr]; - - for (genvar r = 0; r < CORE_REQS; ++r) begin - localparam j = r % CORE_CHANNELS; - assign crsp_data[r] = mem_rsp_data_s[j]; - end + assign crsp_mask = mem_rsp_mask_s; + assign crsp_sop = 1'b1; + assign crsp_eop = 1'b1; + assign crsp_data = mem_rsp_data_s; assign mem_rsp_ready_s = crsp_ready; - end else begin + end else begin : g_rsp_N - reg [CORE_BATCHES*CORE_CHANNELS*WORD_WIDTH-1:0] rsp_store [CORE_QUEUE_SIZE-1:0]; - reg [CORE_BATCHES*CORE_CHANNELS*WORD_WIDTH-1:0] rsp_store_n; - reg [CORE_REQS-1:0] rsp_orig_mask [CORE_QUEUE_SIZE-1:0]; + reg [CORE_QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask; + wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask; - always @(*) begin - rsp_store_n = rsp_store[ibuf_raddr]; - for (integer i = 0; i < CORE_CHANNELS; ++i) begin - if ((CORE_CHANNELS == 1) || mem_rsp_mask_s[i]) begin - rsp_store_n[(rsp_batch_idx * CORE_CHANNELS + i) * WORD_WIDTH +: WORD_WIDTH] = mem_rsp_data_s[i]; - end - end + for (genvar r = 0; r < CORE_REQS; ++r) begin : 
g_curr_mask + localparam i = r / CORE_CHANNELS; + localparam j = r % CORE_CHANNELS; + assign curr_mask[r] = (BATCH_SEL_WIDTH'(i) == rsp_batch_idx) && mem_rsp_mask_s[j]; end + assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr] & ~curr_mask; + + wire mem_rsp_fire_s = mem_rsp_valid_s && mem_rsp_ready_s; + always @(posedge clk) begin if (ibuf_push) begin - rsp_orig_mask[ibuf_waddr] <= core_req_mask; + rsp_rem_mask[ibuf_waddr] <= core_req_mask; end - if (mem_rsp_valid_s) begin - rsp_store[ibuf_raddr] <= rsp_store_n; + if (mem_rsp_fire_s) begin + rsp_rem_mask[ibuf_raddr] <= rsp_rem_mask_n; end end - assign crsp_valid = mem_rsp_valid_s && rsp_complete; - assign crsp_mask = rsp_orig_mask[ibuf_raddr]; - assign crsp_sop = 1'b1; + wire rsp_complete = ~(| rsp_rem_mask_n) || (CORE_REQS == 1); - for (genvar r = 0; r < CORE_REQS; ++r) begin - localparam i = r / CORE_CHANNELS; - localparam j = r % CORE_CHANNELS; - assign crsp_data[r] = rsp_store_n[(i * CORE_CHANNELS + j) * WORD_WIDTH +: WORD_WIDTH]; + if (RSP_PARTIAL != 0) begin : g_rsp_partial + + reg [CORE_QUEUE_SIZE-1:0] rsp_sop_r; + + always @(posedge clk) begin + if (ibuf_push) begin + rsp_sop_r[ibuf_waddr] <= 1; + end + if (mem_rsp_fire_s) begin + rsp_sop_r[ibuf_raddr] <= 0; + end + end + + assign crsp_valid = mem_rsp_valid_s; + assign crsp_mask = curr_mask; + assign crsp_sop = rsp_sop_r[ibuf_raddr]; + + for (genvar r = 0; r < CORE_REQS; ++r) begin : g_crsp_data + localparam j = r % CORE_CHANNELS; + assign crsp_data[r] = mem_rsp_data_s[j]; + end + + assign mem_rsp_ready_s = crsp_ready; + + end else begin : g_rsp_full + + wire [CORE_CHANNELS-1:0][CORE_BATCHES-1:0][WORD_WIDTH-1:0] rsp_store_n; + reg [CORE_REQS-1:0] rsp_orig_mask [CORE_QUEUE_SIZE-1:0]; + + for (genvar i = 0; i < CORE_CHANNELS; ++i) begin : g_rsp_store + for (genvar j = 0; j < CORE_BATCHES; ++j) begin : g_j + reg [WORD_WIDTH-1:0] rsp_store [0:CORE_QUEUE_SIZE-1]; + wire rsp_wren = mem_rsp_fire_s + && (BATCH_SEL_WIDTH'(j) == rsp_batch_idx) + && ((CORE_CHANNELS == 1) 
|| mem_rsp_mask_s[i]); + always @(posedge clk) begin + if (rsp_wren) begin + rsp_store[ibuf_raddr] <= mem_rsp_data_s[i]; + end + end + assign rsp_store_n[i][j] = rsp_wren ? mem_rsp_data_s[i] : rsp_store[ibuf_raddr]; + end + end + + always @(posedge clk) begin + if (ibuf_push) begin + rsp_orig_mask[ibuf_waddr] <= core_req_mask; + end + end + + assign crsp_valid = mem_rsp_valid_s && rsp_complete; + assign crsp_mask = rsp_orig_mask[ibuf_raddr]; + assign crsp_sop = 1'b1; + + for (genvar r = 0; r < CORE_REQS; ++r) begin : g_crsp_data + localparam i = r / CORE_CHANNELS; + localparam j = r % CORE_CHANNELS; + assign crsp_data[r] = rsp_store_n[j][i]; + end + + assign mem_rsp_ready_s = crsp_ready || ~rsp_complete; end - assign mem_rsp_ready_s = crsp_ready || ~rsp_complete; - + assign crsp_eop = rsp_complete; end - if (UUID_WIDTH != 0) begin + if (UUID_WIDTH != 0) begin : g_crsp_tag assign crsp_tag = {mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout}; - end else begin + end else begin : g_crsp_tag_0 assign crsp_tag = ibuf_dout; end - assign crsp_eop = rsp_complete; - // Send response to caller VX_elastic_buffer #( @@ -518,7 +541,7 @@ module VX_mem_scheduler #( .reset (reset), .valid_in (crsp_valid), .ready_in (crsp_ready), - .data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}), + .data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}), .data_out ({core_rsp_mask, core_rsp_sop, core_rsp_eop, core_rsp_data, core_rsp_tag}), .valid_out (core_rsp_valid), .ready_out (core_rsp_ready) @@ -527,9 +550,9 @@ module VX_mem_scheduler #( `ifdef SIMULATION wire [`UP(UUID_WIDTH)-1:0] req_dbg_uuid; - if (UUID_WIDTH != 0) begin + if (UUID_WIDTH != 0) begin : g_req_dbg_uuid assign req_dbg_uuid = core_req_tag[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin + end else begin : g_req_dbg_uuid_0 assign req_dbg_uuid = '0; end @@ -569,11 +592,11 @@ module VX_mem_scheduler #( wire [`UP(UUID_WIDTH)-1:0] mem_rsp_dbg_uuid; wire [`UP(UUID_WIDTH)-1:0] rsp_dbg_uuid; - if (UUID_WIDTH != 
0) begin + if (UUID_WIDTH != 0) begin : g_dbg_uuid assign mem_req_dbg_uuid = mem_req_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH]; assign mem_rsp_dbg_uuid = mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH]; assign rsp_dbg_uuid = core_rsp_tag[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin + end else begin : g_dbg_uuid_0 assign mem_req_dbg_uuid = '0; assign mem_rsp_dbg_uuid = '0; assign rsp_dbg_uuid = '0; @@ -586,41 +609,41 @@ module VX_mem_scheduler #( always @(posedge clk) begin if (core_req_fire) begin if (core_req_rw) begin - `TRACE(1, ("%d: %s-core-req-wr: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)); - `TRACE_ARRAY1D(1, "0x%h", core_req_addr, CORE_REQS); - `TRACE(1, (", byteen=")); - `TRACE_ARRAY1D(1, "0x%h", core_req_byteen, CORE_REQS); - `TRACE(1, (", data=")); - `TRACE_ARRAY1D(1, "0x%0h", core_req_data, CORE_REQS); + `TRACE(2, ("%t: %s core-req-wr: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", core_req_addr, CORE_REQS) + `TRACE(2, (", byteen=")) + `TRACE_ARRAY1D(2, "0x%h", core_req_byteen, CORE_REQS) + `TRACE(2, (", data=")) + `TRACE_ARRAY1D(2, "0x%0h", core_req_data, CORE_REQS) end else begin - `TRACE(1, ("%d: %s-core-req-rd: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)); - `TRACE_ARRAY1D(1, "0x%h", core_req_addr, CORE_REQS); + `TRACE(2, ("%t: %s core-req-rd: valid=%b, addr=", $time, INSTANCE_ID, core_req_mask)) + `TRACE_ARRAY1D(2, "0x%h", core_req_addr, CORE_REQS) end - `TRACE(1, (", tag=0x%0h (#%0d)\n", core_req_tag, req_dbg_uuid)); + `TRACE(2, (", tag=0x%0h (#%0d)\n", core_req_tag, req_dbg_uuid)) end if (core_rsp_valid && core_rsp_ready) begin - `TRACE(1, ("%d: %s-core-rsp: valid=%b, sop=%b, eop=%b, data=", $time, INSTANCE_ID, core_rsp_mask, core_rsp_sop, core_rsp_eop)); - `TRACE_ARRAY1D(1, "0x%0h", core_rsp_data, CORE_REQS); - `TRACE(1, (", tag=0x%0h (#%0d)\n", core_rsp_tag, rsp_dbg_uuid)); + `TRACE(2, ("%t: %s core-rsp: valid=%b, sop=%b, eop=%b, data=", $time, INSTANCE_ID, core_rsp_mask, core_rsp_sop, 
core_rsp_eop)) + `TRACE_ARRAY1D(2, "0x%0h", core_rsp_data, CORE_REQS) + `TRACE(2, (", tag=0x%0h (#%0d)\n", core_rsp_tag, rsp_dbg_uuid)) end if (| mem_req_fire_s) begin if (| mem_req_rw_s) begin - `TRACE(1, ("%d: %s-mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)); - `TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS); - `TRACE(1, (", byteen=")); - `TRACE_ARRAY1D(1, "0x%h", mem_req_byteen_s, CORE_CHANNELS); - `TRACE(1, (", data=")); - `TRACE_ARRAY1D(1, "0x%0h", mem_req_data_s, CORE_CHANNELS); + `TRACE(2, ("%t: %s mem-req-wr: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)) + `TRACE_ARRAY1D(2, "0x%h", mem_req_addr_s, CORE_CHANNELS) + `TRACE(2, (", byteen=")) + `TRACE_ARRAY1D(2, "0x%h", mem_req_byteen_s, CORE_CHANNELS) + `TRACE(2, (", data=")) + `TRACE_ARRAY1D(2, "0x%0h", mem_req_data_s, CORE_CHANNELS) end else begin - `TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)); - `TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS); + `TRACE(2, ("%t: %s mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s)) + `TRACE_ARRAY1D(2, "0x%h", mem_req_addr_s, CORE_CHANNELS) end - `TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr_s, req_batch_idx, mem_req_dbg_uuid)); + `TRACE(2, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr_s, req_batch_idx, mem_req_dbg_uuid)) end - if (mem_rsp_fire_s) begin - `TRACE(1, ("%d: %s-mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s)); - `TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data_s, CORE_CHANNELS); - `TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr, rsp_batch_idx, mem_rsp_dbg_uuid)); + if (mem_rsp_valid_s && mem_rsp_ready_s) begin + `TRACE(2, ("%t: %s mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s)) + `TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data_s, CORE_CHANNELS) + `TRACE(2, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr, rsp_batch_idx, mem_rsp_dbg_uuid)) end end `endif diff --git a/hw/rtl/libs/VX_multiplier.sv 
b/hw/rtl/libs/VX_multiplier.sv index 2f046779f..11bf13a9f 100644 --- a/hw/rtl/libs/VX_multiplier.sv +++ b/hw/rtl/libs/VX_multiplier.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -21,7 +21,7 @@ module VX_multiplier #( parameter SIGNED = 0, parameter LATENCY = 0 ) ( - input wire clk, + input wire clk, input wire enable, input wire [A_WIDTH-1:0] dataa, input wire [B_WIDTH-1:0] datab, @@ -29,15 +29,15 @@ module VX_multiplier #( ); wire [R_WIDTH-1:0] prod_w; - if (SIGNED != 0) begin + if (SIGNED != 0) begin : g_prod_s assign prod_w = R_WIDTH'($signed(dataa) * $signed(datab)); - end else begin + end else begin : g_prod_u assign prod_w = R_WIDTH'(dataa * datab); end - - if (LATENCY == 0) begin + + if (LATENCY == 0) begin : g_passthru assign result = prod_w; - end else begin + end else begin : g_latency reg [LATENCY-1:0][R_WIDTH-1:0] prod_r; always @(posedge clk) begin if (enable) begin @@ -46,8 +46,8 @@ module VX_multiplier #( prod_r[i] <= prod_r[i-1]; end end - end - assign result = prod_r[LATENCY-1]; + end + assign result = prod_r[LATENCY-1]; end endmodule diff --git a/hw/rtl/libs/VX_mux.sv b/hw/rtl/libs/VX_mux.sv index f0bc78cae..19a06600f 100644 --- a/hw/rtl/libs/VX_mux.sv +++ b/hw/rtl/libs/VX_mux.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -19,13 +19,13 @@ module VX_mux #( parameter N = 1, parameter LN = `LOG2UP(N) ) ( - input wire [N-1:0][DATAW-1:0] data_in, - input wire [LN-1:0] sel_in, + input wire [N-1:0][DATAW-1:0] data_in, + input wire [LN-1:0] sel_in, output wire [DATAW-1:0] data_out -); - if (N > 1) begin +); + if (N > 1) begin : g_mux assign data_out = data_in[sel_in]; - end else begin + end else begin : g_passthru `UNUSED_VAR (sel_in) assign data_out = data_in; end diff --git a/hw/rtl/libs/VX_onehot_encoder.sv b/hw/rtl/libs/VX_onehot_encoder.sv index 8f7ada257..08198e430 100644 --- a/hw/rtl/libs/VX_onehot_encoder.sv +++ b/hw/rtl/libs/VX_onehot_encoder.sv @@ -13,7 +13,7 @@ `include "VX_platform.vh" -// Fast encoder using parallel prefix computation +// Fast one-hot encoder using parallel prefix computation // Adapted from BaseJump STL: http://bjump.org/data_out.html `TRACING_OFF @@ -27,44 +27,40 @@ module VX_onehot_encoder #( output wire [LN-1:0] data_out, output wire valid_out ); - if (N == 1) begin + if (N == 1) begin : g_n1 assign data_out = 0; assign valid_out = data_in; - end else if (N == 2) begin + end else if (N == 2) begin : g_n2 assign data_out = data_in[!REVERSE]; assign valid_out = (| data_in); - end else if (MODEL == 1) begin + end else if (MODEL == 1) begin : g_model1 localparam M = 1 << LN; `IGNORE_UNOPTFLAT_BEGIN - wire [LN-1:0][M-1:0] addr; - wire [LN:0][M-1:0] v; + wire [M-1:0] addr [LN]; + wire [M-1:0] v [LN+1]; `IGNORE_UNOPTFLAT_END // base case, also handle padding for non-power of two inputs assign v[0] = REVERSE ? 
(M'(data_in) << (M - N)) : M'(data_in); - for (genvar lvl = 1; lvl < (LN+1); ++lvl) begin + for (genvar lvl = 1; lvl < (LN+1); ++lvl) begin : g_scan_l localparam SN = 1 << (LN - lvl); localparam SI = M / SN; - localparam SW = lvl; - - for (genvar s = 0; s < SN; ++s) begin + for (genvar s = 0; s < SN; ++s) begin : g_scan_s `IGNORE_UNOPTFLAT_BEGIN wire [1:0] vs = {v[lvl-1][s*SI+(SI>>1)], v[lvl-1][s*SI]}; `IGNORE_UNOPTFLAT_END - assign v[lvl][s*SI] = (| vs); - - if (lvl == 1) begin - assign addr[lvl-1][s*SI +: SW] = vs[!REVERSE]; - end else begin - assign addr[lvl-1][s*SI +: SW] = { + if (lvl == 1) begin : g_lvl_1 + assign addr[lvl-1][s*SI +: lvl] = vs[!REVERSE]; + end else begin : g_lvl_n + assign addr[lvl-1][s*SI +: lvl] = { vs[!REVERSE], - addr[lvl-2][s*SI +: SW-1] | addr[lvl-2][s*SI+(SI>>1) +: SW-1] + addr[lvl-2][s*SI +: lvl-1] | addr[lvl-2][s*SI+(SI>>1) +: lvl-1] }; end end @@ -73,11 +69,11 @@ module VX_onehot_encoder #( assign data_out = addr[LN-1][LN-1:0]; assign valid_out = v[LN][0]; - end else if (MODEL == 2 && REVERSE == 0) begin + end else if (MODEL == 2 && REVERSE == 0) begin : g_model2 - for (genvar j = 0; j < LN; ++j) begin + for (genvar j = 0; j < LN; ++j) begin : g_data_out wire [N-1:0] mask; - for (genvar i = 0; i < N; ++i) begin + for (genvar i = 0; i < N; ++i) begin : g_mask assign mask[i] = i[j]; end assign data_out[j] = | (mask & data_in); @@ -85,31 +81,31 @@ module VX_onehot_encoder #( assign valid_out = (| data_in); - end else begin + end else begin : g_model0 - reg [LN-1:0] index_r; + reg [LN-1:0] index_w; - if (REVERSE != 0) begin + if (REVERSE != 0) begin : g_msb always @(*) begin - index_r = 'x; + index_w = 'x; for (integer i = N-1; i >= 0; --i) begin if (data_in[i]) begin - index_r = LN'(N-1-i); + index_w = LN'(N-1-i); end end end - end else begin + end else begin : g_lsb always @(*) begin - index_r = 'x; + index_w = 'x; for (integer i = 0; i < N; ++i) begin if (data_in[i]) begin - index_r = LN'(i); + index_w = LN'(i); end end end end - 
assign data_out = index_r; + assign data_out = index_w; assign valid_out = (| data_in); end diff --git a/hw/rtl/libs/VX_onehot_mux.sv b/hw/rtl/libs/VX_onehot_mux.sv index cc0fffaa6..8b97692f5 100644 --- a/hw/rtl/libs/VX_onehot_mux.sv +++ b/hw/rtl/libs/VX_onehot_mux.sv @@ -24,116 +24,126 @@ module VX_onehot_mux #( input wire [N-1:0] sel_in, output wire [DATAW-1:0] data_out ); - if (N == 1) begin + if (N == 1) begin : g_passthru `UNUSED_VAR (sel_in) assign data_out = data_in; - end else if (LUT_OPT && N == 2) begin + end else if (LUT_OPT && N == 2) begin : g_lut2 `UNUSED_VAR (sel_in) assign data_out = sel_in[0] ? data_in[0] : data_in[1]; - end else if (LUT_OPT && N == 3) begin - reg [DATAW-1:0] data_out_r; + end else if (LUT_OPT && N == 3) begin : g_lut3 + reg [DATAW-1:0] data_out_w; always @(*) begin case (sel_in) - 3'b001: data_out_r = data_in[0]; - 3'b010: data_out_r = data_in[1]; - 3'b100: data_out_r = data_in[2]; - default: data_out_r = 'x; + 3'b001: data_out_w = data_in[0]; + 3'b010: data_out_w = data_in[1]; + 3'b100: data_out_w = data_in[2]; + default: data_out_w = 'x; endcase end - assign data_out = data_out_r; - end else if (LUT_OPT && N == 4) begin - reg [DATAW-1:0] data_out_r; + assign data_out = data_out_w; + end else if (LUT_OPT && N == 4) begin : g_lut4 + reg [DATAW-1:0] data_out_w; always @(*) begin case (sel_in) - 4'b0001: data_out_r = data_in[0]; - 4'b0010: data_out_r = data_in[1]; - 4'b0100: data_out_r = data_in[2]; - 4'b1000: data_out_r = data_in[3]; - default: data_out_r = 'x; + 4'b0001: data_out_w = data_in[0]; + 4'b0010: data_out_w = data_in[1]; + 4'b0100: data_out_w = data_in[2]; + 4'b1000: data_out_w = data_in[3]; + default: data_out_w = 'x; endcase end - assign data_out = data_out_r; - end else if (LUT_OPT && N == 5) begin - reg [DATAW-1:0] data_out_r; + assign data_out = data_out_w; + end else if (LUT_OPT && N == 5) begin : g_lut5 + reg [DATAW-1:0] data_out_w; always @(*) begin case (sel_in) - 5'b00001: data_out_r = data_in[0]; - 5'b00010: 
data_out_r = data_in[1]; - 5'b00100: data_out_r = data_in[2]; - 5'b01000: data_out_r = data_in[3]; - 5'b10000: data_out_r = data_in[4]; - default: data_out_r = 'x; + 5'b00001: data_out_w = data_in[0]; + 5'b00010: data_out_w = data_in[1]; + 5'b00100: data_out_w = data_in[2]; + 5'b01000: data_out_w = data_in[3]; + 5'b10000: data_out_w = data_in[4]; + default: data_out_w = 'x; endcase end - assign data_out = data_out_r; - end else if (LUT_OPT && N == 6) begin - reg [DATAW-1:0] data_out_r; + assign data_out = data_out_w; + end else if (LUT_OPT && N == 6) begin : g_lut6 + reg [DATAW-1:0] data_out_w; always @(*) begin case (sel_in) - 6'b000001: data_out_r = data_in[0]; - 6'b000010: data_out_r = data_in[1]; - 6'b000100: data_out_r = data_in[2]; - 6'b001000: data_out_r = data_in[3]; - 6'b010000: data_out_r = data_in[4]; - 6'b100000: data_out_r = data_in[5]; - default: data_out_r = 'x; + 6'b000001: data_out_w = data_in[0]; + 6'b000010: data_out_w = data_in[1]; + 6'b000100: data_out_w = data_in[2]; + 6'b001000: data_out_w = data_in[3]; + 6'b010000: data_out_w = data_in[4]; + 6'b100000: data_out_w = data_in[5]; + default: data_out_w = 'x; endcase end - assign data_out = data_out_r; - end else if (LUT_OPT && N == 7) begin - reg [DATAW-1:0] data_out_r; + assign data_out = data_out_w; + end else if (LUT_OPT && N == 7) begin : g_lut7 + reg [DATAW-1:0] data_out_w; always @(*) begin case (sel_in) - 7'b0000001: data_out_r = data_in[0]; - 7'b0000010: data_out_r = data_in[1]; - 7'b0000100: data_out_r = data_in[2]; - 7'b0001000: data_out_r = data_in[3]; - 7'b0010000: data_out_r = data_in[4]; - 7'b0100000: data_out_r = data_in[5]; - 7'b1000000: data_out_r = data_in[6]; - default: data_out_r = 'x; + 7'b0000001: data_out_w = data_in[0]; + 7'b0000010: data_out_w = data_in[1]; + 7'b0000100: data_out_w = data_in[2]; + 7'b0001000: data_out_w = data_in[3]; + 7'b0010000: data_out_w = data_in[4]; + 7'b0100000: data_out_w = data_in[5]; + 7'b1000000: data_out_w = data_in[6]; + default: data_out_w 
= 'x; endcase end - assign data_out = data_out_r; - end else if (LUT_OPT && N == 8) begin - reg [DATAW-1:0] data_out_r; + assign data_out = data_out_w; + end else if (LUT_OPT && N == 8) begin : g_lut8 + reg [DATAW-1:0] data_out_w; always @(*) begin case (sel_in) - 8'b00000001: data_out_r = data_in[0]; - 8'b00000010: data_out_r = data_in[1]; - 8'b00000100: data_out_r = data_in[2]; - 8'b00001000: data_out_r = data_in[3]; - 8'b00010000: data_out_r = data_in[4]; - 8'b00100000: data_out_r = data_in[5]; - 8'b01000000: data_out_r = data_in[6]; - 8'b10000000: data_out_r = data_in[7]; - default: data_out_r = 'x; + 8'b00000001: data_out_w = data_in[0]; + 8'b00000010: data_out_w = data_in[1]; + 8'b00000100: data_out_w = data_in[2]; + 8'b00001000: data_out_w = data_in[3]; + 8'b00010000: data_out_w = data_in[4]; + 8'b00100000: data_out_w = data_in[5]; + 8'b01000000: data_out_w = data_in[6]; + 8'b10000000: data_out_w = data_in[7]; + default: data_out_w = 'x; endcase end - assign data_out = data_out_r; - end else if (MODEL == 1) begin + assign data_out = data_out_w; + end else if (MODEL == 1) begin : g_model1 wire [N-1:0][DATAW-1:0] mask; - for (genvar i = 0; i < N; ++i) begin + for (genvar i = 0; i < N; ++i) begin : g_mask assign mask[i] = {DATAW{sel_in[i]}} & data_in[i]; end - for (genvar i = 0; i < DATAW; ++i) begin + for (genvar i = 0; i < DATAW; ++i) begin : g_data_out wire [N-1:0] gather; - for (genvar j = 0; j < N; ++j) begin + for (genvar j = 0; j < N; ++j) begin : g_gather assign gather[j] = mask[j][i]; end assign data_out[i] = (| gather); end - end else if (MODEL == 2) begin - reg [DATAW-1:0] data_out_r; + end else if (MODEL == 2) begin : g_model2 + VX_find_first #( + .N (N), + .DATAW (DATAW) + ) find_first ( + .valid_in (sel_in), + .data_in (data_in), + .data_out (data_out), + `UNUSED_PIN (valid_out) + ); + end else if (MODEL == 3) begin : g_model3 + reg [DATAW-1:0] data_out_w; always @(*) begin - data_out_r = 'x; + data_out_w = 'x; for (integer i = 0; i < N; ++i) 
begin if (sel_in[i]) begin - data_out_r = data_in[i]; + data_out_w = data_in[i]; end end end - assign data_out = data_out_r; + assign data_out = data_out_w; end endmodule diff --git a/hw/syn/xilinx/test/kernel/start.S b/hw/rtl/libs/VX_onehot_shift.sv similarity index 57% rename from hw/syn/xilinx/test/kernel/start.S rename to hw/rtl/libs/VX_onehot_shift.sv index e9295d643..3222e3067 100644 --- a/hw/syn/xilinx/test/kernel/start.S +++ b/hw/rtl/libs/VX_onehot_shift.sv @@ -1,23 +1,32 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-.section .init, "ax" -.global _start -.type _start, @function -_start: - # call main routine - call main +`include "VX_platform.vh" - # end execution - .insn r 0x0b, 0, 0, x0, x0, x0 -.size _start, .-_start \ No newline at end of file +`TRACING_OFF +module VX_onehot_shift #( + parameter N = 1, + parameter M = 1 +) ( + input wire [N-1:0] data_in0, + input wire [M-1:0] data_in1, + output wire [N*M-1:0] data_out +); + for (genvar i = 0; i < M; ++i) begin : g_i + for (genvar j = 0; j < N; ++j) begin : g_j + assign data_out[i*N + j] = data_in1[i] & data_in0[j]; + end + end + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_pe_serializer.sv b/hw/rtl/libs/VX_pe_serializer.sv index eac1eddcb..4a66a6399 100644 --- a/hw/rtl/libs/VX_pe_serializer.sv +++ b/hw/rtl/libs/VX_pe_serializer.sv @@ -35,8 +35,8 @@ module VX_pe_serializer #( // PE output wire pe_enable, - output wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in, - input wire [NUM_PES-1:0][DATA_OUT_WIDTH-1:0] pe_data_out, + output wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_out, + input wire [NUM_PES-1:0][DATA_OUT_WIDTH-1:0] pe_data_in, // output output wire valid_out, @@ -49,101 +49,92 @@ module VX_pe_serializer #( wire [TAG_WIDTH-1:0] tag_out_u; wire ready_out_u; - wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in_s; - wire valid_out_s; - wire [TAG_WIDTH-1:0] tag_out_s; + wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_out_w; + wire pe_valid_in; + wire [TAG_WIDTH-1:0] pe_tag_in; wire enable; VX_shift_register #( .DATAW (1 + TAG_WIDTH), - .DEPTH (LATENCY + PE_REG), + .DEPTH (PE_REG + LATENCY), .RESETW (1) ) shift_reg ( .clk (clk), .reset (reset), .enable (enable), - .data_in ({valid_in, tag_in}), - .data_out ({valid_out_s, tag_out_s}) + .data_in ({valid_in, tag_in}), + .data_out ({pe_valid_in, pe_tag_in}) ); VX_pipe_register #( - .DATAW (NUM_PES * DATA_IN_WIDTH), - .DEPTH (PE_REG) - ) pe_reg ( + .DATAW (NUM_PES * DATA_IN_WIDTH), + .DEPTH (PE_REG) + ) pe_data_reg ( .clk (clk), .reset (reset), .enable (enable), - 
.data_in (pe_data_in_s), - .data_out (pe_data_in) + .data_in (pe_data_out_w), + .data_out (pe_data_out) ); - if (NUM_LANES != NUM_PES) begin + assign pe_enable = enable; + + if (NUM_LANES != NUM_PES) begin : g_serialize localparam BATCH_SIZE = NUM_LANES / NUM_PES; localparam BATCH_SIZEW = `LOG2UP(BATCH_SIZE); - reg [BATCH_SIZEW-1:0] batch_in_idx; - reg [BATCH_SIZEW-1:0] batch_out_idx; + reg [BATCH_SIZEW-1:0] batch_in_idx, batch_out_idx; + reg batch_in_done, batch_out_done; - for (genvar i = 0; i < NUM_PES; ++i) begin - assign pe_data_in_s[i] = data_in[batch_in_idx * NUM_PES + i]; + for (genvar i = 0; i < NUM_PES; ++i) begin : g_pe_data_out_w + assign pe_data_out_w[i] = data_in[batch_in_idx * NUM_PES + i]; end always @(posedge clk) begin if (reset) begin - batch_in_idx <= '0; - batch_out_idx <= '0; + batch_in_idx <= '0; + batch_out_idx <= '0; + batch_in_done <= 0; + batch_out_done <= 0; end else if (enable) begin - if (valid_in) begin - batch_in_idx <= batch_in_idx + BATCH_SIZEW'(1); - end - if (valid_out_s) begin - batch_out_idx <= batch_out_idx + BATCH_SIZEW'(1); - end + batch_in_idx <= batch_in_idx + BATCH_SIZEW'(valid_in); + batch_out_idx <= batch_out_idx + BATCH_SIZEW'(pe_valid_in); + batch_in_done <= valid_in && (batch_in_idx == BATCH_SIZEW'(BATCH_SIZE-2)); + batch_out_done <= pe_valid_in && (batch_out_idx == BATCH_SIZEW'(BATCH_SIZE-2)); end end - wire batch_in_done = (batch_in_idx == BATCH_SIZEW'(BATCH_SIZE-1)); - wire batch_out_done = (batch_out_idx == BATCH_SIZEW'(BATCH_SIZE-1)); + reg [BATCH_SIZE-1:0][(NUM_PES * DATA_OUT_WIDTH)-1:0] data_out_r, data_out_n; - reg valid_out_r; - reg [BATCH_SIZE-1:0][NUM_PES-1:0][DATA_OUT_WIDTH-1:0] data_out_r; - reg [TAG_WIDTH-1:0] tag_out_r; - - wire valid_out_b = valid_out_s && batch_out_done; - wire ready_out_b = ready_out_u || ~valid_out_u; + always @(*) begin + data_out_n = data_out_r; + if (pe_valid_in) begin + data_out_n[batch_out_idx] = pe_data_in; + end + end always @(posedge clk) begin - if (reset) begin - 
valid_out_r <= 1'b0; - end else if (ready_out_b) begin - valid_out_r <= valid_out_b; - end - if (ready_out_b) begin - data_out_r[batch_out_idx] <= pe_data_out; - tag_out_r <= tag_out_s; - end + data_out_r <= data_out_n; end - assign enable = ready_out_b || ~valid_out_b; - assign ready_in = enable && batch_in_done; - assign pe_enable = enable; - - assign valid_out_u = valid_out_r; - assign data_out_u = data_out_r; - assign tag_out_u = tag_out_r; - - end else begin - - assign pe_data_in_s = data_in; - assign enable = ready_out_u || ~valid_out_u; - assign ready_in = enable; - assign pe_enable = enable; + assign ready_in = enable && batch_in_done; - assign valid_out_u = valid_out_s; - assign data_out_u = pe_data_out; - assign tag_out_u = tag_out_s; + assign valid_out_u = batch_out_done; + assign data_out_u = data_out_n; + assign tag_out_u = pe_tag_in; + + end else begin : g_passthru + + assign pe_data_out_w = data_in; + + assign enable = ready_out_u || ~pe_valid_in; + assign ready_in = enable; + + assign valid_out_u = pe_valid_in; + assign data_out_u = pe_data_in; + assign tag_out_u = pe_tag_in; end diff --git a/hw/rtl/libs/VX_pending_size.sv b/hw/rtl/libs/VX_pending_size.sv index 031e57695..b94889e6e 100644 --- a/hw/rtl/libs/VX_pending_size.sv +++ b/hw/rtl/libs/VX_pending_size.sv @@ -13,7 +13,7 @@ `include "VX_platform.vh" -//`TRACING_OFF +`TRACING_OFF module VX_pending_size #( parameter SIZE = 1, parameter INCRW = 1, @@ -34,97 +34,159 @@ module VX_pending_size #( ); `STATIC_ASSERT(INCRW <= SIZEW, ("invalid parameter: %d vs %d", INCRW, SIZEW)) `STATIC_ASSERT(DECRW <= SIZEW, ("invalid parameter: %d vs %d", DECRW, SIZEW)) - localparam ADDRW = `LOG2UP(SIZE); - reg empty_r, alm_empty_r; - reg full_r, alm_full_r; + if (SIZE == 1) begin : g_size_eq1 - if (INCRW != 1 || DECRW != 1) begin - - reg [SIZEW-1:0] size_r; - - wire [SIZEW-1:0] size_n = size_r + SIZEW'(incr) - SIZEW'(decr); + reg size_r; always @(posedge clk) begin if (reset) begin - empty_r <= 1; - alm_empty_r <= 1; 
- alm_full_r <= 0; - full_r <= 0; - size_r <= '0; + size_r <= '0; end else begin - `ASSERT((SIZEW'(incr) >= SIZEW'(decr)) || (size_n >= size_r), ("runtime error: counter overflow")); - `ASSERT((SIZEW'(incr) <= SIZEW'(decr)) || (size_n <= size_r), ("runtime error: counter underflow")); - size_r <= size_n; - empty_r <= (size_n == SIZEW'(0)); - alm_empty_r <= (size_n == SIZEW'(ALM_EMPTY)); - full_r <= (size_n == SIZEW'(SIZE)); - alm_full_r <= (size_n == SIZEW'(ALM_FULL)); - end - end - - assign size = size_r; - - end else begin - - reg [ADDRW-1:0] used_r; - wire [ADDRW-1:0] used_n; - - always @(posedge clk) begin - if (reset) begin - empty_r <= 1; - alm_empty_r <= 1; - full_r <= 0; - alm_full_r <= 0; - used_r <= '0; - end else begin - `ASSERT(~(incr && ~decr) || ~full, ("runtime error: counter overflow")); - `ASSERT(~(decr && ~incr) || ~empty, ("runtime error: counter underflow")); if (incr) begin if (~decr) begin - empty_r <= 0; - if (used_r == ADDRW'(ALM_EMPTY)) - alm_empty_r <= 0; - if (used_r == ADDRW'(SIZE-1)) - full_r <= 1; - if (used_r == ADDRW'(ALM_FULL-1)) - alm_full_r <= 1; + size_r <= 1; end end else if (decr) begin - if (used_r == ADDRW'(1)) - empty_r <= 1; - if (used_r == ADDRW'(ALM_EMPTY+1)) - alm_empty_r <= 1; - full_r <= 0; - if (used_r == ADDRW'(ALM_FULL)) - alm_full_r <= 0; + size_r <= '0; end - used_r <= used_n; end end - if (SIZE == 2) begin - assign used_n = used_r ^ (incr ^ decr); - end else begin - assign used_n = $signed(used_r) + ADDRW'($signed(2'(incr) - 2'(decr))); + assign empty = (size_r == 0); + assign full = (size_r != 0); + assign alm_empty = 1'b1; + assign alm_full = 1'b1; + assign size = size_r; + + end else begin : g_size_gt1 + + reg empty_r, alm_empty_r; + reg full_r, alm_full_r; + + if (INCRW != 1 || DECRW != 1) begin : g_wide_step + + localparam DELTAW = `MIN(SIZEW, `MAX(INCRW, DECRW)+1); + + logic [SIZEW-1:0] size_n, size_r; + + wire [DELTAW-1:0] delta = DELTAW'(incr) - DELTAW'(decr); + + assign size_n = $signed(size_r) + 
SIZEW'($signed(delta)); + + always @(posedge clk) begin + if (reset) begin + empty_r <= 1; + full_r <= 0; + alm_empty_r <= 1; + alm_full_r <= 0; + size_r <= '0; + end else begin + `ASSERT((DELTAW'(incr) <= DELTAW'(decr)) || (size_n >= size_r), ("runtime error: counter overflow")); + `ASSERT((DELTAW'(incr) >= DELTAW'(decr)) || (size_n <= size_r), ("runtime error: counter underflow")); + empty_r <= (size_n == SIZEW'(0)); + full_r <= (size_n == SIZEW'(SIZE)); + alm_empty_r <= (size_n <= SIZEW'(ALM_EMPTY)); + alm_full_r <= (size_n >= SIZEW'(ALM_FULL)); + size_r <= size_n; + end + end + + assign size = size_r; + + end else begin : g_single_step + + localparam ADDRW = `LOG2UP(SIZE); + + reg [ADDRW-1:0] used_r; + + wire is_alm_empty = (used_r == ADDRW'(ALM_EMPTY)); + wire is_alm_empty_n = (used_r == ADDRW'(ALM_EMPTY+1)); + wire is_alm_full = (used_r == ADDRW'(ALM_FULL)); + wire is_alm_full_n = (used_r == ADDRW'(ALM_FULL-1)); + + always @(posedge clk) begin + if (reset) begin + alm_empty_r <= 1; + alm_full_r <= 0; + end else begin + if (incr) begin + if (~decr) begin + if (is_alm_empty) + alm_empty_r <= 0; + if (is_alm_full_n) + alm_full_r <= 1; + end + end else if (decr) begin + if (is_alm_full) + alm_full_r <= 0; + if (is_alm_empty_n) + alm_empty_r <= 1; + end + end + end + + if (SIZE > 2) begin : g_size_gt2 + + wire is_empty_n = (used_r == ADDRW'(1)); + wire is_full_n = (used_r == ADDRW'(SIZE-1)); + + wire [1:0] delta = {~incr & decr, incr ^ decr}; + + always @(posedge clk) begin + if (reset) begin + empty_r <= 1; + full_r <= 0; + used_r <= '0; + end else begin + if (incr) begin + if (~decr) begin + empty_r <= 0; + if (is_full_n) + full_r <= 1; + end + end else if (decr) begin + full_r <= 0; + if (is_empty_n) + empty_r <= 1; + end + used_r <= $signed(used_r) + ADDRW'($signed(delta)); + end + end + + end else begin : g_size_eq2 + + always @(posedge clk) begin + if (reset) begin + empty_r <= 1; + full_r <= 0; + used_r <= '0; + end else begin + empty_r <= (empty_r & ~incr) 
| (~full_r & decr & ~incr); + full_r <= (~empty_r & incr & ~decr) | (full_r & ~(decr ^ incr)); + used_r <= used_r ^ (incr ^ decr); + end + end + end + + if (SIZE > 1) begin : g_sizeN + if (SIZEW > ADDRW) begin : g_not_log2 + assign size = {full_r, used_r}; + end else begin : g_log2 + assign size = used_r; + end + end else begin : g_size1 + assign size = full_r; + end + end - if (SIZE > 1) begin - if (SIZEW > ADDRW) begin - assign size = {full_r, used_r}; - end else begin - assign size = used_r; - end - end else begin - assign size = full_r; - end + assign empty = empty_r; + assign full = full_r; + assign alm_empty = alm_empty_r; + assign alm_full = alm_full_r; end - assign empty = empty_r; - assign alm_empty = alm_empty_r; - assign alm_full = alm_full_r; - assign full = full_r; - endmodule -//`TRACING_ON +`TRACING_ON diff --git a/hw/rtl/libs/VX_pipe_buffer.sv b/hw/rtl/libs/VX_pipe_buffer.sv index 167235c17..5ba23bc08 100644 --- a/hw/rtl/libs/VX_pipe_buffer.sv +++ b/hw/rtl/libs/VX_pipe_buffer.sv @@ -24,8 +24,9 @@ `TRACING_OFF module VX_pipe_buffer #( - parameter DATAW = 1, - parameter DEPTH = 1 + parameter DATAW = 1, + parameter RESETW = 0, + parameter DEPTH = 1 ) ( input wire clk, input wire reset, @@ -36,16 +37,16 @@ module VX_pipe_buffer #( input wire ready_out, output wire valid_out ); - if (DEPTH == 0) begin + if (DEPTH == 0) begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) assign ready_in = ready_out; assign valid_out = valid_in; assign data_out = data_in; - end else begin + end else begin : g_register wire [DEPTH:0] valid; `IGNORE_UNOPTFLAT_BEGIN - wire [DEPTH:0] ready; + wire ready [DEPTH+1]; `IGNORE_UNOPTFLAT_END wire [DEPTH:0][DATAW-1:0] data; @@ -53,11 +54,11 @@ module VX_pipe_buffer #( assign data[0] = data_in; assign ready_in = ready[0]; - for (genvar i = 0; i < DEPTH; ++i) begin + for (genvar i = 0; i < DEPTH; ++i) begin : g_pipe_regs assign ready[i] = (ready[i+1] || ~valid[i+1]); VX_pipe_register #( .DATAW (1 + DATAW), - .RESETW (1) + .RESETW 
(1 + RESETW) ) pipe_register ( .clk (clk), .reset (reset), @@ -70,7 +71,6 @@ module VX_pipe_buffer #( assign valid_out = valid[DEPTH]; assign data_out = data[DEPTH]; assign ready[DEPTH] = ready_out; - end endmodule diff --git a/hw/rtl/libs/VX_pipe_register.sv b/hw/rtl/libs/VX_pipe_register.sv index 707438abd..ef19cb58b 100644 --- a/hw/rtl/libs/VX_pipe_register.sv +++ b/hw/rtl/libs/VX_pipe_register.sv @@ -17,8 +17,8 @@ module VX_pipe_register #( parameter DATAW = 1, parameter RESETW = 0, - parameter DEPTH = 1, - parameter MAX_FANOUT = 0 + parameter [`UP(RESETW)-1:0] INIT_VALUE = {`UP(RESETW){1'b0}}, + parameter DEPTH = 1 ) ( input wire clk, input wire reset, @@ -26,81 +26,61 @@ module VX_pipe_register #( input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out ); - if (DEPTH == 0) begin + if (DEPTH == 0) begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) `UNUSED_VAR (enable) assign data_out = data_in; - end else if (DEPTH == 1) begin - if (MAX_FANOUT != 0 && (DATAW > (MAX_FANOUT + MAX_FANOUT/2))) begin - localparam NUM_SLICES = `CDIV(DATAW, MAX_FANOUT); - localparam N_DATAW = DATAW / NUM_SLICES; - for (genvar i = 0; i < NUM_SLICES; ++i) begin - localparam SLICE_START = i * N_DATAW; - localparam SLICE_END = SLICE_START + S_DATAW - 1; - localparam S_DATAW = (i == NUM_SLICES-1) ? (DATAW - SLICE_START) : N_DATAW; - localparam S_RESETW = (SLICE_END >= (DATAW - RESETW)) ? - ((SLICE_START >= (DATAW - RESETW)) ? 
S_DATAW : (SLICE_END - (DATAW - RESETW) + 1)) : 0; - VX_pipe_register #( - .DATAW (S_DATAW), - .RESETW (S_RESETW) - ) pipe_register_slice ( - .clk (clk), - .reset (reset), - .enable (enable), - .data_in (data_in[i * N_DATAW +: S_DATAW]), - .data_out (data_out[i * N_DATAW +: S_DATAW]) - ); + end else if (DEPTH == 1) begin : g_depth1 + if (RESETW == 0) begin : g_no_reset + `UNUSED_VAR (reset) + reg [DATAW-1:0] value; + + always @(posedge clk) begin + if (enable) begin + value <= data_in; + end end - end else begin - if (RESETW == 0) begin - `UNUSED_VAR (reset) - reg [DATAW-1:0] value; + assign data_out = value; + end else if (RESETW < DATAW) begin : g_partial_reset + reg [DATAW-RESETW-1:0] value_d; + reg [RESETW-1:0] value_r; - always @(posedge clk) begin - if (enable) begin - value <= data_in; - end + always @(posedge clk) begin + if (reset) begin + value_r <= INIT_VALUE; + end else if (enable) begin + value_r <= data_in[DATAW-1:DATAW-RESETW]; end - assign data_out = value; - end else if (RESETW == DATAW) begin - reg [DATAW-1:0] value; - - always @(posedge clk) begin - if (reset) begin - value <= RESETW'(0); - end else if (enable) begin - value <= data_in; - end - end - assign data_out = value; - end else begin - reg [DATAW-RESETW-1:0] value_d; - reg [RESETW-1:0] value_r; - - always @(posedge clk) begin - if (reset) begin - value_r <= RESETW'(0); - end else if (enable) begin - value_r <= data_in[DATAW-1:DATAW-RESETW]; - end - end - - always @(posedge clk) begin - if (enable) begin - value_d <= data_in[DATAW-RESETW-1:0]; - end - end - assign data_out = {value_r, value_d}; end + + always @(posedge clk) begin + if (enable) begin + value_d <= data_in[DATAW-RESETW-1:0]; + end + end + assign data_out = {value_r, value_d}; + end else begin : g_full_reset + reg [DATAW-1:0] value; + + always @(posedge clk) begin + if (reset) begin + value <= INIT_VALUE; + end else if (enable) begin + value <= data_in; + end + end + assign data_out = value; end - end else begin + end else 
begin : g_recursive wire [DEPTH:0][DATAW-1:0] data_delayed; assign data_delayed[0] = data_in; - for (genvar i = 1; i <= DEPTH; ++i) begin + + for (genvar i = 1; i <= DEPTH; ++i) begin : g_pipe_reg VX_pipe_register #( .DATAW (DATAW), - .RESETW (RESETW) + .RESETW (RESETW), + .INIT_VALUE (INIT_VALUE) ) pipe_reg ( .clk (clk), .reset (reset), diff --git a/hw/rtl/interfaces/VX_sfu_perf_if.sv b/hw/rtl/libs/VX_placeholder.sv similarity index 70% rename from hw/rtl/interfaces/VX_sfu_perf_if.sv rename to hw/rtl/libs/VX_placeholder.sv index db9c5d125..738da615b 100644 --- a/hw/rtl/interfaces/VX_sfu_perf_if.sv +++ b/hw/rtl/libs/VX_placeholder.sv @@ -1,27 +1,27 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-`include "VX_define.vh" +`include "VX_platform.vh" -interface VX_sfu_perf_if (); - wire [`PERF_CTR_BITS-1:0] wctl_stalls; +`TRACING_OFF +`BLACKBOX_CELL module VX_placeholder #( + parameter I = 0, + parameter O = 0 +) ( + input wire [`UP(I)-1:0] in, + output wire [`UP(O)-1:0] out +); + // empty module - modport master ( - output wctl_stalls - ); - - modport slave ( - input wctl_stalls - ); - -endinterface +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_popcount.sv b/hw/rtl/libs/VX_popcount.sv index eaec78789..fa8c49099 100644 --- a/hw/rtl/libs/VX_popcount.sv +++ b/hw/rtl/libs/VX_popcount.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -21,21 +21,21 @@ module VX_popcount63( reg [2:0] sum; always @(*) begin case (data_in) - 6'd0: sum=3'd0; 6'd1: sum=3'd1; 6'd2: sum=3'd1; 6'd3: sum=3'd2; + 6'd0: sum=3'd0; 6'd1: sum=3'd1; 6'd2: sum=3'd1; 6'd3: sum=3'd2; 6'd4: sum=3'd1; 6'd5: sum=3'd2; 6'd6: sum=3'd2; 6'd7: sum=3'd3; - 6'd8: sum=3'd1; 6'd9: sum=3'd2; 6'd10: sum=3'd2; 6'd11: sum=3'd3; + 6'd8: sum=3'd1; 6'd9: sum=3'd2; 6'd10: sum=3'd2; 6'd11: sum=3'd3; 6'd12: sum=3'd2; 6'd13: sum=3'd3; 6'd14: sum=3'd3; 6'd15: sum=3'd4; - 6'd16: sum=3'd1; 6'd17: sum=3'd2; 6'd18: sum=3'd2; 6'd19: sum=3'd3; + 6'd16: sum=3'd1; 6'd17: sum=3'd2; 6'd18: sum=3'd2; 6'd19: sum=3'd3; 6'd20: sum=3'd2; 6'd21: sum=3'd3; 6'd22: sum=3'd3; 6'd23: sum=3'd4; - 6'd24: sum=3'd2; 6'd25: sum=3'd3; 6'd26: sum=3'd3; 6'd27: sum=3'd4; + 6'd24: sum=3'd2; 6'd25: sum=3'd3; 6'd26: sum=3'd3; 6'd27: sum=3'd4; 6'd28: sum=3'd3; 6'd29: sum=3'd4; 6'd30: sum=3'd4; 6'd31: sum=3'd5; - 6'd32: sum=3'd1; 6'd33: sum=3'd2; 6'd34: sum=3'd2; 6'd35: sum=3'd3; + 6'd32: sum=3'd1; 6'd33: sum=3'd2; 6'd34: sum=3'd2; 6'd35: sum=3'd3; 6'd36: sum=3'd2; 6'd37: sum=3'd3; 6'd38: sum=3'd3; 6'd39: sum=3'd4; - 6'd40: sum=3'd2; 6'd41: sum=3'd3; 6'd42: sum=3'd3; 6'd43: sum=3'd4; + 6'd40: sum=3'd2; 6'd41: sum=3'd3; 6'd42: sum=3'd3; 6'd43: sum=3'd4; 6'd44: sum=3'd3; 6'd45: sum=3'd4; 6'd46: sum=3'd4; 6'd47: sum=3'd5; - 6'd48: sum=3'd2; 6'd49: sum=3'd3; 6'd50: sum=3'd3; 6'd51: sum=3'd4; + 6'd48: sum=3'd2; 6'd49: sum=3'd3; 6'd50: sum=3'd3; 6'd51: sum=3'd4; 6'd52: sum=3'd3; 6'd53: sum=3'd4; 6'd54: sum=3'd4; 6'd55: sum=3'd5; - 6'd56: sum=3'd3; 6'd57: sum=3'd4; 6'd58: sum=3'd4; 6'd59: sum=3'd5; + 6'd56: sum=3'd3; 6'd57: sum=3'd4; 6'd58: sum=3'd4; 6'd59: sum=3'd5; 6'd60: sum=3'd4; 6'd61: sum=3'd5; 6'd62: sum=3'd5; 6'd63: sum=3'd6; endcase end @@ -49,7 +49,7 @@ module VX_popcount32( reg [1:0] sum; always @(*) begin case (data_in) - 3'd0: sum=2'd0; 3'd1: sum=2'd1; 3'd2: sum=2'd1; 3'd3: sum=2'd2; + 3'd0: sum=2'd0; 3'd1: sum=2'd1; 3'd2: sum=2'd1; 3'd3: sum=2'd2; 3'd4: sum=2'd1; 3'd5: 
sum=2'd2; 3'd6: sum=2'd2; 3'd7: sum=2'd3; endcase end @@ -88,23 +88,23 @@ endmodule module VX_popcount #( parameter MODEL = 1, parameter N = 1, - parameter M = `CLOG2(N+1) + parameter M = `CLOG2(N+1) ) ( input wire [N-1:0] data_in, output wire [M-1:0] data_out ); - `UNUSED_PARAM (MODEL) + `UNUSED_PARAM (MODEL) `ifndef SYNTHESIS assign data_out = $countones(data_in); `elsif QUARTUS assign data_out = $countones(data_in); `else - if (N == 1) begin + if (N == 1) begin : g_passthru assign data_out = data_in; - end else if (N <= 3) begin + end else if (N <= 3) begin : g_popcount3 reg [2:0] t_in; wire [1:0] t_out; @@ -113,10 +113,10 @@ module VX_popcount #( t_in[N-1:0] = data_in; end VX_popcount32 pc32(t_in, t_out); - assign data_out = t_out[M-1:0]; - - end else if (N <= 6) begin - + assign data_out = t_out[M-1:0]; + + end else if (N <= 6) begin : g_popcount6 + reg [5:0] t_in; wire [2:0] t_out; always @(*) begin @@ -125,9 +125,9 @@ module VX_popcount #( end VX_popcount63 pc63(t_in, t_out); assign data_out = t_out[M-1:0]; - - end else if (N <= 9) begin - + + end else if (N <= 9) begin : g_popcount9 + reg [8:0] t_in; wire [4:0] t1_out; wire [3:0] t2_out; @@ -140,8 +140,8 @@ module VX_popcount #( VX_sum33 sum33(t1_out[2:0], {1'b0, t1_out[4:3]}, t2_out); assign data_out = t2_out[M-1:0]; - end else if (N <= 12) begin - + end else if (N <= 12) begin : g_popcount12 + reg [11:0] t_in; wire [5:0] t1_out; wire [3:0] t2_out; @@ -154,8 +154,8 @@ module VX_popcount #( VX_sum33 sum33(t1_out[2:0], t1_out[5:3], t2_out); assign data_out = t2_out[M-1:0]; - end else if (N <= 18) begin - + end else if (N <= 18) begin : g_popcount18 + reg [17:0] t_in; wire [8:0] t1_out; wire [5:0] t2_out; @@ -171,23 +171,23 @@ module VX_popcount #( VX_popcount32 pc32c({t1_out[2], t1_out[5], t1_out[8]}, t2_out[5:4]); assign data_out = {2'b0,t2_out[1:0]} + {1'b0,t2_out[3:2],1'b0} + {t2_out[5:4],2'b0}; - end else if (MODEL == 1) begin + end else if (MODEL == 1) begin : g_model1 localparam PN = 1 << `CLOG2(N); 
localparam LOGPN = `CLOG2(PN); `IGNORE_UNOPTFLAT_BEGIN - wire [M-1:0] tmp [LOGPN-1:0][PN-1:0]; + wire [M-1:0] tmp [LOGPN-1:0][PN-1:0]; `IGNORE_UNOPTFLAT_END for (genvar j = 0; j < LOGPN; ++j) begin localparam D = j + 1; localparam Q = (D < LOGPN) ? (D + 1) : M; - for (genvar i = 0; i < (1 << (LOGPN-j-1)); ++i) begin + for (genvar i = 0; i < (1 << (LOGPN-j-1)); ++i) begin localparam l = i * 2; localparam r = i * 2 + 1; - wire [Q-1:0] res; - if (j == 0) begin + wire [Q-1:0] res; + if (j == 0) begin if (r < N) begin assign res = data_in[l] + data_in[r]; end else if (l < N) begin @@ -203,20 +203,20 @@ module VX_popcount #( end assign data_out = tmp[LOGPN-1][0]; - - end else begin - reg [M-1:0] cnt_r; + end else begin : g_model2 + + reg [M-1:0] cnt_w; always @(*) begin - cnt_r = '0; + cnt_w = '0; for (integer i = 0; i < N; ++i) begin - cnt_r = cnt_r + M'(data_in[i]); + cnt_w = cnt_w + M'(data_in[i]); end end - assign data_out = cnt_r; - + assign data_out = cnt_w; + end `endif diff --git a/hw/rtl/libs/VX_priority_arbiter.sv b/hw/rtl/libs/VX_priority_arbiter.sv index cd4844d25..de5a3b3b1 100644 --- a/hw/rtl/libs/VX_priority_arbiter.sv +++ b/hw/rtl/libs/VX_priority_arbiter.sv @@ -23,21 +23,21 @@ module VX_priority_arbiter #( output wire [NUM_REQS-1:0] grant_onehot, output wire grant_valid ); - if (NUM_REQS == 1) begin + if (NUM_REQS == 1) begin : g_passthru assign grant_index = '0; assign grant_onehot = requests; assign grant_valid = requests[0]; - end else begin + end else begin : g_encoder VX_priority_encoder #( .N (NUM_REQS) ) priority_encoder ( - .data_in (requests), - .index (grant_index), - .onehot (grant_onehot), - .valid_out (grant_valid) + .data_in (requests), + .index_out (grant_index), + .onehot_out (grant_onehot), + .valid_out (grant_valid) ); end diff --git a/hw/rtl/libs/VX_priority_encoder.sv b/hw/rtl/libs/VX_priority_encoder.sv index 5a08e3412..444c40683 100644 --- a/hw/rtl/libs/VX_priority_encoder.sv +++ b/hw/rtl/libs/VX_priority_encoder.sv @@ -1,10 +1,10 
@@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,46 +14,67 @@ `include "VX_platform.vh" `TRACING_OFF -module VX_priority_encoder #( - parameter N = 1, +module VX_priority_encoder #( + parameter N = 1, parameter REVERSE = 0, parameter MODEL = 1, parameter LN = `LOG2UP(N) ) ( - input wire [N-1:0] data_in, - output wire [N-1:0] onehot, - output wire [LN-1:0] index, + input wire [N-1:0] data_in, + output wire [N-1:0] onehot_out, + output wire [LN-1:0] index_out, output wire valid_out ); - wire [N-1:0] reversed; + wire [N-1:0] reversed; - if (REVERSE != 0) begin - for (genvar i = 0; i < N; ++i) begin + if (REVERSE != 0) begin : g_reverse + for (genvar i = 0; i < N; ++i) begin : g_i assign reversed[N-i-1] = data_in[i]; - end - end else begin + end + end else begin : g_no_reverse assign reversed = data_in; end - if (N == 1) begin + if (N == 1) begin : g_n1 - assign onehot = reversed; - assign index = '0; - assign valid_out = reversed; + assign onehot_out = reversed; + assign index_out = '0; + assign valid_out = reversed; - end else if (N == 2) begin + end else if (N == 2) begin : g_n2 - assign onehot = {~reversed[0], reversed[0]}; - assign index = ~reversed[0]; - assign valid_out = (| reversed); + assign onehot_out = {reversed[1] && ~reversed[0], reversed[0]}; + assign index_out = ~reversed[0]; + assign valid_out = (| reversed); - end else if (MODEL == 1) begin + end else if (MODEL == 1) begin : g_model1 + + `IGNORE_UNOPTFLAT_BEGIN + wire [N-1:0] higher_pri_regs; + `IGNORE_UNOPTFLAT_END + + assign higher_pri_regs[0] = 1'b0; + for (genvar 
i = 1; i < N; ++i) begin : g_higher_pri_regs + assign higher_pri_regs[i] = higher_pri_regs[i-1] | reversed[i-1]; + end + assign onehot_out[N-1:0] = reversed[N-1:0] & ~higher_pri_regs[N-1:0]; + + VX_lzc #( + .N (N), + .REVERSE (1) + ) lzc ( + .data_in (reversed), + .data_out (index_out), + .valid_out (valid_out) + ); + + end else if (MODEL == 2) begin : g_model2 wire [N-1:0] scan_lo; VX_scan #( .N (N), - .OP (2) + .OP ("|") ) scan ( .data_in (reversed), .data_out (scan_lo) @@ -64,66 +85,46 @@ module VX_priority_encoder #( .REVERSE (1) ) lzc ( .data_in (reversed), - .data_out (index), - `UNUSED_PIN (valid_out) + .data_out (index_out), + .valid_out(valid_out) ); - assign onehot = scan_lo & {(~scan_lo[N-2:0]), 1'b1}; - assign valid_out = scan_lo[N-1]; + assign onehot_out = scan_lo & {(~scan_lo[N-2:0]), 1'b1}; - end else if (MODEL == 2) begin + end else if (MODEL == 3) begin : g_model3 - `IGNORE_WARNINGS_BEGIN - wire [N-1:0] higher_pri_regs; - `IGNORE_WARNINGS_END - assign higher_pri_regs[N-1:1] = higher_pri_regs[N-2:0] | reversed[N-2:0]; - assign higher_pri_regs[0] = 1'b0; - assign onehot[N-1:0] = reversed[N-1:0] & ~higher_pri_regs[N-1:0]; + assign onehot_out = reversed & -reversed; VX_lzc #( .N (N), .REVERSE (1) ) lzc ( .data_in (reversed), - .data_out (index), + .data_out (index_out), .valid_out (valid_out) ); - end else if (MODEL == 3) begin + end else begin : g_model0 - assign onehot = reversed & -reversed; - - VX_lzc #( - .N (N), - .REVERSE (1) - ) lzc ( - .data_in (reversed), - .data_out (index), - .valid_out (valid_out) - ); - - end else begin - - reg [LN-1:0] index_r; - reg [N-1:0] onehot_r; + reg [LN-1:0] index_w; + reg [N-1:0] onehot_w; always @(*) begin - index_r = 'x; - onehot_r = 'x; + index_w = 'x; + onehot_w = 'x; for (integer i = N-1; i >= 0; --i) begin if (reversed[i]) begin - index_r = LN'(i); - onehot_r = '0; - onehot_r[i] = 1'b1; + index_w = LN'(i); + onehot_w = N'(1) << i; end end - end + end - assign index = index_r; - assign onehot = onehot_r; - 
assign valid_out = (| reversed); + assign index_out = index_w; + assign onehot_out = onehot_w; + assign valid_out = (| reversed); - end + end endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_reduce.sv b/hw/rtl/libs/VX_reduce_tree.sv similarity index 64% rename from hw/rtl/libs/VX_reduce.sv rename to hw/rtl/libs/VX_reduce_tree.sv index ac0117567..d179bb596 100644 --- a/hw/rtl/libs/VX_reduce.sv +++ b/hw/rtl/libs/VX_reduce_tree.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,7 +14,7 @@ `include "VX_platform.vh" `TRACING_OFF -module VX_reduce #( +module VX_reduce_tree #( parameter DATAW_IN = 1, parameter DATAW_OUT = DATAW_IN, parameter N = 1, @@ -23,9 +23,9 @@ module VX_reduce #( input wire [N-1:0][DATAW_IN-1:0] data_in, output wire [DATAW_OUT-1:0] data_out ); - if (N == 1) begin + if (N == 1) begin : g_passthru assign data_out = DATAW_OUT'(data_in[0]); - end else begin + end else begin : g_reduce localparam int N_A = N / 2; localparam int N_B = N - N_A; @@ -33,40 +33,46 @@ module VX_reduce #( wire [N_B-1:0][DATAW_IN-1:0] in_B; wire [DATAW_OUT-1:0] out_A, out_B; - for (genvar i = 0; i < N_A; i++) begin + for (genvar i = 0; i < N_A; i++) begin : g_in_A assign in_A[i] = data_in[i]; end - for (genvar i = 0; i < N_B; i++) begin + for (genvar i = 0; i < N_B; i++) begin : g_in_B assign in_B[i] = data_in[N_A + i]; end - VX_reduce #( - .DATAW_IN (DATAW_IN), + VX_reduce_tree #( + .DATAW_IN (DATAW_IN), .DATAW_OUT (DATAW_OUT), .N (N_A), .OP (OP) ) reduce_A ( - .data_in (in_A), + .data_in (in_A), .data_out (out_A) ); - 
VX_reduce #( - .DATAW_IN (DATAW_IN), + VX_reduce_tree #( + .DATAW_IN (DATAW_IN), .DATAW_OUT (DATAW_OUT), .N (N_B), .OP (OP) ) reduce_B ( - .data_in (in_B), + .data_in (in_B), .data_out (out_B) ); - if (OP == "+") assign data_out = out_A + out_B; - else if (OP == "^") assign data_out = out_A ^ out_B; - else if (OP == "&") assign data_out = out_A & out_B; - else if (OP == "|") assign data_out = out_A | out_B; - else `ERROR(("invalid parameter")); + if (OP == "+") begin : g_plus + assign data_out = out_A + out_B; + end else if (OP == "^") begin : g_xor + assign data_out = out_A ^ out_B; + end else if (OP == "&") begin : g_and + assign data_out = out_A & out_B; + end else if (OP == "|") begin : g_or + assign data_out = out_A | out_B; + end else begin : g_error + `ERROR(("invalid parameter")); + end end - + endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_reset_relay.sv b/hw/rtl/libs/VX_reset_relay.sv index d7e735c25..0e2a7f4ca 100644 --- a/hw/rtl/libs/VX_reset_relay.sv +++ b/hw/rtl/libs/VX_reset_relay.sv @@ -22,19 +22,19 @@ module VX_reset_relay #( input wire reset, output wire [N-1:0] reset_o ); - if (MAX_FANOUT >= 0 && N > (MAX_FANOUT + MAX_FANOUT/2)) begin + if (MAX_FANOUT >= 0 && N > (MAX_FANOUT + MAX_FANOUT/2)) begin : g_relay localparam F = `UP(MAX_FANOUT); localparam R = N / F; `PRESERVE_NET reg [R-1:0] reset_r; - for (genvar i = 0; i < R; ++i) begin + for (genvar i = 0; i < R; ++i) begin : g_reset_r always @(posedge clk) begin reset_r[i] <= reset; end end - for (genvar i = 0; i < N; ++i) begin + for (genvar i = 0; i < N; ++i) begin : g_reset_o assign reset_o[i] = reset_r[i / F]; end - end else begin + end else begin : g_passthru `UNUSED_VAR (clk) assign reset_o = {N{reset}}; end diff --git a/hw/rtl/libs/VX_rr_arbiter.sv b/hw/rtl/libs/VX_rr_arbiter.sv index 52a981184..6be552572 100644 --- a/hw/rtl/libs/VX_rr_arbiter.sv +++ b/hw/rtl/libs/VX_rr_arbiter.sv @@ -28,7 +28,7 @@ module VX_rr_arbiter #( output wire grant_valid, input wire grant_ready ); - if (NUM_REQS 
== 1) begin + if (NUM_REQS == 1) begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) @@ -38,17 +38,19 @@ module VX_rr_arbiter #( assign grant_onehot = requests; assign grant_valid = requests[0]; - end else if (LUT_OPT && NUM_REQS == 2) begin + end else if (LUT_OPT && NUM_REQS == 2) begin : g_lut2 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [LOG_NUM_REQS-1:0] state; + reg [LOG_NUM_REQS-1:0] grant_index_w; + reg [NUM_REQS-1:0] grant_onehot_w; + reg [LOG_NUM_REQS-1:0] state; always @(*) begin casez ({state, requests}) 3'b0_01, - 3'b1_?1: begin grant_onehot_r = 2'b01; grant_index_r = LOG_NUM_REQS'(0); end - default: begin grant_onehot_r = 2'b10; grant_index_r = LOG_NUM_REQS'(1); end + 3'b1_?1: begin grant_onehot_w = 2'b01; grant_index_w = LOG_NUM_REQS'(0); end + 3'b0_1?, + 3'b1_10: begin grant_onehot_w = 2'b10; grant_index_w = LOG_NUM_REQS'(1); end + default: begin grant_onehot_w = 2'b00; grant_index_w = 'x; end endcase end @@ -56,29 +58,32 @@ module VX_rr_arbiter #( if (reset) begin state <= '0; end else if (grant_ready) begin - state <= grant_index_r; + state <= grant_index_w; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; + assign grant_index = grant_index_w; + assign grant_onehot = grant_onehot_w; assign grant_valid = (| requests); - end else if (LUT_OPT && NUM_REQS == 3) begin + end else if (LUT_OPT && NUM_REQS == 3) begin : g_lut3 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [LOG_NUM_REQS-1:0] state; + reg [LOG_NUM_REQS-1:0] grant_index_w; + reg [NUM_REQS-1:0] grant_onehot_w; + reg [LOG_NUM_REQS-1:0] state; always @(*) begin casez ({state, requests}) 5'b00_001, 5'b01_0?1, - 5'b10_??1: begin grant_onehot_r = 3'b001; grant_index_r = LOG_NUM_REQS'(0); end + 5'b10_??1: begin grant_onehot_w = 3'b001; grant_index_w = LOG_NUM_REQS'(0); end 5'b00_?1?, 5'b01_010, - 5'b10_?10: begin grant_onehot_r = 3'b010; grant_index_r = LOG_NUM_REQS'(1); end - 
default: begin grant_onehot_r = 3'b100; grant_index_r = LOG_NUM_REQS'(2); end + 5'b10_?10: begin grant_onehot_w = 3'b010; grant_index_w = LOG_NUM_REQS'(1); end + 5'b00_10?, + 5'b01_1??, + 5'b10_100: begin grant_onehot_w = 3'b100; grant_index_w = LOG_NUM_REQS'(2); end + default: begin grant_onehot_w = 3'b000; grant_index_w = 'x; end endcase end @@ -86,35 +91,39 @@ module VX_rr_arbiter #( if (reset) begin state <= '0; end else if (grant_ready) begin - state <= grant_index_r; + state <= grant_index_w; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; + assign grant_index = grant_index_w; + assign grant_onehot = grant_onehot_w; assign grant_valid = (| requests); - end else if (LUT_OPT && NUM_REQS == 4) begin + end else if (LUT_OPT && NUM_REQS == 4) begin : g_lut4 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [LOG_NUM_REQS-1:0] state; + reg [LOG_NUM_REQS-1:0] grant_index_w; + reg [NUM_REQS-1:0] grant_onehot_w; + reg [LOG_NUM_REQS-1:0] state; always @(*) begin casez ({state, requests}) 6'b00_0001, 6'b01_00?1, 6'b10_0??1, - 6'b11_???1: begin grant_onehot_r = 4'b0001; grant_index_r = LOG_NUM_REQS'(0); end + 6'b11_???1: begin grant_onehot_w = 4'b0001; grant_index_w = LOG_NUM_REQS'(0); end 6'b00_??1?, 6'b01_0010, 6'b10_0?10, - 6'b11_??10: begin grant_onehot_r = 4'b0010; grant_index_r = LOG_NUM_REQS'(1); end + 6'b11_??10: begin grant_onehot_w = 4'b0010; grant_index_w = LOG_NUM_REQS'(1); end 6'b00_?10?, 6'b01_?1??, 6'b10_0100, - 6'b11_?100: begin grant_onehot_r = 4'b0100; grant_index_r = LOG_NUM_REQS'(2); end - default: begin grant_onehot_r = 4'b1000; grant_index_r = LOG_NUM_REQS'(3); end + 6'b11_?100: begin grant_onehot_w = 4'b0100; grant_index_w = LOG_NUM_REQS'(2); end + 6'b00_100?, + 6'b01_10??, + 6'b10_1???, + 6'b11_1000: begin grant_onehot_w = 4'b1000; grant_index_w = LOG_NUM_REQS'(3); end + default: begin grant_onehot_w = 4'b0000; grant_index_w = 'x; end endcase end @@ -122,19 +131,19 @@ module 
VX_rr_arbiter #( if (reset) begin state <= '0; end else if (grant_ready) begin - state <= grant_index_r; + state <= grant_index_w; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; + assign grant_index = grant_index_w; + assign grant_onehot = grant_onehot_w; assign grant_valid = (| requests); - end else if (LUT_OPT && NUM_REQS == 5) begin + end else if (LUT_OPT && NUM_REQS == 5) begin : g_lut5 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [LOG_NUM_REQS-1:0] state; + reg [LOG_NUM_REQS-1:0] grant_index_w; + reg [NUM_REQS-1:0] grant_onehot_w; + reg [LOG_NUM_REQS-1:0] state; always @(*) begin casez ({state, requests}) @@ -142,23 +151,28 @@ module VX_rr_arbiter #( 8'b001_000?1, 8'b010_00??1, 8'b011_0???1, - 8'b100_????1: begin grant_onehot_r = 5'b00001; grant_index_r = LOG_NUM_REQS'(0); end + 8'b100_????1: begin grant_onehot_w = 5'b00001; grant_index_w = LOG_NUM_REQS'(0); end 8'b000_???1?, 8'b001_00010, 8'b010_00?10, 8'b011_0??10, - 8'b100_???10: begin grant_onehot_r = 5'b00010; grant_index_r = LOG_NUM_REQS'(1); end + 8'b100_???10: begin grant_onehot_w = 5'b00010; grant_index_w = LOG_NUM_REQS'(1); end 8'b000_??10?, 8'b001_??1??, 8'b010_00100, 8'b011_0?100, - 8'b100_??100: begin grant_onehot_r = 5'b00100; grant_index_r = LOG_NUM_REQS'(2); end + 8'b100_??100: begin grant_onehot_w = 5'b00100; grant_index_w = LOG_NUM_REQS'(2); end 8'b000_?100?, 8'b001_?10??, 8'b010_?1???, 8'b011_01000, - 8'b100_?1000: begin grant_onehot_r = 5'b01000; grant_index_r = LOG_NUM_REQS'(3); end - default: begin grant_onehot_r = 5'b10000; grant_index_r = LOG_NUM_REQS'(4); end + 8'b100_?1000: begin grant_onehot_w = 5'b01000; grant_index_w = LOG_NUM_REQS'(3); end + 8'b000_1000?, + 8'b001_100??, + 8'b010_10???, + 8'b011_1????, + 8'b100_10000: begin grant_onehot_w = 5'b10000; grant_index_w = LOG_NUM_REQS'(4); end + default: begin grant_onehot_w = 5'b00000; grant_index_w = 'x; end endcase end @@ -166,19 +180,19 @@ module 
VX_rr_arbiter #( if (reset) begin state <= '0; end else if (grant_ready) begin - state <= grant_index_r; + state <= grant_index_w; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; + assign grant_index = grant_index_w; + assign grant_onehot = grant_onehot_w; assign grant_valid = (| requests); - end else if (LUT_OPT && NUM_REQS == 6) begin + end else if (LUT_OPT && NUM_REQS == 6) begin : g_lut6 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [LOG_NUM_REQS-1:0] state; + reg [LOG_NUM_REQS-1:0] grant_index_w; + reg [NUM_REQS-1:0] grant_onehot_w; + reg [LOG_NUM_REQS-1:0] state; always @(*) begin casez ({state, requests}) @@ -187,32 +201,38 @@ module VX_rr_arbiter #( 9'b010_000??1, 9'b011_00???1, 9'b100_0????1, - 9'b101_?????1: begin grant_onehot_r = 6'b000001; grant_index_r = LOG_NUM_REQS'(0); end + 9'b101_?????1: begin grant_onehot_w = 6'b000001; grant_index_w = LOG_NUM_REQS'(0); end 9'b000_????1?, 9'b001_000010, 9'b010_000?10, 9'b011_00??10, 9'b100_0???10, - 9'b101_????10: begin grant_onehot_r = 6'b000010; grant_index_r = LOG_NUM_REQS'(1); end + 9'b101_????10: begin grant_onehot_w = 6'b000010; grant_index_w = LOG_NUM_REQS'(1); end 9'b000_???10?, 9'b001_???1??, 9'b010_000100, 9'b011_00?100, 9'b100_0??100, - 9'b101_???100: begin grant_onehot_r = 6'b000100; grant_index_r = LOG_NUM_REQS'(2); end + 9'b101_???100: begin grant_onehot_w = 6'b000100; grant_index_w = LOG_NUM_REQS'(2); end 9'b000_??100?, 9'b001_??10??, 9'b010_??1???, 9'b011_001000, 9'b100_0?1000, - 9'b101_??1000: begin grant_onehot_r = 6'b001000; grant_index_r = LOG_NUM_REQS'(3); end + 9'b101_??1000: begin grant_onehot_w = 6'b001000; grant_index_w = LOG_NUM_REQS'(3); end 9'b000_?1000?, 9'b001_?100??, 9'b010_?10???, 9'b011_?1????, 9'b100_010000, - 9'b101_?10000: begin grant_onehot_r = 6'b010000; grant_index_r = LOG_NUM_REQS'(4); end - default: begin grant_onehot_r = 6'b100000; grant_index_r = LOG_NUM_REQS'(5); end + 9'b101_?10000: begin 
grant_onehot_w = 6'b010000; grant_index_w = LOG_NUM_REQS'(4); end + 9'b000_10000?, + 9'b001_1000??, + 9'b010_100???, + 9'b011_10????, + 9'b100_1?????, + 9'b101_100000: begin grant_onehot_w = 6'b100000; grant_index_w = LOG_NUM_REQS'(5); end + default: begin grant_onehot_w = 6'b000000; grant_index_w = 'x; end endcase end @@ -220,65 +240,72 @@ module VX_rr_arbiter #( if (reset) begin state <= '0; end else if (grant_ready) begin - state <= grant_index_r; + state <= grant_index_w; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; + assign grant_index = grant_index_w; + assign grant_onehot = grant_onehot_w; assign grant_valid = (| requests); - end else if (LUT_OPT && NUM_REQS == 7) begin + end else if (LUT_OPT && NUM_REQS == 7) begin : g_lut7 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [LOG_NUM_REQS-1:0] state; + reg [LOG_NUM_REQS-1:0] grant_index_w; + reg [NUM_REQS-1:0] grant_onehot_w; + reg [LOG_NUM_REQS-1:0] state; always @(*) begin casez ({state, requests}) - 10'b000_000001, - 10'b001_0000?1, - 10'b010_000??1, - 10'b011_00???1, - 10'b100_00???1, - 10'b101_0????1, - 10'b110_?????1: begin grant_onehot_r = 7'b0000001; grant_index_r = LOG_NUM_REQS'(0); end + 10'b000_0000001, + 10'b001_00000?1, + 10'b010_0000??1, + 10'b011_000???1, + 10'b100_000???1, + 10'b101_00????1, + 10'b110_??????1: begin grant_onehot_w = 7'b0000001; grant_index_w = LOG_NUM_REQS'(0); end 10'b000_?????1?, 10'b001_0000010, 10'b010_0000?10, 10'b011_000??10, 10'b100_00???10, 10'b101_0????10, - 10'b110_?????10: begin grant_onehot_r = 7'b0000010; grant_index_r = LOG_NUM_REQS'(1); end + 10'b110_?????10: begin grant_onehot_w = 7'b0000010; grant_index_w = LOG_NUM_REQS'(1); end 10'b000_????10?, 10'b001_????1??, 10'b010_0000100, 10'b011_000?100, 10'b100_00??100, 10'b101_0???100, - 10'b110_????100: begin grant_onehot_r = 7'b0000100; grant_index_r = LOG_NUM_REQS'(2); end + 10'b110_????100: begin grant_onehot_w = 7'b0000100; grant_index_w = 
LOG_NUM_REQS'(2); end 10'b000_???100?, 10'b001_???10??, 10'b010_???1???, 10'b011_0001000, 10'b100_00?1000, 10'b101_0??1000, - 10'b110_???1000: begin grant_onehot_r = 7'b0001000; grant_index_r = LOG_NUM_REQS'(3); end + 10'b110_???1000: begin grant_onehot_w = 7'b0001000; grant_index_w = LOG_NUM_REQS'(3); end 10'b000_??1000?, 10'b001_??100??, 10'b010_??10???, 10'b011_??1????, 10'b100_0010000, 10'b101_0?10000, - 10'b110_??10000: begin grant_onehot_r = 7'b0010000; grant_index_r = LOG_NUM_REQS'(4); end + 10'b110_??10000: begin grant_onehot_w = 7'b0010000; grant_index_w = LOG_NUM_REQS'(4); end 10'b000_?10000?, 10'b001_?1000??, 10'b010_?100???, 10'b011_?10????, 10'b100_?1?????, 10'b101_0100000, - 10'b110_?100000: begin grant_onehot_r = 7'b0100000; grant_index_r = LOG_NUM_REQS'(5); end - default: begin grant_onehot_r = 7'b1000000; grant_index_r = LOG_NUM_REQS'(6); end + 10'b110_?100000: begin grant_onehot_w = 7'b0100000; grant_index_w = LOG_NUM_REQS'(5); end + 10'b000_100000?, + 10'b001_10000??, + 10'b010_1000???, + 10'b011_100????, + 10'b100_10?????, + 10'b101_1??????, + 10'b110_1000000: begin grant_onehot_w = 7'b1000000; grant_index_w = LOG_NUM_REQS'(6); end + default: begin grant_onehot_w = 7'b0000000; grant_index_w = 'x; end endcase end @@ -286,19 +313,19 @@ module VX_rr_arbiter #( if (reset) begin state <= '0; end else if (grant_ready) begin - state <= grant_index_r; + state <= grant_index_w; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; + assign grant_index = grant_index_w; + assign grant_onehot = grant_onehot_w; assign grant_valid = (| requests); - end else if (LUT_OPT && NUM_REQS == 8) begin + end else if (LUT_OPT && NUM_REQS == 8) begin : g_lut8 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [LOG_NUM_REQS-1:0] state; + reg [LOG_NUM_REQS-1:0] grant_index_w; + reg [NUM_REQS-1:0] grant_onehot_w; + reg [LOG_NUM_REQS-1:0] state; always @(*) begin casez ({state, requests}) @@ -309,7 +336,7 @@ 
module VX_rr_arbiter #( 11'b100_000????1, 11'b101_00?????1, 11'b110_0??????1, - 11'b111_???????1: begin grant_onehot_r = 8'b00000001; grant_index_r = LOG_NUM_REQS'(0); end + 11'b111_???????1: begin grant_onehot_w = 8'b00000001; grant_index_w = LOG_NUM_REQS'(0); end 11'b000_??????1?, 11'b001_00000010, 11'b010_00000?10, @@ -317,7 +344,7 @@ module VX_rr_arbiter #( 11'b100_000???10, 11'b101_00????10, 11'b110_0?????10, - 11'b111_??????10: begin grant_onehot_r = 8'b00000010; grant_index_r = LOG_NUM_REQS'(1); end + 11'b111_??????10: begin grant_onehot_w = 8'b00000010; grant_index_w = LOG_NUM_REQS'(1); end 11'b000_?????10?, 11'b001_?????1??, 11'b010_00000100, @@ -325,7 +352,7 @@ module VX_rr_arbiter #( 11'b100_000??100, 11'b101_00???100, 11'b110_0????100, - 11'b111_?????100: begin grant_onehot_r = 8'b00000100; grant_index_r = LOG_NUM_REQS'(2); end + 11'b111_?????100: begin grant_onehot_w = 8'b00000100; grant_index_w = LOG_NUM_REQS'(2); end 11'b000_????100?, 11'b001_????10??, 11'b010_????1???, @@ -333,7 +360,7 @@ module VX_rr_arbiter #( 11'b100_000?1000, 11'b101_00??1000, 11'b110_0???1000, - 11'b111_????1000: begin grant_onehot_r = 8'b00001000; grant_index_r = LOG_NUM_REQS'(3); end + 11'b111_????1000: begin grant_onehot_w = 8'b00001000; grant_index_w = LOG_NUM_REQS'(3); end 11'b000_???1000?, 11'b001_???100??, 11'b010_???10???, @@ -341,7 +368,7 @@ module VX_rr_arbiter #( 11'b100_00010000, 11'b101_00?10000, 11'b110_0??10000, - 11'b111_???10000: begin grant_onehot_r = 8'b00010000; grant_index_r = LOG_NUM_REQS'(4); end + 11'b111_???10000: begin grant_onehot_w = 8'b00010000; grant_index_w = LOG_NUM_REQS'(4); end 11'b000_??10000?, 11'b001_??1000??, 11'b010_??100???, @@ -349,7 +376,7 @@ module VX_rr_arbiter #( 11'b100_??1?????, 11'b101_00100000, 11'b110_0?100000, - 11'b111_??100000: begin grant_onehot_r = 8'b00100000; grant_index_r = LOG_NUM_REQS'(5); end + 11'b111_??100000: begin grant_onehot_w = 8'b00100000; grant_index_w = LOG_NUM_REQS'(5); end 11'b000_?100000?, 
11'b001_?10000??, 11'b010_?1000???, @@ -357,8 +384,16 @@ module VX_rr_arbiter #( 11'b100_?10?????, 11'b101_?1??????, 11'b110_01000000, - 11'b111_?1000000: begin grant_onehot_r = 8'b01000000; grant_index_r = LOG_NUM_REQS'(6); end - default: begin grant_onehot_r = 8'b10000000; grant_index_r = LOG_NUM_REQS'(7); end + 11'b111_?1000000: begin grant_onehot_w = 8'b01000000; grant_index_w = LOG_NUM_REQS'(6); end + 11'b000_1000000?, + 11'b001_100000??, + 11'b010_10000???, + 11'b011_1000????, + 11'b100_100?????, + 11'b101_10??????, + 11'b110_1???????, + 11'b111_10000000: begin grant_onehot_w = 8'b10000000; grant_index_w = LOG_NUM_REQS'(7); end + default: begin grant_onehot_w = 8'b00000000; grant_index_w = 'x; end endcase end @@ -366,81 +401,72 @@ module VX_rr_arbiter #( if (reset) begin state <= '0; end else if (grant_ready) begin - state <= grant_index_r; + state <= grant_index_w; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; + assign grant_index = grant_index_w; + assign grant_onehot = grant_onehot_w; assign grant_valid = (| requests); - end else if (MODEL == 1) begin + end else if (MODEL == 1) begin : g_model1 `IGNORE_UNOPTFLAT_BEGIN - wire [NUM_REQS-1:0] mask_higher_pri_regs, unmask_higher_pri_regs; + wire [NUM_REQS-1:0] masked_pri_reqs, unmasked_pri_reqs; `IGNORE_UNOPTFLAT_END - wire [NUM_REQS-1:0] grant_masked, grant_unmasked; + reg [NUM_REQS-1:0] reqs_mask; - reg [NUM_REQS-1:0] pointer_reg; + wire [NUM_REQS-1:0] masked_reqs = requests & reqs_mask; - wire [NUM_REQS-1:0] req_masked = requests & pointer_reg; - - assign mask_higher_pri_regs[0] = 1'b0; - for (genvar i = 1; i < NUM_REQS; ++i) begin - assign mask_higher_pri_regs[i] = mask_higher_pri_regs[i-1] | req_masked[i-1]; + assign masked_pri_reqs[0] = 1'b0; + for (genvar i = 1; i < NUM_REQS; ++i) begin : g_masked_pri_reqs + assign masked_pri_reqs[i] = masked_pri_reqs[i-1] | masked_reqs[i-1]; end - assign grant_masked[NUM_REQS-1:0] = req_masked[NUM_REQS-1:0] & 
~mask_higher_pri_regs[NUM_REQS-1:0]; - - assign unmask_higher_pri_regs[0] = 1'b0; - for (genvar i = 1; i < NUM_REQS; ++i) begin - assign unmask_higher_pri_regs[i] = unmask_higher_pri_regs[i-1] | requests[i-1]; + assign unmasked_pri_reqs[0] = 1'b0; + for (genvar i = 1; i < NUM_REQS; ++i) begin : g_unmasked_pri_reqs + assign unmasked_pri_reqs[i] = unmasked_pri_reqs[i-1] | requests[i-1]; end - assign grant_unmasked[NUM_REQS-1:0] = requests[NUM_REQS-1:0] & ~unmask_higher_pri_regs[NUM_REQS-1:0]; + wire [NUM_REQS-1:0] grant_masked = masked_reqs & ~masked_pri_reqs; + wire [NUM_REQS-1:0] grant_unmasked = requests & ~unmasked_pri_reqs; - wire no_req_masked = ~(|req_masked); - assign grant_onehot = ({NUM_REQS{no_req_masked}} & grant_unmasked) | grant_masked; + wire has_masked_reqs = (| masked_reqs); + wire has_unmasked_reqs = (| requests); + + assign grant_onehot = has_masked_reqs ? grant_masked : grant_unmasked; always @(posedge clk) begin if (reset) begin - pointer_reg <= {NUM_REQS{1'b1}}; + reqs_mask <= {NUM_REQS{1'b1}}; end else if (grant_ready) begin - if (|req_masked) begin - pointer_reg <= mask_higher_pri_regs; - end else if (|requests) begin - pointer_reg <= unmask_higher_pri_regs; - end else begin - pointer_reg <= pointer_reg; + if (has_masked_reqs) begin + reqs_mask <= masked_pri_reqs; + end else if (has_unmasked_reqs) begin + reqs_mask <= unmasked_pri_reqs; end end end - assign grant_valid = (| requests); - VX_onehot_encoder #( .N (NUM_REQS) ) onehot_encoder ( .data_in (grant_onehot), .data_out (grant_index), - `UNUSED_PIN (valid_out) + .valid_out(grant_valid) ); - end else begin + end else if (MODEL == 2) begin : g_model2 - reg [LOG_NUM_REQS-1:0] grant_index_r; - reg [NUM_REQS-1:0] grant_onehot_r; - reg [NUM_REQS-1:0] state; + reg [NUM_REQS-1:0][LOG_NUM_REQS-1:0] grant_table; + reg [LOG_NUM_REQS-1:0] state; - always @(*) begin - grant_index_r = 'x; - grant_onehot_r = 'x; - for (integer i = 0; i < NUM_REQS; ++i) begin - for (integer j = 0; j < NUM_REQS; ++j) begin 
- if (state[i] && requests[(j + 1) % NUM_REQS]) begin - grant_index_r = LOG_NUM_REQS'((j + 1) % NUM_REQS); - grant_onehot_r = '0; - grant_onehot_r[(j + 1) % NUM_REQS] = 1; + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_grant_table + always @(*) begin + grant_table[i] = 'x; + for (integer j = NUM_REQS-1; j >= 0; --j) begin + if (requests[(i+j+1) % NUM_REQS]) begin + grant_table[i] = LOG_NUM_REQS'((i+j+1) % NUM_REQS); end end end @@ -448,15 +474,24 @@ module VX_rr_arbiter #( always @(posedge clk) begin if (reset) begin - state <= '0; - end else if (grant_ready) begin - state <= grant_index_r; + state <= 0; + end else if (grant_valid && grant_ready) begin + state <= grant_index; end end - assign grant_index = grant_index_r; - assign grant_onehot = grant_onehot_r; - assign grant_valid = (| requests); + VX_demux #( + .DATAW (1), + .N (NUM_REQS) + ) grant_decoder ( + .sel_in (grant_index), + .data_in (grant_valid), + .data_out (grant_onehot) + ); + + assign grant_index = grant_table[state]; + assign grant_valid = (| requests); + end endmodule diff --git a/hw/rtl/libs/VX_scan.sv b/hw/rtl/libs/VX_scan.sv index f263dd218..6effd5814 100644 --- a/hw/rtl/libs/VX_scan.sv +++ b/hw/rtl/libs/VX_scan.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,8 +19,8 @@ `TRACING_OFF module VX_scan #( parameter N = 1, - parameter OP = 0, // 0: XOR, 1: AND, 2: OR - parameter REVERSE = 0 // 0: LO->HI, 1: HI->LO + parameter `STRING OP = "^", // ^: XOR, &: AND, |: OR + parameter REVERSE = 0 // 0: LO->HI, 1: HI->LO ) ( input wire [N-1:0] data_in, output wire [N-1:0] data_out @@ -28,48 +28,48 @@ module VX_scan #( localparam LOGN = `CLOG2(N); `IGNORE_UNOPTFLAT_BEGIN - wire [LOGN:0][N-1:0] t; + wire [LOGN:0][N-1:0] t; `IGNORE_UNOPTFLAT_END // reverses bits - if (REVERSE != 0) begin + if (REVERSE != 0) begin : g_data_in_reverse assign t[0] = data_in; - end else begin + end else begin : g_data_in_no_reverse assign t[0] = {<<{data_in}}; end // optimize for the common case of small and-scans - if ((N == 2) && (OP == 1)) begin + if ((N == 2) && (OP == "&")) begin : g_scan_n2_and assign t[LOGN] = {t[0][1], &t[0][1:0]}; - end else if ((N == 3) && (OP == 1)) begin + end else if ((N == 3) && (OP == "&")) begin : g_scan_n3_and assign t[LOGN] = {t[0][2], &t[0][2:1], &t[0][2:0]}; - end else if ((N == 4) && (OP == 1)) begin + end else if ((N == 4) && (OP == "&")) begin : g_scan_n4_and assign t[LOGN] = {t[0][3], &t[0][3:2], &t[0][3:1], &t[0][3:0]}; - end else begin + end else begin : g_scan // general case wire [N-1:0] fill; - for (genvar i = 0; i < LOGN; ++i) begin + for (genvar i = 0; i < LOGN; ++i) begin : g_i wire [N-1:0] shifted = N'({fill, t[i]} >> (1< 1) begin + if (N > 1) begin : g_switch reg req_out_r [N]; reg rsp_out_r; @@ -34,7 +34,7 @@ module VX_scope_switch #( req_out_r[i] <= 0; end rsp_out_r <= 0; - end else begin + end else begin for (integer i = 0; i < N; ++i) begin req_out_r[i] <= req_in; end @@ -46,10 +46,13 @@ module VX_scope_switch #( end end - assign req_out = req_out_r; + for (genvar i = 0; i < N; ++i) begin : g_req_out + assign req_out[i] = req_out_r[i]; + end + assign rsp_out = rsp_out_r; - - end else begin + + end else begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) diff --git 
a/hw/rtl/libs/VX_scope_tap.sv b/hw/rtl/libs/VX_scope_tap.sv index c5ba778a2..6c0914b0c 100644 --- a/hw/rtl/libs/VX_scope_tap.sv +++ b/hw/rtl/libs/VX_scope_tap.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -14,300 +14,406 @@ `include "VX_platform.vh" `TRACING_OFF -module VX_scope_tap #( +module VX_scope_tap #( parameter SCOPE_ID = 0, // scope identifier parameter SCOPE_IDW = 8, // scope identifier width - parameter TRIGGERW = 0, // trigger signals width - parameter PROBEW = 0, // probe signal width - parameter SIZE = 256, // trace buffer size - parameter IDLE_CTRW = 16 // idle time between triggers counter width -) ( + parameter XTRIGGERW = 0, // changed trigger signals width + parameter HTRIGGERW = 0, // high trigger signals width + parameter PROBEW = 1, // probe signal width + parameter DEPTH = 256, // trace buffer depth + parameter IDLE_CTRW = 32, // idle time between triggers counter width + parameter TX_DATAW = 64 // transfer data width +) ( input wire clk, input wire reset, input wire start, input wire stop, - input wire [TRIGGERW-1:0] triggers, + input wire [`UP(XTRIGGERW)-1:0] xtriggers, + input wire [`UP(HTRIGGERW)-1:0] htriggers, input wire [PROBEW-1:0] probes, input wire bus_in, - output wire bus_out + output wire bus_out ); - localparam TX_DATAW = 64; - localparam TX_DATA_BITS = `LOG2UP(TX_DATAW); - localparam DATAW = PROBEW + TRIGGERW; - localparam DATA_BITS = `LOG2UP(DATAW); - localparam ADDRW = `CLOG2(SIZE); - localparam TRIGGER_ENABLE = (TRIGGERW != 0); - localparam MAX_IDLE_CTR = (2 ** IDLE_CTRW) - 
1; - - localparam CTRL_STATE_IDLE = 2'd0; - localparam CTRL_STATE_RECV = 2'd1; - localparam CTRL_STATE_CMD = 2'd2; - localparam CTRL_STATE_SEND = 2'd3; - localparam CTRL_STATE_BITS = 2; + localparam HAS_TRIGGERS = XTRIGGERW != 0 || HTRIGGERW != 0; + localparam CTR_WIDTH = 64; + localparam SER_CTR_WIDTH = `LOG2UP(TX_DATAW); + localparam DATAW = PROBEW + XTRIGGERW + HTRIGGERW; + localparam ADDRW = `CLOG2(DEPTH); + localparam SIZEW = `CLOG2(DEPTH+1); + localparam MAX_IDLE_CTR = (2 ** IDLE_CTRW) - 1; + localparam DATA_BLOCKS = `CDIV(DATAW, TX_DATAW); + localparam BLOCK_IDX_WIDTH = `LOG2UP(DATA_BLOCKS); - localparam TAP_STATE_IDLE = 2'd0; - localparam TAP_STATE_WAIT = 2'd1; - localparam TAP_STATE_RUN = 2'd2; - localparam TAP_STATE_BITS = 2; + localparam CTRL_STATE_IDLE = 2'd0; + localparam CTRL_STATE_RECV = 2'd1; + localparam CTRL_STATE_CMD = 2'd2; + localparam CTRL_STATE_SEND = 2'd3; + localparam CTRL_STATE_BITS = 2; - localparam CMD_GET_WIDTH = 3'd0; - localparam CMD_GET_COUNT = 3'd1; - localparam CMD_GET_START = 3'd2; - localparam CMD_GET_DATA = 3'd3; - localparam CMD_SET_START = 3'd4; - localparam CMD_SET_STOP = 3'd5; - localparam CMD_TYPE_BITS = 3; + localparam TAP_STATE_IDLE = 2'd0; + localparam TAP_STATE_RUN = 2'd1; + localparam TAP_STATE_DONE = 2'd2; + localparam TAP_STATE_BITS = 2; - localparam GET_TYPE_WIDTH = 2'd0; - localparam GET_TYPE_COUNT = 2'd1; - localparam GET_TYPE_START = 2'd2; - localparam GET_TYPE_DATA = 2'd3; - localparam GET_TYPE_BITS = 2; + localparam CMD_GET_WIDTH = 3'd0; + localparam CMD_GET_COUNT = 3'd1; + localparam CMD_GET_START = 3'd2; + localparam CMD_GET_DATA = 3'd3; + localparam CMD_SET_START = 3'd4; + localparam CMD_SET_STOP = 3'd5; + localparam CMD_SET_DEPTH = 3'd6; + localparam CMD_TYPE_BITS = 3; - `NO_RW_RAM_CHECK reg [DATAW-1:0] data_store [SIZE-1:0]; - `NO_RW_RAM_CHECK reg [IDLE_CTRW-1:0] delta_store [SIZE-1:0]; + localparam SEND_TYPE_WIDTH = 2'd0; + localparam SEND_TYPE_COUNT = 2'd1; + localparam SEND_TYPE_START = 2'd2; + 
localparam SEND_TYPE_DATA = 2'd3; + localparam SEND_TYPE_BITS = 2; - reg [TRIGGERW-1:0] prev_triggers; - reg [IDLE_CTRW-1:0] delta; - reg [63:0] timestamp, start_time; - - reg [ADDRW-1:0] waddr, waddr_end; - - reg cmd_start, delta_flush; - - reg [63:0] start_delay, delay_cntr; + `STATIC_ASSERT ((IDLE_CTRW <= TX_DATAW), ("invalid parameter")) + `STATIC_ASSERT(`IS_POW2(DEPTH), ("depth must be a power of 2!")) reg [TAP_STATE_BITS-1:0] tap_state; reg [CTRL_STATE_BITS-1:0] ctrl_state; - reg [GET_TYPE_BITS-1:0] get_type; - - reg [TX_DATA_BITS-1:0] ser_tx_ctr; - reg [DATA_BITS-1:0] read_offset; + reg [SEND_TYPE_BITS-1:0] send_type; + + reg [CTR_WIDTH-1:0] timestamp, start_time; + reg [CTR_WIDTH-1:0] start_delay, stop_delay; + reg [`UP(XTRIGGERW)-1:0] prev_xtrig; + reg [`UP(HTRIGGERW)-1:0] prev_htrig; + reg [IDLE_CTRW-1:0] delta; + reg cmd_start, cmd_stop; + reg dflush; + + reg [SIZEW-1:0] waddr, waddr_end; + wire [DATAW-1:0] data_in; + + wire [DATAW-1:0] data_value; + wire [IDLE_CTRW-1:0] delta_value; reg [ADDRW-1:0] raddr; - reg read_data; // // trace capture // - wire [ADDRW-1:0] raddr_n = raddr + 1; + wire do_capture; - wire [ADDRW:0] count = (ADDRW+1)'(waddr) + 1; + wire write_en = (tap_state == TAP_STATE_RUN) && do_capture; + + if (HAS_TRIGGERS) begin : g_delta_store + if (XTRIGGERW != 0 && HTRIGGERW != 0) begin : g_data_in_pxh + assign data_in = {probes, xtriggers, htriggers}; + end else if (XTRIGGERW != 0) begin : g_data_in_px + assign data_in = {probes, xtriggers}; + end else begin : g_data_in_ph + assign data_in = {probes, htriggers}; + end + assign do_capture = dflush || (xtriggers != prev_xtrig) || (htriggers != prev_htrig) || (htriggers != '0); + VX_dp_ram #( + .DATAW (IDLE_CTRW), + .SIZE (DEPTH), + .OUT_REG (1), + .RDW_MODE ("R") + ) delta_store ( + .clk (clk), + .reset (reset), + .read (1'b1), + .wren (1'b1), + .write (write_en), + .waddr (waddr[ADDRW-1:0]), + .wdata (delta), + .raddr (raddr), + .rdata (delta_value) + ); + end else begin : g_no_delta_store + 
assign data_in = probes; + assign delta_value = '0; + assign do_capture = 1; + end + + VX_dp_ram #( + .DATAW (DATAW), + .SIZE (DEPTH), + .OUT_REG (1), + .RDW_MODE ("R") + ) data_store ( + .clk (clk), + .reset (reset), + .read (1'b1), + .wren (1'b1), + .write (write_en), + .waddr (waddr[ADDRW-1:0]), + .wdata (data_in), + .raddr (raddr), + .rdata (data_value) + ); always @(posedge clk) begin if (reset) begin - tap_state <= TAP_STATE_IDLE; - raddr <= '0; - waddr <= '0; - delta <= '0; - prev_triggers <= '0; - read_offset <= '0; - read_data <= 0; - timestamp <= '0; + timestamp <= '0; end else begin - timestamp <= timestamp + 1; + timestamp <= timestamp + CTR_WIDTH'(1); + end + end + always @(posedge clk) begin + if (reset) begin + tap_state <= TAP_STATE_IDLE; + delta <= '0; + dflush <= 0; + prev_xtrig <= '0; + prev_htrig <= '0; + waddr <= '0; + end else begin case (tap_state) TAP_STATE_IDLE: begin - if (start || cmd_start) begin - delta <= '0; - delta_flush <= 1; - if (0 == start_delay) begin - tap_state <= TAP_STATE_RUN; - start_time <= timestamp; - `ifdef DBG_TRACE_SCOPE - `TRACE(2, ("%d: *** scope #%0d: recording start - time=%0d\n", $time, SCOPE_ID, timestamp)); - `endif - end else begin - tap_state <= TAP_STATE_WAIT; - delay_cntr <= start_delay; - `ifdef DBG_TRACE_SCOPE - `TRACE(2, ("%d: *** scope #%0d: delayed start - time=%0d\n", $time, SCOPE_ID, start_delay)); - `endif - end - end - end - TAP_STATE_WAIT: begin - delay_cntr <= delay_cntr - 1; - if (1 == delay_cntr) begin + if (start || cmd_start) begin + dflush <= 1; tap_state <= TAP_STATE_RUN; start_time <= timestamp; `ifdef DBG_TRACE_SCOPE - `TRACE(2, ("%d: *** scope #%0d: recording start - time=%0d\n", $time, SCOPE_ID, timestamp)); + `TRACE(2, ("%t: scope_tap%0d: recording start - time=%0d\n", $time, SCOPE_ID, timestamp)) `endif end end TAP_STATE_RUN: begin - if (TRIGGER_ENABLE != 0) begin - if (delta_flush || (triggers != prev_triggers)) begin - data_store[waddr] <= {probes, triggers}; - delta_store[waddr] <= 
delta; - waddr <= waddr + 1; - delta <= '0; - delta_flush <= 0; - end else begin - delta <= delta + 1; - delta_flush <= (delta == (MAX_IDLE_CTR-1)); + dflush <= 0; + if (!(stop || cmd_stop) && (waddr < waddr_end)) begin + if (do_capture) begin + waddr <= waddr + SIZEW'(1); end - prev_triggers <= triggers; - end else begin - data_store[waddr] <= {probes, triggers}; - delta_store[waddr] <= '0; - waddr <= waddr + 1; - end - if (stop || (waddr >= waddr_end)) begin - waddr <= waddr; + if (HAS_TRIGGERS) begin + if (do_capture) begin + delta <= '0; + end else begin + delta <= delta + IDLE_CTRW'(1); + dflush <= (delta == IDLE_CTRW'(MAX_IDLE_CTR-1)); + end + prev_xtrig <= xtriggers; + prev_htrig <= htriggers; + end + end else begin + tap_state <= TAP_STATE_DONE; `ifdef DBG_TRACE_SCOPE - `TRACE(2, ("%d: *** scope #%0d: recording stop - waddr=(%0d, %0d)\n", $time, SCOPE_ID, waddr, waddr_end)); + `TRACE(2, ("%t: scope_tap%0d: recording stop - waddr=(%0d, %0d)\n", $time, SCOPE_ID, waddr, waddr_end)) `endif - tap_state <= TAP_STATE_IDLE; end end default:; endcase - - if (ctrl_state == CTRL_STATE_SEND - && get_type == GET_TYPE_DATA - && ser_tx_ctr == 0) begin - if (~read_data) begin - read_data <= 1; - end else begin - if (DATAW > TX_DATAW) begin - `IGNORE_WARNINGS_BEGIN - if (read_offset < DATA_BITS'(DATAW-TX_DATAW)) begin - read_offset <= read_offset + DATA_BITS'(TX_DATAW); - end else begin - raddr <= raddr_n; - read_data <= 0; - read_offset <= '0; - end - `IGNORE_WARNINGS_END - end else begin - raddr <= raddr_n; - read_data <= 0; - end - if (raddr_n == waddr) begin - raddr <= 0; - end - end - end end end // - // command controller + // trace controller // - + reg bus_out_r; - + reg [TX_DATAW-1:0] ser_buf_in; wire [TX_DATAW-1:0] ser_buf_in_n = {ser_buf_in[TX_DATAW-2:0], bus_in}; `UNUSED_VAR (ser_buf_in) + wire [DATA_BLOCKS-1:0][TX_DATAW-1:0] data_blocks; + logic [BLOCK_IDX_WIDTH-1:0] data_block_idx; + reg [SER_CTR_WIDTH-1:0] ser_tx_ctr; + reg is_read_data; + reg is_get_data; + 
wire [CMD_TYPE_BITS-1:0] cmd_type = ser_buf_in[CMD_TYPE_BITS-1:0]; wire [SCOPE_IDW-1:0] cmd_scope_id = ser_buf_in_n[CMD_TYPE_BITS +: SCOPE_IDW]; wire [TX_DATAW-CMD_TYPE_BITS-SCOPE_IDW-1:0] cmd_data = ser_buf_in[TX_DATAW-1:CMD_TYPE_BITS+SCOPE_IDW]; - wire [TX_DATAW-1:0] data_chunk = TX_DATAW'(DATAW'(data_store[raddr] >> read_offset)); - wire [TX_DATAW-1:0] get_data = read_data ? data_chunk : TX_DATAW'(delta_store[raddr]); - + for (genvar i = 0; i < DATA_BLOCKS; ++i) begin : g_data_blocks + for (genvar j = 0; j < TX_DATAW; ++j) begin : g_j + localparam k = i * TX_DATAW + j; + if (k < DATAW) begin : g_valid + assign data_blocks[i][j] = data_value[k]; + end else begin : g_padding + assign data_blocks[i][j] = '0; + end + end + end + + if (DATA_BLOCKS > 1) begin : g_data_block_idx + always @(posedge clk) begin + if (reset) begin + data_block_idx <= '0; + end else if ((ctrl_state == CTRL_STATE_SEND) + && (send_type == SEND_TYPE_DATA) + && (ser_tx_ctr == 0) + && is_read_data) begin + if (data_block_idx < BLOCK_IDX_WIDTH'(DATA_BLOCKS-1)) begin + data_block_idx <= data_block_idx + BLOCK_IDX_WIDTH'(1); + end else begin + data_block_idx <= '0; + end + end + end + end else begin : g_data_block_idx_0 + assign data_block_idx = 0; + end + always @(posedge clk) begin if (reset) begin ctrl_state <= CTRL_STATE_IDLE; + send_type <= SEND_TYPE_BITS'(SEND_TYPE_WIDTH); + waddr_end <= SIZEW'(DEPTH); cmd_start <= 0; + cmd_stop <= 0; start_delay <= '0; - waddr_end <= ADDRW'(SIZE-1); - bus_out_r <= 0; + stop_delay <= '0; + bus_out_r <= 0; + raddr <= '0; + is_read_data<= 0; + ser_tx_ctr <= '0; + is_get_data <= 0; end else begin - bus_out_r <= 0; - cmd_start <= 0; + bus_out_r <= 0; + is_get_data <= 0; + + if (start_delay != 0) begin + start_delay <= start_delay - CTR_WIDTH'(1); + end + + if (stop_delay != 0) begin + stop_delay <= stop_delay - CTR_WIDTH'(1); + end + + cmd_start <= (start_delay == CTR_WIDTH'(1)); + cmd_stop <= (stop_delay == CTR_WIDTH'(1)); case (ctrl_state) CTRL_STATE_IDLE: 
begin if (bus_in) begin + ser_tx_ctr <= SER_CTR_WIDTH'(TX_DATAW-1); ctrl_state <= CTRL_STATE_RECV; end - ser_tx_ctr <= TX_DATA_BITS'(TX_DATAW-1); end CTRL_STATE_RECV: begin - ser_tx_ctr <= ser_tx_ctr - 1; + ser_tx_ctr <= ser_tx_ctr - SER_CTR_WIDTH'(1); ser_buf_in <= ser_buf_in_n; if (ser_tx_ctr == 0) begin + // check if command is for this scope ctrl_state <= (cmd_scope_id == SCOPE_ID) ? CTRL_STATE_CMD : CTRL_STATE_IDLE; end end - CTRL_STATE_CMD: begin + CTRL_STATE_CMD: begin ctrl_state <= CTRL_STATE_IDLE; - case (cmd_type) + case (cmd_type) CMD_SET_START: begin - start_delay <= 64'(cmd_data); - cmd_start <= 1; + start_delay <= CTR_WIDTH'(cmd_data); + cmd_start <= (cmd_data == 0); end CMD_SET_STOP: begin - waddr_end <= ADDRW'(cmd_data); + stop_delay <= CTR_WIDTH'(cmd_data); + cmd_stop <= (cmd_data == 0); + end + CMD_SET_DEPTH: begin + waddr_end <= SIZEW'(cmd_data); end CMD_GET_WIDTH, CMD_GET_START, CMD_GET_COUNT, - CMD_GET_DATA: begin - ctrl_state <= CTRL_STATE_SEND; - get_type <= GET_TYPE_BITS'(cmd_type); - ser_tx_ctr <= TX_DATA_BITS'(TX_DATAW-1); + CMD_GET_DATA: begin + send_type <= SEND_TYPE_BITS'(cmd_type); + ser_tx_ctr <= SER_CTR_WIDTH'(TX_DATAW-1); + ctrl_state <= CTRL_STATE_SEND; bus_out_r <= 1; end default:; - endcase + endcase `ifdef DBG_TRACE_SCOPE - `TRACE(2, ("%d: *** scope #%0d: CMD: type=%0d\n", $time, SCOPE_ID, cmd_type)); + `TRACE(2, ("%t: scope_tap%0d: CMD: type=%0d\n", $time, SCOPE_ID, cmd_type)) `endif end CTRL_STATE_SEND: begin - ser_tx_ctr <= ser_tx_ctr - 1; - case (get_type) - GET_TYPE_WIDTH: begin + case (send_type) + SEND_TYPE_WIDTH: begin bus_out_r <= 1'(DATAW >> ser_tx_ctr); `ifdef DBG_TRACE_SCOPE if (ser_tx_ctr == 0) begin - `TRACE(2, ("%d: *** scope #%0d: SEND width=%0d\n", $time, SCOPE_ID, DATAW)); - end - `endif + `TRACE(2, ("%t: scope_tap%0d: SEND width=%0d\n", $time, SCOPE_ID, DATAW)) + end + `endif end - GET_TYPE_COUNT: begin - bus_out_r <= 1'(count >> ser_tx_ctr); + SEND_TYPE_COUNT: begin + bus_out_r <= 1'(waddr >> ser_tx_ctr); 
`ifdef DBG_TRACE_SCOPE if (ser_tx_ctr == 0) begin - `TRACE(2, ("%d: *** scope #%0d: SEND count=%0d\n", $time, SCOPE_ID, count)); - end - `endif + `TRACE(2, ("%t: scope_tap%0d: SEND count=%0d\n", $time, SCOPE_ID, waddr)) + end + `endif end - GET_TYPE_START: begin - bus_out_r <= 1'(start_time >> ser_tx_ctr); + SEND_TYPE_START: begin + bus_out_r <= 1'(start_time >> ser_tx_ctr); `ifdef DBG_TRACE_SCOPE if (ser_tx_ctr == 0) begin - `TRACE(2, ("%d: *** scope #%0d: SEND start=%0d\n", $time, SCOPE_ID, start_time)); - end - `endif + `TRACE(2, ("%t: scope_tap%0d: SEND start=%0d\n", $time, SCOPE_ID, start_time)) + end + `endif end - GET_TYPE_DATA: begin - bus_out_r <= 1'(get_data >> ser_tx_ctr); + SEND_TYPE_DATA: begin + is_get_data <= 1; + if (ser_tx_ctr == 0) begin + if (is_read_data) begin + if (data_block_idx == BLOCK_IDX_WIDTH'(DATA_BLOCKS-1)) begin + raddr <= raddr + ADDRW'(1); + is_read_data <= 0; // switch to delta mode + end + end else begin + is_read_data <= 1; // switch to data mode + end + end `ifdef DBG_TRACE_SCOPE if (ser_tx_ctr == 0) begin - `TRACE(2, ("%d: *** scope #%0d: SEND data=%0d\n", $time, SCOPE_ID, get_data)); - end - `endif + if (is_read_data) begin + `TRACE(2, ("%t: scope_tap%0d: SEND data=0x%0h\n", $time, SCOPE_ID, get_data)) + end else begin + `TRACE(2, ("%t: scope_tap%0d: SEND delta=0x%0h\n", $time, SCOPE_ID, get_data)) + end + end + `endif end default:; endcase + ser_tx_ctr <= ser_tx_ctr - SER_CTR_WIDTH'(1); if (ser_tx_ctr == 0) begin ctrl_state <= CTRL_STATE_IDLE; - end + end end default:; endcase - end + end end - assign bus_out = bus_out_r; + wire [BLOCK_IDX_WIDTH-1:0] data_block_idx_r; + wire [SER_CTR_WIDTH-1:0] ser_tx_ctr_r; + wire is_read_data_r; + + VX_pipe_register #( + .DATAW (1 + SER_CTR_WIDTH + BLOCK_IDX_WIDTH) + ) data_sel_buf ( + .clk (clk), + .reset (reset), + .enable (1'b1), + .data_in ({is_read_data, ser_tx_ctr, data_block_idx}), + .data_out ({is_read_data_r, ser_tx_ctr_r, data_block_idx_r}) + ); + + wire [TX_DATAW-1:0] get_data = 
is_read_data_r ? data_blocks[data_block_idx_r] : TX_DATAW'(delta_value); + wire bus_out_w = is_get_data ? get_data[ser_tx_ctr_r] : bus_out_r; + + VX_pipe_register #( + .DATAW (1), + .DEPTH (1) + ) buf_out ( + .clk (clk), + .reset (reset), + .enable (1'b1), + .data_in (bus_out_w), + .data_out (bus_out) + ); endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_serial_div.sv b/hw/rtl/libs/VX_serial_div.sv index e7af40009..593be2d9a 100644 --- a/hw/rtl/libs/VX_serial_div.sv +++ b/hw/rtl/libs/VX_serial_div.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,7 +29,7 @@ module VX_serial_div #( input wire is_signed, input wire [LANES-1:0][WIDTHN-1:0] numer, - input wire [LANES-1:0][WIDTHD-1:0] denom, + input wire [LANES-1:0][WIDTHD-1:0] denom, output wire [LANES-1:0][WIDTHQ-1:0] quotient, output wire [LANES-1:0][WIDTHR-1:0] remainder @@ -49,14 +49,14 @@ module VX_serial_div #( reg [CNTRW-1:0] cntr; reg busy_r; - for (genvar i = 0; i < LANES; ++i) begin + for (genvar i = 0; i < LANES; ++i) begin : g_setup wire negate_numer = is_signed && numer[i][WIDTHN-1]; wire negate_denom = is_signed && denom[i][WIDTHD-1]; assign numer_qual[i] = negate_numer ? -$signed(numer[i]) : numer[i]; assign denom_qual[i] = negate_denom ? 
-$signed(denom[i]) : denom[i]; assign sub_result[i] = working[i][WIDTHN + MIN_ND : WIDTHN] - denom_r[i]; end - + always @(posedge clk) begin if (reset) begin busy_r <= 0; @@ -74,18 +74,21 @@ module VX_serial_div #( end end - for (genvar i = 0; i < LANES; ++i) begin + for (genvar i = 0; i < LANES; ++i) begin : g_div always @(posedge clk) begin if (strobe) begin working[i] <= {{WIDTHD{1'b0}}, numer_qual[i], 1'b0}; denom_r[i] <= denom_qual[i]; inv_quot[i] <= (denom[i] != 0) && is_signed && (numer[i][31] ^ denom[i][31]); inv_rem[i] <= is_signed && numer[i][31]; - end else if (busy_r) begin + end else if (busy_r) begin working[i] <= sub_result[i][WIDTHD] ? {working[i][WIDTHN+MIN_ND-1:0], 1'b0} : {sub_result[i][WIDTHD-1:0], working[i][WIDTHN-1:0], 1'b1}; end end + end + + for (genvar i = 0; i < LANES; ++i) begin : g_output wire [WIDTHQ-1:0] q = working[i][WIDTHQ-1:0]; wire [WIDTHR-1:0] r = working[i][WIDTHN+WIDTHR:WIDTHN+1]; assign quotient[i] = inv_quot[i] ? -$signed(q) : q; diff --git a/hw/rtl/libs/VX_serial_mul.sv b/hw/rtl/libs/VX_serial_mul.sv index 9369dfd10..d847b7111 100644 --- a/hw/rtl/libs/VX_serial_mul.sv +++ b/hw/rtl/libs/VX_serial_mul.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,7 +13,7 @@ `include "VX_platform.vh" -// Iterative integer multiplier +// Iterative integer multiplier // An adaptation of ZipCPU algorithm for a multi-lane elastic architecture. 
// https://zipcpu.com/zipcpu/2021/07/03/slowmpy.html @@ -65,7 +65,7 @@ module VX_serial_mul #( end end - for (genvar i = 0; i < LANES; ++i) begin + for (genvar i = 0; i < LANES; ++i) begin : g_mul wire [X_WIDTH-1:0] axb = b[i][0] ? a[i] : '0; always @(posedge clk) begin @@ -73,12 +73,12 @@ module VX_serial_mul #( if (SIGNED) begin a[i] <= X_WIDTH'($signed(dataa[i])); b[i] <= Y_WIDTH'($signed(datab[i])); - end else begin + end else begin a[i] <= dataa[i]; b[i] <= datab[i]; end p[i] <= 0; - end else if (busy_r) begin + end else if (busy_r) begin b[i] <= (b[i] >> 1); p[i][Y_WIDTH-2:0] <= p[i][Y_WIDTH-1:1]; if (SIGNED) begin @@ -93,9 +93,9 @@ module VX_serial_mul #( end end - if (SIGNED) begin + if (SIGNED) begin : g_signed assign result[i] = R_WIDTH'(p[i][P_WIDTH-1:0] + {1'b1, {(X_WIDTH-2){1'b0}}, 1'b1, {(Y_WIDTH){1'b0}}}); - end else begin + end else begin : g_unsigned assign result[i] = R_WIDTH'(p[i]); end end diff --git a/hw/rtl/libs/VX_shift_register.sv b/hw/rtl/libs/VX_shift_register.sv index 56726d2cb..b4809fe90 100644 --- a/hw/rtl/libs/VX_shift_register.sv +++ b/hw/rtl/libs/VX_shift_register.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -14,13 +14,13 @@ `include "VX_platform.vh" `TRACING_OFF -module VX_shift_register #( +module VX_shift_register #( parameter DATAW = 1, parameter RESETW = 0, parameter DEPTH = 1, - parameter NUM_TAPS = 1, + parameter NUM_TAPS = 1, parameter TAP_START = 0, - parameter TAP_STRIDE = 1 + parameter TAP_STRIDE = 1 ) ( input wire clk, input wire reset, @@ -28,7 +28,7 @@ module VX_shift_register #( input wire [DATAW-1:0] data_in, output wire [NUM_TAPS-1:0][DATAW-1:0] data_out ); - if (DEPTH != 0) begin + if (DEPTH != 0) begin : g_shift_register reg [DEPTH-1:0][DATAW-1:0] entries; always @(posedge clk) begin @@ -36,7 +36,7 @@ module VX_shift_register #( if ((i >= (DATAW-RESETW)) && reset) begin for (integer j = 0; j < DEPTH; ++j) entries[j][i] <= 0; - end else if (enable) begin + end else if (enable) begin for (integer j = 1; j < DEPTH; ++j) entries[j-1][i] <= entries[j][i]; entries[DEPTH-1][i] <= data_in[i]; @@ -44,10 +44,10 @@ module VX_shift_register #( end end - for (genvar i = 0; i < NUM_TAPS; ++i) begin + for (genvar i = 0; i < NUM_TAPS; ++i) begin : g_data_out assign data_out[i] = entries[i * TAP_STRIDE + TAP_START]; end - end else begin + end else begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) `UNUSED_VAR (enable) diff --git a/hw/rtl/libs/VX_skid_buffer.sv b/hw/rtl/libs/VX_skid_buffer.sv index 53c213622..b77cce2a4 100644 --- a/hw/rtl/libs/VX_skid_buffer.sv +++ b/hw/rtl/libs/VX_skid_buffer.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,19 +19,19 @@ module VX_skid_buffer #( parameter PASSTHRU = 0, parameter HALF_BW = 0, parameter OUT_REG = 0 -) ( +) ( input wire clk, input wire reset, - + input wire valid_in, - output wire ready_in, + output wire ready_in, input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out, input wire ready_out, output wire valid_out ); - if (PASSTHRU != 0) begin + if (PASSTHRU != 0) begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) @@ -40,7 +40,7 @@ module VX_skid_buffer #( assign data_out = data_in; assign ready_in = ready_out; - end else if (HALF_BW != 0) begin + end else if (HALF_BW != 0) begin : g_half_bw VX_toggle_buffer #( .DATAW (DATAW) @@ -55,7 +55,7 @@ module VX_skid_buffer #( .ready_out (ready_out) ); - end else begin + end else begin : g_full_bw VX_stream_buffer #( .DATAW (DATAW), diff --git a/hw/rtl/libs/VX_sp_ram.sv b/hw/rtl/libs/VX_sp_ram.sv index 4ab2a9b7a..bce7d00da 100644 --- a/hw/rtl/libs/VX_sp_ram.sv +++ b/hw/rtl/libs/VX_sp_ram.sv @@ -13,16 +13,67 @@ `include "VX_platform.vh" +`define RAM_INITIALIZATION \ + if (INIT_ENABLE != 0) begin : g_init \ + if (INIT_FILE != "") begin : g_file \ + initial $readmemh(INIT_FILE, ram); \ + end else begin : g_value \ + initial begin \ + for (integer i = 0; i < SIZE; ++i) begin : g_i \ + ram[i] = INIT_VALUE; \ + end \ + end \ + end \ + end + +`ifdef SIMULATION + `define RAM_RESET_BLOCK if (RESET_RAM && reset) begin \ + for (integer i = 0; i < SIZE; ++i) begin \ + ram[i] <= DATAW'(INIT_VALUE); \ + end \ + end else +`else + `define RAM_RESET_BLOCK +`endif + +`define RAM_WRITE_ALL `RAM_RESET_BLOCK \ + if (write) begin \ + ram[addr] <= wdata; \ + end + +`ifdef QUARTUS + `define RAM_ARRAY_WREN reg [WRENW-1:0][WSELW-1:0] ram [0:SIZE-1]; + `define RAM_WRITE_WREN `RAM_RESET_BLOCK \ + if (write) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[addr][i] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end +`else + `define RAM_ARRAY_WREN reg [DATAW-1:0] ram [0:SIZE-1]; + 
`define RAM_WRITE_WREN `RAM_RESET_BLOCK \ + if (write) begin \ + for (integer i = 0; i < WRENW; ++i) begin \ + if (wren[i]) begin \ + ram[addr][i * WSELW +: WSELW] <= wdata[i * WSELW +: WSELW]; \ + end \ + end \ + end +`endif + `TRACING_OFF module VX_sp_ram #( parameter DATAW = 1, parameter SIZE = 1, - parameter ADDR_MIN = 0, parameter WRENW = 1, parameter OUT_REG = 0, - parameter NO_RWCHECK = 0, - parameter RW_ASSERT = 0, parameter LUTRAM = 0, + parameter `STRING RDW_MODE = "W", // W: write-first, R: read-first, N: no-change + parameter RADDR_REG = 0, // read address registered hint + parameter RADDR_RESET = 0, // read address has reset + parameter RDW_ASSERT = 0, parameter RESET_RAM = 0, parameter INIT_ENABLE = 0, parameter INIT_FILE = "", @@ -38,31 +89,334 @@ module VX_sp_ram #( input wire [DATAW-1:0] wdata, output wire [DATAW-1:0] rdata ); - VX_dp_ram #( - .DATAW (DATAW), - .SIZE (SIZE), - .ADDR_MIN (ADDR_MIN), - .WRENW (WRENW), - .OUT_REG (OUT_REG), - .NO_RWCHECK (NO_RWCHECK), - .RW_ASSERT (RW_ASSERT), - .LUTRAM (LUTRAM), - .RESET_RAM (RESET_RAM), - .INIT_ENABLE (INIT_ENABLE), - .INIT_FILE (INIT_FILE), - .INIT_VALUE (INIT_VALUE), - .ADDRW (ADDRW) - ) dp_ram ( - .clk (clk), - .reset (reset), - .read (read), - .write (write), - .wren (wren), - .waddr (addr), - .wdata (wdata), - .raddr (addr), - .rdata (rdata) - ); + localparam WSELW = DATAW / WRENW; + `UNUSED_PARAM (LUTRAM) + `UNUSED_PARAM (RADDR_REG) + `UNUSED_PARAM (RADDR_RESET) + + `STATIC_ASSERT(!(WRENW * WSELW != DATAW), ("invalid parameter")) + `STATIC_ASSERT((RDW_MODE == "R" || RDW_MODE == "W" || RDW_MODE == "N"), ("invalid parameter")) + `UNUSED_PARAM (RDW_ASSERT) + +`ifdef SYNTHESIS + localparam FORCE_BRAM = !LUTRAM && `FORCE_BRAM(SIZE, DATAW); + if (OUT_REG) begin : g_sync + if (FORCE_BRAM) begin : g_bram + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] addr_r; + always @(posedge clk) 
begin + `RAM_WRITE_WREN + if (read) begin + addr_r <= addr; + end + end + assign rdata = ram[addr_r]; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + if (read) begin + if (write) begin + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "N") begin : g_no_change + if (WRENW != 1) begin : g_wren + `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + else if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + else if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end + end else begin : g_auto + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [ADDRW-1:0] addr_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + if (read) begin + addr_r <= addr; + end + end + assign rdata = ram[addr_r]; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + 
`RAM_WRITE_ALL + if (read) begin + if (write) begin + rdata_r <= wdata; + end else begin + rdata_r <= ram[addr]; + end + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "R") begin : g_read_first + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end else if (RDW_MODE == "N") begin : g_no_change + if (WRENW != 1) begin : g_wren + `RAM_ARRAY_WREN + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_WREN + else if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else begin : g_no_wren + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + `RAM_WRITE_ALL + else if (read) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end + end + end else begin : g_async + `UNUSED_VAR (read) + if (FORCE_BRAM) begin : g_bram + `ifdef ASYNC_BRAM_PATCH + VX_async_ram_patch #( + .DATAW (DATAW), + .SIZE (SIZE), + .WRENW (WRENW), + .DUAL_PORT (0), + .FORCE_BRAM (FORCE_BRAM), + .RADDR_REG (RADDR_REG), + .RADDR_RESET(RADDR_RESET), + .WRITE_FIRST(RDW_MODE == "W"), + .INIT_ENABLE(INIT_ENABLE), + .INIT_FILE (INIT_FILE), + .INIT_VALUE (INIT_VALUE) + ) async_ram_patch ( + .clk (clk), + .reset (reset), + .read (read), + .write (write), + .wren (wren), + .waddr (addr), + .wdata (wdata), + .raddr (addr), + .rdata (rdata) + ); + `else + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin 
+ `RAM_WRITE_WREN + end + assign rdata = ram[addr]; + end else begin : g_no_wren + `RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL + end + assign rdata = ram[addr]; + end + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_WREN + end + assign rdata = ram[addr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK `USE_BLOCK_BRAM reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL + end + assign rdata = ram[addr]; + end + end + `endif + end else begin : g_auto + if (RDW_MODE == "W") begin : g_write_first + if (WRENW != 1) begin : g_wren + `RW_RAM_CHECK `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_WREN + end + assign rdata = ram[addr]; + end else begin : g_no_wren + `RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL + end + assign rdata = ram[addr]; + end + end else begin : g_read_first + if (WRENW != 1) begin : g_wren + `NO_RW_RAM_CHECK `RAM_ARRAY_WREN + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_WREN + end + assign rdata = ram[addr]; + end else begin : g_no_wren + `NO_RW_RAM_CHECK reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + always @(posedge clk) begin + `RAM_WRITE_ALL + end + assign rdata = ram[addr]; + end + end + end + end +`else + // simulation + reg [DATAW-1:0] ram [0:SIZE-1]; + `RAM_INITIALIZATION + + always @(posedge clk) begin + `RAM_WRITE_WREN + end + + if (OUT_REG) begin : g_sync + if (RDW_MODE == "W") begin : g_write_first + reg [ADDRW-1:0] addr_r; + always @(posedge clk) begin + if (read) begin + addr_r <= addr; + end + end + assign rdata = ram[addr_r]; + end else if (RDW_MODE == "R") begin : g_read_first + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read) 
begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end else if (RDW_MODE == "N") begin : g_no_change + reg [DATAW-1:0] rdata_r; + always @(posedge clk) begin + if (read && ~write) begin + rdata_r <= ram[addr]; + end + end + assign rdata = rdata_r; + end + end else begin : g_async + `UNUSED_VAR (read) + if (RDW_MODE == "W") begin : g_write_first + assign rdata = ram[addr]; + end else begin : g_read_first + reg [DATAW-1:0] prev_data; + reg [ADDRW-1:0] prev_waddr; + reg prev_write; + always @(posedge clk) begin + if (reset) begin + prev_write <= 0; + prev_data <= '0; + prev_waddr <= '0; + end else begin + prev_write <= write; + prev_data <= ram[addr]; + prev_waddr <= addr; + end + end + assign rdata = (prev_write && (prev_waddr == addr)) ? prev_data : ram[addr]; + if (RDW_ASSERT) begin : g_rw_asert + `RUNTIME_ASSERT(~read || (rdata == ram[addr]), ("%t: read after write hazard", $time)) + end + end + end +`endif endmodule `TRACING_ON diff --git a/hw/rtl/libs/VX_stream_arb.sv b/hw/rtl/libs/VX_stream_arb.sv index 98fed5859..8cc96c97e 100644 --- a/hw/rtl/libs/VX_stream_arb.sv +++ b/hw/rtl/libs/VX_stream_arb.sv @@ -21,8 +21,8 @@ module VX_stream_arb #( parameter `STRING ARBITER = "R", parameter MAX_FANOUT = `MAX_FANOUT, parameter OUT_BUF = 0, - parameter LUTRAM = 0, - parameter NUM_REQS = `CDIV(NUM_INPUTS, NUM_OUTPUTS), + parameter NUM_REQS = (NUM_INPUTS > NUM_OUTPUTS) ? 
`CDIV(NUM_INPUTS, NUM_OUTPUTS) : `CDIV(NUM_OUTPUTS, NUM_INPUTS), + parameter SEL_COUNT = `MIN(NUM_INPUTS, NUM_OUTPUTS), parameter LOG_NUM_REQS = `CLOG2(NUM_REQS), parameter NUM_REQS_W = `UP(LOG_NUM_REQS) ) ( @@ -35,103 +35,69 @@ module VX_stream_arb #( output wire [NUM_OUTPUTS-1:0] valid_out, output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, - output wire [NUM_OUTPUTS-1:0][NUM_REQS_W-1:0] sel_out, - input wire [NUM_OUTPUTS-1:0] ready_out + input wire [NUM_OUTPUTS-1:0] ready_out, + + output wire [SEL_COUNT-1:0][NUM_REQS_W-1:0] sel_out ); - if (NUM_INPUTS > NUM_OUTPUTS) begin + if (NUM_INPUTS > NUM_OUTPUTS) begin : g_input_select - if (NUM_OUTPUTS > 1) begin + // #Inputs > #Outputs - // (#inputs > #outputs) and (#outputs > 1) + if (MAX_FANOUT != 0 && (NUM_REQS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + localparam NUM_SLICES = `CDIV(NUM_REQS, MAX_FANOUT); + localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT); + localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES); + localparam DATAW2 = DATAW + LOG_NUM_REQS2; - localparam SLICE_BEGIN = i * NUM_REQS; - localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_INPUTS); + wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0] valid_tmp; + wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0][DATAW2-1:0] data_tmp; + wire [NUM_SLICES-1:0][NUM_OUTPUTS-1:0] ready_tmp; + + for (genvar s = 0; s < NUM_SLICES; ++s) begin : g_slice_arbs + + localparam SLICE_STRIDE= MAX_FANOUT * NUM_OUTPUTS; + localparam SLICE_BEGIN = s * SLICE_STRIDE; + localparam SLICE_END = `MIN(SLICE_BEGIN + SLICE_STRIDE, NUM_INPUTS); localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; - `RESET_RELAY (slice_reset, reset); + wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_tmp_u; + wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS2-1:0] sel_tmp_u; VX_stream_arb #( .NUM_INPUTS (SLICE_SIZE), - .NUM_OUTPUTS (1), + .NUM_OUTPUTS (NUM_OUTPUTS), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (OUT_BUF), - .LUTRAM (LUTRAM) - ) arb_slice ( + .OUT_BUF (3) + ) 
fanout_slice_arb ( .clk (clk), - .reset (slice_reset), + .reset (reset), .valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]), - .ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]), .data_in (data_in[SLICE_END-1: SLICE_BEGIN]), - .data_out (data_out[i]), - .sel_out (sel_out[i]), - .valid_out (valid_out[i]), - .ready_out (ready_out[i]) + .ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]), + .valid_out (valid_tmp[s]), + .data_out (data_tmp_u), + .ready_out (ready_tmp[s]), + .sel_out (sel_tmp_u) ); - end - end else if (MAX_FANOUT != 0 && (NUM_INPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin - - // (#inputs > max_fanout) and (#outputs == 1) - - localparam NUM_SLICES = `CDIV(NUM_INPUTS, MAX_FANOUT); - localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT); - localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES); - - wire [NUM_SLICES-1:0] valid_tmp; - wire [NUM_SLICES-1:0][DATAW+LOG_NUM_REQS2-1:0] data_tmp; - wire [NUM_SLICES-1:0] ready_tmp; - - for (genvar i = 0; i < NUM_SLICES; ++i) begin - - localparam SLICE_BEGIN = i * MAX_FANOUT; - localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_INPUTS); - localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; - - wire [DATAW-1:0] data_tmp_u; - wire [`LOG2UP(SLICE_SIZE)-1:0] sel_tmp_u; - - `RESET_RELAY (slice_reset, reset); - - if (MAX_FANOUT != 1) begin - VX_stream_arb #( - .NUM_INPUTS (SLICE_SIZE), - .NUM_OUTPUTS (1), - .DATAW (DATAW), - .ARBITER (ARBITER), - .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (3), // registered output - .LUTRAM (LUTRAM) - ) fanout_slice_arb ( - .clk (clk), - .reset (slice_reset), - .valid_in (valid_in[SLICE_END-1: SLICE_BEGIN]), - .data_in (data_in[SLICE_END-1: SLICE_BEGIN]), - .ready_in (ready_in[SLICE_END-1: SLICE_BEGIN]), - .valid_out (valid_tmp[i]), - .data_out (data_tmp_u), - .sel_out (sel_tmp_u), - .ready_out (ready_tmp[i]) - ); + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_tmp + assign data_tmp[s][o] = {data_tmp_u[o], sel_tmp_u[o]}; end - - assign data_tmp[i] = {data_tmp_u, LOG_NUM_REQS2'(sel_tmp_u)}; end - wire 
[DATAW+LOG_NUM_REQS2-1:0] data_out_u; - wire [LOG_NUM_REQS3-1:0] sel_out_u; + wire [NUM_OUTPUTS-1:0][DATAW2-1:0] data_out_u; + wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS3-1:0] sel_out_u; VX_stream_arb #( - .NUM_INPUTS (NUM_SLICES), - .NUM_OUTPUTS (1), - .DATAW (DATAW + LOG_NUM_REQS2), + .NUM_INPUTS (NUM_SLICES * NUM_OUTPUTS), + .NUM_OUTPUTS (NUM_OUTPUTS), + .DATAW (DATAW2), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (OUT_BUF), - .LUTRAM (LUTRAM) + .OUT_BUF (OUT_BUF) ) fanout_join_arb ( .clk (clk), .reset (reset), @@ -144,117 +110,111 @@ module VX_stream_arb #( .ready_out (ready_out) ); - assign data_out = data_out_u[LOG_NUM_REQS2 +: DATAW]; - assign sel_out = {sel_out_u, data_out_u[0 +: LOG_NUM_REQS2]}; + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out + assign sel_out[o] = {sel_out_u[o], data_out_u[o][LOG_NUM_REQS2-1:0]}; + assign data_out[o] = data_out_u[o][DATAW2-1:LOG_NUM_REQS2]; + end - end else begin - - // (#inputs <= max_fanout) and (#outputs == 1) - - wire valid_in_r; - wire [DATAW-1:0] data_in_r; - wire ready_in_r; + end else begin : g_arbiter + wire [NUM_REQS-1:0] arb_requests; wire arb_valid; wire [NUM_REQS_W-1:0] arb_index; wire [NUM_REQS-1:0] arb_onehot; wire arb_ready; + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_requests + wire [NUM_OUTPUTS-1:0] requests; + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_o + localparam i = r * NUM_OUTPUTS + o; + assign requests[o] = valid_in[i]; + end + assign arb_requests[r] = (| requests); + end + VX_generic_arbiter #( .NUM_REQS (NUM_REQS), .TYPE (ARBITER) ) arbiter ( .clk (clk), .reset (reset), - .requests (valid_in), + .requests (arb_requests), .grant_valid (arb_valid), .grant_index (arb_index), .grant_onehot (arb_onehot), .grant_ready (arb_ready) ); - assign valid_in_r = arb_valid; - assign data_in_r = data_in[arb_index]; - assign arb_ready = ready_in_r; + wire [NUM_OUTPUTS-1:0] valid_out_w; + wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w; + wire [NUM_OUTPUTS-1:0] ready_out_w; - for 
(genvar i = 0; i < NUM_REQS; ++i) begin - assign ready_in[i] = ready_in_r && arb_onehot[i]; + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out_w + wire [NUM_REQS-1:0] valid_in_w; + wire [NUM_REQS-1:0][DATAW-1:0] data_in_w; + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r + localparam i = r * NUM_OUTPUTS + o; + if (r < NUM_INPUTS) begin : g_valid + assign valid_in_w[r] = valid_in[i]; + assign data_in_w[r] = data_in[i]; + end else begin : g_padding + assign valid_in_w[r] = 0; + assign data_in_w[r] = '0; + end + end + assign valid_out_w[o] = (NUM_OUTPUTS == 1) ? arb_valid : (| (valid_in_w & arb_onehot)); + assign data_out_w[o] = data_in_w[arb_index]; end - VX_elastic_buffer #( - .DATAW (LOG_NUM_REQS + DATAW), - .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), - .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), - .LUTRAM (LUTRAM) - ) out_buf ( - .clk (clk), - .reset (reset), - .valid_in (valid_in_r), - .ready_in (ready_in_r), - .data_in ({arb_index, data_in_r}), - .data_out ({sel_out, data_out}), - .valid_out (valid_out), - .ready_out (ready_out) - ); + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in + localparam o = i % NUM_OUTPUTS; + localparam r = i / NUM_OUTPUTS; + assign ready_in[i] = ready_out_w[o] && arb_onehot[r]; + end + + assign arb_ready = (| ready_out_w); + + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf + VX_elastic_buffer #( + .DATAW (LOG_NUM_REQS + DATAW), + .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), + .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), + .LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF)) + ) out_buf ( + .clk (clk), + .reset (reset), + .valid_in (valid_out_w[o]), + .ready_in (ready_out_w[o]), + .data_in ({arb_index, data_out_w[o]}), + .data_out ({sel_out[o], data_out[o]}), + .valid_out (valid_out[o]), + .ready_out (ready_out[o]) + ); + end end - end else if (NUM_OUTPUTS > NUM_INPUTS) begin + end else if (NUM_INPUTS < NUM_OUTPUTS) begin : g_output_select - if (NUM_INPUTS > 1) begin + // #Inputs < #Outputs - // (#inputs > 1) and (#outputs > #inputs) + if 
(MAX_FANOUT != 0 && (NUM_REQS > (MAX_FANOUT + MAX_FANOUT /2))) begin : g_fanout - for (genvar i = 0; i < NUM_INPUTS; ++i) begin + localparam NUM_SLICES = `CDIV(NUM_REQS, MAX_FANOUT); + localparam LOG_NUM_REQS2 = `CLOG2(MAX_FANOUT); + localparam LOG_NUM_REQS3 = `CLOG2(NUM_SLICES); - localparam SLICE_BEGIN = i * NUM_REQS; - localparam SLICE_END = `MIN(SLICE_BEGIN + NUM_REQS, NUM_OUTPUTS); - localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; - - `RESET_RELAY (slice_reset, reset); - - VX_stream_arb #( - .NUM_INPUTS (1), - .NUM_OUTPUTS (SLICE_SIZE), - .DATAW (DATAW), - .ARBITER (ARBITER), - .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (OUT_BUF), - .LUTRAM (LUTRAM) - ) arb_slice ( - .clk (clk), - .reset (slice_reset), - .valid_in (valid_in[i]), - .ready_in (ready_in[i]), - .data_in (data_in[i]), - .data_out (data_out[SLICE_END-1: SLICE_BEGIN]), - .valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]), - .ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]), - `UNUSED_PIN (sel_out) - ); - - for (genvar j = SLICE_BEGIN; j < SLICE_END; ++j) begin - assign sel_out[j] = i; - end - end - - end else if (MAX_FANOUT != 0 && (NUM_OUTPUTS > (MAX_FANOUT + MAX_FANOUT /2))) begin - - // (#inputs == 1) and (#outputs > max_fanout) - - localparam NUM_SLICES = `CDIV(NUM_OUTPUTS, MAX_FANOUT); - - wire [NUM_SLICES-1:0] valid_tmp; - wire [NUM_SLICES-1:0][DATAW-1:0] data_tmp; - wire [NUM_SLICES-1:0] ready_tmp; + wire [NUM_SLICES-1:0][NUM_INPUTS-1:0] valid_tmp; + wire [NUM_SLICES-1:0][NUM_INPUTS-1:0][DATAW-1:0] data_tmp; + wire [NUM_SLICES-1:0][NUM_INPUTS-1:0] ready_tmp; + wire [NUM_INPUTS-1:0][LOG_NUM_REQS3-1:0] sel_tmp; VX_stream_arb #( - .NUM_INPUTS (1), - .NUM_OUTPUTS (NUM_SLICES), + .NUM_INPUTS (NUM_INPUTS), + .NUM_OUTPUTS (NUM_SLICES * NUM_INPUTS), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (3), // registered output - .LUTRAM (LUTRAM) + .OUT_BUF (3) ) fanout_fork_arb ( .clk (clk), .reset (reset), @@ -264,111 +224,140 @@ module VX_stream_arb #( .data_out (data_tmp), .valid_out 
(valid_tmp), .ready_out (ready_tmp), - `UNUSED_PIN (sel_out) + .sel_out (sel_tmp) ); - for (genvar i = 0; i < NUM_SLICES; ++i) begin + wire [NUM_SLICES-1:0][NUM_INPUTS-1:0][LOG_NUM_REQS2-1:0] sel_out_w; - localparam SLICE_BEGIN = i * MAX_FANOUT; - localparam SLICE_END = `MIN(SLICE_BEGIN + MAX_FANOUT, NUM_OUTPUTS); + for (genvar s = 0; s < NUM_SLICES; ++s) begin : g_slice_arbs + + localparam SLICE_STRIDE= MAX_FANOUT * NUM_INPUTS; + localparam SLICE_BEGIN = s * SLICE_STRIDE; + localparam SLICE_END = `MIN(SLICE_BEGIN + SLICE_STRIDE, NUM_OUTPUTS); localparam SLICE_SIZE = SLICE_END - SLICE_BEGIN; - `RESET_RELAY (slice_reset, reset); + wire [NUM_INPUTS-1:0][LOG_NUM_REQS2-1:0] sel_out_u; VX_stream_arb #( - .NUM_INPUTS (1), + .NUM_INPUTS (NUM_INPUTS), .NUM_OUTPUTS (SLICE_SIZE), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (OUT_BUF), - .LUTRAM (LUTRAM) + .OUT_BUF (OUT_BUF) ) fanout_slice_arb ( .clk (clk), - .reset (slice_reset), - .valid_in (valid_tmp[i]), - .ready_in (ready_tmp[i]), - .data_in (data_tmp[i]), + .reset (reset), + .valid_in (valid_tmp[s]), + .ready_in (ready_tmp[s]), + .data_in (data_tmp[s]), .data_out (data_out[SLICE_END-1: SLICE_BEGIN]), .valid_out (valid_out[SLICE_END-1: SLICE_BEGIN]), .ready_out (ready_out[SLICE_END-1: SLICE_BEGIN]), - `UNUSED_PIN (sel_out) + .sel_out (sel_out_w[s]) ); end - end else begin + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_out + assign sel_out[i] = {sel_tmp[i], sel_out_w[sel_tmp[i]][i]}; + end - // (#inputs == 1) and (#outputs <= max_fanout) + end else begin : g_arbiter - wire [NUM_OUTPUTS-1:0] ready_in_r; - - wire [NUM_OUTPUTS-1:0] arb_requests; + wire [NUM_REQS-1:0] arb_requests; wire arb_valid; - wire [NUM_OUTPUTS-1:0] arb_onehot; + wire [NUM_REQS_W-1:0] arb_index; + wire [NUM_REQS-1:0] arb_onehot; wire arb_ready; + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_requests + wire [NUM_INPUTS-1:0] requests; + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_i + localparam o = r * 
NUM_INPUTS + i; + assign requests[i] = ready_out[o]; + end + assign arb_requests[r] = (| requests); + end + VX_generic_arbiter #( - .NUM_REQS (NUM_OUTPUTS), + .NUM_REQS (NUM_REQS), .TYPE (ARBITER) ) arbiter ( .clk (clk), .reset (reset), .requests (arb_requests), .grant_valid (arb_valid), - `UNUSED_PIN (grant_index), + .grant_index (arb_index), .grant_onehot (arb_onehot), .grant_ready (arb_ready) ); - assign arb_requests = ready_in_r; - assign arb_ready = valid_in[0]; - assign ready_in = arb_valid; + wire [NUM_OUTPUTS-1:0] valid_out_w; + wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w; + wire [NUM_OUTPUTS-1:0] ready_out_w; - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_data_out_w + localparam i = o % NUM_INPUTS; + localparam r = o / NUM_INPUTS; + assign valid_out_w[o] = valid_in[i] && arb_onehot[r]; + assign data_out_w[o] = data_in[i]; + end + + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in + wire [NUM_REQS-1:0] ready_out_s; + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r + localparam o = r * NUM_INPUTS + i; + assign ready_out_s[r] = ready_out_w[o]; + end + assign ready_in[i] = (NUM_INPUTS == 1) ? 
arb_valid : (| (ready_out_s & arb_onehot)); + end + + assign arb_ready = (| valid_in); + + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), - .LUTRAM (LUTRAM) + .LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF)) ) out_buf ( .clk (clk), .reset (reset), - .valid_in (valid_in && arb_onehot[i]), - .ready_in (ready_in_r[i]), - .data_in (data_in), - .data_out (data_out[i]), - .valid_out (valid_out[i]), - .ready_out (ready_out[i]) + .valid_in (valid_out_w[o]), + .ready_in (ready_out_w[o]), + .data_in (data_out_w[o]), + .data_out (data_out[o]), + .valid_out (valid_out[o]), + .ready_out (ready_out[o]) ); end + + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_out + assign sel_out[i] = arb_index; + end end - assign sel_out = 0; - - end else begin + end else begin : g_passthru // #Inputs == #Outputs - `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); - - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), - .LUTRAM (LUTRAM) + .LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset[i]), - .valid_in (valid_in[i]), - .ready_in (ready_in[i]), - .data_in (data_in[i]), - .data_out (data_out[i]), - .valid_out (valid_out[i]), - .ready_out (ready_out[i]) + .reset (reset), + .valid_in (valid_in[o]), + .ready_in (ready_in[o]), + .data_in (data_in[o]), + .data_out (data_out[o]), + .valid_out (valid_out[o]), + .ready_out (ready_out[o]) ); - assign sel_out[i] = NUM_REQS_W'(i); + assign sel_out[o] = NUM_REQS_W'(0); end end diff --git a/hw/rtl/libs/VX_stream_buffer.sv b/hw/rtl/libs/VX_stream_buffer.sv index bebe8ec71..ea4467cb3 100644 --- a/hw/rtl/libs/VX_stream_buffer.sv +++ b/hw/rtl/libs/VX_stream_buffer.sv @@ -1,18 +1,18 @@ // Copyright 2019-2023 -// +// // Licensed 
under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -// A stream elastic buffer operates at full-bandwidth where push and pop can happen simultaneously +// A stream elastic buffer operates at full-bandwidth where fire_in and fire_out can happen simultaneously // It has the following benefits: // + full-bandwidth throughput // + ready_in and ready_out are decoupled @@ -27,102 +27,86 @@ module VX_stream_buffer #( parameter DATAW = 1, parameter OUT_REG = 0, parameter PASSTHRU = 0 -) ( +) ( input wire clk, input wire reset, input wire valid_in, - output wire ready_in, + output wire ready_in, input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out, input wire ready_out, output wire valid_out -); - if (PASSTHRU != 0) begin +); + if (PASSTHRU != 0) begin : g_passthru + `UNUSED_VAR (clk) `UNUSED_VAR (reset) assign ready_in = ready_out; - assign valid_out = valid_in; + assign valid_out = valid_in; assign data_out = data_in; - end else begin - if (OUT_REG != 0) begin - reg [DATAW-1:0] data_out_r; - reg [DATAW-1:0] buffer; - reg valid_out_r; - reg use_buffer; - - wire push = valid_in && ready_in; - wire stall_out = valid_out_r && ~ready_out; - - always @(posedge clk) begin - if (reset) begin - valid_out_r <= 0; - use_buffer <= 0; - end else begin - if (ready_out) begin - use_buffer <= 0; - end else if (valid_in && valid_out) begin - use_buffer <= 1; - end - if (~stall_out) begin - valid_out_r <= valid_in || use_buffer; - end - end + end else begin : g_buffer + + reg [DATAW-1:0]
data_out_r, buffer_r; + reg valid_out_r, valid_in_r; + + wire fire_in = valid_in && ready_in; + wire flow_out = ready_out || ~valid_out; + + always @(posedge clk) begin + if (reset) begin + valid_in_r <= 1'b1; + end else if (valid_in || flow_out) begin + valid_in_r <= flow_out; end - - always @(posedge clk) begin - if (push) begin - buffer <= data_in; - end - if (~stall_out) begin - data_out_r <= use_buffer ? buffer : data_in; - end - end - - assign ready_in = ~use_buffer; - assign valid_out = valid_out_r; - assign data_out = data_out_r; - - end else begin - - reg [1:0][DATAW-1:0] shift_reg; - reg valid_out_r, ready_in_r, rd_ptr_r; - - wire push = valid_in && ready_in; - wire pop = valid_out_r && ready_out; - - always @(posedge clk) begin - if (reset) begin - valid_out_r <= 0; - ready_in_r <= 1; - rd_ptr_r <= 1; - end else begin - if (push) begin - if (!pop) begin - ready_in_r <= rd_ptr_r; - valid_out_r <= 1; - end - end else if (pop) begin - ready_in_r <= 1; - valid_out_r <= rd_ptr_r; - end - rd_ptr_r <= rd_ptr_r ^ (push ^ pop); - end - end - - always @(posedge clk) begin - if (push) begin - shift_reg[1] <= shift_reg[0]; - shift_reg[0] <= data_in; - end - end - - assign ready_in = ready_in_r; - assign valid_out = valid_out_r; - assign data_out = shift_reg[rd_ptr_r]; end - end + + always @(posedge clk) begin + if (reset) begin + valid_out_r <= 1'b0; + end else if (flow_out) begin + valid_out_r <= valid_in || ~valid_in_r; + end + end + + if (OUT_REG != 0) begin : g_out_reg + + always @(posedge clk) begin + if (fire_in) begin + buffer_r <= data_in; + end + end + + always @(posedge clk) begin + if (flow_out) begin + data_out_r <= valid_in_r ? data_in : buffer_r; + end + end + + assign data_out = data_out_r; + + end else begin : g_no_out_reg + + always @(posedge clk) begin + if (fire_in) begin + data_out_r <= data_in; + end + end + + always @(posedge clk) begin + if (fire_in) begin + buffer_r <= data_out_r; + end + end + + assign data_out = valid_in_r ? 
data_out_r : buffer_r; + + end + + assign valid_out = valid_out_r; + assign ready_in = valid_in_r; + + end endmodule `TRACING_ON - diff --git a/hw/rtl/libs/VX_stream_omega.sv b/hw/rtl/libs/VX_stream_omega.sv new file mode 100644 index 000000000..fd0d84def --- /dev/null +++ b/hw/rtl/libs/VX_stream_omega.sv @@ -0,0 +1,215 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_define.vh" + +`TRACING_OFF +module VX_stream_omega #( + parameter NUM_INPUTS = 4, + parameter NUM_OUTPUTS = 4, + parameter RADIX = 2, + parameter DATAW = 4, + parameter ARBITER = "R", + parameter OUT_BUF = 0, + parameter MAX_FANOUT = `MAX_FANOUT, + parameter PERF_CTR_BITS = 32, + parameter IN_WIDTH = `LOG2UP(NUM_INPUTS), + parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS) +) ( + input wire clk, + input wire reset, + + input wire [NUM_INPUTS-1:0] valid_in, + input wire [NUM_INPUTS-1:0][DATAW-1:0] data_in, + input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0] sel_in, + output wire [NUM_INPUTS-1:0] ready_in, + + output wire [NUM_OUTPUTS-1:0] valid_out, + output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, + output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out, + input wire [NUM_OUTPUTS-1:0] ready_out, + + output wire [PERF_CTR_BITS-1:0] collisions +); + `STATIC_ASSERT (`IS_POW2(RADIX), ("inavlid parameters")) + + // If the network is no larger than the radix (inputs and outputs both <= RADIX), simply use a crossbar.
+ if (NUM_INPUTS <= RADIX && NUM_OUTPUTS <= RADIX) begin : g_fallback + VX_stream_xbar #( + .NUM_INPUTS (NUM_INPUTS), + .NUM_OUTPUTS (NUM_OUTPUTS), + .DATAW (DATAW), + .ARBITER (ARBITER), + .OUT_BUF (OUT_BUF), + .MAX_FANOUT (MAX_FANOUT), + .PERF_CTR_BITS (PERF_CTR_BITS) + ) xbar_switch ( + .clk, + .reset, + .valid_in, + .data_in, + .sel_in, + .ready_in, + .valid_out, + .data_out, + .sel_out, + .ready_out, + .collisions + ); + end else begin : g_omega + localparam RADIX_LG = `LOG2UP(RADIX); + localparam N_INPUTS_M = `MAX(NUM_INPUTS, NUM_OUTPUTS); + localparam N_INPUTS_LG = `CDIV(`CLOG2(N_INPUTS_M), RADIX_LG); + localparam N_INPUTS = RADIX ** N_INPUTS_LG; + localparam NUM_STAGES = `LOG2UP(N_INPUTS) / RADIX_LG; + localparam NUM_SWITCHES = N_INPUTS / RADIX; + + typedef struct packed { + logic [N_INPUTS_LG-1:0] sel_in; + logic [DATAW-1:0] data; + logic [IN_WIDTH-1:0] sel_out; + } omega_t; + + // Wires for internal connections between stages + wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_valid_in, switch_valid_out; + omega_t [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_data_in, switch_data_out; + wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0][RADIX_LG-1:0] switch_sel_in; + wire [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] switch_ready_in, switch_ready_out; + + // Connect inputs to first stage + for (genvar i = 0; i < N_INPUTS; ++i) begin : g_tie_inputs + localparam DST_IDX = ((i << 1) | (i >> (N_INPUTS_LG-1))) & (N_INPUTS-1); + localparam switch = DST_IDX / RADIX; + localparam port = DST_IDX % RADIX; + if (i < NUM_INPUTS) begin : g_valid + assign switch_valid_in[0][switch][port] = valid_in[i]; + assign switch_data_in[0][switch][port] = '{ + sel_in: N_INPUTS_LG'(sel_in[i]), + data: data_in[i], + sel_out: IN_WIDTH'(i) + }; + assign ready_in[i] = switch_ready_in[0][switch][port]; + end else begin : g_padding + assign switch_valid_in[0][switch][port] = 0; + assign switch_data_in[0][switch][port] = 'x; + `UNUSED_VAR (switch_ready_in[0][switch][port]) 
+ end + end + + // Connect switch sel_in + for (genvar stage = 0; stage < NUM_STAGES; ++stage) begin : g_sel_in + for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches + for (genvar port = 0; port < RADIX; ++port) begin : g_ports + assign switch_sel_in[stage][switch][port] = switch_data_in[stage][switch][port].sel_in[(NUM_STAGES-1-stage) * RADIX_LG +: RADIX_LG]; + end + end + end + + // Connect internal stages + for (genvar stage = 0; stage < NUM_STAGES-1; ++stage) begin : g_stages + for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches + for (genvar port = 0; port < RADIX; port++) begin : g_ports + localparam lane = switch * RADIX + port; + localparam dst_lane = ((lane << 1) | (lane >> (N_INPUTS_LG-1))) & (N_INPUTS-1); + localparam dst_switch = dst_lane / RADIX; + localparam dst_port = dst_lane % RADIX; + assign switch_valid_in[stage+1][dst_switch][dst_port] = switch_valid_out[stage][switch][port]; + assign switch_data_in[stage+1][dst_switch][dst_port] = switch_data_out[stage][switch][port]; + assign switch_ready_out[stage][switch][port] = switch_ready_in[stage+1][dst_switch][dst_port]; + end + end + end + + // Connect network switches + for (genvar switch = 0; switch < NUM_SWITCHES; ++switch) begin : g_switches + for (genvar stage = 0; stage < NUM_STAGES; ++stage) begin : g_stages + VX_stream_xbar #( + .NUM_INPUTS (RADIX), + .NUM_OUTPUTS (RADIX), + .DATAW ($bits(omega_t)), + .ARBITER (ARBITER), + .OUT_BUF (OUT_BUF), + .MAX_FANOUT (MAX_FANOUT), + .PERF_CTR_BITS(PERF_CTR_BITS) + ) xbar_switch ( + .clk (clk), + .reset (reset), + .valid_in (switch_valid_in[stage][switch]), + .data_in (switch_data_in[stage][switch]), + .sel_in (switch_sel_in[stage][switch]), + .ready_in (switch_ready_in[stage][switch]), + .valid_out (switch_valid_out[stage][switch]), + .data_out (switch_data_out[stage][switch]), + `UNUSED_PIN (sel_out), + .ready_out (switch_ready_out[stage][switch]), + `UNUSED_PIN (collisions) + ); + end + end + + // Connect 
outputs to last stage + for (genvar i = 0; i < N_INPUTS; ++i) begin : g_tie_outputs + localparam switch = i / RADIX; + localparam port = i % RADIX; + if (i < NUM_OUTPUTS) begin : g_valid + assign valid_out[i] = switch_valid_out[NUM_STAGES-1][switch][port]; + assign data_out[i] = switch_data_out[NUM_STAGES-1][switch][port].data; + assign sel_out[i] = switch_data_out[NUM_STAGES-1][switch][port].sel_out; + assign switch_ready_out[NUM_STAGES-1][switch][port] = ready_out[i]; + end else begin : g_padding + `UNUSED_VAR (switch_valid_out[NUM_STAGES-1][switch][port]) + `UNUSED_VAR (switch_data_out[NUM_STAGES-1][switch][port]) + assign switch_ready_out[NUM_STAGES-1][switch][port] = 0; + end + end + + // compute inputs collision + // we have a collision when there exists a valid transfer with multiple input candidates + // we count the unique duplicates each cycle. + + reg [NUM_STAGES-1:0][NUM_SWITCHES-1:0][RADIX-1:0] per_cycle_collision, per_cycle_collision_r; + wire [`CLOG2(NUM_STAGES*NUM_SWITCHES*RADIX+1)-1:0] collision_count; + reg [PERF_CTR_BITS-1:0] collisions_r; + + always @(*) begin + per_cycle_collision = 0; + for (integer stage = 0; stage < NUM_STAGES; ++stage) begin + for (integer switch = 0; switch < NUM_SWITCHES; ++switch) begin + for (integer port_a = 0; port_a < RADIX; ++port_a) begin + for (integer port_b = port_a + 1; port_b < RADIX; ++port_b) begin + per_cycle_collision[stage][switch][port_a] |= switch_valid_in[stage][switch][port_a] + && switch_valid_in[stage][switch][port_b] + && (switch_sel_in[stage][switch][port_a] == switch_sel_in[stage][switch][port_b]) + && (switch_ready_in[stage][switch][port_a] | switch_ready_in[stage][switch][port_b]); + end + end + end + end + end + + `BUFFER(per_cycle_collision_r, per_cycle_collision); + `POP_COUNT(collision_count, per_cycle_collision_r); + + always @(posedge clk) begin + if (reset) begin + collisions_r <= '0; + end else begin + collisions_r <= collisions_r + PERF_CTR_BITS'(collision_count); + end + end + +
assign collisions = collisions_r; + end + +endmodule +`TRACING_ON diff --git a/hw/rtl/libs/VX_stream_pack.sv b/hw/rtl/libs/VX_stream_pack.sv index 7f024b184..944b120c2 100644 --- a/hw/rtl/libs/VX_stream_pack.sv +++ b/hw/rtl/libs/VX_stream_pack.sv @@ -38,7 +38,8 @@ module VX_stream_pack #( output wire [TAG_WIDTH-1:0] tag_out, input wire ready_out ); - if (NUM_REQS > 1) begin + if (NUM_REQS > 1) begin : g_pack + localparam LOG_NUM_REQS = `CLOG2(NUM_REQS); wire [LOG_NUM_REQS-1:0] grant_index; @@ -62,11 +63,11 @@ module VX_stream_pack #( wire [NUM_REQS-1:0] tag_matches; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_tag_matches assign tag_matches[i] = (tag_in[i][TAG_SEL_BITS-1:0] == tag_sel[TAG_SEL_BITS-1:0]); end - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_ready_in assign ready_in[i] = grant_ready & tag_matches[i]; end @@ -87,7 +88,7 @@ module VX_stream_pack #( .ready_out (ready_out) ); - end else begin + end else begin : g_passthru `UNUSED_VAR (clk) `UNUSED_VAR (reset) diff --git a/hw/rtl/libs/VX_stream_switch.sv b/hw/rtl/libs/VX_stream_switch.sv index 3a905cb1d..fa719af77 100644 --- a/hw/rtl/libs/VX_stream_switch.sv +++ b/hw/rtl/libs/VX_stream_switch.sv @@ -36,117 +36,91 @@ module VX_stream_switch #( output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, input wire [NUM_OUTPUTS-1:0] ready_out ); - if (NUM_INPUTS > NUM_OUTPUTS) begin + if (NUM_INPUTS > NUM_OUTPUTS) begin : g_input_select - wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0] valid_in_r; - wire [NUM_OUTPUTS-1:0][NUM_REQS-1:0][DATAW-1:0] data_in_r; + for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - for (genvar j = 0; j < NUM_REQS; ++j) begin - localparam ii = i * NUM_REQS + j; - if (ii < NUM_INPUTS) begin - assign valid_in_r[i][j] = valid_in[ii]; - assign data_in_r[i][j] = data_in[ii]; - end else begin - assign valid_in_r[i][j] = 0; - assign data_in_r[i][j] = '0; 
+ wire [NUM_REQS-1:0] valid_in_w; + wire [NUM_REQS-1:0][DATAW-1:0] data_in_w; + wire [NUM_REQS-1:0] ready_in_w; + + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r + localparam i = r * NUM_OUTPUTS + o; + if (i < NUM_INPUTS) begin : g_valid + assign valid_in_w[r] = valid_in[i]; + assign data_in_w[r] = data_in[i]; + assign ready_in[i] = ready_in_w[r]; + end else begin : g_padding + assign valid_in_w[r] = 0; + assign data_in_w[r] = '0; + `UNUSED_VAR (ready_in_w[r]) end end - end - wire [NUM_OUTPUTS-1:0] valid_out_r; - wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r; - wire [NUM_OUTPUTS-1:0] ready_out_r; - - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - assign valid_out_r[i] = valid_in_r[i][sel_in[i]]; - assign data_out_r[i] = data_in_r[i][sel_in[i]]; - end - - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin - for (genvar j = 0; j < NUM_REQS; ++j) begin - localparam ii = i * NUM_REQS + j; - if (ii < NUM_INPUTS) begin - assign ready_in[ii] = ready_out_r[i] & (sel_in[i] == LOG_NUM_REQS'(j)); - end - end - end - - `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); - - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset[i]), - .valid_in (valid_out_r[i]), - .ready_in (ready_out_r[i]), - .data_in (data_out_r[i]), - .data_out (data_out[i]), - .valid_out (valid_out[i]), - .ready_out (ready_out[i]) + .reset (reset), + .valid_in (valid_in_w[sel_in[o]]), + .ready_in (ready_in_w[sel_in[o]]), + .data_in (data_in_w[sel_in[o]]), + .data_out (data_out[o]), + .valid_out (valid_out[o]), + .ready_out (ready_out[o]) ); end - end else if (NUM_OUTPUTS > NUM_INPUTS) begin + end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_output_select - wire [NUM_INPUTS-1:0][NUM_REQS-1:0] valid_out_r; - wire [NUM_INPUTS-1:0][NUM_REQS-1:0] ready_out_r; + // Inputs < Outputs - for (genvar i = 0; i < NUM_INPUTS; ++i) begin - for (genvar j = 0; j 
< NUM_REQS; ++j) begin - assign valid_out_r[i][j] = valid_in[i] & (sel_in[i] == LOG_NUM_REQS'(j)); - end - assign ready_in[i] = ready_out_r[i][sel_in[i]]; - end + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_out_buf - `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + wire [NUM_REQS-1:0] ready_out_w; - for (genvar i = 0; i < NUM_INPUTS; ++i) begin - for (genvar j = 0; j < NUM_REQS; ++j) begin - localparam ii = i * NUM_REQS + j; - if (ii < NUM_OUTPUTS) begin + for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r + localparam o = r * NUM_INPUTS + i; + if (o < NUM_OUTPUTS) begin : g_valid + wire valid_out_w = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(r)); VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset[ii]), - .valid_in (valid_out_r[i][j]), - .ready_in (ready_out_r[i][j]), + .reset (reset), + .valid_in (valid_out_w), + .ready_in (ready_out_w[r]), .data_in (data_in[i]), - .data_out (data_out[ii]), - .valid_out (valid_out[ii]), - .ready_out (ready_out[ii]) + .data_out (data_out[o]), + .valid_out (valid_out[o]), + .ready_out (ready_out[o]) ); - end else begin - `UNUSED_VAR (out_buf_reset[ii]) - `UNUSED_VAR (valid_out_r[i][j]) - assign ready_out_r[i][j] = '0; + end else begin : g_padding + assign ready_out_w[r] = '0; end end + + assign ready_in[i] = ready_out_w[sel_in[i]]; end - end else begin + end else begin : g_passthru // #Inputs == #Outputs `UNUSED_VAR (sel_in) - `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); - - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset[i]), + .reset (reset), .valid_in (valid_in[i]), .ready_in (ready_in[i]), .data_in (data_in[i]), @@ -155,7 +129,6 @@ module VX_stream_switch #( .ready_out 
(ready_out[i]) ); end - end endmodule diff --git a/hw/rtl/libs/VX_stream_unpack.sv b/hw/rtl/libs/VX_stream_unpack.sv index e8b905cdf..b0cca961a 100644 --- a/hw/rtl/libs/VX_stream_unpack.sv +++ b/hw/rtl/libs/VX_stream_unpack.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,8 +15,8 @@ `TRACING_OFF module VX_stream_unpack #( - parameter NUM_REQS = 1, - parameter DATA_WIDTH = 1, + parameter NUM_REQS = 1, + parameter DATA_WIDTH = 1, parameter TAG_WIDTH = 1, parameter OUT_BUF = 0 ) ( @@ -31,36 +31,32 @@ module VX_stream_unpack #( output wire ready_in, // output - output wire [NUM_REQS-1:0] valid_out, + output wire [NUM_REQS-1:0] valid_out, output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] data_out, output wire [NUM_REQS-1:0][TAG_WIDTH-1:0] tag_out, input wire [NUM_REQS-1:0] ready_out ); - if (NUM_REQS > 1) begin + if (NUM_REQS > 1) begin : g_unpack - reg [NUM_REQS-1:0] sent_mask; - wire [NUM_REQS-1:0] ready_out_r; + reg [NUM_REQS-1:0] rem_mask_r; + wire [NUM_REQS-1:0] ready_out_w; - wire [NUM_REQS-1:0] sent_mask_n = sent_mask | ready_out_r; - wire sent_all = ~(| (mask_in & ~sent_mask_n)); + wire [NUM_REQS-1:0] rem_mask_n = rem_mask_r & ~ready_out_w; + wire sent_all = ~(| (mask_in & rem_mask_n)); always @(posedge clk) begin if (reset) begin - sent_mask <= '0; + rem_mask_r <= '1; end else begin if (valid_in) begin - if (sent_all) begin - sent_mask <= '0; - end else begin - sent_mask <= sent_mask_n; - end + rem_mask_r <= sent_all ? 
'1 : rem_mask_n; end end end assign ready_in = sent_all; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_outbuf VX_elastic_buffer #( .DATAW (DATA_WIDTH + TAG_WIDTH), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), @@ -68,21 +64,21 @@ module VX_stream_unpack #( ) out_buf ( .clk (clk), .reset (reset), - .valid_in (valid_in && mask_in[i] && ~sent_mask[i]), - .ready_in (ready_out_r[i]), + .valid_in (valid_in && mask_in[i] && rem_mask_r[i]), + .ready_in (ready_out_w[i]), .data_in ({data_in[i], tag_in}), .data_out ({data_out[i], tag_out[i]}), .valid_out (valid_out[i]), .ready_out (ready_out[i]) ); end - - end else begin - + + end else begin : g_passthru + `UNUSED_VAR (clk) `UNUSED_VAR (reset) `UNUSED_VAR (mask_in) - assign valid_out = valid_in; + assign valid_out = valid_in; assign data_out = data_in; assign tag_out = tag_in; assign ready_in = ready_out; diff --git a/hw/rtl/libs/VX_stream_xbar.sv b/hw/rtl/libs/VX_stream_xbar.sv index b7bdcbf5e..34972b8d7 100644 --- a/hw/rtl/libs/VX_stream_xbar.sv +++ b/hw/rtl/libs/VX_stream_xbar.sv @@ -18,19 +18,16 @@ module VX_stream_xbar #( parameter NUM_INPUTS = 4, parameter NUM_OUTPUTS = 4, parameter DATAW = 4, - parameter IN_WIDTH = `LOG2UP(NUM_INPUTS), - parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS), parameter ARBITER = "R", parameter OUT_BUF = 0, - parameter LUTRAM = 0, parameter MAX_FANOUT = `MAX_FANOUT, - parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1) + parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1), + parameter IN_WIDTH = `LOG2UP(NUM_INPUTS), + parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS) ) ( input wire clk, input wire reset, - output wire [PERF_CTR_BITS-1:0] collisions, - input wire [NUM_INPUTS-1:0] valid_in, input wire [NUM_INPUTS-1:0][DATAW-1:0] data_in, input wire [NUM_INPUTS-1:0][OUT_WIDTH-1:0] sel_in, @@ -39,40 +36,68 @@ module VX_stream_xbar #( output wire [NUM_OUTPUTS-1:0] valid_out, output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out, output wire [NUM_OUTPUTS-1:0][IN_WIDTH-1:0] sel_out, - 
input wire [NUM_OUTPUTS-1:0] ready_out + input wire [NUM_OUTPUTS-1:0] ready_out, + + output wire [PERF_CTR_BITS-1:0] collisions ); `UNUSED_VAR (clk) `UNUSED_VAR (reset) - if (NUM_INPUTS != 1) begin + if (NUM_INPUTS != 1) begin : g_multi_inputs - if (NUM_OUTPUTS != 1) begin + if (NUM_OUTPUTS != 1) begin : g_multiple_outputs // (#inputs > 1) and (#outputs > 1) + wire [NUM_INPUTS-1:0][NUM_OUTPUTS-1:0] per_output_valid_in; + wire [NUM_OUTPUTS-1:0][NUM_INPUTS-1:0] per_output_valid_in_w; + wire [NUM_OUTPUTS-1:0][NUM_INPUTS-1:0] per_output_ready_in; + wire [NUM_INPUTS-1:0][NUM_OUTPUTS-1:0] per_output_ready_in_w; - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + VX_transpose #( + .N (NUM_OUTPUTS), + .M (NUM_INPUTS) + ) rdy_in_transpose ( + .data_in (per_output_ready_in), + .data_out (per_output_ready_in_w) + ); - wire [NUM_INPUTS-1:0] valid_in_q; - for (genvar j = 0; j < NUM_INPUTS; ++j) begin - assign valid_in_q[j] = valid_in[j] && (sel_in[j] == i); - end + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in + assign ready_in[i] = | per_output_ready_in_w[i]; + end - `RESET_RELAY (slice_reset, reset); + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_sel_in_demux + VX_demux #( + .DATAW (1), + .N (NUM_OUTPUTS) + ) sel_in_demux ( + .sel_in (sel_in[i]), + .data_in (valid_in[i]), + .data_out (per_output_valid_in[i]) + ); + end + VX_transpose #( + .N (NUM_INPUTS), + .M (NUM_OUTPUTS) + ) val_in_transpose ( + .data_in (per_output_valid_in), + .data_out (per_output_valid_in_w) + ); + + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_xbar_arbs VX_stream_arb #( .NUM_INPUTS (NUM_INPUTS), .NUM_OUTPUTS (1), .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (OUT_BUF), - .LUTRAM (LUTRAM) + .OUT_BUF (OUT_BUF) ) xbar_arb ( .clk (clk), - .reset (slice_reset), - .valid_in (valid_in_q), + .reset (reset), + .valid_in (per_output_valid_in_w[i]), .data_in (data_in), .ready_in (per_output_ready_in[i]), .valid_out (valid_out[i]), @@ -82,11 +107,7 @@ module 
VX_stream_xbar #( ); end - for (genvar i = 0; i < NUM_INPUTS; ++i) begin - assign ready_in[i] = per_output_ready_in[sel_in[i]][i]; - end - - end else begin + end else begin : g_one_output // (#inputs >= 1) and (#outputs == 1) @@ -96,8 +117,7 @@ module VX_stream_xbar #( .DATAW (DATAW), .ARBITER (ARBITER), .MAX_FANOUT (MAX_FANOUT), - .OUT_BUF (OUT_BUF), - .LUTRAM (LUTRAM) + .OUT_BUF (OUT_BUF) ) xbar_arb ( .clk (clk), .reset (reset), @@ -113,33 +133,37 @@ module VX_stream_xbar #( `UNUSED_VAR (sel_in) end - end else if (NUM_OUTPUTS != 1) begin + end else if (NUM_OUTPUTS != 1) begin : g_single_input // (#inputs == 1) and (#outputs > 1) - logic [NUM_OUTPUTS-1:0] valid_out_r, ready_out_r; - logic [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_r; - always @(*) begin - valid_out_r = '0; - valid_out_r[sel_in] = valid_in; - end - assign data_out_r = {NUM_OUTPUTS{data_in}}; - assign ready_in = ready_out_r[sel_in]; + wire [NUM_OUTPUTS-1:0] valid_out_w, ready_out_w; + wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w; - `RESET_RELAY_EX (out_buf_reset, reset, NUM_OUTPUTS, `MAX_FANOUT); + VX_demux #( + .DATAW (1), + .N (NUM_OUTPUTS) + ) sel_in_demux ( + .sel_in (sel_in[0]), + .data_in (valid_in[0]), + .data_out (valid_out_w) + ); - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + assign ready_in[0] = ready_out_w[sel_in[0]]; + assign data_out_w = {NUM_OUTPUTS{data_in[0]}}; + + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf VX_elastic_buffer #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), - .LUTRAM (LUTRAM) + .LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF)) ) out_buf ( .clk (clk), - .reset (out_buf_reset[i]), - .valid_in (valid_out_r[i]), - .ready_in (ready_out_r[i]), - .data_in (data_out_r[i]), + .reset (reset), + .valid_in (valid_out_w[i]), + .ready_in (ready_out_w[i]), + .data_in (data_out_w[i]), .data_out (data_out[i]), .valid_out (valid_out[i]), .ready_out (ready_out[i]) @@ -148,7 +172,7 @@ module VX_stream_xbar #( assign sel_out = 0; - end else 
begin + end else begin : g_passthru // (#inputs == 1) and (#outputs == 1) @@ -156,7 +180,7 @@ module VX_stream_xbar #( .DATAW (DATAW), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)), .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)), - .LUTRAM (LUTRAM) + .LUTRAM (`TO_OUT_BUF_LUTRAM(OUT_BUF)) ) out_buf ( .clk (clk), .reset (reset), @@ -182,13 +206,13 @@ module VX_stream_xbar #( reg [PERF_CTR_BITS-1:0] collisions_r; always @(*) begin - per_cycle_collision = 0; + per_cycle_collision = '0; for (integer i = 0; i < NUM_INPUTS; ++i) begin - for (integer j = 1; j < (NUM_INPUTS-i); ++j) begin + for (integer j = i + 1; j < NUM_INPUTS; ++j) begin per_cycle_collision[i] |= valid_in[i] - && valid_in[j+i] - && (sel_in[i] == sel_in[j+i]) - && (ready_in[i] | ready_in[j+i]); + && valid_in[j] + && (sel_in[i] == sel_in[j]) + && (ready_in[i] | ready_in[j]); end end end diff --git a/hw/rtl/libs/VX_toggle_buffer.sv b/hw/rtl/libs/VX_toggle_buffer.sv index fb24a7f79..9d6b42720 100644 --- a/hw/rtl/libs/VX_toggle_buffer.sv +++ b/hw/rtl/libs/VX_toggle_buffer.sv @@ -1,11 +1,11 @@ // Copyright 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at -// +// // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -26,23 +26,26 @@ module VX_toggle_buffer #( parameter DATAW = 1, parameter PASSTHRU = 0 -) ( +) ( input wire clk, input wire reset, input wire valid_in, - output wire ready_in, + output wire ready_in, input wire [DATAW-1:0] data_in, output wire [DATAW-1:0] data_out, input wire ready_out, output wire valid_out -); - if (PASSTHRU != 0) begin +); + if (PASSTHRU != 0) begin : g_passthru + `UNUSED_VAR (clk) `UNUSED_VAR (reset) assign ready_in = ready_out; - assign valid_out = valid_in; + assign valid_out = valid_in; assign data_out = data_in; - end else begin + + end else begin : g_buffer + reg [DATAW-1:0] buffer; reg has_data; @@ -54,7 +57,7 @@ module VX_toggle_buffer #( has_data <= valid_in; end else if (ready_out) begin has_data <= 0; - end + end end if (~has_data) begin buffer <= data_in; diff --git a/hw/syn/xilinx/test/kernel/main.c b/hw/rtl/libs/VX_transpose.sv similarity index 53% rename from hw/syn/xilinx/test/kernel/main.c rename to hw/rtl/libs/VX_transpose.sv index 4fcfd99c0..2fc0bd695 100644 --- a/hw/syn/xilinx/test/kernel/main.c +++ b/hw/rtl/libs/VX_transpose.sv @@ -1,36 +1,32 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
-#include -#include +`include "VX_platform.vh" -typedef struct { - uint32_t count; - uint32_t src_addr; - uint32_t dst_addr; -} kernel_arg_t; +`TRACING_OFF +module VX_transpose #( + parameter DATAW = 1, + parameter N = 1, + parameter M = 1 +) ( + input wire [N-1:0][M-1:0][DATAW-1:0] data_in, + output wire [M-1:0][N-1:0][DATAW-1:0] data_out +); + for (genvar i = 0; i < N; ++i) begin : g_i + for (genvar j = 0; j < M; ++j) begin : g_j + assign data_out[j][i] = data_in[i][j]; + end + end -int main() { - kernel_arg_t* arg = (kernel_arg_t*)csr_read(VX_CSR_MSCRATCH); - uint32_t count = arg->count; - int32_t* src_ptr = (int32_t*)arg->src_addr; - int32_t* dst_ptr = (int32_t*)arg->dst_addr; - - uint32_t offset = vx_core_id() * count; - - for (uint32_t i = 0; i < count; ++i) { - dst_ptr[offset + i] = src_ptr[offset + i]; - } - - return 0; -} +endmodule +`TRACING_ON diff --git a/hw/rtl/mem/VX_gbar_arb.sv b/hw/rtl/mem/VX_gbar_arb.sv index 9ff761ec2..bdd225d72 100644 --- a/hw/rtl/mem/VX_gbar_arb.sv +++ b/hw/rtl/mem/VX_gbar_arb.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -33,9 +33,9 @@ module VX_gbar_arb #( wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in; wire [NUM_REQS-1:0] req_ready_in; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_data_in assign req_valid_in[i] = bus_in_if[i].req_valid; - assign req_data_in[i] = {bus_in_if[i].req_id, bus_in_if[i].req_size_m1, bus_in_if[i].req_core_id}; + assign req_data_in[i] = bus_in_if[i].req_data; assign bus_in_if[i].req_ready = req_ready_in[i]; end @@ -51,7 +51,7 @@ module VX_gbar_arb #( .valid_in (req_valid_in), .ready_in (req_ready_in), .data_in (req_data_in), - .data_out ({bus_out_if.req_id, bus_out_if.req_size_m1, bus_out_if.req_core_id}), + .data_out (bus_out_if.req_data), .valid_out (bus_out_if.req_valid), .ready_out (bus_out_if.req_ready), `UNUSED_PIN (sel_out) @@ -60,7 +60,7 @@ module VX_gbar_arb #( // broadcast response reg rsp_valid; - reg [`NB_WIDTH-1:0] rsp_id; + reg [`NB_WIDTH-1:0] rsp_data; always @(posedge clk) begin if (reset) begin @@ -68,12 +68,12 @@ module VX_gbar_arb #( end else begin rsp_valid <= bus_out_if.rsp_valid; end - rsp_id <= bus_out_if.rsp_id; + rsp_data <= bus_out_if.rsp_data; end - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_bus_in_if assign bus_in_if[i].rsp_valid = rsp_valid; - assign bus_in_if[i].rsp_id = rsp_id; + assign bus_in_if[i].rsp_data = rsp_data; end endmodule diff --git a/hw/rtl/mem/VX_gbar_bus_if.sv b/hw/rtl/mem/VX_gbar_bus_if.sv index 235c4c7a0..a93d7e204 100644 --- a/hw/rtl/mem/VX_gbar_bus_if.sv +++ b/hw/rtl/mem/VX_gbar_bus_if.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -15,35 +15,39 @@ interface VX_gbar_bus_if (); - wire req_valid; - wire [`NB_WIDTH-1:0] req_id; - wire [`NC_WIDTH-1:0] req_size_m1; - wire [`NC_WIDTH-1:0] req_core_id; - wire req_ready; + typedef struct packed { + logic [`NB_WIDTH-1:0] id; + logic [`NC_WIDTH-1:0] size_m1; + logic [`NC_WIDTH-1:0] core_id; + } req_data_t; - wire rsp_valid; - wire [`NB_WIDTH-1:0] rsp_id; + typedef struct packed { + logic [`NB_WIDTH-1:0] id; + } rsp_data_t; + + logic req_valid; + req_data_t req_data; + logic req_ready; + + logic rsp_valid; + rsp_data_t rsp_data; modport master ( - output req_valid, - output req_id, - output req_size_m1, - output req_core_id, - input req_ready, + output req_valid, + output req_data, + input req_ready, - input rsp_valid, - input rsp_id + input rsp_valid, + input rsp_data ); modport slave ( - input req_valid, - input req_id, - input req_size_m1, - input req_core_id, - output req_ready, - - output rsp_valid, - output rsp_id + input req_valid, + input req_data, + output req_ready, + + output rsp_valid, + output rsp_data ); endinterface diff --git a/hw/rtl/mem/VX_gbar_unit.sv b/hw/rtl/mem/VX_gbar_unit.sv index a6e5d9baa..b90b355f1 100644 --- a/hw/rtl/mem/VX_gbar_unit.sv +++ b/hw/rtl/mem/VX_gbar_unit.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,7 +13,7 @@ `include "VX_define.vh" -module VX_gbar_unit #( +module VX_gbar_unit #( parameter `STRING INSTANCE_ID = "" ) ( input wire clk, @@ -25,8 +25,8 @@ module VX_gbar_unit #( reg [`NB_WIDTH-1:0][`NUM_CORES-1:0] barrier_masks; wire [`CLOG2(`NUM_CORES+1)-1:0] active_barrier_count; - wire [`NUM_CORES-1:0] curr_barrier_mask = barrier_masks[gbar_bus_if.req_id]; - + wire [`NUM_CORES-1:0] curr_barrier_mask = barrier_masks[gbar_bus_if.req_data.id]; + `POP_COUNT(active_barrier_count, curr_barrier_mask); `UNUSED_VAR (active_barrier_count) @@ -42,29 +42,29 @@ module VX_gbar_unit #( rsp_valid <= 0; end if (gbar_bus_if.req_valid) begin - if (active_barrier_count[`NC_WIDTH-1:0] == gbar_bus_if.req_size_m1) begin - barrier_masks[gbar_bus_if.req_id] <= '0; - rsp_bar_id <= gbar_bus_if.req_id; + if (active_barrier_count[`NC_WIDTH-1:0] == gbar_bus_if.req_data.size_m1) begin + barrier_masks[gbar_bus_if.req_data.id] <= '0; + rsp_bar_id <= gbar_bus_if.req_data.id; rsp_valid <= 1; end else begin - barrier_masks[gbar_bus_if.req_id][gbar_bus_if.req_core_id] <= 1; + barrier_masks[gbar_bus_if.req_data.id][gbar_bus_if.req_data.core_id] <= 1; end end end end assign gbar_bus_if.rsp_valid = rsp_valid; - assign gbar_bus_if.rsp_id = rsp_bar_id; + assign gbar_bus_if.rsp_data.id = rsp_bar_id; assign gbar_bus_if.req_ready = 1; // global barrier unit is always ready (no dependencies) - + `ifdef DBG_TRACE_GBAR always @(posedge clk) begin if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin - `TRACE(1, ("%d: %s-acquire: bar_id=%0d, size=%0d, core_id=%0d\n", - $time, INSTANCE_ID, gbar_bus_if.req_id, gbar_bus_if.req_size_m1, gbar_bus_if.req_core_id)); + `TRACE(2, ("%t: %s acquire: 
bar_id=%0d, size=%0d, core_id=%0d\n", + $time, INSTANCE_ID, gbar_bus_if.req_data.id, gbar_bus_if.req_data.size_m1, gbar_bus_if.req_data.core_id)) end if (gbar_bus_if.rsp_valid) begin - `TRACE(1, ("%d: %s-release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_id)); + `TRACE(2, ("%t: %s release: bar_id=%0d\n", $time, INSTANCE_ID, gbar_bus_if.rsp_data.id)) end end `endif diff --git a/hw/rtl/mem/VX_lmem_switch.sv b/hw/rtl/mem/VX_lmem_switch.sv new file mode 100644 index 000000000..7d9742ffb --- /dev/null +++ b/hw/rtl/mem/VX_lmem_switch.sv @@ -0,0 +1,117 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_define.vh" + +module VX_lmem_switch import VX_gpu_pkg::*; #( + parameter REQ0_OUT_BUF = 0, + parameter REQ1_OUT_BUF = 0, + parameter RSP_OUT_BUF = 0, + parameter `STRING ARBITER = "R" +) ( + input wire clk, + input wire reset, + VX_lsu_mem_if.slave lsu_in_if, + VX_lsu_mem_if.master global_out_if, + VX_lsu_mem_if.master local_out_if +); + localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH; + localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH; + + wire [`NUM_LSU_LANES-1:0] is_addr_local_mask; + wire req_global_ready; + wire req_local_ready; + + for (genvar i = 0; i < `NUM_LSU_LANES; ++i) begin : g_is_addr_local_mask + assign is_addr_local_mask[i] = lsu_in_if.req_data.flags[i][`MEM_REQ_FLAG_LOCAL]; + end + + wire is_addr_global = | (lsu_in_if.req_data.mask & ~is_addr_local_mask); + wire is_addr_local = | (lsu_in_if.req_data.mask & is_addr_local_mask); + + assign lsu_in_if.req_ready = (req_global_ready && is_addr_global) + || (req_local_ready && is_addr_local); + + VX_elastic_buffer #( + .DATAW (REQ_DATAW), + .SIZE (`TO_OUT_BUF_SIZE(REQ0_OUT_BUF)), + .OUT_REG (`TO_OUT_BUF_REG(REQ0_OUT_BUF)) + ) req_global_buf ( + .clk (clk), + .reset (reset), + .valid_in (lsu_in_if.req_valid && is_addr_global), + .data_in ({ + lsu_in_if.req_data.mask & ~is_addr_local_mask, + lsu_in_if.req_data.rw, + lsu_in_if.req_data.addr, + lsu_in_if.req_data.data, + lsu_in_if.req_data.byteen, + lsu_in_if.req_data.flags, + lsu_in_if.req_data.tag + }), + .ready_in (req_global_ready), + .valid_out (global_out_if.req_valid), + .data_out (global_out_if.req_data), + .ready_out (global_out_if.req_ready) + ); + + VX_elastic_buffer #( + .DATAW (REQ_DATAW), + .SIZE (`TO_OUT_BUF_SIZE(REQ1_OUT_BUF)), + .OUT_REG (`TO_OUT_BUF_REG(REQ1_OUT_BUF)) + ) req_local_buf ( + .clk (clk), + .reset (reset), + .valid_in (lsu_in_if.req_valid && is_addr_local), + 
.data_in ({ + lsu_in_if.req_data.mask & is_addr_local_mask, + lsu_in_if.req_data.rw, + lsu_in_if.req_data.addr, + lsu_in_if.req_data.data, + lsu_in_if.req_data.byteen, + lsu_in_if.req_data.flags, + lsu_in_if.req_data.tag + }), + .ready_in (req_local_ready), + .valid_out (local_out_if.req_valid), + .data_out (local_out_if.req_data), + .ready_out (local_out_if.req_ready) + ); + + VX_stream_arb #( + .NUM_INPUTS (2), + .DATAW (RSP_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (RSP_OUT_BUF) + ) rsp_arb ( + .clk (clk), + .reset (reset), + .valid_in ({ + local_out_if.rsp_valid, + global_out_if.rsp_valid + }), + .ready_in ({ + local_out_if.rsp_ready, + global_out_if.rsp_ready + }), + .data_in ({ + local_out_if.rsp_data, + global_out_if.rsp_data + }), + .data_out (lsu_in_if.rsp_data), + .valid_out (lsu_in_if.rsp_valid), + .ready_out (lsu_in_if.rsp_ready), + `UNUSED_PIN (sel_out) + ); + +endmodule diff --git a/hw/rtl/mem/VX_local_mem.sv b/hw/rtl/mem/VX_local_mem.sv index 3dce0ec43..8f9279971 100644 --- a/hw/rtl/mem/VX_local_mem.sv +++ b/hw/rtl/mem/VX_local_mem.sv @@ -43,7 +43,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( // PERF `ifdef PERF_ENABLE - output cache_perf_t cache_perf, + output lmem_perf_t lmem_perf, `endif VX_mem_bus_if.slave mem_bus_if [NUM_REQS] @@ -67,20 +67,20 @@ module VX_local_mem import VX_gpu_pkg::*; #( // bank selection wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx; - if (NUM_BANKS > 1) begin - for (genvar i = 0; i < NUM_REQS; ++i) begin + if (NUM_BANKS > 1) begin : g_req_bank_idx + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_bank_idxs assign req_bank_idx[i] = mem_bus_if[i].req_data.addr[0 +: BANK_SEL_BITS]; end - end else begin + end else begin : g_req_bank_idx_0 assign req_bank_idx = 0; end // bank addressing wire [NUM_REQS-1:0][BANK_ADDR_WIDTH-1:0] req_bank_addr; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_bank_addr assign req_bank_addr[i] = 
mem_bus_if[i].req_data.addr[BANK_SEL_BITS +: BANK_ADDR_WIDTH]; - `UNUSED_VAR (mem_bus_if[i].req_data.atype) + `UNUSED_VAR (mem_bus_if[i].req_data.flags) end // bank requests dispatch @@ -104,13 +104,13 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [`PERF_CTR_BITS-1:0] perf_collisions; `endif - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_data_in assign req_valid_in[i] = mem_bus_if[i].req_valid; assign req_data_in[i] = { mem_bus_if[i].req_data.rw, req_bank_addr[i], - mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, + mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.tag }; assign mem_bus_if[i].req_ready = req_ready_in[i]; @@ -121,7 +121,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( .NUM_OUTPUTS (NUM_BANKS), .DATAW (REQ_DATAW), .PERF_CTR_BITS (`PERF_CTR_BITS), - .ARBITER ("F"), + .ARBITER ("P"), .OUT_BUF (3) // output should be registered for the data_store addressing ) req_xbar ( .clk (clk), @@ -141,12 +141,12 @@ module VX_local_mem import VX_gpu_pkg::*; #( .ready_out (per_bank_req_ready) ); - for (genvar i = 0; i < NUM_BANKS; ++i) begin + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_req_data_soa assign { per_bank_req_rw[i], per_bank_req_addr[i], - per_bank_req_byteen[i], per_bank_req_data[i], + per_bank_req_byteen[i], per_bank_req_tag[i] } = per_bank_req_data_aos[i]; end @@ -159,33 +159,31 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_rsp_tag; wire [NUM_BANKS-1:0] per_bank_rsp_ready; - for (genvar i = 0; i < NUM_BANKS; ++i) begin + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_data_store wire bank_rsp_valid, bank_rsp_ready; - wire [WORD_WIDTH-1:0] bank_rsp_data; - - `RESET_RELAY_EN (bram_reset, reset, (NUM_BANKS > 1)); VX_sp_ram #( .DATAW (WORD_WIDTH), .SIZE (WORDS_PER_BANK), .WRENW (WORD_SIZE), - .NO_RWCHECK (1) - ) data_store ( + .OUT_REG (1), + .RDW_MODE ("R") + ) lmem_store ( .clk (clk), - .reset (bram_reset), + .reset 
(reset), .read (per_bank_req_valid[i] && per_bank_req_ready[i] && ~per_bank_req_rw[i]), .write (per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]), .wren (per_bank_req_byteen[i]), .addr (per_bank_req_addr[i]), .wdata (per_bank_req_data[i]), - .rdata (bank_rsp_data) + .rdata (per_bank_rsp_data[i]) ); // read-during-write hazard detection reg [BANK_ADDR_WIDTH-1:0] last_wr_addr; reg last_wr_valid; always @(posedge clk) begin - if (bram_reset) begin + if (reset) begin last_wr_valid <= 0; end else begin last_wr_valid <= per_bank_req_valid[i] && per_bank_req_ready[i] && per_bank_req_rw[i]; @@ -194,20 +192,20 @@ module VX_local_mem import VX_gpu_pkg::*; #( end wire is_rdw_hazard = last_wr_valid && ~per_bank_req_rw[i] && (per_bank_req_addr[i] == last_wr_addr); - // drop write response and stall on read-during-write hazard + // drop write response assign bank_rsp_valid = per_bank_req_valid[i] && ~per_bank_req_rw[i] && ~is_rdw_hazard; assign per_bank_req_ready[i] = (bank_rsp_ready || per_bank_req_rw[i]) && ~is_rdw_hazard; // register BRAM output VX_pipe_buffer #( - .DATAW (REQ_SEL_WIDTH + WORD_WIDTH + TAG_WIDTH) + .DATAW (REQ_SEL_WIDTH + TAG_WIDTH) ) bram_buf ( .clk (clk), - .reset (bram_reset), + .reset (reset), .valid_in (bank_rsp_valid), .ready_in (bank_rsp_ready), - .data_in ({per_bank_req_idx[i], bank_rsp_data, per_bank_req_tag[i]}), - .data_out ({per_bank_rsp_idx[i], per_bank_rsp_data[i], per_bank_rsp_tag[i]}), + .data_in ({per_bank_req_idx[i], per_bank_req_tag[i]}), + .data_out ({per_bank_rsp_idx[i], per_bank_rsp_tag[i]}), .valid_out (per_bank_rsp_valid[i]), .ready_out (per_bank_rsp_ready[i]) ); @@ -217,7 +215,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_aos; - for (genvar i = 0; i < NUM_BANKS; ++i) begin + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_rsp_data_aos assign per_bank_rsp_data_aos[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]}; end @@ -245,9 +243,9 @@ module 
VX_local_mem import VX_gpu_pkg::*; #( `UNUSED_PIN (sel_out) ); - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_mem_bus_if assign mem_bus_if[i].rsp_valid = rsp_valid_out[i]; - assign mem_bus_if[i].rsp_data = rsp_data_out[i]; + assign mem_bus_if[i].rsp_data = rsp_data_out[i]; assign rsp_ready_out[i] = mem_bus_if[i].rsp_ready; end @@ -258,7 +256,7 @@ module VX_local_mem import VX_gpu_pkg::*; #( wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle; wire [NUM_REQS-1:0] req_rw; - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_rw assign req_rw[i] = mem_bus_if[i].req_data.rw; end @@ -288,77 +286,65 @@ module VX_local_mem import VX_gpu_pkg::*; #( end end - assign cache_perf.reads = perf_reads; - assign cache_perf.writes = perf_writes; - assign cache_perf.read_misses = '0; - assign cache_perf.write_misses = '0; - assign cache_perf.bank_stalls = perf_collisions; - assign cache_perf.mshr_stalls = '0; - assign cache_perf.mem_stalls = '0; - assign cache_perf.crsp_stalls = perf_crsp_stalls; + assign lmem_perf.reads = perf_reads; + assign lmem_perf.writes = perf_writes; + assign lmem_perf.bank_stalls = perf_collisions; + assign lmem_perf.crsp_stalls = perf_crsp_stalls; `endif `ifdef DBG_TRACE_MEM - wire [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] req_uuid; - wire [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] rsp_uuid; - - for (genvar i = 0; i < NUM_REQS; ++i) begin - if (UUID_WIDTH != 0) begin - assign req_uuid[i] = mem_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH]; - assign rsp_uuid[i] = mem_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin - assign req_uuid[i] = 0; - assign rsp_uuid[i] = 0; - end - end - + wire [NUM_BANKS-1:0][TAG_WIDTH-UUID_WIDTH-1:0] per_bank_req_tag_value; wire [NUM_BANKS-1:0][`UP(UUID_WIDTH)-1:0] per_bank_req_uuid; + + wire [NUM_BANKS-1:0][TAG_WIDTH-UUID_WIDTH-1:0] per_bank_rsp_tag_value; wire [NUM_BANKS-1:0][`UP(UUID_WIDTH)-1:0] per_bank_rsp_uuid; - for 
(genvar i = 0; i < NUM_BANKS; ++i) begin - if (UUID_WIDTH != 0) begin + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_req_uuid + assign per_bank_req_tag_value[i] = per_bank_req_tag[i][TAG_WIDTH-UUID_WIDTH-1:0]; + assign per_bank_rsp_tag_value[i] = per_bank_rsp_tag[i][TAG_WIDTH-UUID_WIDTH-1:0]; + if (UUID_WIDTH != 0) begin : g_uuid assign per_bank_req_uuid[i] = per_bank_req_tag[i][TAG_WIDTH-1 -: UUID_WIDTH]; assign per_bank_rsp_uuid[i] = per_bank_rsp_tag[i][TAG_WIDTH-1 -: UUID_WIDTH]; - end else begin + end else begin : g_no_uuid assign per_bank_req_uuid[i] = 0; assign per_bank_rsp_uuid[i] = 0; end end - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_trace always @(posedge clk) begin if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin if (mem_bus_if[i].req_data.rw) begin - `TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", - $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i])); + `TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid)) end else begin - `TRACE(1, ("%d: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n", - $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, req_uuid[i])); + `TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid)) end end if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin - `TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%h (#%0d)\n", - $time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], 
rsp_uuid[i])); + `TRACE(2, ("%t: %s core-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid)) end end end - for (genvar i = 0; i < NUM_BANKS; ++i) begin + for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_bank_trace always @(posedge clk) begin if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin if (per_bank_req_rw[i]) begin - `TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n", - $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_uuid[i])); + `TRACE(2, ("%t: %s bank-wr-req[%0d]: addr=0x%0h, byteen=0x%h, data=0x%h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_byteen[i], per_bank_req_data[i], per_bank_req_tag_value[i], per_bank_req_uuid[i])) end else begin - `TRACE(2, ("%d: %s-bank%0d rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", - $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag[i], per_bank_req_uuid[i])); + `TRACE(2, ("%t: %s bank-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, per_bank_req_addr[i], per_bank_req_tag_value[i], per_bank_req_uuid[i])) end end if (per_bank_rsp_valid[i] && per_bank_rsp_ready[i]) begin - `TRACE(2, ("%d: %s-bank%0d rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n", - $time, INSTANCE_ID, i, per_bank_rsp_tag[i], per_bank_rsp_data[i], per_bank_rsp_uuid[i])); + `TRACE(2, ("%t: %s bank-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", + $time, INSTANCE_ID, i, per_bank_rsp_data[i], per_bank_rsp_tag_value[i], per_bank_rsp_uuid[i])) end end end diff --git a/hw/rtl/mem/VX_local_mem_top.sv b/hw/rtl/mem/VX_local_mem_top.sv index e576d32ec..fda15cde2 100644 --- a/hw/rtl/mem/VX_local_mem_top.sv +++ b/hw/rtl/mem/VX_local_mem_top.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file 
except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -17,15 +17,13 @@ module VX_local_mem_top import VX_gpu_pkg::*; #( parameter `STRING INSTANCE_ID = "", // Size of cache in bytes - parameter SIZE = (1024*16*8), - + parameter SIZE = (1024*16*8), + // Number of Word requests per cycle - parameter NUM_REQS = 4, + parameter NUM_REQS = 4, // Number of banks parameter NUM_BANKS = 4, - // Address width - parameter ADDR_WIDTH = `CLOG2(SIZE), // Size of a word in bytes parameter WORD_SIZE = `XLEN/8, @@ -33,8 +31,14 @@ module VX_local_mem_top import VX_gpu_pkg::*; #( parameter UUID_WIDTH = 0, // Request tag size - parameter TAG_WIDTH = 16 - ) ( + parameter TAG_WIDTH = 16, + + // Address width + parameter NUM_WORDS = SIZE / WORD_SIZE, + parameter WORDS_PER_BANK = NUM_WORDS / NUM_BANKS, + parameter BANK_ADDR_WIDTH = `CLOG2(WORDS_PER_BANK), + parameter ADDR_WIDTH = BANK_ADDR_WIDTH + `CLOG2(NUM_BANKS) + ) ( input wire clk, input wire reset, @@ -43,7 +47,7 @@ module VX_local_mem_top import VX_gpu_pkg::*; #( input wire [NUM_REQS-1:0] mem_req_rw, input wire [NUM_REQS-1:0][WORD_SIZE-1:0] mem_req_byteen, input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] mem_req_addr, - input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype, + input wire [NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags, input wire [NUM_REQS-1:0][WORD_SIZE*8-1:0] mem_req_data, input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] mem_req_tag, output wire [NUM_REQS-1:0] mem_req_ready, @@ -56,7 +60,8 @@ module VX_local_mem_top import VX_gpu_pkg::*; #( ); VX_mem_bus_if #( .DATA_SIZE (WORD_SIZE), - .TAG_WIDTH (TAG_WIDTH) + .TAG_WIDTH (TAG_WIDTH), + .ADDR_WIDTH(ADDR_WIDTH) ) mem_bus_if[NUM_REQS](); // memory request @@ -65,7 +70,7 @@ 
module VX_local_mem_top import VX_gpu_pkg::*; #( assign mem_bus_if[i].req_data.rw = mem_req_rw[i]; assign mem_bus_if[i].req_data.byteen = mem_req_byteen[i]; assign mem_bus_if[i].req_data.addr = mem_req_addr[i]; - assign mem_bus_if[i].req_data.atype = mem_req_atype[i]; + assign mem_bus_if[i].req_data.flags = mem_req_flags[i]; assign mem_bus_if[i].req_data.data = mem_req_data[i]; assign mem_bus_if[i].req_data.tag = mem_req_tag[i]; assign mem_req_ready[i] = mem_bus_if[i].req_ready; @@ -86,9 +91,10 @@ module VX_local_mem_top import VX_gpu_pkg::*; #( .NUM_BANKS (NUM_BANKS), .WORD_SIZE (WORD_SIZE), .ADDR_WIDTH (ADDR_WIDTH), - .UUID_WIDTH (UUID_WIDTH), - .TAG_WIDTH (TAG_WIDTH) - ) local_mem ( + .UUID_WIDTH (UUID_WIDTH), + .TAG_WIDTH (TAG_WIDTH), + .OUT_BUF (3) + ) local_mem ( .clk (clk), .reset (reset), .mem_bus_if (mem_bus_if) diff --git a/hw/rtl/core/VX_lsu_adapter.sv b/hw/rtl/mem/VX_lsu_adapter.sv similarity index 87% rename from hw/rtl/core/VX_lsu_adapter.sv rename to hw/rtl/mem/VX_lsu_adapter.sv index 21d43d280..4991ab6ed 100644 --- a/hw/rtl/core/VX_lsu_adapter.sv +++ b/hw/rtl/mem/VX_lsu_adapter.sv @@ -29,7 +29,7 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #( VX_mem_bus_if.master mem_bus_if [NUM_LANES] ); localparam REQ_ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE); - localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `ADDR_TYPE_WIDTH + DATA_SIZE * 8; + localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + DATA_SIZE * 8; localparam RSP_DATA_WIDTH = DATA_SIZE * 8; // handle request unpacking @@ -41,29 +41,16 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #( wire [NUM_LANES-1:0][TAG_WIDTH-1:0] req_tag_out; wire [NUM_LANES-1:0] req_ready_out; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_req_data_in assign req_data_in[i] = { lsu_mem_if.req_data.rw, - lsu_mem_if.req_data.byteen[i], lsu_mem_if.req_data.addr[i], - lsu_mem_if.req_data.atype[i], - lsu_mem_if.req_data.data[i] + 
lsu_mem_if.req_data.data[i], + lsu_mem_if.req_data.byteen[i], + lsu_mem_if.req_data.flags[i] }; end - for (genvar i = 0; i < NUM_LANES; ++i) begin - assign mem_bus_if[i].req_valid = req_valid_out[i]; - assign { - mem_bus_if[i].req_data.rw, - mem_bus_if[i].req_data.byteen, - mem_bus_if[i].req_data.addr, - mem_bus_if[i].req_data.atype, - mem_bus_if[i].req_data.data - } = req_data_out[i]; - assign mem_bus_if[i].req_data.tag = req_tag_out[i]; - assign req_ready_out[i] = mem_bus_if[i].req_ready; - end - VX_stream_unpack #( .NUM_REQS (NUM_LANES), .DATA_WIDTH (REQ_DATA_WIDTH), @@ -83,6 +70,19 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #( .ready_out (req_ready_out) ); + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_req + assign mem_bus_if[i].req_valid = req_valid_out[i]; + assign { + mem_bus_if[i].req_data.rw, + mem_bus_if[i].req_data.addr, + mem_bus_if[i].req_data.data, + mem_bus_if[i].req_data.byteen, + mem_bus_if[i].req_data.flags + } = req_data_out[i]; + assign mem_bus_if[i].req_data.tag = req_tag_out[i]; + assign req_ready_out[i] = mem_bus_if[i].req_ready; + end + // handle response packing wire [NUM_LANES-1:0] rsp_valid_out; @@ -90,10 +90,10 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #( wire [NUM_LANES-1:0][TAG_WIDTH-1:0] rsp_tag_out; wire [NUM_LANES-1:0] rsp_ready_out; - for (genvar i = 0; i < NUM_LANES; ++i) begin + for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_bus_rsp assign rsp_valid_out[i] = mem_bus_if[i].rsp_valid; - assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data; - assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag; + assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data; + assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag; assign mem_bus_if[i].rsp_ready = rsp_ready_out[i]; end diff --git a/hw/rtl/mem/VX_lsu_mem_arb.sv b/hw/rtl/mem/VX_lsu_mem_arb.sv new file mode 100644 index 000000000..c6d38d840 --- /dev/null +++ b/hw/rtl/mem/VX_lsu_mem_arb.sv @@ -0,0 +1,185 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache 
License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +`include "VX_define.vh" + +module VX_lsu_mem_arb #( + parameter NUM_INPUTS = 1, + parameter NUM_OUTPUTS = 1, + parameter NUM_LANES = 1, + parameter DATA_SIZE = 1, + parameter TAG_WIDTH = 1, + parameter TAG_SEL_IDX = 0, + parameter REQ_OUT_BUF = 0, + parameter RSP_OUT_BUF = 0, + parameter `STRING ARBITER = "R", + parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, + parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)), + parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH +) ( + input wire clk, + input wire reset, + + VX_lsu_mem_if.slave bus_in_if [NUM_INPUTS], + VX_lsu_mem_if.master bus_out_if [NUM_OUTPUTS] +); + localparam DATA_WIDTH = (8 * DATA_SIZE); + localparam LOG_NUM_REQS = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS); + localparam REQ_DATAW = 1 + NUM_LANES * (1 + ADDR_WIDTH + DATA_WIDTH + DATA_SIZE + FLAGS_WIDTH) + TAG_WIDTH; + localparam RSP_DATAW = NUM_LANES * (1 + DATA_WIDTH) + TAG_WIDTH; + + `STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter: NUM_INPUTS=%0d, NUM_OUTPUTS=%0d", NUM_INPUTS, NUM_OUTPUTS)); + + wire [NUM_INPUTS-1:0] req_valid_in; + wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in; + wire [NUM_INPUTS-1:0] req_ready_in; + + wire [NUM_OUTPUTS-1:0] req_valid_out; + wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out; + wire [NUM_OUTPUTS-1:0][`UP(LOG_NUM_REQS)-1:0] req_sel_out; + wire [NUM_OUTPUTS-1:0] req_ready_out; + + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in + assign req_valid_in[i] = 
bus_in_if[i].req_valid; + assign req_data_in[i] = bus_in_if[i].req_data; + assign bus_in_if[i].req_ready = req_ready_in[i]; + end + + VX_stream_arb #( + .NUM_INPUTS (NUM_INPUTS), + .NUM_OUTPUTS (NUM_OUTPUTS), + .DATAW (REQ_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (REQ_OUT_BUF) + ) req_arb ( + .clk (clk), + .reset (reset), + .valid_in (req_valid_in), + .ready_in (req_ready_in), + .data_in (req_data_in), + .data_out (req_data_out), + .sel_out (req_sel_out), + .valid_out (req_valid_out), + .ready_out (req_ready_out) + ); + + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_bus_out_if + wire [TAG_WIDTH-1:0] req_tag_out; + VX_bits_insert #( + .N (TAG_WIDTH), + .S (LOG_NUM_REQS), + .POS (TAG_SEL_IDX) + ) bits_insert ( + .data_in (req_tag_out), + .ins_in (req_sel_out[i]), + .data_out (bus_out_if[i].req_data.tag) + ); + assign bus_out_if[i].req_valid = req_valid_out[i]; + assign { + bus_out_if[i].req_data.mask, + bus_out_if[i].req_data.rw, + bus_out_if[i].req_data.addr, + bus_out_if[i].req_data.data, + bus_out_if[i].req_data.byteen, + bus_out_if[i].req_data.flags, + req_tag_out + } = req_data_out[i]; + assign req_ready_out[i] = bus_out_if[i].req_ready; + end + + /////////////////////////////////////////////////////////////////////////// + + wire [NUM_INPUTS-1:0] rsp_valid_out; + wire [NUM_INPUTS-1:0][RSP_DATAW-1:0] rsp_data_out; + wire [NUM_INPUTS-1:0] rsp_ready_out; + + wire [NUM_OUTPUTS-1:0] rsp_valid_in; + wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in; + wire [NUM_OUTPUTS-1:0] rsp_ready_in; + + if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_enabled + + wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS-1:0] rsp_sel_in; + + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in + wire [TAG_WIDTH-1:0] rsp_tag_out; + VX_bits_remove #( + .N (TAG_WIDTH + LOG_NUM_REQS), + .S (LOG_NUM_REQS), + .POS (TAG_SEL_IDX) + ) bits_remove ( + .data_in (bus_out_if[i].rsp_data.tag), + .sel_out (rsp_sel_in[i]), + .data_out (rsp_tag_out) + ); + assign rsp_valid_in[i] = bus_out_if[i].rsp_valid; + 
assign rsp_data_in[i] = { + bus_out_if[i].rsp_data.mask, + bus_out_if[i].rsp_data.data, + rsp_tag_out + }; + assign bus_out_if[i].rsp_ready = rsp_ready_in[i]; + end + + VX_stream_switch #( + .NUM_INPUTS (NUM_OUTPUTS), + .NUM_OUTPUTS (NUM_INPUTS), + .DATAW (RSP_DATAW), + .OUT_BUF (RSP_OUT_BUF) + ) rsp_switch ( + .clk (clk), + .reset (reset), + .sel_in (rsp_sel_in), + .valid_in (rsp_valid_in), + .ready_in (rsp_ready_in), + .data_in (rsp_data_in), + .data_out (rsp_data_out), + .valid_out (rsp_valid_out), + .ready_out (rsp_ready_out) + ); + + end else begin : g_passthru + + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in + assign rsp_valid_in[i] = bus_out_if[i].rsp_valid; + assign rsp_data_in[i] = bus_out_if[i].rsp_data; + assign bus_out_if[i].rsp_ready = rsp_ready_in[i]; + end + + VX_stream_arb #( + .NUM_INPUTS (NUM_OUTPUTS), + .NUM_OUTPUTS (NUM_INPUTS), + .DATAW (RSP_DATAW), + .ARBITER (ARBITER), + .OUT_BUF (RSP_OUT_BUF) + ) req_arb ( + .clk (clk), + .reset (reset), + .valid_in (rsp_valid_in), + .ready_in (rsp_ready_in), + .data_in (rsp_data_in), + .data_out (rsp_data_out), + .valid_out (rsp_valid_out), + .ready_out (rsp_ready_out), + `UNUSED_PIN (sel_out) + ); + + end + + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_output + assign bus_in_if[i].rsp_valid = rsp_valid_out[i]; + assign bus_in_if[i].rsp_data = rsp_data_out[i]; + assign rsp_ready_out[i] = bus_in_if[i].rsp_ready; + end + +endmodule diff --git a/hw/rtl/interfaces/VX_lsu_mem_if.sv b/hw/rtl/mem/VX_lsu_mem_if.sv similarity index 72% rename from hw/rtl/interfaces/VX_lsu_mem_if.sv rename to hw/rtl/mem/VX_lsu_mem_if.sv index 661071eb6..4a7732a2a 100644 --- a/hw/rtl/interfaces/VX_lsu_mem_if.sv +++ b/hw/rtl/mem/VX_lsu_mem_if.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,26 +16,32 @@ interface VX_lsu_mem_if #( parameter NUM_LANES = 1, parameter DATA_SIZE = 1, - parameter ATYPE_WIDTH= `ADDR_TYPE_WIDTH, parameter TAG_WIDTH = 1, + parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH, parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, - parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE) + parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE), + parameter UUID_WIDTH = `UUID_WIDTH ) (); typedef struct packed { - logic rw; - logic [NUM_LANES-1:0] mask; - logic [NUM_LANES-1:0][DATA_SIZE-1:0] byteen; + logic [`UP(UUID_WIDTH)-1:0] uuid; + logic [TAG_WIDTH-`UP(UUID_WIDTH)-1:0] value; + } tag_t; + + typedef struct packed { + logic [NUM_LANES-1:0] mask; + logic rw; logic [NUM_LANES-1:0][ADDR_WIDTH-1:0] addr; - logic [NUM_LANES-1:0][ATYPE_WIDTH-1:0] atype; logic [NUM_LANES-1:0][DATA_SIZE*8-1:0] data; - logic [TAG_WIDTH-1:0] tag; + logic [NUM_LANES-1:0][DATA_SIZE-1:0] byteen; + logic [NUM_LANES-1:0][FLAGS_WIDTH-1:0] flags; + tag_t tag; } req_data_t; typedef struct packed { - logic [NUM_LANES-1:0] mask; + logic [NUM_LANES-1:0] mask; logic [NUM_LANES-1:0][DATA_SIZE*8-1:0] data; - logic [TAG_WIDTH-1:0] tag; + tag_t tag; } rsp_data_t; logic req_valid; diff --git a/hw/rtl/mem/VX_mem_arb.sv b/hw/rtl/mem/VX_mem_arb.sv index ef51e2387..0fc374258 100644 --- a/hw/rtl/mem/VX_mem_arb.sv +++ b/hw/rtl/mem/VX_mem_arb.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,30 +13,31 @@ `include "VX_define.vh" -module VX_mem_arb #( - parameter NUM_INPUTS = 1, +module VX_mem_arb #( + parameter NUM_INPUTS = 1, parameter NUM_OUTPUTS = 1, parameter DATA_SIZE = 1, - parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, - parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)), - parameter TAG_WIDTH = 1, - parameter TAG_SEL_IDX = 0, + parameter TAG_WIDTH = 1, + parameter TAG_SEL_IDX = 0, parameter REQ_OUT_BUF = 0, parameter RSP_OUT_BUF = 0, - parameter `STRING ARBITER = "R" + parameter `STRING ARBITER = "R", + parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, + parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)), + parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH ) ( input wire clk, input wire reset, VX_mem_bus_if.slave bus_in_if [NUM_INPUTS], VX_mem_bus_if.master bus_out_if [NUM_OUTPUTS] -); +); localparam DATA_WIDTH = (8 * DATA_SIZE); localparam LOG_NUM_REQS = `ARB_SEL_BITS(NUM_INPUTS, NUM_OUTPUTS); - localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `ADDR_TYPE_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; - localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH; + localparam REQ_DATAW = 1 + ADDR_WIDTH + DATA_WIDTH + DATA_SIZE + FLAGS_WIDTH + TAG_WIDTH; + localparam RSP_DATAW = DATA_WIDTH + TAG_WIDTH; - `STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter")) + `STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter: NUM_INPUTS=%0d, NUM_OUTPUTS=%0d", NUM_INPUTS, NUM_OUTPUTS)); wire [NUM_INPUTS-1:0] req_valid_in; wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in; @@ -47,20 +48,13 @@ module VX_mem_arb #( wire [NUM_OUTPUTS-1:0][`UP(LOG_NUM_REQS)-1:0] req_sel_out; wire [NUM_OUTPUTS-1:0] req_ready_out; - for (genvar i = 0; i < NUM_INPUTS; ++i) 
begin + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in assign req_valid_in[i] = bus_in_if[i].req_valid; - assign req_data_in[i] = { - bus_in_if[i].req_data.rw, - bus_in_if[i].req_data.byteen, - bus_in_if[i].req_data.addr, - bus_in_if[i].req_data.atype, - bus_in_if[i].req_data.data, - bus_in_if[i].req_data.tag - }; + assign req_data_in[i] = bus_in_if[i].req_data; assign bus_in_if[i].req_ready = req_ready_in[i]; end - VX_stream_arb #( + VX_stream_arb #( .NUM_INPUTS (NUM_INPUTS), .NUM_OUTPUTS (NUM_OUTPUTS), .DATAW (REQ_DATAW), @@ -78,9 +72,9 @@ module VX_mem_arb #( .ready_out (req_ready_out) ); - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_bus_out_if wire [TAG_WIDTH-1:0] req_tag_out; - VX_bits_insert #( + VX_bits_insert #( .N (TAG_WIDTH), .S (LOG_NUM_REQS), .POS (TAG_SEL_IDX) @@ -92,10 +86,10 @@ module VX_mem_arb #( assign bus_out_if[i].req_valid = req_valid_out[i]; assign { bus_out_if[i].req_data.rw, - bus_out_if[i].req_data.byteen, bus_out_if[i].req_data.addr, - bus_out_if[i].req_data.atype, - bus_out_if[i].req_data.data, + bus_out_if[i].req_data.data, + bus_out_if[i].req_data.byteen, + bus_out_if[i].req_data.flags, req_tag_out } = req_data_out[i]; assign req_ready_out[i] = bus_out_if[i].req_ready; @@ -111,31 +105,25 @@ module VX_mem_arb #( wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in; wire [NUM_OUTPUTS-1:0] rsp_ready_in; - if (NUM_INPUTS > NUM_OUTPUTS) begin + if (NUM_INPUTS > NUM_OUTPUTS) begin : g_rsp_enabled wire [NUM_OUTPUTS-1:0][LOG_NUM_REQS-1:0] rsp_sel_in; - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in wire [TAG_WIDTH-1:0] rsp_tag_out; - VX_bits_remove #( + VX_bits_remove #( .N (TAG_WIDTH + LOG_NUM_REQS), .S (LOG_NUM_REQS), .POS (TAG_SEL_IDX) ) bits_remove ( .data_in (bus_out_if[i].rsp_data.tag), + .sel_out (rsp_sel_in[i]), .data_out (rsp_tag_out) ); - assign rsp_valid_in[i] = bus_out_if[i].rsp_valid; - assign 
rsp_data_in[i] = {rsp_tag_out, bus_out_if[i].rsp_data.data}; + assign rsp_data_in[i] = {bus_out_if[i].rsp_data.data, rsp_tag_out}; assign bus_out_if[i].rsp_ready = rsp_ready_in[i]; - - if (NUM_INPUTS > 1) begin - assign rsp_sel_in[i] = bus_out_if[i].rsp_data.tag[TAG_SEL_IDX +: LOG_NUM_REQS]; - end else begin - assign rsp_sel_in[i] = '0; - end - end + end VX_stream_switch #( .NUM_INPUTS (NUM_OUTPUTS), @@ -154,14 +142,11 @@ module VX_mem_arb #( .ready_out (rsp_ready_out) ); - end else begin - - for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin + end else begin : g_passthru + + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in assign rsp_valid_in[i] = bus_out_if[i].rsp_valid; - assign rsp_data_in[i] = { - bus_out_if[i].rsp_data.tag, - bus_out_if[i].rsp_data.data - }; + assign rsp_data_in[i] = bus_out_if[i].rsp_data; assign bus_out_if[i].rsp_ready = rsp_ready_in[i]; end @@ -184,13 +169,10 @@ module VX_mem_arb #( ); end - - for (genvar i = 0; i < NUM_INPUTS; ++i) begin + + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_output assign bus_in_if[i].rsp_valid = rsp_valid_out[i]; - assign { - bus_in_if[i].rsp_data.tag, - bus_in_if[i].rsp_data.data - } = rsp_data_out[i]; + assign bus_in_if[i].rsp_data = rsp_data_out[i]; assign rsp_ready_out[i] = bus_in_if[i].rsp_ready; end diff --git a/hw/rtl/mem/VX_mem_bus_if.sv b/hw/rtl/mem/VX_mem_bus_if.sv index 1b7fca777..ccfd51a99 100644 --- a/hw/rtl/mem/VX_mem_bus_if.sv +++ b/hw/rtl/mem/VX_mem_bus_if.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -15,24 +15,30 @@ interface VX_mem_bus_if #( parameter DATA_SIZE = 1, - parameter ATYPE_WIDTH= `ADDR_TYPE_WIDTH, + parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH, parameter TAG_WIDTH = 1, parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, - parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE) + parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE), + parameter UUID_WIDTH = `UUID_WIDTH ) (); + typedef struct packed { + logic [`UP(UUID_WIDTH)-1:0] uuid; + logic [TAG_WIDTH-`UP(UUID_WIDTH)-1:0] value; + } tag_t; + typedef struct packed { logic rw; - logic [DATA_SIZE-1:0] byteen; logic [ADDR_WIDTH-1:0] addr; - logic [ATYPE_WIDTH-1:0] atype; logic [DATA_SIZE*8-1:0] data; - logic [TAG_WIDTH-1:0] tag; + logic [DATA_SIZE-1:0] byteen; + logic [FLAGS_WIDTH-1:0] flags; + tag_t tag; } req_data_t; typedef struct packed { logic [DATA_SIZE*8-1:0] data; - logic [TAG_WIDTH-1:0] tag; + tag_t tag; } rsp_data_t; logic req_valid; diff --git a/hw/rtl/mem/VX_mem_switch.sv b/hw/rtl/mem/VX_mem_switch.sv index fd26c2aa8..0c28883b5 100644 --- a/hw/rtl/mem/VX_mem_switch.sv +++ b/hw/rtl/mem/VX_mem_switch.sv @@ -14,68 +14,88 @@ `include "VX_define.vh" module VX_mem_switch import VX_gpu_pkg::*; #( - parameter NUM_REQS = 1, + parameter NUM_INPUTS = 1, + parameter NUM_OUTPUTS = 1, parameter DATA_SIZE = 1, + parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH, + parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)), parameter TAG_WIDTH = 1, - parameter ADDR_WIDTH = 1, parameter REQ_OUT_BUF = 0, parameter RSP_OUT_BUF = 0, parameter `STRING ARBITER = "R", + parameter NUM_REQS = (NUM_INPUTS > NUM_OUTPUTS) ? 
`CDIV(NUM_INPUTS, NUM_OUTPUTS) : `CDIV(NUM_OUTPUTS, NUM_INPUTS), + parameter SEL_COUNT = `MIN(NUM_INPUTS, NUM_OUTPUTS), parameter LOG_NUM_REQS = `CLOG2(NUM_REQS) ) ( input wire clk, input wire reset, - input wire [`UP(LOG_NUM_REQS)-1:0] bus_sel, - VX_mem_bus_if.slave bus_in_if, - VX_mem_bus_if.master bus_out_if [NUM_REQS] + input wire [SEL_COUNT-1:0][`UP(LOG_NUM_REQS)-1:0] bus_sel, + VX_mem_bus_if.slave bus_in_if [NUM_INPUTS], + VX_mem_bus_if.master bus_out_if [NUM_OUTPUTS] ); localparam DATA_WIDTH = (8 * DATA_SIZE); - localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `ADDR_TYPE_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; + localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + 1 + DATA_SIZE + DATA_WIDTH; localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH; // handle requests //////////////////////////////////////////////////////// - wire [NUM_REQS-1:0] req_valid_out; - wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_out; - wire [NUM_REQS-1:0] req_ready_out; + wire [NUM_INPUTS-1:0] req_valid_in; + wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in; + wire [NUM_INPUTS-1:0] req_ready_in; + + wire [NUM_OUTPUTS-1:0] req_valid_out; + wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out; + wire [NUM_OUTPUTS-1:0] req_ready_out; + + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in + assign req_valid_in[i] = bus_in_if[i].req_valid; + assign req_data_in[i] = bus_in_if[i].req_data; + assign bus_in_if[i].req_ready = req_ready_in[i]; + end VX_stream_switch #( - .NUM_OUTPUTS (NUM_REQS), + .NUM_INPUTS (NUM_INPUTS), + .NUM_OUTPUTS (NUM_OUTPUTS), .DATAW (REQ_DATAW), .OUT_BUF (REQ_OUT_BUF) ) req_switch ( .clk (clk), .reset (reset), .sel_in (bus_sel), - .valid_in (bus_in_if.req_valid), - .data_in (bus_in_if.req_data), - .ready_in (bus_in_if.req_ready), + .valid_in (req_valid_in), + .data_in (req_data_in), + .ready_in (req_ready_in), .valid_out (req_valid_out), .data_out (req_data_out), .ready_out (req_ready_out) ); - for (genvar i = 0; i < NUM_REQS; ++i) begin + for (genvar i = 
0; i < NUM_OUTPUTS; ++i) begin : g_req_data_out assign bus_out_if[i].req_valid = req_valid_out[i]; - assign bus_out_if[i].req_data = req_data_out[i]; + assign bus_out_if[i].req_data = req_data_out[i]; assign req_ready_out[i] = bus_out_if[i].req_ready; end // handle responses /////////////////////////////////////////////////////// - wire [NUM_REQS-1:0] rsp_valid_in; - wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_in; - wire [NUM_REQS-1:0] rsp_ready_in; + wire [NUM_OUTPUTS-1:0] rsp_valid_in; + wire [NUM_OUTPUTS-1:0][RSP_DATAW-1:0] rsp_data_in; + wire [NUM_OUTPUTS-1:0] rsp_ready_in; - for (genvar i = 0; i < NUM_REQS; ++i) begin + wire [NUM_INPUTS-1:0] rsp_valid_out; + wire [NUM_INPUTS-1:0][RSP_DATAW-1:0] rsp_data_out; + wire [NUM_INPUTS-1:0] rsp_ready_out; + + for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_rsp_data_in assign rsp_valid_in[i] = bus_out_if[i].rsp_valid; - assign rsp_data_in[i] = bus_out_if[i].rsp_data; + assign rsp_data_in[i] = bus_out_if[i].rsp_data; assign bus_out_if[i].rsp_ready = rsp_ready_in[i]; end VX_stream_arb #( - .NUM_INPUTS (NUM_REQS), + .NUM_INPUTS (NUM_OUTPUTS), + .NUM_OUTPUTS(NUM_INPUTS), .DATAW (RSP_DATAW), .ARBITER (ARBITER), .OUT_BUF (RSP_OUT_BUF) @@ -85,10 +105,16 @@ module VX_mem_switch import VX_gpu_pkg::*; #( .valid_in (rsp_valid_in), .data_in (rsp_data_in), .ready_in (rsp_ready_in), - .valid_out (bus_in_if.rsp_valid), - .data_out (bus_in_if.rsp_data), - .ready_out (bus_in_if.rsp_ready), + .valid_out (rsp_valid_out), + .data_out (rsp_data_out), + .ready_out (rsp_ready_out), `UNUSED_PIN (sel_out) ); + for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_rsp_data_out + assign bus_in_if[i].rsp_valid = rsp_valid_out[i]; + assign bus_in_if[i].rsp_data = rsp_data_out[i]; + assign rsp_ready_out[i] = bus_in_if[i].rsp_ready; + end + endmodule diff --git a/hw/scripts/ip_gen.sh b/hw/scripts/altera_ip_gen.sh similarity index 100% rename from hw/scripts/ip_gen.sh rename to hw/scripts/altera_ip_gen.sh diff --git a/hw/scripts/bin2coe.py 
b/hw/scripts/bin2coe.py new file mode 100755 index 000000000..eaaa3619e --- /dev/null +++ b/hw/scripts/bin2coe.py @@ -0,0 +1,96 @@ +#!/usr/bin/env python3 + +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os + +def parse_binfile_option(option): + addr, path = option.split(':') + return int(addr, 0), path + +def parse_value_option(option): + addr, value = option.split(':') + return int(addr, 0), value + +def load_binary_data(addr, path, word_size, memory, little_endian): + with open(path, 'rb') as f: + binary_data = f.read() + + word_count = len(binary_data) // word_size + if len(binary_data) % word_size != 0: + word_count += 1 + + for i in range(word_count): + word_data = binary_data[i * word_size: (i + 1) * word_size] + if little_endian: + word_data = word_data[::-1] # Reverse the byte order for little-endian + hex_value = word_data.hex().zfill(word_size * 2) + memory[addr + i] = hex_value + +def add_value_data(addr, value, memory, word_size): + value = value.zfill(word_size * 2) + memory[addr] = value + +def binary_to_coe(output_file, word_size, depth, default_value, memory): + if depth == 0: + depth = max(memory.keys()) + 1 + + with open(output_file, 'w') as coe_file: + coe_file.write("; This file was generated from binary blobs and/or values\n") + coe_file.write("memory_initialization_radix=16;\n") + coe_file.write("memory_initialization_vector=\n") + + for addr in range(depth): + hex_value = memory.get(addr, 
default_value) + coe_file.write(f"{hex_value},\n") + + coe_file.seek(coe_file.tell() - 2) + coe_file.write(";\n") + +def main(): + parser = argparse.ArgumentParser(description="Convert binaries and values to a Xilinx COE file.") + parser.add_argument("--binfile", action='append', help="Binary file with starting address in the format :") + parser.add_argument("--value", action='append', help="Hex value with starting address in the format :") + parser.add_argument("--out", default="output.coe", help="Output file (optional).") + parser.add_argument("--wordsize", type=int, default=4, help="Word size in bytes (default 4).") + parser.add_argument("--depth", type=int, default=0, help="Address size (optional).") + parser.add_argument("--default", default="00", help="Default hex value as string (optional).") + parser.add_argument("--little_endian", action='store_true', help="Interpret binary files as little-endian (default is big-endian).") + + args = parser.parse_args() + + if args.binfile is None and args.value is None: + raise ValueError("At least one --binfile or --value must be provided.") + + # Initialize memory dictionary + memory = {} + + # Process binary files + if args.binfile: + for option in args.binfile: + addr, path = parse_binfile_option(option) + load_binary_data(addr, path, args.wordsize, memory, args.little_endian) + + # Process individual values + if args.value: + for option in args.value: + addr, value = parse_value_option(option) + add_value_data(addr, value, memory, args.wordsize) + + # Generate the COE file + binary_to_coe(args.out, args.wordsize, args.depth, args.default.zfill(args.wordsize * 2), memory) + +if __name__ == "__main__": + main() diff --git a/hw/scripts/gen_sources.sh b/hw/scripts/gen_sources.sh index 0748b3632..ed9143eb3 100755 --- a/hw/scripts/gen_sources.sh +++ b/hw/scripts/gen_sources.sh @@ -1,18 +1,20 @@ #!/bin/bash # Copyright © 2019-2023 -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this 
file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + defines=() includes=() externs=() @@ -21,40 +23,47 @@ output_file="" define_header="" top_module="" copy_folder="" -prepropressor=0 +preprocessor=0 defines_str="" params_str="" includes_str="" -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +# Helper function to append options +add_option() { + if [ -n "$1" ]; then + echo "$1 $2" + else + echo "$2" + fi +} -# parse command arguments +# Parse command arguments while getopts D:G:T:I:J:O:H:C:Ph flag do case "${flag}" in D) defines+=( ${OPTARG} ) - defines_str+="-D${OPTARG} " + defines_str=$(add_option "$defines_str" "-D${OPTARG}") ;; - G) params_str+="-G${OPTARG} " + G) params_str=$(add_option "$params_str" "-G${OPTARG}") ;; - T) top_module=( ${OPTARG} ) + T) top_module="${OPTARG}" ;; I) includes+=( ${OPTARG} ) - includes_str+="-I${OPTARG} " + includes_str=$(add_option "$includes_str" "-I${OPTARG}") ;; J) externs+=( ${OPTARG} ) - includes_str+="-I${OPTARG} " + includes_str=$(add_option "$includes_str" "-I${OPTARG}") ;; - O) output_file=( ${OPTARG} ) + O) output_file="${OPTARG}" ;; - H) define_header=( ${OPTARG} ) + H) define_header="${OPTARG}" ;; - C) copy_folder=( ${OPTARG} ) + C) copy_folder="${OPTARG}" ;; - P) prepropressor=1 + P) preprocessor=1 ;; - h) echo "Usage: [-D] [-G=] [-T] [-I] [-J] [-O] [-C: copy to] [-H] [-P: macro prepropressing] [-h help]" + h) echo "Usage: [-D] [-G=] [-T] [-I] [-J] [-O] [-C: copy to] [-H] [-P: macro preprocessing] [-h help]" exit 0 ;; 
\?) echo "Invalid option: -$OPTARG" 1>&2 @@ -70,33 +79,32 @@ if [ "$define_header" != "" ]; then # dump defines into a header file for value in ${defines[@]}; do arrNV=(${value//=/ }) - if (( ${#arrNV[@]} > 1 )); - then + if (( ${#arrNV[@]} > 1 )); then echo "\`define ${arrNV[0]} ${arrNV[1]}" else echo "\`define $value" - fi + fi done - } > $define_header + } > "$define_header" fi if [ "$copy_folder" != "" ]; then - # copy source files - mkdir -p $copy_folder + # copy source files + mkdir -p "$copy_folder" for dir in ${includes[@]}; do find "$dir" -maxdepth 1 -type f | while read -r file; do file_ext="${file##*.}" - file_name=$(basename -- $file) - if [ $prepropressor != 0 ] && { [ "$file_ext" == "v" ] || [ "$file_ext" == "sv" ]; }; then + file_name=$(basename -- "$file") + if [ $preprocessor != 0 ] && { [ "$file_ext" == "v" ] || [ "$file_ext" == "sv" ]; }; then if [[ -n "$params_str" && $file_name == "$top_module."* ]]; then temp_file=$(mktemp) - $script_dir/repl_params.py $params_str -T$top_module $file > $temp_file - verilator $defines_str $includes_str -E -P $temp_file > $copy_folder/$file_name + $script_dir/repl_params.py $params_str -T$top_module "$file" > "$temp_file" + verilator $defines_str $includes_str -E -P "$temp_file" > "$copy_folder/$file_name" else - verilator $defines_str $includes_str -E -P $file > $copy_folder/$file_name - fi + verilator $defines_str $includes_str -E -P "$file" > "$copy_folder/$file_name" + fi else - cp $file $copy_folder + cp "$file" "$copy_folder" fi done done @@ -112,7 +120,7 @@ if [ "$output_file" != "" ]; then fi for dir in ${externs[@]}; do - echo "+incdir+$(realpath $dir)" + echo "+incdir+$(realpath "$dir")" done for dir in ${externs[@]}; do @@ -124,24 +132,24 @@ if [ "$output_file" != "" ]; then if [ "$copy_folder" != "" ]; then # dump include directories - echo "+incdir+$(realpath $copy_folder)" + echo "+incdir+$(realpath "$copy_folder")" # dump source files - find "$(realpath $copy_folder)" -maxdepth 1 -type f -name 
"*_pkg.sv" -print - find "$(realpath $copy_folder)" -maxdepth 1 -type f \( -name "*.v" -o -name "*.sv" \) ! -name "*_pkg.sv" -print + find "$(realpath "$copy_folder")" -maxdepth 1 -type f -name "*_pkg.sv" -print + find "$(realpath "$copy_folder")" -maxdepth 1 -type f \( -name "*.v" -o -name "*.sv" \) ! -name "*_pkg.sv" -print else # dump include directories for dir in ${includes[@]}; do - echo "+incdir+$(realpath $dir)" + echo "+incdir+$(realpath "$dir")" done - + # dump source files for dir in ${includes[@]}; do - find "$(realpath $dir)" -maxdepth 1 -type f -name "*_pkg.sv" -print + find "$(realpath "$dir")" -maxdepth 1 -type f -name "*_pkg.sv" -print done for dir in ${includes[@]}; do - find "$(realpath $dir)" -maxdepth 1 -type f \( -name "*.v" -o -name "*.sv" \) ! -name "*_pkg.sv" -print + find "$(realpath "$dir")" -maxdepth 1 -type f \( -name "*.v" -o -name "*.sv" \) ! -name "*_pkg.sv" -print done fi - } > $output_file -fi + } > "$output_file" +fi \ No newline at end of file diff --git a/hw/scripts/ila_insert.tcl b/hw/scripts/ila_insert.tcl new file mode 100644 index 000000000..de9f0eec0 --- /dev/null +++ b/hw/scripts/ila_insert.tcl @@ -0,0 +1,231 @@ +###################################################################### +# Automatically inserts ILA instances in a batch flow, and calls "implement_debug_core". Can also be used in a GUI flow +# This should ONLY be invoked after synthesis, and before opt_design. If opt_design is called first, marked nets may be missing and not found +# Warning: Currently will skip a net if it has no obvious clock domain on the driver. Nets connected to input buffers will be dropped unless "mark_debug_clock" is attached to the net. +# Nets attached to VIO cores have the "mark_debug" attribute, and will be filtered out unless the "mark_debug_valid" attribute is attached. 
+# Supports the following additional attributes beyond "mark_debug" +# attribute mark_debug_valid of X : signal is "true"; -- Marks a net for ILA capture, even if net is also attached to a VIO core +# attribute mark_debug_clock of X : signal is "inst1_bufg/clock"; -- Specifies clock net to use for capturing this net. May create a new ILA core for that clock domain +# attribute mark_debug_depth of X : signal is "4096"; -- overrides default depth for this ILA core. valid values: 1024, 2048, ... 132072. Last attribute that is scanned will win. +# attribute mark_debug_adv_trigger of X : signal is "true"; -- specifies that advanced trigger capability will be added to ILA core +# Engineer: J. McCluskey +proc insert_ila { depth } { + # sequence through debug nets and organize them by clock in the + # clock_list array. Also create max and min array for bus indices + set dbgs [get_nets -hierarchical -filter {MARK_DEBUG}] + if {[llength $dbgs] == 0} { + puts "No debug net found. No ILA cores created" + return + } + + # process list of nets to find and reject nets that are attached to VIO cores. + # This has a side effect that VIO nets can't be monitored with an ILA + # This can be overridden by using the attribute "mark_debug_valid" = "true" on a net like this. + set net_list {} + foreach net $dbgs { + if { [get_property -quiet MARK_DEBUG_VALID $net] != "true" } { + set pin_list [get_pins -of_objects [get_nets -segments $net]] + set not_vio_net 1 + foreach pin $pin_list { + if { [get_property IS_DEBUG_CORE [get_cells -of_object $pin]] == 1 } { + # It seems this net is attached to a debug core (i.e. VIO core) already, so we should skip adding it to the netlist + set not_vio_net 0 + break + } + } + if { $not_vio_net == 1 } { lappend net_list $net; } + } else { + lappend net_list $net + } + } + + # check again to see if we have any nets left now + if {[llength $net_list] == 0} { + puts "All nets with MARK_DEBUG are already connected to VIO cores. 
No ILA cores created" + return + } + + # Now that the netlist has been filtered, determine bus names and clock domains + foreach d $net_list { + # name is root name of a bus, index is the bit index in the bus + set name [regsub {\[[[:digit:]]+\]$} $d {}] + set index [regsub {^.*\[([[:digit:]]+)\]$} $d {\1}] + if {[string is integer -strict $index]} { + if {![info exists max($name)]} { + set max($name) $index + set min($name) $index + } elseif {$index > $max($name)} { + set max($name) $index + } elseif {$index < $min($name)} { + set min($name) $index + } + } else { + set max($name) -1 + } + # Now we search for the local clock net associated with the target net. + # There may be ambiguities or no answer in some cases + if {![info exists clocks($name)]} { + # does MARK_DEBUG_CLOCK decorate this net? If not, then search backwards to the driver cell + set clk_name [get_property -quiet MARK_DEBUG_CLOCK $d] + if { [llength $clk_name] == 0 } { + # trace to the clock net, tracing backwards via the driver pin. + set driver_pin [get_pins -filter {DIRECTION == "OUT" && IS_LEAF == TRUE } -of_objects [ get_nets -segments $d ]] + set driver_cell [get_cells -of_objects $driver_pin] + if { [get_property IS_SEQUENTIAL $driver_cell] == 1 } { + set timing_arc [get_timing_arcs -to $driver_pin] + set cell_clock_pin [get_pins -filter {IS_CLOCK} [get_property FROM_PIN $timing_arc]] + if { [llength $cell_clock_pin] > 1 } { + puts "Error: in insert_ila. Found more than 1 clock pin in driver cell $driver_cell with timing arc $timing_arc for net $d" + continue + } + } else { + # our driver cell is a LUT or LUTMEM in combinatorial mode, we need to trace further. + set paths [get_timing_paths -quiet -through $driver_pin ] + if { [llength $paths] > 0 } { + # note that here we arbitrarily select the start point of the FIRST timing path... there might be multiple clocks with timing paths for this net. + # use MARK_DEBUG_CLOCK to specify another clock in this case. 
+ set cell_clock_pin [get_pins [get_property STARTPOINT_PIN [lindex $paths 0]]] + } else { + # Can't find any timing path, so skip the net, and warn the user. + puts "Critical Warning: from insert_ila.tcl Can't trace any clock domain on driver of net $d" + puts "Please attach the attribute MARK_DEBUG_CLOCK with a string containing the net name of the desired sampling clock, .i.e." + puts "attribute mark_debug_clock of $d : signal is \"inst_bufg/clk\";" + continue + } + } + # clk_net will usually be a list of net segments, which needs filtering to determine the net connected to the driver pin + set clk_net [get_nets -segments -of_objects $cell_clock_pin] + } else { + set clk_net [get_nets -segments $clk_name] + if { [llength $clk_net] == 0 } { puts "MARK_DEBUG_CLOCK attribute on net $d does not match any known net. Please fix."; continue; } + } + # trace forward to net actually connected to clock buffer output, not any of the lower level segment names + set clocks($name) [get_nets -of_objects [get_pins -filter {DIRECTION == "OUT" && IS_LEAF == TRUE } -of_objects $clk_net]] + if { [llength $clocks($name)] == 0 } { + puts "Critical Warning: from insert_ila.tcl Can't trace any clock domain on driver of net $d" + puts "Please attach the attribute MARK_DEBUG_CLOCK with a string containing the net name of the desired sampling clock, .i.e." + puts "attribute mark_debug_clock of $d : signal is \"inst_bufg/clk\";" + continue + } + if {![info exists clock_list($clocks($name))]} { + # found a new clock + puts "New clock found is $clocks($name)" + set clock_list($clocks($name)) [list $name] + set ila_depth($clocks($name)) $depth + set ila_adv_trigger($clocks($name)) false + } else { + lappend clock_list($clocks($name)) $name + } + # Does this net have a "MARK_DEBUG_DEPTH" attribute attached? 
+ set clk_depth [get_property -quiet MARK_DEBUG_DEPTH $d] + if { [llength $clk_depth] != 0 } { + set ila_depth($clocks($name)) $clk_depth + } + # Does this net have a "MARK_DEBUG_ADV_TRIGGER" attribute attached? + set trigger [get_property -quiet MARK_DEBUG_ADV_TRIGGER $d] + if { $trigger == "true" } { + set ila_adv_trigger($clocks($name)) true + } + } + } + + set ila_count 0 + set trig_out "" + set trig_out_ack "" + + if { [llength [array names clock_list]] > 1 } { + set enable_trigger true + } else { + set enable_trigger false + } + + foreach c [array names clock_list] { + # Now build and connect an ILA core for each clock domain + [incr ila_count ] + set ila_inst "ila_$ila_count" + # first verify if depth is a member of the set, 1024, 2048, 4096, 8192, ... 131072 + if { $ila_depth($c) < 1024 || [expr $ila_depth($c) & ($ila_depth($c) - 1)] || $ila_depth($c) > 131072 } { + # Depth is not right... lets fix it, and continue + if { $ila_depth($c) < 1024 } { + set new_depth 1024 + } elseif { $ila_depth($c) > 131072 } { + set new_depth 131072 + } else { + # round value to next highest power of 2, (in log space) + set new_depth [expr 1 << int( log($ila_depth($c))/log(2) + .9999 )] + } + puts "Can't create ILA core $ila_inst with depth of $ila_depth($c)! 
Changed capture depth to $new_depth" + set ila_depth($c) $new_depth + } + # create ILA and connect its clock + puts "Creating ILA $ila_inst with clock $c, capture depth $ila_depth($c) and advanced trigger = $ila_adv_trigger($c)" + create_debug_core $ila_inst ila + if { $ila_adv_trigger($c) } { set mu_cnt 4; } else { set mu_cnt 2; } + set_property C_DATA_DEPTH $ila_depth($c) [get_debug_cores $ila_inst] + set_property C_TRIGIN_EN $enable_trigger [get_debug_cores $ila_inst] + set_property C_TRIGOUT_EN $enable_trigger [get_debug_cores $ila_inst] + set_property C_ADV_TRIGGER $ila_adv_trigger($c) [get_debug_cores $ila_inst] + set_property C_INPUT_PIPE_STAGES 1 [get_debug_cores $ila_inst] + set_property C_EN_STRG_QUAL true [get_debug_cores $ila_inst] + set_property ALL_PROBE_SAME_MU true [get_debug_cores $ila_inst] + set_property ALL_PROBE_SAME_MU_CNT $mu_cnt [get_debug_cores $ila_inst] + set_property port_width 1 [get_debug_ports $ila_inst/clk] + connect_debug_port $ila_inst/clk $c + # hookup trigger ports in a circle if more than one ILA is created + if { $enable_trigger == true } { + create_debug_port $ila_inst trig_in + create_debug_port $ila_inst trig_in_ack + create_debug_port $ila_inst trig_out + create_debug_port $ila_inst trig_out_ack + if { $trig_out != "" } { + connect_debug_port $ila_inst/trig_in [get_nets $trig_out] + } + if { $trig_out_ack != "" } { + connect_debug_port $ila_inst/trig_in_ack [get_nets $trig_out_ack] + } + set trig_out ${ila_inst}_trig_out_$ila_count + create_net $trig_out + connect_debug_port $ila_inst/trig_out [get_nets $trig_out] + set trig_out_ack ${ila_inst}_trig_out_ack_$ila_count + create_net $trig_out_ack + connect_debug_port $ila_inst/trig_out_ack [get_nets $trig_out_ack] + } + # add probes + set nprobes 0 + foreach n [lsort $clock_list($c)] { + set nets {} + if {$max($n) < 0} { + lappend nets [get_nets $n] + } else { + # n is a bus name + for {set i $min($n)} {$i <= $max($n)} {incr i} { + lappend nets [get_nets $n[$i]] + } + } + set 
prb probe$nprobes + if {$nprobes > 0} { + create_debug_port $ila_inst probe + } + set_property port_width [llength $nets] [get_debug_ports $ila_inst/$prb] + connect_debug_port $ila_inst/$prb $nets + incr nprobes + } + } + + # at this point, we need to complete the circular connection of trigger outputs and acks + if { $enable_trigger == true } { + connect_debug_port ila_1/trig_in [get_nets $trig_out] + connect_debug_port ila_1/trig_in_ack [get_nets $trig_out_ack] + } + set project_found [get_projects -quiet] + if { $project_found != "New Project" } { + puts "Saving constraints now in project [current_project -quiet]" + save_constraints_as debug_constraints.xdc + } + + # run ILA cores implementation + implement_debug_core + + # write out probe info file + write_debug_probes -force debug_nets.ltx +} \ No newline at end of file diff --git a/hw/scripts/scope.py b/hw/scripts/scope.py index 5361e8afe..f6d93961b 100755 --- a/hw/scripts/scope.py +++ b/hw/scripts/scope.py @@ -1,12 +1,12 @@ #!/usr/bin/env python3 # Copyright © 2019-2023 -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -19,9 +19,9 @@ import xml.etree.ElementTree as ET import re import json -vl_int_re = re.compile(r"\d+'s*h([\da-fA-F]+)") +vl_int_re = re.compile(r"\d+'s*h([\da-fA-F]+)") -def parse_vl_int(text): +def parse_vl_int(text): str_hex = re.sub(vl_int_re, r'\1', text) return int(str_hex, 16) @@ -33,16 +33,18 @@ def source_loc(xml_doc, xml_loc): end_line = loc[3] end_col = loc[4] file = xml_doc.find(".//file/[@id='" + file_id + "']").get("filename") - return file + " (" + start_line + ":" + start_col + "-" + end_line + ":" + end_col + ")" - + return f"{file} ({start_line}:{start_col}-{end_line}:{end_col})" + def parse_dtype_width(xml_doc, dtype_id): xml_type = xml_doc.find(".//typetable/*[@id='" + dtype_id + "']") - if xml_type.tag == "packarraydtype" or xml_type.tag == "unpackarraydtype": + if xml_type.tag in ["packarraydtype", "unpackarraydtype"]: sub_dtype_id = xml_type.get("sub_dtype_id") base_width = parse_dtype_width(xml_doc, sub_dtype_id) - const = xml_type.iter("const") - left = parse_vl_int(next(const).get("name")) - right = parse_vl_int(next(const).get("name")) + const_iter = xml_type.iter("const") + first_const = next(const_iter) + second_const = next(const_iter) + left = parse_vl_int(first_const.get("name")) + right = parse_vl_int(second_const.get("name")) return base_width * (left - right + 1) elif xml_type.tag == "structdtype": width = 0 @@ -65,31 +67,77 @@ def parse_dtype_width(xml_doc, dtype_id): if left != None and right != None: return int(left) - int(right) + 1 return 1 - + def parse_var_name(xml_doc, xml_node): if xml_node.tag == "varref": return xml_node.get("name") elif xml_node.tag == "varxref": name = xml_node.get("name") dotted = xml_node.get("dotted") - return dotted + '.' 
+ name + return f"{dotted}.{name}" + elif xml_node.tag == "arraysel": + return parse_arraysel_name(xml_doc, xml_node) else: - raise ET.ParseError("invalid probe entry" + source_loc(xml_doc, xml_node.get("loc"))) + raise ET.ParseError("invalid probe entry: tag=" + xml_node.tag + ", " + source_loc(xml_doc, xml_node.get("loc"))) return name -def parse_sel_name(xml_doc, xml_node): - name = parse_var_name(xml_doc, xml_node.find("*")) - const = xml_node.iter("const") - offset = parse_vl_int(next(const).get("name")) - #size = parse_vl_int(next(const).get("name")) - return name + '_' + str(offset) +def parse_sel_field(xml_doc, dtype_id, offset, width): + xml_type = xml_doc.find(".//typetable/*[@id='" + dtype_id + "']") + name = xml_type.get("name") + if xml_type.tag == "structdtype": + bit_offset = 0 + members = list(xml_type.findall("memberdtype")) + members.reverse() + for member in members: + sub_dtype_id = member.get("sub_dtype_id") + member_name = member.get("name") + member_width = parse_dtype_width(xml_doc, sub_dtype_id) + if bit_offset <= offset < bit_offset + member_width: + if width != member_width and sub_dtype_id: + sub_field = parse_sel_field(xml_doc, sub_dtype_id, offset - bit_offset, width) + return f".{member_name}{sub_field}" + else: + return f".{member_name}" + bit_offset += member_width + raise ET.ParseError("invalid probe entry: " + source_loc(xml_doc, xml_type.get("loc"))) + elif xml_type.tag in ["packarraydtype", "unpackarraydtype"]: + sub_dtype_id = xml_type.get("sub_dtype_id") + base_width = parse_dtype_width(xml_doc, sub_dtype_id) + if width > base_width: + return "" + array_index = offset // base_width + sub_offset = offset % base_width + array_sel_name = f"_{array_index}" # array indexing is not supported in VCD + sub_field = parse_sel_field(xml_doc, sub_dtype_id, sub_offset, width) + return f"{array_sel_name}{sub_field}" + elif xml_type.tag == "basicdtype": + if width == 1: + return F"[{offset}]" + end = width - 1 + offset + return 
F"[{end}:{offset}]" + else: + raise ET.ParseError("invalid probe entry: tag=" + xml_type.tag + ", " + source_loc(xml_doc, xml_type.get("loc"))) + return None -def parse_array_name(xml_doc, xml_node): +def parse_sel_name(xml_doc, xml_node): + first_child = xml_node.find("*") + name = parse_var_name(xml_doc, first_child) + dtype_id = first_child.get("dtype_id") + const_iter = xml_node.iter("const") + first_const = next(const_iter) + second_const = next(const_iter) + offset = parse_vl_int(first_const.get("name")) + width = parse_vl_int(second_const.get("name")) + return name + parse_sel_field(xml_doc, dtype_id, offset, width) + +def parse_arraysel_name(xml_doc, xml_node): if xml_node.tag == "arraysel": - name = parse_array_name(xml_doc, xml_node.find("*")) - xml_size = xml_node.find("const").get("name") - array_size = parse_vl_int(xml_size) - name = name + '_' + str(array_size) + first_child = xml_node.find("*") + name = parse_arraysel_name(xml_doc, first_child) + const_iter = xml_node.iter("const") + first_const = next(const_iter) + offset = parse_vl_int(first_const.get("name")) + name = f"{name}_{offset}" # array indexing is not supported in VCD else: name = parse_var_name(xml_doc, xml_node) return name @@ -97,9 +145,10 @@ def parse_array_name(xml_doc, xml_node): def parse_vl_port(xml_doc, xml_node, signals): total_width = 0 if xml_node.tag == "concat": - for xml_child in xml_node.findall("*"): + child_nodes = xml_node.findall("*") + for xml_child in child_nodes: total_width = total_width + parse_vl_port(xml_doc, xml_child, signals) - elif xml_node.tag == "varref" or xml_node.tag == "varxref": + elif xml_node.tag in ["varref", "varxref"]: name = parse_var_name(xml_doc, xml_node) dtype_id = xml_node.get("dtype_id") signal_width = parse_dtype_width(xml_doc, dtype_id) @@ -112,64 +161,84 @@ def parse_vl_port(xml_doc, xml_node, signals): signals.append([name, signal_width]) total_width = total_width + signal_width elif xml_node.tag == "arraysel": - name = 
parse_array_name(xml_doc, xml_node) + name = parse_arraysel_name(xml_doc, xml_node) dtype_id = xml_node.get("dtype_id") signal_width = parse_dtype_width(xml_doc, dtype_id) signals.append([name, signal_width]) total_width = total_width + signal_width else: - raise ET.ParseError("invalid probe entry: " + source_loc(xml_doc, xml_node.get("loc"))) + raise ET.ParseError("invalid probe entry: tag=" + xml_node.tag + ", " + source_loc(xml_doc, xml_node.get("loc"))) + # Check for duplicate signal names + signal_names = [signal[0] for signal in signals] + duplicates = set([name for name in signal_names if signal_names.count(name) > 1]) + if len(duplicates) > 0: + raise ET.ParseError("duplicate signal names: " + ", ".join(duplicates)) return total_width def parse_xml(filename, max_taps): xml_doc = ET.parse(filename) modules = {} xml_modules = xml_doc.findall(".//module/[@origName='VX_scope_tap']") - for xml_module in xml_modules: + for xml_module in xml_modules: scope_id = parse_vl_int(xml_module.find(".//var/[@name='SCOPE_ID']/const").get("name")) - triggerw = parse_vl_int(xml_module.find(".//var/[@name='TRIGGERW']/const").get("name")) + xtriggerw = parse_vl_int(xml_module.find(".//var/[@name='XTRIGGERW']/const").get("name")) + htriggerw = parse_vl_int(xml_module.find(".//var/[@name='HTRIGGERW']/const").get("name")) probew = parse_vl_int(xml_module.find(".//var/[@name='PROBEW']/const").get("name")) module_name = xml_module.get("name") - modules[module_name] = [scope_id, triggerw, probew] + modules[module_name] = [scope_id, xtriggerw, htriggerw, probew] taps = [] - xml_instances = xml_doc.iter("instance") - for xml_instance in xml_instances: + xml_instances = xml_doc.iter("instance") + for xml_instance in xml_instances: if (max_taps != -1 and len(taps) >= max_taps): - break + break defName = xml_instance.get("defName") module = modules.get(defName) if module is None: continue - triggers = [] - probes = [] - w = parse_vl_port(xml_doc, 
xml_instance.find(".//port/[@name='triggers']/*"), triggers) - if w != module[1]: - raise ET.ParseError("invalid triggers width: actual=" + str(w) + ", expected=" + str(module[1])) + + xtriggers = [] + htriggers = [] + probes = [] + + if module[1] > 0: + w = parse_vl_port(xml_doc, xml_instance.find(".//port/[@name='xtriggers']/*"), xtriggers) + if w != module[1]: + raise ET.ParseError("invalid xtriggers width: actual=" + str(w) + ", expected=" + str(module[1])) + + if module[2] > 0: + w = parse_vl_port(xml_doc, xml_instance.find(".//port/[@name='htriggers']/*"), htriggers) + if w != module[2]: + raise ET.ParseError("invalid htriggers width: actual=" + str(w) + ", expected=" + str(module[2])) + w = parse_vl_port(xml_doc, xml_instance.find(".//port/[@name='probes']/*"), probes) - if w != module[2]: - raise ET.ParseError("invalid probes width: actual=" + str(w) + ", expected=" + str(module[2])) + if w != module[3]: + raise ET.ParseError("invalid probes width: actual=" + str(w) + ", expected=" + str(module[3])) + signals = probes - for trigger in triggers: - signals.append(trigger) + for xtrigger in xtriggers: + signals.append(xtrigger) + for htrigger in htriggers: + signals.append(htrigger) + loc = xml_instance.get("loc") hier = xml_doc.find(".//cell/[@loc='" + loc + "']").get("hier") path = hier.rsplit(".", 1)[0] taps.append({"id":module[0], - "width":module[1] + module[2], - "signals":signals, + "width":module[1] + module[2] + module[3], + "signals":signals, "path":path}) return {"version":"0.1.0", "taps":taps} -def main(): +def main(): parser = argparse.ArgumentParser(description='Scope headers generator.') parser.add_argument('-o', nargs='?', default='scope.json', metavar='o', help='Output JSON manifest') parser.add_argument('-n', nargs='?', default=-1, metavar='n', type=int, help='Maximum number of taps to read') parser.add_argument('xml', help='Design XML descriptor file') args = parser.parse_args() #print("args=", args) - scope_taps = parse_xml(args.xml, 
args.n) + scope_taps = parse_xml(args.xml, args.n) with open(args.o, "w") as f: json.dump(scope_taps, f, ensure_ascii=False, indent=4) diff --git a/hw/scripts/xilinx_async_bram_patch.tcl b/hw/scripts/xilinx_async_bram_patch.tcl new file mode 100644 index 000000000..239d8cae6 --- /dev/null +++ b/hw/scripts/xilinx_async_bram_patch.tcl @@ -0,0 +1,718 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +namespace eval vortex { + +variable debug 0 + +proc print_error {msg {do_exit 1}} { + if {$do_exit} { + puts "ERROR: $msg" + exit -1 + } else { + variable debug + if {$debug} {puts "WARNING: $msg"} + } +} + +proc str_replace {str match repl} { + set result "" + regsub $match $str $repl result + return $result +} + +proc regex_escape {str} { + return [string map { + \\ \\\\ + ^ \\^ + . \\. + \[ \\\[ + \] \\\] + \$ \\\$ + \( \\\( + \) \\\) + | \\| + * \\* + + \\+ + ? \\? 
+ \{ \\\{ + \} \\\} + } $str] +} + +proc unique_cell_name {name} { + if {[get_cells -quiet $name] == {}} { return $name } + set index 0 + while {[get_cells -quiet ${name}_${index}] != {}} { incr index } + return ${name}_${index} +} + +proc unique_net_name {name} { + if {[get_nets -quiet $name] == {}} { return $name } + set index 0 + while {[get_nets -quiet ${name}_${index}] != {}} { incr index } + return ${name}_${index} +} + +proc build_parent_child_map {all_cells} { + set parent_child_map {} + foreach cell $all_cells { + set parent [get_property PARENT $cell] + if {$parent ne ""} { + if {[dict exists $parent_child_map $parent]} { + dict lappend parent_child_map $parent $cell + } else { + dict set parent_child_map $parent [list $cell] + } + } + } + return $parent_child_map +} + +proc find_cell_descendants_recursive {parent_cell parent_child_map} { + set descendants {} + if {[dict exists $parent_child_map $parent_cell]} { + set children [dict get $parent_child_map $parent_cell] + foreach child $children { + # Add the child to the list + lappend descendants $child + # Recursively add its descendants + set sub_descendants [find_cell_descendants_recursive $child $parent_child_map] + lappend descendants {*}$sub_descendants + } + } + return $descendants +} + +proc find_cell_descendants {parent_cell} { + set all_cells [get_cells -hierarchical] + set parent_child_map [build_parent_child_map $all_cells] + return [find_cell_descendants_recursive $parent_cell $parent_child_map] +} + +proc find_nested_cells {parent_cell name_match {should_exist 1}} { + set hier_sep [get_hierarchy_separator] + set matching_cells {} + foreach cell [find_cell_descendants $parent_cell] { + set parent_name [get_property PARENT $cell] + set cell_name [get_property NAME $cell] + set name_prefix [regex_escape "${parent_name}${hier_sep}"] + set pattern "${name_prefix}${name_match}" + if {[regexp $pattern $cell_name]} { + lappend matching_cells $cell + } + } + if {[llength $matching_cells] == 0} { + 
print_error "No matching cell found for '$parent_cell' matching '$name_match'." $should_exist + } + return $matching_cells +} + +proc find_cell_nets {cell name_match {should_exist 1}} { + set matching_nets {} + foreach net [get_nets -hierarchical -filter "PARENT_CELL == $cell"] { + set name [get_property NAME $net] + if {[regexp $name_match $name]} { + lappend matching_nets $net + } + } + if {[llength $matching_nets] == 0} { + print_error "No matching net found for '$cell' matching '$name_match'." $should_exist + } + return $matching_nets +} + +proc find_cell_net {cell name_match {should_exist 1}} { + set nets [find_cell_nets $cell $name_match $should_exist] + if {[llength $nets] == 0} { + return "" + } elseif {[llength $nets] > 1} { + puts "ERROR: Multiple matching nets found for '$cell' matching '$name_match'." + exit -1 + } + return [lindex $nets 0] +} + +proc get_cell_net {cell name} { + set net [get_nets -hierarchical -filter "PARENT_CELL == $cell && NAME == $name"] + if {[llength $net] == 0} { + puts "ERROR: No matching net found for '$cell' matching '$name'." + exit -1 + } + return $net; +} + +proc find_cell_pins {cell name_match {should_exist 1}} { + set hier_sep [get_hierarchy_separator] + set matching_pins {} + foreach pin [get_pins -of_objects $cell] { + set name [get_property NAME $pin] + set name_prefix [regex_escape "${cell}${hier_sep}"] + set pattern "${name_prefix}${name_match}" + if {[regexp $pattern $name]} { + lappend matching_pins $pin + } + } + if {[llength $matching_pins] == 0} { + print_error "No matching pin found for '$cell' matching '$name_match'." $should_exist + } + return $matching_pins +} + +proc get_cell_pin {cell name} { + set pin [get_pins -of_objects $cell -filter "NAME == $name"] + if {[llength $pin] == 0} { + puts "ERROR: No matching pin found for '$cell' matching '$name'." 
+ exit -1 + } + return $pin +} + +proc remove_cell_from_netlist {cell} { + variable debug + + # Disconnect all pins of the cell + foreach pin [get_pins -quiet -of_objects $cell] { + foreach net [get_nets -quiet -of_objects $pin] { + disconnect_net -net $net -objects $pin + if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."} + } + } + + # Remove the cell + remove_cell $cell + if {$debug} {puts "DEBUG: Cell '$cell' was removed successfully."} +} + +proc find_net_driver {taregt_net {should_exist 1}} { + set driverPins [get_pins -quiet -leaf -of_objects $taregt_net -filter {DIRECTION == "OUT"}] + if {[llength $driverPins] == 0} { + set driverPorts [get_ports -quiet -of_objects $taregt_net -filter {DIRECTION == "IN"}] + if {[llength $driverPorts] == 0} { + print_error "No driver found for '$taregt_net'." $should_exist + } elseif {[llength $driverPorts] > 1} { + puts "WARNING: Multiple driver ports found for '$taregt_net'." + return [lindex $driverPorts 0] + } + return $driverPorts + } elseif {[llength $driverPins] > 1} { + puts "WARNING: Multiple driver pins found for '$taregt_net'." + return [lindex $driverPins 0] + } + return $driverPins +} + +proc find_pin_driver {target_pin {should_exist 1}} { + set net [get_nets -quiet -of_objects $target_pin] + if {[llength $net] == 0} { + print_error "No net connected to pin '$target_pin'." $should_exist + return "" + } elseif {[llength $net] > 1} { + puts "ERROR: Multiple nets connected to pin '$target_pin'." + exit -1 + } + return [find_net_driver $net] +} + +proc create_register_next {parent reg_cell raddr_reset} { + variable debug + + set hier_sep [get_hierarchy_separator] + + set reg_d_pin [get_pins "${reg_cell}${hier_sep}D"] + if {[llength $reg_d_pin] == 0} { + puts "ERROR: No D pin found on register cell '$reg_cell'." + exit -1 + } elseif {[llength $reg_d_pin] > 1} { + puts "ERROR: Multiple D pins found on register cell '$reg_cell'." 
+ exit -1 + } + + if {$debug} {puts "DEBUG: reg_d_pin: '$reg_d_pin'"} + + set reg_d_src_pin [find_pin_driver $reg_d_pin] + if {$reg_d_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_d_pin'." + exit -1 + } + + if {$debug} {puts "DEBUG: reg_d_src_pin: '$reg_d_src_pin'"} + + if {$raddr_reset == ""} { + return $reg_d_src_pin + } + + set reg_r_src_pin "" + + set register_type [get_property REF_NAME $reg_cell] + if {$register_type == "FDRE"} { + set reg_r_pin [get_pins "${reg_cell}${hier_sep}R"] + if {[llength $reg_r_pin] == 0} { + puts "ERROR: No R pin found on FDRE cell '$reg_cell'." + exit -1 + } elseif {[llength $reg_r_pin] > 1} { + puts "ERROR: Multiple R pins found on FDRE cell '$reg_cell'." + exit -1 + } + + if {$debug} {puts "DEBUG: reg_r_pin: '$reg_r_pin'"} + + set reg_r_src_pin [find_pin_driver $reg_r_pin] + if {$reg_r_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_r_pin'." + exit -1 + } + } elseif {$register_type == "FDSE"} { + set reg_s_pin [get_pins "${reg_cell}${hier_sep}S"] + if {[llength $reg_s_pin] == 0} { + puts "ERROR: No S pin found on FDSE cell '$reg_cell'." + exit -1 + } elseif {[llength $reg_s_pin] > 1} { + puts "ERROR: Multiple S pins found on FDSE cell '$reg_cell'." + exit -1 + } + + if {$debug} {puts "DEBUG: reg_s_pin: '$reg_s_pin'"} + + set reg_r_src_pin [find_pin_driver $reg_s_pin] + if {$reg_r_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_s_pin'." + exit -1 + } + } else { + puts "ERROR: Unsupported register type: '$register_type'." + exit 1 + } + + if {$debug} {puts "DEBUG: reg_r_src_pin: '$reg_r_src_pin'"} + + set reg_d_src_net [get_nets -of_objects $reg_d_src_pin] + if {[llength $reg_d_src_net] == 0} { + puts "ERROR: Unable to get source nets for pins." + exit -1 + } elseif {[llength $reg_d_src_net] > 1} { + puts "ERROR: Multiple source nets found for pins." 
+ exit -1 + } + + set reg_r_src_net [get_nets -of_objects $reg_r_src_pin] + if {[llength $reg_r_src_net] == 0} { + puts "ERROR: Unable to get source nets for pins." + exit -1 + } elseif {[llength $reg_r_src_net] > 1} { + puts "ERROR: Multiple source nets found for pins." + exit -1 + } + + # Create a MUX cell to implement register next value + # Use a 2x1 LUT to describe the logic: + # FDRE: O = I1 ? 0 : I0; where I0=D, I1=R + # FDSE: O = I1 ? 1 : I0; where I0=D, I1=S + set lut_name [unique_cell_name "${parent}${hier_sep}raddr_next"] + set lut_cell [create_cell -reference LUT2 $lut_name] + if {$debug} {puts "DEBUG: Created lut cell: '$lut_cell'"} + + if {$register_type == "FDRE"} { + set_property INIT 4'b0010 $lut_cell + } elseif {$register_type == "FDSE"} { + set_property INIT 4'b1110 $lut_cell + } else { + puts "ERROR: Unsupported register type: '$register_type'." + exit 1 + } + + set lut_i0_pin [get_pins "${lut_cell}${hier_sep}I0"] + if {[llength $lut_i0_pin] == 0} { + puts "ERROR: No I0 pin found on FDSE cell '$lut_cell'." + exit -1 + } elseif {[llength $lut_i0_pin] > 1} { + puts "ERROR: Multiple I0 pins found on FDSE cell '$lut_cell'." + exit -1 + } + + set lut_i1_pin [get_pins "${lut_cell}${hier_sep}I1"] + if {[llength $lut_i1_pin] == 0} { + puts "ERROR: No I1 pin found on FDSE cell '$lut_cell'." + exit -1 + } elseif {[llength $lut_i1_pin] > 1} { + puts "ERROR: Multiple I1 pins found on FDSE cell '$lut_cell'." + exit -1 + } + + set lut_o_pin [get_pins "${lut_cell}${hier_sep}O"] + if {[llength $lut_o_pin] == 0} { + puts "ERROR: No O pin found on FDSE cell '$lut_cell'." + exit -1 + } elseif {[llength $lut_o_pin] > 1} { + puts "ERROR: Multiple O pins found on FDSE cell '$lut_cell'." 
+ exit -1 + } + + connect_net -net $reg_d_src_net -objects $lut_i0_pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$reg_d_src_net' to pin '$lut_i0_pin'."} + + connect_net -net $reg_r_src_net -objects $lut_i1_pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$reg_r_src_net' to pin '$lut_i1_pin'."} + + return $lut_o_pin +} + +proc getOrCreateVCCPin {parent} { + variable debug + + set hier_sep [get_hierarchy_separator] + set cell_name "${parent}${hier_sep}VCC" + + set vcc_cell [get_cells -quiet $cell_name] + if {[llength $vcc_cell] == 0} { + set vcc_cell [create_cell -reference VCC $cell_name] + if {$debug} {puts "DEBUG: Created VCC cell: '$vcc_cell'"} + } elseif {[llength $vcc_cell] > 1} { + puts "ERROR: Multiple VCC cells found with name '$cell_name'." + exit -1 + } + + set vcc_pin [get_pins "${vcc_cell}${hier_sep}P"] + if {[llength $vcc_pin] == 0} { + puts "ERROR: No VCC pin found on VCC cell '$vcc_cell'." + exit -1 + } elseif {[llength $vcc_pin] > 1} { + puts "ERROR: Multiple VCC pins found on VCC cell '$vcc_cell'." + exit -1 + } + + return $vcc_pin +} + +proc getOrCreateGNDPin {parent} { + variable debug + + set hier_sep [get_hierarchy_separator] + set cell_name "${parent}${hier_sep}GND" + + set gnd_cell [get_cells -quiet $cell_name] + if {[llength $gnd_cell] == 0} { + set gnd_cell [create_cell -reference GND $cell_name] + if {$debug} {puts "DEBUG: Created GND cell: '$gnd_cell'"} + } elseif {[llength $gnd_cell] > 1} { + puts "ERROR: Multiple GND cells found with name '$cell_name'." + exit -1 + } + + set gnd_pin [get_pins "${gnd_cell}${hier_sep}G"] + if {[llength $gnd_pin] == 0} { + puts "ERROR: No GND pin found on GND cell '$gnd_cell'." + exit -1 + } elseif {[llength $gnd_pin] > 1} { + puts "ERROR: Multiple GND pins found on GND cell '$gnd_cell'." 
+ exit -1 + } + + return $gnd_pin +} + +proc find_net_sinks {source_net {should_exist 1}} { + set sink_pins {} + # Iterate through all pins connected to the source net + foreach pin [get_pins -quiet -of_objects $source_net] { + set direction [get_property DIRECTION $pin] + # Input pins of nested cells + if {$direction == "IN"} { + lappend sink_pins $pin + } + # Output pins of the parent cell + set pin_cell [get_cells -of_objects $pin] + set is_primitive [get_property IS_PRIMITIVE $pin_cell] + if {$direction == "OUT" && !$is_primitive} { + lappend sink_pins $pin + } + } + # Add any top-module output ports connected to the source net + foreach port [get_ports -quiet -of_objects $source_net -filter {DIRECTION == "OUT"}] { + lappend sink_pins $port + } + if {[llength $sink_pins] == 0} { + print_error "No sink found for '$source_net'." $should_exist + } + return $sink_pins +} + +proc find_matching_nets {cell nets match repl} { + set matching_nets {} + foreach net $nets { + set net_name [str_replace $net $match $repl] + set matching_net [get_cell_net $cell $net_name] + if {$matching_net != ""} { + lappend matching_nets $matching_net + } + } + if {[llength $matching_nets] == 0} { + puts "ERROR: No matching nets found for '$nets'." + exit -1 + } elseif {[llength $matching_nets] != [llength $nets]} { + puts "ERROR: Mismatch in number of matching nets." + exit -1 + } + return $matching_nets +} + +proc find_matching_pins {cell pins match repl} { + set matching_pins {} + foreach pin $pins { + set pin_name [str_replace $pin $match $repl] + set matching_pin [get_cell_pin $cell $pin_name] + if {$matching_pin != ""} { + lappend matching_pins $matching_pin + } + } + if {[llength $matching_pins] == 0} { + puts "ERROR: No matching pins found for '$pins'." + exit -1 + } elseif {[llength $matching_pins] != [llength $pins]} { + puts "ERROR: Mismatch in number of matching pins." 
+ exit -1 + } + return $matching_pins +} + +proc replace_net_source {net source_pin} { + variable debug + foreach pin [find_net_sinks $net 0] { + # disconnect net from pin + disconnect_net -net $net -objects $pin + if {$debug} {puts "DEBUG: Disconnected net '$net' from pin '$pin'."} + + # find/create source net + set source_net [get_nets -quiet -of_objects $source_pin] + if {[llength $source_net] == 0} { + # Create a new net (in source_cell's parent) if none exists + set source_cell [get_cells -of_objects $source_pin] + set net_name [unique_net_name "${source_cell}_tmp_net"] + set source_net [create_net $net_name] + if {$debug} {puts "DEBUG: Created source_net: '$source_net'"} + # Connect the source pin to the new net + connect_net -net $source_net -objects $source_pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$source_pin'."} + } elseif {[llength $source_net] > 1} { + puts "ERROR: Multiple nets connected to pin '$source_pin'." + exit -1 + } + + set external_net [get_nets -quiet -of_objects $pin] + if {[llength $external_net] == 0} { + # Connect pin to source net + connect_net -net $source_net -objects $pin -hierarchical + if {$debug} {puts "DEBUG: Connected net '$source_net' to pin '$pin'."} + } elseif {[llength $external_net] == 1} { + foreach external_pin [get_pins -of_objects $external_net] { + # disconnect external net from pin + disconnect_net -net $external_net -objects $pin + if {$debug} {puts "DEBUG: Disconnected net '$external_net' from pin '$pin'."} + # recurse-connect external net's pins to source_pin + replace_net_source $external_net $source_pin + } + } else { + puts "ERROR: Multiple nets connected to pin '$pin'." + exit -1 + } + } +} + +proc resolve_async_bram {inst} { + variable debug + + puts "INFO: Resolving asynchronous BRAM patch: '$inst'." 
+ + set hier_sep [get_hierarchy_separator] + + set raddr_w_nets [find_cell_nets $inst "raddr_w(\\\[\\d+\\\])?$"] + + set read_s_net [find_cell_net $inst "read_s$"] + if {$debug} {puts "DEBUG: read_s_net: '$read_s_net'"} + + set is_raddr_reg_net [find_cell_net $inst "g_async_ram.is_raddr_reg$" 0] + if {$debug} {puts "DEBUG: is_raddr_reg_net: '$is_raddr_reg_net'"} + + set raddr_s_nets [find_matching_nets $inst $raddr_w_nets "raddr_w(\\\[\\d+\\\])?$" "raddr_s\\1"] + + set reg_next_pins {} + set reg_ce_src_pin "" + + set raddr_reset_net [find_cell_net $inst "raddr_reset$" 0] + if {$debug} {puts "DEBUG: raddr_reset: '$raddr_reset_net'"} + + # Process each raddr_w net + foreach raddr_w_net $raddr_w_nets { + if {$debug} {puts "DEBUG: Processing raddr_w net: '$raddr_w_net'"} + + # Find raddr_w_net's driver pin + set raddr_src_pin [find_net_driver $raddr_w_net] + if {$debug} {puts "DEBUG: raddr_src_pin: '$raddr_src_pin'"} + if {[get_ports -quiet $raddr_src_pin] ne ""} { + puts "WARNING: Net '$raddr_w_net' is not registered, driver_type=port" + break + } + + # Get the driver cell + set raddr_src_cell [get_cells -of_objects $raddr_src_pin] + if {[llength $raddr_src_cell] == 0} { + puts "ERROR: No source cell found connected to pin '$raddr_src_pin'." + exit -1 + } elseif {[llength $raddr_src_cell] > 1} { + puts "ERROR: Multiple source cells found connected to pin '$raddr_src_pin'." + exit -1 + } + + # Check driver type + set driver_type [get_property REF_NAME $raddr_src_cell] + if {$driver_type == "FDRE" || $driver_type == "FDSE"} { + if {$debug} {puts "DEBUG: Net '$raddr_w_net' is registered, driver_type='$driver_type'"} + } else { + puts "WARNING: Net '$raddr_w_net' is not registered, driver_type='$driver_type'" + break + } + + # Create register next cell and return output pin + set reg_next_pin [create_register_next $inst $raddr_src_cell $raddr_reset_net] + if {$reg_next_pin == ""} { + puts "ERROR: failed to create register next value for '$raddr_src_cell'." 
+ exit -1 + } + if {$debug} {puts "DEBUG: reg_next_pin: '$reg_next_pin'"} + + lappend reg_next_pins $reg_next_pin + + # Find the CE pin on raddr_src_cell + if {$reg_ce_src_pin == ""} { + set reg_ce_pin [get_pins "${raddr_src_cell}${hier_sep}CE"] + if {[llength $reg_ce_pin] == 0} { + puts "ERROR: No CE pin found on register cell '$raddr_src_cell'." + exit -1 + } elseif {[llength $reg_ce_pin] > 1} { + puts "ERROR: Multiple CE pins found on register cell '$raddr_src_cell'." + exit -1 + } + if {$debug} {puts "DEBUG: reg_ce_pin: '$reg_ce_pin'"} + + set reg_ce_src_pin [find_pin_driver $reg_ce_pin] + if {$reg_ce_src_pin == ""} { + puts "ERROR: No source pin found connected to '$reg_ce_pin'." + exit -1 + } + if {$debug} {puts "DEBUG: reg_ce_src_pin: '$reg_ce_src_pin'"} + } + } + + set addr_width [llength $raddr_w_nets] + + # do we have a fully registered read address? + if {[llength $reg_next_pins] == $addr_width} { + if {$debug} {puts "DEBUG: Fully registered read address detected."} + + # Connect all reg_next_pins to all input pins attached to raddr_s_nets + for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} { + set raddr_s_net [lindex $raddr_s_nets $addr_idx] + set reg_next_pin [lindex $reg_next_pins $addr_idx] + if {$debug} {puts "DEBUG: Connecting pin '$reg_next_pin' net to '$raddr_s_net's pins."} + replace_net_source $raddr_s_net $reg_next_pin + } + + # Connect reg_ce_src_pin to all input pins attached to read_s_net + if {$debug} {puts "DEBUG: Connecting pin '$reg_ce_src_pin' net to '$read_s_net's pins."} + replace_net_source $read_s_net $reg_ce_src_pin + + if {$is_raddr_reg_net != ""} { + # Create Const<1>'s pin + set vcc_pin [getOrCreateVCCPin $inst] + + # Connect vcc_pin to all input pins attached to is_raddr_reg_net + if {$debug} {puts "DEBUG: Connecting pin '$vcc_pin' to net '$is_raddr_reg_net's pins."} + replace_net_source $is_raddr_reg_net $vcc_pin + } + } else { + if {$is_raddr_reg_net == ""} { + puts "ERROR: read address not fully registered!" 
+ exit -1 + } else { + puts "WARNING: read address not fully registered!" + } + + # Create Const<0>'s pin + set gnd_pin [getOrCreateGNDPin $inst] + + # Connect GND to all input pins attached to raddr_s_nets + for {set addr_idx 0} {$addr_idx < $addr_width} {incr addr_idx} { + set raddr_s_net [lindex $raddr_s_nets $addr_idx] + if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' net to '$raddr_s_net's pins."} + replace_net_source $raddr_s_net $gnd_pin + } + + # Connect GND to all input pins attached to read_s_net + if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' net to '$read_s_net's pins."} + replace_net_source $read_s_net $gnd_pin + + # Connect gnd_pin to all input pins attached to is_raddr_reg_net + if {$debug} {puts "DEBUG: Connecting pin '$gnd_pin' to net '$is_raddr_reg_net's pins."} + replace_net_source $is_raddr_reg_net $gnd_pin + } + + # Remove placeholder cells + foreach cell [find_nested_cells $inst "placeholder1$"] { + remove_cell_from_netlist $cell + } + if {$is_raddr_reg_net != ""} { + foreach cell [find_nested_cells $inst "g_async_ram.placeholder2$"] { + remove_cell_from_netlist $cell + } + } +} + +proc resolve_async_brams {} { + variable debug + set bram_patch_cells {} + foreach cell [get_cells -hierarchical -filter {REF_NAME =~ "*VX_async_ram_patch*"}] { + if {$debug} {puts "DEBUG: Found async BRAM patch cell: '$cell'."} + lappend bram_patch_cells $cell + } + if {[llength $bram_patch_cells] != 0} { + foreach cell $bram_patch_cells { + resolve_async_bram $cell + } + } else { + puts "INFO: No async BRAM patch cells found in the design." + } +} + +proc dump_async_bram_cells {} { + set bram_patch_cells [get_cells -hierarchical -filter {REF_NAME =~ "*VX_async_ram_patch*"}] + if {[llength $bram_patch_cells] != 0} { + foreach cell $bram_patch_cells { + puts "INFO: Found async BRAM patch cell: '$cell'." 
+ set child_cells [find_cell_descendants $cell] + foreach child $child_cells { + set type [get_property REF_NAME $child] + puts "INFO: child cell: '$child', type: '$type'" + } + } + } else { + puts "INFO: No async BRAM patch cells found in the design." + } +} + +} + +# Invoke the procedure to resolve async BRAM +vortex::resolve_async_brams + +# dump async bram cells +#vortex::dump_async_bram_cells diff --git a/hw/scripts/xilinx_export_netlist.tcl b/hw/scripts/xilinx_export_netlist.tcl new file mode 100644 index 000000000..a6ff22ff5 --- /dev/null +++ b/hw/scripts/xilinx_export_netlist.tcl @@ -0,0 +1,84 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Function to export netlist to a Graphviz DOT file +proc export_netlist {dot_file_name} { + # Open the DOT file for writing + set dot_file [open $dot_file_name "w"] + + # Start the DOT graph definition + puts $dot_file "digraph Netlist {" + puts $dot_file "rankdir=LR;" ;# Set the graph direction from left to right + + # Extract and add cells to the graph + foreach cell [get_cells -hierarchical] { + set cell_name [get_property NAME $cell] + set cell_type [get_property REF_NAME $cell] + puts $dot_file "\"$cell_name\" \[label=\"$cell_name\\n($cell_type)\", shape=box\];" + } + + # Extract and add ports to the graph + foreach port [get_ports] { + set port_name [get_property NAME $port] + set direction [get_property DIRECTION $port] + set shape "ellipse" + + # Color code input and output ports for easier identification + if {$direction == "IN"} { + set color "lightblue" + } else { + set color "lightgreen" + } + puts $dot_file "\"$port_name\" \[label=\"$port_name\", shape=$shape, style=filled, fillcolor=$color\];" + } + + # Traverse nets and create edges between ports and pins + foreach net [get_nets -hierarchical] { + set net_name [get_property NAME $net] + + # Find source and destination pins + set source_pin "" + set sink_pins {} + + foreach pin [get_pins -of_objects $net] { + set direction [get_property DIRECTION $pin] + set cell [get_cells -of_objects $pin] + set pin_name [get_property NAME $pin] + + if {$direction == "OUT"} { + # Set as source pin + set source_pin "$cell/$pin_name" + } else { + # Collect as sink pin + lappend sink_pins "$cell/$pin_name" + } + } + + # Output edges from source to all sinks + if {$source_pin != ""} { + foreach sink_pin $sink_pins { + puts $dot_file "\"$source_pin\" -> \"$sink_pin\" \[label=\"$net_name\"\];" + } + } + } + + # End the DOT graph definition + puts $dot_file "}" + + # Close the DOT file + close $dot_file + puts "Netlist exported to DOT file: $dot_file_name" +} + +# Run the export function +export_netlist "netlist.dot" \ 
No newline at end of file diff --git a/hw/syn/xilinx/xrt/scripts/gen_ip.tcl b/hw/scripts/xilinx_ip_gen.tcl similarity index 86% rename from hw/syn/xilinx/xrt/scripts/gen_ip.tcl rename to hw/scripts/xilinx_ip_gen.tcl index 5aae6db74..a1048fc77 100644 --- a/hw/syn/xilinx/xrt/scripts/gen_ip.tcl +++ b/hw/scripts/xilinx_ip_gen.tcl @@ -1,31 +1,36 @@ # Copyright © 2019-2023 -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -if { $::argc != 1 } { - puts "ERROR: Program \"$::argv0\" requires 1 arguments!\n" - puts "Usage: $::argv0 \n" +if { $::argc < 1 || $::argc > 2 } { + puts "ERROR: Program \"$::argv0\" requires 1 or 2 arguments!\n" + puts "Usage: $::argv0 []\n" exit } set ip_dir [lindex $::argv 0] +# create_ip requires that a project is open in memory. +if { $::argc == 2 } { + set device_part [lindex $::argv 1] + create_project -in_memory -part $device_part +} else { + # Create project without specifying a device part + create_project -in_memory +} + # IP folder does not exist. Create IP folder file mkdir ${ip_dir} -# create_ip requires that a project is open in memory. 
-# Create project but don't do anything with it -create_project -in_memory - create_ip -name floating_point -vendor xilinx.com -library ip -version 7.1 -module_name xil_fdiv -dir ${ip_dir} set_property -dict [list CONFIG.Component_Name {xil_fdiv} CONFIG.Operation_Type {Divide} CONFIG.Flow_Control {NonBlocking} CONFIG.Has_ACLKEN {true} CONFIG.C_Has_UNDERFLOW {true} CONFIG.C_Has_OVERFLOW {true} CONFIG.C_Has_INVALID_OP {true} CONFIG.C_Has_DIVIDE_BY_ZERO {true} CONFIG.A_Precision_Type {Single} CONFIG.C_A_Exponent_Width {8} CONFIG.C_A_Fraction_Width {24} CONFIG.Result_Precision_Type {Single} CONFIG.C_Result_Exponent_Width {8} CONFIG.C_Result_Fraction_Width {24} CONFIG.C_Mult_Usage {No_Usage} CONFIG.Has_RESULT_TREADY {false} CONFIG.C_Latency {28} CONFIG.C_Rate {1}] [get_ips xil_fdiv] diff --git a/hw/syn/altera/README b/hw/syn/altera/README index 11d048442..3f9168d5c 100644 --- a/hw/syn/altera/README +++ b/hw/syn/altera/README @@ -10,10 +10,10 @@ cd build_fpga && qsub-synth # check last 10 lines in build log for possible errors tail -n 10 ./build_arria10_fpga_1c/build.log -# Check if the job is submitted to the queue and running. Status should be R +# Check if the job is submitted to the queue and running. Status should be R qstat | grep -# Constantly monitoring the job submitted to the queue. Stop this using Ctrl+C +# Constantly monitoring the job submitted to the queue. 
Stop this using Ctrl+C watch ‘qstat | grep ’ # @@ -35,7 +35,7 @@ fpgaconf --bus 0xaf /synth/vortex_afu.gbs # get portid fpgainfo port -# Running the Test case +# Running the Test case cd /driver/tests/basic make run-fpga @@ -54,13 +54,9 @@ TARGET=asesim make -C runtime/opae PREFIX=build_base CONFIGS="-DEXT_F_DISABLE -DL1_DISABLE -DSM_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" TARGET=asesim make # ASE test runs -./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t0 -./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n1 -t1 -./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/basic/basic -n16 -./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/demo/demo -n16 -./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/regression/dogfood/dogfood -n16 -./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/vecadd/vecadd -./run_ase.sh build_base_arria10_asesim_1c ../../../../tests/opencl/sgemm/sgemm -n4 +start_ase.sh +ASE_LOG=0 ASE_WORKDIR=/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=vecadd +stop_ase.sh # modify "vsim_run.tcl" to dump VCD trace vcd file trace.vcd diff --git a/hw/syn/altera/quartus/Makefile b/hw/syn/altera/dut/Makefile similarity index 70% rename from hw/syn/altera/quartus/Makefile rename to hw/syn/altera/dut/Makefile index d0a2999bd..173408eca 100644 --- a/hw/syn/altera/quartus/Makefile +++ b/hw/syn/altera/dut/Makefile @@ -9,26 +9,26 @@ SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts IP_CACHE_DIR := $(ROOT_DIR)/hw/syn/altera/ip_cache/$(DEVICE_FAMILY) -.PHONY: dogfood unittest pipeline lmem cache fpu core issue vortex top test +.PHONY: unittest scope mem_unit lmem cache fpu core issue vortex top ip-gen: $(IP_CACHE_DIR)/ip_gen.log $(IP_CACHE_DIR)/ip_gen.log: - $(SCRIPT_DIR)/ip_gen.sh $(IP_CACHE_DIR) - -dogfood: - mkdir -p dogfood/$(BUILD_DIR) - cp dogfood/Makefile dogfood/$(BUILD_DIR) - $(MAKE) -C dogfood/$(BUILD_DIR) clean && 
$(MAKE) -C dogfood/$(BUILD_DIR) > dogfood/$(BUILD_DIR)/build.log 2>&1 & + $(SCRIPT_DIR)/altera_ip_gen.sh $(IP_CACHE_DIR) unittest: mkdir -p unittest/$(BUILD_DIR) cp unittest/Makefile unittest/$(BUILD_DIR) $(MAKE) -C unittest/$(BUILD_DIR) clean && $(MAKE) -C unittest/$(BUILD_DIR) > unittest/$(BUILD_DIR)/build.log 2>&1 & -pipeline: - mkdir -p pipeline/$(BUILD_DIR) - cp pipeline/Makefile pipeline/$(BUILD_DIR) - $(MAKE) -C pipeline/$(BUILD_DIR) clean && $(MAKE) -C pipeline/$(BUILD_DIR) > pipeline/$(BUILD_DIR)/build.log 2>&1 & +scope: + mkdir -p scope/$(BUILD_DIR) + cp scope/Makefile scope/$(BUILD_DIR) + $(MAKE) -C scope/$(BUILD_DIR) clean && $(MAKE) -C scope/$(BUILD_DIR) > scope/$(BUILD_DIR)/build.log 2>&1 & + +mem_unit: + mkdir -p mem_unit/$(BUILD_DIR) + cp mem_unit/Makefile mem_unit/$(BUILD_DIR) + $(MAKE) -C mem_unit/$(BUILD_DIR) clean && $(MAKE) -C mem_unit/$(BUILD_DIR) > mem_unit/$(BUILD_DIR)/build.log 2>&1 & lmem: mkdir -p lmem/$(BUILD_DIR) @@ -63,9 +63,4 @@ vortex: ip-gen top: ip-gen mkdir -p top/$(BUILD_DIR) cp top/Makefile top/$(BUILD_DIR) - $(MAKE) -C top/$(BUILD_DIR) clean && $(MAKE) -C top/$(BUILD_DIR) > top/$(BUILD_DIR)/build.log 2>&1 & - -test: ip-gen - mkdir -p test/$(BUILD_DIR) - cp test/Makefile test/$(BUILD_DIR) - $(MAKE) -C test/$(BUILD_DIR) clean && $(MAKE) -C test/$(BUILD_DIR) > test/$(BUILD_DIR)/build.log 2>&1 & + $(MAKE) -C top/$(BUILD_DIR) clean && $(MAKE) -C top/$(BUILD_DIR) > top/$(BUILD_DIR)/build.log 2>&1 & \ No newline at end of file diff --git a/hw/syn/altera/quartus/cache/Makefile b/hw/syn/altera/dut/cache/Makefile similarity index 100% rename from hw/syn/altera/quartus/cache/Makefile rename to hw/syn/altera/dut/cache/Makefile diff --git a/hw/syn/altera/quartus/common.mk b/hw/syn/altera/dut/common.mk similarity index 97% rename from hw/syn/altera/quartus/common.mk rename to hw/syn/altera/dut/common.mk index 3890dcfe8..1adcb3d49 100644 --- a/hw/syn/altera/quartus/common.mk +++ b/hw/syn/altera/dut/common.mk @@ -1,7 +1,7 @@ ROOT_DIR := 
$(realpath ../../../../../..) include $(ROOT_DIR)/config.mk -SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/quartus +SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/dut RTL_DIR := $(VORTEX_HOME)/hw/rtl AFU_DIR := $(RTL_DIR)/afu/opae @@ -21,7 +21,6 @@ endif CONFIGS += -DNDEBUG CONFIGS += -DQUARTUS CONFIGS += -DSYNTHESIS -CONFIGS += -DNOGLOBALS PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf diff --git a/hw/syn/altera/quartus/core/Makefile b/hw/syn/altera/dut/core/Makefile similarity index 66% rename from hw/syn/altera/quartus/core/Makefile rename to hw/syn/altera/dut/core/Makefile index eeeaa5233..c78c4a651 100644 --- a/hw/syn/altera/quartus/core/Makefile +++ b/hw/syn/altera/dut/core/Makefile @@ -9,6 +9,6 @@ include ../../common.mk FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE) \ No newline at end of file diff --git a/hw/syn/altera/quartus/fpu/Makefile b/hw/syn/altera/dut/fpu/Makefile similarity index 58% rename from hw/syn/altera/quartus/fpu/Makefile rename to hw/syn/altera/dut/fpu/Makefile index b7826dc68..38d5c718c 100644 --- a/hw/syn/altera/quartus/fpu/Makefile +++ b/hw/syn/altera/dut/fpu/Makefile @@ -6,6 +6,6 @@ include ../../common.mk FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl 
-J$(THIRD_PARTY_DIR)/fpnew/src + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(IP_CACHE_DIR) diff --git a/hw/syn/altera/quartus/issue/Makefile b/hw/syn/altera/dut/issue/Makefile similarity index 66% rename from hw/syn/altera/quartus/issue/Makefile rename to hw/syn/altera/dut/issue/Makefile index c1804a398..45f6981d6 100644 --- a/hw/syn/altera/quartus/issue/Makefile +++ b/hw/syn/altera/dut/issue/Makefile @@ -9,6 +9,6 @@ include ../../common.mk FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem $(FPU_INCLUDE) -I$(IP_CACHE_DIR) $(FPU_INCLUDE) \ No newline at end of file diff --git a/hw/syn/altera/quartus/lmem/Makefile b/hw/syn/altera/dut/lmem/Makefile similarity index 100% rename from hw/syn/altera/quartus/lmem/Makefile rename to hw/syn/altera/dut/lmem/Makefile diff --git a/hw/syn/altera/dut/mem_unit/Makefile b/hw/syn/altera/dut/mem_unit/Makefile new file mode 100755 index 000000000..209492265 --- /dev/null +++ b/hw/syn/altera/dut/mem_unit/Makefile @@ -0,0 +1,7 @@ +PROJECT = VX_mem_unit_top +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/core -I$(RTL_DIR)/fpu diff 
--git a/hw/syn/altera/quartus/project.sdc b/hw/syn/altera/dut/project.sdc similarity index 100% rename from hw/syn/altera/quartus/project.sdc rename to hw/syn/altera/dut/project.sdc diff --git a/hw/syn/altera/quartus/project.tcl b/hw/syn/altera/dut/project.tcl similarity index 100% rename from hw/syn/altera/quartus/project.tcl rename to hw/syn/altera/dut/project.tcl diff --git a/hw/syn/altera/dut/scope/Makefile b/hw/syn/altera/dut/scope/Makefile new file mode 100755 index 000000000..405f05e8a --- /dev/null +++ b/hw/syn/altera/dut/scope/Makefile @@ -0,0 +1,7 @@ +PROJECT = VX_scope_tap +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs diff --git a/hw/syn/altera/quartus/timing-html.tcl b/hw/syn/altera/dut/timing-html.tcl similarity index 100% rename from hw/syn/altera/quartus/timing-html.tcl rename to hw/syn/altera/dut/timing-html.tcl diff --git a/hw/syn/altera/dut/top/Makefile b/hw/syn/altera/dut/top/Makefile new file mode 100644 index 000000000..aa897579d --- /dev/null +++ b/hw/syn/altera/dut/top/Makefile @@ -0,0 +1,20 @@ +PROJECT = vortex_afu +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +# AFU parameters +CONFIGS += -DNOPAE +CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY + +#CONFIGS += -DNUM_CORES=2 +#CONFIGS += -DNUM_WARPS=32 +#CONFIGS += -DNUM_THREADS=32 +#CONFIGS += -DL2_ENABLE + +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(AFU_DIR) -I$(AFU_DIR)/ccip -I$(IP_CACHE_DIR) $(FPU_INCLUDE) diff --git a/hw/syn/altera/quartus/unittest/Makefile b/hw/syn/altera/dut/unittest/Makefile similarity 
index 55% rename from hw/syn/altera/quartus/unittest/Makefile rename to hw/syn/altera/dut/unittest/Makefile index 2bfb18e4e..3539c23b6 100644 --- a/hw/syn/altera/quartus/unittest/Makefile +++ b/hw/syn/altera/dut/unittest/Makefile @@ -6,6 +6,7 @@ include ../../common.mk FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif -RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE) \ No newline at end of file +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE) +RTL_INCLUDE = -I.. 
\ No newline at end of file diff --git a/hw/syn/altera/quartus/vortex/Makefile b/hw/syn/altera/dut/vortex/Makefile similarity index 68% rename from hw/syn/altera/quartus/vortex/Makefile rename to hw/syn/altera/dut/vortex/Makefile index 7429df414..80c256021 100644 --- a/hw/syn/altera/quartus/vortex/Makefile +++ b/hw/syn/altera/dut/vortex/Makefile @@ -11,6 +11,6 @@ include ../../common.mk FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE) diff --git a/hw/syn/altera/opae/Makefile b/hw/syn/altera/opae/Makefile index 62a9bb72c..339c40f70 100644 --- a/hw/syn/altera/opae/Makefile +++ b/hw/syn/altera/opae/Makefile @@ -5,7 +5,6 @@ DEVICE_FAMILY ?= arria10 PREFIX ?= build$(XLEN) TARGET ?= fpga -NUM_CORES ?= 1 SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/opae @@ -36,7 +35,6 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU -DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED ifeq ($(DEVICE_FAMILY), stratix10) CONFIGS += -DALTERA_S10 @@ -45,6 +43,7 @@ ifeq ($(DEVICE_FAMILY), arria10) CONFIGS += -DALTERA_A10 endif +ifdef NUM_CORES # cluster configuration CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 @@ -54,11 +53,15 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 CONFIGS += 
$(CONFIGS_$(NUM_CORES)c) +endif -# include paths +# include sources (RTL_PKGS lists SystemVerilog package files that are added to XML_CFLAGS for the ast dump) +RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv +RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src + RTL_PKGS += $(THIRD_PARTY_DIR)/cvfpu/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src/cf_math_pkg.sv $(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(AFU_DIR) -I$(IP_CACHE_DIR) RTL_INCLUDE += $(FPU_INCLUDE) @@ -96,13 +99,13 @@ ifdef PERF endif # ast dump flags -XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DNOPAE +XML_CFLAGS = $(filter-out -DSYNTHESIS -DQUARTUS, $(CFLAGS)) $(RTL_PKGS) -I$(AFU_DIR)/ccip -I$(DPI_DIR) -DPLATFORM_PROVIDES_LOCAL_MEMORY -DPLATFORM_MEMORY_NUM_BANKS=1 -DNOPAE -DSV_DPI all: swconfig ip-gen setup build ip-gen: $(IP_CACHE_DIR)/ip-gen.log $(IP_CACHE_DIR)/ip-gen.log: - $(SCRIPT_DIR)/ip_gen.sh $(IP_CACHE_DIR) + $(SCRIPT_DIR)/altera_ip_gen.sh $(IP_CACHE_DIR) swconfig: vortex_afu.h vortex_afu.h: $(SRC_DIR)/vortex_afu.json diff --git a/hw/syn/altera/opae/run_ase.sh b/hw/syn/altera/opae/start_ase.sh similarity index 74% rename from hw/syn/altera/opae/run_ase.sh rename to hw/syn/altera/opae/start_ase.sh index 04fd27540..d408b2170 100755 --- a/hw/syn/altera/opae/run_ase.sh +++ b/hw/syn/altera/opae/start_ase.sh @@ -17,12 +17,6 @@ SCRIPT_DIR="$( cd "$(
dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" BUILD_DIR=$(realpath $1) -PROGRAM=$(basename "$2") -PROGRAM_DIR=`dirname $2` - -POCL_PATH=$TOOLDIR/pocl -VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime - # Export ASE_WORKDIR variable export ASE_WORKDIR=$BUILD_DIR/synth/work @@ -35,7 +29,6 @@ rm -f $BUILD_DIR/synth/nohup.out pushd $BUILD_DIR/synth echo " [DBG] starting ASE simnulator (stdout saved to '$BUILD_DIR/synth/nohup.out')" setsid make sim &> /dev/null & -SIM_PID=$! popd # Wait for simulator readiness @@ -44,14 +37,3 @@ while [ ! -f $ASE_WORKDIR/.ase_ready.pid ] do sleep 1 done - -# run application -pushd $PROGRAM_DIR -shift 2 -echo " [DBG] running ./$PROGRAM $*" -ASE_LOG=0 LD_LIBRARY_PATH=$POCL_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $* -popd - -# stop the simulator (kill process group) -kill -- -$(ps -o pgid= $SIM_PID | grep -o '[0-9]*') -wait $SIM_PID 2> /dev/null \ No newline at end of file diff --git a/hw/syn/altera/opae/stop_ase.sh b/hw/syn/altera/opae/stop_ase.sh new file mode 100755 index 000000000..caee290db --- /dev/null +++ b/hw/syn/altera/opae/stop_ase.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +BUILD_DIR=$(realpath "$1") + +# Export ASE_WORKDIR variable (directory containing the .ase_ready.pid file read below) +export ASE_WORKDIR=$BUILD_DIR/synth/work + +# stop the simulator: signal its whole process group, then poll (up to ~30s) until the pid is gone +if [ -f "$ASE_WORKDIR/.ase_ready.pid" ]; then + SIM_PID=$(grep '^pid' "$ASE_WORKDIR/.ase_ready.pid" | cut -d'=' -f2 | tr -d ' ') + echo " [DBG] stopping ASE simulator (pid=$SIM_PID)" + kill -- -$(ps -o pgid= "$SIM_PID" | grep -o '[0-9]*') + for _ in $(seq 30); do kill -0 "$SIM_PID" 2> /dev/null || break; sleep 1; done # 'wait' only works on children of this shell +else + echo "ASE PID file does not exist." +fi \ No newline at end of file diff --git a/hw/syn/altera/power_play.sh b/hw/syn/altera/power_play.sh old mode 100644 new mode 100755 diff --git a/hw/syn/altera/quartus/test/Makefile b/hw/syn/altera/quartus/test/Makefile deleted file mode 100644 index 0c4a7ae4e..000000000 --- a/hw/syn/altera/quartus/test/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -PROJECT = Vortex -TOP_LEVEL_ENTITY = $(PROJECT) -SRC_FILE = $(PROJECT).sv - -include ../../common.mk - -FPU_INCLUDE = -I$(RTL_DIR)/fpu -ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src -endif -RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(IP_CACHE_DIR) $(FPU_INCLUDE) diff --git a/hw/syn/altera/quartus/top/Makefile b/hw/syn/altera/quartus/top/Makefile deleted file mode 100644 index 341690206..000000000 --- a/hw/syn/altera/quartus/top/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -PROJECT = vortex_afu -TOP_LEVEL_ENTITY = $(PROJECT) -SRC_FILE = $(PROJECT).sv - -include ../../common.mk - -# AFU parameters -CONFIGS += -DNOPAE -CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BANKS,$(CONFIGS))) - CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=2 -endif -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH,$(CONFIGS))) -
CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=26 -endif -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH,$(CONFIGS))) - CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=512 -endif -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH,$(CONFIGS))) - CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH=4 -endif - -#CONFIGS += -DNUM_CORES=2 -#CONFIGS += -DNUM_WARPS=32 -#CONFIGS += -DNUM_THREADS=32 -#CONFIGS += -DL2_ENABLE - -FPU_INCLUDE = -I$(RTL_DIR)/fpu -ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src -endif -RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(AFU_DIR) -I$(AFU_DIR)/ccip -I$(IP_CACHE_DIR) $(FPU_INCLUDE) diff --git a/hw/syn/xilinx/README b/hw/syn/xilinx/README index 563c4c17e..7fdca65d7 100644 --- a/hw/syn/xilinx/README +++ b/hw/syn/xilinx/README @@ -5,9 +5,12 @@ platforminfo -l xbutil validate --device 0000:09:00.1 --verbose # generate FPU IPs -vivado -mode batch -source scripts/gen_ip.tcl -tclargs ip/xilinx_u50_gen3x16_xdma_5_202210_1 +vivado -mode batch -source xilinx_ip_gen.tcl -tclargs ip/xilinx_u50_gen3x16_xdma_5_202210_1 # build FPGA +PREFIX=build_base_1c NUM_CORES=1 TARGET=hw_emu PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 make > build_u55c_hw_emu_base_1c.log 2>&1 & +PREFIX=build_base_1c NUM_CORES=1 TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 make > build_u55c_hw_base_1c.log 2>&1 & + PREFIX=build_base_1c NUM_CORES=1 TARGET=hw_emu PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make > build_u50_hw_emu_base_1c.log 2>&1 & PREFIX=build_base_1c NUM_CORES=1 TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make > build_u50_hw_base_1c.log 2>&1 & @@ -25,23 +28,37 @@ PREFIX=build TARGET=hw_emu PLATFORM=xilinx_vck5000_gen3x16_xdma_1_202120_1 
make # debug hw_emu using xsim xsim --gui xilinx_u50_gen3x16_xdma_5_202210_1-0-vortex_afu.wdb & -# debug hw using ILA +# h/w debugging using ILA +## (1) check for ILA support platforminfo --json="hardwarePlatform.extensions.chipscope_debug" xilinx_u50_gen3x16_xdma_5_202210_1 +## (2) check for XVC full path to get device id ls /dev/xfpga/xvc_pub* -ls /dev/xvc_pub* -debug_hw --xvc_pcie /dev/xfpga/xvc_pub.u2305.0 --hw_server -debug_hw --xvc_pcie /dev/xvc_pub.u0 --hw_server +## (3) start h/w server +debug_hw --xvc_pcie /dev/xfpga/xvc_pub. --hw_server +## (4) start application and pause +## (5) start vivado to connect to h/w server and select ILA probes debug_hw --vivado --host localhost --ltx_file ./build_xilinx_u50_gen3x16_xdma_5_202210_1_hw/_x/link/vivado/vpl/prj/prj.runs/impl_1/debug_nets.ltx & -make chipscope TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 +## (6) resume application + +# supported ILA Makefile targets +TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make hw_server +TARGET=hw PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 make chipscope # analyze build report vitis_analyzer build_xilinx_u50_gen3x16_xdma_5_202210_1_hw_4c/bin/vortex_afu.xclbin.link_summary +# resuming builds +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.synth" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.opt_design" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.place_design" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.phys_opt_design" make > build.log 2>&1 & +TARGET=hw PLATFORM=xilinx_u55c_gen3x16_xdma_3_202210_1 VPP_FLAGS="--from_step vpl.impl.route_design" make > build.log 2>&1 & + # running test FPGA_BIN_DIR= TARGET=hw_emu
./ci/blackbox.sh --driver=xrt --app=demo -FPGA_BIN_DIR= TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo -FPGA_BIN_DIR= TARGET=hw_emu ./ci/blackbox.sh --driver=xrt --app=demo -FPGA_BIN_DIR= XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=demo +FPGA_BIN_DIR= TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024" +FPGA_BIN_DIR= XRT_DEVICE_INDEX=1 TARGET=hw ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n1024" # build report logs /bin/vortex_afu.xclbin.info diff --git a/hw/syn/xilinx/dut/Makefile b/hw/syn/xilinx/dut/Makefile new file mode 100644 index 000000000..fe37eb4b8 --- /dev/null +++ b/hw/syn/xilinx/dut/Makefile @@ -0,0 +1,58 @@ +ROOT_DIR := $(realpath ../../../..) +include $(ROOT_DIR)/config.mk + +PREFIX ?= build + +BUILD_DIR := $(PREFIX) + +.PHONY: unittest scope mem_unit lmem cache fpu core issue vortex top + +unittest: + mkdir -p unittest/$(BUILD_DIR) + cp unittest/Makefile unittest/$(BUILD_DIR) + $(MAKE) -C unittest/$(BUILD_DIR) clean && $(MAKE) -C unittest/$(BUILD_DIR) > unittest/$(BUILD_DIR)/build.log 2>&1 & + +scope: + mkdir -p scope/$(BUILD_DIR) + cp scope/Makefile scope/$(BUILD_DIR) + $(MAKE) -C scope/$(BUILD_DIR) clean && $(MAKE) -C scope/$(BUILD_DIR) > scope/$(BUILD_DIR)/build.log 2>&1 & + +mem_unit: + mkdir -p mem_unit/$(BUILD_DIR) + cp mem_unit/Makefile mem_unit/$(BUILD_DIR) + $(MAKE) -C mem_unit/$(BUILD_DIR) clean && $(MAKE) -C mem_unit/$(BUILD_DIR) > mem_unit/$(BUILD_DIR)/build.log 2>&1 & + +lmem: + mkdir -p lmem/$(BUILD_DIR) + cp lmem/Makefile lmem/$(BUILD_DIR) + $(MAKE) -C lmem/$(BUILD_DIR) clean && $(MAKE) -C lmem/$(BUILD_DIR) > lmem/$(BUILD_DIR)/build.log 2>&1 & + +cache: + mkdir -p cache/$(BUILD_DIR) + cp cache/Makefile cache/$(BUILD_DIR) + $(MAKE) -C cache/$(BUILD_DIR) clean && $(MAKE) -C cache/$(BUILD_DIR) > cache/$(BUILD_DIR)/build.log 2>&1 & + +fpu: + mkdir -p fpu/$(BUILD_DIR) + cp fpu/Makefile fpu/$(BUILD_DIR) + $(MAKE) -C fpu/$(BUILD_DIR) clean && $(MAKE) -C fpu/$(BUILD_DIR) > 
fpu/$(BUILD_DIR)/build.log 2>&1 & + +core: + mkdir -p core/$(BUILD_DIR) + cp core/Makefile core/$(BUILD_DIR) + $(MAKE) -C core/$(BUILD_DIR) clean && $(MAKE) -C core/$(BUILD_DIR) > core/$(BUILD_DIR)/build.log 2>&1 & + +issue: + mkdir -p issue/$(BUILD_DIR) + cp issue/Makefile issue/$(BUILD_DIR) + $(MAKE) -C issue/$(BUILD_DIR) clean && $(MAKE) -C issue/$(BUILD_DIR) > issue/$(BUILD_DIR)/build.log 2>&1 & + +vortex: + mkdir -p vortex/$(BUILD_DIR) + cp vortex/Makefile vortex/$(BUILD_DIR) + $(MAKE) -C vortex/$(BUILD_DIR) clean && $(MAKE) -C vortex/$(BUILD_DIR) > vortex/$(BUILD_DIR)/build.log 2>&1 & + +top: + mkdir -p top/$(BUILD_DIR) + cp top/Makefile top/$(BUILD_DIR) + $(MAKE) -C top/$(BUILD_DIR) clean && $(MAKE) -C top/$(BUILD_DIR) > top/$(BUILD_DIR)/build.log 2>&1 & \ No newline at end of file diff --git a/hw/syn/xilinx/dut/cache/Makefile b/hw/syn/xilinx/dut/cache/Makefile new file mode 100644 index 000000000..f96a76142 --- /dev/null +++ b/hw/syn/xilinx/dut/cache/Makefile @@ -0,0 +1,7 @@ +PROJECT = VX_cache_top +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache diff --git a/hw/syn/xilinx/dut/common.mk b/hw/syn/xilinx/dut/common.mk new file mode 100644 index 000000000..008e1ed8e --- /dev/null +++ b/hw/syn/xilinx/dut/common.mk @@ -0,0 +1,51 @@ +ROOT_DIR := $(realpath ../../../../../..) +include $(ROOT_DIR)/config.mk + +DEVICE ?= xcu55c-fsvh2892-2L-e + +MAX_JOBS ?= 8 + +VIVADO := $(XILINX_VIVADO)/bin/vivado + +SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/dut + +RTL_DIR := $(VORTEX_HOME)/hw/rtl +AFU_DIR := $(RTL_DIR)/afu/xrt +SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts + +NCPUS := $(shell lscpu | grep "^Core(s) per socket:" | awk '{print $$4}') +JOBS ?= $(shell echo $$(( $(NCPUS) > $(MAX_JOBS) ? 
$(MAX_JOBS) : $(NCPUS) ))) + +CONFIGS += -DNDEBUG +CONFIGS += -DVIVADO +CONFIGS += -DSYNTHESIS + +# Build targets +all: $(PROJECT).xpr + +gen-sources: project_1/sources.txt +project_1/sources.txt: + mkdir -p project_1 + $(SCRIPT_DIR)/gen_sources.sh $(CONFIGS) $(RTL_INCLUDE) -T$(TOP_LEVEL_ENTITY) -P -Cproject_1/src -Oproject_1/sources.txt + +build: $(PROJECT).xpr +$(PROJECT).xpr: project_1/sources.txt +ifdef FPU_IP + MAX_JOBS=$(JOBS) FPU_IP=project_1/ip TOOL_DIR=$(SCRIPT_DIR) $(VIVADO) -mode batch -source $(SRC_DIR)/project.tcl -tclargs $(TOP_LEVEL_ENTITY) $(DEVICE) project_1/sources.txt $(SRC_DIR)/project.xdc +else + MAX_JOBS=$(JOBS) TOOL_DIR=$(SCRIPT_DIR) $(VIVADO) -mode batch -source $(SRC_DIR)/project.tcl -tclargs $(TOP_LEVEL_ENTITY) $(DEVICE) project_1/sources.txt $(SRC_DIR)/project.xdc +endif + +clean: +ifndef RESUME + rm -rf project_1 + rm -rf .Xil + rm -f *.rpt + rm -f *.log + rm -f *.jou + rm -f *.dcp +else + @echo "RESUME is defined, skipping clean." +endif + +.PHONY: all gen-sources build clean \ No newline at end of file diff --git a/hw/syn/xilinx/dut/core/Makefile b/hw/syn/xilinx/dut/core/Makefile new file mode 100644 index 000000000..2ce824a3f --- /dev/null +++ b/hw/syn/xilinx/dut/core/Makefile @@ -0,0 +1,15 @@ +PROJECT = VX_core_top +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv +FPU_IP = 1 + +include ../../common.mk + +#CONFIGS += -DNUM_WARPS=32 +#CONFIGS += -DNUM_THREADS=32 + +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) \ No newline at end of file diff --git a/hw/syn/xilinx/dut/fpu/Makefile b/hw/syn/xilinx/dut/fpu/Makefile new file mode 100644 index 
000000000..c3d3fd99f --- /dev/null +++ b/hw/syn/xilinx/dut/fpu/Makefile @@ -0,0 +1,12 @@ +PROJECT = VX_fpu_dsp +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv +FPU_IP = 1 + +include ../../common.mk + +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces diff --git a/hw/syn/xilinx/dut/issue/Makefile b/hw/syn/xilinx/dut/issue/Makefile new file mode 100644 index 000000000..07e8f343d --- /dev/null +++ b/hw/syn/xilinx/dut/issue/Makefile @@ -0,0 +1,14 @@ +PROJECT = VX_issue_top +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +#CONFIGS += -DNUM_WARPS=32 +#CONFIGS += -DNUM_THREADS=32 +# rtl/fpu is always searched; the cvfpu paths are added only when CONFIGS mentions FPU_FPNEW +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem $(FPU_INCLUDE) \ No newline at end of file diff --git a/hw/syn/xilinx/dut/lmem/Makefile b/hw/syn/xilinx/dut/lmem/Makefile new file mode 100644 index 000000000..b3ba57c8d --- /dev/null +++ b/hw/syn/xilinx/dut/lmem/Makefile @@ -0,0 +1,7 @@ +PROJECT = VX_local_mem_top +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem diff --git a/hw/syn/xilinx/dut/mem_unit/Makefile b/hw/syn/xilinx/dut/mem_unit/Makefile new file mode 100644 index 000000000..209492265 --- /dev/null +++ b/hw/syn/xilinx/dut/mem_unit/Makefile @@ -0,0 +1,7 @@ 
+PROJECT = VX_mem_unit_top +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/core -I$(RTL_DIR)/fpu diff --git a/hw/syn/xilinx/dut/pre_opt_hook.tcl b/hw/syn/xilinx/dut/pre_opt_hook.tcl new file mode 100644 index 000000000..92d1e94f9 --- /dev/null +++ b/hw/syn/xilinx/dut/pre_opt_hook.tcl @@ -0,0 +1,2 @@ +set tool_dir $::env(TOOL_DIR) +source ${tool_dir}/xilinx_async_bram_patch.tcl \ No newline at end of file diff --git a/hw/syn/xilinx/dut/project.tcl b/hw/syn/xilinx/dut/project.tcl new file mode 100644 index 000000000..2ce7a07b9 --- /dev/null +++ b/hw/syn/xilinx/dut/project.tcl @@ -0,0 +1,192 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# Validate the command line; a bad invocation must exit non-zero so the calling make recipe fails +if { $::argc != 4 } { + puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n" + puts "Usage: $::argv0 <top_module> <device_part> <vcs_file> <xdc_file>\n" + exit 1 +} + +# Set the project name +set project_name "project_1" + +set top_module [lindex $::argv 0] +set device_part [lindex $::argv 1] +set vcs_file [lindex $::argv 2] +set xdc_file [lindex $::argv 3] + +set tool_dir $::env(TOOL_DIR) +set script_dir [ file dirname [ file normalize [ info script ] ] ] + +puts "Using top_module=$top_module" +puts "Using device_part=$device_part" +puts "Using vcs_file=$vcs_file" +puts "Using xdc_file=$xdc_file" +puts "Using tool_dir=$tool_dir" +puts "Using script_dir=$script_dir" + +# Set the number of jobs based on MAX_JOBS environment variable +if {[info exists ::env(MAX_JOBS)]} { + set num_jobs $::env(MAX_JOBS) + puts "using num_jobs=$num_jobs" +} else { + set num_jobs 0 +} + +proc run_setup {} { + global project_name + global top_module device_part vcs_file xdc_file + global script_dir tool_dir + global num_jobs + global argv argc ;# Using global system variables: argv and argc + + # create fpu ip + if {[info exists ::env(FPU_IP)]} { + set ip_dir $::env(FPU_IP) + set argv [list $ip_dir $device_part] + set argc 2 + source ${tool_dir}/xilinx_ip_gen.tcl + } + + source "${tool_dir}/parse_vcs_list.tcl" + set vlist [parse_vcs_list "${vcs_file}"] + + set vsources_list [lindex $vlist 0] + set vincludes_list [lindex $vlist 1] + set vdefines_list [lindex $vlist 2] + + #puts $vsources_list + #puts $vincludes_list + #puts $vdefines_list + # Create project + create_project $project_name $project_name -force -part $device_part + + # Add constraints file + read_xdc $xdc_file + + # Add the design sources + add_files -norecurse -verbose $vsources_list + + # process defines + set_property verilog_define ${vdefines_list} [current_fileset] + + # add fpu ip + if {[info exists ::env(FPU_IP)]} { + set ip_dir $::env(FPU_IP) + add_files -norecurse -verbose ${ip_dir}/xil_fma/xil_fma.xci + add_files -norecurse -verbose 
${ip_dir}/xil_fdiv/xil_fdiv.xci + add_files -norecurse -verbose ${ip_dir}/xil_fsqrt/xil_fsqrt.xci + } + + # Synthesis + set_property top $top_module [current_fileset] + set_property \ + -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} \ + -value {-mode out_of_context} \ + -objects [get_runs synth_1] + + # register compilation hooks + #set_property STEPS.SYNTH_DESIGN.TCL.PRE ${script_dir}/pre_synth_hook.tcl [get_runs synth_1] + #set_property STEPS.SYNTH_DESIGN.TCL.POST ${script_dir}/post_synth_hook.tcl [get_runs synth_1] + set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/pre_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.OPT_DESIGN.TCL.POST ${script_dir}/post_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.POWER_OPT_DESIGN.TCL.PRE ${script_dir}/pre_power_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.POWER_OPT_DESIGN.TCL.POST ${script_dir}/post_power_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.PLACE_DESIGN.TCL.PRE ${script_dir}/pre_place_hook.tcl [get_runs impl_1] + #set_property STEPS.PLACE_DESIGN.TCL.POST ${script_dir}/post_place_hook.tcl [get_runs impl_1] + #set_property STEPS.POST_PLACE_POWER_OPT_DESIGN.TCL.PRE ${script_dir}/pre_place_power_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.POST_PLACE_POWER_OPT_DESIGN.TCL.POST ${script_dir}/post_place_power_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.PHYS_OPT_DESIGN.TCL.PRE ${script_dir}/pre_phys_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.PHYS_OPT_DESIGN.TCL.POST ${script_dir}/post_phys_opt_hook.tcl [get_runs impl_1] + #set_property STEPS.ROUTE_DESIGN.TCL.PRE ${script_dir}/pre_route_hook.tcl [get_runs impl_1] + #set_property STEPS.ROUTE_DESIGN.TCL.POST ${script_dir}/post_route_hook.tcl [get_runs impl_1] + #set_property STEPS.WRITE_BITSTREAM.TCL.PRE ${script_dir}/pre_bitstream_hook.tcl [get_runs impl_1] + #set_property STEPS.WRITE_BITSTREAM.TCL.POST ${script_dir}/post_bitstream_hook.tcl [get_runs impl_1] + + update_compile_order -fileset sources_1 +} + +proc 
run_synthesis {} { + global num_jobs + + if {$num_jobs != 0} { + launch_runs synth_1 -verbose -jobs $num_jobs + } else { + launch_runs synth_1 -verbose + } + wait_on_run synth_1 + open_run synth_1 + report_utilization -file post_synth_util.rpt -hierarchical -hierarchical_percentages + write_checkpoint -force post_synth.dcp +} + +proc run_implementation {} { + global num_jobs + + if {$num_jobs != 0} { + launch_runs impl_1 -verbose -jobs $num_jobs + } else { + launch_runs impl_1 -verbose + } + wait_on_run impl_1 + open_run impl_1 + report_utilization -file post_impl_util.rpt -hierarchical -hierarchical_percentages + write_checkpoint -force post_impl.dcp +} + +proc run_report {} { + # Generate placement and routing status reports + report_place_status -file place.rpt + report_route_status -file route.rpt + + # Generate timing report + report_timing -nworst 100 -delay_type max -sort_by group -file timing.rpt + + # Generate power and drc reports + report_power -file power.rpt + report_drc -file drc.rpt +} + +############################################################################### + +# Start time +set start_time [clock seconds] + +set checkpoint_synth "post_synth.dcp" +set checkpoint_impl "post_impl.dcp" + +if { [file exists $checkpoint_impl] } { + puts "Resuming from post-implementation checkpoint: $checkpoint_impl" + open_checkpoint $checkpoint_impl + run_report +} elseif { [file exists $checkpoint_synth] } { + puts "Resuming from post-synthesis checkpoint: $checkpoint_synth" + open_checkpoint $checkpoint_synth + run_implementation + run_report +} else { + # Execute full pipeline + run_setup + run_synthesis + run_implementation + run_report +} + +# End time and calculation +set elapsed_time [expr {[clock seconds] - $start_time}] + +# Display elapsed time +set hours [format "%02d" [expr {$elapsed_time / 3600}]] +set minutes [format "%02d" [expr {($elapsed_time % 3600) / 60}]] +set seconds [format "%02d" [expr {$elapsed_time % 60}]] +puts "Total elapsed time: ${hours}h 
${minutes}m ${seconds}s" diff --git a/hw/syn/xilinx/dut/project.xdc b/hw/syn/xilinx/dut/project.xdc new file mode 100644 index 000000000..f786e7837 --- /dev/null +++ b/hw/syn/xilinx/dut/project.xdc @@ -0,0 +1,4 @@ +set CLK_FREQ_MHZ 300 +set clk_port_name clk +set clk_port [get_ports $clk_port_name] +create_clock -name core_clock -period [expr 1000.0 / $CLK_FREQ_MHZ] $clk_port \ No newline at end of file diff --git a/hw/syn/xilinx/dut/scope/Makefile b/hw/syn/xilinx/dut/scope/Makefile new file mode 100644 index 000000000..405f05e8a --- /dev/null +++ b/hw/syn/xilinx/dut/scope/Makefile @@ -0,0 +1,7 @@ +PROJECT = VX_scope_tap +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs diff --git a/hw/syn/xilinx/dut/top/Makefile b/hw/syn/xilinx/dut/top/Makefile new file mode 100644 index 000000000..c471b7807 --- /dev/null +++ b/hw/syn/xilinx/dut/top/Makefile @@ -0,0 +1,17 @@ +PROJECT = vortex_afu +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv +FPU_IP = 1 + +include ../../common.mk + +#CONFIGS += -DNUM_CORES=2 +#CONFIGS += -DNUM_WARPS=32 +#CONFIGS += -DNUM_THREADS=32 +#CONFIGS += -DL2_ENABLE + +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(AFU_DIR) -I$(AFU_DIR)/ccip $(FPU_INCLUDE) diff --git a/hw/syn/xilinx/dut/unittest/Makefile b/hw/syn/xilinx/dut/unittest/Makefile new file mode 100644 index 000000000..3d756562e --- /dev/null +++ b/hw/syn/xilinx/dut/unittest/Makefile @@ -0,0 +1,11 @@ +PROJECT = VX_fifo_queue +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv + +include ../../common.mk + +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq 
(,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) \ No newline at end of file diff --git a/hw/syn/xilinx/dut/vortex/Makefile b/hw/syn/xilinx/dut/vortex/Makefile new file mode 100644 index 000000000..eb6d45a88 --- /dev/null +++ b/hw/syn/xilinx/dut/vortex/Makefile @@ -0,0 +1,17 @@ +PROJECT = Vortex +TOP_LEVEL_ENTITY = $(PROJECT) +SRC_FILE = $(PROJECT).sv +FPU_IP = 1 + +include ../../common.mk + +#CONFIGS += -DNUM_CORES=2 +#CONFIGS += -DNUM_WARPS=32 +#CONFIGS += -DNUM_THREADS=32 +#CONFIGS += -DL2_ENABLE + +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) diff --git a/hw/syn/xilinx/xrt/kill_build.sh b/hw/syn/xilinx/kill_build.sh similarity index 100% rename from hw/syn/xilinx/xrt/kill_build.sh rename to hw/syn/xilinx/kill_build.sh diff --git a/hw/syn/xilinx/xrt/kill_hwserver.sh b/hw/syn/xilinx/kill_hwserver.sh similarity index 100% rename from hw/syn/xilinx/xrt/kill_hwserver.sh rename to hw/syn/xilinx/kill_hwserver.sh diff --git a/hw/syn/xilinx/xrt/kill_sim.sh b/hw/syn/xilinx/kill_sim.sh similarity index 100% rename from hw/syn/xilinx/xrt/kill_sim.sh rename to hw/syn/xilinx/kill_sim.sh diff --git a/hw/syn/xilinx/sandbox/Makefile b/hw/syn/xilinx/sandbox/Makefile new file mode 100644 index 000000000..07d2cf35f --- /dev/null +++ b/hw/syn/xilinx/sandbox/Makefile @@ -0,0 +1,77 @@ 
+ROOT_DIR := $(realpath ../../../..) +include $(ROOT_DIR)/config.mk + +DEVICE ?= xcu55c-fsvh2892-2L-e + +MAX_JOBS ?= 8 + +VIVADO := $(XILINX_VIVADO)/bin/vivado + +SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/sandbox + +RTL_DIR := $(VORTEX_HOME)/hw/rtl +DPI_DIR := $(VORTEX_HOME)/hw/dpi +AFU_DIR := $(RTL_DIR)/afu/xrt +SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts + +KERNEL ?= fibonacci + +NCPUS := $(shell lscpu | grep "^Core(s) per socket:" | awk '{print $$4}') +JOBS ?= $(shell echo $$(( $(NCPUS) > $(MAX_JOBS) ? $(MAX_JOBS) : $(NCPUS) ))) + +# include paths +FPU_INCLUDE = -I$(RTL_DIR)/fpu +ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src +endif +RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache +RTL_INCLUDE += $(FPU_INCLUDE) +RTL_INCLUDE += -I$(SRC_DIR) + +# compilation flags +CFLAGS += -DNDEBUG -DSYNTHESIS -DVIVADO +CFLAGS += $(CONFIGS) +CFLAGS += $(RTL_INCLUDE) +CFLAGS += -DEXT_F_DISABLE + +# update memory layout for 2MB RAM +CFLAGS += -DSTARTUP_ADDR=32\'h80000 +CFLAGS += -DSTACK_BASE_ADDR=32\'hFF000 + +all: build +# Software is built with the same STARTUP_ADDR / STACK_BASE_ADDR as the RTL defines above +$(KERNEL).bin: + $(MAKE) -C $(ROOT_DIR)/kernel clean + STACK_BASE_ADDR=0xFF000 $(MAKE) -C $(ROOT_DIR)/kernel + $(MAKE) -C $(ROOT_DIR)/tests/kernel/$(KERNEL) clean + STARTUP_ADDR=0x80000 $(MAKE) -C $(ROOT_DIR)/tests/kernel/$(KERNEL) + cp $(ROOT_DIR)/tests/kernel/$(KERNEL)/$(KERNEL).bin $(KERNEL).bin + +kernel.bin.coe: $(KERNEL).bin + $(SCRIPT_DIR)/bin2coe.py --out=$@ --binfile=8192:$(KERNEL).bin --depth=16384 --wordsize=64 --little_endian + +pre_opt_hook.tcl: $(SRC_DIR)/pre_opt_hook.tcl + cp $< $@ + +simulate.tcl: $(SRC_DIR)/simulate.tcl + cp $< $@ + +gen-sources: project_1/sources.txt +project_1/sources.txt: + mkdir -p project_1 + $(SCRIPT_DIR)/gen_sources.sh $(CFLAGS) -P -Cproject_1/src 
-Oproject_1/sources.txt + +build: done.dcp +done.dcp: project_1/sources.txt kernel.bin.coe project.tcl pre_opt_hook.tcl + MAX_JOBS=$(JOBS) TOOL_DIR=$(SCRIPT_DIR) $(VIVADO) -mode batch -source project.tcl -tclargs $(DEVICE) project_1/sources.txt + echo done > done.dcp + +run: simulate.tcl done.dcp + MAX_JOBS=$(JOBS) TOOL_DIR=$(SCRIPT_DIR) $(VIVADO) -mode batch -source simulate.tcl -tclargs project_1/project_1.xpr 50000ns + +open: done.dcp + $(VIVADO) project_1/project_1.xpr & + +clean: + rm -rf project_1 project1.tcl $(KERNEL).bin kernel.bin.coe + rm -rf .Xil *.log *.jou *.dcp *.rpt diff --git a/hw/syn/xilinx/sandbox/Vortex_top.v b/hw/syn/xilinx/sandbox/Vortex_top.v new file mode 100644 index 000000000..cd634b9b6 --- /dev/null +++ b/hw/syn/xilinx/sandbox/Vortex_top.v @@ -0,0 +1,122 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_define.vh" + +module Vortex_top #( + parameter C_M_AXI_GMEM_DATA_WIDTH = 512, + parameter C_M_AXI_GMEM_ADDR_WIDTH = `XLEN, + parameter C_M_AXI_GMEM_ID_WIDTH = 32, + parameter C_M_AXI_MEM_NUM_BANKS = 1 +) ( + input wire clk, + input wire reset, + + // AXI4 memory interface + output wire m_axi_mem_awvalid, + input wire m_axi_mem_awready, + output wire [C_M_AXI_GMEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr, + output wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_mem_awid, + output wire [7:0] m_axi_mem_awlen, + output wire [2:0] m_axi_mem_awsize, + output wire [1:0] m_axi_mem_awburst, + output wire [1:0] m_axi_mem_awlock, + output wire [3:0] m_axi_mem_awcache, + output wire [2:0] m_axi_mem_awprot, + output wire [3:0] m_axi_mem_awqos, + output wire m_axi_mem_wvalid, + input wire m_axi_mem_wready, + output wire [C_M_AXI_GMEM_DATA_WIDTH-1:0] m_axi_mem_wdata, + output wire [C_M_AXI_GMEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb, + output wire m_axi_mem_wlast, + output wire m_axi_mem_arvalid, + input wire m_axi_mem_arready, + output wire [C_M_AXI_GMEM_ADDR_WIDTH-1:0] m_axi_mem_araddr, + output wire [C_M_AXI_GMEM_ID_WIDTH-1:0] m_axi_mem_arid, + output wire [7:0] m_axi_mem_arlen, + output wire [2:0] m_axi_mem_arsize, + output wire [1:0] m_axi_mem_arburst, + output wire [1:0] m_axi_mem_arlock, + output wire [3:0] m_axi_mem_arcache, + output wire [2:0] m_axi_mem_arprot, + output wire [3:0] m_axi_mem_arqos, + input wire m_axi_mem_rvalid, + output wire m_axi_mem_rready, + input wire [C_M_AXI_GMEM_DATA_WIDTH - 1:0] m_axi_mem_rdata, + input wire m_axi_mem_rlast, + input wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_mem_rid, + input wire [1:0] m_axi_mem_rresp, + input wire m_axi_mem_bvalid, + output wire m_axi_mem_bready, + input wire [1:0] m_axi_mem_bresp, + input wire [C_M_AXI_GMEM_ID_WIDTH - 1:0] m_axi_mem_bid, + + input wire dcr_wr_valid, + input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr, + input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data, + + output wire busy +); + + Vortex_wrap #( + 
.C_M_AXI_GMEM_DATA_WIDTH(C_M_AXI_GMEM_DATA_WIDTH), + .C_M_AXI_GMEM_ADDR_WIDTH(C_M_AXI_GMEM_ADDR_WIDTH), + .C_M_AXI_GMEM_ID_WIDTH(C_M_AXI_GMEM_ID_WIDTH), + .C_M_AXI_MEM_NUM_BANKS(C_M_AXI_MEM_NUM_BANKS) + ) wrapper ( + .clk(clk), + .reset(reset), + .m_axi_mem_awvalid(m_axi_mem_awvalid), + .m_axi_mem_awready(m_axi_mem_awready), + .m_axi_mem_awaddr(m_axi_mem_awaddr), + .m_axi_mem_awid(m_axi_mem_awid), + .m_axi_mem_awlen(m_axi_mem_awlen), + .m_axi_mem_awsize(m_axi_mem_awsize), + .m_axi_mem_awburst(m_axi_mem_awburst), + .m_axi_mem_awlock(m_axi_mem_awlock), + .m_axi_mem_awcache(m_axi_mem_awcache), + .m_axi_mem_awprot(m_axi_mem_awprot), + .m_axi_mem_awqos(m_axi_mem_awqos), + .m_axi_mem_wvalid(m_axi_mem_wvalid), + .m_axi_mem_wready(m_axi_mem_wready), + .m_axi_mem_wdata(m_axi_mem_wdata), + .m_axi_mem_wstrb(m_axi_mem_wstrb), + .m_axi_mem_wlast(m_axi_mem_wlast), + .m_axi_mem_arvalid(m_axi_mem_arvalid), + .m_axi_mem_arready(m_axi_mem_arready), + .m_axi_mem_araddr(m_axi_mem_araddr), + .m_axi_mem_arid(m_axi_mem_arid), + .m_axi_mem_arlen(m_axi_mem_arlen), + .m_axi_mem_arsize(m_axi_mem_arsize), + .m_axi_mem_arburst(m_axi_mem_arburst), + .m_axi_mem_arlock(m_axi_mem_arlock), + .m_axi_mem_arcache(m_axi_mem_arcache), + .m_axi_mem_arprot(m_axi_mem_arprot), + .m_axi_mem_arqos(m_axi_mem_arqos), + .m_axi_mem_rvalid(m_axi_mem_rvalid), + .m_axi_mem_rready(m_axi_mem_rready), + .m_axi_mem_rdata(m_axi_mem_rdata), + .m_axi_mem_rlast(m_axi_mem_rlast), + .m_axi_mem_rid(m_axi_mem_rid), + .m_axi_mem_rresp(m_axi_mem_rresp), + .m_axi_mem_bvalid(m_axi_mem_bvalid), + .m_axi_mem_bready(m_axi_mem_bready), + .m_axi_mem_bresp(m_axi_mem_bresp), + .m_axi_mem_bid(m_axi_mem_bid), + .dcr_wr_valid(dcr_wr_valid), + .dcr_wr_addr(dcr_wr_addr), + .dcr_wr_data(dcr_wr_data), + .busy(busy) + ); + +endmodule diff --git a/hw/syn/xilinx/test/project_1_files/Vortex_top.v b/hw/syn/xilinx/sandbox/Vortex_wrap.sv similarity index 97% rename from hw/syn/xilinx/test/project_1_files/Vortex_top.v rename to 
hw/syn/xilinx/sandbox/Vortex_wrap.sv index a7adf71bc..5ec7a868e 100644 --- a/hw/syn/xilinx/test/project_1_files/Vortex_top.v +++ b/hw/syn/xilinx/sandbox/Vortex_wrap.sv @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -13,7 +13,7 @@ `include "VX_define.vh" -module Vortex_top #( +module Vortex_wrap #( parameter C_M_AXI_GMEM_DATA_WIDTH = 512, parameter C_M_AXI_GMEM_ADDR_WIDTH = `XLEN, parameter C_M_AXI_GMEM_ID_WIDTH = 32, @@ -22,7 +22,7 @@ module Vortex_top #( input wire clk, input wire reset, - // AXI4 memory interface + // AXI4 memory interface output wire m_axi_mem_awvalid, input wire m_axi_mem_awready, output wire [C_M_AXI_GMEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr, @@ -138,13 +138,13 @@ module Vortex_top #( assign m_axi_mem_rvalid_a[0] = m_axi_mem_rvalid; assign m_axi_mem_rready = m_axi_mem_rready_a[0]; - assign m_axi_mem_rdata_a[0] = m_axi_mem_rdata; + assign m_axi_mem_rdata_a[0] = m_axi_mem_rdata; assign m_axi_mem_rlast_a[0] = m_axi_mem_rlast; assign m_axi_mem_rid_a[0] = m_axi_mem_rid; assign m_axi_mem_rresp_a[0] = m_axi_mem_rresp; assign m_axi_mem_bvalid_a[0] = m_axi_mem_bvalid; - assign m_axi_mem_bready = m_axi_mem_bready_a[0]; + assign m_axi_mem_bready = m_axi_mem_bready_a[0]; assign m_axi_mem_bresp_a[0] = m_axi_mem_bresp; assign m_axi_mem_bid_a[0] = m_axi_mem_bid; @@ -177,7 +177,7 @@ module Vortex_top #( .m_axi_bvalid (m_axi_mem_bvalid_a), .m_axi_bready (m_axi_mem_bready_a), .m_axi_bid (m_axi_mem_bid_a), - .m_axi_bresp (m_axi_mem_bresp_a), + .m_axi_bresp (m_axi_mem_bresp_a), .m_axi_arvalid (m_axi_mem_arvalid_a), 
.m_axi_arready (m_axi_mem_arready_a), @@ -193,7 +193,7 @@ module Vortex_top #( .m_axi_rvalid (m_axi_mem_rvalid_a), .m_axi_rready (m_axi_mem_rready_a), - .m_axi_rdata (m_axi_mem_rdata_a), + .m_axi_rdata (m_axi_mem_rdata_a), .m_axi_rid (m_axi_mem_rid_a), .m_axi_rresp (m_axi_mem_rresp_a), .m_axi_rlast (m_axi_mem_rlast_a), @@ -204,5 +204,5 @@ module Vortex_top #( .busy (busy) ); - + endmodule diff --git a/hw/syn/xilinx/sandbox/pre_opt_hook.tcl b/hw/syn/xilinx/sandbox/pre_opt_hook.tcl new file mode 100644 index 000000000..92d1e94f9 --- /dev/null +++ b/hw/syn/xilinx/sandbox/pre_opt_hook.tcl @@ -0,0 +1,2 @@ +set tool_dir $::env(TOOL_DIR) +source ${tool_dir}/xilinx_async_bram_patch.tcl \ No newline at end of file diff --git a/hw/syn/xilinx/sandbox/project.tcl.in b/hw/syn/xilinx/sandbox/project.tcl.in new file mode 100644 index 000000000..1fcd82c3c --- /dev/null +++ b/hw/syn/xilinx/sandbox/project.tcl.in @@ -0,0 +1,495 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if { $::argc != 2 } { + puts "ERROR: Program \"$::argv0\" requires 2 arguments!\n" + puts "Usage: $::argv0 \n" + exit +} + +set device_part [lindex $::argv 0] +set vcs_file [lindex $::argv 1] + +set tool_dir $::env(TOOL_DIR) +set script_dir [ file dirname [ file normalize [ info script ] ] ] + +puts "Using device_part=$device_part" +puts "Using vcs_file=$vcs_file" +puts "Using tool_dir=$tool_dir" +puts "Using script_dir=$script_dir" + +# Set the number of jobs based on MAX_JOBS environment variable +if {[info exists ::env(MAX_JOBS)]} { + set num_jobs $::env(MAX_JOBS) + puts "using num_jobs=$num_jobs" + #puts $num_jobs +} else { + set num_jobs 0 +} + +proc run_setup {} { + global device_part vcs_file + global tool_dir script_dir + + # Set the project name + set project_name "project_1" + + # Use project name variable, if specified in the tcl shell + if { [info exists ::user_project_name] } { + set project_name $::user_project_name + } + + source "${tool_dir}/parse_vcs_list.tcl" + set vlist [parse_vcs_list "${vcs_file}"] + + set vsources_list [lindex $vlist 0] + set vincludes_list [lindex $vlist 1] + set vdefines_list [lindex $vlist 2] + + #puts ${vsources_list} + #puts ${vincludes_list} + #puts ${vdefines_list} + + # Create project + create_project $project_name $project_name -force -part $device_part + + # Set the directory path for the new project + set proj_dir [get_property directory [current_project]] + + # Create 'sources_1' fileset (if not found) + if {[string equal [get_filesets -quiet sources_1] ""]} { + create_fileset -srcset sources_1 + } + + # add source files + set obj [get_filesets sources_1] + add_files -norecurse -verbose -fileset $obj ${vsources_list} + + # process defines + set obj [get_filesets sources_1] + foreach def $vdefines_list { + set_property -name "verilog_define" -value $def -objects $obj + } + + # Set 'sources_1' fileset properties + set obj [get_filesets sources_1] + set_property -name "name" -value "sources_1" -objects $obj + 
set_property -name "top" -value "design_1_wrapper" -objects $obj + + # Create 'constrs_1' fileset (if not found) + if {[string equal [get_filesets -quiet constrs_1] ""]} { + create_fileset -constrset constrs_1 + } + + # Set 'constrs_1' fileset object + set obj [get_filesets constrs_1] + + # Empty (no sources present) + + # Set 'constrs_1' fileset properties + set obj [get_filesets constrs_1] + set_property -name "constrs_type" -value "XDC" -objects $obj + set_property -name "name" -value "constrs_1" -objects $obj + set_property -name "target_constrs_file" -value "" -objects $obj + + # Create 'sim_1' fileset (if not found) + if {[string equal [get_filesets -quiet sim_1] ""]} { + create_fileset -simset sim_1 + } + + set testbench_file "" + foreach file ${vsources_list} { + if {[string match "*testbench.v" $file]} { + set testbench_file [file normalize $file] + break + } + } + + # Set 'sim_1' fileset object + set obj [get_filesets sim_1] + # Import local files from the original project + set files [list $testbench_file] + set imported_files [import_files -fileset sim_1 $files] + + # Set 'sim_1' fileset file properties for remote files + # None + + # Set 'sim_1' fileset file properties for local files + set file "testbench.v" + set file_obj [get_files -of_objects [get_filesets sim_1] [list "*$file"]] + set_property -name "file_type" -value "Verilog" -objects $file_obj + set_property -name "is_enabled" -value "1" -objects $file_obj + set_property -name "is_global_include" -value "0" -objects $file_obj + set_property -name "library" -value "xil_defaultlib" -objects $file_obj + set_property -name "path_mode" -value "RelativeFirst" -objects $file_obj + set_property -name "used_in" -value "synthesis implementation simulation" -objects $file_obj + set_property -name "used_in_implementation" -value "1" -objects $file_obj + set_property -name "used_in_simulation" -value "1" -objects $file_obj + set_property -name "used_in_synthesis" -value "1" -objects $file_obj + + # Set 
'sim_1' fileset properties + set obj [get_filesets sim_1] + set_property -name "32bit" -value "0" -objects $obj + set_property -name "force_compile_glbl" -value "0" -objects $obj + set_property -name "generate_scripts_only" -value "0" -objects $obj + set_property -name "generic" -value "" -objects $obj + set_property -name "hbs.configure_design_for_hier_access" -value "1" -objects $obj + set_property -name "include_dirs" -value "" -objects $obj + set_property -name "incremental" -value "1" -objects $obj + set_property -name "name" -value "sim_1" -objects $obj + set_property -name "source_set" -value "sources_1" -objects $obj + set_property -name "systemc_include_dirs" -value "" -objects $obj + set_property -name "top" -value "testbench" -objects $obj + set_property -name "top_auto_set" -value "0" -objects $obj + set_property -name "top_lib" -value "xil_defaultlib" -objects $obj + set_property -name "verilog_define" -value "" -objects $obj + set_property -name "verilog_uppercase" -value "0" -objects $obj + + # Set 'utils_1' fileset object + set obj [get_filesets utils_1] + # Empty (no sources present) + + # Set 'utils_1' fileset properties + set obj [get_filesets utils_1] + set_property -name "name" -value "utils_1" -objects $obj + + # Proc to create BD design_1 + proc cr_bd_design_1 { parentCell } { + # The design that will be created by this Tcl proc contains the following + # module references: + # Vortex_top + + # CHANGE DESIGN NAME HERE + set design_name design_1 + + common::send_gid_msg -ssname BD::TCL -id 2010 -severity "INFO" "Currently there is no design <$design_name> in project, so creating one..." 
+ + create_bd_design $design_name + + set bCheckIPsPassed 1 + ################################################################## + # CHECK IPs + ################################################################## + set bCheckIPs 1 + if { $bCheckIPs == 1 } { + set list_check_ips "\ + xilinx.com:ip:axi_bram_ctrl:4.1\ + xilinx.com:ip:blk_mem_gen:8.4\ + " + + set list_ips_missing "" + common::send_gid_msg -ssname BD::TCL -id 2011 -severity "INFO" "Checking if the following IPs exist in the project's IP catalog: $list_check_ips ." + + foreach ip_vlnv $list_check_ips { + set ip_obj [get_ipdefs -all $ip_vlnv] + if { $ip_obj eq "" } { + lappend list_ips_missing $ip_vlnv + } + } + + if { $list_ips_missing ne "" } { + catch {common::send_gid_msg -ssname BD::TCL -id 2012 -severity "ERROR" "The following IPs are not found in the IP Catalog:\n $list_ips_missing\n\nResolution: Please add the repository containing the IP(s) to the project." } + set bCheckIPsPassed 0 + } + + } + + ################################################################## + # CHECK Modules + ################################################################## + set bCheckModules 1 + if { $bCheckModules == 1 } { + set list_check_mods "\ + Vortex_top\ + " + + set list_mods_missing "" + common::send_gid_msg -ssname BD::TCL -id 2020 -severity "INFO" "Checking if the following modules exist in the project's sources: $list_check_mods ." + + foreach mod_vlnv $list_check_mods { + if { [can_resolve_reference $mod_vlnv] == 0 } { + lappend list_mods_missing $mod_vlnv + } + } + + if { $list_mods_missing ne "" } { + catch {common::send_gid_msg -ssname BD::TCL -id 2021 -severity "ERROR" "The following module(s) are not found in the project: $list_mods_missing" } + common::send_gid_msg -ssname BD::TCL -id 2022 -severity "INFO" "Please add source files for the missing module(s) above." 
+ set bCheckIPsPassed 0 + } + } + + if { $bCheckIPsPassed != 1 } { + common::send_gid_msg -ssname BD::TCL -id 2023 -severity "WARNING" "Will not continue with creation of design due to the error(s) above." + return 3 + } + + variable script_folder + + if { $parentCell eq "" } { + set parentCell [get_bd_cells /] + } + + # Get object for parentCell + set parentObj [get_bd_cells $parentCell] + if { $parentObj == "" } { + catch {common::send_gid_msg -ssname BD::TCL -id 2090 -severity "ERROR" "Unable to find parent cell <$parentCell>!"} + return + } + + # Make sure parentObj is hier blk + set parentType [get_property TYPE $parentObj] + if { $parentType ne "hier" } { + catch {common::send_gid_msg -ssname BD::TCL -id 2091 -severity "ERROR" "Parent <$parentObj> has TYPE = <$parentType>. Expected to be ."} + return + } + + # Save current instance; Restore later + set oldCurInst [current_bd_instance .] + + # Set parent object as current + current_bd_instance $parentObj + + + # Create interface ports + + # Create ports + set clk_100MHz [ create_bd_port -dir I -type clk -freq_hz 100000000 clk_100MHz ] + set resetn [ create_bd_port -dir I -type rst resetn ] + set_property -dict [ list \ + CONFIG.POLARITY {ACTIVE_LOW} \ + ] $resetn + set vx_busy [ create_bd_port -dir O vx_busy ] + set vx_reset [ create_bd_port -dir I -type rst vx_reset ] + set_property -dict [ list \ + CONFIG.POLARITY {ACTIVE_HIGH} \ + ] $vx_reset + + set dcr_wr_valid [ create_bd_port -dir I dcr_wr_valid ] + set dcr_wr_addr [ create_bd_port -dir I -from 11 -to 0 dcr_wr_addr ] + set dcr_wr_data [ create_bd_port -dir I -from 31 -to 0 dcr_wr_data ] + + # Create instance: Vortex_top_0, and set properties + set block_name Vortex_top + set block_cell_name Vortex_top_0 + if { [catch {set Vortex_top_0 [create_bd_cell -type module -reference $block_name $block_cell_name] } errmsg] } { + catch {common::send_gid_msg -ssname BD::TCL -id 2095 -severity "ERROR" "Unable to add referenced block <$block_name>. 
Please add the files for ${block_name}'s definition into the project."} + return 1 + } elseif { $Vortex_top_0 eq "" } { + catch {common::send_gid_msg -ssname BD::TCL -id 2096 -severity "ERROR" "Unable to referenced block <$block_name>. Please add the files for ${block_name}'s definition into the project."} + return 1 + } + + # Create instance: axi_bram_ctrl_0, and set properties + set axi_bram_ctrl_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_bram_ctrl:4.1 axi_bram_ctrl_0 ] + set_property -dict [ list \ + CONFIG.DATA_WIDTH {512} \ + CONFIG.ECC_TYPE {0} \ + ] $axi_bram_ctrl_0 + + # Create instance: axi_bram_ctrl_0_bram, and set properties + set axi_bram_ctrl_0_bram [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 axi_bram_ctrl_0_bram ] + + set_property -dict [ list \ + CONFIG.Assume_Synchronous_Clk {true} \ + CONFIG.Byte_Size {8} \ + CONFIG.Load_Init_File {true} \ + CONFIG.Coe_File {@BUILDDIR@/hw/syn/xilinx/sandbox/kernel.bin.coe} \ + CONFIG.EN_SAFETY_CKT {true} \ + CONFIG.Enable_32bit_Address {true} \ + CONFIG.Fill_Remaining_Memory_Locations {false} \ + CONFIG.Memory_Type {Simple_Dual_Port_RAM} \ + CONFIG.Operating_Mode_A {NO_CHANGE} \ + CONFIG.Operating_Mode_B {READ_FIRST} \ + CONFIG.Port_B_Write_Rate {0} \ + CONFIG.Read_Width_A {512} \ + CONFIG.Read_Width_B {512} \ + CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ + CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ + CONFIG.Remaining_Memory_Locations {0} \ + CONFIG.Use_Byte_Write_Enable {true} \ + CONFIG.Use_RSTA_Pin {false} \ + CONFIG.Use_RSTB_Pin {true} \ + CONFIG.Write_Width_A {512} \ + CONFIG.Write_Depth_A {16384} \ + CONFIG.use_bram_block {Stand_Alone} \ + ] $axi_bram_ctrl_0_bram + + # Create interface connections + connect_bd_intf_net -intf_net Vortex_top_0_m_axi_mem [get_bd_intf_pins Vortex_top_0/m_axi_mem] [get_bd_intf_pins axi_bram_ctrl_0/S_AXI] + connect_bd_intf_net -intf_net axi_bram_ctrl_0_BRAM_PORTA [get_bd_intf_pins axi_bram_ctrl_0/BRAM_PORTA] 
[get_bd_intf_pins axi_bram_ctrl_0_bram/BRAM_PORTA] + connect_bd_intf_net -intf_net axi_bram_ctrl_0_BRAM_PORTB [get_bd_intf_pins axi_bram_ctrl_0/BRAM_PORTB] [get_bd_intf_pins axi_bram_ctrl_0_bram/BRAM_PORTB] + + # Create port connections + connect_bd_net -net Vortex_top_0_busy [get_bd_ports vx_busy] [get_bd_pins Vortex_top_0/busy] + connect_bd_net -net clk_wiz_clk_out1 [get_bd_ports clk_100MHz] [get_bd_pins Vortex_top_0/clk] [get_bd_pins axi_bram_ctrl_0/s_axi_aclk] + connect_bd_net -net resetn_1 [get_bd_ports resetn] [get_bd_pins axi_bram_ctrl_0/s_axi_aresetn] + connect_bd_net -net vx_reset_1 [get_bd_ports vx_reset] [get_bd_pins Vortex_top_0/reset] + connect_bd_net -net dcr_wr_valid_1 [get_bd_ports dcr_wr_valid] [get_bd_pins Vortex_top_0/dcr_wr_valid] + connect_bd_net -net dcr_wr_addr_1 [get_bd_ports dcr_wr_addr] [get_bd_pins Vortex_top_0/dcr_wr_addr] + connect_bd_net -net dcr_wr_data_1 [get_bd_ports dcr_wr_data] [get_bd_pins Vortex_top_0/dcr_wr_data] + + # Create address segments + assign_bd_address -offset 0x00000000 -range 0x00100000 -target_address_space [get_bd_addr_spaces Vortex_top_0/m_axi_mem] [get_bd_addr_segs axi_bram_ctrl_0/S_AXI/Mem0] -force + + # Perform GUI Layout + regenerate_bd_layout -layout_string { + "ActiveEmotionalView":"Default View", + "Default View_ScaleFactor":"1.0", + "Default View_TopLeft":"-195,-165", + "ExpandedHierarchyInLayout":"", + "guistr":"# # String gsaved with Nlview 7.0r4 2019-12-20 bk=1.5203 VDI=41 GEI=36 GUI=JA:10.0 TLS + # -string -flagsOSRD + preplace port clk_100MHz -pg 1 -lvl 0 -x 0 -y 40 -defaultsOSRD + preplace port resetn -pg 1 -lvl 0 -x 0 -y 20 -defaultsOSRD + preplace port vx_busy -pg 1 -lvl 4 -x 950 -y 220 -defaultsOSRD + preplace port vx_reset -pg 1 -lvl 0 -x 0 -y 110 -defaultsOSRD + preplace port dcr_wr_valid -pg 1 -lvl 0 -x 0 -y 130 -defaultsOSRD + preplace portBus dcr_wr_addr -pg 1 -lvl 0 -x 0 -y 150 -defaultsOSRD + preplace portBus dcr_wr_data -pg 1 -lvl 0 -x 0 -y 170 -defaultsOSRD + preplace inst Vortex_top_0 
-pg 1 -lvl 1 -x 190 -y 130 -defaultsOSRD + preplace inst axi_bram_ctrl_0 -pg 1 -lvl 2 -x 520 -y 140 -defaultsOSRD + preplace inst axi_bram_ctrl_0_bram -pg 1 -lvl 3 -x 800 -y 140 -defaultsOSRD + preplace netloc Vortex_top_0_busy 1 1 3 360J 220 NJ 220 NJ + preplace netloc clk_wiz_clk_out1 1 0 2 20 30 370 + preplace netloc resetn_1 1 0 2 NJ 20 380J + preplace netloc vx_reset_1 1 0 1 NJ 110 + preplace netloc dcr_wr_valid_1 1 0 1 NJ 130 + preplace netloc dcr_wr_addr_1 1 0 1 NJ 150 + preplace netloc dcr_wr_data_1 1 0 1 NJ 170 + preplace netloc axi_bram_ctrl_0_BRAM_PORTB 1 2 1 N 150 + preplace netloc axi_bram_ctrl_0_BRAM_PORTA 1 2 1 N 130 + preplace netloc Vortex_top_0_m_axi_mem 1 1 1 N 120 + levelinfo -pg 1 0 190 520 800 950 + pagesize -pg 1 -db -bbox -sgen -180 0 1060 240 + " + } + + # Restore current instance + current_bd_instance $oldCurInst + + validate_bd_design + save_bd_design + close_bd_design $design_name + } + # End of cr_bd_design_1() + cr_bd_design_1 "" + set_property EXCLUDE_DEBUG_LOGIC "0" [get_files design_1.bd ] + set_property GENERATE_SYNTH_CHECKPOINT "1" [get_files design_1.bd ] + set_property IS_ENABLED "1" [get_files design_1.bd ] + set_property IS_GLOBAL_INCLUDE "0" [get_files design_1.bd ] + #set_property IS_LOCKED "0" [get_files design_1.bd ] + set_property LIBRARY "xil_defaultlib" [get_files design_1.bd ] + set_property PATH_MODE "RelativeFirst" [get_files design_1.bd ] + set_property PFM_NAME "" [get_files design_1.bd ] + set_property REGISTERED_WITH_MANAGER "1" [get_files design_1.bd ] + set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [get_files design_1.bd ] + set_property USED_IN "synthesis implementation simulation" [get_files design_1.bd ] + set_property USED_IN_IMPLEMENTATION "1" [get_files design_1.bd ] + set_property USED_IN_SIMULATION "1" [get_files design_1.bd ] + set_property USED_IN_SYNTHESIS "1" [get_files design_1.bd ] + + # Call make_wrapper to create wrapper files + set wrapper_path [make_wrapper -fileset sources_1 -files [ 
get_files -norecurse design_1.bd] -top]
+  add_files -norecurse -fileset sources_1 $wrapper_path
+
+  # register compilation hooks
+  #set_property STEPS.SYNTH_DESIGN.TCL.PRE ${script_dir}/pre_synth_hook.tcl [get_runs synth_1]
+  #set_property STEPS.SYNTH_DESIGN.TCL.POST ${script_dir}/post_synth_hook.tcl [get_runs synth_1]
+  set_property STEPS.OPT_DESIGN.TCL.PRE ${script_dir}/pre_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.OPT_DESIGN.TCL.POST ${script_dir}/post_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.POWER_OPT_DESIGN.TCL.PRE ${script_dir}/pre_power_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.POWER_OPT_DESIGN.TCL.POST ${script_dir}/post_power_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.PLACE_DESIGN.TCL.PRE ${script_dir}/pre_place_hook.tcl [get_runs impl_1]
+  #set_property STEPS.PLACE_DESIGN.TCL.POST ${script_dir}/post_place_hook.tcl [get_runs impl_1]
+  #set_property STEPS.POST_PLACE_POWER_OPT_DESIGN.TCL.PRE ${script_dir}/pre_place_power_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.POST_PLACE_POWER_OPT_DESIGN.TCL.POST ${script_dir}/post_place_power_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.PHYS_OPT_DESIGN.TCL.PRE ${script_dir}/pre_phys_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.PHYS_OPT_DESIGN.TCL.POST ${script_dir}/post_phys_opt_hook.tcl [get_runs impl_1]
+  #set_property STEPS.ROUTE_DESIGN.TCL.PRE ${script_dir}/pre_route_hook.tcl [get_runs impl_1]
+  #set_property STEPS.ROUTE_DESIGN.TCL.POST ${script_dir}/post_route_hook.tcl [get_runs impl_1]
+  #set_property STEPS.WRITE_BITSTREAM.TCL.PRE ${script_dir}/pre_bitstream_hook.tcl [get_runs impl_1]
+  #set_property STEPS.WRITE_BITSTREAM.TCL.POST ${script_dir}/post_bitstream_hook.tcl [get_runs impl_1]
+
+  update_compile_order -fileset sources_1
+}
+
+proc run_synthesis {} {
+  global num_jobs
+  # Launch synth_1 (passing -jobs only when num_jobs is non-zero), wait for it, then open the run and write utilization.rpt and post_synth.dcp
+  if {$num_jobs != 0} {
+    launch_runs synth_1 -jobs $num_jobs
+  } else {
+    launch_runs synth_1
+  }
+  wait_on_run synth_1
+  open_run synth_1
+  report_utilization -file utilization.rpt -hierarchical -hierarchical_percentages
+
+  write_checkpoint -force post_synth.dcp
+}
+
+proc run_implementation {} {
+  global num_jobs
+
+  # Launch impl_1 (passing -jobs only when num_jobs is non-zero), wait for it, then open the run and write place.rpt, route.rpt and post_impl.dcp
+  if {$num_jobs != 0} {
+    launch_runs impl_1 -jobs $num_jobs
+  } else {
+    launch_runs impl_1
+  }
+  wait_on_run impl_1
+  open_run impl_1
+  report_place_status -file place.rpt
+  report_route_status -file route.rpt
+  write_checkpoint -force post_impl.dcp
+}
+
+proc run_report {} {
+  # Write the timing summary, power and DRC reports (timing.rpt, power.rpt, drc.rpt)
+  report_timing_summary -file timing.rpt
+  report_power -file power.rpt
+  report_drc -file drc.rpt
+}
+
+###############################################################################
+
+# Record the wall-clock start so the total elapsed time can be printed at the end
+set start_time [clock seconds]
+
+# Resume from the most advanced checkpoint present in the working directory; otherwise run the full flow
+if { [file exists post_impl.dcp] } {
+  puts "Resuming from post-implementation checkpoint: post_impl.dcp"
+  open_checkpoint post_impl.dcp
+  run_report
+} elseif { [file exists post_synth.dcp] } {
+  puts "Resuming from post-synthesis checkpoint: post_synth.dcp"
+  open_checkpoint post_synth.dcp ;# NOTE(review): run_implementation calls launch_runs impl_1, which presumably needs an open project containing that run - confirm it exists on this resume path
+  run_implementation
+  run_report
+} else {
+  # No checkpoint found: set up the project, then synthesize, implement and report
+  run_setup
+  run_synthesis
+  run_implementation
+  run_report
+}
+
+# Elapsed wall-clock time in seconds
+set elapsed_time [expr {[clock seconds] - $start_time}]
+
+# Split the elapsed seconds into zero-padded hours, minutes and seconds for display
+set hours [format "%02d" [expr {$elapsed_time / 3600}]]
+set minutes [format "%02d" [expr {($elapsed_time % 3600) / 60}]]
+set seconds [format "%02d" [expr {$elapsed_time % 60}]]
+puts "Total elapsed time: ${hours}h ${minutes}m ${seconds}s"
\ No newline at end of file
diff --git a/hw/syn/xilinx/sandbox/simulate.tcl b/hw/syn/xilinx/sandbox/simulate.tcl
new file mode 100644
index 000000000..7ba85669c
--- /dev/null
+++ b/hw/syn/xilinx/sandbox/simulate.tcl
@@ -0,0 +1,34 @@
+# Copyright © 2019-2023
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if { $::argc != 2 } {
+  puts "ERROR: Program \"$::argv0\" requires 2 arguments!\n"
+  puts "Usage: $::argv0 project_file sim_time\n"
+  exit 1 ;# non-zero status so the invoking make/shell step sees the failure
+}
+
+set project_file [lindex $::argv 0]
+set sim_time [lindex $::argv 1]
+
+set tb_name testbench ;# Replace with actual testbench module
+
+open_project $project_file ;# Ensure correct project is loaded
+
+# Make the testbench module the top of the sim_1 simulation fileset
+set_property top $tb_name [get_filesets sim_1]
+
+# Launch the simulation in behavioral mode
+launch_simulation -mode behavioral
+
+# Run for the simulation time given as the second script argument
+run $sim_time
diff --git a/hw/syn/xilinx/test/project_1_files/testbench.v b/hw/syn/xilinx/sandbox/testbench.v
similarity index 100%
rename from hw/syn/xilinx/test/project_1_files/testbench.v
rename to hw/syn/xilinx/sandbox/testbench.v
diff --git a/hw/syn/xilinx/test/Makefile b/hw/syn/xilinx/test/Makefile
deleted file mode 100644
index e15789516..000000000
--- a/hw/syn/xilinx/test/Makefile
+++ /dev/null
@@ -1,54 +0,0 @@
-ROOT_DIR := $(realpath ../../../..)
-include $(ROOT_DIR)/config.mk - -VIVADO := $(XILINX_VIVADO)/bin/vivado - -SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/test - -RTL_DIR := $(VORTEX_HOME)/hw/rtl -DPI_DIR := $(VORTEX_HOME)/hw/dpi -AFU_DIR := $(RTL_DIR)/afu/opae -SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts - -# include paths -FPU_INCLUDE = -I$(RTL_DIR)/fpu -ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src -endif -RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -RTL_INCLUDE += $(FPU_INCLUDE) -RTL_INCLUDE += -I$(SRC_DIR)/project_1_files - -# compilation flags -CFLAGS += -DNDEBUG -DSYNTHESIS -DVIVADO -CFLAGS += $(CONFIGS) -CFLAGS += $(RTL_INCLUDE) -CFLAGS += -DEXT_F_DISABLE -#CFLAGS += -DNUM_CORES 4 - -# update memory layout for 2MB RAM -CFLAGS += -DSTARTUP_ADDR=32\'h80000 -CFLAGS += -DIO_BASE_ADDR=32\'hFF000 - -COE_FILE := $(SRC_DIR)/project_1_files/kernel.bin.coe -ESCAPED_COE_FILE := $(shell echo "$(COE_FILE)" | sed -e 's/[\/&]/\\&/g') - -all: build - -gen-sources: project_1/sources.txt -project_1/sources.txt: - mkdir -p project_1 - $(SCRIPT_DIR)/gen_sources.sh $(CFLAGS) -P -Cproject_1/src -Oproject_1/sources.txt - -project.tcl: project.tcl.in - sed -e 's/%COE_FILE%/$(ESCAPED_COE_FILE)/g' < $< > $@ - -build: project_1/vortex.xpr -project_1/vortex.xpr: project_1/sources.txt project.tcl - $(VIVADO) -mode batch -source project.tcl -tclargs project_1/sources.txt project_1/src $(SCRIPT_DIR) - -run: project_1/vortex.xpr - $(VIVADO) project_1/vortex.xpr & - -clean: - rm -rf project_1 project.tcl diff --git a/hw/syn/xilinx/test/kernel/Makefile b/hw/syn/xilinx/test/kernel/Makefile deleted file mode 100644 index 515533689..000000000 --- a/hw/syn/xilinx/test/kernel/Makefile +++ /dev/null @@ -1,51 +0,0 @@ -ROOT_DIR := $(realpath ../../../../..) 
-include $(ROOT_DIR)/config.mk - -ifeq ($(XLEN),64) -CFLAGS += -march=rv64imafd -mabi=lp64d -else -CFLAGS += -march=rv32imaf -mabi=ilp32f -endif - -SRC_DIR := $(VORTEX_HOME)/hw/syn/xilinx/test/kernel - -SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts - -BIN2COE_PATH ?= $(SCRIPT_DIR)/bin2coe - -CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc -AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar -DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump -CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy - -CFLAGS += -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections -CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_HOME)/hw - -LDFLAGS += -lm -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/scripts/link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 - -PROJECT = kernel - -SRCS = $(SRC_DIR)/main.c $(SRC_DIR)/start.S - -all: $(PROJECT).elf $(PROJECT).hex $(PROJECT).bin $(PROJECT).dump $(PROJECT).bin.coe - -$(PROJECT).dump: $(PROJECT).elf - $(DP) -D $< > $@ - -$(PROJECT).hex: $(PROJECT).elf - $(CP) -O ihex $< $@ - -$(PROJECT).bin: $(PROJECT).elf - $(CP) -O binary $< $@ - -$(PROJECT).bin.coe: $(PROJECT).bin - $(BIN2COE_PATH)/bin2coe $< --out=$@ --binary=$(PROJECT).bin --data=$(PROJECT).dat --binaddr=8192 --depth=16384 --wordsize=64 - -$(PROJECT).elf: $(SRCS) - $(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@ - -.depend: $(SRCS) - $(CC) $(CFLAGS) -MM $^ > .depend; - -clean: - rm -rf *.bin *.elf *.hex *.dump *.coe .depend diff --git a/hw/syn/xilinx/test/kernel/kernel.dat b/hw/syn/xilinx/test/kernel/kernel.dat deleted file mode 100644 index 6e197b719..000000000 --- a/hw/syn/xilinx/test/kernel/kernel.dat +++ /dev/null @@ -1,3 +0,0 @@ -@1 -000000C00000008000000002, -00000003000000020000000100000000, \ No newline at end of file diff --git a/hw/syn/xilinx/test/project.tcl.in b/hw/syn/xilinx/test/project.tcl.in deleted file mode 100644 index a2692f637..000000000 --- a/hw/syn/xilinx/test/project.tcl.in +++ /dev/null @@ -1,2228 +0,0 @@ -if { $::argc != 3 } { - puts 
"ERROR: Program \"$::argv0\" requires 3 arguments!\n" - puts "Usage: $::argv0 \n" - exit -} - -set vcs_file [lindex $::argv 0] -set files_dir [lindex $::argv 1] -set tool_dir [lindex $::argv 2] - -#puts $vcs_file -#puts $files_dir -#puts $tool_dir - -set origin_dir [file normalize "."] - -# Use origin directory path location variable, if specified in the tcl shell -if { [info exists ::origin_dir_loc] } { - set origin_dir $::origin_dir_loc -} - -# Set the project name -set project_name "project_1" - -# Use project name variable, if specified in the tcl shell -if { [info exists ::user_project_name] } { - set project_name $::user_project_name -} - -source "${tool_dir}/parse_vcs_list.tcl" -set vlist [parse_vcs_list "${vcs_file}"] - -set vsources_list [lindex $vlist 0] -set vincludes_list [lindex $vlist 1] -set vdefines_list [lindex $vlist 2] - -#puts ${vsources_list} -#puts ${vincludes_list} -#puts ${vdefines_list} - -# Create project -create_project ${project_name} ./${project_name} -force -part xcu280-fsvh2892-2L-e - -# Set the directory path for the new project -set proj_dir [get_property directory [current_project]] - -# Set project properties -set obj [current_project] -set_property -name "board_part" -value "xilinx.com:au280:part0:1.1" -objects $obj -set_property -name "compxlib.activehdl_compiled_library_dir" -value "$proj_dir/${project_name}.cache/compile_simlib/activehdl" -objects $obj -set_property -name "compxlib.funcsim" -value "1" -objects $obj -set_property -name "compxlib.ies_compiled_library_dir" -value "$proj_dir/${project_name}.cache/compile_simlib/ies" -objects $obj -set_property -name "compxlib.modelsim_compiled_library_dir" -value "$proj_dir/${project_name}.cache/compile_simlib/modelsim" -objects $obj -set_property -name "compxlib.overwrite_libs" -value "0" -objects $obj -set_property -name "compxlib.questa_compiled_library_dir" -value "$proj_dir/${project_name}.cache/compile_simlib/questa" -objects $obj -set_property -name 
"compxlib.riviera_compiled_library_dir" -value "$proj_dir/${project_name}.cache/compile_simlib/riviera" -objects $obj -set_property -name "compxlib.timesim" -value "1" -objects $obj -set_property -name "compxlib.vcs_compiled_library_dir" -value "$proj_dir/${project_name}.cache/compile_simlib/vcs" -objects $obj -set_property -name "compxlib.xsim_compiled_library_dir" -value "" -objects $obj -set_property -name "corecontainer.enable" -value "0" -objects $obj -set_property -name "default_lib" -value "xil_defaultlib" -objects $obj -set_property -name "enable_optional_runs_sta" -value "0" -objects $obj -set_property -name "enable_vhdl_2008" -value "1" -objects $obj -set_property -name "generate_ip_upgrade_log" -value "1" -objects $obj -set_property -name "ip_cache_permissions" -value "read write" -objects $obj -set_property -name "ip_interface_inference_priority" -value "" -objects $obj -set_property -name "ip_output_repo" -value "$proj_dir/${project_name}.cache/ip" -objects $obj -set_property -name "legacy_ip_repo_paths" -value "" -objects $obj -set_property -name "mem.enable_memory_map_generation" -value "1" -objects $obj -set_property -name "platform.board_id" -value "au280" -objects $obj -set_property -name "platform.default_output_type" -value "undefined" -objects $obj -set_property -name "platform.design_intent.datacenter" -value "undefined" -objects $obj -set_property -name "platform.design_intent.embedded" -value "undefined" -objects $obj -set_property -name "platform.design_intent.external_host" -value "undefined" -objects $obj -set_property -name "platform.design_intent.server_managed" -value "undefined" -objects $obj -set_property -name "platform.rom.debug_type" -value "0" -objects $obj -set_property -name "platform.rom.prom_type" -value "0" -objects $obj -set_property -name "platform.slrconstraintmode" -value "0" -objects $obj -set_property -name "preferred_sim_model" -value "rtl" -objects $obj -set_property -name "project_type" -value "Default" -objects 
$obj -set_property -name "pr_flow" -value "0" -objects $obj -set_property -name "sim.central_dir" -value "$proj_dir/${project_name}.ip_user_files" -objects $obj -set_property -name "sim.ip.auto_export_scripts" -value "1" -objects $obj -set_property -name "sim.use_ip_compiled_libs" -value "1" -objects $obj -set_property -name "simulator.activehdl_gcc_install_dir" -value "" -objects $obj -set_property -name "simulator.activehdl_install_dir" -value "" -objects $obj -set_property -name "simulator.ies_gcc_install_dir" -value "" -objects $obj -set_property -name "simulator.ies_install_dir" -value "" -objects $obj -set_property -name "simulator.modelsim_gcc_install_dir" -value "" -objects $obj -set_property -name "simulator.modelsim_install_dir" -value "" -objects $obj -set_property -name "simulator.questa_gcc_install_dir" -value "" -objects $obj -set_property -name "simulator.riviera_gcc_install_dir" -value "" -objects $obj -set_property -name "simulator.riviera_install_dir" -value "" -objects $obj -set_property -name "simulator.vcs_gcc_install_dir" -value "" -objects $obj -set_property -name "simulator.vcs_install_dir" -value "" -objects $obj -set_property -name "simulator.xcelium_gcc_install_dir" -value "" -objects $obj -set_property -name "simulator.xcelium_install_dir" -value "" -objects $obj -set_property -name "simulator_language" -value "Verilog" -objects $obj -set_property -name "source_mgmt_mode" -value "All" -objects $obj -set_property -name "target_language" -value "Verilog" -objects $obj -set_property -name "target_simulator" -value "XSim" -objects $obj -set_property -name "tool_flow" -value "Vivado" -objects $obj -set_property -name "webtalk.activehdl_export_sim" -value "27" -objects $obj -set_property -name "webtalk.ies_export_sim" -value "27" -objects $obj -set_property -name "webtalk.modelsim_export_sim" -value "27" -objects $obj -set_property -name "webtalk.questa_export_sim" -value "27" -objects $obj -set_property -name "webtalk.riviera_export_sim" 
-value "27" -objects $obj -set_property -name "webtalk.vcs_export_sim" -value "27" -objects $obj -set_property -name "webtalk.xcelium_export_sim" -value "5" -objects $obj -set_property -name "webtalk.xsim_export_sim" -value "27" -objects $obj -set_property -name "webtalk.xsim_launch_sim" -value "91" -objects $obj -set_property -name "xpm_libraries" -value "XPM_CDC XPM_MEMORY" -objects $obj -set_property -name "xsim.array_display_limit" -value "1024" -objects $obj -set_property -name "xsim.radix" -value "hex" -objects $obj -set_property -name "xsim.time_unit" -value "ns" -objects $obj -set_property -name "xsim.trace_limit" -value "65536" -objects $obj - -# Create 'sources_1' fileset (if not found) -if {[string equal [get_filesets -quiet sources_1] ""]} { - create_fileset -srcset sources_1 -} - -# add source files -set obj [get_filesets sources_1] -add_files -norecurse -verbose -fileset $obj ${vsources_list} - -# process defines -set obj [get_filesets sources_1] -foreach def $vdefines_list { - set_property -name "verilog_define" -value $def -objects $obj -} - -# Set 'sources_1' fileset properties -set obj [get_filesets sources_1] -set_property -name "design_mode" -value "RTL" -objects $obj -set_property -name "edif_extra_search_paths" -value "" -objects $obj -set_property -name "elab_link_dcps" -value "1" -objects $obj -set_property -name "elab_load_timing_constraints" -value "1" -objects $obj -set_property -name "generic" -value "" -objects $obj -set_property -name "include_dirs" -value "" -objects $obj -set_property -name "lib_map_file" -value "" -objects $obj -set_property -name "loop_count" -value "1000" -objects $obj -set_property -name "name" -value "sources_1" -objects $obj -set_property -name "top" -value "design_1_wrapper" -objects $obj -set_property -name "top_auto_set" -value "0" -objects $obj -set_property -name "verilog_define" -value "" -objects $obj -set_property -name "verilog_uppercase" -value "1" -objects $obj -set_property -name "verilog_version" 
-value "verilog_2001" -objects $obj -set_property -name "vhdl_version" -value "vhdl_2k" -objects $obj - -# Create 'constrs_1' fileset (if not found) -if {[string equal [get_filesets -quiet constrs_1] ""]} { - create_fileset -constrset constrs_1 -} - -# Set 'constrs_1' fileset object -set obj [get_filesets constrs_1] - -# Empty (no sources present) - -# Set 'constrs_1' fileset properties -set obj [get_filesets constrs_1] -set_property -name "constrs_type" -value "XDC" -objects $obj -set_property -name "name" -value "constrs_1" -objects $obj -set_property -name "target_constrs_file" -value "" -objects $obj - -# Create 'sim_1' fileset (if not found) -if {[string equal [get_filesets -quiet sim_1] ""]} { - create_fileset -simset sim_1 -} - -# Set 'sim_1' fileset object -set obj [get_filesets sim_1] -# Import local files from the original project -set files [list \ - [file normalize "$files_dir/testbench.v" ]\ -] -set imported_files [import_files -fileset sim_1 $files] - -# Set 'sim_1' fileset file properties for remote files -# None - -# Set 'sim_1' fileset file properties for local files -set file "testbench.v" -set file_obj [get_files -of_objects [get_filesets sim_1] [list "*$file"]] -set_property -name "file_type" -value "Verilog" -objects $file_obj -set_property -name "is_enabled" -value "1" -objects $file_obj -set_property -name "is_global_include" -value "0" -objects $file_obj -set_property -name "library" -value "xil_defaultlib" -objects $file_obj -set_property -name "path_mode" -value "RelativeFirst" -objects $file_obj -set_property -name "used_in" -value "synthesis implementation simulation" -objects $file_obj -set_property -name "used_in_implementation" -value "1" -objects $file_obj -set_property -name "used_in_simulation" -value "1" -objects $file_obj -set_property -name "used_in_synthesis" -value "1" -objects $file_obj - -# Set 'sim_1' fileset properties -set obj [get_filesets sim_1] -set_property -name "32bit" -value "0" -objects $obj -set_property -name 
"force_compile_glbl" -value "0" -objects $obj -set_property -name "generate_scripts_only" -value "0" -objects $obj -set_property -name "generic" -value "" -objects $obj -set_property -name "hbs.configure_design_for_hier_access" -value "1" -objects $obj -set_property -name "include_dirs" -value "" -objects $obj -set_property -name "incremental" -value "1" -objects $obj -set_property -name "name" -value "sim_1" -objects $obj -set_property -name "nl.cell" -value "" -objects $obj -set_property -name "nl.incl_unisim_models" -value "0" -objects $obj -set_property -name "nl.mode" -value "funcsim" -objects $obj -set_property -name "nl.process_corner" -value "slow" -objects $obj -set_property -name "nl.rename_top" -value "" -objects $obj -set_property -name "nl.sdf_anno" -value "1" -objects $obj -set_property -name "nl.write_all_overrides" -value "0" -objects $obj -set_property -name "source_set" -value "sources_1" -objects $obj -set_property -name "systemc_include_dirs" -value "" -objects $obj -set_property -name "top" -value "testbench" -objects $obj -set_property -name "top_auto_set" -value "0" -objects $obj -set_property -name "top_lib" -value "xil_defaultlib" -objects $obj -set_property -name "transport_int_delay" -value "0" -objects $obj -set_property -name "transport_path_delay" -value "0" -objects $obj -set_property -name "unifast" -value "0" -objects $obj -set_property -name "verilog_define" -value "" -objects $obj -set_property -name "verilog_uppercase" -value "0" -objects $obj -set_property -name "xelab.dll" -value "0" -objects $obj -set_property -name "xsim.compile.tcl.pre" -value "" -objects $obj -set_property -name "xsim.compile.xsc.more_options" -value "" -objects $obj -set_property -name "xsim.compile.xvhdl.more_options" -value "" -objects $obj -set_property -name "xsim.compile.xvhdl.nosort" -value "1" -objects $obj -set_property -name "xsim.compile.xvhdl.relax" -value "1" -objects $obj -set_property -name "xsim.compile.xvlog.more_options" -value "" -objects 
$obj -set_property -name "xsim.compile.xvlog.nosort" -value "1" -objects $obj -set_property -name "xsim.compile.xvlog.relax" -value "1" -objects $obj -set_property -name "xsim.elaborate.debug_level" -value "typical" -objects $obj -set_property -name "xsim.elaborate.load_glbl" -value "1" -objects $obj -set_property -name "xsim.elaborate.mt_level" -value "auto" -objects $obj -set_property -name "xsim.elaborate.rangecheck" -value "0" -objects $obj -set_property -name "xsim.elaborate.relax" -value "1" -objects $obj -set_property -name "xsim.elaborate.sdf_delay" -value "sdfmax" -objects $obj -set_property -name "xsim.elaborate.snapshot" -value "" -objects $obj -set_property -name "xsim.elaborate.xelab.more_options" -value "" -objects $obj -set_property -name "xsim.elaborate.xsc.more_options" -value "" -objects $obj -set_property -name "xsim.simulate.add_positional" -value "0" -objects $obj -set_property -name "xsim.simulate.custom_tcl" -value "" -objects $obj -set_property -name "xsim.simulate.log_all_signals" -value "0" -objects $obj -set_property -name "xsim.simulate.no_quit" -value "0" -objects $obj -set_property -name "xsim.simulate.runtime" -value "4000ns" -objects $obj -set_property -name "xsim.simulate.saif" -value "" -objects $obj -set_property -name "xsim.simulate.saif_all_signals" -value "0" -objects $obj -set_property -name "xsim.simulate.saif_scope" -value "" -objects $obj -set_property -name "xsim.simulate.tcl.post" -value "" -objects $obj -set_property -name "xsim.simulate.wdb" -value "" -objects $obj -set_property -name "xsim.simulate.xsim.more_options" -value "" -objects $obj - -# Set 'utils_1' fileset object -set obj [get_filesets utils_1] -# Empty (no sources present) - -# Set 'utils_1' fileset properties -set obj [get_filesets utils_1] -set_property -name "name" -value "utils_1" -objects $obj - -# Proc to create BD design_1 -proc cr_bd_design_1 { parentCell } { -# The design that will be created by this Tcl proc contains the following -# module 
references: -# Vortex_top - -# CHANGE DESIGN NAME HERE -set design_name design_1 - -common::send_gid_msg -ssname BD::TCL -id 2010 -severity "INFO" "Currently there is no design <$design_name> in project, so creating one..." - -create_bd_design $design_name - -set bCheckIPsPassed 1 -################################################################## -# CHECK IPs -################################################################## -set bCheckIPs 1 -if { $bCheckIPs == 1 } { - set list_check_ips "\ - xilinx.com:ip:axi_bram_ctrl:4.1\ - xilinx.com:ip:blk_mem_gen:8.4\ - " - - set list_ips_missing "" - common::send_gid_msg -ssname BD::TCL -id 2011 -severity "INFO" "Checking if the following IPs exist in the project's IP catalog: $list_check_ips ." - - foreach ip_vlnv $list_check_ips { - set ip_obj [get_ipdefs -all $ip_vlnv] - if { $ip_obj eq "" } { - lappend list_ips_missing $ip_vlnv - } - } - - if { $list_ips_missing ne "" } { - catch {common::send_gid_msg -ssname BD::TCL -id 2012 -severity "ERROR" "The following IPs are not found in the IP Catalog:\n $list_ips_missing\n\nResolution: Please add the repository containing the IP(s) to the project." } - set bCheckIPsPassed 0 - } - - } - - ################################################################## - # CHECK Modules - ################################################################## - set bCheckModules 1 - if { $bCheckModules == 1 } { - set list_check_mods "\ - Vortex_top\ - " - - set list_mods_missing "" - common::send_gid_msg -ssname BD::TCL -id 2020 -severity "INFO" "Checking if the following modules exist in the project's sources: $list_check_mods ." 
- - foreach mod_vlnv $list_check_mods { - if { [can_resolve_reference $mod_vlnv] == 0 } { - lappend list_mods_missing $mod_vlnv - } - } - - if { $list_mods_missing ne "" } { - catch {common::send_gid_msg -ssname BD::TCL -id 2021 -severity "ERROR" "The following module(s) are not found in the project: $list_mods_missing" } - common::send_gid_msg -ssname BD::TCL -id 2022 -severity "INFO" "Please add source files for the missing module(s) above." - set bCheckIPsPassed 0 - } -} - -if { $bCheckIPsPassed != 1 } { - common::send_gid_msg -ssname BD::TCL -id 2023 -severity "WARNING" "Will not continue with creation of design due to the error(s) above." - return 3 -} - -variable script_folder - -if { $parentCell eq "" } { - set parentCell [get_bd_cells /] -} - -# Get object for parentCell -set parentObj [get_bd_cells $parentCell] -if { $parentObj == "" } { - catch {common::send_gid_msg -ssname BD::TCL -id 2090 -severity "ERROR" "Unable to find parent cell <$parentCell>!"} - return -} - -# Make sure parentObj is hier blk -set parentType [get_property TYPE $parentObj] -if { $parentType ne "hier" } { - catch {common::send_gid_msg -ssname BD::TCL -id 2091 -severity "ERROR" "Parent <$parentObj> has TYPE = <$parentType>. Expected to be ."} - return -} - -# Save current instance; Restore later -set oldCurInst [current_bd_instance .] 
- -# Set parent object as current -current_bd_instance $parentObj - - -# Create interface ports - -# Create ports -set clk_100MHz [ create_bd_port -dir I -type clk -freq_hz 100000000 clk_100MHz ] -set resetn [ create_bd_port -dir I -type rst resetn ] -set_property -dict [ list \ - CONFIG.POLARITY {ACTIVE_LOW} \ -] $resetn -set vx_busy [ create_bd_port -dir O vx_busy ] -set vx_reset [ create_bd_port -dir I -type rst vx_reset ] -set_property -dict [ list \ - CONFIG.POLARITY {ACTIVE_HIGH} \ -] $vx_reset - -set dcr_wr_valid [ create_bd_port -dir I dcr_wr_valid ] -set dcr_wr_addr [ create_bd_port -dir I -from 11 -to 0 dcr_wr_addr ] -set dcr_wr_data [ create_bd_port -dir I -from 31 -to 0 dcr_wr_data ] - -# Create instance: Vortex_top_0, and set properties -set block_name Vortex_top -set block_cell_name Vortex_top_0 -if { [catch {set Vortex_top_0 [create_bd_cell -type module -reference $block_name $block_cell_name] } errmsg] } { - catch {common::send_gid_msg -ssname BD::TCL -id 2095 -severity "ERROR" "Unable to add referenced block <$block_name>. Please add the files for ${block_name}'s definition into the project."} - return 1 - } elseif { $Vortex_top_0 eq "" } { - catch {common::send_gid_msg -ssname BD::TCL -id 2096 -severity "ERROR" "Unable to referenced block <$block_name>. 
Please add the files for ${block_name}'s definition into the project."} - return 1 - } - -# Create instance: axi_bram_ctrl_0, and set properties -set axi_bram_ctrl_0 [ create_bd_cell -type ip -vlnv xilinx.com:ip:axi_bram_ctrl:4.1 axi_bram_ctrl_0 ] -set_property -dict [ list \ - CONFIG.DATA_WIDTH {512} \ - CONFIG.ECC_TYPE {0} \ -] $axi_bram_ctrl_0 - -# Create instance: axi_bram_ctrl_0_bram, and set properties -set axi_bram_ctrl_0_bram [ create_bd_cell -type ip -vlnv xilinx.com:ip:blk_mem_gen:8.4 axi_bram_ctrl_0_bram ] - -set_property -dict [ list \ - CONFIG.Assume_Synchronous_Clk {true} \ - CONFIG.Byte_Size {8} \ - CONFIG.Load_Init_File {true} \ - CONFIG.Coe_File {%COE_FILE%} \ - CONFIG.EN_SAFETY_CKT {true} \ - CONFIG.Enable_32bit_Address {true} \ - CONFIG.Fill_Remaining_Memory_Locations {false} \ - CONFIG.Memory_Type {Simple_Dual_Port_RAM} \ - CONFIG.Operating_Mode_A {NO_CHANGE} \ - CONFIG.Operating_Mode_B {READ_FIRST} \ - CONFIG.Port_B_Write_Rate {0} \ - CONFIG.Read_Width_A {512} \ - CONFIG.Read_Width_B {512} \ - CONFIG.Register_PortA_Output_of_Memory_Primitives {false} \ - CONFIG.Register_PortB_Output_of_Memory_Primitives {false} \ - CONFIG.Remaining_Memory_Locations {0} \ - CONFIG.Use_Byte_Write_Enable {true} \ - CONFIG.Use_RSTA_Pin {false} \ - CONFIG.Use_RSTB_Pin {true} \ - CONFIG.Write_Width_A {512} \ - CONFIG.Write_Depth_A {16384} \ - CONFIG.use_bram_block {Stand_Alone} \ -] $axi_bram_ctrl_0_bram - -# Create interface connections -connect_bd_intf_net -intf_net Vortex_top_0_m_axi_mem [get_bd_intf_pins Vortex_top_0/m_axi_mem] [get_bd_intf_pins axi_bram_ctrl_0/S_AXI] -connect_bd_intf_net -intf_net axi_bram_ctrl_0_BRAM_PORTA [get_bd_intf_pins axi_bram_ctrl_0/BRAM_PORTA] [get_bd_intf_pins axi_bram_ctrl_0_bram/BRAM_PORTA] -connect_bd_intf_net -intf_net axi_bram_ctrl_0_BRAM_PORTB [get_bd_intf_pins axi_bram_ctrl_0/BRAM_PORTB] [get_bd_intf_pins axi_bram_ctrl_0_bram/BRAM_PORTB] - -# Create port connections -connect_bd_net -net Vortex_top_0_busy [get_bd_ports vx_busy] 
[get_bd_pins Vortex_top_0/busy] -connect_bd_net -net clk_wiz_clk_out1 [get_bd_ports clk_100MHz] [get_bd_pins Vortex_top_0/clk] [get_bd_pins axi_bram_ctrl_0/s_axi_aclk] -connect_bd_net -net resetn_1 [get_bd_ports resetn] [get_bd_pins axi_bram_ctrl_0/s_axi_aresetn] -connect_bd_net -net vx_reset_1 [get_bd_ports vx_reset] [get_bd_pins Vortex_top_0/reset] -connect_bd_net -net dcr_wr_valid_1 [get_bd_ports dcr_wr_valid] [get_bd_pins Vortex_top_0/dcr_wr_valid] -connect_bd_net -net dcr_wr_addr_1 [get_bd_ports dcr_wr_addr] [get_bd_pins Vortex_top_0/dcr_wr_addr] -connect_bd_net -net dcr_wr_data_1 [get_bd_ports dcr_wr_data] [get_bd_pins Vortex_top_0/dcr_wr_data] - -# Create address segments -assign_bd_address -offset 0x00000000 -range 0x00100000 -target_address_space [get_bd_addr_spaces Vortex_top_0/m_axi_mem] [get_bd_addr_segs axi_bram_ctrl_0/S_AXI/Mem0] -force - -# Perform GUI Layout -regenerate_bd_layout -layout_string { - "ActiveEmotionalView":"Default View", - "Default View_ScaleFactor":"1.0", - "Default View_TopLeft":"-195,-165", - "ExpandedHierarchyInLayout":"", - "guistr":"# # String gsaved with Nlview 7.0r4 2019-12-20 bk=1.5203 VDI=41 GEI=36 GUI=JA:10.0 TLS -# -string -flagsOSRD -preplace port clk_100MHz -pg 1 -lvl 0 -x 0 -y 40 -defaultsOSRD -preplace port resetn -pg 1 -lvl 0 -x 0 -y 20 -defaultsOSRD -preplace port vx_busy -pg 1 -lvl 4 -x 950 -y 220 -defaultsOSRD -preplace port vx_reset -pg 1 -lvl 0 -x 0 -y 110 -defaultsOSRD -preplace port dcr_wr_valid -pg 1 -lvl 0 -x 0 -y 130 -defaultsOSRD -preplace portBus dcr_wr_addr -pg 1 -lvl 0 -x 0 -y 150 -defaultsOSRD -preplace portBus dcr_wr_data -pg 1 -lvl 0 -x 0 -y 170 -defaultsOSRD -preplace inst Vortex_top_0 -pg 1 -lvl 1 -x 190 -y 130 -defaultsOSRD -preplace inst axi_bram_ctrl_0 -pg 1 -lvl 2 -x 520 -y 140 -defaultsOSRD -preplace inst axi_bram_ctrl_0_bram -pg 1 -lvl 3 -x 800 -y 140 -defaultsOSRD -preplace netloc Vortex_top_0_busy 1 1 3 360J 220 NJ 220 NJ -preplace netloc clk_wiz_clk_out1 1 0 2 20 30 370 -preplace netloc 
resetn_1 1 0 2 NJ 20 380J -preplace netloc vx_reset_1 1 0 1 NJ 110 -preplace netloc dcr_wr_valid_1 1 0 1 NJ 130 -preplace netloc dcr_wr_addr_1 1 0 1 NJ 150 -preplace netloc dcr_wr_data_1 1 0 1 NJ 170 -preplace netloc axi_bram_ctrl_0_BRAM_PORTB 1 2 1 N 150 -preplace netloc axi_bram_ctrl_0_BRAM_PORTA 1 2 1 N 130 -preplace netloc Vortex_top_0_m_axi_mem 1 1 1 N 120 -levelinfo -pg 1 0 190 520 800 950 -pagesize -pg 1 -db -bbox -sgen -180 0 1060 240 -" -} - - # Restore current instance - current_bd_instance $oldCurInst - - validate_bd_design - save_bd_design - close_bd_design $design_name -} -# End of cr_bd_design_1() -cr_bd_design_1 "" -set_property EXCLUDE_DEBUG_LOGIC "0" [get_files design_1.bd ] -set_property GENERATE_SYNTH_CHECKPOINT "1" [get_files design_1.bd ] -set_property IS_ENABLED "1" [get_files design_1.bd ] -set_property IS_GLOBAL_INCLUDE "0" [get_files design_1.bd ] -#set_property IS_LOCKED "0" [get_files design_1.bd ] -set_property LIBRARY "xil_defaultlib" [get_files design_1.bd ] -set_property PATH_MODE "RelativeFirst" [get_files design_1.bd ] -set_property PFM_NAME "" [get_files design_1.bd ] -set_property REGISTERED_WITH_MANAGER "1" [get_files design_1.bd ] -set_property SYNTH_CHECKPOINT_MODE "Hierarchical" [get_files design_1.bd ] -set_property USED_IN "synthesis implementation simulation" [get_files design_1.bd ] -set_property USED_IN_IMPLEMENTATION "1" [get_files design_1.bd ] -set_property USED_IN_SIMULATION "1" [get_files design_1.bd ] -set_property USED_IN_SYNTHESIS "1" [get_files design_1.bd ] - -#call make_wrapper to create wrapper files -set wrapper_path [make_wrapper -fileset sources_1 -files [ get_files -norecurse design_1.bd] -top] -add_files -norecurse -fileset sources_1 $wrapper_path - -# Create 'synth_1' run (if not found) -if {[string equal [get_runs -quiet synth_1] ""]} { - create_run -name synth_1 -part xcu280-fsvh2892-2L-e -flow {Vivado Synthesis 2020} -strategy "Vivado Synthesis Defaults" -report_strategy {No Reports} -constrset 
constrs_1 -} else { - set_property strategy "Vivado Synthesis Defaults" [get_runs synth_1] - set_property flow "Vivado Synthesis 2020" [get_runs synth_1] -} -set obj [get_runs synth_1] -set_property set_report_strategy_name 1 $obj -set_property report_strategy {Vivado Synthesis Default Reports} $obj -set_property set_report_strategy_name 0 $obj -# Create 'synth_1_synth_report_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs synth_1] synth_1_synth_report_utilization_0] "" ] } { - create_report_config -report_name synth_1_synth_report_utilization_0 -report_type report_utilization:1.0 -steps synth_design -runs synth_1 -} -set obj [get_report_configs -of_objects [get_runs synth_1] synth_1_synth_report_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Utilization - Synth Design" -objects $obj -set_property -name "options.pblocks" -value "" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.slr" -value "0" -objects $obj -set_property -name "options.packthru" -value "0" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.hierarchical_percentages" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -set obj [get_runs synth_1] -set_property -name "constrset" -value "constrs_1" -objects $obj -set_property -name "description" -value "Vivado Synthesis Defaults" -objects $obj -set_property -name "flow" -value "Vivado Synthesis 2020" -objects $obj -set_property -name "name" -value "synth_1" -objects $obj -set_property -name "needs_refresh" -value "0" -objects $obj -set_property -name "srcset" -value "sources_1" -objects $obj -set_property -name "incremental_checkpoint" -value "" -objects $obj -set_property -name 
"auto_incremental_checkpoint" -value "0" -objects $obj -set_property -name "rqs_files" -value "" -objects $obj -set_property -name "incremental_checkpoint.more_options" -value "" -objects $obj -set_property -name "include_in_archive" -value "1" -objects $obj -set_property -name "gen_full_bitstream" -value "1" -objects $obj -set_property -name "write_incremental_synth_checkpoint" -value "0" -objects $obj -set_property -name "auto_incremental_checkpoint.directory" -value "$proj_dir/project_1.srcs/utils_1/imports/synth_1" -objects $obj -set_property -name "strategy" -value "Vivado Synthesis Defaults" -objects $obj -set_property -name "steps.synth_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.synth_design.tcl.post" -value "" -objects $obj -set_property -name "steps.synth_design.args.flatten_hierarchy" -value "rebuilt" -objects $obj -set_property -name "steps.synth_design.args.gated_clock_conversion" -value "off" -objects $obj -set_property -name "steps.synth_design.args.bufg" -value "12" -objects $obj -set_property -name "steps.synth_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.synth_design.args.retiming" -value "0" -objects $obj -set_property -name "steps.synth_design.args.fsm_extraction" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.keep_equivalent_registers" -value "0" -objects $obj -set_property -name "steps.synth_design.args.resource_sharing" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.control_set_opt_threshold" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.no_lc" -value "0" -objects $obj -set_property -name "steps.synth_design.args.no_srlextract" -value "0" -objects $obj -set_property -name "steps.synth_design.args.shreg_min_size" -value "3" -objects $obj -set_property -name "steps.synth_design.args.max_bram" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.max_uram" -value "-1" -objects $obj 
-set_property -name "steps.synth_design.args.max_dsp" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.max_bram_cascade_height" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.max_uram_cascade_height" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.cascade_dsp" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.assert" -value "0" -objects $obj -set_property -name "steps.synth_design.args.more options" -value "" -objects $obj - -# Create 'synth_1_copy_1' run (if not found) -if {[string equal [get_runs -quiet synth_1_copy_1] ""]} { - create_run -name synth_1_copy_1 -part xcu280-fsvh2892-2L-e -flow {Vivado Synthesis 2020} -strategy "Vivado Synthesis Defaults" -report_strategy {No Reports} -constrset constrs_1 -} else { - set_property strategy "Vivado Synthesis Defaults" [get_runs synth_1_copy_1] - set_property flow "Vivado Synthesis 2020" [get_runs synth_1_copy_1] -} -set obj [get_runs synth_1_copy_1] -set_property set_report_strategy_name 1 $obj -set_property report_strategy {Vivado Synthesis Default Reports} $obj -set_property set_report_strategy_name 0 $obj -# Create 'synth_1_copy_1_synth_report_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs synth_1_copy_1] synth_1_copy_1_synth_report_utilization_0] "" ] } { - create_report_config -report_name synth_1_copy_1_synth_report_utilization_0 -report_type report_utilization:1.0 -steps synth_design -runs synth_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs synth_1_copy_1] synth_1_copy_1_synth_report_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Utilization - Synth Design" -objects $obj -set_property -name "options.pblocks" -value "" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.slr" -value "0" -objects $obj -set_property 
-name "options.packthru" -value "0" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.hierarchical_percentages" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -set obj [get_runs synth_1_copy_1] -set_property -name "constrset" -value "constrs_1" -objects $obj -set_property -name "description" -value "Vivado Synthesis Defaults" -objects $obj -set_property -name "flow" -value "Vivado Synthesis 2020" -objects $obj -set_property -name "name" -value "synth_1_copy_1" -objects $obj -set_property -name "needs_refresh" -value "0" -objects $obj -set_property -name "srcset" -value "sources_1" -objects $obj -set_property -name "incremental_checkpoint" -value "" -objects $obj -set_property -name "auto_incremental_checkpoint" -value "0" -objects $obj -set_property -name "rqs_files" -value "" -objects $obj -set_property -name "incremental_checkpoint.more_options" -value "" -objects $obj -set_property -name "include_in_archive" -value "1" -objects $obj -set_property -name "gen_full_bitstream" -value "1" -objects $obj -set_property -name "write_incremental_synth_checkpoint" -value "0" -objects $obj -set_property -name "auto_incremental_checkpoint.directory" -value "$proj_dir/project_1.srcs/utils_1/imports/synth_1" -objects $obj -set_property -name "strategy" -value "Vivado Synthesis Defaults" -objects $obj -set_property -name "steps.synth_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.synth_design.tcl.post" -value "" -objects $obj -set_property -name "steps.synth_design.args.flatten_hierarchy" -value "rebuilt" -objects $obj -set_property -name "steps.synth_design.args.gated_clock_conversion" -value "off" -objects $obj -set_property -name "steps.synth_design.args.bufg" -value "12" -objects $obj -set_property -name "steps.synth_design.args.directive" -value "Default" -objects $obj 
-set_property -name "steps.synth_design.args.retiming" -value "0" -objects $obj -set_property -name "steps.synth_design.args.fsm_extraction" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.keep_equivalent_registers" -value "0" -objects $obj -set_property -name "steps.synth_design.args.resource_sharing" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.control_set_opt_threshold" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.no_lc" -value "0" -objects $obj -set_property -name "steps.synth_design.args.no_srlextract" -value "0" -objects $obj -set_property -name "steps.synth_design.args.shreg_min_size" -value "3" -objects $obj -set_property -name "steps.synth_design.args.max_bram" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.max_uram" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.max_dsp" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.max_bram_cascade_height" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.max_uram_cascade_height" -value "-1" -objects $obj -set_property -name "steps.synth_design.args.cascade_dsp" -value "auto" -objects $obj -set_property -name "steps.synth_design.args.assert" -value "0" -objects $obj -set_property -name "steps.synth_design.args.more options" -value "" -objects $obj - -# set the current synth run -current_run -synthesis [get_runs synth_1] - -# preserve signal names -set_property STEPS.SYNTH_DESIGN.ARGS.FLATTEN_HIERARCHY none [get_runs synth_1] - -# Create 'impl_1' run (if not found) -if {[string equal [get_runs -quiet impl_1] ""]} { - create_run -name impl_1 -part xcu280-fsvh2892-2L-e -flow {Vivado Implementation 2020} -strategy "Vivado Implementation Defaults" -report_strategy {No Reports} -constrset constrs_1 -parent_run synth_1 -} else { - set_property strategy "Vivado Implementation Defaults" [get_runs impl_1] - set_property flow "Vivado Implementation 2020" 
[get_runs impl_1] -} -set obj [get_runs impl_1] -set_property set_report_strategy_name 1 $obj -set_property report_strategy {Vivado Implementation Default Reports} $obj -set_property set_report_strategy_name 0 $obj -# Create 'impl_1_init_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_init_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_init_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps init_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_init_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Design Initialization" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_opt_report_drc_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_opt_report_drc_0] "" ] } { - create_report_config -report_name impl_1_opt_report_drc_0 
-report_type report_drc:1.0 -steps opt_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_opt_report_drc_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "DRC - Opt Design" -objects $obj -set_property -name "options.upgrade_cw" -value "0" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.ruledecks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps opt_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name 
"options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_power_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_power_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_power_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps power_opt_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_power_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Power Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_place_report_io_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_io_0] "" ] } { - create_report_config -report_name impl_1_place_report_io_0 -report_type report_io:1.0 -steps place_design -runs impl_1 -} -set obj [get_report_configs 
-of_objects [get_runs impl_1] impl_1_place_report_io_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "IO - Place Design" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_place_report_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_utilization_0] "" ] } { - create_report_config -report_name impl_1_place_report_utilization_0 -report_type report_utilization:1.0 -steps place_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Utilization - Place Design" -objects $obj -set_property -name "options.pblocks" -value "" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.slr" -value "0" -objects $obj -set_property -name "options.packthru" -value "0" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.hierarchical_percentages" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_place_report_control_sets_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_control_sets_0] "" ] } { - create_report_config -report_name impl_1_place_report_control_sets_0 -report_type report_control_sets:1.0 -steps place_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_control_sets_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Control Sets - Place Design" -objects $obj 
-set_property -name "options.verbose" -value "1" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_place_report_incremental_reuse_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_incremental_reuse_0] "" ] } { - create_report_config -report_name impl_1_place_report_incremental_reuse_0 -report_type report_incremental_reuse:1.0 -steps place_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_incremental_reuse_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Place Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_place_report_incremental_reuse_1' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_incremental_reuse_1] "" ] } { - create_report_config -report_name impl_1_place_report_incremental_reuse_1 -report_type report_incremental_reuse:1.0 -steps place_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_incremental_reuse_1] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Place Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 
'impl_1_place_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_place_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps place_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_place_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Place Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_post_place_power_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_post_place_power_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_post_place_power_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps post_place_power_opt_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] 
impl_1_post_place_power_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Place Power Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_phys_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_phys_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_phys_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps phys_opt_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_phys_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Place Phys Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" 
-objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_route_report_drc_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_drc_0] "" ] } { - create_report_config -report_name impl_1_route_report_drc_0 -report_type report_drc:1.0 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_drc_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "DRC - Route Design" -objects $obj -set_property -name "options.upgrade_cw" -value "0" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.ruledecks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_route_report_methodology_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_methodology_0] "" ] } { - create_report_config -report_name impl_1_route_report_methodology_0 -report_type report_methodology:1.0 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_methodology_0] -if { $obj != "" } { 
-set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Methodology - Route Design" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_route_report_power_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_power_0] "" ] } { - create_report_config -report_name impl_1_route_report_power_0 -report_type report_power:1.0 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_power_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Power - Route Design" -objects $obj -set_property -name "options.advisory" -value "0" -objects $obj -set_property -name "options.xpe" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_route_report_route_status_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_route_status_0] "" ] } { - create_report_config -report_name impl_1_route_report_route_status_0 -report_type report_route_status:1.0 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_route_status_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Route Status - Route Design" -objects $obj -set_property -name "options.of_objects" -value "" -objects $obj -set_property -name "options.route_type" -value "" -objects $obj -set_property -name "options.list_all_nets" -value "0" -objects $obj -set_property -name "options.show_all" -value "0" -objects $obj -set_property -name "options.has_routing" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects 
$obj - -} -# Create 'impl_1_route_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_route_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Timing Summary - Route Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_route_report_incremental_reuse_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_incremental_reuse_0] "" ] } { - create_report_config -report_name impl_1_route_report_incremental_reuse_0 -report_type report_incremental_reuse:1.0 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] 
impl_1_route_report_incremental_reuse_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Route Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_route_report_clock_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_clock_utilization_0] "" ] } { - create_report_config -report_name impl_1_route_report_clock_utilization_0 -report_type report_clock_utilization:1.0 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_clock_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Clock Utilization - Route Design" -objects $obj -set_property -name "options.write_xdc" -value "0" -objects $obj -set_property -name "options.clock_roots_only" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_route_report_bus_skew_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_bus_skew_0] "" ] } { - create_report_config -report_name impl_1_route_report_bus_skew_0 -report_type report_bus_skew:1.1 -steps route_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_route_report_bus_skew_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Bus Skew - Route Design" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects 
$obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.slack_greater_than" -value "" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_post_route_phys_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_post_route_phys_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_post_route_phys_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps post_route_phys_opt_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_post_route_phys_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Route Phys Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name 
"options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_post_route_phys_opt_report_bus_skew_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1] impl_1_post_route_phys_opt_report_bus_skew_0] "" ] } { - create_report_config -report_name impl_1_post_route_phys_opt_report_bus_skew_0 -report_type report_bus_skew:1.1 -steps post_route_phys_opt_design -runs impl_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1] impl_1_post_route_phys_opt_report_bus_skew_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Bus Skew - Post-Route Phys Opt Design" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.slack_greater_than" -value "" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -set obj [get_runs impl_1] -set_property -name "constrset" -value "constrs_1" -objects $obj -set_property -name "description" -value "Default settings for Implementation." 
-objects $obj -set_property -name "flow" -value "Vivado Implementation 2020" -objects $obj -set_property -name "name" -value "impl_1" -objects $obj -set_property -name "needs_refresh" -value "0" -objects $obj -set_property -name "pr_configuration" -value "" -objects $obj -set_property -name "srcset" -value "sources_1" -objects $obj -set_property -name "incremental_checkpoint" -value "" -objects $obj -set_property -name "auto_incremental_checkpoint" -value "0" -objects $obj -set_property -name "rqs_files" -value "" -objects $obj -set_property -name "incremental_checkpoint.more_options" -value "" -objects $obj -set_property -name "include_in_archive" -value "1" -objects $obj -set_property -name "gen_full_bitstream" -value "1" -objects $obj -set_property -name "auto_incremental_checkpoint.directory" -value "$proj_dir/project_1.srcs/utils_1/imports/impl_1" -objects $obj -set_property -name "strategy" -value "Vivado Implementation Defaults" -objects $obj -set_property -name "steps.init_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.init_design.tcl.post" -value "" -objects $obj -set_property -name "steps.opt_design.is_enabled" -value "1" -objects $obj -set_property -name "steps.opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.opt_design.args.verbose" -value "0" -objects $obj -set_property -name "steps.opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.power_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.power_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.power_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.power_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.place_design.tcl.pre" -value "" -objects $obj -set_property -name 
"steps.place_design.tcl.post" -value "" -objects $obj -set_property -name "steps.place_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.place_design.args.more options" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.post_place_power_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.phys_opt_design.is_enabled" -value "1" -objects $obj -set_property -name "steps.phys_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.phys_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.phys_opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.phys_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.route_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.route_design.tcl.post" -value "" -objects $obj -set_property -name "steps.route_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.route_design.args.more options" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.post_route_phys_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.post_route_phys_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.write_bitstream.tcl.pre" -value "" -objects $obj -set_property -name "steps.write_bitstream.tcl.post" -value "" -objects $obj -set_property -name "steps.write_bitstream.args.raw_bitfile" 
-value "0" -objects $obj -set_property -name "steps.write_bitstream.args.mask_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.no_binary_bitfile" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.bin_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.readback_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.logic_location_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.verbose" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.more options" -value "" -objects $obj - -# Create 'impl_1_copy_1' run (if not found) -if {[string equal [get_runs -quiet impl_1_copy_1] ""]} { - create_run -name impl_1_copy_1 -part xcu280-fsvh2892-2L-e -flow {Vivado Implementation 2020} -strategy "Vivado Implementation Defaults" -report_strategy {No Reports} -constrset constrs_1 -parent_run synth_1 -} else { - set_property strategy "Vivado Implementation Defaults" [get_runs impl_1_copy_1] - set_property flow "Vivado Implementation 2020" [get_runs impl_1_copy_1] -} -set obj [get_runs impl_1_copy_1] -set_property set_report_strategy_name 1 $obj -set_property report_strategy {Vivado Implementation Default Reports} $obj -set_property set_report_strategy_name 0 $obj -# Create 'impl_1_copy_1_init_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_init_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_1_init_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps init_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_init_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Design Initialization" -objects $obj -set_property -name 
"options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_opt_report_drc_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_opt_report_drc_0] "" ] } { - create_report_config -report_name impl_1_copy_1_opt_report_drc_0 -report_type report_drc:1.0 -steps opt_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_opt_report_drc_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "DRC - Opt Design" -objects $obj -set_property -name "options.upgrade_cw" -value "0" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.ruledecks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_opt_report_timing_summary_0] "" ] } { - create_report_config 
-report_name impl_1_copy_1_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps opt_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_power_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_power_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_1_power_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps power_opt_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_power_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - 
Power Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_place_report_io_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_io_0] "" ] } { - create_report_config -report_name impl_1_copy_1_place_report_io_0 -report_type report_io:1.0 -steps place_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_io_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "IO - Place Design" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_place_report_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_utilization_0] "" ] } { - create_report_config -report_name impl_1_copy_1_place_report_utilization_0 -report_type report_utilization:1.0 -steps place_design -runs impl_1_copy_1 -} -set obj 
[get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Utilization - Place Design" -objects $obj -set_property -name "options.pblocks" -value "" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.slr" -value "0" -objects $obj -set_property -name "options.packthru" -value "0" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.hierarchical_percentages" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_place_report_control_sets_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_control_sets_0] "" ] } { - create_report_config -report_name impl_1_copy_1_place_report_control_sets_0 -report_type report_control_sets:1.0 -steps place_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_control_sets_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Control Sets - Place Design" -objects $obj -set_property -name "options.verbose" -value "1" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_place_report_incremental_reuse_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_incremental_reuse_0] "" ] } { - create_report_config -report_name impl_1_copy_1_place_report_incremental_reuse_0 -report_type report_incremental_reuse:1.0 -steps place_design 
-runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_incremental_reuse_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Place Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_place_report_incremental_reuse_1' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_incremental_reuse_1] "" ] } { - create_report_config -report_name impl_1_copy_1_place_report_incremental_reuse_1 -report_type report_incremental_reuse:1.0 -steps place_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_incremental_reuse_1] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Place Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_place_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_place_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_1_place_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps place_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] 
impl_1_copy_1_place_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Place Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_post_place_power_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_post_place_power_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_1_post_place_power_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps post_place_power_opt_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_post_place_power_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Place Power Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name 
"options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_phys_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_phys_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_1_phys_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps phys_opt_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_phys_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Place Phys Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property 
-name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_drc_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_drc_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_drc_0 -report_type report_drc:1.0 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_drc_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "DRC - Route Design" -objects $obj -set_property -name "options.upgrade_cw" -value "0" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.ruledecks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_methodology_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_methodology_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_methodology_0 -report_type report_methodology:1.0 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_methodology_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Methodology - Route Design" -objects $obj -set_property -name "options.checks" -value 
"" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_power_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_power_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_power_0 -report_type report_power:1.0 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_power_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Power - Route Design" -objects $obj -set_property -name "options.advisory" -value "0" -objects $obj -set_property -name "options.xpe" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_route_status_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_route_status_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_route_status_0 -report_type report_route_status:1.0 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_route_status_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Route Status - Route Design" -objects $obj -set_property -name "options.of_objects" -value "" -objects $obj -set_property -name "options.route_type" -value "" -objects $obj -set_property -name "options.list_all_nets" -value "0" -objects $obj -set_property -name "options.show_all" -value "0" -objects $obj -set_property -name "options.has_routing" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_timing_summary_0' report (if not found) 
-if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Timing Summary - Route Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_incremental_reuse_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_incremental_reuse_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_incremental_reuse_0 -report_type report_incremental_reuse:1.0 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] 
impl_1_copy_1_route_report_incremental_reuse_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Route Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_clock_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_clock_utilization_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_clock_utilization_0 -report_type report_clock_utilization:1.0 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_clock_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Clock Utilization - Route Design" -objects $obj -set_property -name "options.write_xdc" -value "0" -objects $obj -set_property -name "options.clock_roots_only" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_route_report_bus_skew_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_bus_skew_0] "" ] } { - create_report_config -report_name impl_1_copy_1_route_report_bus_skew_0 -report_type report_bus_skew:1.1 -steps route_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_route_report_bus_skew_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Bus Skew - Route Design" -objects $obj -set_property 
-name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.slack_greater_than" -value "" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_post_route_phys_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_post_route_phys_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_1_post_route_phys_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps post_route_phys_opt_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_post_route_phys_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Route Phys Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property 
-name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_1_post_route_phys_opt_report_bus_skew_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_post_route_phys_opt_report_bus_skew_0] "" ] } { - create_report_config -report_name impl_1_copy_1_post_route_phys_opt_report_bus_skew_0 -report_type report_bus_skew:1.1 -steps post_route_phys_opt_design -runs impl_1_copy_1 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_1] impl_1_copy_1_post_route_phys_opt_report_bus_skew_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Bus Skew - Post-Route Phys Opt Design" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.slack_greater_than" -value "" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -set obj [get_runs 
impl_1_copy_1] -set_property -name "constrset" -value "constrs_1" -objects $obj -set_property -name "description" -value "Default settings for Implementation." -objects $obj -set_property -name "flow" -value "Vivado Implementation 2020" -objects $obj -set_property -name "name" -value "impl_1_copy_1" -objects $obj -set_property -name "needs_refresh" -value "0" -objects $obj -set_property -name "pr_configuration" -value "" -objects $obj -set_property -name "srcset" -value "sources_1" -objects $obj -set_property -name "incremental_checkpoint" -value "" -objects $obj -set_property -name "auto_incremental_checkpoint" -value "0" -objects $obj -set_property -name "rqs_files" -value "" -objects $obj -set_property -name "incremental_checkpoint.more_options" -value "" -objects $obj -set_property -name "include_in_archive" -value "1" -objects $obj -set_property -name "gen_full_bitstream" -value "1" -objects $obj -set_property -name "auto_incremental_checkpoint.directory" -value "$proj_dir/project_1.srcs/utils_1/imports/impl_1" -objects $obj -set_property -name "strategy" -value "Vivado Implementation Defaults" -objects $obj -set_property -name "steps.init_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.init_design.tcl.post" -value "" -objects $obj -set_property -name "steps.opt_design.is_enabled" -value "1" -objects $obj -set_property -name "steps.opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.opt_design.args.verbose" -value "0" -objects $obj -set_property -name "steps.opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.power_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.power_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.power_opt_design.tcl.post" -value "" -objects $obj -set_property -name 
"steps.power_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.place_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.place_design.tcl.post" -value "" -objects $obj -set_property -name "steps.place_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.place_design.args.more options" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.post_place_power_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.phys_opt_design.is_enabled" -value "1" -objects $obj -set_property -name "steps.phys_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.phys_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.phys_opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.phys_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.route_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.route_design.tcl.post" -value "" -objects $obj -set_property -name "steps.route_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.route_design.args.more options" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.post_route_phys_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.post_route_phys_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.write_bitstream.tcl.pre" -value 
"" -objects $obj -set_property -name "steps.write_bitstream.tcl.post" -value "" -objects $obj -set_property -name "steps.write_bitstream.args.raw_bitfile" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.mask_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.no_binary_bitfile" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.bin_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.readback_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.logic_location_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.verbose" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.more options" -value "" -objects $obj - -# Create 'impl_1_copy_2' run (if not found) -if {[string equal [get_runs -quiet impl_1_copy_2] ""]} { - create_run -name impl_1_copy_2 -part xcu280-fsvh2892-2L-e -flow {Vivado Implementation 2020} -strategy "Vivado Implementation Defaults" -report_strategy {No Reports} -constrset constrs_1 -parent_run synth_1 -} else { - set_property strategy "Vivado Implementation Defaults" [get_runs impl_1_copy_2] - set_property flow "Vivado Implementation 2020" [get_runs impl_1_copy_2] -} -set obj [get_runs impl_1_copy_2] -set_property set_report_strategy_name 1 $obj -set_property report_strategy {Vivado Implementation Default Reports} $obj -set_property set_report_strategy_name 0 $obj -# Create 'impl_1_copy_2_init_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_init_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_2_init_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps init_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_init_report_timing_summary_0] -if { $obj != "" } { -set_property -name 
"is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Design Initialization" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_opt_report_drc_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_opt_report_drc_0] "" ] } { - create_report_config -report_name impl_1_copy_2_opt_report_drc_0 -report_type report_drc:1.0 -steps opt_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_opt_report_drc_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "DRC - Opt Design" -objects $obj -set_property -name "options.upgrade_cw" -value "0" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.ruledecks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_opt_report_timing_summary_0' report (if not found) -if { [ 
string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_2_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps opt_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_power_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_power_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_2_power_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps power_opt_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] 
impl_1_copy_2_power_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Power Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_place_report_io_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_io_0] "" ] } { - create_report_config -report_name impl_1_copy_2_place_report_io_0 -report_type report_io:1.0 -steps place_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_io_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "IO - Place Design" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_place_report_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] 
impl_1_copy_2_place_report_utilization_0] "" ] } { - create_report_config -report_name impl_1_copy_2_place_report_utilization_0 -report_type report_utilization:1.0 -steps place_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Utilization - Place Design" -objects $obj -set_property -name "options.pblocks" -value "" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.slr" -value "0" -objects $obj -set_property -name "options.packthru" -value "0" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.hierarchical_percentages" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_place_report_control_sets_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_control_sets_0] "" ] } { - create_report_config -report_name impl_1_copy_2_place_report_control_sets_0 -report_type report_control_sets:1.0 -steps place_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_control_sets_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Control Sets - Place Design" -objects $obj -set_property -name "options.verbose" -value "1" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_place_report_incremental_reuse_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs 
impl_1_copy_2] impl_1_copy_2_place_report_incremental_reuse_0] "" ] } { - create_report_config -report_name impl_1_copy_2_place_report_incremental_reuse_0 -report_type report_incremental_reuse:1.0 -steps place_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_incremental_reuse_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Place Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_place_report_incremental_reuse_1' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_incremental_reuse_1] "" ] } { - create_report_config -report_name impl_1_copy_2_place_report_incremental_reuse_1 -report_type report_incremental_reuse:1.0 -steps place_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_incremental_reuse_1] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Place Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_place_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_timing_summary_0] "" ] } { - create_report_config -report_name 
impl_1_copy_2_place_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps place_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_place_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Place Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_post_place_power_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_post_place_power_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_2_post_place_power_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps post_place_power_opt_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_post_place_power_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property 
-name "display_name" -value "Timing Summary - Post-Place Power Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_phys_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_phys_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_2_phys_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps phys_opt_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_phys_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "0" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Place Phys Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name 
"options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_drc_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_drc_0] "" ] } { - create_report_config -report_name impl_1_copy_2_route_report_drc_0 -report_type report_drc:1.0 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_drc_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "DRC - Route Design" -objects $obj -set_property -name "options.upgrade_cw" -value "0" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.ruledecks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_methodology_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_methodology_0] "" ] } { - create_report_config -report_name impl_1_copy_2_route_report_methodology_0 -report_type report_methodology:1.0 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_methodology_0] -if { $obj != "" } { 
-set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Methodology - Route Design" -objects $obj -set_property -name "options.checks" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_power_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_power_0] "" ] } { - create_report_config -report_name impl_1_copy_2_route_report_power_0 -report_type report_power:1.0 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_power_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Power - Route Design" -objects $obj -set_property -name "options.advisory" -value "0" -objects $obj -set_property -name "options.xpe" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_route_status_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_route_status_0] "" ] } { - create_report_config -report_name impl_1_copy_2_route_report_route_status_0 -report_type report_route_status:1.0 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_route_status_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Route Status - Route Design" -objects $obj -set_property -name "options.of_objects" -value "" -objects $obj -set_property -name "options.route_type" -value "" -objects $obj -set_property -name "options.list_all_nets" -value "0" -objects $obj -set_property -name "options.show_all" -value "0" -objects $obj -set_property -name 
"options.has_routing" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_2_route_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Timing Summary - Route Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "0" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_incremental_reuse_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_incremental_reuse_0] "" ] } { - create_report_config -report_name 
impl_1_copy_2_route_report_incremental_reuse_0 -report_type report_incremental_reuse:1.0 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_incremental_reuse_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Incremental Reuse - Route Design" -objects $obj -set_property -name "options.cells" -value "" -objects $obj -set_property -name "options.hierarchical" -value "0" -objects $obj -set_property -name "options.hierarchical_depth" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_clock_utilization_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_clock_utilization_0] "" ] } { - create_report_config -report_name impl_1_copy_2_route_report_clock_utilization_0 -report_type report_clock_utilization:1.0 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_clock_utilization_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Clock Utilization - Route Design" -objects $obj -set_property -name "options.write_xdc" -value "0" -objects $obj -set_property -name "options.clock_roots_only" -value "0" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_route_report_bus_skew_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_route_report_bus_skew_0] "" ] } { - create_report_config -report_name impl_1_copy_2_route_report_bus_skew_0 -report_type report_bus_skew:1.1 -steps route_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] 
impl_1_copy_2_route_report_bus_skew_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Bus Skew - Route Design" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.slack_greater_than" -value "" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_post_route_phys_opt_report_timing_summary_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_post_route_phys_opt_report_timing_summary_0] "" ] } { - create_report_config -report_name impl_1_copy_2_post_route_phys_opt_report_timing_summary_0 -report_type report_timing_summary:1.0 -steps post_route_phys_opt_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_post_route_phys_opt_report_timing_summary_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Timing Summary - Post-Route Phys Opt Design" -objects $obj -set_property -name "options.check_timing_verbose" -value "0" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects 
$obj -set_property -name "options.max_paths" -value "10" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.report_unconstrained" -value "0" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.significant_digits" -value "" -objects $obj -set_property -name "options.cell" -value "" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -# Create 'impl_1_copy_2_post_route_phys_opt_report_bus_skew_0' report (if not found) -if { [ string equal [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_post_route_phys_opt_report_bus_skew_0] "" ] } { - create_report_config -report_name impl_1_copy_2_post_route_phys_opt_report_bus_skew_0 -report_type report_bus_skew:1.1 -steps post_route_phys_opt_design -runs impl_1_copy_2 -} -set obj [get_report_configs -of_objects [get_runs impl_1_copy_2] impl_1_copy_2_post_route_phys_opt_report_bus_skew_0] -if { $obj != "" } { -set_property -name "is_enabled" -value "1" -objects $obj -set_property -name "display_name" -value "Bus Skew - Post-Route Phys Opt Design" -objects $obj -set_property -name "options.delay_type" -value "" -objects $obj -set_property -name "options.setup" -value "0" -objects $obj -set_property -name "options.hold" -value "0" -objects $obj -set_property -name "options.max_paths" -value "" -objects $obj -set_property -name "options.nworst" -value "" -objects $obj -set_property -name "options.unique_pins" -value "0" -objects $obj -set_property -name "options.path_type" -value "" -objects $obj -set_property -name "options.slack_lesser_than" -value "" -objects $obj -set_property -name "options.slack_greater_than" -value "" -objects $obj -set_property -name 
"options.significant_digits" -value "" -objects $obj -set_property -name "options.warn_on_violation" -value "1" -objects $obj -set_property -name "options.more_options" -value "" -objects $obj - -} -set obj [get_runs impl_1_copy_2] -set_property -name "constrset" -value "constrs_1" -objects $obj -set_property -name "description" -value "Default settings for Implementation." -objects $obj -set_property -name "flow" -value "Vivado Implementation 2020" -objects $obj -set_property -name "name" -value "impl_1_copy_2" -objects $obj -set_property -name "needs_refresh" -value "0" -objects $obj -set_property -name "pr_configuration" -value "" -objects $obj -set_property -name "srcset" -value "sources_1" -objects $obj -set_property -name "incremental_checkpoint" -value "" -objects $obj -set_property -name "auto_incremental_checkpoint" -value "0" -objects $obj -set_property -name "rqs_files" -value "" -objects $obj -set_property -name "incremental_checkpoint.more_options" -value "" -objects $obj -set_property -name "include_in_archive" -value "1" -objects $obj -set_property -name "gen_full_bitstream" -value "1" -objects $obj -set_property -name "auto_incremental_checkpoint.directory" -value "$proj_dir/project_1.srcs/utils_1/imports/impl_1" -objects $obj -set_property -name "strategy" -value "Vivado Implementation Defaults" -objects $obj -set_property -name "steps.init_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.init_design.tcl.post" -value "" -objects $obj -set_property -name "steps.opt_design.is_enabled" -value "1" -objects $obj -set_property -name "steps.opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.opt_design.args.verbose" -value "0" -objects $obj -set_property -name "steps.opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.opt_design.args.more options" -value "" -objects $obj -set_property -name 
"steps.power_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.power_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.power_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.power_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.place_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.place_design.tcl.post" -value "" -objects $obj -set_property -name "steps.place_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.place_design.args.more options" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.post_place_power_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.post_place_power_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.phys_opt_design.is_enabled" -value "1" -objects $obj -set_property -name "steps.phys_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.phys_opt_design.tcl.post" -value "" -objects $obj -set_property -name "steps.phys_opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.phys_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.route_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.route_design.tcl.post" -value "" -objects $obj -set_property -name "steps.route_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.route_design.args.more options" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.is_enabled" -value "0" -objects $obj -set_property -name "steps.post_route_phys_opt_design.tcl.pre" -value "" -objects $obj -set_property -name "steps.post_route_phys_opt_design.tcl.post" -value "" -objects $obj -set_property -name 
"steps.post_route_phys_opt_design.args.directive" -value "Default" -objects $obj -set_property -name "steps.post_route_phys_opt_design.args.more options" -value "" -objects $obj -set_property -name "steps.write_bitstream.tcl.pre" -value "" -objects $obj -set_property -name "steps.write_bitstream.tcl.post" -value "" -objects $obj -set_property -name "steps.write_bitstream.args.raw_bitfile" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.mask_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.no_binary_bitfile" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.bin_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.readback_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.logic_location_file" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.verbose" -value "0" -objects $obj -set_property -name "steps.write_bitstream.args.more options" -value "" -objects $obj - -# set the current impl run -current_run -implementation [get_runs impl_1] - -puts "INFO: Project created:${project_name}" -# Create 'drc_1' gadget (if not found) -if {[string equal [get_dashboard_gadgets [ list "drc_1" ] ] ""]} { -create_dashboard_gadget -name {drc_1} -type drc -} -set obj [get_dashboard_gadgets [ list "drc_1" ] ] -set_property -name "active_reports" -value "" -objects $obj -set_property -name "active_reports_invalid" -value "" -objects $obj -set_property -name "active_run" -value "0" -objects $obj -set_property -name "hide_unused_data" -value "1" -objects $obj -set_property -name "incl_new_reports" -value "0" -objects $obj -set_property -name "reports" -value "impl_1#impl_1_route_report_drc_0" -objects $obj -set_property -name "run.step" -value "route_design" -objects $obj -set_property -name "run.type" -value "implementation" -objects $obj -set_property -name "statistics.critical_warning" -value "1" -objects $obj -set_property -name 
"statistics.error" -value "1" -objects $obj -set_property -name "statistics.info" -value "1" -objects $obj -set_property -name "statistics.warning" -value "1" -objects $obj -set_property -name "view.orientation" -value "Horizontal" -objects $obj -set_property -name "view.type" -value "Graph" -objects $obj - -# Create 'methodology_1' gadget (if not found) -if {[string equal [get_dashboard_gadgets [ list "methodology_1" ] ] ""]} { -create_dashboard_gadget -name {methodology_1} -type methodology -} -set obj [get_dashboard_gadgets [ list "methodology_1" ] ] -set_property -name "active_reports" -value "" -objects $obj -set_property -name "active_reports_invalid" -value "" -objects $obj -set_property -name "active_run" -value "0" -objects $obj -set_property -name "hide_unused_data" -value "1" -objects $obj -set_property -name "incl_new_reports" -value "0" -objects $obj -set_property -name "reports" -value "impl_1#impl_1_route_report_methodology_0" -objects $obj -set_property -name "run.step" -value "route_design" -objects $obj -set_property -name "run.type" -value "implementation" -objects $obj -set_property -name "statistics.critical_warning" -value "1" -objects $obj -set_property -name "statistics.error" -value "1" -objects $obj -set_property -name "statistics.info" -value "1" -objects $obj -set_property -name "statistics.warning" -value "1" -objects $obj -set_property -name "view.orientation" -value "Horizontal" -objects $obj -set_property -name "view.type" -value "Graph" -objects $obj - -# Create 'power_1' gadget (if not found) -if {[string equal [get_dashboard_gadgets [ list "power_1" ] ] ""]} { -create_dashboard_gadget -name {power_1} -type power -} -set obj [get_dashboard_gadgets [ list "power_1" ] ] -set_property -name "active_reports" -value "" -objects $obj -set_property -name "active_reports_invalid" -value "" -objects $obj -set_property -name "active_run" -value "0" -objects $obj -set_property -name "hide_unused_data" -value "1" -objects $obj -set_property 
-name "incl_new_reports" -value "0" -objects $obj -set_property -name "reports" -value "impl_1#impl_1_route_report_power_0" -objects $obj -set_property -name "run.step" -value "route_design" -objects $obj -set_property -name "run.type" -value "implementation" -objects $obj -set_property -name "statistics.bram" -value "1" -objects $obj -set_property -name "statistics.clocks" -value "1" -objects $obj -set_property -name "statistics.dsp" -value "1" -objects $obj -set_property -name "statistics.gth" -value "1" -objects $obj -set_property -name "statistics.gtp" -value "1" -objects $obj -set_property -name "statistics.gtx" -value "1" -objects $obj -set_property -name "statistics.gtz" -value "1" -objects $obj -set_property -name "statistics.io" -value "1" -objects $obj -set_property -name "statistics.logic" -value "1" -objects $obj -set_property -name "statistics.mmcm" -value "1" -objects $obj -set_property -name "statistics.pcie" -value "1" -objects $obj -set_property -name "statistics.phaser" -value "1" -objects $obj -set_property -name "statistics.pll" -value "1" -objects $obj -set_property -name "statistics.pl_static" -value "1" -objects $obj -set_property -name "statistics.ps7" -value "1" -objects $obj -set_property -name "statistics.ps" -value "1" -objects $obj -set_property -name "statistics.ps_static" -value "1" -objects $obj -set_property -name "statistics.signals" -value "1" -objects $obj -set_property -name "statistics.total_power" -value "1" -objects $obj -set_property -name "statistics.transceiver" -value "1" -objects $obj -set_property -name "statistics.xadc" -value "1" -objects $obj -set_property -name "view.orientation" -value "Horizontal" -objects $obj -set_property -name "view.type" -value "Graph" -objects $obj - -# Create 'timing_1' gadget (if not found) -if {[string equal [get_dashboard_gadgets [ list "timing_1" ] ] ""]} { -create_dashboard_gadget -name {timing_1} -type timing -} -set obj [get_dashboard_gadgets [ list "timing_1" ] ] -set_property -name 
"active_reports" -value "" -objects $obj -set_property -name "active_reports_invalid" -value "" -objects $obj -set_property -name "active_run" -value "0" -objects $obj -set_property -name "hide_unused_data" -value "1" -objects $obj -set_property -name "incl_new_reports" -value "0" -objects $obj -set_property -name "reports" -value "impl_1#impl_1_route_report_timing_summary_0" -objects $obj -set_property -name "run.step" -value "route_design" -objects $obj -set_property -name "run.type" -value "implementation" -objects $obj -set_property -name "statistics.ths" -value "1" -objects $obj -set_property -name "statistics.tns" -value "1" -objects $obj -set_property -name "statistics.tpws" -value "1" -objects $obj -set_property -name "statistics.whs" -value "1" -objects $obj -set_property -name "statistics.wns" -value "1" -objects $obj -set_property -name "view.orientation" -value "Horizontal" -objects $obj -set_property -name "view.type" -value "Table" -objects $obj - -# Create 'utilization_1' gadget (if not found) -if {[string equal [get_dashboard_gadgets [ list "utilization_1" ] ] ""]} { -create_dashboard_gadget -name {utilization_1} -type utilization -} -set obj [get_dashboard_gadgets [ list "utilization_1" ] ] -set_property -name "active_reports" -value "" -objects $obj -set_property -name "active_reports_invalid" -value "" -objects $obj -set_property -name "active_run" -value "0" -objects $obj -set_property -name "hide_unused_data" -value "1" -objects $obj -set_property -name "incl_new_reports" -value "0" -objects $obj -set_property -name "reports" -value "synth_1#synth_1_synth_report_utilization_0" -objects $obj -set_property -name "run.step" -value "synth_design" -objects $obj -set_property -name "run.type" -value "synthesis" -objects $obj -set_property -name "statistics.bram" -value "1" -objects $obj -set_property -name "statistics.bufg" -value "1" -objects $obj -set_property -name "statistics.dsp" -value "1" -objects $obj -set_property -name "statistics.ff" 
-value "1" -objects $obj -set_property -name "statistics.gt" -value "1" -objects $obj -set_property -name "statistics.io" -value "1" -objects $obj -set_property -name "statistics.lut" -value "1" -objects $obj -set_property -name "statistics.lutram" -value "1" -objects $obj -set_property -name "statistics.mmcm" -value "1" -objects $obj -set_property -name "statistics.pcie" -value "1" -objects $obj -set_property -name "statistics.pll" -value "1" -objects $obj -set_property -name "statistics.uram" -value "1" -objects $obj -set_property -name "view.orientation" -value "Horizontal" -objects $obj -set_property -name "view.type" -value "Graph" -objects $obj - -# Create 'utilization_2' gadget (if not found) -if {[string equal [get_dashboard_gadgets [ list "utilization_2" ] ] ""]} { -create_dashboard_gadget -name {utilization_2} -type utilization -} -set obj [get_dashboard_gadgets [ list "utilization_2" ] ] -set_property -name "active_reports" -value "" -objects $obj -set_property -name "active_reports_invalid" -value "" -objects $obj -set_property -name "active_run" -value "0" -objects $obj -set_property -name "hide_unused_data" -value "1" -objects $obj -set_property -name "incl_new_reports" -value "0" -objects $obj -set_property -name "reports" -value "impl_1#impl_1_place_report_utilization_0" -objects $obj -set_property -name "run.step" -value "place_design" -objects $obj -set_property -name "run.type" -value "implementation" -objects $obj -set_property -name "statistics.bram" -value "1" -objects $obj -set_property -name "statistics.bufg" -value "1" -objects $obj -set_property -name "statistics.dsp" -value "1" -objects $obj -set_property -name "statistics.ff" -value "1" -objects $obj -set_property -name "statistics.gt" -value "1" -objects $obj -set_property -name "statistics.io" -value "1" -objects $obj -set_property -name "statistics.lut" -value "1" -objects $obj -set_property -name "statistics.lutram" -value "1" -objects $obj -set_property -name "statistics.mmcm" 
-value "1" -objects $obj -set_property -name "statistics.pcie" -value "1" -objects $obj -set_property -name "statistics.pll" -value "1" -objects $obj -set_property -name "statistics.uram" -value "1" -objects $obj -set_property -name "view.orientation" -value "Horizontal" -objects $obj -set_property -name "view.type" -value "Graph" -objects $obj - -move_dashboard_gadget -name {utilization_1} -row 0 -col 0 -move_dashboard_gadget -name {power_1} -row 1 -col 0 -move_dashboard_gadget -name {drc_1} -row 2 -col 0 -move_dashboard_gadget -name {timing_1} -row 0 -col 1 -move_dashboard_gadget -name {utilization_2} -row 1 -col 1 -move_dashboard_gadget -name {methodology_1} -row 2 -col 1 diff --git a/hw/syn/xilinx/test/project_1_files/kernel.bin.coe b/hw/syn/xilinx/test/project_1_files/kernel.bin.coe deleted file mode 100644 index a316d82b5..000000000 --- a/hw/syn/xilinx/test/project_1_files/kernel.bin.coe +++ /dev/null @@ -1,16386 +0,0 @@ -MEMORY_INITIALIZATION_RADIX=16; -MEMORY_INITIALIZATION_VECTOR= -0, -000000C00000008000000002, -00000003000000020000000100000000, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -00f586b30007a60340d585b300d7073300d787b3002797930027171300f707330207086302e787b3cc5027f30480258304402683040027030000000b008000ef, 
-00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000008067fef718e300c6a02300478793, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, 
-0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0, -0; diff --git a/hw/syn/xilinx/xrt/Makefile b/hw/syn/xilinx/xrt/Makefile index 38ae29f36..bba2a5d65 100644 --- a/hw/syn/xilinx/xrt/Makefile +++ b/hw/syn/xilinx/xrt/Makefile @@ -4,7 +4,7 @@ include $(ROOT_DIR)/config.mk ifneq ($(findstring Makefile, $(MAKEFILE_LIST)), Makefile) help: $(ECHO) "Makefile Usage:" - $(ECHO) " make all TARGET= PLATFORM=" + $(ECHO) " make all TARGET= PLATFORM=" $(ECHO) " Command to generate the design for specified Target and Device." 
$(ECHO) "" $(ECHO) " make clean" @@ -15,7 +15,6 @@ endif TARGET ?= hw PLATFORM ?= -NUM_CORES ?= 1 PREFIX ?= build$(XLEN) MAX_JOBS ?= 8 @@ -53,6 +52,9 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE DBG_TRACE_FLAGS += -DDBG_TRACE_MEM DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE DBG_TRACE_FLAGS += -DDBG_TRACE_AFU +DBG_TRACE_FLAGS += -DDBG_TRACE_TEX +DBG_TRACE_FLAGS += -DDBG_TRACE_RASTER +DBG_TRACE_FLAGS += -DDBG_TRACE_OM DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR # Control logic analyzer monitors @@ -60,8 +62,8 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU -DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED +ifdef NUM_CORES # cluster configuration CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 @@ -71,11 +73,14 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 CONFIGS += $(CONFIGS_$(NUM_CORES)c) +endif -# include paths +# include sources +RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src + RTL_PKGS += $(THIRD_PARTY_DIR)/cvfpu/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache -I$(AFU_DIR) RTL_INCLUDE += $(FPU_INCLUDE) @@ -84,14 +89,11 @@ RTL_INCLUDE += $(FPU_INCLUDE) 
VPP_FLAGS += --link --target $(TARGET) --platform $(PLATFORM) --save-temps --no_ip_cache VPP_FLAGS += --vivado.synth.jobs $(JOBS) --vivado.impl.jobs $(JOBS) -ifeq ($(DEV_ARCH), zynquplus) -# ztnq -else ifeq ($(DEV_ARCH), versal) -# versal -else -# alveo -VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:15] -endif +# register compilation hooks +VPP_FLAGS += --xp "vivado_prop:run.impl_1.STEPS.OPT_DESIGN.TCL.PRE=${SRC_DIR}/pre_opt_hook.tcl" + +# load platform settings +include $(SRC_DIR)/platforms.mk VPP_FLAGS += --report_level 2 VPP_FLAGS += --config $(SRC_DIR)/vitis.ini @@ -113,12 +115,13 @@ endif # Debugging ifdef DEBUG - VPP_FLAGS += -g --debug.protocol all - ifneq ($(TARGET), hw) + VPP_FLAGS += -g --optimize 0 --debug.protocol all + ifeq ($(TARGET), hw) + VPP_FLAGS += --debug.chipscope vortex_afu_1 + CFLAGS += -DNDEBUG -DCHIPSCOPE $(DBG_SCOPE_FLAGS) + else VPP_FLAGS += --vivado.prop fileset.sim_1.xsim.elaborate.debug_level=all CFLAGS += -DDEBUG_LEVEL=$(DEBUG) $(DBG_TRACE_FLAGS) - else - CFLAGS += -DNDEBUG endif else VPP_FLAGS += --optimize 3 @@ -128,7 +131,7 @@ endif # Enable scope analyzer ifdef SCOPE CFLAGS += -DSCOPE $(DBG_SCOPE_FLAGS) - SCOPE_JSON += $(BUILD_DIR)/scope.json + SCOPE_JSON += $(BIN_DIR)/scope.json endif # compilation flags @@ -138,7 +141,7 @@ CFLAGS += $(CONFIGS) CFLAGS += $(RTL_INCLUDE) # ast dump flags -XML_CFLAGS = $(filter-out -DSYNTHESIS -DVIVADO, $(CFLAGS)) -I$(DPI_DIR) +XML_CFLAGS = $(filter-out -DSYNTHESIS -DVIVADO, $(CFLAGS)) $(RTL_PKGS) -I$(DPI_DIR) -DSV_DPI # RTL Kernel only supports Hardware and Hardware Emulation. 
ifneq ($(TARGET),$(findstring $(TARGET), hw hw_emu)) @@ -157,34 +160,34 @@ gen-ast: $(BUILD_DIR)/vortex.xml $(BUILD_DIR)/vortex.xml: mkdir -p $(BUILD_DIR); cd $(BUILD_DIR); verilator --xml-only -O0 $(XML_CFLAGS) vortex_afu.v --xml-output vortex.xml -scope-json: $(BUILD_DIR)/scope.json -$(BUILD_DIR)/scope.json: $(BUILD_DIR)/vortex.xml - mkdir -p $(BUILD_DIR); cd $(BUILD_DIR); $(SCRIPT_DIR)/scope.py vortex.xml -o scope.json +scope-json: $(BIN_DIR)/scope.json +$(BIN_DIR)/scope.json: $(BUILD_DIR)/vortex.xml + mkdir -p $(BUILD_DIR); cd $(BUILD_DIR); $(SCRIPT_DIR)/scope.py vortex.xml -o bin/scope.json gen-xo: $(XO_CONTAINER) $(XO_CONTAINER): $(BUILD_DIR)/sources.txt - mkdir -p $(BUILD_DIR); cd $(BUILD_DIR); $(VIVADO) -mode batch -source $(SRC_DIR)/scripts/gen_xo.tcl -tclargs ../$(XO_CONTAINER) vortex_afu sources.txt $(SCRIPT_DIR) ../$(BUILD_DIR) + mkdir -p $(BUILD_DIR); cd $(BUILD_DIR); TOOL_DIR=$(SCRIPT_DIR) $(VIVADO) -mode batch -source $(SRC_DIR)/gen_xo.tcl -tclargs ../$(XO_CONTAINER) vortex_afu sources.txt ../$(BUILD_DIR) gen-bin: $(XCLBIN_CONTAINER) $(XCLBIN_CONTAINER): $(XO_CONTAINER) $(SCOPE_JSON) - mkdir -p $(BIN_DIR); cd $(BUILD_DIR); $(VPP) $(VPP_FLAGS) -o ../$(XCLBIN_CONTAINER) ../$(XO_CONTAINER) + mkdir -p $(BIN_DIR); cd $(BUILD_DIR); TOOL_DIR=$(SCRIPT_DIR) $(VPP) $(VPP_FLAGS) -o ../$(XCLBIN_CONTAINER) ../$(XO_CONTAINER) emconfig: $(BIN_DIR)/emconfig.json $(BIN_DIR)/emconfig.json: mkdir -p $(BIN_DIR); cd $(BUILD_DIR); emconfigutil --platform $(PLATFORM) --od ../$(BIN_DIR) report: $(XCLBIN_CONTAINER) -ifeq ($(TARGET),$(findstring $(TARGET), hw)) - cp $(BUILD_DIR)/_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log $(BUILD_DIR)/bin/runme.log - cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_full_util_routed.rpt $(BUILD_DIR)/bin/synthesis.log - cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt $(BUILD_DIR)/bin/timing.log +ifeq ($(TARGET), hw) + cp $(BUILD_DIR)/_x/logs/link/vivado.log $(BUILD_DIR)/bin + cp 
$(BUILD_DIR)/_x/logs/link/syn/ulp_vortex_afu_1_0_synth_1_runme.log $(BUILD_DIR)/bin + cp $(BUILD_DIR)/_x/reports/link/syn/ulp_vortex_afu_1_0_synth_1_ulp_vortex_afu_1_0_utilization_synth.rpt $(BUILD_DIR)/bin + cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_utilization_placed.rpt $(BUILD_DIR)/bin + cp $(BUILD_DIR)/_x/reports/link/imp/impl_1_hw_bb_locked_timing_summary_routed.rpt $(BUILD_DIR)/bin + cp $(BUILD_DIR)/_x/link/vivado/vpl/prj/prj.runs/impl_1/hier_utilization.rpt $(BUILD_DIR)/bin endif -hwserver: - debug_hw --xvc_pcie /dev/xfpga/xvc_pub.u2305.0 --hw_server & - chipscope: - debug_hw --vivado --host localhost --ltx_file $(BUILD_DIR)/_x/link/vivado/vpl/prj/prj.runs/impl_1/debug_nets.ltx & + debug_hw --vivado --host localhost --ltx_file $(BUILD_DIR)/bin/vortex_afu.ltx & clean: $(RMDIR) $(BUILD_DIR) diff --git a/hw/syn/xilinx/xrt/scripts/gen_xo.tcl b/hw/syn/xilinx/xrt/gen_xo.tcl similarity index 67% rename from hw/syn/xilinx/xrt/scripts/gen_xo.tcl rename to hw/syn/xilinx/xrt/gen_xo.tcl index 0f95f09be..b852d90e9 100644 --- a/hw/syn/xilinx/xrt/scripts/gen_xo.tcl +++ b/hw/syn/xilinx/xrt/gen_xo.tcl @@ -1,29 +1,29 @@ # Copyright © 2019-2023 -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-if { $::argc != 5 } { +if { $::argc != 4 } { puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n" - puts "Usage: $::argv0 \n" + puts "Usage: $::argv0 \n" exit } set xoname [lindex $::argv 0] set krnl_name [lindex $::argv 1] set vcs_file [lindex $::argv 2] -set tool_dir [lindex $::argv 3] -set build_dir [lindex $::argv 4] +set build_dir [lindex $::argv 3] -set script_path [ file dirname [ file normalize [ info script ] ] ] +set tool_dir $::env(TOOL_DIR) +set script_dir [ file dirname [ file normalize [ info script ] ] ] if {[file exists "${xoname}"]} { file delete -force "${xoname}" @@ -31,10 +31,10 @@ if {[file exists "${xoname}"]} { set argv [list ${build_dir}/ip] set argc 1 -source ${script_path}/gen_ip.tcl +source ${tool_dir}/xilinx_ip_gen.tcl -set argv [list ${krnl_name} ${vcs_file} ${tool_dir} ${build_dir}] -set argc 4 -source ${script_path}/package_kernel.tcl +set argv [list ${krnl_name} ${vcs_file} ${build_dir}] +set argc 3 +source ${script_dir}/package_kernel.tcl -package_xo -xo_path ${xoname} -kernel_name ${krnl_name} -ip_directory "${build_dir}/xo/packaged_kernel" +package_xo -xo_path ${xoname} -kernel_name ${krnl_name} -ip_directory "${build_dir}/xo/packaged_kernel" \ No newline at end of file diff --git a/hw/syn/xilinx/xrt/package_kernel.tcl b/hw/syn/xilinx/xrt/package_kernel.tcl new file mode 100644 index 000000000..dd916d4d7 --- /dev/null +++ b/hw/syn/xilinx/xrt/package_kernel.tcl @@ -0,0 +1,276 @@ +# Copyright © 2019-2023 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +if { $::argc != 3 } { + puts "ERROR: Program \"$::argv0\" requires 3 arguments!\n" + puts "Usage: $::argv0 \n" + exit +} + +set krnl_name [lindex $::argv 0] +set vcs_file [lindex $::argv 1] +set build_dir [lindex $::argv 2] + +set tool_dir $::env(TOOL_DIR) +set script_dir [ file dirname [ file normalize [ info script ] ] ] + +puts "Using krnl_name=$krnl_name" +puts "Using vcs_file=$vcs_file" +puts "Using tool_dir=$tool_dir" +puts "Using build_dir=$build_dir" +puts "Using script_dir=$script_dir" + +set path_to_packaged "${build_dir}/xo/packaged_kernel" +set path_to_tmp_project "${build_dir}/xo/project" + +source "${tool_dir}/parse_vcs_list.tcl" +set vlist [parse_vcs_list "${vcs_file}"] + +set vsources_list [lindex $vlist 0] +set vincludes_list [lindex $vlist 1] +set vdefines_list [lindex $vlist 2] + +#puts ${vsources_list} +#puts ${vincludes_list} +#puts ${vdefines_list} + +set chipscope 0 +set num_banks 1 +set merged_mem_if 0 + +# parse vdefines_list for configuration parameters +foreach def $vdefines_list { + set fields [split $def "="] + set name [lindex $fields 0] + if { $name == "CHIPSCOPE" } { + set chipscope 1 + } + if { $name == "PLATFORM_MEMORY_NUM_BANKS" } { + set num_banks [lindex $fields 1] + } + if { $name == "PLATFORM_MERGED_MEMORY_INTERFACE" } { + set merged_mem_if 1 + } +} + +if { $merged_mem_if == 1 } { + set num_banks 1 +} + +create_project -force kernel_pack $path_to_tmp_project + +add_files -norecurse ${vsources_list} + +set obj [get_filesets sources_1] +set ip_files [list \ + [file normalize "${build_dir}/ip/xil_fdiv/xil_fdiv.xci"] \ + [file normalize "${build_dir}/ip/xil_fma/xil_fma.xci"] \ + [file normalize "${build_dir}/ip/xil_fsqrt/xil_fsqrt.xci"] \ +] +add_files -verbose -norecurse -fileset $obj $ip_files + +set_property include_dirs ${vincludes_list} [current_fileset] +set_property verilog_define ${vdefines_list} [current_fileset] + 
+set obj [get_filesets sources_1] +set_property -verbose -name "top" -value ${krnl_name} -objects $obj + +if { $chipscope == 1 } { + # hw debugging + create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_afu + set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ + CONFIG.C_EN_STRG_QUAL {1} \ + CONFIG.C_DATA_DEPTH {8192} \ + CONFIG.C_NUM_OF_PROBES {2} \ + CONFIG.C_PROBE0_WIDTH {8} \ + CONFIG.C_PROBE1_WIDTH {64} \ + CONFIG.ALL_PROBE_SAME_MU {false} \ + CONFIG.ALL_PROBE_SAME_MU_CNT {2} \ + ] [get_ips ila_afu] + generate_target {instantiation_template} [get_files ila_afu.xci] + set_property generate_synth_checkpoint false [get_files ila_afu.xci] + + create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_fetch + set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ + CONFIG.C_EN_STRG_QUAL {1} \ + CONFIG.C_DATA_DEPTH {8192} \ + CONFIG.C_NUM_OF_PROBES {3} \ + CONFIG.C_PROBE0_WIDTH {40} \ + CONFIG.C_PROBE1_WIDTH {80} \ + CONFIG.C_PROBE2_WIDTH {40} \ + CONFIG.ALL_PROBE_SAME_MU {false} \ + CONFIG.ALL_PROBE_SAME_MU_CNT {2} \ + ] [get_ips ila_fetch] + generate_target {instantiation_template} [get_files ila_fetch.xci] + set_property generate_synth_checkpoint false [get_files ila_fetch.xci] + + create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_issue + set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ + CONFIG.C_EN_STRG_QUAL {1} \ + CONFIG.C_DATA_DEPTH {8192} \ + CONFIG.C_NUM_OF_PROBES {4} \ + CONFIG.C_PROBE0_WIDTH {112} \ + CONFIG.C_PROBE1_WIDTH {112} \ + CONFIG.C_PROBE2_WIDTH {280} \ + CONFIG.C_PROBE3_WIDTH {112} \ + CONFIG.ALL_PROBE_SAME_MU {false} \ + CONFIG.ALL_PROBE_SAME_MU_CNT {2} \ + ] [get_ips ila_issue] + generate_target {instantiation_template} [get_files ila_issue.xci] + set_property generate_synth_checkpoint false [get_files ila_issue.xci] + + create_ip -name ila -vendor xilinx.com -library ip -version 6.2 -module_name ila_lsu + set_property -dict [list CONFIG.C_ADV_TRIGGER 
{true} \ + CONFIG.C_EN_STRG_QUAL {1} \ + CONFIG.C_DATA_DEPTH {8192} \ + CONFIG.C_NUM_OF_PROBES {3} \ + CONFIG.C_PROBE0_WIDTH {288} \ + CONFIG.C_PROBE1_WIDTH {152} \ + CONFIG.C_PROBE2_WIDTH {72} \ + CONFIG.ALL_PROBE_SAME_MU {false} \ + CONFIG.ALL_PROBE_SAME_MU_CNT {2} \ + ] [get_ips ila_lsu] + generate_target {instantiation_template} [get_files ila_lsu.xci] + set_property generate_synth_checkpoint false [get_files ila_lsu.xci] +} + +update_compile_order -fileset sources_1 +update_compile_order -fileset sim_1 +ipx::package_project -root_dir $path_to_packaged -vendor xilinx.com -library RTLKernel -taxonomy /KernelIP -import_files -set_current false +ipx::unload_core $path_to_packaged/component.xml +ipx::edit_ip_in_project -upgrade true -name tmp_edit_project -directory $path_to_packaged $path_to_packaged/component.xml + +set core [ipx::current_core] + +set_property core_revision 2 $core +foreach up [ipx::get_user_parameters] { + ipx::remove_user_parameter [get_property NAME $up] $core +} + +ipx::associate_bus_interfaces -busif s_axi_ctrl -clock ap_clk $core + +for {set i 0} {$i < $num_banks} {incr i} { + ipx::associate_bus_interfaces -busif m_axi_mem_$i -clock ap_clk $core +} + +set mem_map [::ipx::add_memory_map -quiet "s_axi_ctrl" $core] +set addr_block [::ipx::add_address_block -quiet "reg0" $mem_map] + +set reg [::ipx::add_register "CTRL" $addr_block] +set_property description "Control signals" $reg +set_property address_offset 0x000 $reg +set_property size 32 $reg + +set field [ipx::add_field AP_START $reg] +set_property ACCESS {read-write} $field +set_property BIT_OFFSET {0} $field +set_property BIT_WIDTH {1} $field +set_property DESCRIPTION {Control signal Register for 'ap_start'.} $field +set_property MODIFIED_WRITE_VALUE {modify} $field + +set field [ipx::add_field AP_DONE $reg] +set_property ACCESS {read-only} $field +set_property BIT_OFFSET {1} $field +set_property BIT_WIDTH {1} $field +set_property DESCRIPTION {Control signal Register for 'ap_done'.} 
$field +set_property READ_ACTION {modify} $field + +set field [ipx::add_field AP_IDLE $reg] +set_property ACCESS {read-only} $field +set_property BIT_OFFSET {2} $field +set_property BIT_WIDTH {1} $field +set_property DESCRIPTION {Control signal Register for 'ap_idle'.} $field +set_property READ_ACTION {modify} $field + +set field [ipx::add_field AP_READY $reg] +set_property ACCESS {read-only} $field +set_property BIT_OFFSET {3} $field +set_property BIT_WIDTH {1} $field +set_property DESCRIPTION {Control signal Register for 'ap_ready'.} $field +set_property READ_ACTION {modify} $field + +set field [ipx::add_field RESERVED_1 $reg] +set_property ACCESS {read-only} $field +set_property BIT_OFFSET {4} $field +set_property BIT_WIDTH {3} $field +set_property DESCRIPTION {Reserved. 0s on read.} $field +set_property READ_ACTION {modify} $field + +set field [ipx::add_field AUTO_RESTART $reg] +set_property ACCESS {read-write} $field +set_property BIT_OFFSET {7} $field +set_property BIT_WIDTH {1} $field +set_property DESCRIPTION {Control signal Register for 'auto_restart'.} $field +set_property MODIFIED_WRITE_VALUE {modify} $field + +set field [ipx::add_field RESERVED_2 $reg] +set_property ACCESS {read-only} $field +set_property BIT_OFFSET {8} $field +set_property BIT_WIDTH {24} $field +set_property DESCRIPTION {Reserved. 
0s on read.} $field +set_property READ_ACTION {modify} $field + +set reg [::ipx::add_register "GIER" $addr_block] +set_property description "Global Interrupt Enable Register" $reg +set_property address_offset 0x004 $reg +set_property size 32 $reg + +set reg [::ipx::add_register "IP_IER" $addr_block] +set_property description "IP Interrupt Enable Register" $reg +set_property address_offset 0x008 $reg +set_property size 32 $reg + +set reg [::ipx::add_register "IP_ISR" $addr_block] +set_property description "IP Interrupt Status Register" $reg +set_property address_offset 0x00C $reg +set_property size 32 $reg + +set reg [::ipx::add_register -quiet "DEV" $addr_block] +set_property address_offset 0x010 $reg +set_property size [expr {8*8}] $reg + +set reg [::ipx::add_register -quiet "ISA" $addr_block] +set_property address_offset 0x018 $reg +set_property size [expr {8*8}] $reg + +set reg [::ipx::add_register -quiet "DCR" $addr_block] +set_property address_offset 0x020 $reg +set_property size [expr {8*8}] $reg + +set reg [::ipx::add_register -quiet "SCP" $addr_block] +set_property address_offset 0x028 $reg +set_property size [expr {8*8}] $reg + +for {set i 0} {$i < $num_banks} {incr i} { +# Add register for each memory bank +set reg [::ipx::add_register -quiet "MEM_$i" $addr_block] +set_property address_offset [expr {0x30 + $i * 8}] $reg +set_property size [expr {8*8}] $reg +# Associate the bus interface +set regparam [::ipx::add_register_parameter ASSOCIATED_BUSIF $reg] +set_property value m_axi_mem_$i $regparam +} + +set_property slave_memory_map_ref "s_axi_ctrl" [::ipx::get_bus_interfaces -of $core "s_axi_ctrl"] + +set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} $core +set_property sdx_kernel true $core +set_property sdx_kernel_type rtl $core +set_property supported_families { } $core +set_property auto_family_support_level level_2 $core + +ipx::create_xgui_files $core +ipx::update_checksums $core +ipx::check_integrity -kernel $core +ipx::save_core $core 
+close_project -delete diff --git a/hw/syn/xilinx/xrt/platforms.mk b/hw/syn/xilinx/xrt/platforms.mk new file mode 100644 index 000000000..a38782a54 --- /dev/null +++ b/hw/syn/xilinx/xrt/platforms.mk @@ -0,0 +1,40 @@ +# Platform specific configurations +# Add your platform specific configurations here + +CONFIGS += -DPLATFORM_MEMORY_DATA_WIDTH=512 + +ifeq ($(DEV_ARCH), zynquplus) +# zynquplus +CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 +else ifeq ($(DEV_ARCH), versal) +# versal +CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 -DPLATFORM_MEMORY_ADDR_WIDTH=32 +ifneq ($(findstring xilinx_vck5000,$(XSA)),) + CONFIGS += -DPLATFORM_MEMORY_OFFSET=40'hC000000000 +endif +else +# alveo +ifneq ($(findstring xilinx_u55c,$(XSA)),) + # 16 GB of HBM2 with 32 channels (512 MB per channel) + CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=34 + CONFIGS += -DPLATFORM_MERGED_MEMORY_INTERFACE + VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] + #VPP_FLAGS += $(foreach i,$(shell seq 0 31), --connectivity.sp vortex_afu_1.m_axi_mem_$(i):HBM[$(i)]) +else ifneq ($(findstring xilinx_u50,$(XSA)),) + # 8 GB of HBM2 with 32 channels (256 MB per channel) + CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33 + VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] +else ifneq ($(findstring xilinx_u280,$(XSA)),) + # 8 GB of HBM2 with 32 channels (256 MB per channel) + CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=32 -DPLATFORM_MEMORY_ADDR_WIDTH=33 + VPP_FLAGS += --connectivity.sp vortex_afu_1.m_axi_mem_0:HBM[0:31] +else ifneq ($(findstring xilinx_u250,$(XSA)),) + # 64 GB of DDR4 with 4 channels (16 GB per channel) + CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36 +else ifneq ($(findstring xilinx_u200,$(XSA)),) + # 64 GB of DDR4 with 4 channels (16 GB per channel) + CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=4 -DPLATFORM_MEMORY_ADDR_WIDTH=36 +else + CONFIGS += -DPLATFORM_MEMORY_NUM_BANKS=1 
-DPLATFORM_MEMORY_ADDR_WIDTH=32 +endif +endif diff --git a/hw/syn/xilinx/xrt/pre_opt_hook.tcl b/hw/syn/xilinx/xrt/pre_opt_hook.tcl new file mode 100644 index 000000000..0a3dda421 --- /dev/null +++ b/hw/syn/xilinx/xrt/pre_opt_hook.tcl @@ -0,0 +1,4 @@ +set tool_dir $::env(TOOL_DIR) +source ${tool_dir}/xilinx_async_bram_patch.tcl + +report_utilization -file hier_utilization.rpt -hierarchical -hierarchical_percentages \ No newline at end of file diff --git a/hw/syn/xilinx/xrt/scripts/package_kernel.tcl b/hw/syn/xilinx/xrt/scripts/package_kernel.tcl deleted file mode 100644 index 607e7955d..000000000 --- a/hw/syn/xilinx/xrt/scripts/package_kernel.tcl +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright © 2019-2023 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -if { $::argc != 4 } { - puts "ERROR: Program \"$::argv0\" requires 4 arguments!\n" - puts "Usage: $::argv0 \n" - exit -} - -set krnl_name [lindex $::argv 0] -set vcs_file [lindex $::argv 1] -set tool_dir [lindex $::argv 2] -set build_dir [lindex $::argv 3] - -set path_to_packaged "${build_dir}/xo/packaged_kernel" -set path_to_tmp_project "${build_dir}/xo/project" - -source "${tool_dir}/parse_vcs_list.tcl" -set vlist [parse_vcs_list "${vcs_file}"] - -set vsources_list [lindex $vlist 0] -set vincludes_list [lindex $vlist 1] -set vdefines_list [lindex $vlist 2] - -#puts ${vsources_list} -#puts ${vincludes_list} -#puts ${vdefines_list} - -# find if chipscope is enabled -set chipscope 0 -foreach def $vdefines_list { - set fields [split $def "="] - set name [lindex $fields 0] - if { $name == "CHIPSCOPE" } { - set chipscope 1 - } -} - -create_project -force kernel_pack $path_to_tmp_project - -add_files -norecurse ${vsources_list} - -set obj [get_filesets sources_1] -set files [list \ - [file normalize "${build_dir}/ip/xil_fdiv/xil_fdiv.xci"] \ - [file normalize "${build_dir}/ip/xil_fma/xil_fma.xci"] \ - [file normalize "${build_dir}/ip/xil_fsqrt/xil_fsqrt.xci"] \ -] -add_files -verbose -norecurse -fileset $obj $files - -set_property include_dirs ${vincludes_list} [current_fileset] -#set_property verilog_define ${vdefines_list} [current_fileset] - -set obj [get_filesets sources_1] -set_property -verbose -name "top" -value ${krnl_name} -objects $obj - -if { $chipscope == 1 } { - # hw debugging - create_ip -name axis_ila -vendor xilinx.com -library ip -version 1.1 -module_name ila_afu - set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ - CONFIG.C_EN_STRG_QUAL {1} \ - CONFIG.C_DATA_DEPTH {4096} \ - CONFIG.C_NUM_OF_PROBES {2} \ - CONFIG.C_PROBE0_WIDTH {8} \ - CONFIG.C_PROBE1_WIDTH {24} \ - ] [get_ips ila_afu] - generate_target {instantiation_template} [get_files ila_afu.xci] - set_property generate_synth_checkpoint false [get_files ila_afu.xci] - - create_ip -name 
axis_ila -vendor xilinx.com -library ip -version 1.1 -module_name ila_fetch - set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ - CONFIG.C_EN_STRG_QUAL {1} \ - CONFIG.C_DATA_DEPTH {4096} \ - CONFIG.C_NUM_OF_PROBES {3} \ - CONFIG.C_PROBE0_WIDTH {128} \ - CONFIG.C_PROBE1_WIDTH {128} \ - CONFIG.C_PROBE2_WIDTH {128} \ - ] [get_ips ila_fetch] - generate_target {instantiation_template} [get_files ila_fetch.xci] - set_property generate_synth_checkpoint false [get_files ila_fetch.xci] - - create_ip -name axis_ila -vendor xilinx.com -library ip -version 1.1 -module_name ila_issue - set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ - CONFIG.C_EN_STRG_QUAL {1} \ - CONFIG.C_DATA_DEPTH {4096} \ - CONFIG.C_NUM_OF_PROBES {2} \ - CONFIG.C_PROBE0_WIDTH {256} \ - CONFIG.C_PROBE1_WIDTH {128} \ - ] [get_ips ila_issue] - generate_target {instantiation_template} [get_files ila_issue.xci] - set_property generate_synth_checkpoint false [get_files ila_issue.xci] - - create_ip -name axis_ila -vendor xilinx.com -library ip -version 1.1 -module_name ila_lsu - set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ - CONFIG.C_EN_STRG_QUAL {1} \ - CONFIG.C_DATA_DEPTH {4096} \ - CONFIG.C_NUM_OF_PROBES {4} \ - CONFIG.C_PROBE0_WIDTH {256} \ - CONFIG.C_PROBE1_WIDTH {128} \ - CONFIG.C_PROBE2_WIDTH {288} \ - CONFIG.C_PROBE3_WIDTH {256} \ - ] [get_ips ila_lsu] - generate_target {instantiation_template} [get_files ila_lsu.xci] - set_property generate_synth_checkpoint false [get_files ila_lsu.xci] - - create_ip -name axis_ila -vendor xilinx.com -library ip -version 1.1 -module_name ila_msched - set_property -dict [list CONFIG.C_ADV_TRIGGER {true} \ - CONFIG.C_EN_STRG_QUAL {1} \ - CONFIG.C_DATA_DEPTH {4096} \ - CONFIG.C_NUM_OF_PROBES {4} \ - CONFIG.C_PROBE0_WIDTH {128} \ - CONFIG.C_PROBE1_WIDTH {128} \ - CONFIG.C_PROBE2_WIDTH {128} \ - CONFIG.C_PROBE3_WIDTH {128} \ - ] [get_ips ila_msched] - generate_target {instantiation_template} [get_files ila_msched.xci] - set_property 
generate_synth_checkpoint false [get_files ila_msched.xci] -} - -update_compile_order -fileset sources_1 -update_compile_order -fileset sim_1 -ipx::package_project -root_dir $path_to_packaged -vendor xilinx.com -library RTLKernel -taxonomy /KernelIP -import_files -set_current false -ipx::unload_core $path_to_packaged/component.xml -ipx::edit_ip_in_project -upgrade true -name tmp_edit_project -directory $path_to_packaged $path_to_packaged/component.xml - -set core [ipx::current_core] - -set_property core_revision 2 $core -foreach up [ipx::get_user_parameters] { - ipx::remove_user_parameter [get_property NAME $up] $core -} - -ipx::associate_bus_interfaces -busif s_axi_ctrl -clock ap_clk $core - -for {set i 0} {$i < 1} {incr i} { - ipx::associate_bus_interfaces -busif m_axi_mem_$i -clock ap_clk $core -} - -set mem_map [::ipx::add_memory_map -quiet "s_axi_ctrl" $core] -set addr_block [::ipx::add_address_block -quiet "reg0" $mem_map] - -set reg [::ipx::add_register "CTRL" $addr_block] - set_property description "Control signals" $reg - set_property address_offset 0x000 $reg - set_property size 32 $reg - -set field [ipx::add_field AP_START $reg] - set_property ACCESS {read-write} $field - set_property BIT_OFFSET {0} $field - set_property BIT_WIDTH {1} $field - set_property DESCRIPTION {Control signal Register for 'ap_start'.} $field - set_property MODIFIED_WRITE_VALUE {modify} $field - -set field [ipx::add_field AP_DONE $reg] - set_property ACCESS {read-only} $field - set_property BIT_OFFSET {1} $field - set_property BIT_WIDTH {1} $field - set_property DESCRIPTION {Control signal Register for 'ap_done'.} $field - set_property READ_ACTION {modify} $field - -set field [ipx::add_field AP_IDLE $reg] - set_property ACCESS {read-only} $field - set_property BIT_OFFSET {2} $field - set_property BIT_WIDTH {1} $field - set_property DESCRIPTION {Control signal Register for 'ap_idle'.} $field - set_property READ_ACTION {modify} $field - -set field [ipx::add_field AP_READY $reg] - 
set_property ACCESS {read-only} $field - set_property BIT_OFFSET {3} $field - set_property BIT_WIDTH {1} $field - set_property DESCRIPTION {Control signal Register for 'ap_ready'.} $field - set_property READ_ACTION {modify} $field - -set field [ipx::add_field RESERVED_1 $reg] - set_property ACCESS {read-only} $field - set_property BIT_OFFSET {4} $field - set_property BIT_WIDTH {3} $field - set_property DESCRIPTION {Reserved. 0s on read.} $field - set_property READ_ACTION {modify} $field - -set field [ipx::add_field AUTO_RESTART $reg] - set_property ACCESS {read-write} $field - set_property BIT_OFFSET {7} $field - set_property BIT_WIDTH {1} $field - set_property DESCRIPTION {Control signal Register for 'auto_restart'.} $field - set_property MODIFIED_WRITE_VALUE {modify} $field - -set field [ipx::add_field RESERVED_2 $reg] - set_property ACCESS {read-only} $field - set_property BIT_OFFSET {8} $field - set_property BIT_WIDTH {24} $field - set_property DESCRIPTION {Reserved. 0s on read.} $field - set_property READ_ACTION {modify} $field - -set reg [::ipx::add_register "GIER" $addr_block] - set_property description "Global Interrupt Enable Register" $reg - set_property address_offset 0x004 $reg - set_property size 32 $reg - -set reg [::ipx::add_register "IP_IER" $addr_block] - set_property description "IP Interrupt Enable Register" $reg - set_property address_offset 0x008 $reg - set_property size 32 $reg - -set reg [::ipx::add_register "IP_ISR" $addr_block] - set_property description "IP Interrupt Status Register" $reg - set_property address_offset 0x00C $reg - set_property size 32 $reg - -set reg [::ipx::add_register -quiet "DEV" $addr_block] - set_property address_offset 0x010 $reg - set_property size [expr {8*8}] $reg - -set reg [::ipx::add_register -quiet "ISA" $addr_block] - set_property address_offset 0x01C $reg - set_property size [expr {8*8}] $reg - -set reg [::ipx::add_register -quiet "DCR" $addr_block] - set_property address_offset 0x028 $reg - set_property 
size [expr {8*8}] $reg - -set reg [::ipx::add_register -quiet "SCP" $addr_block] - set_property address_offset 0x034 $reg - set_property size [expr {8*8}] $reg - -for {set i 0} {$i < 1} {incr i} { - set reg [::ipx::add_register -quiet "MEM_$i" $addr_block] - set_property address_offset [expr {0x040 + $i * 12}] $reg - set_property size [expr {8*8}] $reg - set regparam [::ipx::add_register_parameter -quiet {ASSOCIATED_BUSIF} $reg] - set_property value m_axi_mem_$i $regparam -} - -set_property slave_memory_map_ref "s_axi_ctrl" [::ipx::get_bus_interfaces -of $core "s_axi_ctrl"] - -set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} $core -set_property sdx_kernel true $core -set_property sdx_kernel_type rtl $core -set_property supported_families { } $core -set_property auto_family_support_level level_2 $core - -ipx::create_xgui_files $core -ipx::update_checksums $core -ipx::check_integrity -kernel $core -ipx::save_core $core -close_project -delete diff --git a/hw/syn/yosys/Makefile b/hw/syn/yosys/Makefile index 80bfdae02..3e4d930e4 100644 --- a/hw/syn/yosys/Makefile +++ b/hw/syn/yosys/Makefile @@ -5,7 +5,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/yosys TOP_LEVEL_ENTITY ?= Vortex PREFIX ?= build -NUM_CORES ?= 1 SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts RTL_DIR := $(VORTEX_HOME)/hw/rtl @@ -29,8 +28,8 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU -DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED +ifdef NUM_CORES # cluster configuration CONFIGS_1c := -DNUM_CLUSTERS=1 -DNUM_CORES=1 CONFIGS_2c := -DNUM_CLUSTERS=1 -DNUM_CORES=2 @@ -40,11 +39,12 @@ CONFIGS_16c := -DNUM_CLUSTERS=1 -DNUM_CORES=16 -DL2_ENABLE CONFIGS_32c := -DNUM_CLUSTERS=2 -DNUM_CORES=16 -DL2_ENABLE CONFIGS_64c := -DNUM_CLUSTERS=4 -DNUM_CORES=16 -DL2_ENABLE CONFIGS += $(CONFIGS_$(NUM_CORES)c) +endif # include paths FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - FPU_INCLUDE += 
-J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -J$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -J$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/fpnew/src + FPU_INCLUDE += -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -J$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -J$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -J$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache RTL_INCLUDE += $(FPU_INCLUDE) diff --git a/hw/syn/yosys/synth.sh b/hw/syn/yosys/synth.sh index 79708b189..76559b8d3 100755 --- a/hw/syn/yosys/synth.sh +++ b/hw/syn/yosys/synth.sh @@ -20,6 +20,8 @@ # exit when any command fails set -e +library="" +sdc_file="" source="" top_level="" dir_list=() @@ -66,8 +68,14 @@ checkErrors() usage() { echo "$0 usage:" && grep " .)\ #" $0; exit 0; } [ $# -eq 0 ] && usage -while getopts "s:t:I:D:P:Wh" arg; do +while getopts "c:l:s:t:I:D:P:Wh" arg; do case $arg in + l) # library + library=${OPTARG} + ;; + c) # SDC constraints + sdc_file=${OPTARG} + ;; s) # source source=${OPTARG} ;; @@ -95,6 +103,16 @@ while getopts "s:t:I:D:P:Wh" arg; do done { + # read device library + if [ -n "$library" ]; then + echo "read_liberty $library" + fi + + # read design constraints + if [ -n "$sdc_file" ]; then + echo "read_sdc $sdc_file" + fi + # read design sources for dir in "${dir_list[@]}" do diff --git a/hw/unittest/Makefile b/hw/unittest/Makefile index 5722ec9bc..d3a74d794 100644 --- a/hw/unittest/Makefile +++ b/hw/unittest/Makefile @@ -1,23 +1,26 @@ all: - $(MAKE) -C cache $(MAKE) -C generic_queue $(MAKE) -C mem_streamer $(MAKE) -C cache_top $(MAKE) -C core_top $(MAKE) -C issue_top + $(MAKE) -C local_mem_top + $(MAKE) -C mem_unit_top run: - $(MAKE) -C cache run $(MAKE) -C generic_queue run $(MAKE) -C mem_streamer run $(MAKE) -C cache_top run $(MAKE) -C core_top run $(MAKE) -C issue_top run + $(MAKE) -C local_mem_top run + $(MAKE) -C 
mem_unit_top run clean: - $(MAKE) -C cache clean $(MAKE) -C generic_queue clean $(MAKE) -C mem_streamer clean $(MAKE) -C cache_top clean $(MAKE) -C core_top clean - $(MAKE) -C issue_top clean \ No newline at end of file + $(MAKE) -C issue_top clean + $(MAKE) -C local_mem_top clean + $(MAKE) -C mem_unit_top clean \ No newline at end of file diff --git a/hw/unittest/cache/cachesim.cpp b/hw/unittest/cache/cachesim.cpp deleted file mode 100644 index acd68419b..000000000 --- a/hw/unittest/cache/cachesim.cpp +++ /dev/null @@ -1,354 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "cachesim.h" -#include -#include -#include -#include -#include - -#ifndef TRACE_START_TIME -#define TRACE_START_TIME 0ull -#endif - -#ifndef TRACE_STOP_TIME -#define TRACE_STOP_TIME -1ull -#endif - -static uint64_t timestamp = 0; -static bool trace_enabled = false; -static uint64_t trace_start_time = TRACE_START_TIME; -static uint64_t trace_stop_time = TRACE_STOP_TIME; - -double sc_time_stamp() { - return timestamp; -} - -bool sim_trace_enabled() { - if (timestamp >= trace_start_time - && timestamp < trace_stop_time) - return true; - return trace_enabled; -} - -void sim_trace_enable(bool enable) { - trace_enabled = enable; -} - -CacheSim::CacheSim() { - // force random values for uninitialized signals - Verilated::randReset(2); - - // create RTL module instance - cache_ = new VVX_cache_top(); - -#ifdef VCD_OUTPUT - Verilated::traceEverOn(true); - tfp_ = new VerilatedVcdC; - cache_->trace(tfp_, 99); - tfp_->open("trace.vcd"); -#endif - - ram_ = nullptr; - mem_rsp_active_ = false; - snp_req_active_ = false; -} - -CacheSim::~CacheSim() { -#ifdef VCD_OUTPUT - tfp_->close(); -#endif - delete cache_; - //need to delete the req and rsp vectors -} - -void CacheSim::attach_ram(RAM* ram) { - ram_ = ram; - mem_rsp_vec_.clear(); -} - -void CacheSim::reset() { -#ifndef NDEBUG - std::cout << timestamp << ": [sim] reset()" << std::endl; -#endif - - cache_->reset = 1; - this->step(); - cache_->reset = 0; - this->step(); - - mem_rsp_vec_.clear(); - //clear req and rsp vecs - -} - -void CacheSim::step() { - //std::cout << timestamp << ": [sim] step()" << std::endl; - //toggle clock - cache_->clk = 0; - this->eval(); - - cache_->clk = 1; - this->eval(); - - //handle core and memory reqs and rsps - this->eval_reqs(); - this->eval_rsps(); - this->eval_mem_bus(); - timestamp++; -} - -void CacheSim::eval() { - cache_->eval(); -#ifdef VCD_OUTPUT - tfp_->dump(timestamp); -#endif - ++timestamp; -} - -void CacheSim::run(){ -//#ifndef NDEBUG - -//#endif - this->step(); - - int 
valid = 300; - int stalls = 20 + 10; - - while (valid > -1) { - - this->step(); - display_miss(); - if(cache_->core_rsp_valid){ - get_core_rsp(); - } - - if(!cache_->core_req_valid && !cache_->core_rsp_valid){ - valid--; - - } - stalls--; - if (stalls == 20){ - //stall_mem(); - //send_snoop_req(); - stalls--; - } - } -} - -void CacheSim::clear_req(){ - cache_->core_req_valid = 0; -} - -void CacheSim::send_req(core_req_t *req){ - core_req_vec_.push(req); - unsigned int *data = new unsigned int[4]; - core_rsp_vec_.insert(std::pair(req->tag, data)); -} - -bool CacheSim::get_core_req_ready(){ - return cache_->core_req_ready; -} - -bool CacheSim::get_core_rsp_ready(){ - return cache_->core_rsp_ready; -} - -void CacheSim::eval_reqs(){ - //check to see if cache is accepting reqs - if(!core_req_vec_.empty() && cache_->core_req_ready){ - core_req_t *req = core_req_vec_.front(); - - cache_->core_req_valid = req->valid; - cache_->core_req_rw = req->rw; - cache_->core_req_byteen = req->byteen; - - cache_->core_req_addr[0] = req->addr[0]; - cache_->core_req_addr[1] = req->addr[1]; - cache_->core_req_addr[2] = req->addr[2]; - cache_->core_req_addr[3] = req->addr[3]; - - cache_->core_req_data[0] = req->data[0]; - cache_->core_req_data[1] = req->data[1]; - cache_->core_req_data[2] = req->data[2]; - cache_->core_req_data[3] = req->data[3]; - - cache_->core_req_tag = req->tag; - - core_req_vec_.pop(); - - } else { - clear_req(); - } -} - -void CacheSim::eval_rsps(){ - //check to see if a request has been responded to - if (cache_->core_rsp_valid){ - core_rsp_vec_.at(cache_->core_rsp_tag)[0] = cache_->core_rsp_data[0]; - core_rsp_vec_.at(cache_->core_rsp_tag)[1] = cache_->core_rsp_data[1]; - core_rsp_vec_.at(cache_->core_rsp_tag)[2] = cache_->core_rsp_data[2]; - core_rsp_vec_.at(cache_->core_rsp_tag)[3] = cache_->core_rsp_data[3]; - } -} - -void CacheSim::stall_mem(){ - cache_->mem_req_ready = 0; -} - -void CacheSim::send_snoop_req(){ - /*cache_->snp_req_valid = 1; - 
cache_->snp_req_addr = 0x12222222; - cache_->snp_req_invalidate = 1; - cache_->snp_req_tag = 0xff; */ -} - -void CacheSim::eval_mem_bus() { - if (ram_ == nullptr) { - cache_->mem_req_ready = 0; - return; - } - - // schedule memory responses - int dequeue_index = -1; - for (int i = 0; i < mem_rsp_vec_.size(); i++) { - if (mem_rsp_vec_[i].cycles_left > 0) { - mem_rsp_vec_[i].cycles_left -= 1; - } - if ((dequeue_index == -1) - && (mem_rsp_vec_[i].cycles_left == 0)) { - dequeue_index = i; - } - } - - // send memory response - if (mem_rsp_active_ - && cache_->mem_rsp_valid - && cache_->mem_rsp_ready) { - mem_rsp_active_ = false; - } - if (!mem_rsp_active_) { - if (dequeue_index != -1) { //time to respond to the request - cache_->mem_rsp_valid = 1; - - //copy data from the rsp queue to the cache module - memcpy(cache_->mem_rsp_data.data(), mem_rsp_vec_[dequeue_index].data, MEM_BLOCK_SIZE); - - cache_->mem_rsp_tag = mem_rsp_vec_[dequeue_index].tag; - free(mem_rsp_vec_[dequeue_index].data); //take data out of the queue - mem_rsp_vec_.erase(mem_rsp_vec_.begin() + dequeue_index); - mem_rsp_active_ = true; - } else { - cache_->mem_rsp_valid = 0; - } - } - - // handle memory stalls - bool mem_stalled = false; -#ifdef ENABLE_MEM_STALLS - if (0 == ((timestamp/2) % MEM_STALLS_MODULO)) { - mem_stalled = true; - } else - if (mem_rsp_vec_.size() >= MEM_RQ_SIZE) { - mem_stalled = true; - } -#endif - - // process memory requests - if (!mem_stalled) { - if (cache_->mem_req_valid) { - if (cache_->mem_req_rw) { //write = 1 - uint64_t byteen = cache_->mem_req_byteen; - uint64_t base_addr = (cache_->mem_req_addr * MEM_BLOCK_SIZE); - uint8_t* data = reinterpret_cast(cache_->mem_req_data.data()); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; - } - } - } else { - mem_req_t mem_req; - mem_req.cycles_left = MEM_LATENCY; - mem_req.data = (uint8_t*)malloc(MEM_BLOCK_SIZE); - mem_req.tag = cache_->mem_req_tag; - 
ram_->read(cache_->mem_req_addr * MEM_BLOCK_SIZE, MEM_BLOCK_SIZE, mem_req.data); - mem_rsp_vec_.push_back(mem_req); - } - } - } - - cache_->mem_req_ready = ~mem_stalled; -} - -bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){ - int check = 0; - unsigned int *rsp = core_rsp_vec_.at(tag); - for (int i = 0; i < 4; ++i){ - for (int j = 0; j < 4; ++j){ - if (data[i] == rsp[j]){ - check++; - } - } - } - - return check; - -} - -//DEBUG - -void CacheSim::display_miss(){ - //int i = (unsigned int)cache_->miss_vec; - //std::bitset<8> x(i); - //if (i) std::cout << "Miss Vec " << x << std::endl; - //std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl; -} - -void CacheSim::get_core_req(unsigned int (&rsp)[4]){ - rsp[0] = cache_->core_rsp_data[0]; - rsp[1] = cache_->core_rsp_data[1]; - rsp[2] = cache_->core_rsp_data[2]; - rsp[3] = cache_->core_rsp_data[3]; - - //std::cout << std::hex << "core_rsp_valid: " << cache_->core_rsp_valid << std::endl; - //std::cout << std::hex << "core_rsp_data: " << cache_->core_rsp_data << std::endl; - //std::cout << std::hex << "core_rsp_tag: " << cache_->core_rsp_tag << std::endl; -} - -void CacheSim::get_core_rsp(){ - //std::cout << cache_->genblk5_BRA_0_KET_->bank->is_fill_in_pipe<< std::endl; - char check = cache_->core_rsp_valid; - std::cout << std::hex << "core_rsp_valid: " << (unsigned int) check << std::endl; - std::cout << std::hex << "core_rsp_data[0]: " << cache_->core_rsp_data[0] << std::endl; - std::cout << std::hex << "core_rsp_data[1]: " << cache_->core_rsp_data[1] << std::endl; - std::cout << std::hex << "core_rsp_data[2]: " << cache_->core_rsp_data[2] << std::endl; - std::cout << std::hex << "core_rsp_data[3]: " << cache_->core_rsp_data[3] << std::endl; - std::cout << std::hex << "core_rsp_tag: " << cache_->core_rsp_tag << std::endl; -} - -void CacheSim::get_mem_req(){ - std::cout << std::hex << "mem_req_valid: " << cache_->mem_req_valid << std::endl; - std::cout << std::hex << "mem_req_rw: " << 
cache_->mem_req_rw << std::endl; - std::cout << std::hex << "mem_req_byteen: " << cache_->mem_req_byteen << std::endl; - std::cout << std::hex << "mem_req_addr: " << cache_->mem_req_addr << std::endl; - std::cout << std::hex << "mem_req_data: " << cache_->mem_req_data << std::endl; - std::cout << std::hex << "mem_req_tag: " << cache_->mem_req_tag << std::endl; -} - -void CacheSim::get_mem_rsp(){ - std::cout << std::hex << "mem_rsp_valid: " << cache_->mem_rsp_valid << std::endl; - std::cout << std::hex << "mem_rsp_data: " << cache_->mem_rsp_data << std::endl; - std::cout << std::hex << "mem_rsp_tag: " << cache_->mem_rsp_tag << std::endl; - std::cout << std::hex << "mem_rsp_ready: " << cache_->mem_rsp_ready << std::endl; -} diff --git a/hw/unittest/cache/cachesim.h b/hw/unittest/cache/cachesim.h deleted file mode 100644 index 5235735d6..000000000 --- a/hw/unittest/cache/cachesim.h +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "VVX_cache_top.h" -#include "VVX_cache_top__Syms.h" -#include "verilated.h" - -#ifdef VCD_OUTPUT -#include -#endif - -#include -#include "ram.h" -#include -#include -#include - -#define ENABLE_MEM_STALLS -#define MEM_LATENCY 100 -#define MEM_RQ_SIZE 16 -#define MEM_STALLS_MODULO 16 - -typedef struct { - int cycles_left; - uint8_t *data; - unsigned tag; -} mem_req_t; - -typedef struct { - char valid; - char rw; - unsigned byteen; - unsigned *addr; - unsigned *data; - unsigned int tag; -} core_req_t; - -class CacheSim { -public: - - CacheSim(); - virtual ~CacheSim(); - - bool busy(); - - void reset(); - void step(); - void wait(uint32_t cycles); - void attach_ram(RAM* ram); - void run(); //run until all reqs are empty - - //req/rsp - void send_req(core_req_t *req); - void clear_req(); - void stall_mem(); - void send_snoop_req(); - void send_snp_fwd_in(); - - //assert funcs - bool assert_equal(unsigned int* data, unsigned int tag); - - //debug funcs - void get_mem_req(); - void get_core_req(unsigned int (&rsp)[4]); - void get_core_rsp(); - bool get_core_req_ready(); - bool get_core_rsp_ready(); - void get_mem_rsp(); - void display_miss(); - -private: - - void eval(); - void eval_reqs(); - void eval_rsps(); - void eval_mem_bus(); - - std::queue core_req_vec_; - std::vector mem_rsp_vec_; - std::map core_rsp_vec_; - int mem_rsp_active_; - - uint32_t snp_req_active_; - uint32_t snp_req_size_; - uint32_t pending_snp_reqs_; - - VVX_cache_top* cache_; - RAM* ram_; -#ifdef VCD_OUTPUT - VerilatedVcdC* tfp_; -#endif -}; diff --git a/hw/unittest/cache/ram.h b/hw/unittest/cache/ram.h deleted file mode 100644 index d01934a52..000000000 --- a/hw/unittest/cache/ram.h +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -class RAM { -private: - - mutable uint8_t *mem_[(1 << 12)]; - - uint8_t *get(uint32_t address) const { - uint32_t block_addr = address >> 20; - uint32_t block_offset = address & 0x000FFFFF; - if (mem_[block_addr] == NULL) { - mem_[block_addr] = new uint8_t[(1 << 20)]; - } - return mem_[block_addr] + block_offset; - } - -public: - - RAM() { - for (uint32_t i = 0; i < (1 << 12); i++) { - mem_[i] = NULL; - } - } - - ~RAM() { - this->clear(); - } - - size_t size() const { - return (1ull << 32); - } - - void clear() { - for (uint32_t i = 0; i < (1 << 12); i++) { - if (mem_[i]) { - delete [] mem_[i]; - mem_[i] = NULL; - } - } - } - - void read(uint32_t address, uint32_t length, uint8_t *data) const { - for (unsigned i = 0; i < length; i++) { - data[i] = *this->get(address + i); - } - } - - void write(uint32_t address, uint32_t length, const uint8_t *data) { - for (unsigned i = 0; i < length; i++) { - *this->get(address + i) = data[i]; - } - } - - uint8_t& operator[](uint32_t address) { - return *get(address); - } - - const uint8_t& operator[](uint32_t address) const { - return *get(address); - } -}; \ No newline at end of file diff --git a/hw/unittest/cache/testbench.cpp b/hw/unittest/cache/testbench.cpp deleted file mode 100644 index bf9dfb340..000000000 --- a/hw/unittest/cache/testbench.cpp +++ /dev/null @@ -1,248 +0,0 @@ -// Copyright © 2019-2023 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "cachesim.h" -#include -#include -#include - -#define VCD_OUTPUT 1 - - -int REQ_RSP(CacheSim *sim){ //verified - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - sim->send_req(write); - sim->send_req(read); - - sim->run(); - - int check = sim->assert_equal(data, write->tag); - - if (check == 4) return 1; - - return 0; -} - -int HIT_1(CacheSim *sim){ - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0x11; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0x22; - - // reset the device - sim->reset(); - - //queue reqs - 
sim->send_req(write); - sim->send_req(read); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} - -int MISS_1(CacheSim *sim){ - unsigned int addr1[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int addr2[4] = {0x12229222, 0xabbbb4bb, 0xcddd47dd, 0xe4423544}; - unsigned int addr3[4] = {0x12223332, 0xabb454bb, 0xcdddeefd, 0xe4447744}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr1; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read1 = new core_req_t; - read1->valid = 0xf; - read1->rw = 0; - read1->byteen = 0xffff; - read1->addr = addr1; - read1->data = data; - read1->tag = 0xff; - - core_req_t* read2 = new core_req_t; - read2->valid = 0xf; - read2->rw = 0; - read2->byteen = 0xffff; - read2->addr = addr2; - read2->data = data; - read2->tag = 0xff; - - core_req_t* read3 = new core_req_t; - read3->valid = 0xf; - read3->rw = 0; - read3->byteen = 0xffff; - read3->addr = addr3; - read3->data = data; - read3->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - sim->send_req(write); - sim->send_req(read1); - sim->send_req(read2); - sim->send_req(read3); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} -int FLUSH(CacheSim *sim){ - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - 
read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - sim->send_req(write); - sim->send_req(read); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} - - -int BACK_PRESSURE(CacheSim *sim){ - //happens whenever the core is stalled or memory is stalled - unsigned int addr[4] = {0x12222222, 0xabbbbbbb, 0xcddddddd, 0xe4444444}; - unsigned int data[4] = {0xffffffff, 0x11111111, 0x22222222, 0x33333333}; - unsigned int rsp[4] = {0,0,0,0}; - char responded = 0; - - //write req - core_req_t* write = new core_req_t; - write->valid = 0xf; - write->rw = 0xf; - write->byteen = 0xffff; - write->addr = addr; - write->data = data; - write->tag = 0xff; - - //read req - core_req_t* read = new core_req_t; - read->valid = 0xf; - read->rw = 0; - read->byteen = 0xffff; - read->addr = addr; - read->data = addr; - read->tag = 0xff; - - // reset the device - sim->reset(); - - //queue reqs - for (int i = 0; i < 10; i++){ - sim->send_req(write); - } - sim->send_req(read); - - sim->run(); - - bool check = sim->assert_equal(data, write->tag); - - return check; -} - - -int main(int argc, char **argv) -{ - //init - RAM ram; - CacheSim cachesim; - cachesim.attach_ram(&ram); - int check = REQ_RSP(&cachesim); - if(check){ - std::cout << "PASSED" << std::endl; - } else { - std::cout << "FAILED" << std::endl; - } - - return 0; -} diff --git a/hw/unittest/common.mk b/hw/unittest/common.mk index 48aefd415..71f6914bf 100644 --- a/hw/unittest/common.mk +++ b/hw/unittest/common.mk @@ -25,7 +25,7 @@ VL_FLAGS += $(RTL_PKGS) VL_FLAGS += --cc $(TOP) --top-module $(TOP) # Enable Verilator multithreaded simulation -THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())') +THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(mp.cpu_count())') VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) diff --git a/hw/unittest/core_top/Makefile 
b/hw/unittest/core_top/Makefile index d9fbf40f6..f9d037999 100644 --- a/hw/unittest/core_top/Makefile +++ b/hw/unittest/core_top/Makefile @@ -16,7 +16,7 @@ SRCS += $(SRC_DIR)/main.cpp DBG_TRACE_FLAGS := -DDBG_TRACE_CACHE -RTL_PKGS := $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv $(RTL_DIR)/core/VX_trace_pkg.sv +RTL_PKGS := $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv RTL_INCLUDE := -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs RTL_INCLUDE += -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/fpu -I$(RTL_DIR)/core diff --git a/hw/unittest/generic_queue/Makefile b/hw/unittest/generic_queue/Makefile index 0adf78fae..ad79c6f94 100644 --- a/hw/unittest/generic_queue/Makefile +++ b/hw/unittest/generic_queue/Makefile @@ -21,4 +21,6 @@ RTL_INCLUDE := -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs TOP := VX_fifo_queue +PARAMS := -GDATAW=32 -GDEPTH=8 + include ../common.mk \ No newline at end of file diff --git a/hw/unittest/issue_top/Makefile b/hw/unittest/issue_top/Makefile index 7e298849c..b6a8b0527 100644 --- a/hw/unittest/issue_top/Makefile +++ b/hw/unittest/issue_top/Makefile @@ -16,7 +16,7 @@ SRCS += $(SRC_DIR)/main.cpp DBG_TRACE_FLAGS := -DDBG_TRACE_CACHE -RTL_PKGS := $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/core/VX_trace_pkg.sv +RTL_PKGS := $(RTL_DIR)/VX_gpu_pkg.sv RTL_INCLUDE := -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs RTL_INCLUDE += -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/core diff --git a/hw/unittest/local_mem_top/Makefile b/hw/unittest/local_mem_top/Makefile new file mode 100644 index 000000000..22a8adfae --- /dev/null +++ b/hw/unittest/local_mem_top/Makefile @@ -0,0 +1,26 @@ +ROOT_DIR := $(realpath ../../..) 
+include $(ROOT_DIR)/config.mk + +PROJECT := local_mem_top + +RTL_DIR := $(VORTEX_HOME)/hw/rtl +DPI_DIR := $(VORTEX_HOME)/hw/dpi + +SRC_DIR := $(VORTEX_HOME)/hw/unittest/$(PROJECT) + +CXXFLAGS := -I$(SRC_DIR) -I$(VORTEX_HOME)/hw/unittest/common -I$(VORTEX_HOME)/sim/common +CXXFLAGS += -I$(ROOT_DIR)/hw + +SRCS := $(DPI_DIR)/util_dpi.cpp +SRCS += $(SRC_DIR)/main.cpp + +DBG_TRACE_FLAGS := + +RTL_PKGS := $(RTL_DIR)/VX_gpu_pkg.sv + +RTL_INCLUDE := -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs +RTL_INCLUDE += -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem + +TOP := VX_local_mem_top + +include ../common.mk \ No newline at end of file diff --git a/hw/unittest/local_mem_top/main.cpp b/hw/unittest/local_mem_top/main.cpp new file mode 100644 index 000000000..5191b4433 --- /dev/null +++ b/hw/unittest/local_mem_top/main.cpp @@ -0,0 +1,49 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "vl_simulator.h" + +#ifndef TRACE_START_TIME +#define TRACE_START_TIME 0ull +#endif + +#ifndef TRACE_STOP_TIME +#define TRACE_STOP_TIME -1ull +#endif + +static uint64_t timestamp = 0; +static bool trace_enabled = false; +static uint64_t trace_start_time = TRACE_START_TIME; +static uint64_t trace_stop_time = TRACE_STOP_TIME; + +double sc_time_stamp() { + return timestamp; +} + +bool sim_trace_enabled() { + if (timestamp >= trace_start_time + && timestamp < trace_stop_time) + return true; + return trace_enabled; +} + +void sim_trace_enable(bool enable) { + trace_enabled = enable; +} + +int main(int argc, char **argv) { + // Initialize Verilators variables + Verilated::commandArgs(argc, argv); + + return 0; +} \ No newline at end of file diff --git a/hw/unittest/cache/Makefile b/hw/unittest/mem_unit_top/Makefile similarity index 78% rename from hw/unittest/cache/Makefile rename to hw/unittest/mem_unit_top/Makefile index b734aaedd..8809551f4 100644 --- a/hw/unittest/cache/Makefile +++ b/hw/unittest/mem_unit_top/Makefile @@ -1,7 +1,7 @@ ROOT_DIR := $(realpath ../../..) 
include $(ROOT_DIR)/config.mk -PROJECT := cache +PROJECT := mem_unit_top RTL_DIR := $(VORTEX_HOME)/hw/rtl DPI_DIR := $(VORTEX_HOME)/hw/dpi @@ -12,15 +12,15 @@ CXXFLAGS := -I$(SRC_DIR) -I$(VORTEX_HOME)/hw/unittest/common -I$(VORTEX_HOME)/si CXXFLAGS += -I$(ROOT_DIR)/hw SRCS := $(DPI_DIR)/util_dpi.cpp -SRCS += $(SRC_DIR)/cachesim.cpp $(SRC_DIR)/testbench.cpp +SRCS += $(SRC_DIR)/main.cpp -DBG_TRACE_FLAGS := -DDBG_TRACE_CACHE +DBG_TRACE_FLAGS := RTL_PKGS := $(RTL_DIR)/VX_gpu_pkg.sv RTL_INCLUDE := -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -RTL_INCLUDE += -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache +RTL_INCLUDE += -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/mem -I$(RTL_DIR)/core -I$(RTL_DIR)/fpu -TOP := VX_cache_top +TOP := VX_mem_unit_top include ../common.mk \ No newline at end of file diff --git a/hw/unittest/mem_unit_top/main.cpp b/hw/unittest/mem_unit_top/main.cpp new file mode 100644 index 000000000..5191b4433 --- /dev/null +++ b/hw/unittest/mem_unit_top/main.cpp @@ -0,0 +1,49 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "vl_simulator.h" + +#ifndef TRACE_START_TIME +#define TRACE_START_TIME 0ull +#endif + +#ifndef TRACE_STOP_TIME +#define TRACE_STOP_TIME -1ull +#endif + +static uint64_t timestamp = 0; +static bool trace_enabled = false; +static uint64_t trace_start_time = TRACE_START_TIME; +static uint64_t trace_stop_time = TRACE_STOP_TIME; + +double sc_time_stamp() { + return timestamp; +} + +bool sim_trace_enabled() { + if (timestamp >= trace_start_time + && timestamp < trace_stop_time) + return true; + return trace_enabled; +} + +void sim_trace_enable(bool enable) { + trace_enabled = enable; +} + +int main(int argc, char **argv) { + // Initialize Verilators variables + Verilated::commandArgs(argc, argv); + + return 0; +} \ No newline at end of file diff --git a/kernel/include/vx_intrinsics.h b/kernel/include/vx_intrinsics.h index 6000065e9..f22819246 100644 --- a/kernel/include/vx_intrinsics.h +++ b/kernel/include/vx_intrinsics.h @@ -221,6 +221,24 @@ inline void vx_fence() { __asm__ volatile ("fence iorw, iorw"); } +//Matrix load +inline void vx_matrix_load(unsigned dest, unsigned addr) +{ + __asm__ volatile (".insn i 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr)); +} + +//Matrix Store +inline void vx_matrix_store(unsigned addr) +{ + __asm__ volatile (".insn i 0x7b, 1, x0, 0(%0)" :: "r"(addr)); +} + +//Matrix Mul +inline void vx_matrix_mul() +{ + __asm__ volatile (".insn i 0x7b, 2, x0, 0(x0)"); +} + #ifdef __cplusplus } #endif diff --git a/kernel/scripts/vxbin.py b/kernel/scripts/vxbin.py index 501d8949a..1dcd6a099 100755 --- a/kernel/scripts/vxbin.py +++ b/kernel/scripts/vxbin.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 # Copyright 2019-2023 # diff --git a/miscs/docker/Dockerfile.ubuntu b/miscs/docker/Dockerfile.prod similarity index 88% rename from miscs/docker/Dockerfile.ubuntu rename to miscs/docker/Dockerfile.prod index c3e72a0f4..20c9c033b 100644 --- a/miscs/docker/Dockerfile.ubuntu +++ b/miscs/docker/Dockerfile.prod @@ -17,11 +17,11 @@ 
FROM ubuntu:20.04 # Set non-interactive installation to avoid user input during build ARG DEBIAN_FRONTEND=noninteractive +# Install necessary dependencies and upgrade installed components # Update and install necessary dependencies RUN apt-get update && apt-get install -y \ software-properties-common \ build-essential \ - python \ python3 \ git \ wget \ @@ -32,6 +32,9 @@ RUN apt-get update && apt-get install -y \ # upgrade installed components RUN apt-get upgrade && apt-get update +# temporary until remote dependency script gets updated +RUN apt-get install -y cmake + # Clone the Vortex repository RUN git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git /vortex @@ -39,7 +42,7 @@ RUN git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git /v WORKDIR /vortex # install system dependencies -RUN ./ci/system_updates.sh +RUN ./ci/install_dependencies.sh # Configure the build folder RUN mkdir build && cd build && ../configure @@ -51,4 +54,4 @@ RUN cd build && ./ci/toolchain_install.sh --all RUN echo "source /vortex/build/ci/toolchain_env.sh" >> ~/.bashrc # Set the working directory to /vortex/build -WORKDIR /vortex/build \ No newline at end of file +WORKDIR /vortex/build diff --git a/miscs/docker/README.md b/miscs/docker/README.md index 897f8f9fb..c077102da 100644 --- a/miscs/docker/README.md +++ b/miscs/docker/README.md @@ -4,17 +4,32 @@ You can install Docker desktop on MAC or PC or Ubuntu. - MAC: https://docs.docker.com/desktop/install/mac-install - Ubuntu: https://docs.docker.com/desktop/install/ubuntu -### 1- Create a Docker image from the Dockerfile - $ docker build -f Dockerfile.ubuntu -t vortex +### 1- Build a Docker Image from the Dockerfile + $ docker build --platform=linux/amd64 -t vortex-packaged -f Dockerfile.prod . 
-### 2- Build the Docker image - $ docker docker run -it vortex /bin/bash +### 2- Construct and run a Container from the Docker Image + $ docker run -it --name vortex --privileged=true --platform=linux/amd64 vortex-packaged -### 3- Build the project +### 3- Build the Project One you login the Docker terminal, you will be in the build directory. $ make -s -### 4- Run a simple test +### 4- Run a Simple Test +See `docs/` to learn more! - $ ./ci/blackbox.sh --cores=2 --app=vecadd \ No newline at end of file + $ ./ci/blackbox.sh --cores=2 --app=vecadd + +### 5- Exit the Container + + $ exit + $ docker stop vortex + +### 6- Restart and Re-Enter the Container +If you ran step `2` and then step `5` then, you have to start and re-enter the container + + $ docker start vortex + $ docker exec -it vortex + +--- +Note: Apple Silicon macs will run the container in emulation mode, so compiling and running will take a considerable amount of time -- but it still works! \ No newline at end of file diff --git a/perf/cache/cache_perf.log b/perf/cache/cache_perf.log deleted file mode 100644 index 21a446d25..000000000 --- a/perf/cache/cache_perf.log +++ /dev/null @@ -1,3 +0,0 @@ -CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -running: CONFIGS=-DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 make -C ./ci/../driver/rtlsim -verilator --build --exe --cc Vortex --top-module Vortex --language 1800-2009 --assert -Wall -Wpedantic -Wno-DECLFILENAME -Wno-REDEFMACRO --x-initial unique --x-assign unique verilator.vlt -I../../hw/rtl -I../../hw/dpi -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/simulate -I../../hw/rtl/fp_cores -I../../third_party/fpnew/src/common_cells/include -I../../third_party/fpnew/src/common_cells/src -I../../third_party/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../third_party/fpnew/src -I../../hw/rtl/tex_unit -I../../hw/rtl/raster_unit 
-I../../hw/rtl/rop_unit -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -j 64 -DNDEBUG -DIMUL_DPI -DIDIV_DPI -DFPU_DPI ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp ../../hw/dpi/util_dpi.cpp ../../hw/dpi/float_dpi.cpp processor.cpp -CFLAGS '-std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds -fPIC -Wno-maybe-uninitialized -I../../../hw -I../../common -I../../../third_party/softfloat/source/include -I../../../third_party -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DNUM_WARPS=2 -DNUM_THREADS=2 -DPERF_ENABLE -DICACHE_NUM_WAYS=1 -O2 -DNDEBUG' -LDFLAGS '-shared ../../../third_party/softfloat/build/Linux-x86_64-GCC/softfloat.a -L../../../third_party/ramulator -lramulator' -o ../../../driver/rtlsim/librtlsim.so diff --git a/perf/cache/run.sh b/perf/cache/run.sh index ffb86e342..04285c389 100755 --- a/perf/cache/run.sh +++ b/perf/cache/run.sh @@ -10,17 +10,17 @@ sgemm() { echo "begin cache tests" -CONFIGS="-DICACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' > ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DDCACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DICACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DICACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> 
./perf/cache/cache_perf.log -echo -e "\n**************************************\n" >> ./perf/cache/cache_perf.log -CONFIGS="-DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> ./perf/cache/cache_perf.log +CONFIGS="-DICACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' > cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DDCACHE_NUM_WAYS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DICACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DICACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log +echo -e "\n**************************************\n" >> cache_perf.log +CONFIGS="-DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n64" --perf=1 | grep 'PERF' >> cache_perf.log echo "cache tests done!" 
} @@ -36,6 +36,6 @@ case $1 in -h | --help ) usage ;; * ) sgemm - ;; + ;; esac shift \ No newline at end of file diff --git a/runtime/Makefile b/runtime/Makefile index e5f8af74c..aecac00e1 100644 --- a/runtime/Makefile +++ b/runtime/Makefile @@ -3,6 +3,8 @@ include $(ROOT_DIR)/config.mk all: stub rtlsim simx opae xrt +vm: stub simx + stub: $(MAKE) -C stub diff --git a/runtime/common/common.h b/runtime/common/common.h index 1f718f938..b52d41058 100644 --- a/runtime/common/common.h +++ b/runtime/common/common.h @@ -13,11 +13,12 @@ #pragma once +#include #include #include #include #include -#include +#include #include #include @@ -25,7 +26,7 @@ #define CACHE_BLOCK_SIZE 64 -#define RAM_PAGE_SIZE 4096 +#define RAM_PAGE_SIZE 4096 // Please use MEM_PAGE_SIZE in VX_config.h #define ALLOC_BASE_ADDR USER_BASE_ADDR diff --git a/runtime/common/scope.cpp b/runtime/common/scope.cpp index 33b13cab4..8f8670944 100644 --- a/runtime/common/scope.cpp +++ b/runtime/common/scope.cpp @@ -28,7 +28,11 @@ #include #include -#define FRAME_FLUSH_SIZE 100 +#define SAMPLE_FLUSH_SIZE 100 + +#define TIMEOUT_TIME (60*60) + +#define MAX_DELAY_CYCLES 10000 #define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4) #define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4) @@ -39,6 +43,7 @@ #define CMD_GET_DATA 3 #define CMD_SET_START 4 #define CMD_SET_STOP 5 +#define CMD_SET_DEPTH 6 #define CHECK_ERR(_expr) \ do { \ @@ -58,8 +63,8 @@ struct tap_signal_t { struct tap_t { uint32_t id; uint32_t width; - uint32_t frames; - uint32_t cur_frame; + uint32_t samples; + uint32_t cur_sample; uint64_t cycle_time; std::string path; std::vector signals; @@ -67,6 +72,10 @@ struct tap_t { static scope_callback_t g_callback; +static bool g_running = false; + +static std::mutex g_stop_mutex; + using json = nlohmann::json; static std::vector split(const std::string &s, char delimiter) { @@ -90,7 +99,7 @@ static void dump_module(std::ofstream& ofs, auto itt = tails.find(name); if (itt != tails.end()) { for (auto& signal : 
itt->second->signals) { - ofs << indent << " $var reg " << signal.width << " " << signal.id << " " << signal.name << " $end" << std::endl; + ofs << indent << " $var wire " << signal.width << " " << signal.id << " " << signal.name << " $end" << std::endl; } } @@ -108,7 +117,7 @@ static void dump_header(std::ofstream& ofs, std::vector& taps) { ofs << "$version Generated by Vortex Scope Analyzer $end" << std::endl; ofs << "$timescale 1 ns $end" << std::endl; ofs << "$scope module TOP $end" << std::endl; - ofs << " $var reg 1 0 clk $end" << std::endl; + ofs << " $var wire 1 0 clk $end" << std::endl; std::unordered_map> hierarchy; std::unordered_set heads; @@ -135,22 +144,33 @@ static void dump_header(std::ofstream& ofs, std::vector& taps) { ofs << "enddefinitions $end" << std::endl; } -static tap_t* find_nearest_tap(std::vector& taps) { - tap_t* nearest = nullptr; +// return the earliest tap that has data to dump +static tap_t* find_earliest_tap(std::vector& taps) { + tap_t* earliest = nullptr; for (auto& tap : taps) { - if (tap.cur_frame == tap.frames) - continue; - if (nearest != nullptr) { - if (tap.cycle_time < nearest->cycle_time) - nearest = &tap; + if (tap.samples == 0) + continue; // skip empty taps + if (tap.cur_sample == tap.samples) + continue; // skip finished taps + if (earliest != nullptr) { + if (tap.cycle_time < earliest->cycle_time) + earliest = &tap; } else { - nearest = &tap; + earliest = &tap; } } - return nearest; + return earliest; } -static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cur_time) { +static uint64_t advance_clock(std::ofstream& ofs, uint64_t cur_time, uint64_t next_time) { + uint64_t delta = next_time - cur_time; + if (delta > MAX_DELAY_CYCLES) { + ofs << '#' << (cur_time * 2 + 0) << std::endl; + ofs << "bx 0" << std::endl; + ofs << '#' << (cur_time * 2 + 1) << std::endl; + ofs << "bx 0" << std::endl; + cur_time = next_time - MAX_DELAY_CYCLES; + } while (cur_time < next_time) { ofs << '#' << (cur_time * 2 + 
0) << std::endl; ofs << "b0 0" << std::endl; @@ -163,7 +183,7 @@ static uint64_t advance_time(std::ofstream& ofs, uint64_t next_time, uint64_t cu static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) { uint32_t signal_offset = 0; - uint32_t frame_offset = 0; + uint32_t sample_offset = 0; uint64_t word; std::vector signal_data(tap->width); @@ -176,24 +196,24 @@ static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) { CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data)); CHECK_ERR(g_callback.registerRead(hdevice, &word)); do { - uint32_t word_offset = frame_offset % 64; + uint32_t word_offset = sample_offset % 64; signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0'; ++signal_offset; - ++frame_offset; + ++sample_offset; if (signal_offset == signal_width) { signal_data[signal_width] = 0; // string null termination ofs << 'b' << signal_data.data() << ' ' << signal_it->id << std::endl; - if (frame_offset == tap->width) { - // end-of-frame - ++tap->cur_frame; - if (tap->cur_frame != tap->frames) { + if (sample_offset == tap->width) { + // end-of-sample + ++tap->cur_sample; + if (tap->cur_sample != tap->samples) { // read next delta CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data)); CHECK_ERR(g_callback.registerRead(hdevice, &word)); tap->cycle_time += 1 + word; - if (0 == (tap->cur_frame % FRAME_FLUSH_SIZE)) { + if (0 == (tap->cur_sample % SAMPLE_FLUSH_SIZE)) { ofs << std::flush; - std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_frame << "/" << tap->frames << " frames, next_time=" << tap->cycle_time << std::endl; + std::cout << std::dec << "[SCOPE] flush tap #" << tap->id << ": "<< tap->cur_sample << "/" << tap->samples << " samples, next_time=" << tap->cycle_time << std::endl; } } break; @@ -202,8 +222,8 @@ static int dump_tap(std::ofstream& ofs, tap_t* tap, vx_device_h hdevice) { ++signal_it; signal_width = signal_it->width; } - } while ((frame_offset % 64) != 
0); - } while (frame_offset != tap->width); + } while ((sample_offset % 64) != 0); + } while (sample_offset != tap->width); return 0; } @@ -241,6 +261,20 @@ int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t sta } } + // setup capture size + const char* capture_size_env = std::getenv("SCOPE_DEPTH"); + if (capture_size_env != nullptr) { + std::stringstream ss(capture_size_env); + uint32_t capture_size; + if (ss >> capture_size) { + for (auto& tap : json_obj["taps"]) { + auto id = tap["id"].get(); + uint64_t cmd_depth = (capture_size << 11) | (id << 3) | CMD_SET_DEPTH; + CHECK_ERR(g_callback.registerWrite(hdevice, cmd_depth)); + } + } + } + // set stop time if (stop_time != uint64_t(-1)) { std::cout << "[SCOPE] stop time: " << std::dec << stop_time << "s" << std::endl; @@ -261,13 +295,39 @@ int vx_scope_start(scope_callback_t* callback, vx_device_h hdevice, uint64_t sta } } + g_running = true; + + // create auto-stop thread + uint32_t timeout_time = TIMEOUT_TIME; + const char* env_timeout = std::getenv("SCOPE_TIMEOUT"); + if (env_timeout != nullptr) { + std::stringstream ss(env_timeout); + uint32_t env_value; + if (ss >> env_value) { + timeout_time = env_value; + std::cout << "[SCOPE] timeout time=" << env_value << std::endl; + } + } + std::thread([hdevice, timeout_time]() { + std::this_thread::sleep_for(std::chrono::seconds(timeout_time)); + std::cout << "[SCOPE] auto-stop timeout!" 
<< std::endl; + vx_scope_stop(hdevice); + }).detach(); + return 0; } int vx_scope_stop(vx_device_h hdevice) { + std::lock_guard lock(g_stop_mutex); + if (nullptr == hdevice) return -1; + if (!g_running) + return 0; + + g_running = false; + std::vector taps; { @@ -285,8 +345,8 @@ int vx_scope_stop(vx_device_h hdevice) { _tap.width = tap["width"].get(); _tap.path = tap["path"].get(); _tap.cycle_time = 0; - _tap.frames = 0; - _tap.cur_frame = 0; + _tap.samples = 0; + _tap.cur_sample = 0; for (auto& signal : tap["signals"]) { auto name = signal[0].get(); @@ -299,19 +359,15 @@ int vx_scope_stop(vx_device_h hdevice) { } } - // stop recording + std::cout << "[SCOPE] stop recording..." << std::endl; + for (auto& tap : taps) { uint64_t cmd_stop = (0 << 11) | (tap.id << 3) | CMD_SET_STOP; CHECK_ERR(g_callback.registerWrite(hdevice, cmd_stop)); } - std::cout << "[SCOPE] trace dump begin..." << std::endl; + std::cout << "[SCOPE] load trace info..." << std::endl; - std::ofstream ofs("scope.vcd"); - - dump_header(ofs, taps); - - // load trace info for (auto& tap : taps) { uint64_t count, start, delta; @@ -319,39 +375,51 @@ int vx_scope_stop(vx_device_h hdevice) { uint64_t cmd_count = (tap.id << 3) | CMD_GET_COUNT; CHECK_ERR(g_callback.registerWrite(hdevice, cmd_count)); CHECK_ERR(g_callback.registerRead(hdevice, &count)); + if (count == 0) + continue; // get start uint64_t cmd_start = (tap.id << 3) | CMD_GET_START; CHECK_ERR(g_callback.registerWrite(hdevice, cmd_start)); CHECK_ERR(g_callback.registerRead(hdevice, &start)); - // get data + // get delta uint64_t cmd_data = (tap.id << 3) | CMD_GET_DATA; CHECK_ERR(g_callback.registerWrite(hdevice, cmd_data)); CHECK_ERR(g_callback.registerRead(hdevice, &delta)); - tap.frames = count; + tap.samples = count; tap.cycle_time = 1 + start + delta; std::cout << std::dec << "[SCOPE] tap #" << tap.id << ": width=" << tap.width - << ", num_frames=" << tap.frames + << ", num_samples=" << tap.samples << ", start_time=" << tap.cycle_time << ", 
path=" << tap.path << std::endl; } - uint64_t cur_time = 0; + std::cout << "[SCOPE] dump header..." << std::endl; - while (true) { - // find the nearest tap - auto tap = find_nearest_tap(taps); - if (tap == nullptr) - break; + std::ofstream ofs("scope.vcd"); + + dump_header(ofs, taps); + + std::cout << "[SCOPE] dump taps..." << std::endl; + + uint64_t cur_time = 0; + auto tap = find_earliest_tap(taps); + if (tap != nullptr) { + do { + // advance clock + cur_time = advance_clock(ofs, cur_time, tap->cycle_time); + // dump tap + CHECK_ERR(dump_tap(ofs, tap, hdevice)); + // find the nearest tap + tap = find_earliest_tap(taps); + } while (tap != nullptr); // advance clock - cur_time = advance_time(ofs, tap->cycle_time, cur_time); - // dump tap - CHECK_ERR(dump_tap(ofs, tap, hdevice)); - }; + advance_clock(ofs, cur_time, cur_time + 1); + } std::cout << "[SCOPE] trace dump done! - " << (cur_time/2) << " cycles" << std::endl; diff --git a/runtime/include/vortex.h b/runtime/include/vortex.h index 8481002e1..4f1c93418 100644 --- a/runtime/include/vortex.h +++ b/runtime/include/vortex.h @@ -34,6 +34,10 @@ typedef void* vx_buffer_h; #define VX_CAPS_GLOBAL_MEM_SIZE 0x5 #define VX_CAPS_LOCAL_MEM_SIZE 0x6 #define VX_CAPS_ISA_FLAGS 0x7 +#define VX_CAPS_NUM_MEM_BANKS 0x8 +#define VX_CAPS_MEM_BANK_SIZE 0x9 +#define VX_CAPS_TC_SIZE 0xA +#define VX_CAPS_TC_NUM 0xB // device isa flags #define VX_ISA_STD_A (1ull << ISA_STD_A) @@ -65,6 +69,7 @@ typedef void* vx_buffer_h; #define VX_MEM_READ 0x1 #define VX_MEM_WRITE 0x2 #define VX_MEM_READ_WRITE 0x3 +#define VX_MEM_PIN_MEMORY 0x4 // open the device and connect to it int vx_dev_open(vx_device_h* hdevice); diff --git a/runtime/opae/Makefile b/runtime/opae/Makefile index 1a9810eca..04545c887 100644 --- a/runtime/opae/Makefile +++ b/runtime/opae/Makefile @@ -1,3 +1,4 @@ +ROOT_DIR := $(realpath ../..) 
include ../common.mk TARGET ?= opaesim @@ -8,8 +9,8 @@ SYN_DIR := $(HW_DIR)/syn/altera/opae SRC_DIR := $(VORTEX_HOME)/runtime/opae -CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors -CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(DESTDIR) +CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors +CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(DESTDIR) -I$(SIM_DIR)/common CXXFLAGS += -DXLEN_$(XLEN) # Position independent code @@ -24,10 +25,11 @@ SRCS = $(SRC_DIR)/vortex.cpp $(SRC_DIR)/driver.cpp # set up target types ifeq ($(TARGET), opaesim) - OPAESIM = $(DESTDIR)/libopae-c-sim.so - CXXFLAGS += -I$(SIM_DIR)/opaesim + BUILD_DEPS = $(DESTDIR)/libopae-c-sim.so + CXXFLAGS += -DOPAESIM -I$(SIM_DIR)/opaesim else - CXXFLAGS += -I$(SYN_DIR) + BUILD_DEPS = $(ROOT_DIR)/hw/syn/altera/opae/vortex_afu.h + CXXFLAGS += -I$(SYN_DIR) -I$(ROOT_DIR)/hw/syn/altera/opae endif # Debugging @@ -47,12 +49,15 @@ PROJECT := libvortex-opae.so all: $(DESTDIR)/$(PROJECT) +$(ROOT_DIR)/hw/syn/altera/opae/vortex_afu.h: + $(MAKE) -C $(ROOT_DIR)/hw/syn/altera/opae swconfig + driver: $(DESTDIR)/libopae-c-sim.so $(DESTDIR)/libopae-c-sim.so: DESTDIR=$(DESTDIR) $(MAKE) -C $(ROOT_DIR)/sim/opaesim $(DESTDIR)/libopae-c-sim.so -$(DESTDIR)/$(PROJECT): $(SRCS) $(OPAESIM) +$(DESTDIR)/$(PROJECT): $(SRCS) $(BUILD_DEPS) $(CXX) $(CXXFLAGS) $(SRCS) $(LDFLAGS) -o $@ clean-driver: diff --git a/runtime/opae/driver.h b/runtime/opae/driver.h index 0d1d4daa7..0a45b6f67 100644 --- a/runtime/opae/driver.h +++ b/runtime/opae/driver.h @@ -13,7 +13,11 @@ #pragma once +#ifdef OPAESIM #include +#else +#include +#endif typedef fpga_result (*pfn_fpgaGetProperties)(fpga_token token, fpga_properties *prop); typedef fpga_result (*pfn_fpgaPropertiesSetObjectType)(fpga_properties prop, fpga_objtype objtype); diff --git a/runtime/opae/vortex.cpp b/runtime/opae/vortex.cpp index 390d5acc4..38ee514ab 100755 --- a/runtime/opae/vortex.cpp +++ b/runtime/opae/vortex.cpp @@ -163,11 +163,6 @@ 
public: }); { - // retrieve FPGA global memory size - CHECK_FPGA_ERR(api_.fpgaPropertiesGetLocalMemorySize(filter, &global_mem_size_), { - global_mem_size_ = GLOBAL_MEM_SIZE; - }); - // Load ISA CAPS CHECK_FPGA_ERR(api_.fpgaReadMMIO64(fpga_, 0, MMIO_ISA_CAPS, &isa_caps_), { api_.fpgaClose(fpga_); @@ -179,6 +174,12 @@ public: api_.fpgaClose(fpga_); return -1; }); + + // Determine global memory size + uint64_t num_banks, bank_size; + this->get_caps(VX_CAPS_NUM_MEM_BANKS, &num_banks); + this->get_caps(VX_CAPS_MEM_BANK_SIZE, &bank_size); + global_mem_size_ = num_banks * bank_size; } #ifdef SCOPE @@ -194,11 +195,10 @@ public: return device->api_.fpgaReadMMIO64(device->fpga_, 0, MMIO_SCOPE_READ, value); }; - int ret = vx_scope_start(&callback, this, 0, -1); - if (ret != 0) { + CHECK_ERR(vx_scope_start(&callback, this, -1, -1), { api_.fpgaClose(fpga_); - return ret; - } + return err; + }); } #endif return 0; @@ -206,7 +206,6 @@ public: int get_caps(uint32_t caps_id, uint64_t * value) { uint64_t _value; - switch (caps_id) { case VX_CAPS_VERSION: _value = (dev_caps_ >> 0) & 0xff; @@ -227,11 +226,17 @@ public: _value = global_mem_size_; break; case VX_CAPS_LOCAL_MEM_SIZE: - _value = 1ull << ((dev_caps_ >> 48) & 0xff); + _value = 1ull << ((dev_caps_ >> 40) & 0xff); break; case VX_CAPS_ISA_FLAGS: _value = isa_caps_; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = 1 << ((dev_caps_ >> 48) & 0x7); + break; + case VX_CAPS_MEM_BANK_SIZE: + _value = 1ull << (20 + ((dev_caps_ >> 51) & 0x1f)); + break; default: fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); diff --git a/runtime/rtlsim/Makefile b/runtime/rtlsim/Makefile index f6adbf8c8..a7b15d9ac 100644 --- a/runtime/rtlsim/Makefile +++ b/runtime/rtlsim/Makefile @@ -4,7 +4,7 @@ DESTDIR ?= $(CURDIR)/.. 
SRC_DIR := $(VORTEX_HOME)/runtime/rtlsim -CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors +CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(SIM_DIR)/rtlsim -I$(COMMON_DIR) -I$(SIM_DIR)/common CXXFLAGS += -DXLEN_$(XLEN) diff --git a/runtime/rtlsim/vortex.cpp b/runtime/rtlsim/vortex.cpp index c75a6c12f..ccf61e16f 100644 --- a/runtime/rtlsim/vortex.cpp +++ b/runtime/rtlsim/vortex.cpp @@ -77,6 +77,12 @@ public: case VX_CAPS_ISA_FLAGS: _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = PLATFORM_MEMORY_NUM_BANKS; + break; + case VX_CAPS_MEM_BANK_SIZE: + _value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS); + break; default: std::cout << "invalid caps id: " << caps_id << std::endl; std::abort(); diff --git a/runtime/simx/Makefile b/runtime/simx/Makefile index c20e33b53..9480f5b6a 100644 --- a/runtime/simx/Makefile +++ b/runtime/simx/Makefile @@ -4,7 +4,7 @@ DESTDIR ?= $(CURDIR)/.. 
SRC_DIR := $(VORTEX_HOME)/runtime/simx -CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors +CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors CXXFLAGS += -fPIC -Wno-maybe-uninitialized CXXFLAGS += -I$(INC_DIR) -I../common -I$(ROOT_DIR)/hw -I$(SIM_DIR)/simx -I$(COMMON_DIR) -I$(SIM_DIR)/common CXXFLAGS += $(CONFIGS) @@ -42,4 +42,4 @@ clean-runtime: clean: clean-driver clean-runtime -.PHONY: all driver clean-driver clean-runtime clean \ No newline at end of file +.PHONY: all driver clean-driver clean-runtime clean diff --git a/runtime/simx/vortex.cpp b/runtime/simx/vortex.cpp index 89856f3a0..d4febe684 100644 --- a/runtime/simx/vortex.cpp +++ b/runtime/simx/vortex.cpp @@ -27,24 +27,49 @@ #include #include +#include +#ifdef VM_ENABLE +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#endif + using namespace vortex; class vx_device { public: - vx_device() - : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES) - , ram_(0, RAM_PAGE_SIZE) - , processor_(arch_) - , global_mem_(ALLOC_BASE_ADDR, - GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, - RAM_PAGE_SIZE, - CACHE_BLOCK_SIZE) - { - // attach memory module - processor_.attach_ram(&ram_); - } + vx_device() + : arch_(NUM_THREADS, NUM_WARPS, NUM_CORES) + , ram_(0, MEM_PAGE_SIZE) + , processor_(arch_) + , global_mem_(ALLOC_BASE_ADDR, GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR, MEM_PAGE_SIZE, CACHE_BLOCK_SIZE) + { + // attach memory module + processor_.attach_ram(&ram_); +#ifdef VM_ENABLE + std::cout << "*** VM ENABLED!! 
***"<< std::endl; + CHECK_ERR(init_VM(), ); +#endif + } ~vx_device() { +#ifdef VM_ENABLE + global_mem_.release(PAGE_TABLE_BASE_ADDR); + // for (auto i = addr_mapping.begin(); i != addr_mapping.end(); i++) + // page_table_mem_->release(i->second << MEM_PAGE_SIZE); + delete virtual_mem_; + delete page_table_mem_; +#endif if (future_.valid()) { future_.wait(); } @@ -69,6 +94,12 @@ public: case VX_CAPS_NUM_CORES: _value = NUM_CORES * NUM_CLUSTERS; break; + case VX_CAPS_TC_SIZE: + _value = TC_SIZE; + break; + case VX_CAPS_TC_NUM: + _value = TC_NUM; + break; case VX_CAPS_CACHE_LINE_SIZE: _value = CACHE_BLOCK_SIZE; break; @@ -81,6 +112,12 @@ public: case VX_CAPS_ISA_FLAGS: _value = ((uint64_t(MISA_EXT))<<32) | ((log2floor(XLEN)-4) << 30) | MISA_STD; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = PLATFORM_MEMORY_NUM_BANKS; + break; + case VX_CAPS_MEM_BANK_SIZE: + _value = 1ull << (MEM_ADDR_WIDTH / PLATFORM_MEMORY_NUM_BANKS); + break; default: std::cout << "invalid caps id: " << caps_id << std::endl; std::abort(); @@ -90,35 +127,131 @@ public: return 0; } - int mem_alloc(uint64_t size, int flags, uint64_t* dev_addr) { - uint64_t addr; - CHECK_ERR(global_mem_.allocate(size, &addr), { +#ifdef VM_ENABLE + + // physical (ppn) to virtual (vpn) mapping + uint64_t map_p2v(uint64_t ppn, uint32_t flags) + { + DBGPRINT(" [RT:MAP_P2V] ppn: %lx\n", ppn); + if (addr_mapping.find(ppn) != addr_mapping.end()) return addr_mapping[ppn]; + + // If ppn to vpn mapping doesnt exist, create mapping + DBGPRINT(" [RT:MAP_P2V] Not found. 
Allocate new page table or update a PTE.\n"); + uint64_t vpn; + virtual_mem_->allocate(MEM_PAGE_SIZE, &vpn); + vpn = vpn >> MEM_PAGE_LOG2_SIZE; + CHECK_ERR(update_page_table(ppn, vpn, flags),); + addr_mapping[ppn] = vpn; + return vpn; + } + + bool need_trans(uint64_t dev_pAddr) + { + + // Check if the satp is set and BARE mode + if (processor_.is_satp_unset() || get_mode() == BARE) + return 0; + + // Check if the address is reserved for system usage + // bool isReserved = (PAGE_TABLE_BASE_ADDR <= dev_pAddr && dev_pAddr < PAGE_TABLE_BASE_ADDR + PT_SIZE_LIMIT); + if (PAGE_TABLE_BASE_ADDR <= dev_pAddr) + return 0; + + // Check if the address is reserved for IO usage + if (dev_pAddr < USER_BASE_ADDR) + return 0; + // Check if the address falls within the startup address range + if ((STARTUP_ADDR <= dev_pAddr) && (dev_pAddr <= (STARTUP_ADDR + 0x40000))) + return 0; + + // Now all conditions are not met. Return true because the address needs translation + return 1; + } + + uint64_t phy_to_virt_map(uint64_t size, uint64_t *dev_pAddr, uint32_t flags) + { + DBGPRINT(" [RT:PTV_MAP] size = 0x%lx, dev_pAddr= 0x%lx, flags = 0x%x\n", size, *dev_pAddr, flags); + DBGPRINT(" [RT:PTV_MAP] bit mode: %d\n", XLEN); + + if (!need_trans(*dev_pAddr)) + { + DBGPRINT(" [RT:PTV_MAP] Translation is not needed.\n"); + return 0; + } + + uint64_t init_pAddr = *dev_pAddr; + uint64_t init_vAddr = (map_p2v(init_pAddr >> MEM_PAGE_LOG2_SIZE, flags) << MEM_PAGE_LOG2_SIZE) | (init_pAddr & ((1 << MEM_PAGE_LOG2_SIZE) - 1)); + uint64_t ppn = 0, vpn = 0; + + // dev_pAddr can be of size greater than a page, but we have to map and update + // page tables on a page table granularity. So divide the allocation into pages. 
+ // FUTURE Work: Super Page + for (ppn = (*dev_pAddr >> MEM_PAGE_LOG2_SIZE); ppn < ((*dev_pAddr) >> MEM_PAGE_LOG2_SIZE) + (size >> MEM_PAGE_LOG2_SIZE) ; ppn++) + { + vpn = map_p2v(ppn, flags) >> MEM_PAGE_LOG2_SIZE; + DBGPRINT(" [RT:PTV_MAP] Search vpn in page table:0x%lx\n", vpn); + // Currently a 1-1 mapping is used, this can be changed here to support different + // mapping schemes + } + DBGPRINT(" [RT:PTV_MAP] Mapped virtual addr: 0x%lx to physical addr: 0x%lx\n", init_vAddr, init_pAddr); + // Sanity check + assert(page_table_walk(init_vAddr) == init_pAddr && "ERROR: translated virtual Addresses are not the same with physical Address\n"); + + *dev_pAddr = init_vAddr; // commit vpn to be returned to host + DBGPRINT(" [RT:PTV_MAP] Translated device virtual addr: 0x%lx\n", *dev_pAddr); + + return 0; + } +#endif + + int mem_alloc(uint64_t size, int flags, uint64_t *dev_addr) + { + uint64_t asize = aligned_size(size, MEM_PAGE_SIZE); + uint64_t addr = 0; + + DBGPRINT("[RT:mem_alloc] size: 0x%lx, asize, 0x%lx,flag : 0x%d\n", size, asize, flags); + // HW: when vm is supported this global_mem_ should be virtual memory allocator + CHECK_ERR(global_mem_.allocate(asize, &addr), { return err; }); - CHECK_ERR(this->mem_access(addr, size, flags), { + CHECK_ERR(this->mem_access(addr, asize, flags), { global_mem_.release(addr); return err; }); *dev_addr = addr; +#ifdef VM_ENABLE + // VM address translation + phy_to_virt_map(asize, dev_addr, flags); +#endif return 0; } - int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) { - CHECK_ERR(global_mem_.reserve(dev_addr, size), { + int mem_reserve(uint64_t dev_addr, uint64_t size, int flags) + { + uint64_t asize = aligned_size(size, MEM_PAGE_SIZE); + CHECK_ERR(global_mem_.reserve(dev_addr, asize), { return err; }); - CHECK_ERR(this->mem_access(dev_addr, size, flags), { + DBGPRINT("[RT:mem_reserve] addr: 0x%lx, asize:0x%lx, size: 0x%lx\n", dev_addr, asize, size); + CHECK_ERR(this->mem_access(dev_addr, asize, flags), { 
global_mem_.release(dev_addr); return err; }); return 0; } - int mem_free(uint64_t dev_addr) { + int mem_free(uint64_t dev_addr) + { +#ifdef VM_ENABLE + uint64_t paddr = page_table_walk(dev_addr); + return global_mem_.release(paddr); +#else return global_mem_.release(dev_addr); +#endif } - int mem_access(uint64_t dev_addr, uint64_t size, int flags) { + int mem_access(uint64_t dev_addr, uint64_t size, int flags) + { uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (dev_addr + asize > GLOBAL_MEM_SIZE) return -1; @@ -127,7 +260,8 @@ public: return 0; } - int mem_info(uint64_t* mem_free, uint64_t* mem_used) const { + int mem_info(uint64_t *mem_free, uint64_t *mem_used) const + { if (mem_free) *mem_free = global_mem_.free(); if (mem_used) @@ -135,16 +269,31 @@ public: return 0; } - int upload(uint64_t dest_addr, const void* src, uint64_t size) { + int upload(uint64_t dest_addr, const void *src, uint64_t size) + { uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (dest_addr + asize > GLOBAL_MEM_SIZE) return -1; +#ifdef VM_ENABLE + uint64_t pAddr = page_table_walk(dest_addr); + // uint64_t pAddr; + // try { + // pAddr = page_table_walk(dest_addr); + // } catch ( Page_Fault_Exception ) { + // // HW: place holder + // // should be virt_to_phy_map here + // phy_to_virt_map(0, dest_addr, 0); + // } + DBGPRINT(" [RT:upload] Upload data to vAddr = 0x%lx (pAddr=0x%lx)\n", dest_addr, pAddr); + dest_addr = pAddr; //Overwirte +#endif ram_.enable_acl(false); - ram_.write((const uint8_t*)src, dest_addr, size); + ram_.write((const uint8_t *)src, dest_addr, size); ram_.enable_acl(true); - /*DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr); + /* + DBGPRINT("upload %ld bytes to 0x%lx\n", size, dest_addr); for (uint64_t i = 0; i < size && i < 1024; i += 4) { DBGPRINT(" 0x%lx <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + i)); }*/ @@ -152,13 +301,19 @@ public: return 0; } - int download(void* dest, uint64_t src_addr, uint64_t size) { + int download(void 
*dest, uint64_t src_addr, uint64_t size) + { uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); if (src_addr + asize > GLOBAL_MEM_SIZE) return -1; +#ifdef VM_ENABLE + uint64_t pAddr = page_table_walk(src_addr); + DBGPRINT(" [RT:download] Download data to vAddr = 0x%lx (pAddr=0x%lx)\n", src_addr, pAddr); + src_addr = pAddr; //Overwirte +#endif ram_.enable_acl(false); - ram_.read((uint8_t*)dest, src_addr, size); + ram_.read((uint8_t *)dest, src_addr, size); ram_.enable_acl(true); /*DBGPRINT("download %ld bytes from 0x%lx\n", size, src_addr); @@ -169,9 +324,11 @@ public: return 0; } - int start(uint64_t krnl_addr, uint64_t args_addr) { + int start(uint64_t krnl_addr, uint64_t args_addr) + { // ensure prior run completed - if (future_.valid()) { + if (future_.valid()) + { future_.wait(); } @@ -182,9 +339,8 @@ public: this->dcr_write(VX_DCR_BASE_STARTUP_ARG1, args_addr >> 32); // start new run - future_ = std::async(std::launch::async, [&]{ - processor_.run(); - }); + future_ = std::async(std::launch::async, [&] + { processor_.run(); }); // clear mpm cache mpm_cache_.clear(); @@ -192,12 +348,14 @@ public: return 0; } - int ready_wait(uint64_t timeout) { + int ready_wait(uint64_t timeout) + { if (!future_.valid()) return 0; uint64_t timeout_sec = timeout / 1000; std::chrono::seconds wait_time(1); - for (;;) { + for (;;) + { // wait for 1 sec and check status auto status = future_.wait_for(wait_time); if (status == std::future_status::ready) @@ -208,8 +366,10 @@ public: return 0; } - int dcr_write(uint32_t addr, uint32_t value) { - if (future_.valid()) { + int dcr_write(uint32_t addr, uint32_t value) + { + if (future_.valid()) + { future_.wait(); // ensure prior run completed } processor_.dcr_write(addr, value); @@ -217,15 +377,18 @@ public: return 0; } - int dcr_read(uint32_t addr, uint32_t* value) const { + int dcr_read(uint32_t addr, uint32_t *value) const + { return dcrs_.read(addr, value); } - int mpm_query(uint32_t addr, uint32_t core_id, uint64_t* value) { + 
int mpm_query(uint32_t addr, uint32_t core_id, uint64_t *value) + { uint32_t offset = addr - VX_CSR_MPM_BASE; if (offset > 31) return -1; - if (mpm_cache_.count(core_id) == 0) { + if (mpm_cache_.count(core_id) == 0) + { uint64_t mpm_mem_addr = IO_MPM_ADDR + core_id * 32 * sizeof(uint64_t); CHECK_ERR(this->download(mpm_cache_[core_id].data(), mpm_mem_addr, 32 * sizeof(uint64_t)), { return err; @@ -234,15 +397,281 @@ public: *value = mpm_cache_.at(core_id).at(offset); return 0; } +#ifdef VM_ENABLE + /* VM Management */ + + // Initialize to zero the target page table area. 32bit 4K, 64bit 8K + uint16_t init_page_table(uint64_t addr, uint64_t size) + { + uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE); + DBGPRINT(" [RT:init_page_table] (addr=0x%lx, size=0x%lx)\n", addr, asize); + uint8_t *src = new uint8_t[asize]; + if (src == NULL) + return 1; + + for (uint64_t i = 0; i < asize; ++i) + { + src[i] = 0; + } + ram_.enable_acl(false); + ram_.write((const uint8_t *)src, addr, asize); + ram_.enable_acl(true); + return 0; + } + + uint8_t alloc_page_table (uint64_t * pt_addr) + { + CHECK_ERR(page_table_mem_->allocate(PT_SIZE, pt_addr), { return err; }); + CHECK_ERR(init_page_table(*pt_addr, PT_SIZE), { return err; }); + DBGPRINT(" [RT:alloc_page_table] addr= 0x%lx\n", *pt_addr); + return 0; + } + + // reserve IO space, startup space, and local mem area + int virtual_mem_reserve(uint64_t dev_addr, uint64_t size, int flags) + { + CHECK_ERR(virtual_mem_->reserve(dev_addr, size), { + return err; + }); + DBGPRINT("[RT:mem_reserve] addr: 0x%lx, size:0x%lx, size: 0x%lx\n", dev_addr, size, size); + return 0; + } + + int16_t init_VM() + { + uint64_t pt_addr = 0; + // Reserve space for PT + DBGPRINT("[RT:init_VM] Initialize VM\n"); + DBGPRINT("* VM_ADDR_MODE=0x%lx", VM_ADDR_MODE); + DBGPRINT("* PAGE_TABLE_BASE_ADDR=0x%lx", PAGE_TABLE_BASE_ADDR); + DBGPRINT("* PT_LEVEL=0x%lx", PT_LEVEL); + DBGPRINT("* PT_SIZE=0x%lx", PT_SIZE); + DBGPRINT("* PTE_SIZE=0x%lx", PTE_SIZE); + 
DBGPRINT("* TLB_SIZE=0x%lx", TLB_SIZE); + CHECK_ERR(mem_reserve(PAGE_TABLE_BASE_ADDR, PT_SIZE_LIMIT, VX_MEM_READ_WRITE), { + return err; + }); + page_table_mem_ = new MemoryAllocator (PAGE_TABLE_BASE_ADDR, PT_SIZE_LIMIT, MEM_PAGE_SIZE, CACHE_BLOCK_SIZE); + if (page_table_mem_ == NULL) + { + CHECK_ERR(this->mem_free(PAGE_TABLE_BASE_ADDR),); + return 1; + } + + // HW: virtual mem allocator has the same address range as global_mem. next step is to adjust it + virtual_mem_ = new MemoryAllocator(ALLOC_BASE_ADDR, (GLOBAL_MEM_SIZE - ALLOC_BASE_ADDR), MEM_PAGE_SIZE, CACHE_BLOCK_SIZE); + CHECK_ERR(virtual_mem_reserve(PAGE_TABLE_BASE_ADDR, (GLOBAL_MEM_SIZE - PAGE_TABLE_BASE_ADDR), VX_MEM_READ_WRITE), { + return err; + }); + CHECK_ERR(virtual_mem_reserve(STARTUP_ADDR, 0x40000, VX_MEM_READ_WRITE), { + return err; + }); + + if (virtual_mem_ == nullptr) { + // virtual_mem_ does not intefere with physical mem, so no need to free space + + return 1; + } + + if (VM_ADDR_MODE == BARE) + DBGPRINT("[RT:init_VM] VA_MODE = BARE MODE(addr= 0x0)"); + else + CHECK_ERR(alloc_page_table(&pt_addr),{return err;}); + + CHECK_ERR(processor_.set_satp_by_addr(pt_addr),{return err;}); + return 0; + } + + // Return value in in ptbr + uint64_t get_base_ppn() + { + return processor_.get_base_ppn(); + } + uint64_t get_pte_address(uint64_t base_ppn, uint64_t vpn) + { + return (base_ppn * PT_SIZE) + (vpn * PTE_SIZE); + } + + uint8_t get_mode() + { + return processor_.get_satp_mode(); + } + + int16_t update_page_table(uint64_t ppn, uint64_t vpn, uint32_t flag) + { + DBGPRINT(" [RT:Update PT] Mapping vpn 0x%05lx to ppn 0x%05lx(flags = %u)\n", vpn, ppn, flag); + // sanity check +#if VM_ADDR_MODE == SV39 + assert((((ppn >> 44) == 0) && ((vpn >> 27) == 0)) && "Upper bits are not zero!"); + uint8_t level = 3; +#else // Default is SV32, BARE will not reach this point. 
+ assert((((ppn >> 20) == 0) && ((vpn >> 20) == 0)) && "Upper 12 bits are not zero!"); + uint8_t level = 2; +#endif + int i = level - 1; + vAddr_t vaddr(vpn << MEM_PAGE_LOG2_SIZE); + uint64_t pte_addr = 0, pte_bytes = 0; + uint64_t pt_addr = 0; + uint64_t cur_base_ppn = get_base_ppn(); + + while (i >= 0) + { + DBGPRINT(" [RT:Update PT]Start %u-level page table\n", i); + pte_addr = get_pte_address(cur_base_ppn, vaddr.vpn[i]); + pte_bytes = read_pte(pte_addr); + PTE_t pte_chk(pte_bytes); + DBGPRINT(" [RT:Update PT] PTE addr 0x%lx, PTE bytes 0x%lx\n", pte_addr, pte_bytes); + if (pte_chk.v == 1 && ((pte_bytes & 0xFFFFFFFF) != 0xbaadf00d)) + { + DBGPRINT(" [RT:Update PT] PTE valid (ppn 0x%lx), continuing the walk...\n", pte_chk.ppn); + cur_base_ppn = pte_chk.ppn; + } + else + { + // If valid bit not set, allocate a next level page table + DBGPRINT(" [RT:Update PT] PTE Invalid (ppn 0x%lx) ...\n", pte_chk.ppn); + if (i == 0) + { + // Reach to leaf + DBGPRINT(" [RT:Update PT] Reached to level 0. 
This should be a leaf node(flag = %x) \n",flag); + uint32_t pte_flag = (flag << 1) | 0x3; + PTE_t new_pte(ppn < 0x%x\n", addr + i, *(uint64_t*)((uint8_t*)dest + i)); + // } + // } + + void write_pte(uint64_t addr, uint64_t value = 0xbaadf00d) + { + DBGPRINT(" [RT:Write_pte] writing pte 0x%lx to pAddr: 0x%lx\n", value, addr); + uint8_t *src = new uint8_t[PTE_SIZE]; + for (uint64_t i = 0; i < PTE_SIZE; ++i) + { + src[i] = (value >> (i << 3)) & 0xff; + } + // std::cout << "writing PTE to RAM addr 0x" << std::hex << addr << std::endl; + ram_.enable_acl(false); + ram_.write((const uint8_t *)src, addr, PTE_SIZE); + ram_.enable_acl(true); + } + + uint64_t read_pte(uint64_t addr) + { + uint8_t *dest = new uint8_t[PTE_SIZE]; +#ifdef XLEN_32 + uint64_t mask = 0x00000000FFFFFFFF; +#else // 64bit + uint64_t mask = 0xFFFFFFFFFFFFFFFF; +#endif + + ram_.read((uint8_t *)dest, addr, PTE_SIZE); + uint64_t ret = (*(uint64_t *)((uint8_t *)dest)) & mask; + DBGPRINT(" [RT:read_pte] reading PTE 0x%lx from RAM addr 0x%lx\n", ret, addr); + + return ret; + } +#endif // VM_ENABLE private: - Arch arch_; - RAM ram_; - Processor processor_; - MemoryAllocator global_mem_; - DeviceConfig dcrs_; - std::future future_; + Arch arch_; + RAM ram_; + Processor processor_; + MemoryAllocator global_mem_; + DeviceConfig dcrs_; + std::future future_; std::unordered_map> mpm_cache_; +#ifdef VM_ENABLE + std::unordered_map addr_mapping; // HW: key: ppn; value: vpn + MemoryAllocator* page_table_mem_; + MemoryAllocator* virtual_mem_; +#endif }; -#include \ No newline at end of file +#include diff --git a/runtime/stub/Makefile b/runtime/stub/Makefile index ae6e27ed1..8315bd8af 100644 --- a/runtime/stub/Makefile +++ b/runtime/stub/Makefile @@ -4,7 +4,7 @@ DESTDIR ?= $(CURDIR)/.. 
SRC_DIR := $(VORTEX_HOME)/runtime/stub -CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors +CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(SIM_DIR)/common CXXFLAGS += -fPIC diff --git a/runtime/stub/utils.cpp b/runtime/stub/utils.cpp index 9826db711..220f916ae 100644 --- a/runtime/stub/utils.cpp +++ b/runtime/stub/utils.cpp @@ -211,6 +211,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { uint64_t mem_reads = 0; uint64_t mem_writes = 0; uint64_t mem_lat = 0; + uint64_t mem_bank_stalls = 0; uint64_t num_cores; CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_CORES, &num_cores), { @@ -222,6 +223,11 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { return err; }); + uint64_t num_mem_bank_ports; + CHECK_ERR(vx_dev_caps(hdevice, VX_CAPS_NUM_MEM_BANKS, &num_mem_bank_ports), { + return err; + }); + bool icache_enable = isa_flags & VX_ISA_EXT_ICACHE; bool dcache_enable = isa_flags & VX_ISA_EXT_DCACHE; bool l2cache_enable = isa_flags & VX_ISA_EXT_L2CACHE; @@ -430,6 +436,8 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: core%d: icache mshr stalls=%ld (utilization=%d%%)\n", core_id, icache_mshr_stalls, mshr_utilization); } + uint64_t dcache_requests_per_core = 0; + if (dcache_enable) { // PERF: Dcache uint64_t dcache_reads; @@ -440,6 +448,7 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_WRITES, core_id, &dcache_writes), { return err; }); + dcache_requests_per_core += dcache_reads + dcache_writes; uint64_t dcache_read_misses; CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_DCACHE_MISS_R, core_id, &dcache_read_misses), { return err; @@ -468,6 +477,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld (utilization=%d%%)\n", core_id, dcache_mshr_stalls, mshr_utilization); } + // PERF: coalescer + uint64_t 
coalescer_misses; + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_COALESCER_MISS, core_id, &coalescer_misses), { + return err; + }); + int coalescer_utilization = calcAvgPercent(dcache_requests_per_core - coalescer_misses, dcache_requests_per_core); + fprintf(stream, "PERF: core%d: coalescer misses=%ld (hit ratio=%d%%)\n", core_id, coalescer_misses, coalescer_utilization); + if (l2cache_enable) { // PERF: L2cache uint64_t tmp; @@ -533,6 +550,9 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_LT, core_id, &mem_lat), { return err; }); + CHECK_ERR(vx_mpm_query(hdevice, VX_CSR_MPM_MEM_BANK_ST, core_id, &mem_bank_stalls), { + return err; + }); } } break; default: @@ -608,9 +628,14 @@ extern int vx_dump_perf(vx_device_h hdevice, FILE* stream) { fprintf(stream, "PERF: l3cache mshr stalls=%ld (utilization=%d%%)\n", l3cache_mshr_stalls, mshr_utilization); } - int mem_avg_lat = caclAverage(mem_lat, mem_reads); - fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes); - fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); + { + uint64_t mem_requests = mem_reads + mem_writes; + int mem_avg_lat = caclAverage(mem_lat, mem_reads); + int mem_bank_utilization = calcAvgPercent(mem_requests, mem_requests + mem_bank_stalls); + fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", mem_requests, mem_reads, mem_writes); + fprintf(stream, "PERF: memory latency=%d cycles\n", mem_avg_lat); + fprintf(stream, "PERF: memory bank stalls=%ld (utilization=%d%%)\n", mem_bank_stalls, mem_bank_utilization); + } } break; default: break; diff --git a/runtime/xrt/Makefile b/runtime/xrt/Makefile index 66d3e481b..f255002f2 100644 --- a/runtime/xrt/Makefile +++ b/runtime/xrt/Makefile @@ -6,8 +6,9 @@ DESTDIR ?= $(CURDIR)/.. 
SRC_DIR := $(VORTEX_HOME)/runtime/xrt -CXXFLAGS += -std=c++14 -Wall -Wextra -Wfatal-errors +CXXFLAGS += -std=c++17 -Wall -Wextra -Wfatal-errors CXXFLAGS += -I$(INC_DIR) -I$(COMMON_DIR) -I$(ROOT_DIR)/hw -I$(XILINX_XRT)/include -I$(SIM_DIR)/common +CXXFLAGS += -DXLEN_$(XLEN) CXXFLAGS += -fPIC LDFLAGS += -shared -pthread @@ -39,6 +40,11 @@ ifdef SCOPE SRCS += $(COMMON_DIR)/scope.cpp endif +# Enable ILA logic analyzer +ifdef CHIPSCOPE + CXXFLAGS += -DCHIPSCOPE +endif + all: $(DESTDIR)/$(PROJECT) driver: $(DESTDIR)/libxrtsim.so diff --git a/runtime/xrt/vortex.cpp b/runtime/xrt/vortex.cpp index 408bf23ed..0942c700d 100644 --- a/runtime/xrt/vortex.cpp +++ b/runtime/xrt/vortex.cpp @@ -18,15 +18,15 @@ #endif // XRT includes -#ifndef XRTSIM +#ifdef XRTSIM +#include +#else #include "experimental/xrt_bo.h" #include "experimental/xrt_device.h" #include "experimental/xrt_error.h" #include "experimental/xrt_ip.h" #include "experimental/xrt_kernel.h" #include "experimental/xrt_xclbin.h" -#else -#include #endif #include @@ -46,10 +46,10 @@ using namespace vortex; #define MMIO_CTL_ADDR 0x00 #define MMIO_DEV_ADDR 0x10 -#define MMIO_ISA_ADDR 0x1C -#define MMIO_DCR_ADDR 0x28 -#define MMIO_SCP_ADDR 0x34 -#define MMIO_MEM_ADDR 0x40 +#define MMIO_ISA_ADDR 0x18 +#define MMIO_DCR_ADDR 0x20 +#define MMIO_SCP_ADDR 0x28 +#define MMIO_MEM_ADDR 0x30 #define CTL_AP_START (1 << 0) #define CTL_AP_DONE (1 << 1) @@ -58,21 +58,6 @@ using namespace vortex; #define CTL_AP_RESET (1 << 4) #define CTL_AP_RESTART (1 << 7) -struct platform_info_t { - const char *prefix_name; - uint8_t lg2_num_banks; - uint8_t lg2_bank_size; - uint64_t mem_base; -}; - -static const platform_info_t g_platforms[] = { - {"vortex_xrtsim", 4, 0x10, 0x0}, // 64 KB banks - {"xilinx_u50", 4, 0x1C, 0x0}, // 16 MB banks - {"xilinx_u200", 4, 0x1C, 0x0}, // 16 MB banks - {"xilinx_u280", 4, 0x1C, 0x0}, // 16 MB banks - {"xilinx_vck5000", 0, 0x21, 0xC000000000}, -}; - #ifdef CPP_API typedef xrt::device xrt_device_t; @@ -110,25 +95,6 @@ 
static void dump_xrt_error(xrtDeviceHandle xrtDevice, xrtErrorCode err) { } #endif -static int get_platform_info(const std::string &device_name, - platform_info_t *platform_info) { - for (size_t i = 0; i < (sizeof(g_platforms) / sizeof(platform_info_t)); ++i) { - auto &platform = g_platforms[i]; - if (device_name.rfind(platform.prefix_name, 0) == 0) { - *platform_info = platform; - return 0; - } - } - return -1; -} - -/* -static void wait_for_enter(const std::string &msg) { - std::cout << msg << std::endl; - std::cin.ignore(std::numeric_limits::max(), '\n'); -} -*/ - /////////////////////////////////////////////////////////////////////////////// class vx_device { @@ -185,58 +151,6 @@ public: auto xclbin = xrt::xclbin(xlbin_path_s); auto device_name = xrtDevice.get_info(); - /*{ - uint32_t num_banks = 0; - uint64_t bank_size = 0; - uint64_t mem_base = 0; - - auto mem_json = - nlohmann::json::parse(xrtDevice.get_info()); if - (!mem_json.is_null()) { uint32_t index = 0; for (auto& mem : - mem_json["board"]["memory"]["memories"]) { auto enabled = - mem["enabled"].get(); if (enabled == "true") { if (index == 0) - { mem_base = std::stoull(mem["base_address"].get(), nullptr, - 16); bank_size = std::stoull(mem["range_bytes"].get(), nullptr, - 16); - } - ++index; - } - } - num_banks = index; - } - - fprintf(stderr, "[VXDRV] memory description: base=0x%lx, size=0x%lx, - count=%d\n", mem_base, bank_size, num_banks); - }*/ - - /*{ - std::cout << "Device" << device_index << " : " << - xrtDevice.get_info() << std::endl; std::cout << " - bdf : " << xrtDevice.get_info() << std::endl; - std::cout << " kdma : " << - xrtDevice.get_info() << std::endl; std::cout << " - max_freq : " << - xrtDevice.get_info() << - std::endl; std::cout << " memory : " << - xrtDevice.get_info() << std::endl; std::cout << " - thermal : " << xrtDevice.get_info() << - std::endl; std::cout << " m2m : " << std::boolalpha << - xrtDevice.get_info() << std::dec << std::endl; - std::cout << " nodma : " << 
std::boolalpha << - xrtDevice.get_info() << std::dec << std::endl; - - std::cout << "Memory info :" << std::endl; - for (const auto& mem_bank : xclbin.get_mems()) { - std::cout << " index : " << mem_bank.get_index() << std::endl; - std::cout << " tag : " << mem_bank.get_tag() << std::endl; - std::cout << " type : " << (int)mem_bank.get_type() << std::endl; - std::cout << " base_address : 0x" << std::hex << - mem_bank.get_base_address() << std::endl; std::cout << " size : 0x" << - (mem_bank.get_size_kb() * 1000) << std::dec << std::endl; std::cout << " - used :" << mem_bank.get_used() << std::endl; - } - }*/ - #else CHECK_HANDLE(xrtDevice, xrtDeviceOpen(device_index), { @@ -262,7 +176,7 @@ public: return -1; }); #else - xrtKernelHandle xrtKernel = nullptr; + xrtKernelHandle xrtKernel = xrtDevice; #endif // get device name @@ -277,34 +191,10 @@ public: xrtDevice_ = xrtDevice; xrtKernel_ = xrtKernel; - CHECK_ERR(get_platform_info(device_name, &platform_), { - fprintf(stderr, "[VXDRV] Error: platform not supported: %s\n", device_name.c_str()); - return err; - }); - CHECK_ERR(this->write_register(MMIO_CTL_ADDR, CTL_AP_RESET), { return err; }); - uint32_t num_banks = 1 << platform_.lg2_num_banks; - uint64_t bank_size = 1ull << platform_.lg2_bank_size; - - for (uint32_t i = 0; i < num_banks; ++i) { - uint32_t reg_addr = MMIO_MEM_ADDR + (i * 12); - uint64_t reg_value = platform_.mem_base + i * bank_size; - - CHECK_ERR(this->write_register(reg_addr, reg_value & 0xffffffff), { - return err; - }); - - CHECK_ERR(this->write_register(reg_addr + 4, (reg_value >> 32) & 0xffffffff), { - return err; - }); - #ifndef BANK_INTERLEAVE - break; - #endif - } - CHECK_ERR(this->read_register(MMIO_DEV_ADDR, (uint32_t *)&dev_caps_), { return err; }); @@ -321,8 +211,18 @@ public: return err; }); + uint64_t num_banks; + this->get_caps(VX_CAPS_NUM_MEM_BANKS, &num_banks); + lg2_num_banks_ = log2ceil(num_banks); + + uint64_t bank_size; + this->get_caps(VX_CAPS_MEM_BANK_SIZE, &bank_size); + 
lg2_bank_size_ = log2ceil(bank_size); + global_mem_size_ = num_banks * bank_size; + printf("info: device name=%s, memory_capacity=0x%lx bytes, memory_banks=%ld.\n", device_name.c_str(), global_mem_size_, num_banks); + #ifdef BANK_INTERLEAVE xrtBuffers_.reserve(num_banks); for (uint32_t i = 0; i < num_banks; ++i) { @@ -365,14 +265,17 @@ public: *value = (((uint64_t)value_hi) << 32) | value_lo; return 0; }; - int ret = vx_scope_start(&callback, device, 0, -1); - if (ret != 0) { - delete device; - return ret; - } + CHECK_ERR(vx_scope_start(&callback, this, -1, -1), { + return err; + }); } #endif + #ifdef CHIPSCOPE + std::cout << "\nPress ENTER to continue after setting up ILA trigger..." << std::endl; + std::cin.ignore(std::numeric_limits::max(), '\n'); + #endif + return 0; } @@ -399,11 +302,17 @@ public: _value = global_mem_size_; break; case VX_CAPS_LOCAL_MEM_SIZE: - _value = 1ull << ((dev_caps_ >> 48) & 0xff); + _value = 1ull << ((dev_caps_ >> 40) & 0xff); break; case VX_CAPS_ISA_FLAGS: _value = isa_caps_; break; + case VX_CAPS_NUM_MEM_BANKS: + _value = 1 << ((dev_caps_ >> 48) & 0x7); + break; + case VX_CAPS_MEM_BANK_SIZE: + _value = 1ull << (20 + ((dev_caps_ >> 51) & 0x1f)); + break; default: fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id); std::abort(); @@ -520,7 +429,6 @@ public: return err; }); #endif - DBGPRINT("*** write_register: addr=0x%x, value=0x%x\n", addr, value); return 0; } @@ -533,7 +441,6 @@ public: return err; }); #endif - DBGPRINT("*** read_register: addr=0x%x, value=0x%x\n", addr, *value); return 0; } @@ -567,14 +474,14 @@ public: return err; }); #ifdef CPP_API - xrtBuffer.write(host_ptr, asize, bo_offset); - xrtBuffer.sync(XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset); + xrtBuffer.write(host_ptr, size, bo_offset); + xrtBuffer.sync(XCL_BO_SYNC_BO_TO_DEVICE, size, bo_offset); #else - CHECK_ERR(xrtBOWrite(xrtBuffer, host_ptr, asize, bo_offset), { + CHECK_ERR(xrtBOWrite(xrtBuffer, host_ptr, size, bo_offset), { 
dump_xrt_error(xrtDevice_, err); return err; }); - CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_TO_DEVICE, asize, bo_offset), { + CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_TO_DEVICE, size, bo_offset), { dump_xrt_error(xrtDevice_, err); return err; }); @@ -613,14 +520,14 @@ public: return err; }); #ifdef CPP_API - xrtBuffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset); - xrtBuffer.read(host_ptr, asize, bo_offset); + xrtBuffer.sync(XCL_BO_SYNC_BO_FROM_DEVICE, size, bo_offset); + xrtBuffer.read(host_ptr, size, bo_offset); #else - CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_FROM_DEVICE, asize, bo_offset), { + CHECK_ERR(xrtBOSync(xrtBuffer, XCL_BO_SYNC_BO_FROM_DEVICE, size, bo_offset), { dump_xrt_error(xrtDevice_, err); return err; }); - CHECK_ERR(xrtBORead(xrtBuffer, host_ptr, asize, bo_offset), { + CHECK_ERR(xrtBORead(xrtBuffer, host_ptr, size, bo_offset), { dump_xrt_error(xrtDevice_, err); return err; }); @@ -720,30 +627,30 @@ private: MemoryAllocator global_mem_; xrt_device_t xrtDevice_; xrt_kernel_t xrtKernel_; - platform_info_t platform_; uint64_t dev_caps_; uint64_t isa_caps_; uint64_t global_mem_size_; DeviceConfig dcrs_; std::unordered_map> mpm_cache_; + uint32_t lg2_num_banks_; + uint32_t lg2_bank_size_; #ifdef BANK_INTERLEAVE std::vector xrtBuffers_; int get_bank_info(uint64_t addr, uint32_t *pIdx, uint64_t *pOff) { - uint32_t num_banks = 1 << platform_.lg2_num_banks; + uint32_t num_banks = 1 << lg2_num_banks_; uint64_t block_addr = addr / CACHE_BLOCK_SIZE; uint32_t index = block_addr & (num_banks - 1); - uint64_t offset = - (block_addr >> platform_.lg2_num_banks) * CACHE_BLOCK_SIZE; + uint64_t offset = (block_addr >> lg2_num_banks_) * CACHE_BLOCK_SIZE; if (pIdx) { *pIdx = index; } if (pOff) { *pOff = offset; } - printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset); + //printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset); return 0; } @@ -764,9 +671,9 @@ private: std::unordered_map 
xrtBuffers_; int get_bank_info(uint64_t addr, uint32_t *pIdx, uint64_t *pOff) { - uint32_t num_banks = 1 << platform_.lg2_num_banks; - uint64_t bank_size = 1ull << platform_.lg2_bank_size; - uint32_t index = addr >> platform_.lg2_bank_size; + uint32_t num_banks = 1 << lg2_num_banks_; + uint64_t bank_size = 1ull << lg2_bank_size_; + uint32_t index = addr >> lg2_bank_size_; uint64_t offset = addr & (bank_size - 1); if (index > num_banks) { fprintf(stderr, "[VXDRV] Error: address out of range: 0x%lx\n", addr); @@ -778,8 +685,7 @@ private: if (pOff) { *pOff = offset; } - printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, - offset); + //printf("get_bank_info(addr=0x%lx, bank=%d, offset=0x%lx\n", addr, index, offset); return 0; } @@ -794,7 +700,7 @@ private: } } else { printf("allocating bank%d...\n", bank_id); - uint64_t bank_size = 1ull << platform_.lg2_bank_size; + uint64_t bank_size = 1ull << lg2_bank_size_; #ifdef CPP_API xrt::bo xrtBuffer(xrtDevice_, bank_size, xrt::bo::flags::normal, bank_id); #else diff --git a/hw/syn/xilinx/xrt/xrt.ini b/runtime/xrt/xrt.ini.in similarity index 54% rename from hw/syn/xilinx/xrt/xrt.ini rename to runtime/xrt/xrt.ini.in index 094219112..90affb447 100644 --- a/hw/syn/xilinx/xrt/xrt.ini +++ b/runtime/xrt/xrt.ini.in @@ -1,9 +1,9 @@ -[Runtime] +[Runtime] runtime_log=console [Emulation] -#debug_mode=batch -#user_pre_sim_script=xsim.tcl +debug_mode=batch +user_pre_sim_script=@VORTEX_HOME@/runtime/xrt/xsim.tcl [Debug] profile=true diff --git a/hw/syn/xilinx/xrt/scripts/xsim.tcl b/runtime/xrt/xsim.tcl similarity index 88% rename from hw/syn/xilinx/xrt/scripts/xsim.tcl rename to runtime/xrt/xsim.tcl index 061bc17ab..ccdc1262f 100644 --- a/hw/syn/xilinx/xrt/scripts/xsim.tcl +++ b/runtime/xrt/xsim.tcl @@ -14,12 +14,9 @@ # limitations under the License. 
# -#log_wave -r * -#run all -#exit +log_wave -r * -open_vcd xsim_dump.vcd -log_vcd /* -run all -close_vcd -exit +#open_vcd xsim_dump.vcd +#log_vcd /* +#run all +#close_vcd \ No newline at end of file diff --git a/sim/Makefile b/sim/Makefile index e16486e8f..4d5ea89c1 100644 --- a/sim/Makefile +++ b/sim/Makefile @@ -1,6 +1,9 @@ ROOT_DIR := $(realpath ..) include $(ROOT_DIR)/config.mk +simx: + $(MAKE) -C simx + all: $(MAKE) -C simx $(MAKE) -C rtlsim diff --git a/sim/common/bitmanip.h b/sim/common/bitmanip.h index a6cd87ff1..9d074b6ff 100644 --- a/sim/common/bitmanip.h +++ b/sim/common/bitmanip.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,32 +16,54 @@ #include #include -constexpr uint32_t count_leading_zeros(uint32_t value) { - return value ? __builtin_clz(value) : 32; +template +constexpr uint32_t count_leading_zeros(T value) { + static_assert(std::is_integral::value, "invalid data type"); + if constexpr (sizeof(T) > 4) { + return value ? __builtin_clzll(value) - (64 - sizeof(T) * 8) : sizeof(T) * 8; + } else { + return value ? __builtin_clz(value) - (32 - sizeof(T) * 8) : sizeof(T) * 8; + } } -constexpr uint32_t count_trailing_zeros(uint32_t value) { - return value ? __builtin_ctz(value) : 32; +template +constexpr uint32_t count_trailing_zeros(T value) { + static_assert(std::is_integral::value, "invalid data type"); + if constexpr (sizeof(T) > 4) { + return value ? __builtin_ctzll(value) : (sizeof(T) * 8); + } else { + return value ? 
__builtin_ctz(value) : (sizeof(T) * 8); + } } -constexpr bool ispow2(uint32_t value) { +template +constexpr bool ispow2(T value) { + static_assert(std::is_integral::value, "invalid data type"); return value && !(value & (value - 1)); } -constexpr uint32_t log2ceil(uint32_t value) { - return 32 - count_leading_zeros(value - 1); +template +constexpr uint32_t log2ceil(T value) { + static_assert(std::is_integral::value, "invalid data type"); + return (sizeof(T) * 8) - count_leading_zeros(value - 1); } -inline unsigned log2up(uint32_t value) { +template +inline unsigned log2up(T value) { + static_assert(std::is_integral::value, "invalid data type"); return std::max(1, log2ceil(value)); } -constexpr unsigned log2floor(uint32_t value) { - return 31 - count_leading_zeros(value); +template +constexpr unsigned log2floor(T value) { + static_assert(std::is_integral::value, "invalid data type"); + return (sizeof(T) * 8 - 1) - count_leading_zeros(value); } -constexpr unsigned ceil2(uint32_t value) { - return 32 - count_leading_zeros(value); +template +constexpr unsigned ceil2(T value) { + static_assert(std::is_integral::value, "invalid data type"); + return (sizeof(T) * 8) - count_leading_zeros(value); } inline uint64_t bit_clr(uint64_t bits, uint32_t index) { @@ -82,11 +104,32 @@ inline uint64_t bit_getw(uint64_t bits, uint32_t start, uint32_t end) { return (bits << shift) >> (shift + start); } +inline uint64_t bit_reverse(uint64_t bits) { + bits = ((bits & 0xAAAAAAAAAAAAAAAA) >> 1) | ((bits & 0x5555555555555555) << 1); + bits = ((bits & 0xCCCCCCCCCCCCCCCC) >> 2) | ((bits & 0x3333333333333333) << 2); + bits = ((bits & 0xF0F0F0F0F0F0F0F0) >> 4) | ((bits & 0x0F0F0F0F0F0F0F0F) << 4); + bits = ((bits & 0xFF00FF00FF00FF00) >> 8) | ((bits & 0x00FF00FF00FF00FF) << 8); + bits = ((bits & 0xFFFF0000FFFF0000) >> 16) | ((bits & 0x0000FFFF0000FFFF) << 16); + bits = (bits >> 32) | (bits << 32); + return bits; +} + +inline uint64_t bit_reverse(uint64_t bits, uint32_t width) { + assert(width 
<= 64); + uint64_t reversed(0); + for (uint32_t i = 0; i < width; ++i) { + if (bits & (1ULL << i)) { + reversed |= (1ULL << (width - 1 - i)); + } + } + return reversed; +} + template T sext(const T& word, uint32_t width) { assert(width > 1); assert(width <= (sizeof(T) * 8)); - if (width == (sizeof(T) * 8)) + if (width == (sizeof(T) * 8)) return word; T mask((static_cast(1) << width) - 1); return ((word >> (width - 1)) & 0x1) ? (word | ~mask) : (word & mask); @@ -96,7 +139,7 @@ template T zext(const T& word, uint32_t width) { assert(width > 1); assert(width <= (sizeof(T) * 8)); - if (width == (sizeof(T) * 8)) + if (width == (sizeof(T) * 8)) return word; T mask((static_cast(1) << width) - 1); return word & mask; diff --git a/sim/common/bitvector.h b/sim/common/bitvector.h index 9fcf22c62..7fd2857d9 100644 --- a/sim/common/bitvector.h +++ b/sim/common/bitvector.h @@ -21,32 +21,32 @@ template class BitVector { private: static constexpr size_t BITS_PER_WORD = sizeof(T) * 8; - std::vector bits_; + std::vector words_; size_t size_; bool all_zero_; - size_t wordIndex(size_t pos) const { + constexpr size_t wordIndex(size_t pos) const { return pos / BITS_PER_WORD; } - T bitMask(size_t pos) const { + constexpr T bitMask(size_t pos) const { return T(1) << (pos % BITS_PER_WORD); } void updateAllZero() { - all_zero_ = std::all_of(bits_.begin(), bits_.end(), [](T word) { return word == 0; }); + all_zero_ = std::all_of(words_.begin(), words_.end(), [](T word) { return word == 0; }); } public: explicit BitVector(size_t size = 0) - : bits_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD) + : words_((size + (BITS_PER_WORD - 1)) / BITS_PER_WORD) , size_(size) , all_zero_(true) {} void set(size_t pos) { if (pos >= size_) throw std::out_of_range("Index out of range"); - bits_[this->wordIndex(pos)] |= this->bitMask(pos); + words_[this->wordIndex(pos)] |= this->bitMask(pos); all_zero_ = false; } @@ -59,19 +59,19 @@ public: } void reset() { - std::fill(bits_.begin(), bits_.end(), 0); + 
std::fill(words_.begin(), words_.end(), 0); all_zero_ = true; } void reset(size_t pos) { if (pos >= size_) throw std::out_of_range("Index out of range"); - bits_[this->wordIndex(pos)] &= ~this->bitMask(pos); + words_[this->wordIndex(pos)] &= ~this->bitMask(pos); this->updateAllZero(); } bool test(size_t pos) const { if (pos >= size_) throw std::out_of_range("Index out of range"); - return bits_[this->wordIndex(pos)] & this->bitMask(pos); + return words_[this->wordIndex(pos)] & this->bitMask(pos); } size_t size() const { @@ -80,12 +80,12 @@ public: void resize(size_t new_size) { size_ = new_size; - bits_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0); + words_.resize((new_size + (BITS_PER_WORD - 1)) / BITS_PER_WORD, 0); this->updateAllZero(); } bool operator==(const BitVector& other) const { - return (size_ == other.size_) && (bits_ == other.bits_); + return (size_ == other.size_) && (words_ == other.words_); } bool operator!=(const BitVector& other) const { @@ -98,8 +98,8 @@ public: BitVector& operator&=(const BitVector& other) { if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match"); - for (size_t i = 0; i < bits_.size(); ++i) { - bits_[i] &= other.bits_[i]; + for (size_t i = 0; i < words_.size(); ++i) { + words_[i] &= other.words_[i]; } this->updateAllZero(); return *this; @@ -107,8 +107,8 @@ public: BitVector& operator|=(const BitVector& other) { if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match"); - for (size_t i = 0; i < bits_.size(); ++i) { - bits_[i] |= other.bits_[i]; + for (size_t i = 0; i < words_.size(); ++i) { + words_[i] |= other.words_[i]; } this->updateAllZero(); return *this; @@ -116,8 +116,8 @@ public: BitVector& operator^=(const BitVector& other) { if (size_ != other.size_) throw std::invalid_argument("Bit sizes must match"); - for (size_t i = 0; i < bits_.size(); ++i) { - bits_[i] ^= other.bits_[i]; + for (size_t i = 0; i < words_.size(); ++i) { + words_[i] ^= other.words_[i]; } 
this->updateAllZero(); return *this; @@ -125,23 +125,48 @@ public: BitVector operator~() const { BitVector result(size_); - for (size_t i = 0; i < bits_.size(); ++i) { - result.bits_[i] = ~bits_[i]; + for (size_t i = 0; i < words_.size(); ++i) { + result.words_[i] = ~words_[i]; } result.updateAllZero(); return result; } void flip() { - for (auto &word : bits_) { + for (auto &word : words_) { word = ~word; } this->updateAllZero(); } + void reverse() { + if (size_ == 0) + return; + size_t remaining_bits = size_ % BITS_PER_WORD; + if (remaining_bits != 0) { + std::vector reversed_words(words_.size(), 0); + for (size_t i = 0; i < size_; ++i) { + size_t reversed_pos = size_ - 1 - i; + size_t src_word = i / BITS_PER_WORD; + size_t src_offset = i % BITS_PER_WORD; + size_t dst_word = reversed_pos / BITS_PER_WORD; + size_t dst_offset = reversed_pos % BITS_PER_WORD; + if (words_[src_word] & (T(1) << src_offset)) { + reversed_words[dst_word] |= (T(1) << dst_offset); + } + } + words_ = std::move(reversed_words); + } else { + std::reverse(words_.begin(), words_.end()); + for (auto &word : words_) { + word = static_cast(bit_reverse(static_cast(word))); + } + } + } + size_t count() const { size_t count = 0; - for (const auto &word : bits_) { + for (const auto &word : words_) { count += std::bitset(word).count(); } return count; @@ -160,12 +185,12 @@ public: size_t remaining_bits = size_ % BITS_PER_WORD; T full_mask = ~T(0); for (size_t i = 0; i < full_bits; ++i) { - if (bits_[i] != full_mask) + if (words_[i] != full_mask) return false; } if (remaining_bits > 0) { T partial_mask = (T(1) << remaining_bits) - 1; - if ((bits_[full_bits] & partial_mask) != partial_mask) + if ((words_[full_bits] & partial_mask) != partial_mask) return false; } return true; @@ -181,17 +206,17 @@ public: size_t bit_shift = pos % BITS_PER_WORD; if (word_shift > 0) { - for (size_t i = bits_.size() - 1; i >= word_shift; --i) { - bits_[i] = bits_[i - word_shift]; + for (size_t i = words_.size() - 1; i >= 
word_shift; --i) { + words_[i] = words_[i - word_shift]; } - std::fill(bits_.begin(), bits_.begin() + word_shift, 0); + std::fill(words_.begin(), words_.begin() + word_shift, 0); } if (bit_shift > 0) { - for (size_t i = bits_.size() - 1; i > 0; --i) { - bits_[i] = (bits_[i] << bit_shift) | (bits_[i - 1] >> (BITS_PER_WORD - bit_shift)); + for (size_t i = words_.size() - 1; i > 0; --i) { + words_[i] = (words_[i] << bit_shift) | (words_[i - 1] >> (BITS_PER_WORD - bit_shift)); } - bits_[0] <<= bit_shift; + words_[0] <<= bit_shift; } this->updateAllZero(); @@ -208,17 +233,17 @@ public: size_t bit_shift = pos % BITS_PER_WORD; if (word_shift > 0) { - for (size_t i = 0; i < bits_.size() - word_shift; ++i) { - bits_[i] = bits_[i + word_shift]; + for (size_t i = 0; i < words_.size() - word_shift; ++i) { + words_[i] = words_[i + word_shift]; } - std::fill(bits_.end() - word_shift, bits_.end(), 0); + std::fill(words_.end() - word_shift, words_.end(), 0); } if (bit_shift > 0) { - for (size_t i = 0; i < bits_.size() - 1; ++i) { - bits_[i] = (bits_[i] >> bit_shift) | (bits_[i + 1] << (BITS_PER_WORD - bit_shift)); + for (size_t i = 0; i < words_.size() - 1; ++i) { + words_[i] = (words_[i] >> bit_shift) | (words_[i + 1] << (BITS_PER_WORD - bit_shift)); } - bits_.back() >>= bit_shift; + words_.back() >>= bit_shift; } this->updateAllZero(); diff --git a/sim/common/dram_sim.cpp b/sim/common/dram_sim.cpp index f7cfa8a32..fc58bd640 100644 --- a/sim/common/dram_sim.cpp +++ b/sim/common/dram_sim.cpp @@ -29,23 +29,58 @@ using namespace vortex; class DramSim::Impl { private: + struct mem_req_t { + uint64_t addr; + bool is_write; + ResponseCallback callback; + void* arg; + }; + Ramulator::IFrontEnd* ramulator_frontend_; Ramulator::IMemorySystem* ramulator_memorysystem_; + uint32_t cpu_channel_size_; + uint64_t cpu_cycles_; + uint32_t scaled_dram_cycles_; + static const uint32_t tick_cycles_ = 1000; + static const uint32_t dram_channel_size_ = 16; // 128 bits + std::queue pending_reqs_; + + 
void handle_pending_requests() { + if (pending_reqs_.empty()) + return; + auto& req = pending_reqs_.front(); + auto req_type = req.is_write ? Ramulator::Request::Type::Write : Ramulator::Request::Type::Read; + std::function callback = nullptr; + if (req.callback) { + callback = [req_callback = std::move(req.callback), req_arg = std::move(req.arg)](Ramulator::Request& /*dram_req*/) { + req_callback(req_arg); + }; + } + if (ramulator_frontend_->receive_external_requests(req_type, req.addr, 0, callback)) { + if (req.is_write) { + // Ramulator does not handle write responses, so we fire the callback ourselves. + if (req.callback) { + req.callback(req.arg); + } + } + pending_reqs_.pop(); + } + } public: - Impl(int clock_ratio) { + Impl(uint32_t num_channels, uint32_t channel_size, float clock_ratio) { YAML::Node dram_config; dram_config["Frontend"]["impl"] = "GEM5"; dram_config["MemorySystem"]["impl"] = "GenericDRAM"; - dram_config["MemorySystem"]["clock_ratio"] = clock_ratio; + dram_config["MemorySystem"]["clock_ratio"] = 1; dram_config["MemorySystem"]["DRAM"]["impl"] = "HBM2"; dram_config["MemorySystem"]["DRAM"]["org"]["preset"] = "HBM2_8Gb"; dram_config["MemorySystem"]["DRAM"]["org"]["density"] = 8192; + dram_config["MemorySystem"]["DRAM"]["org"]["channel"] = num_channels; dram_config["MemorySystem"]["DRAM"]["timing"]["preset"] = "HBM2_2Gbps"; dram_config["MemorySystem"]["Controller"]["impl"] = "Generic"; dram_config["MemorySystem"]["Controller"]["Scheduler"]["impl"] = "FRFCFS"; dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank"; - dram_config["MemorySystem"]["Controller"]["RefreshManager"]["impl"] = "AllBank"; dram_config["MemorySystem"]["Controller"]["RowPolicy"]["impl"] = "OpenRowPolicy"; { YAML::Node draw_plugin; @@ -59,6 +94,10 @@ public: ramulator_memorysystem_ = Ramulator::Factory::create_memory_system(dram_config); ramulator_frontend_->connect_memory_system(ramulator_memorysystem_); 
ramulator_memorysystem_->connect_frontend(ramulator_frontend_); + + cpu_channel_size_ = channel_size; + scaled_dram_cycles_ = static_cast(clock_ratio * tick_cycles_); + this->reset(); } ~Impl() { @@ -71,36 +110,44 @@ public: } void reset() { - //-- + cpu_cycles_ = 0; } void tick() { - ramulator_memorysystem_->tick(); + cpu_cycles_ += tick_cycles_; + while (cpu_cycles_ >= scaled_dram_cycles_) { + this->handle_pending_requests(); + ramulator_memorysystem_->tick(); + cpu_cycles_ -= scaled_dram_cycles_; + } } - bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback callback, void* arg) { - if (!ramulator_frontend_->receive_external_requests( - is_write ? Ramulator::Request::Type::Write : Ramulator::Request::Type::Read, - addr, - source_id, - [callback_ = std::move(callback), arg_ = std::move(arg)](Ramulator::Request& /*dram_req*/) { - callback_(arg_); + void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg) { + // enqueue the request + if (cpu_channel_size_ > dram_channel_size_) { + uint32_t n = cpu_channel_size_ / dram_channel_size_; + for (uint32_t i = 0; i < n; ++i) { + uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_ + (i * dram_channel_size_); + if (i == 0) { + pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg}); + } else { + pending_reqs_.push({dram_byte_addr, is_write, nullptr, nullptr}); + } } - )) { - return false; + } else if (cpu_channel_size_ < dram_channel_size_) { + uint64_t dram_byte_addr = (addr / cpu_channel_size_) * dram_channel_size_; + pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg}); + } else { + uint64_t dram_byte_addr = addr; + pending_reqs_.push({dram_byte_addr, is_write, response_cb, arg}); } - if (is_write) { - // Ramulator does not handle write responses, so we call the callback ourselves - callback(arg); - } - return true; - } + } }; /////////////////////////////////////////////////////////////////////////////// 
-DramSim::DramSim(int clock_ratio) - : impl_(new Impl(clock_ratio)) +DramSim::DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio) + : impl_(new Impl(num_channels, channel_size, clock_ratio)) {} DramSim::~DramSim() { @@ -115,6 +162,6 @@ void DramSim::tick() { impl_->tick(); } -bool DramSim::send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback callback, void* arg) { - return impl_->send_request(is_write, addr, source_id, callback, arg); +void DramSim::send_request(uint64_t addr, bool is_write, ResponseCallback callback, void* arg) { + impl_->send_request(addr, is_write, callback, arg); } \ No newline at end of file diff --git a/sim/common/dram_sim.h b/sim/common/dram_sim.h index 5fea3f27c..1ff07e2ea 100644 --- a/sim/common/dram_sim.h +++ b/sim/common/dram_sim.h @@ -19,14 +19,15 @@ class DramSim { public: typedef void (*ResponseCallback)(void *arg); - DramSim(int clock_ratio); + DramSim(uint32_t num_channels, uint32_t channel_size, float clock_ratio); ~DramSim(); void reset(); void tick(); - bool send_request(bool is_write, uint64_t addr, int source_id, ResponseCallback callback, void* arg); + // addr: per-channel block address + void send_request(uint64_t addr, bool is_write, ResponseCallback response_cb, void* arg); private: class Impl; diff --git a/sim/common/mem.cpp b/sim/common/mem.cpp index a0f1884d1..61dc38389 100644 --- a/sim/common/mem.cpp +++ b/sim/common/mem.cpp @@ -17,9 +17,20 @@ #include #include #include "util.h" +#include +#include using namespace vortex; +#ifdef VM_ENABLE +// #ifndef NDEBUG +// #define DBGPRINT(format, ...) do { printf("[VXDRV] " format "", ##__VA_ARGS__); } while (0) +// #else +#define DBGPRINT(format, ...) 
((void)0) +// #endif +#endif + + RamMemDevice::RamMemDevice(const char *filename, uint32_t wordSize) : wordSize_(wordSize) { std::ifstream input(filename); @@ -123,17 +134,95 @@ void MemoryUnit::ADecoder::write(const void* data, uint64_t addr, uint64_t size) MemoryUnit::MemoryUnit(uint64_t pageSize) : pageSize_(pageSize) +#ifndef VM_ENABLE , enableVM_(pageSize != 0) - , amo_reservation_({0x0, false}) { - if (pageSize != 0) { - tlb_[0] = TLBEntry(0, 077); +#endif + , amo_reservation_({0x0, false}) +#ifdef VM_ENABLE + , TLB_HIT(0) + , TLB_MISS(0) + , TLB_EVICT(0) + , PTW(0) + , satp_(NULL) {}; +#else + { + if (pageSize != 0) + { + tlb_[0] = TLBEntry(0, 077); + } } -} +#endif void MemoryUnit::attach(MemDevice &m, uint64_t start, uint64_t end) { decoder_.map(start, end, m); } + +#ifdef VM_ENABLE +std::pair MemoryUnit::tlbLookup(uint64_t vAddr, ACCESS_TYPE type, uint64_t* size_bits) { + + //Find entry while accounting for different sizes. + for (auto entry : tlb_) + { + if(entry.first == vAddr >> entry.second.size_bits) + { + *size_bits = entry.second.size_bits; + vAddr = vAddr >> (*size_bits); + } + } + + + auto iter = tlb_.find(vAddr); + if (iter != tlb_.end()) { + TLBEntry e = iter->second; + + //Set mru bit if it is a hit. + iter->second.mru_bit = true; + + //If at full capacity and no other unset bits. + // Clear all bits except the one we just looked up. + if (tlb_.size() == TLB_SIZE) + { + // bool no_cleared = true; + // for (auto& entry : tlb_) + // { + // no_cleared = no_cleared & entry.second.mru_bit; + // } + + // if(no_cleared) + // { + for (auto& entry : tlb_) + { + entry.second.mru_bit = false; + } + iter->second.mru_bit = true; + //} + + } + //Check access permissions. 
+ if ( (type == ACCESS_TYPE::FETCH) & ((e.r == 0) | (e.x == 0)) ) + { + throw Page_Fault_Exception("Page Fault : Incorrect permissions."); + } + else if ( (type == ACCESS_TYPE::LOAD) & (e.r == 0) ) + { + throw Page_Fault_Exception("Page Fault : Incorrect permissions."); + } + else if ( (type == ACCESS_TYPE::STORE) & (e.w == 0) ) + { + throw Page_Fault_Exception("Page Fault : Incorrect permissions."); + } + else + { + //TLB Hit + return std::make_pair(true, iter->second.pfn); + } + } else { + //TLB Miss + return std::make_pair(false, 0); + } +} +#else MemoryUnit::TLBEntry MemoryUnit::tlbLookup(uint64_t vAddr, uint32_t flagMask) { auto iter = tlb_.find(vAddr / pageSize_); if (iter != tlb_.end()) { @@ -157,31 +246,96 @@ uint64_t MemoryUnit::toPhyAddr(uint64_t addr, uint32_t flagMask) { } return pAddr; } +#endif -void MemoryUnit::read(void* data, uint64_t addr, uint64_t size, bool sup) { +#ifdef VM_ENABLE +void MemoryUnit::read(void* data, uint64_t addr, uint32_t size, ACCESS_TYPE type) { + DBGPRINT(" [MMU:read] 0x%lx, 0x%x, %u\n",addr,size,type); + uint64_t pAddr; + pAddr = vAddr_to_pAddr(addr, type); + return decoder_.read(data, pAddr, size); +} +#else +void MemoryUnit::read(void* data, uint64_t addr, uint32_t size, bool sup) { uint64_t pAddr = this->toPhyAddr(addr, sup ? 8 : 1); return decoder_.read(data, pAddr, size); } - -void MemoryUnit::write(const void* data, uint64_t addr, uint64_t size, bool sup) { +#endif +#ifdef VM_ENABLE +void MemoryUnit::write(const void* data, uint64_t addr, uint32_t size, ACCESS_TYPE type) { + DBGPRINT(" [MMU:Write] 0x%lx, 0x%x, %u\n",addr,size,type); + uint64_t pAddr; + pAddr = vAddr_to_pAddr(addr, type); + decoder_.write(data, pAddr, size); + amo_reservation_.valid = false; +} +#else +void MemoryUnit::write(const void* data, uint64_t addr, uint32_t size, bool sup) { uint64_t pAddr = this->toPhyAddr(addr, sup ? 
16 : 1); decoder_.write(data, pAddr, size); amo_reservation_.valid = false; } +#endif +#ifdef VM_ENABLE +void MemoryUnit::amo_reserve(uint64_t addr) { + DBGPRINT(" [MMU:amo_reserve] 0x%lx\n",addr); + uint64_t pAddr = this->vAddr_to_pAddr(addr,ACCESS_TYPE::LOAD); + amo_reservation_.addr = pAddr; + amo_reservation_.valid = true; +} +#else void MemoryUnit::amo_reserve(uint64_t addr) { uint64_t pAddr = this->toPhyAddr(addr, 1); amo_reservation_.addr = pAddr; amo_reservation_.valid = true; } +#endif +#ifdef VM_ENABLE +bool MemoryUnit::amo_check(uint64_t addr) { + DBGPRINT(" [MMU:amo_check] 0x%lx\n",addr); + uint64_t pAddr = this->vAddr_to_pAddr(addr, ACCESS_TYPE::LOAD); + return amo_reservation_.valid && (amo_reservation_.addr == pAddr); +} +#else bool MemoryUnit::amo_check(uint64_t addr) { uint64_t pAddr = this->toPhyAddr(addr, 1); return amo_reservation_.valid && (amo_reservation_.addr == pAddr); } +#endif + + +#ifdef VM_ENABLE + +void MemoryUnit::tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags, uint64_t size_bits) { + // HW: evict TLB by Most Recently Used + if (tlb_.size() == TLB_SIZE - 1) { + for (auto& entry : tlb_) + { + entry.second.mru_bit = false; + } + + } else if (tlb_.size() == TLB_SIZE) { + uint64_t del; + for (auto entry : tlb_) { + if (!entry.second.mru_bit) + { + del = entry.first; + break; + } + } + tlb_.erase(tlb_.find(del)); + TLB_EVICT++; + } + tlb_[virt / pageSize_] = TLBEntry(phys / pageSize_, flags, size_bits); +} +#else + void MemoryUnit::tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags) { tlb_[virt / pageSize_] = TLBEntry(phys / pageSize_, flags); } +#endif void MemoryUnit::tlbRm(uint64_t va) { if (tlb_.find(va / pageSize_) != tlb_.end()) @@ -325,6 +479,7 @@ uint8_t *RAM::get(uint64_t address) const { } void RAM::read(void* data, uint64_t addr, uint64_t size) { + // printf("====%s (addr= 0x%lx, size= 0x%lx) ====\n", __PRETTY_FUNCTION__,addr,size); if (check_acl_ && acl_mngr_.check(addr, size, 0x1) == false) { throw BadAddress(); } @@ 
-435,3 +590,171 @@ void RAM::loadHexImage(const char* filename) { --size; } } + +#ifdef VM_ENABLE + +uint64_t MemoryUnit::get_base_ppn() +{ + assert(satp_!= NULL); + return satp_->get_base_ppn(); +} + +uint64_t MemoryUnit::get_satp() +{ + if (is_satp_unset()) + return 0; + else + return satp_->get_satp(); +} + +uint8_t MemoryUnit::is_satp_unset() +{ + return (satp_==NULL); +} + +uint8_t MemoryUnit::get_mode() +{ + assert(satp_!= NULL); + return satp_->get_mode(); +} +void MemoryUnit::set_satp(uint64_t satp) +{ + // uint16_t asid = 0; // set asid for different process + satp_ = new SATP_t (satp ); +} + +bool MemoryUnit::need_trans(uint64_t dev_pAddr) + { + // Check if the satp is set and BARE mode + if ( is_satp_unset() || (get_mode() == BARE)) + return 0; + + // Check if the address is reserved for system usage + // bool isReserved = (PAGE_TABLE_BASE_ADDR <= dev_pAddr && dev_pAddr < PAGE_TABLE_BASE_ADDR + PT_SIZE_LIMIT); + if (PAGE_TABLE_BASE_ADDR <= dev_pAddr) + return 0; + + // Check if the address is reserved for IO usage + if (dev_pAddr < USER_BASE_ADDR) + return 0; + // Check if the address falls within the startup address range + if ((STARTUP_ADDR <= dev_pAddr) && (dev_pAddr <= (STARTUP_ADDR + 0x40000))) + return 0; + + // Now all conditions are not met. Return true because the address needs translation + return 1; + } + +uint64_t MemoryUnit::vAddr_to_pAddr(uint64_t vAddr, ACCESS_TYPE type) +{ + uint64_t pfn; + uint64_t size_bits; + DBGPRINT(" [MMU: V2P] vaddr = 0x%lx, type = 0x%u\n",vAddr,type); + if (!need_trans(vAddr)) + { + DBGPRINT(" [MMU: V2P] Translation is not needed.\n"); + return vAddr; + } + + //First lookup TLB. + std::pair tlb_access = tlbLookup(vAddr, type, &size_bits); + if (tlb_access.first) + { + + pfn = tlb_access.second; + TLB_HIT++; + } + else //Else walk the PT. 
+ { + std::pair ptw_access = page_table_walk(vAddr, type, &size_bits); + tlbAdd(vAddr>>size_bits, ptw_access.first, ptw_access.second,size_bits); + pfn = ptw_access.first; TLB_MISS++; PTW++; + unique_translations.insert(vAddr>>size_bits); + PERF_UNIQUE_PTW = unique_translations.size(); + + } + + //Construct final address using pfn and offset. + DBGPRINT(" [MMU: V2P] translated vAddr: 0x%lx to pAddr 0x%lx\n",vAddr,((pfn << size_bits) + (vAddr & ((1 << size_bits) - 1)))); + return (pfn << size_bits) + (vAddr & ((1 << size_bits) - 1)); +} + +uint64_t MemoryUnit::get_pte_address(uint64_t base_ppn, uint64_t vpn) +{ + return (base_ppn * PT_SIZE) + (vpn * PTE_SIZE); +} + +std::pair MemoryUnit::page_table_walk(uint64_t vAddr_bits, ACCESS_TYPE type, uint64_t *size_bits) +{ + DBGPRINT(" [MMU:PTW] Start: vaddr = 0x%lx, type = %u.\n", vAddr_bits, type); + uint8_t level = PT_LEVEL; + int i = level-1; + vAddr_t vaddr(vAddr_bits); + uint32_t flags =0; + uint64_t pte_addr = 0, pte_bytes = 0; + uint64_t cur_base_ppn = get_base_ppn(); + // Need to fix for super page + *size_bits = 12; + + while (true) + { + // Read PTE. + pte_addr = get_pte_address(cur_base_ppn, vaddr.vpn[i]); + decoder_.read(&pte_bytes, pte_addr, PTE_SIZE); + PTE_t pte(pte_bytes); + DBGPRINT(" [MMU:PTW] Level[%u] pte_addr=0x%lx, pte_bytes =0x%lx, pte.ppn= 0x%lx, pte.flags = %u)\n", i, pte_addr, pte_bytes, pte.ppn, pte.flags); + + assert(((pte.pte_bytes & 0xFFFFFFFF) != 0xbaadf00d) && "ERROR: uninitialzed PTE\n" ); + + // Check if it has invalid flag bits. + if ((pte.v == 0) | ((pte.r == 0) & (pte.w == 1))) + { + assert(0); + throw Page_Fault_Exception(" [MMU:PTW] Page Fault : Attempted to access invalid entry."); + } + + if ((pte.r == 0) & (pte.w == 0) & (pte.x == 0)) + { + // Not a leaf node as rwx == 000 + i--; + if (i < 0) + { + assert(0); + throw Page_Fault_Exception(" [MMU:PTW] Page Fault : No leaf node found."); + } + else + { + // Continue on to next level. 
+ cur_base_ppn= pte.ppn; + DBGPRINT(" [MMU:PTW] next base_ppn: 0x%lx\n", cur_base_ppn); + continue; + } + } + else + { + // Leaf node found, finished walking. + // Check RWX permissions according to access type. + if ((type == ACCESS_TYPE::FETCH) & ((pte.r == 0) | (pte.x == 0))) + { + assert(0); + throw Page_Fault_Exception(" [MMU:PTW] Page Fault : TYPE FETCH, Incorrect permissions."); + } + else if ((type == ACCESS_TYPE::LOAD) & (pte.r == 0)) + { + assert(0); + throw Page_Fault_Exception(" [MMU:PTW] Page Fault : TYPE LOAD, Incorrect permissions."); + } + else if ((type == ACCESS_TYPE::STORE) & (pte.w == 0)) + { + assert(0); + throw Page_Fault_Exception(" [MMU:PTW] Page Fault : TYPE STORE, Incorrect permissions."); + } + cur_base_ppn = pte.ppn; + flags = pte.flags; + break; + } + } + return std::make_pair(cur_base_ppn, flags); +} + +#endif diff --git a/sim/common/mem.h b/sim/common/mem.h index 1f5196113..617e83d69 100644 --- a/sim/common/mem.h +++ b/sim/common/mem.h @@ -18,8 +18,108 @@ #include #include #include +#include +#include +#include "VX_config.h" +#ifdef VM_ENABLE +#include +#include +#include +#endif + namespace vortex { + + +#ifdef VM_ENABLE + +// VA MODE +#define BARE 0x0 +#define SV32 0x1 +#define SV39 0x8 + +enum ACCESS_TYPE { + LOAD, + STORE, + FETCH +}; +class SATP_t +{ + private: + uint64_t address; + uint16_t asid; + uint8_t mode; + uint64_t ppn; + uint64_t satp; + + uint64_t bits(uint64_t input, uint8_t s_idx, uint8_t e_idx) + { + return (input>> s_idx) & (((uint64_t)1 << (e_idx - s_idx + 1)) - 1); + } + bool bit(uint64_t input , uint8_t idx) + { + return (input ) & ((uint64_t)1 << idx); + } + + public: + SATP_t(uint64_t satp) : satp(satp) + { +#ifdef XLEN_32 + mode = bit(satp, 31); + asid = bits(satp, 22, 30); + ppn = bits(satp, 0,21); +#else + mode = bits(satp, 60,63); + asid = bits(satp, 44, 59); + ppn = bits(satp, 0,43); +#endif + address = ppn << MEM_PAGE_LOG2_SIZE; + } + + SATP_t(uint64_t address, uint16_t asid) : address(address), 
asid(asid) + { +#ifdef XLEN_32 + assert((address >> 32) == 0 && "Upper 32 bits are not zero!"); +#endif + mode= VM_ADDR_MODE; + // asid = 0 ; + ppn = address >> MEM_PAGE_LOG2_SIZE; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshift-count-overflow" +#ifdef XLEN_32 + satp = (((uint64_t)mode << 31) | ((uint64_t)asid << 22) | ppn); +#else + satp = (((uint64_t)mode << 60) | ((uint64_t)asid << 44) | ppn); +#endif +#pragma GCC diagnostic pop + } + uint8_t get_mode() + { + return mode; + } + uint16_t get_asid() + { + return asid; + } + uint64_t get_base_ppn() + { + return ppn; + } + uint64_t get_satp() + { + return satp; + } +}; + + +class Page_Fault_Exception : public std::runtime_error /* or logic_error */ +{ +public: + Page_Fault_Exception(const std::string& what = "") : std::runtime_error(what) {} + uint64_t addr; + ACCESS_TYPE type; +}; +#endif struct BadAddress {}; struct OutOfRange {}; @@ -73,26 +173,53 @@ public: class MemoryUnit { public: +// HW: Expand PageFault struct to contain access_type info for debug purposes struct PageFault { PageFault(uint64_t a, bool nf) : faultAddr(a) , notFound(nf) + // , access_type(ACCESS_TYPE::LOAD) {} - uint64_t faultAddr; - bool notFound; + uint64_t faultAddr; + bool notFound; + // ACCESS_TYPE access_type; }; +#ifdef VM_ENABLE + MemoryUnit(uint64_t pageSize = MEM_PAGE_SIZE); + ~MemoryUnit(){ + if ( this->satp_ != NULL) + delete this->satp_; + }; +#else MemoryUnit(uint64_t pageSize = 0); +#endif void attach(MemDevice &m, uint64_t start, uint64_t end); - void read(void* data, uint64_t addr, uint64_t size, bool sup); - void write(const void* data, uint64_t addr, uint64_t size, bool sup); + +#ifdef VM_ENABLE + void read(void* data, uint64_t addr, uint32_t size, ACCESS_TYPE type = ACCESS_TYPE::LOAD); + void write(const void* data, uint64_t addr, uint32_t size, ACCESS_TYPE type = ACCESS_TYPE::STORE); +#else + void read(void* data, uint64_t addr, uint32_t size, bool sup); + void write(const void* data, uint64_t addr, 
uint32_t size, bool sup); +#endif void amo_reserve(uint64_t addr); bool amo_check(uint64_t addr); +#ifdef VM_ENABLE + void tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags, uint64_t size_bits); + uint8_t is_satp_unset(); + uint64_t get_satp(); + uint8_t get_mode(); + uint64_t get_base_ppn(); + void set_satp(uint64_t satp); +#else void tlbAdd(uint64_t virt, uint64_t phys, uint32_t flags); +#endif + void tlbRm(uint64_t vaddr); void tlbFlush() { tlb_.clear(); @@ -134,24 +261,71 @@ private: struct TLBEntry { TLBEntry() {} - TLBEntry(uint32_t pfn, uint32_t flags) + #ifdef VM_ENABLE + TLBEntry(uint32_t pfn, uint32_t flags, uint64_t size_bits) : pfn(pfn) , flags(flags) + , mru_bit(true) + , size_bits (size_bits) + { + d = bit(7); + a = bit(6); + g = bit(5); + u = bit(4); + x = bit(3); + w = bit(2); + r = bit(1); + v = bit(0); + } + bool bit(uint8_t idx) + { + return (flags) & (1 << idx); + } + + uint32_t pfn; + uint32_t flags; + bool mru_bit; + uint64_t size_bits; + bool d, a, g, u, x, w, r, v; + #else + TLBEntry(uint32_t pfn, uint32_t flags) + : pfn(pfn) + , flags(flags) {} uint32_t pfn; uint32_t flags; + #endif }; - TLBEntry tlbLookup(uint64_t vAddr, uint32_t flagMask); +#ifdef VM_ENABLE + std::pair tlbLookup(uint64_t vAddr, ACCESS_TYPE type, uint64_t* size_bits); + bool need_trans(uint64_t dev_pAddr); + uint64_t vAddr_to_pAddr(uint64_t vAddr, ACCESS_TYPE type); + + uint64_t get_pte_address(uint64_t base_ppn, uint64_t vpn); + std::pair page_table_walk(uint64_t vAddr_bits, ACCESS_TYPE type, uint64_t* size_bits); +#else uint64_t toPhyAddr(uint64_t vAddr, uint32_t flagMask); + TLBEntry tlbLookup(uint64_t vAddr, uint32_t flagMask); +#endif + + std::unordered_map tlb_; uint64_t pageSize_; ADecoder decoder_; +#ifndef VM_ENABLE bool enableVM_; +#endif amo_reservation_t amo_reservation_; +#ifdef VM_ENABLE + std::unordered_set unique_translations; + uint64_t TLB_HIT, TLB_MISS, TLB_EVICT, PTW, PERF_UNIQUE_PTW; + SATP_t *satp_; +#endif + }; 
/////////////////////////////////////////////////////////////////////////////// @@ -219,4 +393,149 @@ private: bool check_acl_; }; +#ifdef VM_ENABLE +class PTE_t +{ + + private: + uint64_t address; + uint64_t bits(uint64_t input, uint8_t s_idx, uint8_t e_idx) + { + return (input>> s_idx) & (((uint64_t)1 << (e_idx - s_idx + 1)) - 1); + } + bool bit(uint64_t input, uint8_t idx) + { + return (input) & ((uint64_t)1 << idx); + } + + public: +#if VM_ADDR_MODE == SV39 + bool N; + uint8_t PBMT; +#endif + uint64_t ppn; + uint32_t rsw; + uint32_t flags; + uint8_t level; + bool d, a, g, u, x, w, r, v; + uint64_t pte_bytes; + + void set_flags (uint32_t flag) + { + this->flags = flag; + d = bit(flags,7); + a = bit(flags,6); + g = bit(flags,5); + u = bit(flags,4); + x = bit(flags,3); + w = bit(flags,2); + r = bit(flags,1); + v = bit(flags,0); + } + + PTE_t(uint64_t address, uint32_t flags) : address(address) + { +#if VM_ADDR_MODE == SV39 + N = 0; + PBMT = 0; + level = 3; + ppn = address >> MEM_PAGE_LOG2_SIZE; + // Reserve for Super page support + // ppn = new uint32_t [level]; + // ppn[2]=bits(address,28,53); + // ppn[1]=bits(address,19,27); + // ppn[0]=bits(address,10,18); + set_flags(flags); + // pte_bytes = (N << 63) | (PBMT << 61) | (ppn <<10) | flags ; + pte_bytes = (ppn <<10) | flags ; +#else // if VM_ADDR_MODE == SV32 + assert((address>> 32) == 0 && "Upper 32 bits are not zero!"); + level = 2; + ppn = address >> MEM_PAGE_LOG2_SIZE; + // Reserve for Super page support + // ppn = new uint32_t[level]; + // ppn[1]=bits(address,20,31); + // ppn[0]=bits(address,10,19); + set_flags(flags); + pte_bytes = ppn <<10 | flags ; +#endif + } + + PTE_t(uint64_t pte_bytes) : pte_bytes(pte_bytes) + { +#if VM_ADDR_MODE == SV39 + N = bit(pte_bytes,63); + PBMT = bits(pte_bytes,61,62); + level = 3; + ppn=bits(pte_bytes,10,53); + address = ppn << MEM_PAGE_LOG2_SIZE; + // Reserve for Super page support + // ppn = new uint32_t [level]; + // ppn[2]=bits(pte_bytes,28,53); + // 
ppn[1]=bits(pte_bytes,19,27); + // ppn[0]=bits(pte_bytes,10,18); +#else //#if VM_ADDR_MODE == SV32 + assert((pte_bytes >> 32) == 0 && "Upper 32 bits are not zero!"); + level = 2; + ppn=bits(pte_bytes,10, 31); + address = ppn << MEM_PAGE_LOG2_SIZE; + // Reserve for Super page support + // ppn = new uint32_t[level]; + // ppn[1]=bits(address, 20,31); + // ppn[0]=bits(address, 10,19); +#endif + rsw = bits(pte_bytes,8,9); + set_flags((uint32_t)(bits(pte_bytes,0,7))); + } + ~PTE_t() + { + // Reserve for Super page support + // delete ppn; + } +}; + +class vAddr_t +{ + + private: + uint64_t address; + uint64_t bits(uint8_t s_idx, uint8_t e_idx) + { + return (address>> s_idx) & (((uint64_t)1 << (e_idx - s_idx + 1)) - 1); + } + bool bit( uint8_t idx) + { + return (address) & ((uint64_t)1 << idx); + } + + public: + uint64_t *vpn; + uint64_t pgoff; + uint8_t level; + vAddr_t(uint64_t address) : address(address) + { +#if VM_ADDR_MODE == SV39 + level = 3; + vpn = new uint64_t [level]; + vpn[2] = bits(30,38); + vpn[1] = bits(21,29); + vpn[0] = bits(12,20); + pgoff = bits(0,11); +#else //#if VM_ADDR_MODE == SV32 + assert((address>> 32) == 0 && "Upper 32 bits are not zero!"); + level = 2; + vpn = new uint64_t [level]; + vpn[1] = bits(22,31); + vpn[0] = bits(12,21); + pgoff = bits(0,11); +#endif + } + + ~vAddr_t() + { + delete vpn; + } +}; +#endif + } // namespace vortex diff --git a/runtime/common/malloc.h b/sim/common/mem_alloc.h similarity index 91% rename from runtime/common/malloc.h rename to sim/common/mem_alloc.h index 480c198a6..5e31d0ea0 100644 --- a/runtime/common/malloc.h +++ b/sim/common/mem_alloc.h @@ -39,6 +39,15 @@ public: page_t* currPage = pages_; while (currPage) { auto nextPage = currPage->next; + #ifdef VM_ENABLE + block_t* currblock = currPage->findfirstUsedBlock(); + block_t* nextblock; + while (currblock) { + nextblock= currblock->nextUsed; + currPage->release(currblock); + currblock = nextblock; + } + #endif delete currPage; currPage = nextPage; } @@ -70,14 
+79,15 @@ public: size = alignSize(size, pageAlign_); // Check if the reservation is within memory capacity bounds - if (addr + size > capacity_) { - printf("error: address range out of bounds\n"); + if (addr + size > baseAddress_ + capacity_) { + printf("error: address range out of bounds - requested=0x%lx, base+capacity=0x%lx\n", (addr + size), (baseAddress_ +capacity_)); return -1; } // Ensure the reservation does not overlap with existing pages - if (hasPageOverlap(addr, size)) { - printf("error: address range overlaps with existing allocation\n"); + uint64_t overlapStart, overlapEnd; + if (hasPageOverlap(addr, size, &overlapStart, &overlapEnd)) { + printf("error: address range overlaps with existing allocation - requested=[0x%lx-0x%lx], existing=[0x%lx, 0x%lx]\n", addr, addr+size, overlapStart, overlapEnd); return -1; } @@ -118,12 +128,12 @@ public: auto pageSize = alignSize(size, pageAlign_); uint64_t pageAddr; if (!this->findNextAddress(pageSize, &pageAddr)) { - printf("error: out of memory\n"); + printf("error: out of memory (Can't find next address)\n"); return -1; } currPage = this->createPage(pageAddr, pageSize); if (nullptr == currPage) { - printf("error: out of memory\n"); + printf("error: out of memory (Can't create a page)\n"); return -1; } freeBlock = currPage->findFreeBlock(size); @@ -335,6 +345,11 @@ private: } return nullptr; } +#ifdef VM_ENABLE + block_t* findfirstUsedBlock() { + return usedList_; + } +#endif private: @@ -480,7 +495,7 @@ private: bool findNextAddress(uint64_t size, uint64_t* addr) { if (pages_ == nullptr) { - *addr = baseAddress_; + *addr = baseAddress_; return true; } @@ -498,10 +513,10 @@ private: endOfLastPage = current->addr + current->size; current = current->next; } - + // If no suitable gap is found, place the new page at the end of the last page // Check if the allocator has enough capacity - if ((endOfLastPage + size) <= capacity_) { + if ((endOfLastPage + size) <= (baseAddress_ + capacity_)) { *addr = endOfLastPage; 
return true; } @@ -509,15 +524,15 @@ private: return false; } - bool hasPageOverlap(uint64_t start, uint64_t size) { + bool hasPageOverlap(uint64_t start, uint64_t size, uint64_t* overlapStart, uint64_t* overlapEnd) { page_t* current = pages_; while (current != nullptr) { uint64_t pageStart = current->addr; uint64_t pageEnd = pageStart + current->size; - uint64_t requestEnd = start + size; - if ((start >= pageStart && start < pageEnd) || // Start of request is inside the page - (requestEnd > pageStart && requestEnd <= pageEnd) || // End of request is inside the page - (start <= pageStart && requestEnd >= pageEnd)) { // Request envelops the page + uint64_t end = start + size; + if ((start <= pageEnd) && (end >= pageStart)) { + *overlapStart = pageStart; + *overlapEnd = pageEnd; return true; } current = current->next; diff --git a/sim/common/mp_macros.h b/sim/common/mp_macros.h new file mode 100644 index 000000000..fde5ac79e --- /dev/null +++ b/sim/common/mp_macros.h @@ -0,0 +1,327 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +// macro primitives + +#define MP_COMMA , +#define MP_REM(...) __VA_ARGS__ +#define MP_EAT(...) + +#define MP_STRINGIZE_(x) #x +#define MP_STRINGIZE(x) MP_STRINGIZE_(x) + +#define MP_CONCAT_(x, ...) x ## __VA_ARGS__ +#define MP_CONCAT(x, ...) 
MP_CONCAT_(x, __VA_ARGS__) + +#define MP_COUNTOF(arr) (sizeof(arr) / sizeof(arr[0])) + +// conditional macro + +#define MP_IIF_0(x, y) y +#define MP_IIF_1(x, y) x +#define MP_IIF(c) MP_CONCAT(MP_IIF_, c) + +#define MP_PAIR_FIRST(a, b) a +#define MP_PAIR_SECOND(a, b) b + +// pair macros + +#define MP_PAIR(x) MP_REM x +#define MP_PAIR_HEAD_(x, ...) MP_PAIR(x) +#define MP_PAIR_PROBE_(...) (__VA_ARGS__), +#define MP_PAIR_L_(...) MP_PAIR_HEAD_(__VA_ARGS__) +#define MP_PAIR_L(x) MP_PAIR_L_(MP_PAIR_PROBE_ x,) +#define MP_PAIR_R(x) MP_EAT x + +// separator macros + +#define MP_SEP_COMMA() , +#define MP_SEP_SEMICOLON() ; +#define MP_SEP_PLUS() + +#define MP_SEP_AND() & +#define MP_SEP_OR() | +#define MP_SEP_COLON() : +#define MP_SEP_SPACE() /**/ +#define MP_SEP_LESS() < +#define MP_SEP_GREATER() > +#define MP_SEP_ANDL() && +#define MP_SEP_ORL() || + +// MAKE_UNIQUE macro + +#define MP_MAKE_UNIQUE(x) MP_CONCAT(x, __COUNTER__) + +// increment macro + +#define MP_INC(x) MP_INC_ ## x +#define MP_INC_0 1 +#define MP_INC_1 2 +#define MP_INC_2 3 +#define MP_INC_3 4 +#define MP_INC_4 5 +#define MP_INC_5 6 +#define MP_INC_6 7 +#define MP_INC_7 8 +#define MP_INC_8 9 +#define MP_INC_9 10 +#define MP_INC_10 11 +#define MP_INC_11 12 +#define MP_INC_12 13 +#define MP_INC_13 14 +#define MP_INC_14 15 +#define MP_INC_15 16 +#define MP_INC_16 17 +#define MP_INC_17 18 +#define MP_INC_18 19 +#define MP_INC_19 20 +#define MP_INC_20 21 +#define MP_INC_21 22 +#define MP_INC_22 23 +#define MP_INC_23 24 +#define MP_INC_24 25 +#define MP_INC_25 26 +#define MP_INC_26 27 +#define MP_INC_27 28 +#define MP_INC_28 29 +#define MP_INC_29 30 +#define MP_INC_30 31 +#define MP_INC_31 32 +#define MP_INC_32 33 +#define MP_INC_33 34 +#define MP_INC_34 35 +#define MP_INC_35 36 +#define MP_INC_36 37 +#define MP_INC_37 38 +#define MP_INC_38 39 +#define MP_INC_39 40 +#define MP_INC_40 41 +#define MP_INC_41 42 +#define MP_INC_42 43 +#define MP_INC_43 44 +#define MP_INC_44 45 +#define MP_INC_45 46 +#define MP_INC_46 
47 +#define MP_INC_47 48 +#define MP_INC_48 49 +#define MP_INC_49 50 +#define MP_INC_50 51 +#define MP_INC_51 52 +#define MP_INC_52 53 +#define MP_INC_53 54 +#define MP_INC_54 55 +#define MP_INC_55 56 +#define MP_INC_56 57 +#define MP_INC_57 58 +#define MP_INC_58 59 +#define MP_INC_59 60 +#define MP_INC_60 61 +#define MP_INC_61 62 +#define MP_INC_62 63 +#define MP_INC_63 64 + +// NARG macro + +#define MP_NARG_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10,_11,_12,_13,_14,_15,_16, \ + _17,_18,_19,_20,_21,_22,_23,_24,_25,_26,_27,_28,_29,_30,_31,_32, \ + _33,_34,_35,_36,_37,_38,_39,_40,_41,_42,_43,_44,_45,_46,_47,_48, \ + _49,_50,_51,_52,_53,_54,_55,_56,_57,_58,_59,_60,_61,_62,_63, N, ...) N + +#define MP_NARG_R() 63,62,61,60,59,58,57,56,55,54,53,52,51,50,49,48, \ + 47,46,45,44,43,42,41,40,39,38,37,36,35,34,33,32, \ + 31,30,29,28,27,26,25,24,23,22,21,20,19,18,17,16, \ + 15,14,13,12,11,10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +#define MP_NARG_(...) MP_NARG_N(__VA_ARGS__) +#define MP_NARG(...) MP_NARG_(__VA_ARGS__, MP_NARG_R()) + +// FOR_EACH macro + +#define MP_FOR_EACH_1(idx, func, arg, sep, ...) func(arg, idx, __VA_ARGS__) +#define MP_FOR_EACH_2(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_1(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_3(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_2(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_4(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_3(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_5(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_4(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_6(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_5(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_7(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_6(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_8(idx, func, arg, sep, x, ...) 
func(arg, idx, x) sep() MP_FOR_EACH_7(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_9(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_8(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_10(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_9(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_11(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_10(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_12(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_11(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_13(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_12(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_14(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_13(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_15(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_14(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_16(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_15(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_17(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_16(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_18(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_17(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_19(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_18(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_20(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_19(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_21(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_20(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_22(idx, func, arg, sep, x, ...) 
func(arg, idx, x) sep() MP_FOR_EACH_21(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_23(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_22(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_24(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_23(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_25(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_24(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_26(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_25(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_27(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_26(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_28(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_27(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_29(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_28(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_30(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_29(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_31(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_30(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_32(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_31(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_33(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_32(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_34(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_33(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_35(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_34(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_36(idx, func, arg, sep, x, ...) 
func(arg, idx, x) sep() MP_FOR_EACH_35(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_37(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_36(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_38(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_37(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_39(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_38(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_40(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_39(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_41(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_40(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_42(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_41(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_43(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_42(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_44(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_43(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_45(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_44(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_46(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_45(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_47(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_46(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_48(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_47(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_49(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_48(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_50(idx, func, arg, sep, x, ...) 
func(arg, idx, x) sep() MP_FOR_EACH_49(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_51(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_50(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_52(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_51(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_53(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_52(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_54(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_53(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_55(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_54(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_56(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_55(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_57(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_56(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_58(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_57(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_59(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_58(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_60(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_59(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_61(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_60(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_62(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_61(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_63(idx, func, arg, sep, x, ...) func(arg, idx, x) sep() MP_FOR_EACH_62(MP_INC(idx), func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH_64(idx, func, arg, sep, x, ...) 
func(arg, idx, x) sep() MP_FOR_EACH_63(MP_INC(idx), func, arg, sep, __VA_ARGS__) + +#define MP_FOR_EACH_(N, func, arg, sep, ...) MP_CONCAT(MP_FOR_EACH_, N)(0, func, arg, sep, __VA_ARGS__) +#define MP_FOR_EACH(func, arg, sep, ...) MP_FOR_EACH_(MP_NARG(__VA_ARGS__), func, arg, sep, __VA_ARGS__) + +// REVERSE_FOR_EACH macro + +#define MP_REVERSE_FOR_EACH_1(func, arg, sep, ...) func(arg, 0, __VA_ARGS__) +#define MP_REVERSE_FOR_EACH_2(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_1(func, arg, sep, __VA_ARGS__) sep() func(arg, 1, x) +#define MP_REVERSE_FOR_EACH_3(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_2(func, arg, sep, __VA_ARGS__) sep() func(arg, 2, x) +#define MP_REVERSE_FOR_EACH_4(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_3(func, arg, sep, __VA_ARGS__) sep() func(arg, 3, x) +#define MP_REVERSE_FOR_EACH_5(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_4(func, arg, sep, __VA_ARGS__) sep() func(arg, 4, x) +#define MP_REVERSE_FOR_EACH_6(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_5(func, arg, sep, __VA_ARGS__) sep() func(arg, 5, x) +#define MP_REVERSE_FOR_EACH_7(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_6(func, arg, sep, __VA_ARGS__) sep() func(arg, 6, x) +#define MP_REVERSE_FOR_EACH_8(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_7(func, arg, sep, __VA_ARGS__) sep() func(arg, 7, x) +#define MP_REVERSE_FOR_EACH_9(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_8(func, arg, sep, __VA_ARGS__) sep() func(arg, 8, x) +#define MP_REVERSE_FOR_EACH_10(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_9(func, arg, sep, __VA_ARGS__) sep() func(arg, 9, x) +#define MP_REVERSE_FOR_EACH_11(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_10(func, arg, sep, __VA_ARGS__) sep() func(arg, 10, x) +#define MP_REVERSE_FOR_EACH_12(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_11(func, arg, sep, __VA_ARGS__) sep() func(arg, 11, x) +#define MP_REVERSE_FOR_EACH_13(func, arg, sep, x, ...) 
MP_REVERSE_FOR_EACH_12(func, arg, sep, __VA_ARGS__) sep() func(arg, 12, x) +#define MP_REVERSE_FOR_EACH_14(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_13(func, arg, sep, __VA_ARGS__) sep() func(arg, 13, x) +#define MP_REVERSE_FOR_EACH_15(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_14(func, arg, sep, __VA_ARGS__) sep() func(arg, 14, x) +#define MP_REVERSE_FOR_EACH_16(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_15(func, arg, sep, __VA_ARGS__) sep() func(arg, 15, x) +#define MP_REVERSE_FOR_EACH_17(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_16(func, arg, sep, __VA_ARGS__) sep() func(arg, 16, x) +#define MP_REVERSE_FOR_EACH_18(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_17(func, arg, sep, __VA_ARGS__) sep() func(arg, 17, x) +#define MP_REVERSE_FOR_EACH_19(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_18(func, arg, sep, __VA_ARGS__) sep() func(arg, 18, x) +#define MP_REVERSE_FOR_EACH_20(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_19(func, arg, sep, __VA_ARGS__) sep() func(arg, 19, x) +#define MP_REVERSE_FOR_EACH_21(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_20(func, arg, sep, __VA_ARGS__) sep() func(arg, 20, x) +#define MP_REVERSE_FOR_EACH_22(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_21(func, arg, sep, __VA_ARGS__) sep() func(arg, 21, x) +#define MP_REVERSE_FOR_EACH_23(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_22(func, arg, sep, __VA_ARGS__) sep() func(arg, 22, x) +#define MP_REVERSE_FOR_EACH_24(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_23(func, arg, sep, __VA_ARGS__) sep() func(arg, 23, x) +#define MP_REVERSE_FOR_EACH_25(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_24(func, arg, sep, __VA_ARGS__) sep() func(arg, 24, x) +#define MP_REVERSE_FOR_EACH_26(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_25(func, arg, sep, __VA_ARGS__) sep() func(arg, 25, x) +#define MP_REVERSE_FOR_EACH_27(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_26(func, arg, sep, __VA_ARGS__) sep() func(arg, 26, x) +#define MP_REVERSE_FOR_EACH_28(func, arg, sep, x, ...) 
MP_REVERSE_FOR_EACH_27(func, arg, sep, __VA_ARGS__) sep() func(arg, 27, x) +#define MP_REVERSE_FOR_EACH_29(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_28(func, arg, sep, __VA_ARGS__) sep() func(arg, 28, x) +#define MP_REVERSE_FOR_EACH_30(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_29(func, arg, sep, __VA_ARGS__) sep() func(arg, 29, x) +#define MP_REVERSE_FOR_EACH_31(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_30(func, arg, sep, __VA_ARGS__) sep() func(arg, 30, x) +#define MP_REVERSE_FOR_EACH_32(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_31(func, arg, sep, __VA_ARGS__) sep() func(arg, 31, x) +#define MP_REVERSE_FOR_EACH_33(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_32(func, arg, sep, __VA_ARGS__) sep() func(arg, 32, x) +#define MP_REVERSE_FOR_EACH_34(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_33(func, arg, sep, __VA_ARGS__) sep() func(arg, 33, x) +#define MP_REVERSE_FOR_EACH_35(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_34(func, arg, sep, __VA_ARGS__) sep() func(arg, 34, x) +#define MP_REVERSE_FOR_EACH_36(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_35(func, arg, sep, __VA_ARGS__) sep() func(arg, 35, x) +#define MP_REVERSE_FOR_EACH_37(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_36(func, arg, sep, __VA_ARGS__) sep() func(arg, 36, x) +#define MP_REVERSE_FOR_EACH_38(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_37(func, arg, sep, __VA_ARGS__) sep() func(arg, 37, x) +#define MP_REVERSE_FOR_EACH_39(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_38(func, arg, sep, __VA_ARGS__) sep() func(arg, 38, x) +#define MP_REVERSE_FOR_EACH_40(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_39(func, arg, sep, __VA_ARGS__) sep() func(arg, 39, x) +#define MP_REVERSE_FOR_EACH_41(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_40(func, arg, sep, __VA_ARGS__) sep() func(arg, 40, x) +#define MP_REVERSE_FOR_EACH_42(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_41(func, arg, sep, __VA_ARGS__) sep() func(arg, 41, x) +#define MP_REVERSE_FOR_EACH_43(func, arg, sep, x, ...) 
MP_REVERSE_FOR_EACH_42(func, arg, sep, __VA_ARGS__) sep() func(arg, 42, x) +#define MP_REVERSE_FOR_EACH_44(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_43(func, arg, sep, __VA_ARGS__) sep() func(arg, 43, x) +#define MP_REVERSE_FOR_EACH_45(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_44(func, arg, sep, __VA_ARGS__) sep() func(arg, 44, x) +#define MP_REVERSE_FOR_EACH_46(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_45(func, arg, sep, __VA_ARGS__) sep() func(arg, 45, x) +#define MP_REVERSE_FOR_EACH_47(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_46(func, arg, sep, __VA_ARGS__) sep() func(arg, 46, x) +#define MP_REVERSE_FOR_EACH_48(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_47(func, arg, sep, __VA_ARGS__) sep() func(arg, 47, x) +#define MP_REVERSE_FOR_EACH_49(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_48(func, arg, sep, __VA_ARGS__) sep() func(arg, 48, x) +#define MP_REVERSE_FOR_EACH_50(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_49(func, arg, sep, __VA_ARGS__) sep() func(arg, 49, x) +#define MP_REVERSE_FOR_EACH_51(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_50(func, arg, sep, __VA_ARGS__) sep() func(arg, 50, x) +#define MP_REVERSE_FOR_EACH_52(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_51(func, arg, sep, __VA_ARGS__) sep() func(arg, 51, x) +#define MP_REVERSE_FOR_EACH_53(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_52(func, arg, sep, __VA_ARGS__) sep() func(arg, 52, x) +#define MP_REVERSE_FOR_EACH_54(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_53(func, arg, sep, __VA_ARGS__) sep() func(arg, 53, x) +#define MP_REVERSE_FOR_EACH_55(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_54(func, arg, sep, __VA_ARGS__) sep() func(arg, 54, x) +#define MP_REVERSE_FOR_EACH_56(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_55(func, arg, sep, __VA_ARGS__) sep() func(arg, 55, x) +#define MP_REVERSE_FOR_EACH_57(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_56(func, arg, sep, __VA_ARGS__) sep() func(arg, 56, x) +#define MP_REVERSE_FOR_EACH_58(func, arg, sep, x, ...) 
MP_REVERSE_FOR_EACH_57(func, arg, sep, __VA_ARGS__) sep() func(arg, 57, x) +#define MP_REVERSE_FOR_EACH_59(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_58(func, arg, sep, __VA_ARGS__) sep() func(arg, 58, x) +#define MP_REVERSE_FOR_EACH_60(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_59(func, arg, sep, __VA_ARGS__) sep() func(arg, 59, x) +#define MP_REVERSE_FOR_EACH_61(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_60(func, arg, sep, __VA_ARGS__) sep() func(arg, 60, x) +#define MP_REVERSE_FOR_EACH_62(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_61(func, arg, sep, __VA_ARGS__) sep() func(arg, 61, x) +#define MP_REVERSE_FOR_EACH_63(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_62(func, arg, sep, __VA_ARGS__) sep() func(arg, 62, x) +#define MP_REVERSE_FOR_EACH_64(func, arg, sep, x, ...) MP_REVERSE_FOR_EACH_63(func, arg, sep, __VA_ARGS__) sep() func(arg, 63, x) + +#define MP_REVERSE_FOR_EACH_(N, func, arg, sep, ...) MP_CONCAT(MP_REVERSE_FOR_EACH_, N)(func, arg, sep, __VA_ARGS__) +#define MP_REVERSE_FOR_EACH(func, arg, sep, ...) MP_REVERSE_FOR_EACH_(MP_NARG(__VA_ARGS__), func, arg, sep, __VA_ARGS__) + +#define MP_FIRST_ARG_(N, ...) N +#define MP_FIRST_ARG(...) 
MP_FIRST_ARG_(__VA_ARGS__, ignore) + +// MP_REPEAT macro + +#define MP_REPEAT_0(func, sep) +#define MP_REPEAT_1(func, sep) func(0) +#define MP_REPEAT_2(func, sep) MP_REPEAT_1(func, sep) sep func(1) +#define MP_REPEAT_3(func, sep) MP_REPEAT_2(func, sep) sep func(2) +#define MP_REPEAT_4(func, sep) MP_REPEAT_3(func, sep) sep func(3) +#define MP_REPEAT_5(func, sep) MP_REPEAT_4(func, sep) sep func(4) +#define MP_REPEAT_6(func, sep) MP_REPEAT_5(func, sep) sep func(5) +#define MP_REPEAT_7(func, sep) MP_REPEAT_6(func, sep) sep func(6) +#define MP_REPEAT_8(func, sep) MP_REPEAT_7(func, sep) sep func(7) +#define MP_REPEAT_9(func, sep) MP_REPEAT_8(func, sep) sep func(8) +#define MP_REPEAT_10(func, sep) MP_REPEAT_9(func, sep) sep func(9) +#define MP_REPEAT_11(func, sep) MP_REPEAT_10(func, sep) sep func(10) +#define MP_REPEAT_12(func, sep) MP_REPEAT_11(func, sep) sep func(11) +#define MP_REPEAT_13(func, sep) MP_REPEAT_12(func, sep) sep func(12) +#define MP_REPEAT_14(func, sep) MP_REPEAT_13(func, sep) sep func(13) +#define MP_REPEAT_15(func, sep) MP_REPEAT_14(func, sep) sep func(14) +#define MP_REPEAT_16(func, sep) MP_REPEAT_15(func, sep) sep func(15) +#define MP_REPEAT_17(func, sep) MP_REPEAT_16(func, sep) sep func(16) +#define MP_REPEAT_18(func, sep) MP_REPEAT_17(func, sep) sep func(17) +#define MP_REPEAT_19(func, sep) MP_REPEAT_18(func, sep) sep func(18) +#define MP_REPEAT_20(func, sep) MP_REPEAT_19(func, sep) sep func(19) +#define MP_REPEAT_21(func, sep) MP_REPEAT_20(func, sep) sep func(20) +#define MP_REPEAT_22(func, sep) MP_REPEAT_21(func, sep) sep func(21) +#define MP_REPEAT_23(func, sep) MP_REPEAT_22(func, sep) sep func(22) +#define MP_REPEAT_24(func, sep) MP_REPEAT_23(func, sep) sep func(23) +#define MP_REPEAT_25(func, sep) MP_REPEAT_24(func, sep) sep func(24) +#define MP_REPEAT_26(func, sep) MP_REPEAT_25(func, sep) sep func(25) +#define MP_REPEAT_27(func, sep) MP_REPEAT_26(func, sep) sep func(26) +#define MP_REPEAT_28(func, sep) MP_REPEAT_27(func, sep) sep func(27) 
+#define MP_REPEAT_29(func, sep) MP_REPEAT_28(func, sep) sep func(28) +#define MP_REPEAT_30(func, sep) MP_REPEAT_29(func, sep) sep func(29) +#define MP_REPEAT_31(func, sep) MP_REPEAT_30(func, sep) sep func(30) +#define MP_REPEAT_32(func, sep) MP_REPEAT_31(func, sep) sep func(31) +#define MP_REPEAT(N, func, sep) MP_CONCAT(MP_REPEAT_, N)(func, sep) diff --git a/sim/common/rvfloats.cpp b/sim/common/rvfloats.cpp index 3e577f7f9..ff40fca5c 100644 --- a/sim/common/rvfloats.cpp +++ b/sim/common/rvfloats.cpp @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -16,6 +16,7 @@ extern "C" { #include +#include "softfloat_ext.h" #include #include <../RISCV/specialize.h> } @@ -158,6 +159,34 @@ uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags) { return from_float64_t(r); } +uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f32_recip7(to_float32_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float32_t(r); +} + +uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f64_recip7(to_float64_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float64_t(r); +} + +uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f32_rsqrte7(to_float32_t(a)); + if (fflags) { *fflags =softfloat_exceptionFlags; } + return from_float32_t(r); +} + +uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags) { + softfloat_roundingMode = frm; + auto r = f64_rsqrte7(to_float64_t(a)); + if (fflags) { *fflags = softfloat_exceptionFlags; } + return from_float64_t(r); +} + uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags) { rv_init(frm); auto r = f32_sqrt(to_float32_t(a)); @@ -315,7 +344,7 @@ bool rv_fle_d(uint64_t a, uint64_t b, uint32_t* fflags) { bool rv_feq_s(uint32_t a, uint32_t b, uint32_t* fflags) { rv_init(0); auto r = f32_eq(to_float32_t(a), to_float32_t(b)); - if (fflags) { *fflags = softfloat_exceptionFlags; } + if (fflags) { *fflags = softfloat_exceptionFlags; } return r; } @@ -326,11 +355,11 @@ bool rv_feq_d(uint64_t a, uint64_t b, uint32_t* fflags) { return r; } -uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) { +uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) { uint32_t r; rv_init(0); if (isNaNF32UI(a) && isNaNF32UI(b)) { - r = defaultNaNF32UI; + r = defaultNaNF32UI; } else { auto fa = to_float32_t(a); auto fb = to_float32_t(b); @@ -345,11 +374,11 @@ 
uint32_t rv_fmin_s(uint32_t a, uint32_t b, uint32_t* fflags) { return r; } -uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) { +uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags) { uint64_t r; rv_init(0); if (isNaNF64UI(a) && isNaNF64UI(b)) { - r = defaultNaNF64UI; + r = defaultNaNF64UI; } else { auto fa = to_float64_t(a); auto fb = to_float64_t(b); @@ -368,7 +397,7 @@ uint32_t rv_fmax_s(uint32_t a, uint32_t b, uint32_t* fflags) { uint32_t r; rv_init(0); if (isNaNF32UI(a) && isNaNF32UI(b)) { - r = defaultNaNF32UI; + r = defaultNaNF32UI; } else { auto fa = to_float32_t(a); auto fb = to_float32_t(b); @@ -387,7 +416,7 @@ uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags) { uint64_t r; rv_init(0); if (isNaNF64UI(a) && isNaNF64UI(b)) { - r = defaultNaNF64UI; + r = defaultNaNF64UI; } else { auto fa = to_float64_t(a); auto fb = to_float64_t(b); @@ -420,8 +449,8 @@ uint32_t rv_fclss_s(uint32_t a) { ( !sign && subnormOrZero && !fracZero ) << 5 | ( !sign && subnormOrZero && fracZero ) << 4 | ( isNaN && isSNaN ) << 8 | - ( isNaN && !isSNaN ) << 9; - + ( isNaN && !isSNaN ) << 9; + return r; } @@ -443,8 +472,8 @@ uint32_t rv_fclss_d(uint64_t a) { ( !sign && subnormOrZero && !fracZero ) << 5 | ( !sign && subnormOrZero && fracZero ) << 4 | ( isNaN && isSNaN ) << 8 | - ( isNaN && !isSNaN ) << 9; - + ( isNaN && !isSNaN ) << 9; + return r; } @@ -454,7 +483,7 @@ uint32_t rv_fsgnj_s(uint32_t a, uint32_t b) { return r; } -uint64_t rv_fsgnj_d(uint64_t a, uint64_t b) { +uint64_t rv_fsgnj_d(uint64_t a, uint64_t b) { auto sign = b & F64_SIGN; auto r = sign | (a & ~F64_SIGN); return r; @@ -466,7 +495,7 @@ uint32_t rv_fsgnjn_s(uint32_t a, uint32_t b) { return r; } -uint64_t rv_fsgnjn_d(uint64_t a, uint64_t b) { +uint64_t rv_fsgnjn_d(uint64_t a, uint64_t b) { auto sign = ~b & F64_SIGN; auto r = sign | (a & ~F64_SIGN); return r; @@ -479,13 +508,18 @@ uint32_t rv_fsgnjx_s(uint32_t a, uint32_t b) { return r; } -uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) { 
+uint64_t rv_fsgnjx_d(uint64_t a, uint64_t b) { auto sign1 = a & F64_SIGN; auto sign2 = b & F64_SIGN; auto r = (sign1 ^ sign2) | (a & ~F64_SIGN); return r; } +uint32_t rv_dtof_r(uint64_t a, uint32_t frm) { + rv_init(frm); + return rv_dtof(a); +} + uint32_t rv_dtof(uint64_t a) { auto r = f64_to_f32(to_float64_t(a)); return from_float32_t(r); diff --git a/sim/common/rvfloats.h b/sim/common/rvfloats.h index d921846dd..86b60e8ee 100644 --- a/sim/common/rvfloats.h +++ b/sim/common/rvfloats.h @@ -28,6 +28,8 @@ uint32_t rv_fnmadd_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* uint32_t rv_fnmsub_s(uint32_t a, uint32_t b, uint32_t c, uint32_t frm, uint32_t* fflags); uint32_t rv_fdiv_s(uint32_t a, uint32_t b, uint32_t frm, uint32_t* fflags); uint32_t rv_fsqrt_s(uint32_t a, uint32_t frm, uint32_t* fflags); +uint32_t rv_frecip7_s(uint32_t a, uint32_t frm, uint32_t* fflags); +uint32_t rv_frsqrt7_s(uint32_t a, uint32_t frm, uint32_t* fflags); uint32_t rv_ftoi_s(uint32_t a, uint32_t frm, uint32_t* fflags); uint32_t rv_ftou_s(uint32_t a, uint32_t frm, uint32_t* fflags); @@ -58,6 +60,8 @@ uint64_t rv_fsub_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fmul_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fdiv_d(uint64_t a, uint64_t b, uint32_t frm, uint32_t* fflags); uint64_t rv_fsqrt_d(uint64_t a, uint32_t frm, uint32_t* fflags); +uint64_t rv_frecip7_d(uint64_t a, uint32_t frm, uint32_t* fflags); +uint64_t rv_frsqrt7_d(uint64_t a, uint32_t frm, uint32_t* fflags); uint64_t rv_fmadd_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags); uint64_t rv_fmsub_d(uint64_t a, uint64_t b, uint64_t c, uint32_t frm, uint32_t* fflags); @@ -85,6 +89,7 @@ uint64_t rv_fmin_d(uint64_t a, uint64_t b, uint32_t* fflags); uint64_t rv_fmax_d(uint64_t a, uint64_t b, uint32_t* fflags); uint32_t rv_dtof(uint64_t a); +uint32_t rv_dtof_r(uint64_t a, uint32_t frm); uint64_t rv_ftod(uint32_t a); #ifdef __cplusplus diff --git 
a/sim/common/simobject.h b/sim/common/simobject.h index f4c84e3f3..8d1eed660 100644 --- a/sim/common/simobject.h +++ b/sim/common/simobject.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -27,9 +27,9 @@ class SimObjectBase; /////////////////////////////////////////////////////////////////////////////// class SimPortBase { -public: +public: virtual ~SimPortBase() {} - + SimObjectBase* module() const { return module_; } @@ -53,25 +53,25 @@ public: SimPort(SimObjectBase* module) : SimPortBase(module) - , peer_(nullptr) + , sink_(nullptr) , tx_cb_(nullptr) {} - void bind(SimPort* peer) { - assert(peer_ == nullptr); - peer_ = peer; + void bind(SimPort* sink) { + assert(sink_ == nullptr); + sink_ = sink; } void unbind() { - peer_ = nullptr; + sink_ = nullptr; } bool connected() const { - return (peer_ != nullptr); + return (sink_ != nullptr); } - SimPort* peer() const { - return peer_; + SimPort* sink() const { + return sink_; } bool empty() const { @@ -92,7 +92,7 @@ public: auto cycles = queue_.front().cycles; queue_.pop(); return cycles; - } + } void tx_callback(const TxCallback& callback) { tx_cb_ = callback; @@ -111,15 +111,15 @@ protected: }; std::queue queue_; - SimPort* peer_; + SimPort* sink_; TxCallback tx_cb_; void transfer(const Pkt& data, uint64_t cycles) { if (tx_cb_) { tx_cb_(data, cycles); } - if (peer_) { - peer_->transfer(data, cycles); + if (sink_) { + sink_->transfer(data, cycles); } else { queue_.push({data, cycles}); } @@ -137,7 +137,7 @@ public: typedef std::shared_ptr Ptr; virtual ~SimEventBase() {} - 
+ virtual void fire() const = 0; uint64_t cycles() const { @@ -161,30 +161,30 @@ public: typedef std::function Func; - SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles) + SimCallEvent(const Func& func, const Pkt& pkt, uint64_t cycles) : SimEventBase(cycles) , func_(func) , pkt_(pkt) {} void* operator new(size_t /*size*/) { - return allocator().allocate(); + return allocator_.allocate(); } void operator delete(void* ptr) { - allocator().deallocate(ptr); + allocator_.deallocate(ptr); } protected: Func func_; Pkt pkt_; - static MemoryPool>& allocator() { - static MemoryPool> instance(64); - return instance; - } + static MemoryPool> allocator_; }; +template +MemoryPool> SimCallEvent::allocator_(64); + /////////////////////////////////////////////////////////////////////////////// template @@ -194,30 +194,30 @@ public: const_cast*>(port_)->transfer(pkt_, cycles_); } - SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t cycles) - : SimEventBase(cycles) + SimPortEvent(const SimPort* port, const Pkt& pkt, uint64_t cycles) + : SimEventBase(cycles) , port_(port) , pkt_(pkt) {} void* operator new(size_t /*size*/) { - return allocator().allocate(); + return allocator_.allocate(); } void operator delete(void* ptr) { - allocator().deallocate(ptr); + allocator_.deallocate(ptr); } protected: - const SimPort* port_; + const SimPort* port_; Pkt pkt_; - static MemoryPool>& allocator() { - static MemoryPool> instance(64); - return instance; - } + static MemoryPool> allocator_; }; +template +MemoryPool> SimPortEvent::allocator_(64); + /////////////////////////////////////////////////////////////////////////////// class SimContext; @@ -230,11 +230,11 @@ public: const std::string& name() const { return name_; - } + } protected: - SimObjectBase(const SimContext& ctx, const char* name); + SimObjectBase(const SimContext& ctx, const std::string& name); private: @@ -259,8 +259,8 @@ public: protected: - SimObject(const SimContext& ctx, const char* name) - : 
SimObjectBase(ctx, name) + SimObject(const SimContext& ctx, const std::string& name) + : SimObjectBase(ctx, name) {} private: @@ -283,9 +283,9 @@ private: }; class SimContext { -private: +private: SimContext() {} - + friend class SimPlatform; }; @@ -320,10 +320,10 @@ public: template void schedule(const typename SimCallEvent::Func& callback, - const Pkt& pkt, - uint64_t delay) { + const Pkt& pkt, + uint64_t delay) { assert(delay != 0); - auto evt = std::make_shared>(callback, pkt, cycles_ + delay); + auto evt = std::make_shared>(callback, pkt, cycles_ + delay); events_.emplace_back(evt); } @@ -341,10 +341,10 @@ public: auto evt_it_end = events_.end(); while (evt_it != evt_it_end) { auto& event = *evt_it; - if (cycles_ >= event->cycles()) { + if (cycles_ >= event->cycles()) { event->fire(); evt_it = events_.erase(evt_it); - } else { + } else { ++evt_it; } } @@ -352,7 +352,7 @@ public: for (auto& object : objects_) { object->do_tick(); } - // advance clock + // advance clock ++cycles_; } @@ -390,8 +390,8 @@ private: /////////////////////////////////////////////////////////////////////////////// -inline SimObjectBase::SimObjectBase(const SimContext&, const char* name) - : name_(name) +inline SimObjectBase::SimObjectBase(const SimContext&, const std::string& name) + : name_(name) {} template @@ -402,9 +402,9 @@ typename SimObject::Ptr SimObject::Create(Args&&... 
args) { template void SimPort::push(const Pkt& pkt, uint64_t delay) const { - if (peer_ && !tx_cb_) { - reinterpret_cast*>(peer_)->push(pkt, delay); + if (sink_ && !tx_cb_) { + reinterpret_cast*>(sink_)->push(pkt, delay); } else { SimPlatform::instance().schedule(this, pkt, delay); - } + } } diff --git a/sim/common/softfloat_ext.cpp b/sim/common/softfloat_ext.cpp new file mode 100644 index 000000000..a9d493b00 --- /dev/null +++ b/sim/common/softfloat_ext.cpp @@ -0,0 +1,457 @@ +/*============================================================================ + +This C source file is part of the SoftFloat IEEE Floating-Point Arithmetic +Package, Release 3e, by John R. Hauser. + +Copyright 2011, 2012, 2013, 2014, 2015, 2016 The Regents of the University of +California. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions, and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions, and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + 3. Neither the name of the University nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS "AS IS", AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ARE +DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +=============================================================================*/ + +#include "softfloat_ext.h" +#include <../RISCV/specialize.h> +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +uint_fast16_t f16_classify(float16_t a) { + union ui16_f16 uA; + uint_fast16_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF16UI(uiA) == 0x1F; + uint_fast16_t subnormalOrZero = expF16UI(uiA) == 0; + bool sign = signF16UI(uiA); + bool fracZero = fracF16UI(uiA) == 0; + bool isNaN = isNaNF16UI(uiA); + bool isSNaN = softfloat_isSigNaNF16UI(uiA); + + return (sign && infOrNaN && fracZero) << 0 | + (sign && !infOrNaN && !subnormalOrZero) << 1 | + (sign && subnormalOrZero && !fracZero) << 2 | + (sign && subnormalOrZero && fracZero) << 3 | + (!sign && infOrNaN && fracZero) << 7 | + (!sign && !infOrNaN && !subnormalOrZero) << 6 | + (!sign && subnormalOrZero && !fracZero) << 5 | + (!sign && subnormalOrZero && fracZero) << 4 | (isNaN && isSNaN) << 8 | + (isNaN && !isSNaN) << 9; +} + +uint_fast16_t f32_classify(float32_t a) { + union ui32_f32 uA; + uint_fast32_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF32UI(uiA) == 0xFF; + uint_fast16_t subnormalOrZero = expF32UI(uiA) == 0; + bool sign = signF32UI(uiA); + bool fracZero = fracF32UI(uiA) == 0; + bool isNaN = isNaNF32UI(uiA); + bool isSNaN = softfloat_isSigNaNF32UI(uiA); + + return (sign && infOrNaN && fracZero) << 0 | + (sign && !infOrNaN && 
!subnormalOrZero) << 1 | + (sign && subnormalOrZero && !fracZero) << 2 | + (sign && subnormalOrZero && fracZero) << 3 | + (!sign && infOrNaN && fracZero) << 7 | + (!sign && !infOrNaN && !subnormalOrZero) << 6 | + (!sign && subnormalOrZero && !fracZero) << 5 | + (!sign && subnormalOrZero && fracZero) << 4 | (isNaN && isSNaN) << 8 | + (isNaN && !isSNaN) << 9; +} + +uint_fast16_t f64_classify(float64_t a) { + union ui64_f64 uA; + uint_fast64_t uiA; + + uA.f = a; + uiA = uA.ui; + + uint_fast16_t infOrNaN = expF64UI(uiA) == 0x7FF; + uint_fast16_t subnormalOrZero = expF64UI(uiA) == 0; + bool sign = signF64UI(uiA); + bool fracZero = fracF64UI(uiA) == 0; + bool isNaN = isNaNF64UI(uiA); + bool isSNaN = softfloat_isSigNaNF64UI(uiA); + + return (sign && infOrNaN && fracZero) << 0 | + (sign && !infOrNaN && !subnormalOrZero) << 1 | + (sign && subnormalOrZero && !fracZero) << 2 | + (sign && subnormalOrZero && fracZero) << 3 | + (!sign && infOrNaN && fracZero) << 7 | + (!sign && !infOrNaN && !subnormalOrZero) << 6 | + (!sign && subnormalOrZero && !fracZero) << 5 | + (!sign && subnormalOrZero && fracZero) << 4 | (isNaN && isSNaN) << 8 | + (isNaN && !isSNaN) << 9; +} + +static inline uint64_t extract64(uint64_t val, int pos, int len) { + assert(pos >= 0 && len > 0 && len <= 64 - pos); + return (val >> pos) & (~UINT64_C(0) >> (64 - len)); +} + +static inline uint64_t make_mask64(int pos, int len) { + assert(pos >= 0 && len > 0 && pos < 64 && len <= 64); + return (UINT64_MAX >> (64 - len)) << pos; +} + +// user needs to truncate output to required length +static inline uint64_t rsqrte7(uint64_t val, int e, int s, bool sub) { + uint64_t exp = extract64(val, s, e); + uint64_t sig = extract64(val, 0, s); + uint64_t sign = extract64(val, s + e, 1); + const int p = 7; + + static const uint8_t table[] = { + 52, 51, 50, 48, 47, 46, 44, 43, 42, 41, 40, 39, 38, 36, 35, + 34, 33, 32, 31, 30, 30, 29, 28, 27, 26, 25, 24, 23, 23, 22, + 21, 20, 19, 19, 18, 17, 16, 16, 15, 14, 14, 13, 12, 12, 11, + 
10, 10, 9, 9, 8, 7, 7, 6, 6, 5, 4, 4, 3, 3, 2, + 2, 1, 1, 0, 127, 125, 123, 121, 119, 118, 116, 114, 113, 111, 109, + 108, 106, 105, 103, 102, 100, 99, 97, 96, 95, 93, 92, 91, 90, 88, + 87, 86, 85, 84, 83, 82, 80, 79, 78, 77, 76, 75, 74, 73, 72, + 71, 70, 70, 69, 68, 67, 66, 65, 64, 63, 63, 62, 61, 60, 59, + 59, 58, 57, 56, 56, 55, 54, 53}; + + if (sub) { + while (extract64(sig, s - 1, 1) == 0) + exp--, sig <<= 1; + + sig = (sig << 1) & make_mask64(0, s); + } + + int idx = ((exp & 1) << (p - 1)) | (sig >> (s - p + 1)); + uint64_t out_sig = (uint64_t)(table[idx]) << (s - p); + uint64_t out_exp = (3 * make_mask64(0, e - 1) + ~exp) / 2; + + return (sign << (s + e)) | (out_exp << s) | out_sig; +} + +float16_t f16_rsqrte7(float16_t in) { + union ui16_f16 uA; + + uA.f = in; + unsigned int ret = f16_classify(in); + bool sub = false; + switch (ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: // qNaN + uA.ui = defaultNaNF16UI; + break; + case 0x008: // -0 + uA.ui = 0xfc00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7c00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 5, 10, sub); + break; + } + + return uA.f; +} + +float32_t f32_rsqrte7(float32_t in) { + union ui32_f32 uA; + + uA.f = in; + unsigned int ret = f32_classify(in); + bool sub = false; + switch (ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: // qNaN + uA.ui = defaultNaNF32UI; + break; + case 0x008: // -0 + uA.ui = 0xff800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 
0x7f800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 8, 23, sub); + break; + } + + return uA.f; +} + +float64_t f64_rsqrte7(float64_t in) { + union ui64_f64 uA; + + uA.f = in; + unsigned int ret = f64_classify(in); + bool sub = false; + switch (ret) { + case 0x001: // -inf + case 0x002: // -normal + case 0x004: // -subnormal + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: // qNaN + uA.ui = defaultNaNF64UI; + break; + case 0x008: // -0 + uA.ui = 0xfff0000000000000ul; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7ff0000000000000ul; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +num + uA.ui = rsqrte7(uA.ui, 11, 52, sub); + break; + } + + return uA.f; +} + +// user needs to truncate output to required length +static inline uint64_t recip7(uint64_t val, int e, int s, int rm, bool sub, + bool *round_abnormal) { + uint64_t exp = extract64(val, s, e); + uint64_t sig = extract64(val, 0, s); + uint64_t sign = extract64(val, s + e, 1); + const int p = 7; + + static const uint8_t table[] = { + 127, 125, 123, 121, 119, 117, 116, 114, 112, 110, 109, 107, 105, 104, 102, + 100, 99, 97, 96, 94, 93, 91, 90, 88, 87, 85, 84, 83, 81, 80, + 79, 77, 76, 75, 74, 72, 71, 70, 69, 68, 66, 65, 64, 63, 62, + 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, 48, 47, + 46, 45, 44, 43, 42, 41, 40, 40, 39, 38, 37, 36, 35, 35, 34, + 33, 32, 31, 31, 30, 29, 28, 28, 27, 26, 25, 25, 24, 23, 23, + 22, 21, 21, 20, 19, 19, 18, 17, 17, 16, 15, 15, 14, 14, 13, + 12, 12, 11, 11, 10, 9, 9, 8, 8, 7, 7, 6, 5, 5, 4, + 4, 3, 3, 2, 2, 1, 1, 0}; + + if (sub) { + while (extract64(sig, s - 1, 1) == 0) + exp--, 
sig <<= 1; + + sig = (sig << 1) & make_mask64(0, s); + + if (exp != 0 && exp != UINT64_MAX) { + *round_abnormal = true; + if (rm == 1 || (rm == 2 && !sign) || (rm == 3 && sign)) + return ((sign << (s + e)) | make_mask64(s, e)) - 1; + else + return (sign << (s + e)) | make_mask64(s, e); + } + } + + int idx = sig >> (s - p); + uint64_t out_sig = (uint64_t)(table[idx]) << (s - p); + uint64_t out_exp = 2 * make_mask64(0, e - 1) + ~exp; + if (out_exp == 0 || out_exp == UINT64_MAX) { + out_sig = (out_sig >> 1) | make_mask64(s - 1, 1); + if (out_exp == UINT64_MAX) { + out_sig >>= 1; + out_exp = 0; + } + } + + return (sign << (s + e)) | (out_exp << s) | out_sig; +} + +float16_t f16_recip7(float16_t in) { + union ui16_f16 uA; + + uA.f = in; + unsigned int ret = f16_classify(in); + bool sub = false; + bool round_abnormal = false; + switch (ret) { + case 0x001: // -inf + uA.ui = 0x8000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xfc00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7c00; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: // qNaN + uA.ui = defaultNaNF16UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 5, 10, softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) { + softfloat_exceptionFlags |= softfloat_flag_inexact | softfloat_flag_overflow; + } + break; + } + + return uA.f; +} + +float32_t f32_recip7(float32_t in) { + union ui32_f32 uA; + + uA.f = in; + unsigned int ret = f32_classify(in); + bool sub = false; + bool round_abnormal = false; + switch (ret) { + case 0x001: // -inf + uA.ui = 0x80000000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xff800000; + softfloat_exceptionFlags |= 
softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7f800000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: // qNaN + uA.ui = defaultNaNF32UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 8, 23, softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) { + softfloat_exceptionFlags |= softfloat_flag_inexact | softfloat_flag_overflow; + } + break; + } + + return uA.f; +} + +float64_t f64_recip7(float64_t in) { + union ui64_f64 uA; + + uA.f = in; + unsigned int ret = f64_classify(in); + bool sub = false; + bool round_abnormal = false; + switch (ret) { + case 0x001: // -inf + uA.ui = 0x8000000000000000; + break; + case 0x080: //+inf + uA.ui = 0x0; + break; + case 0x008: // -0 + uA.ui = 0xfff0000000000000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x010: // +0 + uA.ui = 0x7ff0000000000000; + softfloat_exceptionFlags |= softfloat_flag_infinite; + break; + case 0x100: // sNaN + softfloat_exceptionFlags |= softfloat_flag_invalid; + [[fallthrough]]; + case 0x200: // qNaN + uA.ui = defaultNaNF64UI; + break; + case 0x004: // -subnormal + case 0x020: //+ sub + sub = true; + [[fallthrough]]; + default: // +- normal + uA.ui = recip7(uA.ui, 11, 52, softfloat_roundingMode, sub, &round_abnormal); + if (round_abnormal) { + softfloat_exceptionFlags |= softfloat_flag_inexact | softfloat_flag_overflow; + } + break; + } + + return uA.f; +} + +#ifdef __cplusplus +} +#endif diff --git a/sim/common/softfloat_ext.h b/sim/common/softfloat_ext.h new file mode 100644 index 000000000..7c98473af --- /dev/null +++ b/sim/common/softfloat_ext.h @@ -0,0 +1,22 @@ +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +uint_fast16_t f16_classify(float16_t); +float16_t f16_rsqrte7(float16_t); +float16_t 
// printf-style formatting into a std::string.
//
//   fmt   printf format string
//   args  values consumed by the conversions in `fmt`; they are forwarded to
//         std::snprintf unchanged, so they must be types a C variadic call
//         accepts (pass `s.c_str()`, not a std::string)
//
// Returns the formatted text. Throws std::runtime_error when std::snprintf
// reports a formatting error (negative return value).
template <typename... Args>
std::string StrFormat(const std::string& fmt, Args... args) {
  // First pass measures only: with a null buffer snprintf returns the number
  // of characters the output needs, excluding the terminating NUL.
  const int length = std::snprintf(nullptr, 0, fmt.c_str(), args...);
  if (length < 0) {
    throw std::runtime_error("Error during formatting.");
  }

  // Second pass writes straight into the result. snprintf emits `length`
  // characters plus a NUL that lands on the string's own terminator, so no
  // scratch buffer or trailing trim is needed.
  std::string out(static_cast<std::string::size_type>(length), '\0');
  std::snprintf(&out[0], out.size() + 1, fmt.c_str(), args...);
  return out;
}
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,10 +16,10 @@ // return file extension const char* fileExtension(const char* filepath) { - const char *ext = strrchr(filepath, '.'); - if (ext == NULL || ext == filepath) - return ""; - return ext + 1; + const char *ext = strrchr(filepath, '.'); + if (ext == NULL || ext == filepath) + return ""; + return ext + 1; } void* aligned_malloc(size_t size, size_t alignment) { diff --git a/sim/opaesim/Makefile b/sim/opaesim/Makefile index 2e549ca74..4fbcad7eb 100644 --- a/sim/opaesim/Makefile +++ b/sim/opaesim/Makefile @@ -30,36 +30,20 @@ DBG_SCOPE_FLAGS += -DDBG_SCOPE_AFU DBG_SCOPE_FLAGS += -DDBG_SCOPE_ISSUE DBG_SCOPE_FLAGS += -DDBG_SCOPE_FETCH DBG_SCOPE_FLAGS += -DDBG_SCOPE_LSU -DBG_SCOPE_FLAGS += -DDBG_SCOPE_MSCHED - -# AFU parameters -CONFIGS += -DPLATFORM_PROVIDES_LOCAL_MEMORY -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BANKS,$(CONFIGS))) - CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=2 -endif -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH,$(CONFIGS))) - CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=26 -endif -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH,$(CONFIGS))) - CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH=512 -endif -ifeq (,$(findstring PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH,$(CONFIGS))) - CONFIGS += -DPLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH=4 -endif DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(DPI_DIR)/util_dpi.cpp 
$(DPI_DIR)/float_dpi.cpp SRCS += $(SRC_DIR)/fpga.cpp $(SRC_DIR)/opae_sim.cpp RTL_PKGS = $(AFU_DIR)/local_mem_cfg_pkg.sv $(AFU_DIR)/ccip/ccip_if_pkg.sv -RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv $(RTL_DIR)/core/VX_trace_pkg.sv +RTL_PKGS += $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv - FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src + RTL_PKGS += $(THIRD_PARTY_DIR)/cvfpu/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv + FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -I$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -I$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/cvfpu/src endif RTL_INCLUDE = -I$(SRC_DIR) -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) RTL_INCLUDE += -I$(AFU_DIR) -I$(AFU_DIR)/ccip @@ -67,19 +51,19 @@ RTL_INCLUDE += -I$(AFU_DIR) -I$(AFU_DIR)/ccip TOP = vortex_afu_shim VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic -VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-GENUNNAMED +VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += --x-initial unique --x-assign unique VL_FLAGS += -DSIMULATION -DSV_DPI VL_FLAGS += -DXLEN_$(XLEN) VL_FLAGS += $(CONFIGS) -VL_FLAGS += $(SRC_DIR)/verilator.vlt +VL_FLAGS += verilator.vlt VL_FLAGS += $(RTL_INCLUDE) VL_FLAGS += $(RTL_PKGS) CXXFLAGS += $(CONFIGS) # Enable Verilator multithreaded simulation -THREADS ?= $(shell python -c 'import 
multiprocessing as mp; print(mp.cpu_count())') +THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(mp.cpu_count())') VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) diff --git a/sim/opaesim/fpga.cpp b/sim/opaesim/fpga.cpp index 6c8ce8b2f..d16ef97a1 100644 --- a/sim/opaesim/fpga.cpp +++ b/sim/opaesim/fpga.cpp @@ -93,6 +93,8 @@ extern fpga_result fpgaClose(fpga_handle handle) { return FPGA_INVALID_PARAM; auto sim = reinterpret_cast(handle); + sim->shutdown(); + delete sim; return FPGA_OK; diff --git a/sim/opaesim/opae_sim.cpp b/sim/opaesim/opae_sim.cpp index 9d43ea595..fe4d61857 100644 --- a/sim/opaesim/opae_sim.cpp +++ b/sim/opaesim/opae_sim.cpp @@ -35,21 +35,10 @@ #include #include -#ifndef MEMORY_BANKS - #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #else - #define MEMORY_BANKS 2 - #endif -#endif - #ifndef MEM_CLOCK_RATIO #define MEM_CLOCK_RATIO 1 #endif -#undef MEM_BLOCK_SIZE -#define MEM_BLOCK_SIZE (PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH / 8) - #define CACHE_BLOCK_SIZE 64 #define CCI_LATENCY 8 @@ -75,6 +64,8 @@ using namespace vortex; +static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS)); + static uint64_t timestamp = 0; double sc_time_stamp() { @@ -87,8 +78,9 @@ static uint64_t trace_stop_time = TRACE_STOP_TIME; bool sim_trace_enabled() { if (timestamp >= trace_start_time - && timestamp < trace_stop_time) + && timestamp < trace_stop_time) { return true; + } return trace_enabled; } @@ -103,7 +95,7 @@ public: Impl() : device_(nullptr) , ram_(nullptr) - , dram_sim_(MEM_CLOCK_RATIO) + , dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO) , stop_(false) , host_buffer_ids_(0) #ifdef VCD_OUTPUT @@ -119,6 +111,9 @@ public: for (auto& buffer : host_buffers_) { aligned_free(buffer.second.data); } + if (ram_) { + delete ram_; + } #ifdef VCD_OUTPUT if (tfp_) { tfp_->close(); @@ -128,13 +123,10 @@ public: if 
(device_) { delete device_; } - if (ram_) { - delete ram_; - } } int init() { - // force random values for unitialized signals + // force random values for uninitialized signals Verilated::randReset(VERILATOR_RESET_VALUE); Verilated::randSeed(50); @@ -151,41 +143,40 @@ public: tfp_->open("trace.vcd"); #endif + // allocate RAM ram_ = new RAM(0, RAM_PAGE_SIZE); - #ifndef NDEBUG - // dump device configuration - std::cout << "CONFIGS:" - << " num_threads=" << NUM_THREADS - << ", num_warps=" << NUM_WARPS - << ", num_cores=" << NUM_CORES - << ", num_clusters=" << NUM_CLUSTERS - << ", socket_size=" << SOCKET_SIZE - << ", local_mem_base=0x" << std::hex << LMEM_BASE_ADDR << std::dec - << ", num_barriers=" << NUM_BARRIERS - << std::endl; - #endif // reset the device this->reset(); + // Turn on assertion after reset + Verilated::assertOn(true); + // launch execution thread future_ = std::async(std::launch::async, [&]{ - while (!stop_) { - std::lock_guard guard(mutex_); - this->tick(); - } + while (!stop_) { + std::lock_guard guard(mutex_); + this->tick(); + } }); return 0; } + void shutdown() { + stop_ = true; + if (future_.valid()) { + future_.wait(); + } + } + int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { auto alloc = aligned_malloc(len, CACHE_BLOCK_SIZE); if (alloc == NULL) return -1; // set uninitialized data to "baadf00d" for (uint32_t i = 0; i < len; ++i) { - ((uint8_t*)alloc)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; + ((uint8_t*)alloc)[i] = (0xbaadf00d >> ((i & 0x3) * 8)) & 0xff; } host_buffer_t buffer; buffer.data = (uint64_t*)alloc; @@ -214,8 +205,9 @@ public: std::lock_guard guard(mutex_); // simulate CPU-GPU latency - for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) + for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) { this->tick(); + } // simulate mmio request device_->vcp2af_sRxPort_c0_mmioRdValid = 1; @@ -232,8 +224,9 @@ public: std::lock_guard guard(mutex_); // simulate CPU-GPU latency - for (uint32_t i = 0; i < CPU_GPU_LATENCY; 
++i) + for (uint32_t i = 0; i < CPU_GPU_LATENCY; ++i) { this->tick(); + } // simulate mmio request device_->vcp2af_sRxPort_c0_mmioWrValid = 1; @@ -270,16 +263,6 @@ private: } device_->reset = 0; - - for (int i = 0; i < RESET_DELAY; ++i) { - device_->clk = 0; - this->eval(); - device_->clk = 1; - this->eval(); - } - - // Turn on assertion after reset - Verilated::assertOn(true); } void tick() { @@ -288,25 +271,24 @@ private: if (!dram_queue_.empty()) { auto mem_req = dram_queue_.front(); - if (dram_sim_.send_request(mem_req->write, mem_req->addr, mem_req->bank_id, [](void* arg) { + dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) { auto orig_req = reinterpret_cast(arg); if (orig_req->ready) { delete orig_req; } else { orig_req->ready = true; } - }, mem_req)) { - dram_queue_.pop(); - } + }, mem_req); + dram_queue_.pop(); } + dram_sim_.tick(); + device_->clk = 0; this->eval(); device_->clk = 1; this->eval(); - dram_sim_.tick(); - #ifndef NDEBUG fflush(stdout); #endif @@ -341,13 +323,14 @@ private: void sRxPort_bus_eval() { // check mmio request bool mmio_req_enabled = device_->vcp2af_sRxPort_c0_mmioRdValid - || device_->vcp2af_sRxPort_c0_mmioWrValid; + || device_->vcp2af_sRxPort_c0_mmioWrValid; // schedule CCI read responses std::list::iterator cci_rd_it(cci_reads_.end()); for (auto it = cci_reads_.begin(), ie = cci_reads_.end(); it != ie; ++it) { - if (it->cycles_left > 0) + if (it->cycles_left > 0) { it->cycles_left -= 1; + } if ((cci_rd_it == ie) && (it->cycles_left == 0)) { cci_rd_it = it; } @@ -356,8 +339,9 @@ private: // schedule CCI write responses std::list::iterator cci_wr_it(cci_writes_.end()); for (auto it = cci_writes_.begin(), ie = cci_writes_.end(); it != ie; ++it) { - if (it->cycles_left > 0) + if (it->cycles_left > 0) { it->cycles_left -= 1; + } if ((cci_wr_it == ie) && (it->cycles_left == 0)) { cci_wr_it = it; } @@ -375,7 +359,7 @@ private: // send CCI read response (ensure mmio disabled) device_->vcp2af_sRxPort_c0_rspValid = 0; if 
(!mmio_req_enabled - && (cci_rd_it != cci_reads_.end())) { + && (cci_rd_it != cci_reads_.end())) { device_->vcp2af_sRxPort_c0_rspValid = 1; device_->vcp2af_sRxPort_c0_hdr_resp_type = 0; memcpy(device_->vcp2af_sRxPort_c0_data, cci_rd_it->data.data(), CACHE_BLOCK_SIZE); @@ -419,15 +403,14 @@ private: } void avs_bus_reset() { - for (int b = 0; b < MEMORY_BANKS; ++b) { - pending_mem_reqs_[b].clear(); + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { device_->avs_readdatavalid[b] = 0; device_->avs_waitrequest[b] = 0; } } void avs_bus_eval() { - for (int b = 0; b < MEMORY_BANKS; ++b) { + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { // process memory responses device_->avs_readdatavalid[b] = 0; if (!pending_mem_reqs_[b].empty() @@ -435,7 +418,7 @@ private: auto mem_rd_it = pending_mem_reqs_[b].begin(); auto mem_req = *mem_rd_it; device_->avs_readdatavalid[b] = 1; - memcpy(device_->avs_readdata[b], mem_req->data.data(), MEM_BLOCK_SIZE); + memcpy(device_->avs_readdata[b], mem_req->data.data(), PLATFORM_MEMORY_DATA_SIZE); uint32_t addr = mem_req->addr; pending_mem_reqs_[b].erase(mem_rd_it); delete mem_req; @@ -443,19 +426,25 @@ private: // process memory requests assert(!device_->avs_read[b] || !device_->avs_write[b]); - unsigned byte_addr = (device_->avs_address[b] * MEMORY_BANKS + b) * MEM_BLOCK_SIZE; + #if PLATFORM_MEMORY_INTERLEAVE == 1 + uint64_t byte_addr = (uint64_t(device_->avs_address[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE; + #else + uint64_t byte_addr = (uint64_t(device_->avs_address[b]) + (b << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE; + #endif + if (device_->avs_write[b]) { + // process write request uint64_t byteen = device_->avs_byteenable[b]; uint8_t* data = (uint8_t*)(device_->avs_writedata[b].data()); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { + for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) { if ((byteen >> i) & 0x1) { (*ram_)[byte_addr + i] = data[i]; } } - /*printf("%0ld: [sim] MEM Wr Req: 
bank=%d, 0x%x, data=0x", timestamp, b, byte_addr); - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - printf("%02x", data[(MEM_BLOCK_SIZE-1)-i]); + /*printf("%0ld: [sim] MEM Wr Req[%d]: addr=0x%lx, byteen=0x%lx, data=0x", timestamp, b, byte_addr, byteen); + for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) { + printf("%02x", data[i]); } printf("\n");*/ @@ -469,22 +458,20 @@ private: dram_queue_.push(mem_req); } else if (device_->avs_read[b]) { + // process read request auto mem_req = new mem_req_t(); mem_req->addr = device_->avs_address[b]; mem_req->bank_id = b; - ram_->read(mem_req->data.data(), byte_addr, MEM_BLOCK_SIZE); + ram_->read(mem_req->data.data(), byte_addr, PLATFORM_MEMORY_DATA_SIZE); mem_req->write = false; mem_req->ready = false; pending_mem_reqs_[b].emplace_back(mem_req); - /*printf("%0ld: [sim] MEM Rd Req: bank=%d, addr=%x, pending={", timestamp, b, mem_req.addr * MEM_BLOCK_SIZE); - for (auto& req : pending_mem_reqs_[b]) { - if (req.cycles_left != 0) - printf(" !%0x", req.addr * MEM_BLOCK_SIZE); - else - printf(" %0x", req.addr * MEM_BLOCK_SIZE); + /*printf("%0ld: [sim] MEM Rd Req[%d]: addr=0x%lx, pending={", timestamp, b, byte_addr); + for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) { + printf("%02x", mem_req->data[i]); } - printf("}\n");*/ + printf("\n");*/ // send dram request dram_queue_.push(mem_req); @@ -495,7 +482,7 @@ private: } typedef struct { - std::array data; + std::array data; uint32_t addr; uint32_t bank_id; bool write; @@ -528,9 +515,9 @@ private: bool stop_; std::unordered_map host_buffers_; - int64_t host_buffer_ids_; + uint64_t host_buffer_ids_; - std::list pending_mem_reqs_[MEMORY_BANKS]; + std::list pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS]; std::list cci_reads_; std::list cci_writes_; @@ -558,6 +545,10 @@ int opae_sim::init() { return impl_->init(); } +void opae_sim::shutdown() { + impl_->shutdown(); +} + int opae_sim::prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags) { return 
impl_->prepare_buffer(len, buf_addr, wsid, flags); } diff --git a/sim/opaesim/opae_sim.h b/sim/opaesim/opae_sim.h index a04ade0a0..454cc1bf7 100644 --- a/sim/opaesim/opae_sim.h +++ b/sim/opaesim/opae_sim.h @@ -25,6 +25,8 @@ public: int init(); + void shutdown(); + int prepare_buffer(uint64_t len, void **buf_addr, uint64_t *wsid, int flags); void release_buffer(uint64_t wsid); diff --git a/sim/opaesim/verilator.vlt b/sim/opaesim/verilator.vlt deleted file mode 100644 index 66a59bd12..000000000 --- a/sim/opaesim/verilator.vlt +++ /dev/null @@ -1,8 +0,0 @@ -`verilator_config - -lint_off -rule BLKANDNBLK -file "*/fpnew/src/*" -lint_off -rule UNOPTFLAT -file "*/fpnew/src/*" -lint_off -file "*/fpnew/src/*" - -lint_off -file "*/afu/opae/ccip/ccip_if_pkg.sv" -lint_off -file "*/afu/opae/local_mem_cfg_pkg.sv" diff --git a/sim/opaesim/verilator.vlt.in b/sim/opaesim/verilator.vlt.in new file mode 100644 index 000000000..0b118e05e --- /dev/null +++ b/sim/opaesim/verilator.vlt.in @@ -0,0 +1,8 @@ +`verilator_config + +lint_off -rule BLKANDNBLK -file "@VORTEX_HOME@/third_party/cvfpu/*" +lint_off -rule UNOPTFLAT -file "@VORTEX_HOME@/third_party/cvfpu/*" +lint_off -file "@VORTEX_HOME@/third_party/cvfpu/*" + +lint_off -file "@VORTEX_HOME@/hw/rtl/afu/opae/ccip/ccip_if_pkg.sv" +lint_off -file "@VORTEX_HOME@/hw/rtl/afu/opae/local_mem_cfg_pkg.sv" diff --git a/sim/opaesim/vortex_afu_shim.sv b/sim/opaesim/vortex_afu_shim.sv index 8c64c8332..3c2ef27ff 100644 --- a/sim/opaesim/vortex_afu_shim.sv +++ b/sim/opaesim/vortex_afu_shim.sv @@ -1,30 +1,28 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -`include "VX_platform.vh" +`include "VX_define.vh" `IGNORE_WARNINGS_BEGIN `include "vortex_afu.vh" `IGNORE_WARNINGS_END -`include "VX_define.vh" - module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; ( // global signals input clk, input reset, // IF signals between CCI and AFU - input logic vcp2af_sRxPort_c0_TxAlmFull, + input logic vcp2af_sRxPort_c0_TxAlmFull, input logic vcp2af_sRxPort_c1_TxAlmFull, input t_ccip_vc vcp2af_sRxPort_c0_hdr_vc_used, @@ -35,15 +33,15 @@ module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; ( input t_ccip_c0_rsp vcp2af_sRxPort_c0_hdr_resp_type, input t_ccip_mdata vcp2af_sRxPort_c0_hdr_mdata, input t_ccip_clData vcp2af_sRxPort_c0_data, - input logic vcp2af_sRxPort_c0_rspValid, - input logic vcp2af_sRxPort_c0_mmioRdValid, - input logic vcp2af_sRxPort_c0_mmioWrValid, + input logic vcp2af_sRxPort_c0_rspValid, + input logic vcp2af_sRxPort_c0_mmioRdValid, + input logic vcp2af_sRxPort_c0_mmioWrValid, input t_ccip_mmioAddr vcp2af_sRxPort_c0_ReqMmioHdr_address, - input logic [1:0] vcp2af_sRxPort_c0_ReqMmioHdr_length, + input logic [1:0] vcp2af_sRxPort_c0_ReqMmioHdr_length, input logic vcp2af_sRxPort_c0_ReqMmioHdr_rsvd, - input t_ccip_tid vcp2af_sRxPort_c0_ReqMmioHdr_tid, - + input t_ccip_tid vcp2af_sRxPort_c0_ReqMmioHdr_tid, + input t_ccip_vc vcp2af_sRxPort_c1_hdr_vc_used, input logic vcp2af_sRxPort_c1_hdr_rsvd1, input logic vcp2af_sRxPort_c1_hdr_hit_miss, @@ -51,51 +49,51 @@ module vortex_afu_shim import local_mem_cfg_pkg::*; import ccip_if_pkg::*; ( input logic vcp2af_sRxPort_c1_hdr_rsvd0, 
input t_ccip_clNum vcp2af_sRxPort_c1_hdr_cl_num, input t_ccip_c1_rsp vcp2af_sRxPort_c1_hdr_resp_type, - input t_ccip_mdata vcp2af_sRxPort_c1_hdr_mdata, - input logic vcp2af_sRxPort_c1_rspValid, - + input t_ccip_mdata vcp2af_sRxPort_c1_hdr_mdata, + input logic vcp2af_sRxPort_c1_rspValid, + output t_ccip_vc af2cp_sTxPort_c0_hdr_vc_sel, - output logic [1:0] af2cp_sTxPort_c0_hdr_rsvd1, + output logic [1:0] af2cp_sTxPort_c0_hdr_rsvd1, output t_ccip_clLen af2cp_sTxPort_c0_hdr_cl_len, output t_ccip_c0_req af2cp_sTxPort_c0_hdr_req_type, - output logic [5:0] af2cp_sTxPort_c0_hdr_rsvd0, + output logic [5:0] af2cp_sTxPort_c0_hdr_rsvd0, output t_ccip_clAddr af2cp_sTxPort_c0_hdr_address, output t_ccip_mdata af2cp_sTxPort_c0_hdr_mdata, - output logic af2cp_sTxPort_c0_valid, + output logic af2cp_sTxPort_c0_valid, output logic [5:0] af2cp_sTxPort_c1_hdr_rsvd2, output t_ccip_vc af2cp_sTxPort_c1_hdr_vc_sel, output logic af2cp_sTxPort_c1_hdr_sop, - output logic af2cp_sTxPort_c1_hdr_rsvd1, + output logic af2cp_sTxPort_c1_hdr_rsvd1, output t_ccip_clLen af2cp_sTxPort_c1_hdr_cl_len, output t_ccip_c1_req af2cp_sTxPort_c1_hdr_req_type, - output logic [5:0] af2cp_sTxPort_c1_hdr_rsvd0, + output logic [5:0] af2cp_sTxPort_c1_hdr_rsvd0, output t_ccip_clAddr af2cp_sTxPort_c1_hdr_address, output t_ccip_mdata af2cp_sTxPort_c1_hdr_mdata, - output t_ccip_clData af2cp_sTxPort_c1_data, - output logic af2cp_sTxPort_c1_valid, + output t_ccip_clData af2cp_sTxPort_c1_data, + output logic af2cp_sTxPort_c1_valid, output t_ccip_tid af2cp_sTxPort_c2_hdr_tid, - output logic af2cp_sTxPort_c2_mmioRdValid, - output t_ccip_mmioData af2cp_sTxPort_c2_data, - + output logic af2cp_sTxPort_c2_mmioRdValid, + output t_ccip_mmioData af2cp_sTxPort_c2_data, + // Avalon signals for local memory access - output t_local_mem_data avs_writedata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - input t_local_mem_data avs_readdata [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - output t_local_mem_addr avs_address 
[`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - input logic avs_waitrequest [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - output logic avs_write [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - output logic avs_read [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - output t_local_mem_byte_mask avs_byteenable [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS], - input avs_readdatavalid [`PLATFORM_PARAM_LOCAL_MEMORY_BANKS] + output t_local_mem_data avs_writedata [`PLATFORM_MEMORY_NUM_BANKS], + input t_local_mem_data avs_readdata [`PLATFORM_MEMORY_NUM_BANKS], + output t_local_mem_addr avs_address [`PLATFORM_MEMORY_NUM_BANKS], + input logic avs_waitrequest [`PLATFORM_MEMORY_NUM_BANKS], + output logic avs_write [`PLATFORM_MEMORY_NUM_BANKS], + output logic avs_read [`PLATFORM_MEMORY_NUM_BANKS], + output t_local_mem_byte_mask avs_byteenable [`PLATFORM_MEMORY_NUM_BANKS], + output t_local_mem_burst_cnt avs_burstcount [`PLATFORM_MEMORY_NUM_BANKS], + input avs_readdatavalid [`PLATFORM_MEMORY_NUM_BANKS] ); t_if_ccip_Rx cp2af_sRxPort; t_if_ccip_Tx af2cp_sTxPort; vortex_afu #( - .NUM_LOCAL_MEM_BANKS(`PLATFORM_PARAM_LOCAL_MEMORY_BANKS) + .NUM_LOCAL_MEM_BANKS(`PLATFORM_MEMORY_NUM_BANKS) ) afu ( .clk(clk), .reset(reset), @@ -119,7 +117,7 @@ always @ (*) begin c0_RxHdr.reqMmioHdr.address = vcp2af_sRxPort_c0_ReqMmioHdr_address; c0_RxHdr.reqMmioHdr.length = vcp2af_sRxPort_c0_ReqMmioHdr_length; c0_RxHdr.reqMmioHdr.rsvd = vcp2af_sRxPort_c0_ReqMmioHdr_rsvd; - c0_RxHdr.reqMmioHdr.tid = vcp2af_sRxPort_c0_ReqMmioHdr_tid; + c0_RxHdr.reqMmioHdr.tid = vcp2af_sRxPort_c0_ReqMmioHdr_tid; end else begin c0_RxHdr.rspMemHdr.vc_used = vcp2af_sRxPort_c0_hdr_vc_used; c0_RxHdr.rspMemHdr.rsvd1 = vcp2af_sRxPort_c0_hdr_rsvd1; @@ -134,7 +132,7 @@ end assign cp2af_sRxPort.c0TxAlmFull = vcp2af_sRxPort_c0_TxAlmFull; assign cp2af_sRxPort.c1TxAlmFull = vcp2af_sRxPort_c1_TxAlmFull; -assign cp2af_sRxPort.c0.hdr = c0_RxHdr; +assign cp2af_sRxPort.c0.hdr = c0_RxHdr; assign 
cp2af_sRxPort.c0.data = vcp2af_sRxPort_c0_data; assign cp2af_sRxPort.c0.rspValid = vcp2af_sRxPort_c0_rspValid; assign cp2af_sRxPort.c0.mmioRdValid = vcp2af_sRxPort_c0_mmioRdValid; @@ -147,8 +145,8 @@ assign cp2af_sRxPort.c1.hdr.format = vcp2af_sRxPort_c1_hdr_format; assign cp2af_sRxPort.c1.hdr.rsvd0 = vcp2af_sRxPort_c1_hdr_rsvd0; assign cp2af_sRxPort.c1.hdr.cl_num = vcp2af_sRxPort_c1_hdr_cl_num; assign cp2af_sRxPort.c1.hdr.resp_type = vcp2af_sRxPort_c1_hdr_resp_type; -assign cp2af_sRxPort.c1.hdr.mdata = vcp2af_sRxPort_c1_hdr_mdata; -assign cp2af_sRxPort.c1.rspValid = vcp2af_sRxPort_c1_rspValid; +assign cp2af_sRxPort.c1.hdr.mdata = vcp2af_sRxPort_c1_hdr_mdata; +assign cp2af_sRxPort.c1.rspValid = vcp2af_sRxPort_c1_rspValid; assign af2cp_sTxPort_c0_hdr_vc_sel = af2cp_sTxPort.c0.hdr.vc_sel; assign af2cp_sTxPort_c0_hdr_rsvd1 = af2cp_sTxPort.c0.hdr.rsvd1; @@ -168,11 +166,11 @@ assign af2cp_sTxPort_c1_hdr_req_type = af2cp_sTxPort.c1.hdr.req_type; assign af2cp_sTxPort_c1_hdr_rsvd0 = af2cp_sTxPort.c1.hdr.rsvd0; assign af2cp_sTxPort_c1_hdr_address = af2cp_sTxPort.c1.hdr.address; assign af2cp_sTxPort_c1_hdr_mdata = af2cp_sTxPort.c1.hdr.mdata; -assign af2cp_sTxPort_c1_data = af2cp_sTxPort.c1.data; +assign af2cp_sTxPort_c1_data = af2cp_sTxPort.c1.data; assign af2cp_sTxPort_c1_valid = af2cp_sTxPort.c1.valid; -assign af2cp_sTxPort_c2_hdr_tid = af2cp_sTxPort.c2.hdr.tid; -assign af2cp_sTxPort_c2_mmioRdValid = af2cp_sTxPort.c2.mmioRdValid; +assign af2cp_sTxPort_c2_hdr_tid = af2cp_sTxPort.c2.hdr.tid; +assign af2cp_sTxPort_c2_mmioRdValid = af2cp_sTxPort.c2.mmioRdValid; assign af2cp_sTxPort_c2_data = af2cp_sTxPort.c2.data; endmodule diff --git a/sim/rtlsim/Makefile b/sim/rtlsim/Makefile index 3deffc759..c036fc571 100644 --- a/sim/rtlsim/Makefile +++ b/sim/rtlsim/Makefile @@ -26,31 +26,26 @@ DBG_TRACE_FLAGS += -DDBG_TRACE_GBAR DBG_FLAGS += -DDEBUG_LEVEL=$(DEBUG) -DVCD_OUTPUT $(DBG_TRACE_FLAGS) -RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv 
$(RTL_DIR)/core/VX_trace_pkg.sv +RTL_PKGS = $(RTL_DIR)/VX_gpu_pkg.sv $(RTL_DIR)/fpu/VX_fpu_pkg.sv FPU_INCLUDE = -I$(RTL_DIR)/fpu ifneq (,$(findstring FPU_FPNEW,$(CONFIGS))) - RTL_PKGS += $(THIRD_PARTY_DIR)/fpnew/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/fpnew/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv - FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/include -I$(THIRD_PARTY_DIR)/fpnew/src/common_cells/src -I$(THIRD_PARTY_DIR)/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/fpnew/src + RTL_PKGS += $(THIRD_PARTY_DIR)/cvfpu/src/fpnew_pkg.sv $(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src/cf_math_pkg $(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv + FPU_INCLUDE += -I$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/include -I$(THIRD_PARTY_DIR)/cvfpu/src/common_cells/src -I$(THIRD_PARTY_DIR)/cvfpu/src/fpu_div_sqrt_mvp/hdl -I$(THIRD_PARTY_DIR)/cvfpu/src endif -RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) +RTL_INCLUDE = -I$(SRC_DIR) -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/core -I$(RTL_DIR)/mem -I$(RTL_DIR)/cache $(FPU_INCLUDE) -SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp SRCS += $(SRC_DIR)/processor.cpp -ifdef AXI_BUS - TOP = Vortex_axi - CXXFLAGS += -DAXI_BUS -else - TOP = Vortex -endif +TOP = rtlsim_shim VL_FLAGS = --exe VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic -VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO -Wno-GENUNNAMED +VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO VL_FLAGS += --x-initial unique --x-assign unique -VL_FLAGS += $(SRC_DIR)/verilator.vlt +VL_FLAGS += 
verilator.vlt VL_FLAGS += -DSIMULATION -DSV_DPI VL_FLAGS += -DXLEN_$(XLEN) VL_FLAGS += $(CONFIGS) @@ -61,7 +56,7 @@ VL_FLAGS += --cc $(TOP) --top-module $(TOP) CXXFLAGS += $(CONFIGS) # Enable Verilator multithreaded simulation -THREADS ?= $(shell python -c 'import multiprocessing as mp; print(mp.cpu_count())') +THREADS ?= $(shell python3 -c 'import multiprocessing as mp; print(mp.cpu_count())') VL_FLAGS += -j $(THREADS) #VL_FLAGS += --threads $(THREADS) diff --git a/sim/rtlsim/main.cpp b/sim/rtlsim/main.cpp index ea0ba9b95..16ce79550 100644 --- a/sim/rtlsim/main.cpp +++ b/sim/rtlsim/main.cpp @@ -33,13 +33,11 @@ const char* program = nullptr; static void parse_args(int argc, char **argv) { int c; - while ((c = getopt(argc, argv, "rh?")) != -1) { + while ((c = getopt(argc, argv, "rh")) != -1) { switch (c) { case 'h': - case '?': - show_usage(); - exit(0); - break; + show_usage(); + exit(0); default: show_usage(); exit(-1); diff --git a/sim/rtlsim/processor.cpp b/sim/rtlsim/processor.cpp index e5e00f49e..7c830b01a 100644 --- a/sim/rtlsim/processor.cpp +++ b/sim/rtlsim/processor.cpp @@ -13,13 +13,7 @@ #include "processor.h" -#ifdef AXI_BUS -#include "VVortex_axi.h" -typedef VVortex_axi Device; -#else -#include "VVortex.h" -typedef VVortex Device; -#endif +#include "Vrtlsim_shim.h" #ifdef VCD_OUTPUT #include @@ -41,14 +35,6 @@ typedef VVortex Device; #include #include -#ifndef MEMORY_BANKS - #ifdef PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #define MEMORY_BANKS PLATFORM_PARAM_LOCAL_MEMORY_BANKS - #else - #define MEMORY_BANKS 2 - #endif -#endif - #ifndef MEM_CLOCK_RATIO #define MEM_CLOCK_RATIO 1 #endif @@ -78,6 +64,8 @@ typedef uint64_t Word; using namespace vortex; +static uint32_t g_mem_bank_addr_width = (PLATFORM_MEMORY_ADDR_WIDTH - log2ceil(PLATFORM_MEMORY_NUM_BANKS)); + static uint64_t timestamp = 0; double sc_time_stamp() { @@ -105,8 +93,8 @@ void sim_trace_enable(bool enable) { class Processor::Impl { public: - Impl() : dram_sim_(MEM_CLOCK_RATIO) { - // force random 
values for unitialized signals + Impl() : dram_sim_(PLATFORM_MEMORY_NUM_BANKS, PLATFORM_MEMORY_DATA_SIZE, MEM_CLOCK_RATIO) { + // force random values for uninitialized signals Verilated::randReset(VERILATOR_RESET_VALUE); Verilated::randSeed(50); @@ -114,7 +102,7 @@ public: Verilated::assertOn(false); // create RTL module instance - device_ = new Device(); + device_ = new Vrtlsim_shim(); #ifdef VCD_OUTPUT Verilated::traceEverOn(true); @@ -125,18 +113,6 @@ public: ram_ = nullptr; - #ifndef NDEBUG - // dump device configuration - std::cout << "CONFIGS:" - << " num_threads=" << NUM_THREADS - << ", num_warps=" << NUM_WARPS - << ", num_cores=" << NUM_CORES - << ", num_clusters=" << NUM_CLUSTERS - << ", socket_size=" << SOCKET_SIZE - << ", local_mem_base=0x" << std::hex << LMEM_BASE_ADDR << std::dec - << ", num_barriers=" << NUM_BARRIERS - << std::endl; - #endif // reset the device this->reset(); @@ -169,14 +145,18 @@ public: } void run() { - #ifndef NDEBUG std::cout << std::dec << timestamp << ": [sim] run()" << std::endl; #endif - // start execution - running_ = true; + // reset device + this->reset(); + + // start device_->reset = 0; + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { + device_->mem_req_ready[b] = 1; + } // wait on device to go busy while (!device_->busy) { @@ -188,8 +168,8 @@ public: this->tick(); } - // reset device - this->reset(); + // stop + device_->reset = 1; this->cout_flush(); } @@ -198,31 +178,27 @@ public: device_->dcr_wr_valid = 1; device_->dcr_wr_addr = addr; device_->dcr_wr_data = value; - while (device_->dcr_wr_valid) { - this->tick(); - } + this->tick(); + device_->dcr_wr_valid = 0; + this->tick(); } private: void reset() { - running_ = false; + this->mem_bus_reset(); + this->dcr_bus_reset(); print_bufs_.clear(); - pending_mem_reqs_.clear(); - - { - std::queue empty; - std::swap(dram_queue_, empty); + for (auto& reqs : pending_mem_reqs_) { + reqs.clear(); } - mem_rd_rsp_active_ = false; - mem_wr_rsp_active_ = false; - - 
this->mem_bus_reset(); - - this->dcr_bus_reset(); + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { + std::queue empty; + std::swap(dram_queue_[b], empty); + } device_->reset = 1; @@ -240,27 +216,23 @@ private: this->eval(); this->mem_bus_eval(0); - this->dcr_bus_eval(0); device_->clk = 1; this->eval(); this->mem_bus_eval(1); - this->dcr_bus_eval(1); dram_sim_.tick(); - if (!dram_queue_.empty()) { - auto mem_req = dram_queue_.front(); - if (dram_sim_.send_request(mem_req->write, mem_req->addr, 0, [](void* arg) { - auto orig_req = reinterpret_cast(arg); - if (orig_req->ready) { - delete orig_req; - } else { + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { + if (!dram_queue_[b].empty()) { + auto mem_req = dram_queue_[b].front(); + dram_sim_.send_request(mem_req->addr, mem_req->write, [](void* arg) { + // mark completed request as ready + auto orig_req = reinterpret_cast(arg); orig_req->ready = true; - } - }, mem_req)) { - dram_queue_.pop(); + }, mem_req); + dram_queue_[b].pop(); } } @@ -274,296 +246,134 @@ private: #ifdef VCD_OUTPUT if (sim_trace_enabled()) { tfp_->dump(timestamp); - } else { - exit(-1); } #endif ++timestamp; } -#ifdef AXI_BUS - void mem_bus_reset() { - device_->m_axi_wready[0] = 0; - device_->m_axi_awready[0] = 0; - device_->m_axi_arready[0] = 0; - device_->m_axi_rvalid[0] = 0; - device_->m_axi_bvalid[0] = 0; + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { + device_->mem_req_ready[b] = 0; + device_->mem_rsp_valid[b] = 0; + } } void mem_bus_eval(bool clk) { if (!clk) { - mem_rd_rsp_ready_ = device_->m_axi_rready[0]; - mem_wr_rsp_ready_ = device_->m_axi_bready[0]; + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { + mem_rd_rsp_ready_[b] = device_->mem_rsp_ready[b]; + } return; } - if (ram_ == nullptr) { - device_->m_axi_wready[0] = 0; - device_->m_axi_awready[0] = 0; - device_->m_axi_arready[0] = 0; - return; - } - - // process memory read responses - if (mem_rd_rsp_active_ - && device_->m_axi_rvalid[0] && mem_rd_rsp_ready_) 
{ - mem_rd_rsp_active_ = false; - } - if (!mem_rd_rsp_active_) { - if (!pending_mem_reqs_.empty() - && (*pending_mem_reqs_.begin())->ready - && !(*pending_mem_reqs_.begin())->write) { - auto mem_rsp_it = pending_mem_reqs_.begin(); - auto mem_rsp = *mem_rsp_it; - /* - printf("%0ld: [sim] MEM Rd Rsp: addr=0x%0lx, data=0x", timestamp, mem_rsp->addr); - for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { - printf("%02x", mem_rsp->block[i]); + for (int b = 0; b < PLATFORM_MEMORY_NUM_BANKS; ++b) { + // process memory responses + if (device_->mem_rsp_valid[b] && mem_rd_rsp_ready_[b]) { + device_->mem_rsp_valid[b] = 0; + } + if (device_->mem_rsp_valid[b] == 0) { + if (!pending_mem_reqs_[b].empty()) { + auto mem_rsp_it = pending_mem_reqs_[b].begin(); + auto mem_rsp = *mem_rsp_it; + if (mem_rsp->ready) { + if (!mem_rsp->write) { + // return read responses + device_->mem_rsp_valid[b] = 1; + memcpy(VDataCast::get(device_->mem_rsp_data[b]), mem_rsp->data.data(), PLATFORM_MEMORY_DATA_SIZE); + device_->mem_rsp_tag[b] = mem_rsp->tag; + } + // delete the request + pending_mem_reqs_[b].erase(mem_rsp_it); + delete mem_rsp; + } } - printf("\n"); - */ - device_->m_axi_rvalid[0] = 1; - device_->m_axi_rid[0] = mem_rsp->tag; - device_->m_axi_rresp[0] = 0; - device_->m_axi_rlast[0] = 1; - memcpy(device_->m_axi_rdata[0].data(), mem_rsp->block.data(), MEM_BLOCK_SIZE); - pending_mem_reqs_.erase(mem_rsp_it); - mem_rd_rsp_active_ = true; - delete mem_rsp; - } else { - device_->m_axi_rvalid[0] = 0; } - } - // process memory write responses - if (mem_wr_rsp_active_ - && device_->m_axi_bvalid[0] && mem_wr_rsp_ready_) { - mem_wr_rsp_active_ = false; - } - if (!mem_wr_rsp_active_) { - if (!pending_mem_reqs_.empty() - && (*pending_mem_reqs_.begin())->ready - && (*pending_mem_reqs_.begin())->write) { - auto mem_rsp_it = pending_mem_reqs_.begin(); - auto mem_rsp = *mem_rsp_it; - /* - printf("%0ld: [sim] MEM Wr Rsp: addr=0x%0lx\n", timestamp, mem_rsp->addr); - */ - device_->m_axi_bvalid[0] = 1; - 
device_->m_axi_bid[0] = mem_rsp->tag; - device_->m_axi_bresp[0] = 0; - pending_mem_reqs_.erase(mem_rsp_it); - mem_wr_rsp_active_ = true; - delete mem_rsp; - } else { - device_->m_axi_bvalid[0] = 0; - } - } - - // select the memory bank - uint32_t req_addr = device_->m_axi_wvalid[0] ? device_->m_axi_awaddr[0] : device_->m_axi_araddr[0]; - - // process memory requests - if ((device_->m_axi_wvalid[0] || device_->m_axi_arvalid[0]) && running_) { - if (device_->m_axi_wvalid[0]) { - auto byteen = device_->m_axi_wstrb[0]; - auto base_addr = device_->m_axi_awaddr[0]; - auto data = (uint8_t*)device_->m_axi_wdata[0].data(); - - if (base_addr >= uint64_t(IO_COUT_ADDR) - && base_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) { - // process console output - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - auto& ss_buf = print_bufs_[i]; - char c = data[i]; - ss_buf << c; - if (c == '\n') { - std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; - ss_buf.str(""); + // process memory requests + if (device_->mem_req_valid[b] && device_->mem_req_ready[b]) { + #if PLATFORM_MEMORY_INTERLEAVE == 1 + uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) * PLATFORM_MEMORY_NUM_BANKS + b) * PLATFORM_MEMORY_DATA_SIZE; + #else + uint64_t byte_addr = (uint64_t(device_->mem_req_addr[b]) + (b << g_mem_bank_addr_width)) * PLATFORM_MEMORY_DATA_SIZE; + #endif + // check read/write + if (device_->mem_req_rw[b]) { + auto byteen = device_->mem_req_byteen[b]; + auto data = VDataCast::get(device_->mem_req_data[b]); + // check if console output address + if (byte_addr >= uint64_t(IO_COUT_ADDR) + && byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) { + // process console output + for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) { + if ((byteen >> i) & 0x1) { + auto& ss_buf = print_bufs_[i]; + char c = data[i]; + ss_buf << c; + if (c == '\n') { + std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; + ss_buf.str(""); + } } } - } - } 
else { - // process writes - /* - printf("%0ld: [sim] MEM Wr: addr=0x%0lx, byteen=0x", timestamp, base_addr); - for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) { - printf("%x", (int)((byteen >> (4 * i)) & 0xf)); - } - printf(", data=0x"); - for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { - printf("%02x", data[i]); - } - printf("\n"); - */ - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[base_addr + i] = data[i]; + } else { + // process memory writes + /*printf("%0ld: [sim] MEM Wr Req[%d]: addr=0x%0lx, tag=0x%0lx, byteen=0x", timestamp, b, byte_addr, device_->mem_req_tag[b]); + for (int i = (PLATFORM_MEMORY_DATA_SIZE/4)-1; i >= 0; --i) { + printf("%x", (int)((byteen >> (4 * i)) & 0xf)); } - } + printf(", data=0x"); + for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) { + printf("%02x", data[i]); + } + printf("\n");*/ - auto mem_req = new mem_req_t(); - mem_req->tag = device_->m_axi_awid[0]; - mem_req->addr = device_->m_axi_awaddr[0]; - mem_req->write = true; - mem_req->ready = false; - pending_mem_reqs_.emplace_back(mem_req); - - // send dram request - dram_queue_.push(mem_req); - } - } else { - // process reads - auto mem_req = new mem_req_t(); - mem_req->tag = device_->m_axi_arid[0]; - mem_req->addr = device_->m_axi_araddr[0]; - ram_->read(mem_req->block.data(), device_->m_axi_araddr[0], MEM_BLOCK_SIZE); - mem_req->write = false; - mem_req->ready = false; - pending_mem_reqs_.emplace_back(mem_req); - - // send dram request - dram_queue_.push(mem_req); - } - } - - device_->m_axi_wready[0] = running_; - device_->m_axi_awready[0] = running_; - device_->m_axi_arready[0] = running_; - } - -#else - - void mem_bus_reset() { - device_->mem_req_ready = 0; - device_->mem_rsp_valid = 0; - } - - void mem_bus_eval(bool clk) { - if (!clk) { - mem_rd_rsp_ready_ = device_->mem_rsp_ready; - return; - } - - if (ram_ == nullptr) { - device_->mem_req_ready = 0; - return; - } - - // process memory read responses - if (mem_rd_rsp_active_ - && 
device_->mem_rsp_valid && mem_rd_rsp_ready_) { - mem_rd_rsp_active_ = false; - } - if (!mem_rd_rsp_active_) { - if (!pending_mem_reqs_.empty() - && (*pending_mem_reqs_.begin())->ready) { - device_->mem_rsp_valid = 1; - auto mem_rsp_it = pending_mem_reqs_.begin(); - auto mem_rsp = *mem_rsp_it; - /* - printf("%0ld: [sim] MEM Rd Rsp: tag=0x%0lx, addr=0x%0lx, data=0x", timestamp, mem_rsp->tag, mem_rsp->addr); - for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { - printf("%02x", mem_rsp->block[i]); - } - printf("\n"); - */ - memcpy(VDataCast::get(device_->mem_rsp_data), mem_rsp->block.data(), MEM_BLOCK_SIZE); - device_->mem_rsp_tag = mem_rsp->tag; - pending_mem_reqs_.erase(mem_rsp_it); - mem_rd_rsp_active_ = true; - delete mem_rsp; - } else { - device_->mem_rsp_valid = 0; - } - } - - // process memory requests - if (device_->mem_req_valid && running_) { - uint64_t byte_addr = (device_->mem_req_addr * MEM_BLOCK_SIZE); - if (device_->mem_req_rw) { - auto byteen = device_->mem_req_byteen; - auto data = VDataCast::get(device_->mem_req_data); - - if (byte_addr >= uint64_t(IO_COUT_ADDR) - && byte_addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) { - // process console output - for (int i = 0; i < IO_COUT_SIZE; i++) { - if ((byteen >> i) & 0x1) { - auto& ss_buf = print_bufs_[i]; - char c = data[i]; - ss_buf << c; - if (c == '\n') { - std::cout << std::dec << "#" << i << ": " << ss_buf.str() << std::flush; - ss_buf.str(""); + for (int i = 0; i < PLATFORM_MEMORY_DATA_SIZE; i++) { + if ((byteen >> i) & 0x1) { + (*ram_)[byte_addr + i] = data[i]; } } + + auto mem_req = new mem_req_t(); + mem_req->tag = device_->mem_req_tag[b]; + mem_req->addr = byte_addr; + mem_req->write = true; + mem_req->ready = false; + + // enqueue dram request + dram_queue_[b].push(mem_req); + + // add to pending list + pending_mem_reqs_[b].emplace_back(mem_req); } } else { - // process writes - /* - printf("%0ld: [sim] MEM Wr Req: tag=0x%0lx, addr=0x%0lx, byteen=0x", timestamp, device_->mem_req_tag, byte_addr); - 
for (int i = (MEM_BLOCK_SIZE/4)-1; i >= 0; --i) { - printf("%x", (int)((byteen >> (4 * i)) & 0xf)); - } - printf(", data=0x"); - for (int i = MEM_BLOCK_SIZE-1; i >= 0; --i) { - printf("%d=%02x,", i, data[i]); - } - printf("\n"); - */ - for (int i = 0; i < MEM_BLOCK_SIZE; i++) { - if ((byteen >> i) & 0x1) { - (*ram_)[byte_addr + i] = data[i]; - } - } - + // process memory reads auto mem_req = new mem_req_t(); - mem_req->tag = device_->mem_req_tag; + mem_req->tag = device_->mem_req_tag[b]; mem_req->addr = byte_addr; - mem_req->write = true; - mem_req->ready = true; + mem_req->write = false; + mem_req->ready = false; + ram_->read(mem_req->data.data(), byte_addr, PLATFORM_MEMORY_DATA_SIZE); - // send dram request - dram_queue_.push(mem_req); + /*printf("%0ld: [sim] MEM Rd Req[%d]: addr=0x%0lx, tag=0x%0lx, data=0x", timestamp, b, byte_addr, device_->mem_req_tag[b]); + for (int i = PLATFORM_MEMORY_DATA_SIZE-1; i >= 0; --i) { + printf("%02x", mem_req->data[i]); + } + printf("\n");*/ + + // enqueue dram request + dram_queue_[b].push(mem_req); + + // add to pending list + pending_mem_reqs_[b].emplace_back(mem_req); } - } else { - // process reads - auto mem_req = new mem_req_t(); - mem_req->tag = device_->mem_req_tag; - mem_req->addr = byte_addr; - mem_req->write = false; - mem_req->ready = false; - ram_->read(mem_req->block.data(), byte_addr, MEM_BLOCK_SIZE); - pending_mem_reqs_.emplace_back(mem_req); - - //printf("%0ld: [sim] MEM Rd Req: addr=0x%0lx, tag=0x%0lx\n", timestamp, byte_addr, device_->mem_req_tag); - - // send dram request - dram_queue_.push(mem_req); } } - - device_->mem_req_ready = running_; } -#endif - void dcr_bus_reset() { device_->dcr_wr_valid = 0; } - void dcr_bus_eval(bool clk) { - if (!clk) { - return; - } - if (device_->dcr_wr_valid) { - device_->dcr_wr_valid = 0; - } - } - void wait(uint32_t cycles) { for (int i = 0; i < cycles; ++i) { this->tick(); @@ -573,8 +383,8 @@ private: private: typedef struct { - Device* device; - std::array block; + 
Vrtlsim_shim* device; + std::array data; uint64_t addr; uint64_t tag; bool write; @@ -583,27 +393,21 @@ private: std::unordered_map print_bufs_; - std::list pending_mem_reqs_; + std::list pending_mem_reqs_[PLATFORM_MEMORY_NUM_BANKS]; - std::queue dram_queue_; + std::queue dram_queue_[PLATFORM_MEMORY_NUM_BANKS]; + + std::array mem_rd_rsp_ready_; DramSim dram_sim_; - Device* device_; + Vrtlsim_shim* device_; + + RAM* ram_; #ifdef VCD_OUTPUT VerilatedVcdC *tfp_; #endif - - RAM* ram_; - - bool mem_rd_rsp_active_; - bool mem_rd_rsp_ready_; - - bool mem_wr_rsp_active_; - bool mem_wr_rsp_ready_; - - bool running_; }; /////////////////////////////////////////////////////////////////////////////// diff --git a/sim/rtlsim/rtlsim_shim.sv b/sim/rtlsim/rtlsim_shim.sv new file mode 100644 index 000000000..1833034d4 --- /dev/null +++ b/sim/rtlsim/rtlsim_shim.sv @@ -0,0 +1,196 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_define.vh" + +module rtlsim_shim import VX_gpu_pkg::*; #( + parameter MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8), + parameter MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS), + parameter MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS, + parameter MEM_TAG_WIDTH = 64 +) ( + `SCOPE_IO_DECL + + // Clock + input wire clk, + input wire reset, + + // Memory request + output wire mem_req_valid [MEM_NUM_BANKS], + output wire mem_req_rw [MEM_NUM_BANKS], + output wire [(MEM_DATA_WIDTH/8)-1:0] mem_req_byteen [MEM_NUM_BANKS], + output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr [MEM_NUM_BANKS], + output wire [MEM_DATA_WIDTH-1:0] mem_req_data [MEM_NUM_BANKS], + output wire [MEM_TAG_WIDTH-1:0] mem_req_tag [MEM_NUM_BANKS], + input wire mem_req_ready [MEM_NUM_BANKS], + + // Memory response + input wire mem_rsp_valid [MEM_NUM_BANKS], + input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data [MEM_NUM_BANKS], + input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag [MEM_NUM_BANKS], + output wire mem_rsp_ready [MEM_NUM_BANKS], + + // DCR write request + input wire dcr_wr_valid, + input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr, + input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data, + + // Status + output wire busy +); + localparam DST_LDATAW = `CLOG2(MEM_DATA_WIDTH); + localparam SRC_LDATAW = `CLOG2(`VX_MEM_DATA_WIDTH); + localparam SUB_LDATAW = DST_LDATAW - SRC_LDATAW; + localparam VX_MEM_TAG_A_WIDTH = `VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0); + localparam VX_MEM_ADDR_A_WIDTH = `VX_MEM_ADDR_WIDTH - SUB_LDATAW; + + wire vx_mem_req_valid [`VX_MEM_PORTS]; + wire vx_mem_req_rw [`VX_MEM_PORTS]; + wire [`VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen [`VX_MEM_PORTS]; + wire [`VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr [`VX_MEM_PORTS]; + wire [`VX_MEM_DATA_WIDTH-1:0] vx_mem_req_data [`VX_MEM_PORTS]; + wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag [`VX_MEM_PORTS]; + wire vx_mem_req_ready [`VX_MEM_PORTS]; + + wire vx_mem_rsp_valid [`VX_MEM_PORTS]; + wire [`VX_MEM_DATA_WIDTH-1:0] 
vx_mem_rsp_data [`VX_MEM_PORTS]; + wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag [`VX_MEM_PORTS]; + wire vx_mem_rsp_ready [`VX_MEM_PORTS]; + + `SCOPE_IO_SWITCH (1); + + Vortex vortex ( + `SCOPE_IO_BIND (0) + + .clk (clk), + .reset (reset), + + .mem_req_valid (vx_mem_req_valid), + .mem_req_rw (vx_mem_req_rw), + .mem_req_byteen (vx_mem_req_byteen), + .mem_req_addr (vx_mem_req_addr), + .mem_req_data (vx_mem_req_data), + .mem_req_tag (vx_mem_req_tag), + .mem_req_ready (vx_mem_req_ready), + + .mem_rsp_valid (vx_mem_rsp_valid), + .mem_rsp_data (vx_mem_rsp_data), + .mem_rsp_tag (vx_mem_rsp_tag), + .mem_rsp_ready (vx_mem_rsp_ready), + + .dcr_wr_valid (dcr_wr_valid), + .dcr_wr_addr (dcr_wr_addr), + .dcr_wr_data (dcr_wr_data), + + .busy (busy) + ); + + wire mem_req_valid_a [`VX_MEM_PORTS]; + wire mem_req_rw_a [`VX_MEM_PORTS]; + wire [(MEM_DATA_WIDTH/8)-1:0] mem_req_byteen_a [`VX_MEM_PORTS]; + wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a [`VX_MEM_PORTS]; + wire [MEM_DATA_WIDTH-1:0] mem_req_data_a [`VX_MEM_PORTS]; + wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a [`VX_MEM_PORTS]; + wire mem_req_ready_a [`VX_MEM_PORTS]; + + wire mem_rsp_valid_a [`VX_MEM_PORTS]; + wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_a [`VX_MEM_PORTS]; + wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a [`VX_MEM_PORTS]; + wire mem_rsp_ready_a [`VX_MEM_PORTS]; + + // Adjust memory data width to match AXI interface + for (genvar i = 0; i < `VX_MEM_PORTS; i++) begin : g_mem_adapter + VX_mem_data_adapter #( + .SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH), + .DST_DATA_WIDTH (MEM_DATA_WIDTH), + .SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH), + .DST_ADDR_WIDTH (VX_MEM_ADDR_A_WIDTH), + .SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH), + .DST_TAG_WIDTH (VX_MEM_TAG_A_WIDTH), + .REQ_OUT_BUF (0), + .RSP_OUT_BUF (0) + ) mem_data_adapter ( + .clk (clk), + .reset (reset), + + .mem_req_valid_in (vx_mem_req_valid[i]), + .mem_req_addr_in (vx_mem_req_addr[i]), + .mem_req_rw_in (vx_mem_req_rw[i]), + .mem_req_byteen_in (vx_mem_req_byteen[i]), + .mem_req_data_in 
(vx_mem_req_data[i]), + .mem_req_tag_in (vx_mem_req_tag[i]), + .mem_req_ready_in (vx_mem_req_ready[i]), + + .mem_rsp_valid_in (vx_mem_rsp_valid[i]), + .mem_rsp_data_in (vx_mem_rsp_data[i]), + .mem_rsp_tag_in (vx_mem_rsp_tag[i]), + .mem_rsp_ready_in (vx_mem_rsp_ready[i]), + + .mem_req_valid_out (mem_req_valid_a[i]), + .mem_req_addr_out (mem_req_addr_a[i]), + .mem_req_rw_out (mem_req_rw_a[i]), + .mem_req_byteen_out (mem_req_byteen_a[i]), + .mem_req_data_out (mem_req_data_a[i]), + .mem_req_tag_out (mem_req_tag_a[i]), + .mem_req_ready_out (mem_req_ready_a[i]), + + .mem_rsp_valid_out (mem_rsp_valid_a[i]), + .mem_rsp_data_out (mem_rsp_data_a[i]), + .mem_rsp_tag_out (mem_rsp_tag_a[i]), + .mem_rsp_ready_out (mem_rsp_ready_a[i]) + ); + end + + VX_mem_bank_adapter #( + .DATA_WIDTH (MEM_DATA_WIDTH), + .ADDR_WIDTH_IN (VX_MEM_ADDR_A_WIDTH), + .ADDR_WIDTH_OUT (MEM_ADDR_WIDTH), + .TAG_WIDTH_IN (VX_MEM_TAG_A_WIDTH), + .TAG_WIDTH_OUT (MEM_TAG_WIDTH), + .NUM_PORTS_IN (`VX_MEM_PORTS), + .NUM_BANKS_OUT (MEM_NUM_BANKS), + .INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE), + .REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0), + .RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || MEM_NUM_BANKS > 1) ? 
2 : 0) + ) mem_bank_adapter ( + .clk (clk), + .reset (reset), + + .mem_req_valid_in (mem_req_valid_a), + .mem_req_rw_in (mem_req_rw_a), + .mem_req_byteen_in (mem_req_byteen_a), + .mem_req_addr_in (mem_req_addr_a), + .mem_req_data_in (mem_req_data_a), + .mem_req_tag_in (mem_req_tag_a), + .mem_req_ready_in (mem_req_ready_a), + + .mem_rsp_valid_in (mem_rsp_valid_a), + .mem_rsp_data_in (mem_rsp_data_a), + .mem_rsp_tag_in (mem_rsp_tag_a), + .mem_rsp_ready_in (mem_rsp_ready_a), + + .mem_req_valid_out (mem_req_valid), + .mem_req_rw_out (mem_req_rw), + .mem_req_byteen_out (mem_req_byteen), + .mem_req_addr_out (mem_req_addr), + .mem_req_data_out (mem_req_data), + .mem_req_tag_out (mem_req_tag), + .mem_req_ready_out (mem_req_ready), + + .mem_rsp_valid_out (mem_rsp_valid), + .mem_rsp_data_out (mem_rsp_data), + .mem_rsp_tag_out (mem_rsp_tag), + .mem_rsp_ready_out (mem_rsp_ready) + ); + +endmodule diff --git a/sim/rtlsim/verilator.vlt b/sim/rtlsim/verilator.vlt deleted file mode 100644 index 9cfccbeb4..000000000 --- a/sim/rtlsim/verilator.vlt +++ /dev/null @@ -1,5 +0,0 @@ -`verilator_config - -lint_off -rule BLKANDNBLK -file "*/fpnew/src/*" -lint_off -rule UNOPTFLAT -file "*/fpnew/src/*" -lint_off -file "*/fpnew/src/*" diff --git a/sim/rtlsim/verilator.vlt.in b/sim/rtlsim/verilator.vlt.in new file mode 100644 index 000000000..56de6b2cf --- /dev/null +++ b/sim/rtlsim/verilator.vlt.in @@ -0,0 +1,5 @@ +`verilator_config + +lint_off -rule BLKANDNBLK -file "@VORTEX_HOME@/third_party/cvfpu/*" +lint_off -rule UNOPTFLAT -file "@VORTEX_HOME@/third_party/cvfpu/*" +lint_off -file "@VORTEX_HOME@/third_party/cvfpu/*" diff --git a/sim/simx/Makefile b/sim/simx/Makefile index 31fde7023..83054edc4 100644 --- a/sim/simx/Makefile +++ b/sim/simx/Makefile @@ -17,9 +17,14 @@ CXXFLAGS += $(CONFIGS) LDFLAGS += $(THIRD_PARTY_DIR)/softfloat/build/Linux-x86_64-GCC/softfloat.a LDFLAGS += -Wl,-rpath,$(THIRD_PARTY_DIR)/ramulator -L$(THIRD_PARTY_DIR)/ramulator -lramulator -SRCS = $(COMMON_DIR)/util.cpp 
$(COMMON_DIR)/mem.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp +SRCS = $(COMMON_DIR)/util.cpp $(COMMON_DIR)/mem.cpp $(COMMON_DIR)/softfloat_ext.cpp $(COMMON_DIR)/rvfloats.cpp $(COMMON_DIR)/dram_sim.cpp SRCS += $(SRC_DIR)/processor.cpp $(SRC_DIR)/cluster.cpp $(SRC_DIR)/socket.cpp $(SRC_DIR)/core.cpp $(SRC_DIR)/emulator.cpp $(SRC_DIR)/decode.cpp $(SRC_DIR)/execute.cpp $(SRC_DIR)/func_unit.cpp $(SRC_DIR)/cache_sim.cpp $(SRC_DIR)/mem_sim.cpp $(SRC_DIR)/local_mem.cpp $(SRC_DIR)/mem_coalescer.cpp $(SRC_DIR)/dcrs.cpp $(SRC_DIR)/types.cpp +# Add V extension sources +ifneq ($(findstring -DEXT_V_ENABLE, $(CONFIGS)),) + SRCS += $(SRC_DIR)/vpu.cpp +endif + # Debugging ifdef DEBUG CXXFLAGS += -g -O0 -DDEBUG_LEVEL=$(DEBUG) diff --git a/sim/simx/arch.h b/sim/simx/arch.h index d72b4ce11..6becf5c91 100644 --- a/sim/simx/arch.h +++ b/sim/simx/arch.h @@ -33,7 +33,7 @@ private: uint64_t local_mem_base_; public: - Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores) + Arch(uint16_t num_threads, uint16_t num_warps, uint16_t num_cores) : num_threads_(num_threads) , num_warps_(num_warps) , num_cores_(num_cores) @@ -70,6 +70,7 @@ public: uint16_t socket_size() const { return socket_size_; } + }; } \ No newline at end of file diff --git a/sim/simx/cache_cluster.h b/sim/simx/cache_cluster.h index 63016577b..8c69c7e63 100644 --- a/sim/simx/cache_cluster.h +++ b/sim/simx/cache_cluster.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -21,81 +21,77 @@ class CacheCluster : public SimObject { public: std::vector>> CoreReqPorts; std::vector>> CoreRspPorts; - SimPort MemReqPort; - SimPort MemRspPort; + std::vector> MemReqPorts; + std::vector> MemRspPorts; - CacheCluster(const SimContext& ctx, - const char* name, - uint32_t num_inputs, - uint32_t num_caches, - uint32_t num_requests, - const CacheSim::Config& cache_config) + CacheCluster(const SimContext& ctx, + const char* name, + uint32_t num_inputs, + uint32_t num_units, + const CacheSim::Config& cache_config) : SimObject(ctx, name) - , CoreReqPorts(num_inputs, std::vector>(num_requests, this)) - , CoreRspPorts(num_inputs, std::vector>(num_requests, this)) - , MemReqPort(this) - , MemRspPort(this) - , caches_(MAX(num_caches, 0x1)) { + , CoreReqPorts(num_inputs, std::vector>(cache_config.num_inputs, this)) + , CoreRspPorts(num_inputs, std::vector>(cache_config.num_inputs, this)) + , MemReqPorts(cache_config.mem_ports, this) + , MemRspPorts(cache_config.mem_ports, this) + , caches_(MAX(num_units, 0x1)) { CacheSim::Config cache_config2(cache_config); - if (0 == num_caches) { - num_caches = 1; + if (0 == num_units) { + num_units = 1; cache_config2.bypass = true; } char sname[100]; - - std::vector input_arbs(num_inputs); - for (uint32_t j = 0; j < num_inputs; ++j) { - snprintf(sname, 100, "%s-input-arb%d", name, j); - input_arbs.at(j) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_requests, cache_config.num_inputs); - for (uint32_t i = 0; i < num_requests; ++i) { - this->CoreReqPorts.at(j).at(i).bind(&input_arbs.at(j)->ReqIn.at(i)); - input_arbs.at(j)->RspIn.at(i).bind(&this->CoreRspPorts.at(j).at(i)); - } - } - std::vector mem_arbs(cache_config.num_inputs); + // Arbitrate incoming core interfaces + std::vector input_arbs(cache_config.num_inputs); for (uint32_t i = 0; i < cache_config.num_inputs; ++i) { - snprintf(sname, 100, "%s-mem-arb%d", name, i); - mem_arbs.at(i) = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_inputs, 
num_caches); + snprintf(sname, 100, "%s-input-arb%d", name, i); + input_arbs.at(i) = MemArbiter::Create(sname, ArbiterType::RoundRobin, num_inputs, num_units); for (uint32_t j = 0; j < num_inputs; ++j) { - input_arbs.at(j)->ReqOut.at(i).bind(&mem_arbs.at(i)->ReqIn.at(j)); - mem_arbs.at(i)->RspIn.at(j).bind(&input_arbs.at(j)->RspOut.at(i)); + this->CoreReqPorts.at(j).at(i).bind(&input_arbs.at(i)->ReqIn.at(j)); + input_arbs.at(i)->RspIn.at(j).bind(&this->CoreRspPorts.at(j).at(i)); } } - snprintf(sname, 100, "%s-cache-arb", name); - auto cache_arb = MemSwitch::Create(sname, ArbiterType::RoundRobin, num_caches, 1); + // Arbitrate outgoing memory interfaces + std::vector mem_arbs(cache_config.mem_ports); + for (uint32_t i = 0; i < cache_config.mem_ports; ++i) { + snprintf(sname, 100, "%s-mem-arb%d", name, i); + mem_arbs.at(i) = MemArbiter::Create(sname, ArbiterType::RoundRobin, num_units, 1); + mem_arbs.at(i)->ReqOut.at(0).bind(&this->MemReqPorts.at(i)); + this->MemRspPorts.at(i).bind(&mem_arbs.at(i)->RspOut.at(0)); + } - for (uint32_t i = 0; i < num_caches; ++i) { + // Connect caches + for (uint32_t i = 0; i < num_units; ++i) { snprintf(sname, 100, "%s-cache%d", name, i); caches_.at(i) = CacheSim::Create(sname, cache_config2); for (uint32_t j = 0; j < cache_config.num_inputs; ++j) { - mem_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j)); - caches_.at(i)->CoreRspPorts.at(j).bind(&mem_arbs.at(j)->RspOut.at(i)); + input_arbs.at(j)->ReqOut.at(i).bind(&caches_.at(i)->CoreReqPorts.at(j)); + caches_.at(i)->CoreRspPorts.at(j).bind(&input_arbs.at(j)->RspOut.at(i)); } - caches_.at(i)->MemReqPort.bind(&cache_arb->ReqIn.at(i)); - cache_arb->RspIn.at(i).bind(&caches_.at(i)->MemRspPort); + for (uint32_t j = 0; j < cache_config.mem_ports; ++j) { + caches_.at(i)->MemReqPorts.at(j).bind(&mem_arbs.at(j)->ReqIn.at(i)); + mem_arbs.at(j)->RspIn.at(i).bind(&caches_.at(i)->MemRspPorts.at(j)); + } } - - cache_arb->ReqOut.at(0).bind(&this->MemReqPort); - 
this->MemRspPort.bind(&cache_arb->RspOut.at(0)); } ~CacheCluster() {} void reset() {} - + void tick() {} CacheSim::PerfStats perf_stats() const { CacheSim::PerfStats perf; for (auto cache : caches_) { perf += cache->perf_stats(); - } + } return perf; } diff --git a/sim/simx/cache_sim.cpp b/sim/simx/cache_sim.cpp index 65a8da70b..02997277f 100644 --- a/sim/simx/cache_sim.cpp +++ b/sim/simx/cache_sim.cpp @@ -169,6 +169,25 @@ struct bank_req_t { } }; +inline std::ostream &operator<<(std::ostream &os, const bank_req_t& req) { + os << "set=" << req.set_id << ", rw=" << req.write; + os << std::dec << ", type=" << req.type; + os << ", tag=0x" << std::hex << req.tag; + os << ", req_tags={"; + bool first_port = true; + for (auto& port : req.ports) { + if (port.valid) { + if (!first_port) os << ", "; + first_port = false; + os << "[" << std::dec << port.req_id << "]=0x" << std::hex << port.req_tag; + } + } + os << "}"; + os << std::dec << ", cid=" << req.cid; + os << " (#" << req.uuid << ")"; + return os; +} + struct mshr_entry_t { bank_req_t bank_req; uint32_t line_id; @@ -285,8 +304,8 @@ private: Config config_; params_t params_; std::vector banks_; - MemSwitch::Ptr bank_switch_; - MemSwitch::Ptr bypass_switch_; + MemArbiter::Ptr bank_arb_; + std::vector nc_arbs_; std::vector> mem_req_ports_; std::vector> mem_rsp_ports_; std::vector pipeline_reqs_; @@ -302,40 +321,51 @@ public: , config_(config) , params_(config) , banks_((1 << config.B), {config, params_}) + , nc_arbs_(config.mem_ports) , mem_req_ports_((1 << config.B), simobject) , mem_rsp_ports_((1 << config.B), simobject) , pipeline_reqs_((1 << config.B), config.ports_per_bank) { char sname[100]; - snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str()); if (config_.bypass) { - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, config_.num_inputs); + snprintf(sname, 100, "%s-bypass-arb", simobject->name().c_str()); + auto bypass_arb = MemArbiter::Create(sname, ArbiterType::RoundRobin, 
config_.num_inputs, config_.mem_ports); for (uint32_t i = 0; i < config_.num_inputs; ++i) { - simobject->CoreReqPorts.at(i).bind(&bypass_switch_->ReqIn.at(i)); - bypass_switch_->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); + simobject->CoreReqPorts.at(i).bind(&bypass_arb->ReqIn.at(i)); + bypass_arb->RspIn.at(i).bind(&simobject->CoreRspPorts.at(i)); + } + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + bypass_arb->ReqOut.at(i).bind(&simobject->MemReqPorts.at(i)); + simobject->MemRspPorts.at(i).bind(&bypass_arb->RspOut.at(i)); } - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); return; } - bypass_switch_ = MemSwitch::Create(sname, ArbiterType::Priority, 2); - bypass_switch_->ReqOut.at(0).bind(&simobject->MemReqPort); - simobject->MemRspPort.bind(&bypass_switch_->RspOut.at(0)); + // create non-cacheable arbiter + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + snprintf(sname, 100, "%s-nc-arb%d", simobject->name().c_str(), i); + nc_arbs_.at(i) = MemArbiter::Create(sname, ArbiterType::Priority, 2, 1); + } - if (config.B != 0) { - snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); - bank_switch_ = MemSwitch::Create(sname, ArbiterType::RoundRobin, (1 << config.B)); - for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { - mem_req_ports_.at(i).bind(&bank_switch_->ReqIn.at(i)); - bank_switch_->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); - } - bank_switch_->ReqOut.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&bank_switch_->RspOut.at(0)); - } else { - mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0)); - bypass_switch_->RspIn.at(0).bind(&mem_rsp_ports_.at(0)); + // Connect non-cacheable arbiter output to outgoing memory ports + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + nc_arbs_.at(i)->ReqOut.at(0).bind(&simobject->MemReqPorts.at(i)); + simobject->MemRspPorts.at(i).bind(&nc_arbs_.at(i)->RspOut.at(0)); + } + + // Create bank's memory 
arbiter + snprintf(sname, 100, "%s-bank-arb", simobject->name().c_str()); + auto bank_mem_arb = MemArbiter::Create(sname, ArbiterType::RoundRobin, (1 << config.B), config_.mem_ports); + for (uint32_t i = 0, n = (1 << config.B); i < n; ++i) { + mem_req_ports_.at(i).bind(&bank_mem_arb->ReqIn.at(i)); + bank_mem_arb->RspIn.at(i).bind(&mem_rsp_ports_.at(i)); + } + + // Connect bank's memory arbiter to non-cacheable arbiter's input 0 + for (uint32_t i = 0; i < config_.mem_ports; ++i) { + bank_mem_arb->ReqOut.at(i).bind(&nc_arbs_.at(i)->ReqIn.at(0)); + nc_arbs_.at(i)->RspIn.at(0).bind(&bank_mem_arb->RspOut.at(i)); } // calculate cache initialization cycles @@ -366,8 +396,8 @@ public: } // handle cache bypasss responses - { - auto& bypass_port = bypass_switch_->RspIn.at(1); + for (uint32_t i = 0, n = config_.mem_ports; i < n; ++i) { + auto& bypass_port = nc_arbs_.at(i)->RspIn.at(1); if (!bypass_port.empty()) { auto& mem_rsp = bypass_port.front(); this->processBypassResponse(mem_rsp); @@ -400,7 +430,7 @@ public: continue; auto& mem_rsp = mem_rsp_port.front(); - DT(3, simobject_->name() << "-bank" << bank_id << " fill-rsp: " << mem_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-fill-rsp: " << mem_rsp); pipeline_req.type = bank_req_t::Fill; pipeline_req.tag = mem_rsp.tag; mem_rsp_port.pop(); @@ -465,6 +495,7 @@ public: bank_req.type = bank_req_t::Core; bank_req.write = core_req.write; pipeline_req = bank_req; + DT(3, simobject_->name() << "-core-req: " << core_req); } if (core_req.write) @@ -492,21 +523,22 @@ private: uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs; MemRsp core_rsp{tag, mem_rsp.cid, mem_rsp.uuid}; simobject_->CoreRspPorts.at(req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << " core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bypass-core-rsp: " << core_rsp); } void processBypassRequest(const MemReq& core_req, uint32_t req_id) { { MemReq mem_req(core_req); mem_req.tag = (core_req.tag << params_.log2_num_inputs) + 
req_id; - bypass_switch_->ReqIn.at(1).push(mem_req, 1); - DT(3, simobject_->name() << " dram-req: " << mem_req); + uint32_t mem_port = req_id % config_.mem_ports; + nc_arbs_.at(mem_port)->ReqIn.at(1).push(mem_req, 1); + DT(3, simobject_->name() << "-bypass-dram-req: " << mem_req); } if (core_req.write && config_.write_reponse) { MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid}; simobject_->CoreRspPorts.at(req_id).push(core_rsp, 1); - DT(3, simobject_->name() << " core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bypass-core-rsp: " << core_rsp); } } @@ -536,7 +568,7 @@ private: continue; MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << "-bank" << bank_id << " replay: " << core_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-replay: " << core_rsp); } } } break; @@ -580,7 +612,7 @@ private: mem_req.cid = pipeline_req.cid; mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " writethrough: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-writethrough: " << mem_req); } else { // mark line as dirty hit_line.dirty = true; @@ -593,7 +625,7 @@ private: continue; MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << "-bank" << bank_id << " core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-core-rsp: " << core_rsp); } } } else { @@ -612,7 +644,7 @@ private: mem_req.write = true; mem_req.cid = pipeline_req.cid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " writeback: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-writeback: " << mem_req); ++perf_stats_.evictions; } } @@ -626,7 +658,7 @@ private: mem_req.cid = 
pipeline_req.cid; mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " writethrough: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-writethrough: " << mem_req); } // send core response if (config_.write_reponse) { @@ -635,7 +667,7 @@ private: continue; MemRsp core_rsp{info.req_tag, pipeline_req.cid, pipeline_req.uuid}; simobject_->CoreRspPorts.at(info.req_id).push(core_rsp, config_.latency); - DT(3, simobject_->name() << "-bank" << bank_id << " core-rsp: " << core_rsp); + DT(3, simobject_->name() << "-bank" << bank_id << "-core-rsp: " << core_rsp); } } } else { @@ -644,6 +676,7 @@ private: // allocate MSHR auto mshr_id = bank.mshr.allocate(pipeline_req, (free_line_id != -1) ? free_line_id : repl_line_id); + DT(3, simobject_->name() << "-bank" << bank_id << "-mshr-enqueue: " << pipeline_req); // send fill request if (!mshr_pending) { @@ -654,7 +687,7 @@ private: mem_req.cid = pipeline_req.cid; mem_req.uuid = pipeline_req.uuid; mem_req_ports_.at(bank_id).push(mem_req, 1); - DT(3, simobject_->name() << "-bank" << bank_id << " fill: " << mem_req); + DT(3, simobject_->name() << "-bank" << bank_id << "-fill: " << mem_req); ++pending_fill_reqs_; } } @@ -673,8 +706,8 @@ CacheSim::CacheSim(const SimContext& ctx, const char* name, const Config& config : SimObject(ctx, name) , CoreReqPorts(config.num_inputs, this) , CoreRspPorts(config.num_inputs, this) - , MemReqPort(this) - , MemRspPort(this) + , MemReqPorts(config.mem_ports, this) + , MemRspPorts(config.mem_ports, this) , impl_(new Impl(this, config)) {} diff --git a/sim/simx/cache_sim.h b/sim/simx/cache_sim.h index df62bf854..1e586fed7 100644 --- a/sim/simx/cache_sim.h +++ b/sim/simx/cache_sim.h @@ -30,6 +30,7 @@ public: uint8_t addr_width; // word address bits uint8_t ports_per_bank; // number of ports per bank uint8_t num_inputs; // number of inputs + uint8_t mem_ports; // memory ports bool write_back; // is write-back 
bool write_reponse; // enable write response uint16_t mshr_size; // MSHR buffer size @@ -75,8 +76,8 @@ public: std::vector> CoreReqPorts; std::vector> CoreRspPorts; - SimPort MemReqPort; - SimPort MemRspPort; + std::vector> MemReqPorts; + std::vector> MemRspPorts; CacheSim(const SimContext& ctx, const char* name, const Config& config); ~CacheSim(); diff --git a/sim/simx/cluster.cpp b/sim/simx/cluster.cpp index ec5e3f2b6..ebcaa3e39 100644 --- a/sim/simx/cluster.cpp +++ b/sim/simx/cluster.cpp @@ -20,9 +20,9 @@ Cluster::Cluster(const SimContext& ctx, ProcessorImpl* processor, const Arch &arch, const DCRS &dcrs) - : SimObject(ctx, "cluster") - , mem_req_port(this) - , mem_rsp_port(this) + : SimObject(ctx, StrFormat("cluster%d", cluster_id)) + , mem_req_ports(L2_MEM_PORTS, this) + , mem_rsp_ports(L2_MEM_PORTS, this) , cluster_id_(cluster_id) , processor_(processor) , sockets_(NUM_SOCKETS) @@ -35,31 +35,14 @@ Cluster::Cluster(const SimContext& ctx, // create sockets - snprintf(sname, 100, "cluster%d-icache-arb", cluster_id); - auto icache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); - - snprintf(sname, 100, "cluster%d-dcache-arb", cluster_id); - auto dcache_switch = MemSwitch::Create(sname, ArbiterType::RoundRobin, sockets_per_cluster); - for (uint32_t i = 0; i < sockets_per_cluster; ++i) { uint32_t socket_id = cluster_id * sockets_per_cluster + i; - auto socket = Socket::Create(socket_id, - this, - arch, - dcrs); - - socket->icache_mem_req_port.bind(&icache_switch->ReqIn.at(i)); - icache_switch->RspIn.at(i).bind(&socket->icache_mem_rsp_port); - - socket->dcache_mem_req_port.bind(&dcache_switch->ReqIn.at(i)); - dcache_switch->RspIn.at(i).bind(&socket->dcache_mem_rsp_port); - - sockets_.at(i) = socket; + sockets_.at(i) = Socket::Create(socket_id, this, arch, dcrs); } // Create l2cache - snprintf(sname, 100, "cluster%d-l2cache", cluster_id); + snprintf(sname, 100, "%s-l2cache", this->name().c_str()); l2cache_ = CacheSim::Create(sname, 
CacheSim::Config{ !L2_ENABLED, log2ceil(L2_CACHE_SIZE),// C @@ -69,21 +52,27 @@ Cluster::Cluster(const SimContext& ctx, log2ceil(L2_NUM_BANKS), // B XLEN, // address bits 1, // number of ports - 2, // request size + L2_NUM_REQS, // request size + L2_MEM_PORTS, // memory ports L2_WRITEBACK, // write-back false, // write response L2_MSHR_SIZE, // mshr size 2, // pipeline latency }); - l2cache_->MemReqPort.bind(&this->mem_req_port); - this->mem_rsp_port.bind(&l2cache_->MemRspPort); + // connect l2cache core interfaces + for (uint32_t i = 0; i < sockets_per_cluster; ++i) { + for (uint32_t j = 0; j < L1_MEM_PORTS; ++j) { + sockets_.at(i)->mem_req_ports.at(j).bind(&l2cache_->CoreReqPorts.at(i * L1_MEM_PORTS + j)); + l2cache_->CoreRspPorts.at(i * L1_MEM_PORTS + j).bind(&sockets_.at(i)->mem_rsp_ports.at(j)); + } + } - icache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(0)); - l2cache_->CoreRspPorts.at(0).bind(&icache_switch->RspOut.at(0)); - - dcache_switch->ReqOut.at(0).bind(&l2cache_->CoreReqPorts.at(1)); - l2cache_->CoreRspPorts.at(1).bind(&dcache_switch->RspOut.at(0)); + // connect l2cache memory interfaces + for (uint32_t i = 0; i < L2_MEM_PORTS; ++i) { + l2cache_->MemReqPorts.at(i).bind(&this->mem_req_ports.at(i)); + this->mem_rsp_ports.at(i).bind(&l2cache_->MemRspPorts.at(i)); + } } Cluster::~Cluster() { @@ -106,6 +95,14 @@ void Cluster::attach_ram(RAM* ram) { } } +#ifdef VM_ENABLE +void Cluster::set_satp(uint64_t satp) { + for (auto& socket : sockets_) { + socket->set_satp(satp); + } +} +#endif + bool Cluster::running() const { for (auto& socket : sockets_) { if (socket->running()) diff --git a/sim/simx/cluster.h b/sim/simx/cluster.h index 253c54fb4..d31aa1672 100644 --- a/sim/simx/cluster.h +++ b/sim/simx/cluster.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -32,13 +32,13 @@ public: CacheSim::PerfStats l2cache; }; - SimPort mem_req_port; - SimPort mem_rsp_port; + std::vector> mem_req_ports; + std::vector> mem_rsp_ports; - Cluster(const SimContext& ctx, + Cluster(const SimContext& ctx, uint32_t cluster_id, - ProcessorImpl* processor, - const Arch &arch, + ProcessorImpl* processor, + const Arch &arch, const DCRS &dcrs); ~Cluster(); @@ -57,18 +57,22 @@ public: void attach_ram(RAM* ram); + #ifdef VM_ENABLE + void set_satp(uint64_t satp); + #endif + bool running() const; - int get_exitcode() const; + int get_exitcode() const; void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); PerfStats perf_stats() const; - + private: uint32_t cluster_id_; ProcessorImpl* processor_; - std::vector sockets_; + std::vector sockets_; std::vector barriers_; CacheSim::Ptr l2cache_; uint32_t cores_per_socket_; diff --git a/sim/simx/constants.h b/sim/simx/constants.h index 09a509ce1..6a79722ae 100644 --- a/sim/simx/constants.h +++ b/sim/simx/constants.h @@ -13,6 +13,8 @@ #pragma once +#include + #ifndef RAM_PAGE_SIZE #define RAM_PAGE_SIZE 4096 #endif @@ -21,18 +23,19 @@ #define MEM_CLOCK_RATIO 1 #endif -#ifndef MEMORY_BANKS -#define MEMORY_BANKS 2 -#endif +inline constexpr int LSU_WORD_SIZE = (XLEN / 8); +inline constexpr int LSU_CHANNELS = NUM_LSU_LANES; +inline constexpr int LSU_NUM_REQS = (NUM_LSU_BLOCKS * LSU_CHANNELS); -#define LSU_WORD_SIZE (XLEN / 8) -#define LSU_CHANNELS NUM_LSU_LANES -#define LSU_NUM_REQS (NUM_LSU_BLOCKS * LSU_CHANNELS) +// The dcache uses coalesced memory blocks +inline constexpr int DCACHE_WORD_SIZE = LSU_LINE_SIZE; +inline constexpr int DCACHE_CHANNELS = UP((NUM_LSU_LANES * (XLEN / 8)) / 
DCACHE_WORD_SIZE); +inline constexpr int DCACHE_NUM_REQS = (NUM_LSU_BLOCKS * DCACHE_CHANNELS); -#define DCACHE_WORD_SIZE LSU_LINE_SIZE -#define DCACHE_CHANNELS UP((NUM_LSU_LANES * (XLEN / 8)) / DCACHE_WORD_SIZE) -#define DCACHE_NUM_REQS (NUM_LSU_BLOCKS * DCACHE_CHANNELS) +inline constexpr int NUM_SOCKETS = UP(NUM_CORES / SOCKET_SIZE); -#define NUM_SOCKETS UP(NUM_CORES / SOCKET_SIZE) +inline constexpr int L2_NUM_REQS = NUM_SOCKETS * L1_MEM_PORTS; -#define PER_ISSUE_WARPS NUM_WARPS / ISSUE_WIDTH \ No newline at end of file +inline constexpr int L3_NUM_REQS = NUM_CLUSTERS * L2_MEM_PORTS; + +inline constexpr int PER_ISSUE_WARPS = NUM_WARPS / ISSUE_WIDTH; \ No newline at end of file diff --git a/sim/simx/core.cpp b/sim/simx/core.cpp index 04f6abf57..65609a9aa 100644 --- a/sim/simx/core.cpp +++ b/sim/simx/core.cpp @@ -30,7 +30,7 @@ Core::Core(const SimContext& ctx, Socket* socket, const Arch &arch, const DCRS &dcrs) - : SimObject(ctx, "core") + : SimObject(ctx, StrFormat("core%d", core_id)) , icache_req_ports(1, this) , icache_rsp_ports(1, this) , dcache_req_ports(DCACHE_NUM_REQS, this) @@ -44,10 +44,8 @@ Core::Core(const SimContext& ctx, , operands_(ISSUE_WIDTH) , dispatchers_((uint32_t)FUType::Count) , func_units_((uint32_t)FUType::Count) - , lsu_demux_(NUM_LSU_BLOCKS) + , lmem_switch_(NUM_LSU_BLOCKS) , mem_coalescers_(NUM_LSU_BLOCKS) - , lsu_dcache_adapter_(NUM_LSU_BLOCKS) - , lsu_lmem_adapter_(NUM_LSU_BLOCKS) , pending_icache_(arch_.num_warps()) , commit_arbs_(ISSUE_WIDTH) { @@ -59,68 +57,72 @@ Core::Core(const SimContext& ctx, // create the memory coalescer for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-coalescer%d", core_id, i); + snprintf(sname, 100, "%s-coalescer%d", this->name().c_str(), i); mem_coalescers_.at(i) = MemCoalescer::Create(sname, LSU_CHANNELS, DCACHE_CHANNELS, DCACHE_WORD_SIZE, LSUQ_OUT_SIZE, 1); } // create local memory - snprintf(sname, 100, "core%d-local_mem", core_id); + snprintf(sname, 100, "%s-lmem", 
this->name().c_str()); local_mem_ = LocalMem::Create(sname, LocalMem::Config{ (1 << LMEM_LOG_SIZE), LSU_WORD_SIZE, - LSU_NUM_REQS, + LSU_CHANNELS, log2ceil(LMEM_NUM_BANKS), false }); - // create lsu demux + // create lmem switch for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-lsu_demux%d", core_id, i); - lsu_demux_.at(i) = LocalMemDemux::Create(sname, 1); + snprintf(sname, 100, "%s-lmem_switch%d", this->name().c_str(), i); + lmem_switch_.at(i) = LocalMemSwitch::Create(sname, 1); } - // create lsu dcache adapter + // create dcache adapter + std::vector lsu_dcache_adapter(NUM_LSU_BLOCKS); for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-lsu_dcache_adapter%d", core_id, i); - lsu_dcache_adapter_.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1); + snprintf(sname, 100, "%s-lsu_dcache_adapter%d", this->name().c_str(), i); + lsu_dcache_adapter.at(i) = LsuMemAdapter::Create(sname, DCACHE_CHANNELS, 1); } - // create lsu lmem adapter - for (uint32_t i = 0; i < NUM_LSU_BLOCKS; ++i) { - snprintf(sname, 100, "core%d-lsu_lmem_adapter%d", core_id, i); - lsu_lmem_adapter_.at(i) = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1); - } + // create lmem arbiter + snprintf(sname, 100, "%s-lmem_arb", this->name().c_str()); + auto lmem_arb = LsuArbiter::Create(sname, ArbiterType::RoundRobin, NUM_LSU_BLOCKS, 1); - // connect lsu demux + // create lmem adapter + snprintf(sname, 100, "%s-lsu_lmem_adapter", this->name().c_str()); + auto lsu_lmem_adapter = LsuMemAdapter::Create(sname, LSU_CHANNELS, 1); + + // connect lmem switch for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { - lsu_demux_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn); - mem_coalescers_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspDC); + lmem_switch_.at(b)->ReqDC.bind(&mem_coalescers_.at(b)->ReqIn); + lmem_switch_.at(b)->ReqLmem.bind(&lmem_arb->ReqIn.at(b)); - lsu_demux_.at(b)->ReqLmem.bind(&lsu_lmem_adapter_.at(b)->ReqIn); - 
lsu_lmem_adapter_.at(b)->RspIn.bind(&lsu_demux_.at(b)->RspLmem); + mem_coalescers_.at(b)->RspIn.bind(&lmem_switch_.at(b)->RspDC); + lmem_arb->RspIn.at(b).bind(&lmem_switch_.at(b)->RspLmem); } - // connect coalescer-adapter + // connect lmem arbiter + lmem_arb->ReqOut.at(0).bind(&lsu_lmem_adapter->ReqIn); + lsu_lmem_adapter->RspIn.bind(&lmem_arb->RspOut.at(0)); + + // connect lmem adapter + for (uint32_t c = 0; c < LSU_CHANNELS; ++c) { + lsu_lmem_adapter->ReqOut.at(c).bind(&local_mem_->Inputs.at(c)); + local_mem_->Outputs.at(c).bind(&lsu_lmem_adapter->RspOut.at(c)); + } + + // connect dcache coalescer for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { - mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter_.at(b)->ReqIn); - lsu_dcache_adapter_.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut); + mem_coalescers_.at(b)->ReqOut.bind(&lsu_dcache_adapter.at(b)->ReqIn); + lsu_dcache_adapter.at(b)->RspIn.bind(&mem_coalescers_.at(b)->RspOut); } - // connect adapter-dcache + // connect dcache adapter for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { for (uint32_t c = 0; c < DCACHE_CHANNELS; ++c) { uint32_t i = b * DCACHE_CHANNELS + c; - lsu_dcache_adapter_.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i)); - dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter_.at(b)->RspOut.at(c)); - } - } - - // connect adapter-lmem - for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { - for (uint32_t c = 0; c < LSU_CHANNELS; ++c) { - uint32_t i = b * LSU_CHANNELS + c; - lsu_lmem_adapter_.at(b)->ReqOut.at(c).bind(&local_mem_->Inputs.at(i)); - local_mem_->Outputs.at(i).bind(&lsu_lmem_adapter_.at(b)->RspOut.at(c)); + lsu_dcache_adapter.at(b)->ReqOut.at(c).bind(&dcache_req_ports.at(i)); + dcache_rsp_ports.at(i).bind(&lsu_dcache_adapter.at(b)->RspOut.at(c)); } } @@ -129,17 +131,19 @@ Core::Core(const SimContext& ctx, dispatchers_.at((int)FUType::FPU) = SimPlatform::instance().create_object(arch, 2, NUM_FPU_BLOCKS, NUM_FPU_LANES); dispatchers_.at((int)FUType::LSU) = 
SimPlatform::instance().create_object(arch, 2, NUM_LSU_BLOCKS, NUM_LSU_LANES); dispatchers_.at((int)FUType::SFU) = SimPlatform::instance().create_object(arch, 2, NUM_SFU_BLOCKS, NUM_SFU_LANES); + dispatchers_.at((int)FUType::TCU) = SimPlatform::instance().create_object(arch, 2, NUM_TCU_BLOCKS, NUM_TCU_LANES); // initialize execute units func_units_.at((int)FUType::ALU) = SimPlatform::instance().create_object(this); func_units_.at((int)FUType::FPU) = SimPlatform::instance().create_object(this); func_units_.at((int)FUType::LSU) = SimPlatform::instance().create_object(this); func_units_.at((int)FUType::SFU) = SimPlatform::instance().create_object(this); + func_units_.at((int)FUType::TCU) = SimPlatform::instance().create_object(this); // bind commit arbiters for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { - snprintf(sname, 100, "core%d-commit-arb%d", core_id, i); - auto arbiter = TraceSwitch::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1); + snprintf(sname, 100, "%s-commit-arb%d", this->name().c_str(), i); + auto arbiter = TraceArbiter::Create(sname, ArbiterType::RoundRobin, (uint32_t)FUType::Count, 1); for (uint32_t j = 0; j < (uint32_t)FUType::Count; ++j) { func_units_.at(j)->Outputs.at(i).bind(&arbiter->Inputs.at(j)); } @@ -428,3 +432,10 @@ bool Core::wspawn(uint32_t num_warps, Word nextPC) { void Core::attach_ram(RAM* ram) { emulator_.attach_ram(ram); } + +#ifdef VM_ENABLE +void Core::set_satp(uint64_t satp) { + emulator_.set_satp(satp); //JAEWON wit, tid??? + // emulator_.set_csr(VX_CSR_SATP,satp,0,0); //JAEWON wit, tid??? 
+} +#endif \ No newline at end of file diff --git a/sim/simx/core.h b/sim/simx/core.h index c0e3d5de8..1bd0571bc 100644 --- a/sim/simx/core.h +++ b/sim/simx/core.h @@ -26,6 +26,7 @@ #include "dispatcher.h" #include "func_unit.h" #include "mem_coalescer.h" +#include "VX_config.h" namespace vortex { @@ -33,7 +34,7 @@ class Socket; class Arch; class DCRS; -using TraceSwitch = Mux; +using TraceArbiter = Arbiter; class Core : public SimObject { public: @@ -98,6 +99,9 @@ public: void tick(); void attach_ram(RAM* ram); +#ifdef VM_ENABLE + void set_satp(uint64_t satp); +#endif bool running() const; @@ -123,6 +127,10 @@ public: return local_mem_; } + const MemCoalescer::Ptr& mem_coalescer(uint32_t idx) const { + return mem_coalescers_.at(idx); + } + const PerfStats& perf_stats() const { return perf_stats_; } @@ -150,10 +158,8 @@ private: std::vector dispatchers_; std::vector func_units_; LocalMem::Ptr local_mem_; - std::vector lsu_demux_; + std::vector lmem_switch_; std::vector mem_coalescers_; - std::vector lsu_dcache_adapter_; - std::vector lsu_lmem_adapter_; PipelineLatch fetch_latch_; PipelineLatch decode_latch_; @@ -165,7 +171,7 @@ private: PerfStats perf_stats_; - std::vector commit_arbs_; + std::vector commit_arbs_; uint32_t commit_exe_; uint32_t ibuffer_idx_; @@ -174,6 +180,7 @@ private: friend class AluUnit; friend class FpuUnit; friend class SfuUnit; + friend class TcuUnit; }; } // namespace vortex diff --git a/sim/simx/decode.cpp b/sim/simx/decode.cpp index dba57c4ef..521dcf3a2 100644 --- a/sim/simx/decode.cpp +++ b/sim/simx/decode.cpp @@ -47,37 +47,12 @@ static const std::unordered_map sc_instTable = { {Opcode::FMSUB, InstType::R4}, {Opcode::FMNMADD, InstType::R4}, {Opcode::FMNMSUB, InstType::R4}, + {Opcode::VSET, InstType::V}, {Opcode::EXT1, InstType::R}, {Opcode::EXT2, InstType::R4}, {Opcode::R_W, InstType::R}, {Opcode::I_W, InstType::I}, -}; - -enum Constants { - width_opcode= 7, - width_reg = 5, - width_func2 = 2, - width_func3 = 3, - width_func7 = 7, - 
width_i_imm = 12, - width_j_imm = 20, - - shift_opcode= 0, - shift_rd = width_opcode, - shift_func3 = shift_rd + width_reg, - shift_rs1 = shift_func3 + width_func3, - shift_rs2 = shift_rs1 + width_reg, - shift_func2 = shift_rs2 + width_reg, - shift_func7 = shift_rs2 + width_reg, - shift_rs3 = shift_func7 + width_func2, - - mask_opcode = (1 << width_opcode) - 1, - mask_reg = (1 << width_reg) - 1, - mask_func2 = (1 << width_func2) - 1, - mask_func3 = (1 << width_func3) - 1, - mask_func7 = (1 << width_func7) - 1, - mask_i_imm = (1 << width_i_imm) - 1, - mask_j_imm = (1 << width_j_imm) - 1, + {Opcode::TCU, InstType::I}, }; static const char* op_string(const Instr &instr) { @@ -86,7 +61,7 @@ static const char* op_string(const Instr &instr) { auto func3 = instr.getFunc3(); auto func7 = instr.getFunc7(); auto rd = instr.getRDest(); - auto rs2 = instr.getRSrc(1); + auto rs1 = instr.getRSrc(1); auto imm = instr.getImm(); switch (opcode) { @@ -229,10 +204,14 @@ static const char* op_string(const Instr &instr) { case Opcode::FENCE: return "FENCE"; case Opcode::FL: switch (func3) { - case 0x1: return "VL"; case 0x2: return "FLW"; case 0x3: return "FLD"; + case 0x0: return "VL8"; + case 0x5: return "VL16"; + case 0x6: return "VL32"; + case 0x7: return "VL64"; default: + std::cout << "Could not decode float/vector load with func3: " << func3 << std::endl; std::abort(); } case Opcode::FS: @@ -240,7 +219,12 @@ static const char* op_string(const Instr &instr) { case 0x1: return "VS"; case 0x2: return "FSW"; case 0x3: return "FSD"; + case 0x0: return "VS8"; + case 0x5: return "VS16"; + case 0x6: return "VS32"; + case 0x7: return "VS64"; default: + std::cout << "Could not decode float/vector store with func3: " << func3 << std::endl; std::abort(); } case Opcode::AMO: { @@ -343,7 +327,7 @@ static const char* op_string(const Instr &instr) { std::abort(); } case 0x60: - switch (rs2) { + switch (rs1) { case 0: return "FCVT.W.S"; case 1: return "FCVT.WU.S"; case 2: return "FCVT.L.S"; @@ 
-352,7 +336,7 @@ static const char* op_string(const Instr &instr) { std::abort(); } case 0x61: - switch (rs2) { + switch (rs1) { case 0: return "FCVT.W.D"; case 1: return "FCVT.WU.D"; case 2: return "FCVT.L.D"; @@ -361,7 +345,7 @@ static const char* op_string(const Instr &instr) { std::abort(); } case 0x68: - switch (rs2) { + switch (rs1) { case 0: return "FCVT.S.W"; case 1: return "FCVT.S.WU"; case 2: return "FCVT.S.L"; @@ -370,7 +354,7 @@ static const char* op_string(const Instr &instr) { std::abort(); } case 0x69: - switch (rs2) { + switch (rs1) { case 0: return "FCVT.D.W"; case 1: return "FCVT.D.WU"; case 2: return "FCVT.D.L"; @@ -389,13 +373,14 @@ static const char* op_string(const Instr &instr) { case Opcode::FMSUB: return func2 ? "FMSUB.D" : "FMSUB.S"; case Opcode::FMNMADD: return func2 ? "FNMADD.D" : "FNMADD.S"; case Opcode::FMNMSUB: return func2 ? "FNMSUB.D" : "FNMSUB.S"; + case Opcode::VSET: return "VSET"; case Opcode::EXT1: switch (func7) { case 0: switch (func3) { case 0: return "TMC"; case 1: return "WSPAWN"; - case 2: return rs2 ? "SPLIT.N" : "SPLIT"; + case 2: return rs1 ? "SPLIT.N" : "SPLIT"; case 3: return "JOIN"; case 4: return "BAR"; case 5: return rd ? 
"PRED.N" : "PRED"; @@ -405,11 +390,51 @@ static const char* op_string(const Instr &instr) { default: std::abort(); } + + case Opcode::TCU: + switch(func3) + { + case 0: return "ML"; // Matrix Load + case 1: return "MS"; // Matrix Store + case 2: return "MATMUL"; // Matrix Multiply + default: + std::abort(); + } default: std::abort(); } } +#ifdef EXT_V_ENABLE +inline void print_vec_attr(std::ostream &os, const Instr &instr) { + uint32_t mask = instr.getVattrMask(); + if (mask & vattr_vlswidth) + os << ", width:" << instr.getVlsWidth(); + if (mask & vattr_vmop) + os << ", mop:" << instr.getVmop(); + if (mask & vattr_vumop) + os << ", umop:" << instr.getVumop(); + if (mask & vattr_vnf) + os << ", nf:" << instr.getVnf(); + if (mask & vattr_vmask) + os << ", vmask:" << instr.getVmask(); + if (mask & vattr_vs3) + os << ", vs3:" << instr.getVs3(); + if (mask & vattr_zimm) + os << ", zimm:" << ((instr.hasZimm()) ? "true" : "false"); + if (mask & vattr_vlmul) + os << ", lmul:" << instr.getVlmul(); + if (mask & vattr_vsew) + os << ", sew:" << instr.getVsew(); + if (mask & vattr_vta) + os << ", ta:" << instr.getVta(); + if (mask & vattr_vma) + os << ", ma:" << instr.getVma(); + if (mask & vattr_vediv) + os << ", ediv:" << instr.getVediv(); +} +#endif + namespace vortex { std::ostream &operator<<(std::ostream &os, const Instr &instr) { os << op_string(instr); @@ -430,6 +455,17 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) { if (sep++ != 0) { os << ", "; } else { os << " "; } os << "0x" << std::hex << instr.getImm() << std::dec; } +#ifdef EXT_V_ENABLE + if (instr.getOpcode() == Opcode::SYS && instr.getFunc3() >= 5) { + // CSRs with immediate values + if (sep++ != 0) { os << ", "; } else { os << " "; } + os << "0x" << std::hex << instr.getRSrc(0); + } + // Log vector-specific attributes + if (instr.getVattrMask() != 0) { + print_vec_attr(os, instr); + } +#endif return os; } } @@ -441,7 +477,9 @@ std::shared_ptr Emulator::decode(uint32_t code) const { auto 
func2 = (code >> shift_func2) & mask_func2; auto func3 = (code >> shift_func3) & mask_func3; + auto func6 = (code >> shift_func6) & mask_func6; auto func7 = (code >> shift_func7) & mask_func7; + __unused(func6); auto rd = (code >> shift_rd) & mask_reg; auto rs1 = (code >> shift_rs1) & mask_reg; @@ -455,11 +493,22 @@ std::shared_ptr Emulator::decode(uint32_t code) const { } auto iType = op_it->second; + if (op == Opcode::FL || op == Opcode::FS) { + if (func3 != 0x2 && func3 != 0x3) { + iType = InstType::V; + } + } + switch (iType) { case InstType::R: switch (op) { case Opcode::FCI: switch (func7) { + case 0x20: // FCVT.S.D + case 0x21: // FCVT.D.S + instr->setDestReg(rd, RegType::Float); + instr->addSrcReg(rs1, RegType::Float); + break; case 0x2c: // FSQRT.S case 0x2d: // FSQRT.D instr->setDestReg(rd, RegType::Float); @@ -543,6 +592,14 @@ std::shared_ptr Emulator::decode(uint32_t code) const { case InstType::I: { switch (op) { + case Opcode::TCU: { + instr->setDestReg(rs1, RegType::Integer); + instr->addSrcReg(rs1, RegType::Integer); + instr->setFunc3(func3); + instr->setFunc7(func7); + auto imm = code >> shift_rs2; + instr->setImm(sext(imm, width_i_imm)); + } break; case Opcode::I: case Opcode::I_W: case Opcode::JALR: @@ -636,14 +693,112 @@ std::shared_ptr Emulator::decode(uint32_t code) const { instr->setImm(sext(imm, width_j_imm+1)); } break; - case InstType::R4: + case InstType::R4: { instr->setDestReg(rd, RegType::Float); instr->addSrcReg(rs1, RegType::Float); instr->addSrcReg(rs2, RegType::Float); instr->addSrcReg(rs3, RegType::Float); instr->setFunc2(func2); instr->setFunc3(func3); + } break; + +#ifdef EXT_V_ENABLE + case InstType::V: + switch (op) { + case Opcode::VSET: { + instr->setDestReg(rd, RegType::Integer); + instr->setFunc3(func3); + switch (func3) { + case 7: { + if (code >> (shift_vset - 1) == 0b10) { // vsetvl + instr->addSrcReg(rs1, RegType::Integer); + instr->addSrcReg(rs2, RegType::Integer); + } else { + auto zimm = (code >> shift_rs2) & 
mask_v_zimm; + instr->setZimm(true); + instr->setVlmul(zimm & mask_v_lmul); + instr->setVsew((zimm >> shift_v_sew) & mask_v_sew); + instr->setVta((zimm >> shift_v_ta) & mask_v_ta); + instr->setVma((zimm >> shift_v_ma) & mask_v_ma); + if ((code >> shift_vset)) { // vsetivli + instr->setImm(rs1); + } else { // vsetvli + instr->addSrcReg(rs1, RegType::Integer); + } + } + } break; + case 3: { // Vector - immediate arithmetic instructions + instr->setDestReg(rd, RegType::Vector); + instr->addSrcReg(rs2, RegType::Vector); + instr->setImm(rs1); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setFunc6(func6); + } break; + default: { // Vector - vector/scalar arithmetic instructions + if (func3 == 1 && func6 == 16) { + instr->setDestReg(rd, RegType::Float); + } else if (func3 == 2 && func6 == 16) { + instr->setDestReg(rd, RegType::Integer); + } else { + instr->setDestReg(rd, RegType::Vector); + } + instr->addSrcReg(rs1, RegType::Vector); + instr->addSrcReg(rs2, RegType::Vector); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setFunc6(func6); + } + } + } break; + case Opcode::FL: + instr->addSrcReg(rs1, RegType::Integer); + instr->setVmop((code >> shift_vmop) & 0b11); + switch (instr->getVmop()) { + case 0b00: + instr->setVumop(rs2); + break; + case 0b10: + instr->addSrcReg(rs2, RegType::Integer); + break; + case 0b01: + case 0b11: + instr->addSrcReg(rs2, RegType::Vector); + break; + } + instr->setVsew(func3 & 0x3); + instr->setDestReg(rd, RegType::Vector); + instr->setVlsWidth(func3); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setVnf((code >> shift_vnf) & mask_func3); + break; + + case Opcode::FS: + instr->addSrcReg(rs1, RegType::Integer); + instr->setVmop((code >> shift_vmop) & 0b11); + switch (instr->getVmop()) { + case 0b00: + instr->setVumop(rs2); + break; + case 0b10: + instr->addSrcReg(rs2, RegType::Integer); + break; + case 0b01: + case 0b11: + instr->addSrcReg(rs2, RegType::Vector); + break; + } + instr->setVsew(func3 & 0x3); + 
instr->addSrcReg(rd, RegType::Vector); + instr->setVlsWidth(func3); + instr->setVmask((code >> shift_func7) & 0x1); + instr->setVmop((code >> shift_vmop) & 0b11); + instr->setVnf((code >> shift_vnf) & mask_func3); + break; + + default: + std::abort(); + } break; + #endif default: std::abort(); diff --git a/sim/simx/emulator.cpp b/sim/simx/emulator.cpp index 7ed9a10f9..ee297279a 100644 --- a/sim/simx/emulator.cpp +++ b/sim/simx/emulator.cpp @@ -30,20 +30,12 @@ using namespace vortex; -Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask, Word PC) - : tmask(tmask) - , PC(PC) - , fallthrough(false) -{} - -Emulator::ipdom_entry_t::ipdom_entry_t(const ThreadMask &tmask) - : tmask(tmask) - , fallthrough(true) -{} - Emulator::warp_t::warp_t(const Arch& arch) : ireg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) , freg_file(arch.num_threads(), std::vector(MAX_NUM_REGS)) +#ifdef EXT_V_ENABLE + , vreg_file(MAX_NUM_REGS, std::vector(MAX_NUM_REGS)) +#endif , uuid(0) {} @@ -53,8 +45,6 @@ void Emulator::warp_t::clear(uint64_t startup_addr) { this->uuid = 0; this->fcsr = 0; - std::srand(50); - for (auto& reg_file : this->ireg_file) { for (auto& reg : reg_file) { #ifndef NDEBUG @@ -75,6 +65,21 @@ void Emulator::warp_t::clear(uint64_t startup_addr) { #endif } } + +#ifdef EXT_V_ENABLE + for (auto& reg_file : this->vreg_file) { + for (auto& reg : reg_file) { + #ifndef NDEBUG + reg = 0; + #else + reg = std::rand(); + #endif + } + } + this->vtype = {0, 0, 0, 0, 0}; + this->vl = 0; + this->vlmax = 0; +#endif } /////////////////////////////////////////////////////////////////////////////// @@ -85,8 +90,23 @@ Emulator::Emulator(const Arch &arch, const DCRS &dcrs, Core* core) , core_(core) , warps_(arch.num_warps(), arch) , barriers_(arch.num_barriers(), 0) - , ipdom_size_((arch.num_threads()-1) * 2) + , ipdom_size_(arch.num_threads()-1) + // [TBC] Currently, tradeoff between scratchpad size & performance has not been evaluated. 
Scratchpad is + // considered to be big enough to hold input tiles for one output tile. + // In future versions, scratchpad size should be fixed to an appropriate value. + , scratchpad(std::vector(32 * 32 * 32768)) + #ifdef EXT_V_ENABLE + , csrs_(arch.num_warps()) + #endif { + std::srand(50); + +#ifdef EXT_V_ENABLE + for (uint32_t i = 0; i < arch_.num_warps(); ++i) { + csrs_.at(i).resize(arch.num_threads()); + } +#endif + this->clear(); } @@ -122,12 +142,16 @@ void Emulator::clear() { active_warps_.set(0); warps_[0].tmask.set(0); wspawn_.valid = false; + + for (auto& reg : scratchpad) { + reg = 0; + } } void Emulator::attach_ram(RAM* ram) { // bind RAM to memory unit #if (XLEN == 64) - mmu_.attach(*ram, 0, 0xFFFFFFFFFFFFFFFF); + mmu_.attach(*ram, 0, 0x7FFFFFFFFF); //39bit SV39 #else mmu_.attach(*ram, 0, 0xFFFFFFFF); #endif @@ -166,6 +190,7 @@ instr_trace_t* Emulator::step() { assert(warp.tmask.any()); #ifndef NDEBUG + // generate unique universal instruction ID uint32_t instr_uuid = warp.uuid++; uint32_t g_wid = core_->id() * arch_.num_warps() + scheduled_warp; uint64_t uuid = (uint64_t(g_wid) << 32) | instr_uuid; @@ -173,10 +198,8 @@ instr_trace_t* Emulator::step() { uint64_t uuid = 0; #endif - DPH(1, "Fetch: cid=" << core_->id() << ", wid=" << scheduled_warp << ", tmask="); - for (uint32_t i = 0, n = arch_.num_threads(); i < n; ++i) - DPN(1, warp.tmask.test(i)); - DPN(1, ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << uuid << ")" << std::endl); + DP(1, "Fetch: cid=" << core_->id() << ", wid=" << scheduled_warp << ", tmask=" << ThreadMaskOS(warp.tmask, arch_.num_threads()) + << ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << uuid << ")"); // Fetch uint32_t instr_code = 0; @@ -280,10 +303,53 @@ bool Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) { return false; } +#ifdef VM_ENABLE +void Emulator::icache_read(void *data, uint64_t addr, uint32_t size) { + DP(3, "*** icache_read 0x" << std::hex << addr << ", size = 0x " << size); + 
try + { + mmu_.read(data, addr, size, ACCESS_TYPE::FETCH); + } + catch (Page_Fault_Exception& page_fault) + { + std::cout<local_mem()->read(data, addr, size); + } else { + try + { + mmu_.read(data, addr, size, ACCESS_TYPE::LOAD); + } + catch (Page_Fault_Exception& page_fault) + { + std::cout<= uint64_t(IO_COUT_ADDR) + && addr < (uint64_t(IO_COUT_ADDR) + IO_COUT_SIZE)) { + this->writeToStdOut(data, addr, size); + } else { + if (type == AddrType::Shared) { + core_->local_mem()->write(data, addr, size); + } else { + try + { + // mmu_.write(data, addr, size, 0); + mmu_.write(data, addr, size, ACCESS_TYPE::STORE); + } + catch (Page_Fault_Exception& page_fault) + { + std::cout<= uint64_t(IO_COUT_ADDR) @@ -309,6 +401,7 @@ void Emulator::dcache_write(const void* data, uint64_t addr, uint32_t size) { } DPH(2, "Mem Write: addr=0x" << std::hex << addr << ", data=0x" << ByteStream(data, size) << std::dec << " (size=" << size << ", type=" << type << ")" << std::endl); } +#endif void Emulator::dcache_amo_reserve(uint64_t addr) { auto type = get_addr_type(addr); @@ -356,10 +449,26 @@ void Emulator::cout_flush() { case (addr + (VX_CSR_MPM_BASE_H-VX_CSR_MPM_BASE)) : return ((value >> 32) & 0xFFFFFFFF) #endif +Word Emulator::get_tiles() { + return mat_size; +} + +Word Emulator::get_tc_size() { + return tc_size; +} + +Word Emulator::get_tc_num() { + return tc_num; +} + Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { auto core_perf = core_->perf_stats(); switch (addr) { case VX_CSR_SATP: +#ifdef VM_ENABLE + // return csrs_.at(wid).at(tid)[addr]; + return mmu_.get_satp(); +#endif case VX_CSR_PMPCFG0: case VX_CSR_PMPADDR0: case VX_CSR_MSTATUS: @@ -376,6 +485,34 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_FFLAGS: return warps_.at(wid).fcsr & 0x1F; case VX_CSR_FRM: return (warps_.at(wid).fcsr >> 5); case VX_CSR_FCSR: return warps_.at(wid).fcsr; + +#ifdef EXT_V_ENABLE + // Vector CRSs + case VX_CSR_VSTART: + return 
csrs_.at(wid).at(tid)[VX_CSR_VSTART]; + case VX_CSR_VXSAT: + return csrs_.at(wid).at(tid)[VX_CSR_VXSAT]; + case VX_CSR_VXRM: + return csrs_.at(wid).at(tid)[VX_CSR_VXRM]; + case VX_CSR_VCSR: { + Word vxsat = csrs_.at(wid).at(tid)[VX_CSR_VXSAT]; + Word vxrm = csrs_.at(wid).at(tid)[VX_CSR_VXRM]; + return (vxrm << 1) | vxsat; + } + case VX_CSR_VL: + return csrs_.at(wid).at(tid)[VX_CSR_VL]; + case VX_CSR_VTYPE: + return csrs_.at(wid).at(tid)[VX_CSR_VTYPE]; + case VX_CSR_VLENB: + return VLEN / 8; + case VX_CSR_VCYCLE: + return csrs_.at(wid).at(tid)[VX_CSR_VCYCLE]; + case VX_CSR_VTIME: + return csrs_.at(wid).at(tid)[VX_CSR_VTIME]; + case VX_CSR_VINSTRET: + return csrs_.at(wid).at(tid)[VX_CSR_VINSTRET]; +#endif + case VX_CSR_MHARTID: return (core_->id() * arch_.num_warps() + wid) * arch_.num_threads() + tid; case VX_CSR_THREAD_ID: return tid; case VX_CSR_WARP_ID: return wid; @@ -387,6 +524,10 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { case VX_CSR_NUM_CORES: return uint32_t(arch_.num_cores()) * arch_.num_clusters(); case VX_CSR_LOCAL_MEM_BASE: return arch_.local_mem_base(); case VX_CSR_MSCRATCH: return csr_mscratch_; + case VX_MAT_MUL_SIZE: return mat_size; + case VX_TC_NUM: return tc_num; + case VX_TC_SIZE: return tc_size; + CSR_READ_64(VX_CSR_MCYCLE, core_perf.cycles); CSR_READ_64(VX_CSR_MINSTRET, core_perf.instrs); default: @@ -422,6 +563,12 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, uint32_t wid) { auto cluster_perf = core_->socket()->cluster()->perf_stats(); auto socket_perf = core_->socket()->perf_stats(); auto lmem_perf = core_->local_mem()->perf_stats(); + + uint64_t coalescer_misses = 0; + for (uint i = 0; i < NUM_LSU_BLOCKS; ++i) { + coalescer_misses += core_->mem_coalescer(i)->perf_stats().misses; + } + switch (addr) { CSR_READ_64(VX_CSR_MPM_ICACHE_READS, socket_perf.icache.reads); CSR_READ_64(VX_CSR_MPM_ICACHE_MISS_R, socket_perf.icache.read_misses); @@ -451,6 +598,9 @@ Word Emulator::get_csr(uint32_t addr, uint32_t tid, 
uint32_t wid) { CSR_READ_64(VX_CSR_MPM_MEM_READS, proc_perf.mem_reads); CSR_READ_64(VX_CSR_MPM_MEM_WRITES, proc_perf.mem_writes); CSR_READ_64(VX_CSR_MPM_MEM_LT, proc_perf.mem_latency); + CSR_READ_64(VX_CSR_MPM_MEM_BANK_ST, proc_perf.memsim.bank_stalls); + + CSR_READ_64(VX_CSR_MPM_COALESCER_MISS, coalescer_misses); CSR_READ_64(VX_CSR_MPM_LMEM_READS, lmem_perf.reads); CSR_READ_64(VX_CSR_MPM_LMEM_WRITES, lmem_perf.writes); @@ -485,7 +635,38 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_CSR_MSCRATCH: csr_mscratch_ = value; break; + +#ifdef EXT_V_ENABLE + // Vector CRSs + case VX_CSR_VSTART: + csrs_.at(wid).at(tid)[VX_CSR_VSTART] = value; + break; + case VX_CSR_VXSAT: + csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1; + break; + case VX_CSR_VXRM: + csrs_.at(wid).at(tid)[VX_CSR_VXRM] = value & 0b11; + break; + case VX_CSR_VCSR: + csrs_.at(wid).at(tid)[VX_CSR_VXSAT] = value & 0b1; + csrs_.at(wid).at(tid)[VX_CSR_VXRM] = (value >> 1) & 0b11; + break; + case VX_CSR_VL: // read only, written by vset(i)vl(i) + csrs_.at(wid).at(tid)[VX_CSR_VL] = value; + break; + case VX_CSR_VTYPE: // read only, written by vset(i)vl(i) + csrs_.at(wid).at(tid)[VX_CSR_VTYPE] = value; + break; + case VX_CSR_VLENB: // read only, set to VLEN / 8 +#endif + case VX_CSR_SATP: + #ifdef VM_ENABLE + // warps_.at(wid).fcsr = (warps_.at(wid).fcsr & ~0x1F) | (value & 0x1F); + // csrs_.at(wid).at(tid)[addr] = value; //what is wid and tid? 
+ mmu_.set_satp(value); + break; + #endif case VX_CSR_MSTATUS: case VX_CSR_MEDELEG: case VX_CSR_MIDELEG: @@ -497,6 +678,16 @@ void Emulator::set_csr(uint32_t addr, Word value, uint32_t tid, uint32_t wid) { case VX_CSR_MNSTATUS: case VX_CSR_MCAUSE: break; + case VX_MAT_MUL_SIZE: + mat_size = value; + break; + case VX_TC_NUM: + tc_num = value; + break; + case VX_TC_SIZE: + tc_size = value; + break; + default: { std::cout << "Error: invalid CSR write addr=0x" << std::hex << addr << ", value=0x" << value << std::dec << std::endl; std::abort(); @@ -513,4 +704,16 @@ void Emulator::update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid) { this->set_csr(VX_CSR_FCSR, this->get_csr(VX_CSR_FCSR, tid, wid) | fflags, tid, wid); this->set_csr(VX_CSR_FFLAGS, this->get_csr(VX_CSR_FFLAGS, tid, wid) | fflags, tid, wid); } -} \ No newline at end of file +} + +// For riscv-vector test functionality, ecall and ebreak must trap +// These instructions are used in the vector tests to stop execution of the test +// Therefore, without these instructions, undefined and incorrect behavior happens +// +// For now, we need these instructions to trap for testing the riscv-vector isa +void Emulator::trigger_ecall() { + active_warps_.reset(); +} +void Emulator::trigger_ebreak() { + active_warps_.reset(); +} diff --git a/sim/simx/emulator.h b/sim/simx/emulator.h index de466d352..980bc8f8a 100644 --- a/sim/simx/emulator.h +++ b/sim/simx/emulator.h @@ -39,6 +39,9 @@ public: void clear(); void attach_ram(RAM* ram); +#ifdef VM_ENABLE + void set_satp(uint64_t satp) ; +#endif instr_trace_t* step(); @@ -54,17 +57,50 @@ public: int get_exitcode() const; + Word get_tiles(); + Word get_tc_size(); + Word get_tc_num(); + + void dcache_read(void* data, uint64_t addr, uint32_t size); + + void dcache_write(const void* data, uint64_t addr, uint32_t size); + private: struct ipdom_entry_t { - ipdom_entry_t(const ThreadMask &tmask, Word PC); - ipdom_entry_t(const ThreadMask &tmask); + ipdom_entry_t(const ThreadMask 
&orig_tmask, const ThreadMask &else_tmask, Word PC) + : orig_tmask (orig_tmask) + , else_tmask (else_tmask) + , PC (PC) + , fallthrough(false) + {} - ThreadMask tmask; + ThreadMask orig_tmask; + ThreadMask else_tmask; Word PC; bool fallthrough; }; + struct vtype_t { + uint32_t vill; + uint32_t vma; + uint32_t vta; + uint32_t vsew; + uint32_t vlmul; + }; + + union reg_data_t { + Word u; + WordI i; + WordF f; + float f32; + double f64; + uint32_t u32; + uint64_t u64; + int32_t i32; + int64_t i64; + }; + struct warp_t { warp_t(const Arch& arch); void clear(uint64_t startup_addr); @@ -75,6 +111,12 @@ private: std::vector>freg_file; std::stack ipdom_stack; Byte fcsr; +#ifdef EXT_V_ENABLE + std::vector> vreg_file; + vtype_t vtype; + uint32_t vl; + Word vlmax; +#endif uint32_t uuid; }; @@ -88,12 +130,14 @@ private: void execute(const Instr &instr, uint32_t wid, instr_trace_t *trace); +#ifdef EXT_V_ENABLE + void loadVector(const Instr &instr, uint32_t wid, std::vector &rsdata); + void storeVector(const Instr &instr, uint32_t wid, std::vector &rsdata); + void executeVector(const Instr &instr, uint32_t wid, std::vector &rsdata, std::vector &rddata); +#endif + void icache_read(void* data, uint64_t addr, uint32_t size); - void dcache_read(void* data, uint64_t addr, uint32_t size); - - void dcache_write(const void* data, uint64_t addr, uint32_t size); - void dcache_amo_reserve(uint64_t addr); bool dcache_amo_check(uint64_t addr); @@ -110,6 +154,11 @@ private: void update_fcrs(uint32_t fflags, uint32_t tid, uint32_t wid); + // temporarily added for riscv-vector tests + // TODO: remove once ecall/ebreak are supported + void trigger_ecall(); + void trigger_ebreak(); + const Arch& arch_; const DCRS& dcrs_; Core* core_; @@ -122,6 +171,11 @@ private: uint32_t ipdom_size_; Word csr_mscratch_; wspawn_t wspawn_; + std::vector scratchpad; + uint32_t mat_size; + uint32_t tc_size; + uint32_t tc_num; + std::vector>> csrs_; }; } diff --git a/sim/simx/execute.cpp b/sim/simx/execute.cpp index 
db098726b..aae018fc5 100644 --- a/sim/simx/execute.cpp +++ b/sim/simx/execute.cpp @@ -25,21 +25,13 @@ #include "emulator.h" #include "instr.h" #include "core.h" +#ifdef EXT_V_ENABLE +#include "processor_impl.h" +#endif +#include "VX_types.h" using namespace vortex; -union reg_data_t { - Word u; - WordI i; - WordF f; - float f32; - double f64; - uint32_t u32; - uint64_t u64; - int32_t i32; - int64_t i64; -}; - inline uint64_t nan_box(uint32_t value) { return value | 0xffffffff00000000; } @@ -127,6 +119,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } DPN(2, "}" << std::endl); break; + #ifdef EXT_V_ENABLE + case RegType::Vector: + break; + #endif default: break; } @@ -677,41 +673,50 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->src_regs[0] = {RegType::Integer, rsrc0}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - uint32_t data_bytes = 1 << (func3 & 0x3); - uint32_t data_width = 8 * data_bytes; - for (uint32_t t = thread_start; t < num_threads; ++t) { - if (!warp.tmask.test(t)) - continue; - uint64_t mem_addr = rsdata[t][0].i + immsrc; - uint64_t read_data = 0; - this->dcache_read(&read_data, mem_addr, data_bytes); - trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; - switch (func3) { - case 0: // RV32I: LB - case 1: // RV32I: LH - rddata[t].i = sext((Word)read_data, data_width); - break; - case 2: - if (opcode == Opcode::L) { - // RV32I: LW + if ((opcode == Opcode::L ) + || (opcode == Opcode::FL && func3 == 2) + || (opcode == Opcode::FL && func3 == 3)) { + uint32_t data_bytes = 1 << (func3 & 0x3); + uint32_t data_width = 8 * data_bytes; + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint64_t mem_addr = rsdata[t][0].i + immsrc; + uint64_t read_data = 0; + this->dcache_read(&read_data, mem_addr, data_bytes); + trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; + switch (func3) { + case 0: // 
RV32I: LB + case 1: // RV32I: LH rddata[t].i = sext((Word)read_data, data_width); - } else { - // RV32F: FLW - rddata[t].u64 = nan_box((uint32_t)read_data); + break; + case 2: + if (opcode == Opcode::L) { + // RV32I: LW + rddata[t].i = sext((Word)read_data, data_width); + } else { + // RV32F: FLW + rddata[t].u64 = nan_box((uint32_t)read_data); + } + break; + case 3: // RV64I: LD + // RV32D: FLD + case 4: // RV32I: LBU + case 5: // RV32I: LHU + case 6: // RV64I: LWU + rddata[t].u64 = read_data; + break; + default: + std::abort(); } - break; - case 3: // RV64I: LD - // RV32D: FLD - case 4: // RV32I: LBU - case 5: // RV32I: LHU - case 6: // RV64I: LWU - rddata[t].u64 = read_data; - break; - default: - std::abort(); } + rd_write = true; } - rd_write = true; + #ifdef EXT_V_ENABLE + else { + this->loadVector(instr, wid, rsdata); + } + #endif break; } case Opcode::S: @@ -723,24 +728,33 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->src_regs[1] = {data_type, rsrc1}; auto trace_data = std::make_shared(num_threads); trace->data = trace_data; - uint32_t data_bytes = 1 << (func3 & 0x3); - for (uint32_t t = thread_start; t < num_threads; ++t) { - if (!warp.tmask.test(t)) - continue; - uint64_t mem_addr = rsdata[t][0].i + immsrc; - uint64_t write_data = rsdata[t][1].u64; - trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; - switch (func3) { - case 0: - case 1: - case 2: - case 3: - this->dcache_write(&write_data, mem_addr, data_bytes); - break; - default: - std::abort(); + if ((opcode == Opcode::S) + || (opcode == Opcode::FS && func3 == 2) + || (opcode == Opcode::FS && func3 == 3)) { + uint32_t data_bytes = 1 << (func3 & 0x3); + for (uint32_t t = thread_start; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint64_t mem_addr = rsdata[t][0].i + immsrc; + uint64_t write_data = rsdata[t][1].u64; + trace_data->mem_addrs.at(t) = {mem_addr, data_bytes}; + switch (func3) { + case 0: + case 1: + case 2: + case 3: + 
this->dcache_write(&write_data, mem_addr, data_bytes); + break; + default: + std::abort(); + } } } + #ifdef EXT_V_ENABLE + else { + this->storeVector(instr, wid, rsdata); + } + #endif break; } case Opcode::AMO: { @@ -829,7 +843,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->fetch_stall = true; switch (csr_addr) { case 0x000: // RV32I: ECALL + this->trigger_ecall(); // Re-added for riscv-vector test functionality + break; case 0x001: // RV32I: EBREAK + this->trigger_ebreak(); // Re-added for riscv-vector test functionality + break; case 0x002: // RV32I: URET case 0x102: // RV32I: SRET case 0x302: // RV32I: MRET @@ -1328,7 +1346,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { auto stack_size = warp.ipdom_stack.size(); ThreadMask then_tmask, else_tmask; - auto not_pred = rsrc2 & 0x1; + auto not_pred = (rsrc1 != 0); for (uint32_t t = 0; t < num_threads; ++t) { auto cond = (warp.ireg_file.at(t).at(rsrc0) & 0x1) ^ not_pred; then_tmask[t] = warp.tmask.test(t) && cond; @@ -1347,11 +1365,9 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } else { next_tmask = else_tmask; } - // push reconvergence thread mask onto the stack - warp.ipdom_stack.emplace(warp.tmask); - // push not taken thread mask onto the stack + // push reconvergence and not-taken thread mask onto the stack auto ntaken_tmask = ~next_tmask & warp.tmask; - warp.ipdom_stack.emplace(ntaken_tmask, next_pc); + warp.ipdom_stack.emplace(warp.tmask, ntaken_tmask, next_pc); } // return divergent state for (uint32_t t = thread_start; t < num_threads; ++t) { @@ -1372,11 +1388,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { std::cout << "IPDOM stack is empty!\n" << std::flush; std::abort(); } - next_tmask = warp.ipdom_stack.top().tmask; - if (!warp.ipdom_stack.top().fallthrough) { + if (warp.ipdom_stack.top().fallthrough) { + next_tmask = 
warp.ipdom_stack.top().orig_tmask; + warp.ipdom_stack.pop(); + } else { + next_tmask = warp.ipdom_stack.top().else_tmask; next_pc = warp.ipdom_stack.top().PC; + warp.ipdom_stack.top().fallthrough = true; } - warp.ipdom_stack.pop(); } } break; case 4: { @@ -1415,6 +1434,180 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { std::abort(); } } break; + case Opcode::TCU: + { //TODO - make it data-type flexible + uint32_t mem_bytes = 1; + DP(3, "mem_bytes=" << mem_bytes << std::endl); + uint16_t tc_size = this->get_csr(VX_TC_SIZE, 0, wid); + uint32_t TC_per_warp = this->get_csr(VX_TC_NUM, 0, wid); + + DP(3, "tc_size=" << tc_size << std::endl); + DP(3, "TC_per_warp=" << TC_per_warp << std::endl); + + //Number of loads - dependant on the thread config + uint32_t n_tiles = this->get_csr(VX_MAT_MUL_SIZE, 0, wid); //CSR instruction before MLOAD will ensure that this csr has value + int num_data_per_thread; + int num_data_per_thread_st; + uint32_t num_threads_actv; + uint32_t num_threads_actv_st; + uint32_t data_bytes_load; + uint32_t data_bytes_store; + uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp); + + //LOAD + if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp) + { + num_threads_actv = tc_size*tc_size*n_tiles*TC_per_warp; + num_data_per_thread = 1; + } + else + { + num_threads_actv = num_threads; + num_data_per_thread = (tc_size*tc_size*n_tiles)/num_threads_per_tc; + } + data_bytes_load = mem_bytes*num_data_per_thread; + + //STORE + if(num_threads > tc_size*tc_size*TC_per_warp) + { + num_threads_actv_st = tc_size*tc_size*TC_per_warp; + num_data_per_thread_st = 1; + } + else + { + num_threads_actv_st = num_threads; + num_data_per_thread_st = (tc_size*tc_size)/num_threads_per_tc; + } + data_bytes_store = mem_bytes*num_data_per_thread_st; + + DP(3, "Num Tiles=" << n_tiles << std::endl); + + switch (func3) { + case 0: + { //Matrix Load + + DP (4, "TCU LOAD"); + trace->fu_type = FUType::LSU; + trace->lsu_type = 
LsuType::TCU_LOAD; + + trace->src_regs[0] = {RegType::Integer, rsrc0}; + auto trace_data = std::make_shared(num_threads); + trace->data = trace_data; + + for (uint32_t t = thread_start; t < num_threads_actv; ++t) + { + if (!warp.tmask.test(t)) + continue; + DP(3, "Thread ID" << t); + + uint32_t base_addr = rsdata[t][0].i ; + trace_data->mem_addrs.at(t) = {base_addr, data_bytes_load}; + + //Load A or B (depends on immsrc) + int loop_offset = 0; + DP(3, "n_tiles = " << n_tiles << "; num_data_per_thread = " << num_data_per_thread <dcache_read(temp_ref, (base_addr+(n*mem_bytes)+(loop_offset*mem_bytes)), mem_bytes); + + scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n] = *temp_ref; + DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]); + } + } + rd_write = true; + } break; + case 1: + { + DP(4, "TCU STORE"); + trace->fu_type = FUType::LSU; + trace->lsu_type = LsuType::TCU_STORE; + + auto trace_data = std::make_shared(num_threads); + trace->data = trace_data; + + for (uint32_t t = thread_start; t < num_threads_actv_st; ++t) + { + if (!warp.tmask.test(t)) + continue; + + DP(3, "Thread ID" << t); + uint32_t base_addr = rsdata[t][0].i ; + + trace_data->mem_addrs.at(t) = {base_addr, data_bytes_store}; + + //Store C + for (int n=0; ndcache_write(temp_ref, base_addr+(n*mem_bytes), mem_bytes); + } + } + //Clear the scratchpad + for(long unsigned int i=0 ; i < scratchpad.size(); i++) + { + scratchpad[i] = 0; + } + } + break; + case 2: + { //Matrix Multiply + DP(4, "TCU MULTIPLY MAT"); + trace->fu_type = FUType::TCU; + trace->tcu_type = TCUType::TCU_MUL; + uint32_t threads_per_tc = MAX (1, num_threads/TC_per_warp); + for (uint32_t t = thread_start; t < num_threads_actv; ++t) + { + if (!warp.tmask.test(t)) + continue; + + DP(3, "Thread ID" << t); + //TC operation [only 1 
thread in 1 warp needs to do this] + if (t%threads_per_tc == 0) + { + /* + // TODO : Fix needed for functional correctness + // TODO : change to systolic array implementation + uint32_t thread_offset = t*(tc_size*tc_size); + + int loop_offset = 0; + int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size; + uint32_t accu_offset = (n_tiles)*(n_tiles)*(n_tiles)*tc_size*tc_size*2; + for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation? + { + for (int i = 0; i < tc_size; i++) { //ROW-1 + for (int j = 0; j < tc_size; j++) { //COL-2 + int sum = 0; + for (int k = 0; k < tc_size; k++) + { //COL-1 + sum = sum + scratchpad[loop_offset + thread_offset*n_tiles + i * tc_size + k] *scratchpad[loop_offset + thread_offset*n_tiles + offset_b + (k * tc_size + j)]; + } + scratchpad[accu_offset + thread_offset +(i * tc_size + j)] += sum; //[i * col2 + j] = sum + DP(3, "Scratchpad Index: " << accu_offset + (i * tc_size + j) << " , Value=" << scratchpad[accu_offset + (i * tc_size + j)]); + } + } + loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment + } + */ + } + } + + }break; + default: + std::abort(); + } + } break; +#ifdef EXT_V_ENABLE + case Opcode::VSET: { + auto func6 = instr.getFunc6(); + if ((func3 == 0x7) || (func3 == 0x2 && func6 == 16) || (func3 == 0x1 && func6 == 16)) { + rd_write = true; + } + executeVector(instr, wid, rsdata, rddata); + } break; +#endif default: std::abort(); } @@ -1458,6 +1651,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { trace->dst_reg = {type, rdest}; break; default: + std::cout << "Unrecognized register write back type: " << type << std::endl; std::abort(); break; } @@ -1471,10 +1665,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) { } if (warp.tmask != next_tmask) { - DPH(3, "*** New Tmask="); - for (uint32_t i = 0; i < num_threads; ++i) - DPN(3, next_tmask.test(i)); - DPN(3, std::endl); + DP(3, "*** 
New Tmask=" << ThreadMaskOS(next_tmask, num_threads)); warp.tmask = next_tmask; if (!next_tmask.any()) { active_warps_.reset(wid); diff --git a/sim/simx/func_unit.cpp b/sim/simx/func_unit.cpp index b03551e08..d33a0ac1c 100644 --- a/sim/simx/func_unit.cpp +++ b/sim/simx/func_unit.cpp @@ -21,6 +21,7 @@ #include "core.h" #include "constants.h" #include "cache_sim.h" +#include "VX_types.h" using namespace vortex; @@ -115,12 +116,12 @@ void LsuUnit::tick() { // handle memory responses for (uint32_t b = 0; b < NUM_LSU_BLOCKS; ++b) { - auto& lsu_rsp_port = core_->lsu_demux_.at(b)->RspIn; + auto& lsu_rsp_port = core_->lmem_switch_.at(b)->RspIn; if (lsu_rsp_port.empty()) continue; auto& state = states_.at(b); auto& lsu_rsp = lsu_rsp_port.front(); - DT(3, this->name() << " mem-rsp: " << lsu_rsp); + DT(3, this->name() << "-mem-rsp: " << lsu_rsp); auto& entry = state.pending_rd_reqs.at(lsu_rsp.tag); auto trace = entry.trace; assert(!entry.mask.none()); @@ -145,7 +146,7 @@ void LsuUnit::tick() { continue; Outputs.at(iw).push(state.fence_trace, 1); state.fence_lock = false; - DT(3, this->name() << " fence-unlock: " << state.fence_trace); + DT(3, this->name() << "-fence-unlock: " << state.fence_trace); } // check input queue @@ -159,18 +160,18 @@ void LsuUnit::tick() { // schedule fence lock state.fence_trace = trace; state.fence_lock = true; - DT(3, this->name() << " fence-lock: " << *trace); + DT(3, this->name() << "-fence-lock: " << *trace); // remove input input.pop(); continue; } - bool is_write = (trace->lsu_type == LsuType::STORE); + bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE)); // check pending queue capacity if (!is_write && state.pending_rd_reqs.full()) { if (!trace->log_once(true)) { - DT(4, "*** " << this->name() << " queue-full: " << *trace); + DT(4, "*** " << this->name() << "-queue-full: " << *trace); } continue; } else { @@ -191,6 +192,7 @@ void LsuUnit::tick() { } } uint32_t tag = 0; + if (!is_write) { tag = 
state.pending_rd_reqs.allocate({trace, lsu_req.mask}); } @@ -199,8 +201,8 @@ void LsuUnit::tick() { lsu_req.uuid = trace->uuid; // send memory request - core_->lsu_demux_.at(block_idx)->ReqIn.push(lsu_req); - DT(3, this->name() << " mem-req: " << lsu_req); + core_->lmem_switch_.at(block_idx)->ReqIn.push(lsu_req); + DT(3, this->name() << "-mem-req: " << lsu_req); // update stats auto num_addrs = lsu_req.mask.count(); @@ -220,6 +222,96 @@ void LsuUnit::tick() { input.pop(); } } +/* TO BE FIXED:Tensor_core code + send_request is not used anymore. Need to be modified number of load +*/ +/* +int LsuUnit::send_requests(instr_trace_t* trace, int block_idx, int tag) { + int count = 0; + + auto trace_data = std::dynamic_pointer_cast(trace->data); + bool is_write = ((trace->lsu_type == LsuType::STORE) || (trace->lsu_type == LsuType::TCU_STORE)); + + uint16_t req_per_thread = 1; + if ((trace->lsu_type == LsuType::TCU_LOAD) || (trace->lsu_type == LsuType::TCU_STORE)) + { + req_per_thread= (1>(trace_data->mem_addrs.at(0).size)/4)? 
1: ((trace_data->mem_addrs.at(0).size)/4); + } + + auto t0 = trace->pid * NUM_LSU_LANES; + + for (uint32_t i = 0; i < NUM_LSU_LANES; ++i) { + uint32_t t = t0 + i; + if (!trace->tmask.test(t)) + continue; + + int req_idx = block_idx * LSU_CHANNELS + (i % LSU_CHANNELS); + auto& dcache_req_port = core_->lmem_switch_.at(req_idx)->ReqIn; + + auto mem_addr = trace_data->mem_addrs.at(t); + auto type = get_addr_type(mem_addr.addr); + // DT(3, "addr_type = " << type << ", " << *trace); + uint32_t mem_bytes = 1; + for (int i = 0; i < req_per_thread; i++) + { + MemReq mem_req; + mem_req.addr = mem_addr.addr + (i*mem_bytes); + mem_req.write = is_write; + mem_req.type = type; + mem_req.tag = tag; + mem_req.cid = trace->cid; + mem_req.uuid = trace->uuid; + + dcache_req_port.push(mem_req, 1); + DT(3, "mem-req: addr=0x" << std::hex << mem_req.addr << ", tag=" << tag + << ", lsu_type=" << trace->lsu_type << ", rid=" << req_idx << ", addr_type=" << mem_req.type << ", " << *trace); + + if (is_write) { + ++core_->perf_stats_.stores; + } else { + ++core_->perf_stats_.loads; + ++pending_loads_; + } + + ++count; + } + } + return count; +} +*/ + +/////////////////////////////////////////////////////////////////////////////// + +TcuUnit::TcuUnit(const SimContext& ctx, Core* core) + : FuncUnit(ctx, core, "TCU") + {} + +void TcuUnit::tick() { + + for (uint32_t i = 0; i < ISSUE_WIDTH; ++i) { + auto& input = Inputs.at(i); + if (input.empty()) + continue; + auto& output = Outputs.at(i); + auto trace = input.front(); + uint32_t n_tiles = core_->emulator_.get_tiles(); + uint32_t tc_size = core_->emulator_.get_tc_size(); + + switch (trace->tcu_type) { + case TCUType::TCU_MUL: + { //mat size = n_tiles * tc_size + int matmul_latency = (n_tiles * tc_size) + tc_size + tc_size; + output.push(trace, matmul_latency); + DT(3, "matmul_latency = " << matmul_latency << ", " << *trace); + break; + } + default: + std::abort(); + } + DT(3, "pipeline-execute: op=" << trace->tcu_type << ", " << *trace); + 
input.pop(); + } +} /////////////////////////////////////////////////////////////////////////////// diff --git a/sim/simx/func_unit.h b/sim/simx/func_unit.h index 76dd16173..2250d70c5 100644 --- a/sim/simx/func_unit.h +++ b/sim/simx/func_unit.h @@ -98,6 +98,14 @@ private: /////////////////////////////////////////////////////////////////////////////// +class TcuUnit : public FuncUnit { +public: + TcuUnit(const SimContext& ctx, Core*); + void tick(); +}; + +/////////////////////////////////////////////////////////////////////////////// + class SfuUnit : public FuncUnit { public: SfuUnit(const SimContext& ctx, Core*); diff --git a/sim/simx/instr.h b/sim/simx/instr.h index f97a19eac..a303a406d 100644 --- a/sim/simx/instr.h +++ b/sim/simx/instr.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -17,8 +17,8 @@ namespace vortex { -enum class Opcode { - NONE = 0, +enum class Opcode { + NONE = 0, R = 0x33, L = 0x3, I = 0x13, @@ -38,30 +38,98 @@ enum class Opcode { FMADD = 0x43, FMSUB = 0x47, FMNMSUB = 0x4b, - FMNMADD = 0x4f, + FMNMADD = 0x4f, // RV64 Standard Extension R_W = 0x3b, I_W = 0x1b, + // Vector Extension + VSET = 0x57, // Custom Extensions EXT1 = 0x0b, EXT2 = 0x2b, EXT3 = 0x5b, - EXT4 = 0x7b + TCU = 0x7b }; enum class InstType { - R, - I, - S, - B, - U, + R, + I, + S, + B, + U, J, + V, R4 }; +enum DecodeConstants { + width_opcode= 7, + width_reg = 5, + width_func2 = 2, + width_func3 = 3, + width_func6 = 6, + width_func7 = 7, + width_mop = 3, + width_vmask = 1, + width_i_imm = 12, + width_j_imm = 20, + width_v_zimm = 11, + width_v_ma = 1, + width_v_ta = 1, + width_v_sew = 3, + width_v_lmul = 3, + width_aq = 1, + width_rl = 1, + + shift_opcode= 0, + shift_rd = width_opcode, + shift_func3 = shift_rd + width_reg, + shift_rs1 = shift_func3 + width_func3, + shift_rs2 = shift_rs1 + width_reg, + shift_func2 = shift_rs2 + width_reg, + shift_func7 = shift_rs2 + width_reg, + shift_rs3 = shift_func7 + width_func2, + shift_vmop = shift_func7 + width_vmask, + shift_vnf = shift_vmop + width_mop, + shift_func6 = shift_func7 + width_vmask, + shift_vset = shift_func7 + width_func6, + shift_v_sew = width_v_lmul, + shift_v_ta = shift_v_sew + width_v_sew, + shift_v_ma = shift_v_ta + width_v_ta, + + mask_opcode = (1 << width_opcode) - 1, + mask_reg = (1 << width_reg) - 1, + mask_func2 = (1 << width_func2) - 1, + mask_func3 = (1 << width_func3) - 1, + mask_func6 = (1 << width_func6) - 1, + mask_func7 = (1 << width_func7) - 1, + mask_i_imm = (1 << width_i_imm) - 1, + mask_j_imm = (1 << width_j_imm) - 1, + mask_v_zimm = (1 << width_v_zimm) - 1, + mask_v_ma = (1 << width_v_ma) - 1, + mask_v_ta = (1 << width_v_ta) - 1, + mask_v_sew = (1 << width_v_sew) - 1, + mask_v_lmul = (1 << width_v_lmul) - 1, +}; + +enum VectorAttrMask { + vattr_vlswidth = (1 << 0), + vattr_vmop = (1 
<< 1), + vattr_vumop = (1 << 2), + vattr_vnf = (1 << 3), + vattr_vmask = (1 << 4), + vattr_vs3 = (1 << 5), + vattr_zimm = (1 << 6), + vattr_vlmul = (1 << 7), + vattr_vsew = (1 << 8), + vattr_vta = (1 << 9), + vattr_vma = (1 << 10), + vattr_vediv = (1 << 11) +}; + class Instr { public: - Instr() + Instr() : opcode_(Opcode::NONE) , num_rsrcs_(0) , has_imm_(false) @@ -70,45 +138,100 @@ public: , rdest_(0) , func2_(0) , func3_(0) - , func7_(0) { + , func6_(0) + , func7_(0) + , vmask_(0) + , vlsWidth_(0) + , vMop_(0) + , vUmop_(0) + , vNf_(0) + , vs3_(0) + , has_zimm_(false) + , vlmul_(0) + , vsew_(0) + , vta_(0) + , vma_(0) + , vediv_(0) + , vattr_mask_(0) { for (uint32_t i = 0; i < MAX_REG_SOURCES; ++i) { rsrc_type_[i] = RegType::None; rsrc_[i] = 0; } } - void setOpcode(Opcode opcode) { opcode_ = opcode; } - void setDestReg(uint32_t destReg, RegType type) { - rdest_type_ = type; - rdest_ = destReg; + void setOpcode(Opcode opcode) { + opcode_ = opcode; } - void addSrcReg(uint32_t srcReg, RegType type) { - rsrc_type_[num_rsrcs_] = type; - rsrc_[num_rsrcs_] = srcReg; + + void setDestReg(uint32_t destReg, RegType type) { + rdest_type_ = type; + rdest_ = destReg; + } + + void addSrcReg(uint32_t srcReg, RegType type) { + rsrc_type_[num_rsrcs_] = type; + rsrc_[num_rsrcs_] = srcReg; ++num_rsrcs_; } - void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) { - rsrc_type_[index] = type; - rsrc_[index] = srcReg; - num_rsrcs_ = std::max(num_rsrcs_, index+1); + + void setSrcReg(uint32_t index, uint32_t srcReg, RegType type) { + rsrc_type_[index] = type; + rsrc_[index] = srcReg; + num_rsrcs_ = std::max(num_rsrcs_, index+1); } - void setFunc2(uint32_t func2) { func2_ = func2; } - void setFunc3(uint32_t func3) { func3_ = func3; } - void setFunc7(uint32_t func7) { func7_ = func7; } + void setImm(uint32_t imm) { has_imm_ = true; imm_ = imm; } + void setFunc2(uint32_t func2) { func2_ = func2; } + void setFunc3(uint32_t func3) { func3_ = func3; } + void setFunc6(uint32_t func6) { 
func6_ = func6; } + void setFunc7(uint32_t func7) { func7_ = func7; } + + // Attributes for Vector instructions + void setVlsWidth(uint32_t width) { vlsWidth_ = width; vattr_mask_ |= vattr_vlswidth; } + void setVmop(uint32_t mop) { vMop_ = mop; vattr_mask_ |= vattr_vmop; } + void setVumop(uint32_t umop) { vUmop_ = umop; vattr_mask_ |= vattr_vumop; } + void setVnf(uint32_t nf) { vNf_ = nf; vattr_mask_ |= vattr_vnf; } + void setVmask(uint32_t mask) { vmask_ = mask; vattr_mask_ |= vattr_vmask; } + void setVs3(uint32_t vs) { vs3_ = vs; vattr_mask_ |= vattr_vs3; } + void setZimm(bool has_zimm) { has_zimm_ = has_zimm; vattr_mask_ |= vattr_zimm; } + void setVlmul(uint32_t lmul) { vlmul_ = lmul; vattr_mask_ |= vattr_vlmul; } + void setVsew(uint32_t sew) { vsew_ = sew; vattr_mask_ |= vattr_vsew; } + void setVta(uint32_t vta) { vta_ = vta; vattr_mask_ |= vattr_vta; } + void setVma(uint32_t vma) { vma_ = vma; vattr_mask_ |= vattr_vma; } + void setVediv(uint32_t ediv) { vediv_ = 1 << ediv; vattr_mask_ |= vattr_vediv; } + Opcode getOpcode() const { return opcode_; } - uint32_t getFunc2() const { return func2_; } - uint32_t getFunc3() const { return func3_; } - uint32_t getFunc7() const { return func7_; } + uint32_t getNRSrc() const { return num_rsrcs_; } uint32_t getRSrc(uint32_t i) const { return rsrc_[i]; } RegType getRSType(uint32_t i) const { return rsrc_type_[i]; } - uint32_t getRDest() const { return rdest_; } - RegType getRDType() const { return rdest_type_; } + + uint32_t getRDest() const { return rdest_; } + RegType getRDType() const { return rdest_type_; } + bool hasImm() const { return has_imm_; } uint32_t getImm() const { return imm_; } + uint32_t getFunc2() const { return func2_; } + uint32_t getFunc3() const { return func3_; } + uint32_t getFunc6() const { return func6_; } + uint32_t getFunc7() const { return func7_; } + + uint32_t getVlsWidth() const { return vlsWidth_; } + uint32_t getVmop() const { return vMop_; } + uint32_t getVumop() const { return vUmop_; } 
+ uint32_t getVnf() const { return vNf_; } + uint32_t getVmask() const { return vmask_; } + uint32_t getVs3() const { return vs3_; } + bool hasZimm() const { return has_zimm_; } + uint32_t getVlmul() const { return vlmul_; } + uint32_t getVsew() const { return vsew_; } + uint32_t getVta() const { return vta_; } + uint32_t getVma() const { return vma_; } + uint32_t getVediv() const { return vediv_; } + uint32_t getVattrMask() const { return vattr_mask_; } + private: enum { @@ -121,12 +244,28 @@ private: RegType rdest_type_; uint32_t imm_; RegType rsrc_type_[MAX_REG_SOURCES]; - uint32_t rsrc_[MAX_REG_SOURCES]; + uint32_t rsrc_[MAX_REG_SOURCES]; uint32_t rdest_; uint32_t func2_; uint32_t func3_; + uint32_t func6_; uint32_t func7_; + // Vector + uint32_t vmask_; + uint32_t vlsWidth_; + uint32_t vMop_; + uint32_t vUmop_; + uint32_t vNf_; + uint32_t vs3_; + bool has_zimm_; + uint32_t vlmul_; + uint32_t vsew_; + uint32_t vta_; + uint32_t vma_; + uint32_t vediv_; + uint32_t vattr_mask_; + friend std::ostream &operator<<(std::ostream &, const Instr&); }; diff --git a/sim/simx/instr_trace.h b/sim/simx/instr_trace.h index bbf4eab59..5ed98d265 100644 --- a/sim/simx/instr_trace.h +++ b/sim/simx/instr_trace.h @@ -77,6 +77,7 @@ public: AluType alu_type; FpuType fpu_type; SfuType sfu_type; + TCUType tcu_type; }; ITraceData::Ptr data; diff --git a/sim/simx/local_mem.cpp b/sim/simx/local_mem.cpp index 1bab3fccb..9d601c771 100644 --- a/sim/simx/local_mem.cpp +++ b/sim/simx/local_mem.cpp @@ -24,15 +24,12 @@ protected: LocalMem* simobject_; Config config_; RAM ram_; - int32_t bank_sel_addr_start_; - int32_t bank_sel_addr_end_; - PerfStats perf_stats_; + uint32_t line_bits_; + MemCrossBar::Ptr mem_xbar_; + mutable PerfStats perf_stats_; uint64_t to_local_addr(uint64_t addr) { - uint32_t total_lines = config_.capacity / config_.line_size; - uint32_t line_bits = log2ceil(total_lines); - uint32_t offset = bit_getw(addr, 0, line_bits-1); - return offset; + return bit_getw(addr, 0, 
line_bits_-1); } public: @@ -40,9 +37,24 @@ public: : simobject_(simobject) , config_(config) , ram_(config.capacity) - , bank_sel_addr_start_(0) - , bank_sel_addr_end_(config.B-1) - {} + { + uint32_t total_lines = config.capacity / config.line_size; + line_bits_ = log2ceil(total_lines); + + char sname[100]; + snprintf(sname, 100, "%s-xbar", simobject->name().c_str()); + uint32_t lg2_line_size = log2ceil(config_.line_size); + uint32_t num_banks = 1 << config.B; + mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::Priority, config.num_reqs, num_banks, 1, + [lg2_line_size, num_banks](const MemCrossBar::ReqType& req) { + // Custom logic to calculate the output index using bank interleaving + return (uint32_t)((req.addr >> lg2_line_size) & (num_banks-1)); + }); + for (uint32_t i = 0; i < config.num_reqs; ++i) { + simobject->Inputs.at(i).bind(&mem_xbar_->ReqIn.at(i)); + mem_xbar_->RspIn.at(i).bind(&simobject->Outputs.at(i)); + } + } virtual ~Impl() {} @@ -51,57 +63,45 @@ public: } void read(void* data, uint64_t addr, uint32_t size) { - auto s_addr = to_local_addr(addr); - DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); - ram_.read(data, s_addr, size); + auto l_addr = to_local_addr(addr); + DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl); + ram_.read(data, l_addr, size); } void write(const void* data, uint64_t addr, uint32_t size) { - auto s_addr = to_local_addr(addr); - DPH(3, "Local Mem addr=0x" << std::hex << s_addr << std::dec << std::endl); - ram_.write(data, s_addr, size); + auto l_addr = to_local_addr(addr); + DPH(3, "Local Mem addr=0x" << std::hex << l_addr << std::dec << std::endl); + ram_.write(data, l_addr, size); } void tick() { - std::vector in_used_banks(1 << config_.B); - for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) { - auto& core_req_port = simobject_->Inputs.at(req_id); - if (core_req_port.empty()) + // process bank requets from xbar + uint32_t num_banks = (1 << config_.B); + for 
(uint32_t i = 0; i < num_banks; ++i) { + auto& xbar_req_out = mem_xbar_->ReqOut.at(i); + if (xbar_req_out.empty()) continue; - auto& core_req = core_req_port.front(); + auto& bank_req = xbar_req_out.front(); + DT(4, simobject_->name() << "-bank" << i << "-req : " << bank_req); - uint32_t bank_id = 0; - if (bank_sel_addr_end_ >= bank_sel_addr_start_) { - bank_id = (uint32_t)bit_getw(core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_); - } - - // bank conflict check - if (in_used_banks.at(bank_id)) { - ++perf_stats_.bank_stalls; - continue; - } - - DT(4, simobject_->name() << " mem-req" << req_id << ": "<< core_req); - - in_used_banks.at(bank_id) = true; - - if (!core_req.write || config_.write_reponse) { - // send response - MemRsp core_rsp{core_req.tag, core_req.cid, core_req.uuid}; - simobject_->Outputs.at(req_id).push(core_rsp, 1); + if (!bank_req.write || config_.write_reponse) { + // send xbar response + MemRsp bank_rsp{bank_req.tag, bank_req.cid, bank_req.uuid}; + mem_xbar_->RspOut.at(i).push(bank_rsp, 1); } // update perf counters - perf_stats_.reads += !core_req.write; - perf_stats_.writes += core_req.write; + perf_stats_.reads += !bank_req.write; + perf_stats_.writes += bank_req.write; // remove input - core_req_port.pop(); + xbar_req_out.pop(); } } const PerfStats& perf_stats() const { + perf_stats_.bank_stalls = mem_xbar_->req_collisions(); return perf_stats_; } }; diff --git a/sim/simx/main.cpp b/sim/simx/main.cpp index a8883c696..d6ed15a25 100644 --- a/sim/simx/main.cpp +++ b/sim/simx/main.cpp @@ -29,18 +29,19 @@ using namespace vortex; static void show_usage() { - std::cout << "Usage: [-c ] [-w ] [-t ] [-s: stats] [-h: help] " << std::endl; + std::cout << "Usage: [-c ] [-w ] [-t ] [-v: vector-test] [-s: stats] [-h: help] " << std::endl; } uint32_t num_threads = NUM_THREADS; uint32_t num_warps = NUM_WARPS; uint32_t num_cores = NUM_CORES; bool showStats = false; +bool vector_test = false; const char* program = nullptr; static void parse_args(int 
argc, char **argv) { int c; - while ((c = getopt(argc, argv, "t:w:c:rsh?")) != -1) { + while ((c = getopt(argc, argv, "t:w:c:vsh")) != -1) { switch (c) { case 't': num_threads = atoi(optarg); @@ -51,17 +52,19 @@ static void parse_args(int argc, char **argv) { case 'c': num_cores = atoi(optarg); break; + case 'v': + vector_test = true; + break; case 's': showStats = true; break; case 'h': - case '?': - show_usage(); - exit(0); + show_usage(); + exit(0); break; default: - show_usage(); - exit(-1); + show_usage(); + exit(-1); } } @@ -84,7 +87,7 @@ int main(int argc, char **argv) { Arch arch(num_threads, num_warps, num_cores); // create memory module - RAM ram(0, RAM_PAGE_SIZE); + RAM ram(0, MEM_PAGE_SIZE); // create processor Processor processor(arch); @@ -116,6 +119,11 @@ int main(int argc, char **argv) { std::cout << "[VXDRV] START: program=" << program << std::endl; #endif // run simulation + // vector test exitcode is a special case + #ifdef EXT_V_ENABLE + if (vector_test) return processor.run(); + #endif + // else continue as normal processor.run(); // read exitcode from @MPM.1 diff --git a/sim/simx/mem_coalescer.cpp b/sim/simx/mem_coalescer.cpp index 8af567985..826e4479c 100644 --- a/sim/simx/mem_coalescer.cpp +++ b/sim/simx/mem_coalescer.cpp @@ -42,10 +42,10 @@ void MemCoalescer::reset() { } void MemCoalescer::tick() { - // process incoming responses + // process outgoing responses if (!RspOut.empty()) { auto& out_rsp = RspOut.front(); - DT(4, this->name() << " mem-rsp: " << out_rsp); + DT(4, this->name() << "-mem-rsp: " << out_rsp); auto& entry = pending_rd_reqs_.at(out_rsp.tag); BitVector<> rsp_mask(input_size_); @@ -89,7 +89,7 @@ void MemCoalescer::tick() { // ensure we can allocate a response tag if (pending_rd_reqs_.full()) { - DT(4, "*** " << this->name() << " queue-full: " << in_req); + DT(4, "*** " << this->name() << "-queue-full: " << in_req); return; } @@ -145,7 +145,10 @@ void MemCoalescer::tick() { // send memory request ReqOut.push(out_req, 
delay_); - DT(4, this->name() << " mem-req: coalesced=" << cur_mask.count() << ", " << out_req); + DT(4, this->name() << "-mem-req: coalesced=" << cur_mask.count() << ", " << out_req); + + // track partial responses + perf_stats_.misses += (cur_mask.count() != in_req.mask.count()); // update sent mask sent_mask_ |= cur_mask; @@ -153,4 +156,8 @@ void MemCoalescer::tick() { ReqIn.pop(); sent_mask_.reset(); } +} + +const MemCoalescer::PerfStats& MemCoalescer::perf_stats() const { + return perf_stats_; } \ No newline at end of file diff --git a/sim/simx/mem_coalescer.h b/sim/simx/mem_coalescer.h index f0e3935aa..a53590c9c 100644 --- a/sim/simx/mem_coalescer.h +++ b/sim/simx/mem_coalescer.h @@ -23,6 +23,19 @@ public: SimPort ReqOut; SimPort RspOut; + struct PerfStats { + uint64_t misses; + + PerfStats() + : misses(0) + {} + + PerfStats& operator+=(const PerfStats& rhs) { + this->misses += rhs.misses; + return *this; + } + }; + MemCoalescer( const SimContext& ctx, const char* name, @@ -37,6 +50,8 @@ public: void tick(); + const PerfStats& perf_stats() const; + private: struct pending_req_t { @@ -52,6 +67,7 @@ private: BitVector<> sent_mask_; uint32_t line_size_; uint32_t delay_; + PerfStats perf_stats_; }; } \ No newline at end of file diff --git a/sim/simx/mem_sim.cpp b/sim/simx/mem_sim.cpp index a12713fea..a04e83a8b 100644 --- a/sim/simx/mem_sim.cpp +++ b/sim/simx/mem_sim.cpp @@ -27,26 +27,40 @@ class MemSim::Impl { private: MemSim* simobject_; Config config_; + MemCrossBar::Ptr mem_xbar_; DramSim dram_sim_; - PerfStats perf_stats_; - + mutable PerfStats perf_stats_; struct DramCallbackArgs { - MemSim* simobject; - MemReq request; + MemSim::Impl* memsim; + MemReq request; + uint32_t bank_id; }; public: Impl(MemSim* simobject, const Config& config) : simobject_(simobject) , config_(config) - , dram_sim_(MEM_CLOCK_RATIO) - {} + , dram_sim_(config.num_banks, config.block_size, config.clock_ratio) + { + char sname[100]; + snprintf(sname, 100, "%s-xbar", 
simobject->name().c_str()); + mem_xbar_ = MemCrossBar::Create(sname, ArbiterType::RoundRobin, config.num_ports, config.num_banks, 1, + [lg2_block_size = log2ceil(config.block_size), num_banks = config.num_banks](const MemCrossBar::ReqType& req) { + // Custom logic to calculate the output index using bank interleaving + return (uint32_t)((req.addr >> lg2_block_size) & (num_banks-1)); + }); + for (uint32_t i = 0; i < config.num_ports; ++i) { + simobject->MemReqPorts.at(i).bind(&mem_xbar_->ReqIn.at(i)); + mem_xbar_->RspIn.at(i).bind(&simobject->MemRspPorts.at(i)); + } + } ~Impl() { //-- } const PerfStats& perf_stats() const { + perf_stats_.bank_stalls = mem_xbar_->req_collisions(); return perf_stats_; } @@ -57,45 +71,33 @@ public: void tick() { dram_sim_.tick(); - if (simobject_->MemReqPort.empty()) - return; + for (uint32_t i = 0; i < config_.num_banks; ++i) { + if (mem_xbar_->ReqOut.at(i).empty()) + continue; - auto& mem_req = simobject_->MemReqPort.front(); + auto& mem_req = mem_xbar_->ReqOut.at(i).front(); - // try to enqueue the request to the memory system - auto req_args = new DramCallbackArgs{simobject_, mem_req}; - auto enqueue_success = dram_sim_.send_request( - mem_req.write, - mem_req.addr, - 0, - [](void* arg) { - auto rsp_args = reinterpret_cast(arg); - // only send a response for read requests - if (!rsp_args->request.write) { - MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; - rsp_args->simobject->MemRspPort.push(mem_rsp, 1); - DT(3, rsp_args->simobject->name() << " mem-rsp: " << mem_rsp); - } - delete rsp_args; - }, - req_args - ); + // enqueue the request to the memory system + auto req_args = new DramCallbackArgs{this, mem_req, i}; + dram_sim_.send_request( + mem_req.addr, + mem_req.write, + [](void* arg) { + auto rsp_args = reinterpret_cast(arg); + if (!rsp_args->request.write) { + // only send a response for read requests + MemRsp mem_rsp{rsp_args->request.tag, rsp_args->request.cid, rsp_args->request.uuid}; + 
rsp_args->memsim->mem_xbar_->RspOut.at(rsp_args->bank_id).push(mem_rsp, 1); + DT(3, rsp_args->memsim->simobject_->name() << "-mem-rsp[" << rsp_args->bank_id << "]: " << mem_rsp); + } + delete rsp_args; + }, + req_args + ); - // check if the request was enqueued successfully - if (!enqueue_success) { - delete req_args; - return; + DT(3, simobject_->name() << "-mem-req[" << i << "]: " << mem_req); + mem_xbar_->ReqOut.at(i).pop(); } - - if (mem_req.write) { - ++perf_stats_.writes; - } else { - ++perf_stats_.reads; - } - - DT(3, simobject_->name() << " mem-req: " << mem_req); - - simobject_->MemReqPort.pop(); } }; @@ -103,8 +105,8 @@ public: MemSim::MemSim(const SimContext& ctx, const char* name, const Config& config) : SimObject(ctx, name) - , MemReqPort(this) - , MemRspPort(this) + , MemReqPorts(config.num_ports, this) + , MemRspPorts(config.num_ports, this) , impl_(new Impl(this, config)) {} @@ -118,4 +120,8 @@ void MemSim::reset() { void MemSim::tick() { impl_->tick(); +} + +const MemSim::PerfStats &MemSim::perf_stats() const { + return impl_->perf_stats(); } \ No newline at end of file diff --git a/sim/simx/mem_sim.h b/sim/simx/mem_sim.h index 3f4d9801e..d9408751b 100644 --- a/sim/simx/mem_sim.h +++ b/sim/simx/mem_sim.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -21,22 +21,27 @@ namespace vortex { class MemSim : public SimObject{ public: struct Config { - uint32_t channels; - uint32_t num_cores; + uint32_t num_banks; + uint32_t num_ports; + uint32_t block_size; + float clock_ratio; }; struct PerfStats { - uint64_t reads; - uint64_t writes; + uint64_t bank_stalls; - PerfStats() - : reads(0) - , writes(0) + PerfStats() + : bank_stalls(0) {} + + PerfStats& operator+=(const PerfStats& rhs) { + this->bank_stalls += rhs.bank_stalls; + return *this; + } }; - SimPort MemReqPort; - SimPort MemRspPort; + std::vector> MemReqPorts; + std::vector> MemRspPorts; MemSim(const SimContext& ctx, const char* name, const Config& config); ~MemSim(); @@ -46,7 +51,7 @@ public: void tick(); const PerfStats& perf_stats() const; - + private: class Impl; Impl* impl_; diff --git a/sim/simx/processor.cpp b/sim/simx/processor.cpp index 3807fa5e8..2d05ad3dc 100644 --- a/sim/simx/processor.cpp +++ b/sim/simx/processor.cpp @@ -22,12 +22,21 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) { SimPlatform::instance().initialize(); + assert(PLATFORM_MEMORY_DATA_SIZE == MEM_BLOCK_SIZE); + // create memory simulator memsim_ = MemSim::Create("dram", MemSim::Config{ - MEMORY_BANKS, - uint32_t(arch.num_cores()) * arch.num_clusters() + PLATFORM_MEMORY_NUM_BANKS, + L3_MEM_PORTS, + MEM_BLOCK_SIZE, + MEM_CLOCK_RATIO }); + // create clusters + for (uint32_t i = 0; i < arch.num_clusters(); ++i) { + clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_); + } + // create L3 cache l3cache_ = CacheSim::Create("l3cache", CacheSim::Config{ !L3_ENABLED, @@ -38,7 +47,8 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) log2ceil(L3_NUM_BANKS), // B XLEN, // address bits 1, // number of ports - uint8_t(arch.num_clusters()), // request size + L3_NUM_REQS, // request size + L3_MEM_PORTS, // memory ports L3_WRITEBACK, // write-back false, // write response L3_MSHR_SIZE, // mshr size @@ -46,29 +56,33 @@ ProcessorImpl::ProcessorImpl(const Arch& arch) } ); - // connect L3 memory 
ports - l3cache_->MemReqPort.bind(&memsim_->MemReqPort); - memsim_->MemRspPort.bind(&l3cache_->MemRspPort); - - // create clusters + // connect L3 core interfaces for (uint32_t i = 0; i < arch.num_clusters(); ++i) { - clusters_.at(i) = Cluster::Create(i, this, arch, dcrs_); - // connect L3 core ports - clusters_.at(i)->mem_req_port.bind(&l3cache_->CoreReqPorts.at(i)); - l3cache_->CoreRspPorts.at(i).bind(&clusters_.at(i)->mem_rsp_port); + for (uint32_t j = 0; j < L2_MEM_PORTS; ++j) { + clusters_.at(i)->mem_req_ports.at(j).bind(&l3cache_->CoreReqPorts.at(i * L2_MEM_PORTS + j)); + l3cache_->CoreRspPorts.at(i * L2_MEM_PORTS + j).bind(&clusters_.at(i)->mem_rsp_ports.at(j)); + } + } + + // connect L3 memory interfaces + for (uint32_t i = 0; i < L3_MEM_PORTS; ++i) { + l3cache_->MemReqPorts.at(i).bind(&memsim_->MemReqPorts.at(i)); + memsim_->MemRspPorts.at(i).bind(&l3cache_->MemRspPorts.at(i)); } // set up memory profiling - memsim_->MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){ - __unused (cycle); - perf_mem_reads_ += !req.write; - perf_mem_writes_ += req.write; - perf_mem_pending_reads_ += !req.write; - }); - memsim_->MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){ - __unused (cycle); - --perf_mem_pending_reads_; - }); + for (uint32_t i = 0; i < L3_MEM_PORTS; ++i) { + memsim_->MemReqPorts.at(i).tx_callback([&](const MemReq& req, uint64_t cycle){ + __unused (cycle); + perf_mem_reads_ += !req.write; + perf_mem_writes_ += req.write; + perf_mem_pending_reads_ += !req.write; + }); + memsim_->MemRspPorts.at(i).tx_callback([&](const MemRsp&, uint64_t cycle){ + __unused (cycle); + --perf_mem_pending_reads_; + }); + } #ifndef NDEBUG // dump device configuration @@ -95,12 +109,20 @@ void ProcessorImpl::attach_ram(RAM* ram) { cluster->attach_ram(ram); } } +#ifdef VM_ENABLE +void ProcessorImpl::set_satp(uint64_t satp) { + for (auto cluster : clusters_) { + cluster->set_satp(satp); + } +} +#endif -void ProcessorImpl::run() { +int ProcessorImpl::run() { 
SimPlatform::instance().reset(); this->reset(); bool done; + int exitcode = 0; do { SimPlatform::instance().tick(); done = true; @@ -109,9 +131,14 @@ void ProcessorImpl::run() { done = false; continue; } + #ifdef EXT_V_ENABLE + exitcode |= cluster->get_exitcode(); + #endif } perf_mem_latency_ += perf_mem_pending_reads_; } while (!done); + + return exitcode; } void ProcessorImpl::reset() { @@ -131,6 +158,7 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { perf.mem_writes = perf_mem_writes_; perf.mem_latency = perf_mem_latency_; perf.l3cache = l3cache_->perf_stats(); + perf.memsim = memsim_->perf_stats(); return perf; } @@ -138,20 +166,51 @@ ProcessorImpl::PerfStats ProcessorImpl::perf_stats() const { Processor::Processor(const Arch& arch) : impl_(new ProcessorImpl(arch)) -{} +{ +#ifdef VM_ENABLE + satp_ = NULL; +#endif +} Processor::~Processor() { delete impl_; +#ifdef VM_ENABLE + if (satp_ != NULL) + delete satp_; +#endif } void Processor::attach_ram(RAM* mem) { impl_->attach_ram(mem); } -void Processor::run() { - impl_->run(); +int Processor::run() { + return impl_->run(); } void Processor::dcr_write(uint32_t addr, uint32_t value) { return impl_->dcr_write(addr, value); -} \ No newline at end of file +} + +#ifdef VM_ENABLE +int16_t Processor::set_satp_by_addr(uint64_t base_addr) { + uint16_t asid = 0; + satp_ = new SATP_t (base_addr,asid); + if (satp_ == NULL) + return 1; + uint64_t satp = satp_->get_satp(); + impl_->set_satp(satp); + return 0; +} +bool Processor::is_satp_unset() { + return (satp_== NULL); +} +uint8_t Processor::get_satp_mode() { + assert (satp_!=NULL); + return satp_->get_mode(); +} +uint64_t Processor::get_base_ppn() { + assert (satp_!=NULL); + return satp_->get_base_ppn(); +} +#endif diff --git a/sim/simx/processor.h b/sim/simx/processor.h index 003af6b0a..741b04f57 100644 --- a/sim/simx/processor.h +++ b/sim/simx/processor.h @@ -14,12 +14,17 @@ #pragma once #include +#include +#include namespace vortex { class Arch; class RAM; 
class ProcessorImpl; +#ifdef VM_ENABLE +class SATP_t; +#endif class Processor { public: @@ -28,12 +33,21 @@ public: void attach_ram(RAM* mem); - void run(); + int run(); void dcr_write(uint32_t addr, uint32_t value); +#ifdef VM_ENABLE + bool is_satp_unset(); + uint8_t get_satp_mode(); + uint64_t get_base_ppn(); + int16_t set_satp_by_addr(uint64_t addr); +#endif private: ProcessorImpl* impl_; +#ifdef VM_ENABLE + SATP_t *satp_; +#endif }; } diff --git a/sim/simx/processor_impl.h b/sim/simx/processor_impl.h index dcfba84d7..952b28222 100644 --- a/sim/simx/processor_impl.h +++ b/sim/simx/processor_impl.h @@ -25,6 +25,7 @@ class ProcessorImpl { public: struct PerfStats { CacheSim::PerfStats l3cache; + MemSim::PerfStats memsim; uint64_t mem_reads; uint64_t mem_writes; uint64_t mem_latency; @@ -35,10 +36,14 @@ public: void attach_ram(RAM* mem); - void run(); + int run(); void dcr_write(uint32_t addr, uint32_t value); +#ifdef VM_ENABLE + void set_satp(uint64_t satp); +#endif + PerfStats perf_stats() const; private: diff --git a/sim/simx/socket.cpp b/sim/simx/socket.cpp index d7e421b4b..c08e03a5b 100644 --- a/sim/simx/socket.cpp +++ b/sim/simx/socket.cpp @@ -21,11 +21,9 @@ Socket::Socket(const SimContext& ctx, Cluster* cluster, const Arch &arch, const DCRS &dcrs) - : SimObject(ctx, "socket") - , icache_mem_req_port(this) - , icache_mem_rsp_port(this) - , dcache_mem_req_port(this) - , dcache_mem_rsp_port(this) + : SimObject(ctx, StrFormat("socket%d", socket_id)) + , mem_req_ports(L1_MEM_PORTS, this) + , mem_rsp_ports(L1_MEM_PORTS, this) , socket_id_(socket_id) , cluster_(cluster) , cores_(arch.socket_size()) @@ -33,8 +31,8 @@ Socket::Socket(const SimContext& ctx, auto cores_per_socket = cores_.size(); char sname[100]; - snprintf(sname, 100, "socket%d-icaches", socket_id); - icaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_ICACHES, 1, CacheSim::Config{ + snprintf(sname, 100, "%s-icaches", this->name().c_str()); + icaches_ = CacheCluster::Create(sname, 
cores_per_socket, NUM_ICACHES, CacheSim::Config{ !ICACHE_ENABLED, log2ceil(ICACHE_SIZE), // C log2ceil(L1_LINE_SIZE), // L @@ -44,17 +42,15 @@ Socket::Socket(const SimContext& ctx, XLEN, // address bits 1, // number of ports 1, // number of inputs + ICACHE_MEM_PORTS, // memory ports false, // write-back false, // write response (uint8_t)arch.num_warps(), // mshr size 2, // pipeline latency }); - icaches_->MemReqPort.bind(&icache_mem_req_port); - icache_mem_rsp_port.bind(&icaches_->MemRspPort); - - snprintf(sname, 100, "socket%d-dcaches", socket_id); - dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, DCACHE_NUM_REQS, CacheSim::Config{ + snprintf(sname, 100, "%s-dcaches", this->name().c_str()); + dcaches_ = CacheCluster::Create(sname, cores_per_socket, NUM_DCACHES, CacheSim::Config{ !DCACHE_ENABLED, log2ceil(DCACHE_SIZE), // C log2ceil(L1_LINE_SIZE), // L @@ -64,21 +60,51 @@ Socket::Socket(const SimContext& ctx, XLEN, // address bits 1, // number of ports DCACHE_NUM_REQS, // number of inputs + L1_MEM_PORTS, // memory ports DCACHE_WRITEBACK, // write-back false, // write response DCACHE_MSHR_SIZE, // mshr size 2, // pipeline latency }); - dcaches_->MemReqPort.bind(&dcache_mem_req_port); - dcache_mem_rsp_port.bind(&dcaches_->MemRspPort); + // find overlap + uint32_t overlap = MIN(ICACHE_MEM_PORTS, L1_MEM_PORTS); + + // connect l1 caches to outgoing memory interfaces + for (uint32_t i = 0; i < L1_MEM_PORTS; ++i) { + snprintf(sname, 100, "%s-l1_arb%d", this->name().c_str(), i); + auto l1_arb = MemArbiter::Create(sname, ArbiterType::RoundRobin, 2 * overlap, overlap); + + if (i < overlap) { + icaches_->MemReqPorts.at(i).bind(&l1_arb->ReqIn.at(i)); + l1_arb->RspIn.at(i).bind(&icaches_->MemRspPorts.at(i)); + + dcaches_->MemReqPorts.at(i).bind(&l1_arb->ReqIn.at(overlap + i)); + l1_arb->RspIn.at(overlap + i).bind(&dcaches_->MemRspPorts.at(i)); + + l1_arb->ReqOut.at(i).bind(&this->mem_req_ports.at(i)); + 
this->mem_rsp_ports.at(i).bind(&l1_arb->RspOut.at(i)); + } else { + if (L1_MEM_PORTS > ICACHE_MEM_PORTS) { + // if more dcache ports + dcaches_->MemReqPorts.at(i).bind(&this->mem_req_ports.at(i)); + this->mem_rsp_ports.at(i).bind(&dcaches_->MemRspPorts.at(i)); + } else { + // if more icache ports + icaches_->MemReqPorts.at(i).bind(&this->mem_req_ports.at(i)); + this->mem_rsp_ports.at(i).bind(&icaches_->MemRspPorts.at(i)); + } + } + } // create cores - for (uint32_t i = 0; i < cores_per_socket; ++i) { uint32_t core_id = socket_id * cores_per_socket + i; cores_.at(i) = Core::Create(core_id, this, arch, dcrs); + } + // connect cores to caches + for (uint32_t i = 0; i < cores_per_socket; ++i) { cores_.at(i)->icache_req_ports.at(0).bind(&icaches_->CoreReqPorts.at(i).at(0)); icaches_->CoreRspPorts.at(i).at(0).bind(&cores_.at(i)->icache_rsp_ports.at(0)); @@ -107,6 +133,14 @@ void Socket::attach_ram(RAM* ram) { } } +#ifdef VM_ENABLE +void Socket::set_satp(uint64_t satp) { + for (auto core : cores_) { + core->set_satp(satp); + } +} +#endif + bool Socket::running() const { for (auto& core : cores_) { if (core->running()) diff --git a/sim/simx/socket.h b/sim/simx/socket.h index ed38dce67..f8c266d05 100644 --- a/sim/simx/socket.h +++ b/sim/simx/socket.h @@ -1,10 +1,10 @@ // Copyright © 2019-2023 -// +// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 -// +// // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -32,16 +32,13 @@ public: CacheSim::PerfStats dcache; }; - SimPort icache_mem_req_port; - SimPort icache_mem_rsp_port; + std::vector> mem_req_ports; + std::vector> mem_rsp_ports; - SimPort dcache_mem_req_port; - SimPort dcache_mem_rsp_port; - - Socket(const SimContext& ctx, + Socket(const SimContext& ctx, uint32_t socket_id, - Cluster* cluster, - const Arch &arch, + Cluster* cluster, + const Arch &arch, const DCRS &dcrs); ~Socket(); @@ -60,16 +57,20 @@ public: void attach_ram(RAM* ram); +#ifdef VM_ENABLE + void set_satp(uint64_t satp); +#endif + bool running() const; - int get_exitcode() const; + int get_exitcode() const; void barrier(uint32_t bar_id, uint32_t count, uint32_t core_id); void resume(uint32_t core_id); PerfStats perf_stats() const; - + private: uint32_t socket_id_; Cluster* cluster_; diff --git a/sim/simx/types.cpp b/sim/simx/types.cpp index 3e6c5960f..56bf60cea 100644 --- a/sim/simx/types.cpp +++ b/sim/simx/types.cpp @@ -15,11 +15,11 @@ using namespace vortex; -LocalMemDemux::LocalMemDemux( +LocalMemSwitch::LocalMemSwitch( const SimContext& ctx, const char* name, uint32_t delay -) : SimObject(ctx, name) +) : SimObject(ctx, name) , ReqIn(this) , RspIn(this) , ReqLmem(this) @@ -29,19 +29,19 @@ LocalMemDemux::LocalMemDemux( , delay_(delay) {} -void LocalMemDemux::reset() {} +void LocalMemSwitch::reset() {} -void LocalMemDemux::tick() { - // process incoming responses +void LocalMemSwitch::tick() { + // process outgoing responses if (!RspLmem.empty()) { auto& out_rsp = RspLmem.front(); - DT(4, this->name() << " lmem-rsp: " << out_rsp); + DT(4, this->name() << "-lmem-rsp: " << out_rsp); RspIn.push(out_rsp, 1); RspLmem.pop(); } if (!RspDC.empty()) { auto& out_rsp = RspDC.front(); - DT(4, this->name() << " dc-rsp: " << out_rsp); + DT(4, this->name() << "-dc-rsp: " << out_rsp); RspIn.push(out_rsp, 1); RspDC.pop(); } @@ -73,12 +73,12 @@ void LocalMemDemux::tick() { if (!out_dc_req.mask.none()) { ReqDC.push(out_dc_req, delay_); - DT(4, this->name() << " 
dc-req: " << out_dc_req); + DT(4, this->name() << "-dc-req: " << out_dc_req); } if (!out_lmem_req.mask.none()) { ReqLmem.push(out_lmem_req, delay_); - DT(4, this->name() << " lmem-req: " << out_lmem_req); + DT(4, this->name() << "-lmem-req: " << out_lmem_req); } ReqIn.pop(); } @@ -104,12 +104,12 @@ void LsuMemAdapter::reset() {} void LsuMemAdapter::tick() { uint32_t input_size = ReqOut.size(); - // process incoming responses + // process outgoing responses for (uint32_t i = 0; i < input_size; ++i) { if (RspOut.at(i).empty()) continue; auto& out_rsp = RspOut.at(i).front(); - DT(4, this->name() << " rsp" << i << ": " << out_rsp); + DT(4, this->name() << "-rsp" << i << ": " << out_rsp); // build memory response LsuRsp in_rsp(input_size); @@ -141,7 +141,6 @@ void LsuMemAdapter::tick() { if (!ReqIn.empty()) { auto& in_req = ReqIn.front(); assert(in_req.mask.size() == input_size); - for (uint32_t i = 0; i < input_size; ++i) { if (in_req.mask.test(i)) { // build memory request @@ -152,10 +151,9 @@ void LsuMemAdapter::tick() { out_req.tag = in_req.tag; out_req.cid = in_req.cid; out_req.uuid = in_req.uuid; - // send memory request ReqOut.at(i).push(out_req, delay_); - DT(4, this->name() << " req" << i << ": " << out_req); + DT(4, this->name() << "-req" << i << ": " << out_req); } } ReqIn.pop(); diff --git a/sim/simx/types.h b/sim/simx/types.h index b452dd379..8c6912a29 100644 --- a/sim/simx/types.h +++ b/sim/simx/types.h @@ -25,6 +25,7 @@ #include #include #include "debug.h" +#include namespace vortex { @@ -58,11 +59,33 @@ typedef std::bitset WarpMask; /////////////////////////////////////////////////////////////////////////////// +class ThreadMaskOS { +public: + ThreadMaskOS(const ThreadMask& mask, int size) + : mask_(mask) + , size_(size) + {} + + friend std::ostream& operator<<(std::ostream& os, const ThreadMaskOS& wrapper) { + for (int i = 0; i < wrapper.size_; ++i) { + os << wrapper.mask_[i]; + } + return os; + } + +private: + const ThreadMask& mask_; + int size_; +}; 
+ +/////////////////////////////////////////////////////////////////////////////// + enum class RegType { None, Integer, Float, - Count + Count, + Vector }; inline std::ostream &operator<<(std::ostream &os, const RegType& type) { @@ -70,6 +93,7 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) { case RegType::None: break; case RegType::Integer: os << "x"; break; case RegType::Float: os << "f"; break; + case RegType::Vector: os << "v"; break; default: assert(false); } return os; @@ -82,6 +106,7 @@ enum class FUType { LSU, FPU, SFU, + TCU, Count }; @@ -91,6 +116,7 @@ inline std::ostream &operator<<(std::ostream &os, const FUType& type) { case FUType::LSU: os << "LSU"; break; case FUType::FPU: os << "FPU"; break; case FUType::SFU: os << "SFU"; break; + case FUType::TCU: os << "TCU"; break; default: assert(false); } return os; @@ -122,14 +148,30 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) { enum class LsuType { LOAD, + TCU_LOAD, STORE, + TCU_STORE, FENCE }; +enum class TCUType { + TCU_MUL +}; + +inline std::ostream &operator<<(std::ostream &os, const TCUType& type) { + switch (type) { + case TCUType::TCU_MUL: os << "TCU MUL"; break; + default: assert(false); + } + return os; +} + inline std::ostream &operator<<(std::ostream &os, const LsuType& type) { switch (type) { case LsuType::LOAD: os << "LOAD"; break; + case LsuType::TCU_LOAD: os << "TCU_LOAD"; break; case LsuType::STORE: os << "STORE"; break; + case LsuType::TCU_STORE: os << "TCU_STORE"; break; case LsuType::FENCE: os << "FENCE"; break; default: assert(false); } @@ -260,17 +302,18 @@ struct LsuReq { }; inline std::ostream &operator<<(std::ostream &os, const LsuReq& req) { - os << "rw=" << req.write << ", mask=" << req.mask << ", "; + os << "rw=" << req.write << ", mask=" << req.mask << ", addr={"; + bool first_addr = true; for (size_t i = 0; i < req.mask.size(); ++i) { - os << "addr" << i << "="; + if (!first_addr) os << ", "; + first_addr = false; if 
(req.mask.test(i)) { os << "0x" << std::hex << req.addrs.at(i) << std::dec; } else { os << "-"; } - os << ", "; } - os << "tag=0x" << std::hex << req.tag << std::dec << ", cid=" << req.cid; + os << "}, tag=0x" << std::hex << req.tag << std::dec << ", cid=" << req.cid; os << " (#" << req.uuid << ")"; return os; } @@ -355,6 +398,8 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) { template class HashTable { public: + typedef T DataType; + HashTable(uint32_t capacity) : entries_(capacity) , size_(0) @@ -425,29 +470,31 @@ private: /////////////////////////////////////////////////////////////////////////////// template -class Mux : public SimObject> { +class Arbiter : public SimObject> { public: + typedef Type ReqType; + std::vector> Inputs; std::vector> Outputs; - Mux( + Arbiter( const SimContext& ctx, const char* name, ArbiterType type, uint32_t num_inputs, uint32_t num_outputs = 1, uint32_t delay = 1 - ) : SimObject>(ctx, name) + ) : SimObject>(ctx, name) , Inputs(num_inputs, this) , Outputs(num_outputs, this) , type_(type) , delay_(delay) - , cursors_(num_outputs, 0) - , num_reqs_(num_inputs / num_outputs) + , grants_(num_outputs, 0) + , lg2_num_reqs_(log2ceil(num_inputs / num_outputs)) { assert(delay != 0); - assert(num_inputs <= 32); - assert(num_outputs <= 32); + assert(num_inputs <= 64); + assert(num_outputs <= 64); assert(num_inputs >= num_outputs); // bypass mode @@ -459,15 +506,15 @@ public: } void reset() { - for (auto& cursor : cursors_) { - cursor = 0; + for (auto& grant : grants_) { + grant = 0; } } void tick() { uint32_t I = Inputs.size(); uint32_t O = Outputs.size(); - uint32_t R = num_reqs_; + uint32_t R = 1 << lg2_num_reqs_; // skip bypass mode if (I == O) @@ -476,49 +523,164 @@ public: // process inputs for (uint32_t o = 0; o < O; ++o) { for (uint32_t r = 0; r < R; ++r) { - uint32_t i = (cursors_.at(o) + r) & (R-1); - uint32_t j = o * R + i; + uint32_t g = (grants_.at(o) + r) & (R-1); + uint32_t j = o * R + g; if (j >= I) 
continue; auto& req_in = Inputs.at(j); if (!req_in.empty()) { auto& req = req_in.front(); + DT(4, this->name() << "-req" << o << ": " << req); Outputs.at(o).push(req, delay_); req_in.pop(); - this->update_cursor(o, i); + this->update_grant(o, g); break; } } } } -private: +protected: - void update_cursor(uint32_t index, uint32_t grant) { + void update_grant(uint32_t index, uint32_t grant) { if (type_ == ArbiterType::RoundRobin) { - cursors_.at(index) = grant + 1; + grants_.at(index) = grant + 1; } } ArbiterType type_; uint32_t delay_; - std::vector cursors_; - uint32_t num_reqs_; + std::vector grants_; + uint32_t lg2_num_reqs_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +template +class CrossBar : public SimObject> { +public: + typedef Type ReqType; + + std::vector> Inputs; + std::vector> Outputs; + + CrossBar( + const SimContext& ctx, + const char* name, + ArbiterType type, + uint32_t num_inputs, + uint32_t num_outputs = 1, + uint32_t delay = 1, + std::function output_sel = nullptr + ) + : SimObject>(ctx, name) + , Inputs(num_inputs, this) + , Outputs(num_outputs, this) + , type_(type) + , delay_(delay) + , grants_(num_outputs, 0) + , lg2_inputs_(log2ceil(num_inputs)) + , lg2_outputs_(log2ceil(num_outputs)) + , collisions_(0) { + assert(delay != 0); + assert(num_inputs <= 64); + assert(num_outputs <= 64); + assert(ispow2(num_outputs)); + if (output_sel != nullptr) { + output_sel_ = output_sel; + } else { + output_sel_ = [this](const Type& req) { + return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1)); + }; + } + } + + void reset() { + for (auto& grant : grants_) { + grant = 0; + } + } + + void tick() { + uint32_t I = Inputs.size(); + uint32_t O = Outputs.size(); + uint32_t R = 1 << lg2_inputs_; + + // process incoming requests + for (uint32_t o = 0; o < O; ++o) { + int32_t input_idx = -1; + bool has_collision = false; + for (uint32_t r = 0; r < R; ++r) { + uint32_t i = (grants_.at(o) + r) & (R-1); + if (i >= I) + 
continue; + auto& req_in = Inputs.at(i); + if (req_in.empty()) + continue; + auto& req = req_in.front(); + uint32_t output_idx = 0; + if (lg2_outputs_ != 0) { + // select output index + output_idx = output_sel_(req); + // skip if input is not going to current output + if (output_idx != o) + continue; + } + if (input_idx != -1) { + has_collision = true; + continue; + } + input_idx = i; + } + if (input_idx != -1) { + auto& req_in = Inputs.at(input_idx); + auto& req = req_in.front(); + DT(4, this->name() << "-req" << o << ": " << req); + Outputs.at(o).push(req, delay_); + req_in.pop(); + this->update_grant(o, input_idx); + collisions_ += has_collision; + } + } + } + + uint64_t collisions() const { + return collisions_; + } + +protected: + + void update_grant(uint32_t index, uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + grants_.at(index) = grant + 1; + } + } + + ArbiterType type_; + uint32_t delay_; + std::vector grants_; + uint32_t lg2_inputs_; + uint32_t lg2_outputs_; + std::function output_sel_; + uint64_t collisions_; }; /////////////////////////////////////////////////////////////////////////////// template -class Switch : public SimObject> { +class TxArbiter : public SimObject> { public: + typedef Req ReqType; + typedef Rsp RspType; + std::vector> ReqIn; std::vector> RspIn; std::vector> ReqOut; std::vector> RspOut; - Switch( + TxArbiter( const SimContext& ctx, const char* name, ArbiterType type, @@ -526,19 +688,19 @@ public: uint32_t num_outputs = 1, uint32_t delay = 1 ) - : SimObject>(ctx, name) + : SimObject>(ctx, name) , ReqIn(num_inputs, this) , RspIn(num_inputs, this) , ReqOut(num_outputs, this) , RspOut(num_outputs, this) , type_(type) , delay_(delay) - , cursors_(num_outputs, 0) - , lg_num_reqs_(log2ceil(num_inputs / num_outputs)) + , grants_(num_outputs, 0) + , lg2_num_reqs_(log2ceil(num_inputs / num_outputs)) { assert(delay != 0); - assert(num_inputs <= 32); - assert(num_outputs <= 32); + assert(num_inputs <= 64); + assert(num_outputs <= 
64); assert(num_inputs >= num_outputs); // bypass mode @@ -551,76 +713,256 @@ public: } void reset() { - for (auto& cursor : cursors_) { - cursor = 0; + for (auto& grant : grants_) { + grant = 0; } } void tick() { uint32_t I = ReqIn.size(); uint32_t O = ReqOut.size(); - uint32_t R = 1 << lg_num_reqs_; + uint32_t R = 1 << lg2_num_reqs_; // skip bypass mode if (I == O) return; + // process outgoing responses for (uint32_t o = 0; o < O; ++o) { - // process incoming responses - if (!RspOut.at(o).empty()) { - auto& rsp = RspOut.at(o).front(); - uint32_t i = 0; - if (lg_num_reqs_ != 0) { - i = rsp.tag & (R-1); - rsp.tag >>= lg_num_reqs_; + auto& rsp_out = RspOut.at(o); + if (!rsp_out.empty()) { + auto& rsp = rsp_out.front(); + uint32_t g = 0; + if (lg2_num_reqs_ != 0) { + g = rsp.tag & (R-1); + rsp.tag >>= lg2_num_reqs_; } - DT(4, this->name() << " rsp" << o << ": " << rsp); - uint32_t j = o * R + i; + uint32_t j = o * R + g; + DT(4, this->name() << "-rsp" << j << ": " << rsp); RspIn.at(j).push(rsp, 1); - RspOut.at(o).pop(); + rsp_out.pop(); } + } - // process incoming requests + // process incoming requests + for (uint32_t o = 0; o < O; ++o) { for (uint32_t r = 0; r < R; ++r) { - uint32_t i = (cursors_.at(o) + r) & (R-1); - uint32_t j = o * R + i; + uint32_t g = (grants_.at(o) + r) & (R-1); + uint32_t j = o * R + g; if (j >= I) continue; auto& req_in = ReqIn.at(j); if (!req_in.empty()) { auto& req = req_in.front(); - if (lg_num_reqs_ != 0) { - req.tag = (req.tag << lg_num_reqs_) | i; + if (lg2_num_reqs_ != 0) { + req.tag = (req.tag << lg2_num_reqs_) | g; } - DT(4, this->name() << " req" << j << ": " << req); + DT(4, this->name() << "-req" << o << ": " << req); ReqOut.at(o).push(req, delay_); req_in.pop(); - this->update_cursor(o, i); + this->update_grant(o, g); break; } } } } - void update_cursor(uint32_t index, uint32_t grant) { +protected: + + void update_grant(uint32_t index, uint32_t grant) { if (type_ == ArbiterType::RoundRobin) { - cursors_.at(index) = grant + 1; 
+ grants_.at(index) = grant + 1; } } -private: ArbiterType type_; uint32_t delay_; - std::vector cursors_; - uint32_t lg_num_reqs_; + std::vector grants_; + uint32_t lg2_num_reqs_; }; -using MemSwitch = Switch; - /////////////////////////////////////////////////////////////////////////////// -class LocalMemDemux : public SimObject { +template +class TxCrossBar : public SimObject> { +public: + typedef Req ReqType; + typedef Rsp RspType; + + std::vector> ReqIn; + std::vector> RspIn; + + std::vector> ReqOut; + std::vector> RspOut; + + TxCrossBar( + const SimContext& ctx, + const char* name, + ArbiterType type, + uint32_t num_inputs, + uint32_t num_outputs = 1, + uint32_t delay = 1, + std::function output_sel = nullptr + ) + : SimObject>(ctx, name) + , ReqIn(num_inputs, this) + , RspIn(num_inputs, this) + , ReqOut(num_outputs, this) + , RspOut(num_outputs, this) + , type_(type) + , delay_(delay) + , req_grants_(num_outputs, 0) + , rsp_grants_(num_inputs, 0) + , lg2_inputs_(log2ceil(num_inputs)) + , lg2_outputs_(log2ceil(num_outputs)) + , req_collisions_(0) + , rsp_collisions_(0) { + assert(delay != 0); + assert(num_inputs <= 64); + assert(num_outputs <= 64); + assert(ispow2(num_inputs)); + assert(ispow2(num_outputs)); + if (output_sel != nullptr) { + output_sel_ = output_sel; + } else { + output_sel_ = [this](const Req& req) { + return (uint32_t)bit_getw(req.addr, 0, (lg2_outputs_-1)); + }; + } + } + + void reset() { + for (auto& grant : req_grants_) { + grant = 0; + } + for (auto& grant : rsp_grants_) { + grant = 0; + } + } + + void tick() { + uint32_t I = ReqIn.size(); + uint32_t O = ReqOut.size(); + uint32_t R = 1 << lg2_inputs_; + uint32_t T = 1 << lg2_outputs_; + + // process outgoing responses + for (uint32_t i = 0; i < I; ++i) { + int32_t output_idx = -1; + bool has_collision = false; + for (uint32_t t = 0; t < T; ++t) { + uint32_t o = (rsp_grants_.at(i) + t) & (T-1); + if (o >= O) + continue; + auto& rsp_out = RspOut.at(o); + if (rsp_out.empty()) + continue; + 
auto& rsp = rsp_out.front(); + uint32_t input_idx = 0; + if (lg2_inputs_ != 0) { + input_idx = rsp.tag & (R-1); + // skip if response is not going to current input + if (input_idx != i) + continue; + } + if (output_idx != -1) { + has_collision = true; + continue; + } + output_idx = o; + } + if (output_idx != -1) { + auto& rsp_out = RspOut.at(output_idx); + auto& rsp = rsp_out.front(); + if (lg2_inputs_ != 0) { + rsp.tag >>= lg2_inputs_; + } + DT(4, this->name() << "-rsp" << i << ": " << rsp); + RspIn.at(i).push(rsp, 1); + rsp_out.pop(); + this->update_rsp_grant(i, output_idx); + rsp_collisions_ += has_collision; + } + } + + // process incoming requests + for (uint32_t o = 0; o < O; ++o) { + int32_t input_idx = -1; + bool has_collision = false; + for (uint32_t r = 0; r < R; ++r) { + uint32_t i = (req_grants_.at(o) + r) & (R-1); + if (i >= I) + continue; + auto& req_in = ReqIn.at(i); + if (req_in.empty()) + continue; + auto& req = req_in.front(); + uint32_t output_idx = 0; + if (lg2_outputs_ != 0) { + // select output index + output_idx = output_sel_(req); + // skip if request is not going to current output + if (output_idx != o) + continue; + } + if (input_idx != -1) { + has_collision = true; + continue; + } + input_idx = i; + } + if (input_idx != -1) { + auto& req_in = ReqIn.at(input_idx); + auto& req = req_in.front(); + if (lg2_inputs_ != 0) { + req.tag = (req.tag << lg2_inputs_) | input_idx; + } + DT(4, this->name() << "-req" << o << ": " << req); + ReqOut.at(o).push(req, delay_); + req_in.pop(); + this->update_req_grant(o, input_idx); + req_collisions_ += has_collision; + } + } + } + + uint64_t req_collisions() const { + return req_collisions_; + } + + uint64_t rsp_collisions() const { + return rsp_collisions_; + } + +protected: + + void update_req_grant(uint32_t index, uint32_t grant) { + if (type_ == ArbiterType::RoundRobin) { + req_grants_.at(index) = grant + 1; + } + } + + void update_rsp_grant(uint32_t index, uint32_t grant) { + if (type_ == 
ArbiterType::RoundRobin) { + rsp_grants_.at(index) = grant + 1; + } + } + + ArbiterType type_; + uint32_t delay_; + std::vector req_grants_; + std::vector rsp_grants_; + uint32_t lg2_inputs_; + uint32_t lg2_outputs_; + std::function output_sel_; + uint64_t req_collisions_; + uint64_t rsp_collisions_; +}; + +/////////////////////////////////////////////////////////////////////////////// + +class LocalMemSwitch : public SimObject { public: SimPort ReqIn; SimPort RspIn; @@ -631,7 +973,7 @@ public: SimPort ReqDC; SimPort RspDC; - LocalMemDemux( + LocalMemSwitch( const SimContext& ctx, const char* name, uint32_t delay @@ -670,4 +1012,8 @@ private: uint32_t delay_; }; +using LsuArbiter = TxArbiter; +using MemArbiter = TxArbiter; +using MemCrossBar = TxCrossBar; + } diff --git a/sim/simx/vpu.cpp b/sim/simx/vpu.cpp new file mode 100644 index 000000000..3a70560ec --- /dev/null +++ b/sim/simx/vpu.cpp @@ -0,0 +1,2481 @@ +// This is a fork of https://github.com/troibe/vortex/tree/simx-v2-vector +// The purpose of this fork is to make simx-v2-vector up to date with master +// Thanks to Troibe for his amazing work + +#ifdef EXT_V_ENABLE +#include "emulator.h" +#include "instr.h" +#include "processor_impl.h" +#include +#include +#include +#include +#include +#include "vpu.h" + +using namespace vortex; + +void Emulator::loadVector(const Instr &instr, uint32_t wid, std::vector &rsdata) { + auto &warp = warps_.at(wid); + auto vmask = instr.getVmask(); + auto rdest = instr.getRDest(); + auto mop = instr.getVmop(); + switch (mop) { + case 0b00: { // unit-stride + auto lumop = instr.getVumop(); + switch (lumop) { + case 0b10000: // vle8ff.v, vle16ff.v, vle32ff.v, vle64ff.v - we do not support exceptions -> treat like regular unit stride + // vlseg2e8ff.v, vlseg2e16ff.v, vlseg2e32ff.v, vlseg2e64ff.v + // vlseg3e8ff.v, vlseg3e16ff.v, vlseg3e32ff.v, vlseg3e64ff.v + // vlseg4e8ff.v, vlseg4e16ff.v, vlseg4e32ff.v, vlseg4e64ff.v + // vlseg5e8ff.v, vlseg5e16ff.v, vlseg5e32ff.v, vlseg5e64ff.v + 
// vlseg6e8ff.v, vlseg6e16ff.v, vlseg6e32ff.v, vlseg6e64ff.v + // vlseg7e8ff.v, vlseg7e16ff.v, vlseg7e32ff.v, vlseg7e64ff.v + // vlseg8e8ff.v, vlseg8e16ff.v, vlseg8e32ff.v, vlseg8e64ff.v + case 0b0000: { // vle8.v, vle16.v, vle32.v, vle64.v + // vlseg2e8.v, vlseg2e16.v, vlseg2e32.v, vlseg2e64.v + // vlseg3e8.v, vlseg3e16.v, vlseg3e32.v, vlseg3e64.v + // vlseg4e8.v, vlseg4e16.v, vlseg4e32.v, vlseg4e64.v + // vlseg5e8.v, vlseg5e16.v, vlseg5e32.v, vlseg5e64.v + // vlseg6e8.v, vlseg6e16.v, vlseg6e32.v, vlseg6e64.v + // vlseg7e8.v, vlseg7e16.v, vlseg7e32.v, vlseg7e64.v + // vlseg8e8.v, vlseg8e16.v, vlseg8e32.v, vlseg8e64.v + WordI stride = warp.vtype.vsew / 8; + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_load(warp.vreg_file, this, rsdata[0][0].i, rdest, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b1000: { // vl1r.v, vl2r.v, vl4r.v, vl8r.v + uint32_t nreg = instr.getVnf() + 1; + if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) { + std::cout << "Whole vector register load - reserved value for nreg: " << nreg << std::endl; + std::abort(); + } + DP(4, "Whole vector register load with nreg: " << nreg); + uint32_t stride = 1 << instr.getVsew(); + uint32_t vsew_bits = stride * 8; + uint32_t vl = nreg * VLEN / vsew_bits; + vector_op_vix_load(warp.vreg_file, this, rsdata[0][0].i, rdest, vsew_bits, vl, false, stride, 1, 0, vmask); + break; + } + case 0b1011: { // vlm.v + if (warp.vtype.vsew != 8) { + std::cout << "vlm.v only supports SEW=8, but SEW was: " << warp.vtype.vsew << std::endl; + std::abort(); + } + WordI stride = warp.vtype.vsew / 8; + vector_op_vix_load(warp.vreg_file, this, rsdata[0][0].i, rdest, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true); + break; + } + default: + std::cout << "Load vector - unsupported lumop: " << lumop << std::endl; + std::abort(); + } + break; + } + case 0b10: { // strided: vlse8.v, vlse16.v, vlse32.v, vlse64.v + // vlsseg2e8.v, vlsseg2e16.v, vlsseg2e32.v, 
vlsseg2e64.v + // vlsseg3e8.v, vlsseg3e16.v, vlsseg3e32.v, vlsseg3e64.v + // vlsseg4e8.v, vlsseg4e16.v, vlsseg4e32.v, vlsseg4e64.v + // vlsseg5e8.v, vlsseg5e16.v, vlsseg5e32.v, vlsseg5e64.v + // vlsseg6e8.v, vlsseg6e16.v, vlsseg6e32.v, vlsseg6e64.v + // vlsseg7e8.v, vlsseg7e16.v, vlsseg7e32.v, vlsseg7e64.v + // vlsseg8e8.v, vlsseg8e16.v, vlsseg8e32.v, vlsseg8e64.v + auto rsrc1 = instr.getRSrc(1); + auto rdest = instr.getRDest(); + WordI stride = warp.ireg_file.at(0).at(rsrc1); + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_load(warp.vreg_file, this, rsdata[0][0].i, rdest, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b01: // indexed - unordered, vluxei8.v, vluxei16.v, vluxei32.v, vluxei64.v + // vluxseg2e8.v, vluxseg2e16.v, vluxseg2e32.v, vluxseg2e64.v + // vluxseg3e8.v, vluxseg3e16.v, vluxseg3e32.v, vluxseg3e64.v + // vluxseg4e8.v, vluxseg4e16.v, vluxseg4e32.v, vluxseg4e64.v + // vluxseg5e8.v, vluxseg5e16.v, vluxseg5e32.v, vluxseg5e64.v + // vluxseg6e8.v, vluxseg6e16.v, vluxseg6e32.v, vluxseg6e64.v + // vluxseg7e8.v, vluxseg7e16.v, vluxseg7e32.v, vluxseg7e64.v + // vluxseg8e8.v, vluxseg8e16.v, vluxseg8e32.v, vluxseg8e64.v + case 0b11: { // indexed - ordered, vloxei8.v, vloxei16.v, vloxei32.v, vloxei64.v + // vloxseg2e8.v, vloxseg2e16.v, vloxseg2e32.v, vloxseg2e64.v + // vloxseg3e8.v, vloxseg3e16.v, vloxseg3e32.v, vloxseg3e64.v + // vloxseg4e8.v, vloxseg4e16.v, vloxseg4e32.v, vloxseg4e64.v + // vloxseg5e8.v, vloxseg5e16.v, vloxseg5e32.v, vloxseg5e64.v + // vloxseg6e8.v, vloxseg6e16.v, vloxseg6e32.v, vloxseg6e64.v + // vloxseg7e8.v, vloxseg7e16.v, vloxseg7e32.v, vloxseg7e64.v + // vloxseg8e8.v, vloxseg8e16.v, vloxseg8e32.v, vloxseg8e64.v + uint32_t nfields = instr.getVnf() + 1; + uint32_t vsew_bits = 1 << (3 + instr.getVsew()); + vector_op_vv_load(warp.vreg_file, this, rsdata[0][0].i, instr.getRSrc(1), rdest, warp.vtype.vsew, vsew_bits, warp.vl, nfields, warp.vtype.vlmul, vmask); + break; + } + default: + 
std::cout << "Load vector - unsupported mop: " << mop << std::endl; + std::abort(); + } +} + +void Emulator::storeVector(const Instr &instr, uint32_t wid, std::vector &rsdata) { + auto &warp = warps_.at(wid); + auto vmask = instr.getVmask(); + auto mop = instr.getVmop(); + switch (mop) { + case 0b00: { // unit-stride + auto vs3 = instr.getRSrc(1); + auto sumop = instr.getVumop(); + WordI stride = warp.vtype.vsew / 8; + switch (sumop) { + case 0b0000: { // vse8.v, vse16.v, vse32.v, vse64.v + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_store(warp.vreg_file, this, rsdata[0][0].i, vs3, warp.vtype.vsew, warp.vl, false, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b1000: { // vs1r.v, vs2r.v, vs4r.v, vs8r.v + uint32_t nreg = instr.getVnf() + 1; + if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) { + std::cout << "Whole vector register store - reserved value for nreg: " << nreg << std::endl; + std::abort(); + } + DP(4, "Whole vector register store with nreg: " << nreg); + uint32_t vl = nreg * VLEN / 8; + vector_op_vix_store(warp.vreg_file, this, rsdata[0][0].i, vs3, vl, false, stride, 1, 0, vmask); + break; + } + case 0b1011: { // vsm.v + if (warp.vtype.vsew != 8) { + std::cout << "vsm.v only supports EEW=8, but EEW was: " << warp.vtype.vsew << std::endl; + std::abort(); + } + vector_op_vix_store(warp.vreg_file, this, rsdata[0][0].i, vs3, warp.vtype.vsew, (warp.vl + 7) / 8, false, stride, 1, 0, true); + break; + } + default: + std::cout << "Store vector - unsupported sumop: " << sumop << std::endl; + std::abort(); + } + break; + } + case 0b10: { // strided: vsse8.v, vsse16.v, vsse32.v, vsse64.v + // vssseg2e8.v, vssseg2e16.v, vssseg2e32.v, vssseg2e64.v + // vssseg3e8.v, vssseg3e16.v, vssseg3e32.v, vssseg3e64.v + // vssseg4e8.v, vssseg4e16.v, vssseg4e32.v, vssseg4e64.v + // vssseg5e8.v, vssseg5e16.v, vssseg5e32.v, vssseg5e64.v + // vssseg6e8.v, vssseg6e16.v, vssseg6e32.v, vssseg6e64.v + // vssseg7e8.v, vssseg7e16.v, vssseg7e32.v, 
vssseg7e64.v + // vssseg8e8.v, vssseg8e16.v, vssseg8e32.v, vssseg8e64.v + auto rsrc1 = instr.getRSrc(1); + auto vs3 = instr.getRSrc(2); + WordI stride = warp.ireg_file.at(0).at(rsrc1); + uint32_t nfields = instr.getVnf() + 1; + vector_op_vix_store(warp.vreg_file, this, rsdata[0][0].i, vs3, warp.vtype.vsew, warp.vl, true, stride, nfields, warp.vtype.vlmul, vmask); + break; + } + case 0b01: // indexed - unordered, vsuxei8.v, vsuxei16.v, vsuxei32.v, vsuxei64.v + // vsuxseg2ei8.v, vsuxseg2ei16.v, vsuxseg2ei32.v, vsuxseg2ei64.v + // vsuxseg3ei8.v, vsuxseg3ei16.v, vsuxseg3ei32.v, vsuxseg3ei64.v + // vsuxseg4ei8.v, vsuxseg4ei16.v, vsuxseg4ei32.v, vsuxseg4ei64.v + // vsuxseg5ei8.v, vsuxseg5ei16.v, vsuxseg5ei32.v, vsuxseg5ei64.v + // vsuxseg6ei8.v, vsuxseg6ei16.v, vsuxseg6ei32.v, vsuxseg6ei64.v + // vsuxseg7ei8.v, vsuxseg7ei16.v, vsuxseg7ei32.v, vsuxseg7ei64.v + // vsuxseg8ei8.v, vsuxseg8ei16.v, vsuxseg8ei32.v, vsuxseg8ei64.v + case 0b11: { // indexed - ordered, vsoxei8.v, vsoxei16.v, vsoxei32.v, vsoxei64.v + // vsoxseg2ei8.v, vsoxseg2ei16.v, vsoxseg2ei32.v, vsoxseg2ei64.v + // vsoxseg3ei8.v, vsoxseg3ei16.v, vsoxseg3ei32.v, vsoxseg3ei64.v + // vsoxseg4ei8.v, vsoxseg4ei16.v, vsoxseg4ei32.v, vsoxseg4ei64.v + // vsoxseg5ei8.v, vsoxseg5ei16.v, vsoxseg5ei32.v, vsoxseg5ei64.v + // vsoxseg6ei8.v, vsoxseg6ei16.v, vsoxseg6ei32.v, vsoxseg6ei64.v + // vsoxseg7ei8.v, vsoxseg7ei16.v, vsoxseg7ei32.v, vsoxseg7ei64.v + // vsoxseg8ei8.v, vsoxseg8ei16.v, vsoxseg8ei32.v, vsoxseg8ei64.v + uint32_t nfields = instr.getVnf() + 1; + uint32_t vsew_bits = 1 << (3 + instr.getVsew()); + vector_op_vv_store(warp.vreg_file, this, rsdata[0][0].i, instr.getRSrc(1), instr.getRSrc(2), warp.vtype.vsew, vsew_bits, warp.vl, nfields, warp.vtype.vlmul, vmask); + break; + } + default: + std::cout << "Store vector - unsupported mop: " << mop << std::endl; + std::abort(); + } +} + +void Emulator::executeVector(const Instr &instr, uint32_t wid, std::vector &rsdata, std::vector &rddata) { + auto &warp = 
warps_.at(wid); + auto func3 = instr.getFunc3(); + auto func6 = instr.getFunc6(); + + auto rdest = instr.getRDest(); + auto rsrc0 = instr.getRSrc(0); + auto rsrc1 = instr.getRSrc(1); + auto immsrc = sext((Word)instr.getImm(), width_reg); + auto uimmsrc = (Word)instr.getImm(); + auto vmask = instr.getVmask(); + auto num_threads = arch_.num_threads(); + + switch (func3) { + case 0: { // vector - vector + switch (func6) { + case 0: { // vadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 2: { // vsub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 4: { // vminu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 5: { // vmin.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 6: { // vmaxu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 7: { // vmax.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 9: { // vand.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 10: { // vor.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + 
continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 11: { // vxor.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 12: { // vrgather.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_gather(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, false, warp.vlmax, vmask); + } + } break; + case 14: { // vrgatherei16.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_gather(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, true, warp.vlmax, vmask); + } + } break; + case 16: { // vadc.vvm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_carry(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl); + } + } break; + case 17: { // vmadc.vv, vmadc.vvm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_carry_out(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 18: { // vsbc.vvm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_carry(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl); + } + } break; + case 19: { // vmsbc.vv, vmsbc.vvm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_carry_out(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 23: { + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + if (vmask) { // vmv.v.v + if (rsrc1 != 0) { + std::cout << "For vmv.v.v vs2 must contain v0." 
<< std::endl; + std::abort(); + } + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } else { // vmerge.vvm + vector_op_vv_merge(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } + } break; + case 24: { // vmseq.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 25: { // vmsne.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 26: { // vmsltu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 27: { // vmslt.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 28: { // vmsleu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 29: { // vmsle.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 30: { // vmsgtu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 31: { // vmsgt.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + 
case 32: { // vsaddu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 33: { // vsadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 34: { // vssubu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 35: { // vssub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 37: { // vsll.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 39: { // vsmul.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 40: { // vsrl.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if 
(!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 41: { // vsra.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 42: { // vssrl.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_scale(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 43: { // vssra.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_scale(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 44: { // vnsrl.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_n(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + } + } break; + case 45: { // vnsra.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_n(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + } + } break; + case 46: { // vnclipu.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vv_n(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + 
this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 47: { // vnclip.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vv_n(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 48: { // vwredsumu.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 49: { // vwredsum.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + default: + std::cout << "Unrecognised vector - vector instruction func3: " << func3 << " func6: " << func6 << std::endl; + std::abort(); + } + } break; + case 1: { // float vector - vector + switch (func6) { + case 0: { // vfadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 2: { // vfsub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 1: // vfredusum.vs - treated the same as vfredosum.vs + case 3: { // vfredosum.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 4: { // vfmin.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + 
} break; + case 5: { // vfredmin.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 6: { // vfmax.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 7: { // vfredmax.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 8: { // vfsgnj.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 9: { // vfsgnjn.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 10: { // vfsgnjx.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 16: { // vfmv.f.s + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &dest = rddata[t].u64; + vector_op_scalar(dest, warp.vreg_file, rsrc0, rsrc1, warp.vtype.vsew); + DP(4, "Moved " << +dest << " from: " << +rsrc1 << " to: " << +rdest); + } + } break; + case 18: { + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + switch (rsrc0 >> 3) { + case 0b00: // vfcvt.xu.f.v, vfcvt.x.f.v, vfcvt.f.xu.v, vfcvt.f.x.v, vfcvt.rtz.xu.f.v, vfcvt.rtz.x.f.v + vector_op_vix(rsrc0, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + break; + case 0b01: // vfwcvt.xu.f.v, vfwcvt.x.f.v, vfwcvt.f.xu.v, vfwcvt.f.x.v, 
vfwcvt.f.f.v, vfwcvt.rtz.xu.f.v, vfwcvt.rtz.x.f.v + vector_op_vix_w(rsrc0, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + break; + case 0b10: { // vfncvt.xu.f.w, vfncvt.x.f.w, vfncvt.f.xu.w, vfncvt.f.x.w, vfncvt.f.f.w, vfncvt.rod.f.f.w, vfncvt.rtz.xu.f.w, vfncvt.rtz.x.f.w + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation argument is unused + vector_op_vix_n(rsrc0, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + break; + } + default: + std::cout << "Fcvt unsupported value for rsrc0: " << rsrc0 << std::endl; + std::abort(); + } + } + } break; + case 19: { // vfsqrt.v, vfrsqrt7.v, vfrec7.v, vfclass.v + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(rsrc0, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 24: { // vmfeq.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 25: { // vmfle.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 27: { // vmflt.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 28: { // vmfne.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 32: { // vfdiv.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 36: { // vfmul.vv + for 
(uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 40: { // vfmadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 41: { // vfnmadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 42: { // vfmsub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 43: { // vfnmsub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 44: { // vfmacc.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 45: { // vfnmacc.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 46: { // vfmsac.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 47: { // vfnmsac.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 48: { // vfwadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + 
continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 51: // vfwredosum.vs - treated the same as vfwredosum.vs + case 49: { // vfwredusum.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red_wf(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 50: { // vfwsub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 52: { // vfwadd.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_wfv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 54: { // vfwsub.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_wfv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 56: { // vfwmul.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 60: { // vfwmacc.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 61: { // vfwnmacc.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 62: { // vfwmsac.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 63: { // vfwnmsac.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if 
(!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + default: + std::cout << "Unrecognised float vector - vector instruction func3: " << func3 << " func6: " << func6 << std::endl; + std::abort(); + } + } break; + case 2: { // mask vector - vector + switch (func6) { + case 0: { // vredsum.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 1: { // vredand.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 2: { // vredor.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 3: { // vredxor.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 4: { // vredminu.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 5: { // vredmin.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 6: { // vredmaxu.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 7: { // vredmax.vs + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + 
vector_op_vv_red(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 8: { // vaaddu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 9: { // vaadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 10: { // vasubu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 11: { // vasub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vv_sat(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 16: { // vmv.x.s + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &dest = rddata[t].i; + vector_op_scalar(dest, warp.vreg_file, rsrc0, rsrc1, warp.vtype.vsew); + DP(4, "Moved " << +dest << " from: " << +rsrc1 << " to: " << +rdest); + } + } break; + case 18: { // vzext.vf8, vsext.vf8, vzext.vf4, vsext.vf4, vzext.vf2, vsext.vf2 + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + bool 
negativeLmul = warp.vtype.vlmul >> 2; + uint32_t illegalLmul = negativeLmul && !((8 >> (0x8 - warp.vtype.vlmul)) >> (0x4 - (rsrc0 >> 1))); + if (illegalLmul) { + std::cout << "Lmul*vf<1/8 is not supported by vzext and vsext." << std::endl; + std::abort(); + } + vector_op_vix_ext(rsrc0, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 20: { // vid.v + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vid(warp.vreg_file, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 23: { // vcompress.vm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_compress(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl); + } + } break; + case 24: { // vmandn.mm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 25: { // vmand.mm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 26: { // vmor.mm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 27: { // vmxor.mm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 28: { // vmorn.mm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 29: { // vmnand.mm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 30: { // vmnor.mm + for (uint32_t t = 0; t < 
num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 31: { // vmxnor.mm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_mask(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vl); + } + } break; + case 32: { // vdivu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 33: { // vdiv.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 34: { // vremu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 35: { // vrem.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 36: { // vmulhu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 37: { // vmul.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 38: { // vmulhsu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 39: { // vmulh.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, 
warp.vl, vmask); + } + } break; + case 41: { // vmadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 43: { // vnmsub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 45: { // vmacc.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 47: { // vnmsac.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 48: { // vwaddu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 49: { // vwadd.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 50: { // vwsubu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 51: { // vwsub.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 52: { // vwaddu.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_wv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 53: { // vwadd.wv + for (uint32_t t 
= 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_wv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 54: { // vwsubu.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_wv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 55: { // vwsub.wv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_wv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 56: { // vwmulu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 58: { // vwmulsu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 59: { // vwmul.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 60: { // vwmaccu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 61: { // vwmacc.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 63: { // vwmaccsu.vv + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vv_w(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + default: + std::cout << "Unrecognised mask vector - vector instruction func3: " << func3 << " func6: 
" << func6 << std::endl; + std::abort(); + } + } break; + case 3: { // vector - immidiate + switch (func6) { + case 0: { // vadd.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 3: { // vrsub.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 9: { // vand.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 10: { // vor.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 11: { // vxor.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 12: { // vrgather.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_gather(uimmsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, warp.vlmax, vmask); + } + } break; + case 14: { // vslideup.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_slide(uimmsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, 0, vmask, false); + } + } break; + case 15: { // vslidedown.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_slide(uimmsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, warp.vlmax, vmask, false); + } + } break; + case 16: { // vadc.vim + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + 
vector_op_vix_carry(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl); + } + } break; + case 17: { // vmadc.vi, vmadc.vim + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_carry_out(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 23: { // vmv.v.i + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + if (vmask) { // vmv.v.i + if (rsrc0 != 0) { + std::cout << "For vmv.v.i vs2 must contain v0." << std::endl; + std::abort(); + } + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } else { // vmerge.vim + vector_op_vix_merge(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } + } break; + case 24: { // vmseq.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 25: { // vmsne.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 26: { // vmsltu.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 27: { // vmslt.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 28: { // vmsleu.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 29: { // vmsle.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + 
continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 30: { // vmsgtu.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 31: { // vmsgt.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix_mask(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 32: { // vsaddu.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_sat(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 33: { // vsadd.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_sat(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 37: { // vsll.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 39: { // vmv1r.v, vmv2r.v, vmv4r.v, vmv8r.v + for (uint32_t t = 0; t < num_threads; ++t) { + uint32_t nreg = (immsrc & 0b111) + 1; + if (nreg != 1 && nreg != 2 && nreg != 4 && nreg != 8) { + std::cout << "Reserved value for nreg: " << nreg << std::endl; + std::abort(); + } + if (!warp.tmask.test(t)) + continue; + vector_op_vv(warp.vreg_file, rsrc0, rsrc1, rdest, warp.vtype.vsew, nreg * VLEN / warp.vtype.vsew, vmask); + } + } break; + case 40: { // vsrl.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + 
vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 41: { // vsra.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + vector_op_vix(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 42: { // vssrl.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_scale(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 43: { // vssra.vi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_scale(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 44: { // vnsrl.wi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_n(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + } + } break; + case 45: { // vnsra.wi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_n(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + } + } break; + case 46: { // vnclipu.wi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_n(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } 
+ } break; + case 47: { // vnclip.wi + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_n(immsrc, warp.vreg_file, rsrc0, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + default: + std::cout << "Unrecognised vector - immidiate instruction func3: " << func3 << " func6: " << func6 << std::endl; + std::abort(); + } + } break; + case 4: { + switch (func6) { + case 0: { // vadd.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 2: { // vsub.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 3: { // vrsub.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 4: { // vminu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 5: { // vmin.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 6: { // vmaxu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = 
warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 7: { // vmax.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 9: { // vand.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 10: { // vor.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 11: { // vxor.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 12: { // vrgather.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_gather(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, warp.vlmax, vmask); + } + } break; + case 14: { // vslideup.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_slide(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, 0, vmask, false); + } + } break; + case 15: { // vslidedown.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_slide(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, warp.vlmax, vmask, false); + } 
+ } break; + case 16: { // vadc.vxm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_carry(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl); + } + } break; + case 17: { // vmadc.vx, vmadc.vxm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_carry_out(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 18: { // vsbc.vxm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_carry(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl); + } + } break; + case 19: { // vmsbc.vx, vmsbc.vxm + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_carry_out(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 23: { + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + if (vmask) { // vmv.v.x + if (rsrc1 != 0) { + std::cout << "For vmv.v.x vs2 must contain v0." 
<< std::endl; + std::abort(); + } + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } else { // vmerge.vxm + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_merge(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } + } break; + case 24: { // vmseq.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 25: { // vmsne.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 26: { // vmsltu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 27: { // vmslt.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 28: { // vmsleu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 29: { // vmsle.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 30: { // vmsgtu.vx + for (uint32_t t = 0; t < 
num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 31: { // vmsgt.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 32: { // vsaddu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 33: { // vsadd.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 34: { // vssubu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 35: { // vssub.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 37: { // 
vsll.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 39: { // vsmul.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 40: { // vsrl.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 41: { // vsra.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 42: { // vssrl.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_scale(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 43: { // vssra.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_scale(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + 
case 44: { // vnsrl.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_n(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + } + } break; + case 45: { // vnsra.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_n(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, 2, vxsat); + } + } break; + case 46: { // vnclipu.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_n(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + case 47: { // vnclip.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = this->get_csr(VX_CSR_VXSAT, t, wid); + vector_op_vix_n(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + this->set_csr(VX_CSR_VXSAT, vxsat, t, wid); + } + } break; + default: + std::cout << "Unrecognised vector - scalar instruction func3: " << func3 << " func6: " << func6 << std::endl; + std::abort(); + } + } break; + case 5: { // float vector - scalar + switch (func6) { + case 0: { // vfadd.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + 
} + } break; + case 2: { // vfsub.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 4: { // vfmin.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 6: { // vfmax.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 8: { // vfsgnj.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 9: { // vfsgnjn.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 10: { // vfsgnjx.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 14: { // vfslide1up.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_slide(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, 0, vmask, true); + } + } break; + case 15: { // vfslide1down.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = 
warp.freg_file.at(t).at(rsrc0); + vector_op_vix_slide(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, warp.vlmax, vmask, true); + } + } break; + case 16: { // vfmv.s.f + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + if (rsrc1 != 0) { + std::cout << "For vfmv.s.f vs2 must contain v0." << std::endl; + std::abort(); + } + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, std::min(warp.vl, (uint32_t)1), vmask); + } + } break; + case 24: { // vmfeq.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 23: { + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + if (vmask) { // vfmv.v.f + if (rsrc1 != 0) { + std::cout << "For vfmv.v.f vs2 must contain v0." 
<< std::endl; + std::abort(); + } + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } else { // vfmerge.vfm + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_merge(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } + } break; + case 25: { // vmfle.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 27: { // vmflt.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 28: { // vmfne.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 29: { // vmfgt.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 31: { // vmfge.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_mask(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 32: { // vfdiv.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 33: { // vfrdiv.vf + for (uint32_t t = 0; t < num_threads; 
++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 36: { // vfmul.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 39: { // vfrsub.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 40: { // vfmadd.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 41: { // vfnmadd.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 42: { // vfmsub.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 43: { // vfnmsub.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 44: { // vfmacc.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); 
+ } + } break; + case 45: { // vfnmacc.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 46: { // vfmsac.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 47: { // vfnmsac.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 48: { // vfwadd.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 50: { // vfwsub.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 52: { // vfwadd.wf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + uint64_t src1_d = rv_ftod(src1); + vector_op_vix_wx(src1_d, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 54: { // vfwsub.wf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + uint64_t src1_d = rv_ftod(src1); + vector_op_vix_wx(src1_d, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 56: { // vfwmul.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if 
(!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 60: { // vfwmacc.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 61: { // vfwnmacc.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 62: { // vfwmsac.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 63: { // vfwnmsac.vf + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.freg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + default: + std::cout << "Unrecognised float vector - scalar instruction func3: " << func3 << " func6: " << func6 << std::endl; + std::abort(); + } + } break; + case 6: { + switch (func6) { + case 8: { // vaaddu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 9: { // vaadd.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t 
vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 10: { // vasubu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 11: { // vasub.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + uint32_t vxrm = this->get_csr(VX_CSR_VXRM, t, wid); + uint32_t vxsat = 0; // saturation is not relevant for this operation + vector_op_vix_sat(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask, vxrm, vxsat); + } + } break; + case 14: { // vslide1up.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_slide(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, 0, vmask, true); + } + } break; + case 15: { // vslide1down.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_slide(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, warp.vlmax, vmask, true); + } + } break; + case 16: { // vmv.s.x + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + if (rsrc1 != 0) { + std::cout << "For vmv.s.x vs2 must contain v0." 
<< std::endl; + std::abort(); + } + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, std::min(warp.vl, (uint32_t)1), vmask); + } + } break; + case 32: { // vdivu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 33: { // vdiv.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 34: { // vremu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 35: { // vrem.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 36: { // vmulhu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 37: { // vmul.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 38: { // vmulhsu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, 
vmask); + } + } break; + case 39: { // vmulh.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 41: { // vmadd.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 43: { // vnmsub.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 45: { // vmacc.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 47: { // vnmsac.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 48: { // vwaddu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 49: { // vwadd.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 50: { // vwsubu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = 
warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 51: { // vwsub.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 52: { // vwaddu.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_wx(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 53: { // vwadd.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + Word src1_ext = sext(src1, warp.vtype.vsew); + vector_op_vix_wx(src1_ext, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 54: { // vwsubu.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_wx(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 55: { // vwsub.wx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + Word &src1 = warp.ireg_file.at(t).at(rsrc0); + Word src1_ext = sext(src1, warp.vtype.vsew); + vector_op_vix_wx(src1_ext, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 56: { // vwmulu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 58: { // vwmulsu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, 
warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 59: { // vwmul.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 60: { // vwmaccu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 61: { // vwmacc.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 62: { // vwmaccus.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + case 63: { // vwmaccsu.vx + for (uint32_t t = 0; t < num_threads; ++t) { + if (!warp.tmask.test(t)) + continue; + auto &src1 = warp.ireg_file.at(t).at(rsrc0); + vector_op_vix_w(src1, warp.vreg_file, rsrc1, rdest, warp.vtype.vsew, warp.vl, vmask); + } + } break; + default: + std::cout << "Unrecognised vector - scalar instruction func3: " << func3 << " func6: " << func6 << std::endl; + std::abort(); + } + } break; + case 7: { + uint32_t vma = instr.getVma(); + uint32_t vta = instr.getVta(); + uint32_t vsew = instr.getVsew(); + uint32_t vlmul = instr.getVlmul(); + + if (!instr.hasZimm()) { // vsetvl + uint32_t zimm = rsdata[0][1].u; + vlmul = zimm & mask_v_lmul; + vsew = (zimm >> shift_v_sew) & mask_v_sew; + vta = (zimm >> shift_v_ta) & mask_v_ta; + vma = (zimm >> shift_v_ma) & mask_v_ma; + } + + bool negativeLmul = vlmul >> 2; + uint32_t vlenDividedByLmul = 
VLEN >> (0x8 - vlmul); + uint32_t vlenMultipliedByLmul = VLEN << vlmul; + uint32_t vlenTimesLmul = negativeLmul ? vlenDividedByLmul : vlenMultipliedByLmul; + uint32_t vsew_bits = 1 << (3 + vsew); + warp.vlmax = vlenTimesLmul / vsew_bits; + warp.vtype.vill = (vsew_bits > XLEN) || (warp.vlmax < (VLEN / XLEN)); + + Word s0 = instr.getImm(); // vsetivli + if (!instr.hasImm()) { // vsetvli/vsetvl + s0 = rsdata[0][0].u; + } + + DP(4, "Vset(i)vl(i) - vill: " << +warp.vtype.vill << " vma: " << vma << " vta: " << vta << " lmul: " << vlmul << " sew: " << vsew << " s0: " << s0 << " vlmax: " << warp.vlmax); + warp.vl = std::min(s0, warp.vlmax); + + if (warp.vtype.vill) { + this->set_csr(VX_CSR_VTYPE, (Word)1 << (XLEN - 1), 0, wid); + warp.vtype.vma = 0; + warp.vtype.vta = 0; + warp.vtype.vsew = 0; + warp.vtype.vlmul = 0; + this->set_csr(VX_CSR_VL, 0, 0, wid); + rddata[0].i = warp.vl; + } else { + warp.vtype.vma = vma; + warp.vtype.vta = vta; + warp.vtype.vsew = vsew_bits; + warp.vtype.vlmul = vlmul; + Word vtype_ = vlmul; + vtype_ |= vsew << shift_v_sew; + vtype_ |= vta << shift_v_ta; + vtype_ |= vma << shift_v_ma; + this->set_csr(VX_CSR_VTYPE, vtype_, 0, wid); + this->set_csr(VX_CSR_VL, warp.vl, 0, wid); + rddata[0].i = warp.vl; + } + } + this->set_csr(VX_CSR_VSTART, 0, 0, wid); + break; + default: + std::cout << "Unrecognised vector instruction func3: " << func3 << " func6: " << func6 << std::endl; + std::abort(); + } +} +#endif \ No newline at end of file diff --git a/sim/simx/vpu.h b/sim/simx/vpu.h new file mode 100644 index 000000000..9ea9ec389 --- /dev/null +++ b/sim/simx/vpu.h @@ -0,0 +1,2393 @@ +#ifdef EXT_V_ENABLE +#pragma once + +using namespace vortex; + +template +class Add { +public: + static R apply(T first, T second, R) { + return (R)first + (R)second; + } + static std::string name() { return "Add"; } +}; + +template +class Sub { +public: + static R apply(T first, T second, R) { + return (R)second - (R)first; + } + static std::string name() { return "Sub"; } +}; 
+ +template +class Adc { +public: + static R apply(T first, T second, R third) { + return (R)first + (R)second + third; + } + static std::string name() { return "Adc"; } +}; + +template +class Madc { +public: + static R apply(T first, T second, R third) { + return ((R)first + (R)second + third) > (R)std::numeric_limits::max(); + } + static std::string name() { return "Madc"; } +}; + +template +class Sbc { +public: + static R apply(T first, T second, R third) { + return (R)second - (R)first - third; + } + static std::string name() { return "Sbc"; } +}; + +template +class Msbc { +public: + static R apply(T first, T second, R third) { + return (R)second < ((R)first + third); + } + static std::string name() { return "Msbc"; } +}; + +template +class Ssub { +public: + static R apply(T first, T second, uint32_t, uint32_t &vxsat_) { + // rounding mode is not relevant for this operation + T unclippedResult = second - first; + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() { return "Ssub"; } +}; + +template +class Ssubu { +public: + static R apply(T first, T second, uint32_t, uint32_t &vxsat_) { + // rounding mode is not relevant for this operation + if (first > second) { + vxsat_ = true; + return 0; + } else { + vxsat_ = false; + return second - first; + } + } + static std::string name() { return "Ssubu"; } +}; + +template +class Sadd { +public: + static R apply(T first, T second, uint32_t, uint32_t &vxsat_) { + // rounding mode is not relevant for this operation + T unclippedResult = second + first; + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() { return "Sadd"; } +}; + +template +class Rsub { +public: + static R apply(T first, T second, R) { + 
return first - second; + } + static std::string name() { return "Rsub"; } +}; + +template +class Div { +public: + static R apply(T first, T second, R) { + // logic taken from scalar div + if (first == 0) { + return -1; + } else if (second == std::numeric_limits::min() && first == T(-1)) { + return second; + } else { + return (R)second / (R)first; + } + } + static std::string name() { return "Div"; } +}; + +template +class Rem { +public: + static R apply(T first, T second, R) { + // logic taken from scalar rem + if (first == 0) { + return second; + } else if (second == std::numeric_limits::min() && first == T(-1)) { + return 0; + } else { + return (R)second % (R)first; + } + } + static std::string name() { return "Rem"; } +}; + +template +class Mul { +public: + static R apply(T first, T second, R) { + return (R)first * (R)second; + } + static std::string name() { return "Mul"; } +}; + +template +class Mulsu { +public: + static R apply(T first, T second, R) { + R first_ext = zext((R)first, (sizeof(T) * 8)); + return first_ext * (R)second; + } + static std::string name() { return "Mulsu"; } +}; + +template +class Mulh { +public: + static R apply(T first, T second, R) { + __int128_t first_ext = sext((__int128_t)first, (sizeof(T) * 8)); + __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8)); + return (first_ext * second_ext) >> (sizeof(T) * 8); + } + static std::string name() { return "Mulh"; } +}; + +template +class Mulhsu { +public: + static R apply(T first, T second, R) { + __int128_t first_ext = zext((__int128_t)first, (sizeof(T) * 8)); + __int128_t second_ext = sext((__int128_t)second, (sizeof(T) * 8)); + return (first_ext * second_ext) >> (sizeof(T) * 8); + } + static std::string name() { return "Mulhsu"; } +}; + +template +class Mulhu { +public: + static R apply(T first, T second, R) { + return ((__uint128_t)first * (__uint128_t)second) >> (sizeof(T) * 8); + } + static std::string name() { return "Mulhu"; } +}; + +template +class Madd { +public: + 
static R apply(T first, T second, R third) { + return ((R)first * third) + (R)second; + } + static std::string name() { return "Madd"; } +}; + +template +class Nmsac { +public: + static R apply(T first, T second, R third) { + return -((R)first * (R)second) + third; + } + static std::string name() { return "Nmsac"; } +}; + +template +class Macc { +public: + static R apply(T first, T second, R third) { + return ((R)first * (R)second) + third; + } + static std::string name() { return "Macc"; } +}; + +template +class Maccsu { +public: + static R apply(T first, T second, R third) { + R first_ext = sext((R)first, (sizeof(T) * 8)); + R second_ext = zext((R)second, (sizeof(T) * 8)); + return (first_ext * second_ext) + third; + } + static std::string name() { return "Maccsu"; } +}; + +template +class Maccus { +public: + static R apply(T first, T second, R third) { + R first_ext = zext((R)first, (sizeof(T) * 8)); + R second_ext = sext((R)second, (sizeof(T) * 8)); + return (first_ext * second_ext) + third; + } + static std::string name() { return "Maccus"; } +}; + +template +class Nmsub { +public: + static R apply(T first, T second, R third) { + return -((R)first * third) + (R)second; + } + static std::string name() { return "Nmsub"; } +}; + +template +class Min { +public: + static R apply(T first, T second, R) { + return std::min(first, second); + } + static std::string name() { return "Min"; } +}; + +template +class Max { +public: + static R apply(T first, T second, R) { + return std::max(first, second); + } + static std::string name() { return "Max"; } +}; + +template +class And { +public: + static R apply(T first, T second, R) { + return first & second; + } + static std::string name() { return "And"; } +}; + +template +class Or { +public: + static R apply(T first, T second, R) { + return first | second; + } + static std::string name() { return "Or"; } +}; + +template +class Xor { +public: + static R apply(T first, T second, R) { + return first ^ second; + } + static 
std::string name() { return "Xor"; } +}; + +template +class Sll { +public: + static R apply(T first, T second, R) { + // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount. + return second << (first & (sizeof(T) * 8 - 1)); + } + static std::string name() { return "Sll"; } +}; + +template +bool bitAt(T value, R pos, R negOffset) { + R offsetPos = pos - negOffset; + return pos >= negOffset && ((value >> offsetPos) & 0x1); +} + +template +bool anyBitUpTo(T value, R to, R negOffset) { + R offsetTo = to - negOffset; + return to >= negOffset && (value & (((R)1 << (offsetTo + 1)) - 1)); +} + +template +bool roundBit(T value, R shiftDown, uint32_t vxrm) { + switch (vxrm) { + case 0: // round-to-nearest-up + return bitAt(value, shiftDown, (R)1); + case 1: // round-to-nearest-even + return bitAt(value, shiftDown, (R)1) && (anyBitUpTo(value, shiftDown, (R)2) || bitAt(value, shiftDown, (R)0)); + case 2: // round-down (truncate) + return 0; + case 3: // round-to-odd + return !bitAt(value, shiftDown, (R)0) && anyBitUpTo(value, shiftDown, (R)1); + default: + std::cout << "Roundoff - invalid value for vxrm: " << vxrm << std::endl; + std::abort(); + } +} + +template +class SrlSra { +public: + static R apply(T first, T second, R) { + // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount. + return second >> (first & (sizeof(T) * 8 - 1)); + } + static R apply(T first, T second, uint32_t vxrm, uint32_t) { + // Saturation is not relevant for this operation + // Only the low lg2(SEW) bits of the shift-amount value are used to control the shift amount. 
+ T firstValid = first & (sizeof(T) * 8 - 1); + return apply(firstValid, second, 0) + roundBit(second, firstValid, vxrm); + } + static std::string name() { return "SrlSra"; } +}; + +template +class Aadd { +public: + static R apply(T first, T second, uint32_t vxrm, uint32_t) { + // Saturation is not relevant for this operation + T sum = second + first; + return (sum >> 1) + roundBit(sum, 1, vxrm); + } + static std::string name() { return "Aadd"; } +}; + +template +class Asub { +public: + static R apply(T first, T second, uint32_t vxrm, uint32_t) { + // Saturation is not relevant for this operation + T difference = second - first; + return (difference >> 1) + roundBit(difference, 1, vxrm); + } + static std::string name() { return "Asub"; } +}; + +template +class Eq { +public: + static R apply(T first, T second, R) { + return first == second; + } + static std::string name() { return "Eq"; } +}; + +template +class Ne { +public: + static R apply(T first, T second, R) { + return first != second; + } + static std::string name() { return "Ne"; } +}; + +template +class Lt { +public: + static R apply(T first, T second, R) { + return first > second; + } + static std::string name() { return "Lt"; } +}; + +template +class Le { +public: + static R apply(T first, T second, R) { + return first >= second; + } + static std::string name() { return "Le"; } +}; + +template +class Gt { +public: + static R apply(T first, T second, R) { + return first < second; + } + static std::string name() { return "Gt"; } +}; + +template +class AndNot { +public: + static R apply(T first, T second, R) { + return second & ~first; + } + static std::string name() { return "AndNot"; } +}; + +template +class OrNot { +public: + static R apply(T first, T second, R) { + return second | ~first; + } + static std::string name() { return "OrNot"; } +}; + +template +class Nand { +public: + static R apply(T first, T second, R) { + return ~(second & first); + } + static std::string name() { return "Nand"; } +}; + 
+template +class Mv { +public: + static R apply(T first, T, R) { + return first; + } + static std::string name() { return "Mv"; } +}; + +template +class Nor { +public: + static R apply(T first, T second, R) { + return ~(second | first); + } + static std::string name() { return "Nor"; } +}; + +template +class Xnor { +public: + static R apply(T first, T second, R) { + return ~(second ^ first); + } + static std::string name() { return "Xnor"; } +}; + +template +class Fadd { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fadd_s(first, second, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fadd_d(first_d, second_d, frm, &fflags); + } else { + std::cout << "Fadd only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fadd"; } +}; + +template +class Fsub { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fsub_s(second, first, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? 
second : rv_ftod(second); + return rv_fsub_d(second_d, first_d, frm, &fflags); + } else { + std::cout << "Fsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fsub"; } +}; + +template +class Fmacc { +public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fmadd_s(first, second, third, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fmadd_d(first_d, second_d, third, frm, &fflags); + } else { + std::cout << "Fmacc only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fmacc"; } +}; + +template +class Fnmacc { +public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fnmadd_s(first, second, third, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fnmadd_d(first_d, second_d, third, frm, &fflags); + } else { + std::cout << "Fnmacc only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fnmacc"; } +}; + +template +class Fmsac { +public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? 
second : rv_ftod(second); + return rv_fmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags); + } else { + std::cout << "Fmsac only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fmsac"; } +}; + +template +class Fnmsac { +public: + static R apply(T first, T second, R third) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fnmadd_s(first, second, rv_fsgnjn_s(third, third), frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fnmadd_d(first_d, second_d, rv_fsgnjn_d(third, third), frm, &fflags); + } else { + std::cout << "Fnmsac only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fnmsac"; } +}; + +template +class Fmadd { +public: + static R apply(T first, T second, R third) { + if (sizeof(T) == 4 || sizeof(T) == 8) { + return Fmacc::apply(first, third, second); + } else { + std::cout << "Fmadd only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fmadd"; } +}; + +template +class Fnmadd { +public: + static R apply(T first, T second, R third) { + if (sizeof(T) == 4 || sizeof(T) == 8) { + return Fnmacc::apply(first, third, second); + } else { + std::cout << "Fnmadd only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fnmadd"; } +}; + +template +class Fmsub { +public: + static R apply(T first, T second, R third) { + if (sizeof(T) == 4 || sizeof(T) == 8) { + return Fmsac::apply(first, third, second); + } else { + std::cout << "Fmsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fmsub"; } +}; + +template +class Fnmsub { +public: + static R apply(T first, T second, R third) { + if 
(sizeof(T) == 4 || sizeof(T) == 8) { + return Fnmsac::apply(first, third, second); + } else { + std::cout << "Fnmsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fnmsub"; } +}; + +template +class Fmin { +public: + static R apply(T first, T second, R) { + // ignoring rounding modes for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fmin_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_fmin_d(first, second, &fflags); + } else { + std::cout << "Fmin only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fmin"; } +}; + +template +class Fmax { +public: + static R apply(T first, T second, R) { + // ignoring rounding modes for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fmax_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_fmax_d(first, second, &fflags); + } else { + std::cout << "Fmax only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fmax"; } +}; + +template +class Fsgnj { +public: + static R apply(T first, T second, R) { + if (sizeof(T) == 4) { + return rv_fsgnj_s(second, first); + } else if (sizeof(T) == 8) { + return rv_fsgnj_d(second, first); + } else { + std::cout << "Fsgnj only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fsgnj"; } +}; + +template +class Fsgnjn { +public: + static R apply(T first, T second, R) { + if (sizeof(T) == 4) { + return rv_fsgnjn_s(second, first); + } else if (sizeof(T) == 8) { + return rv_fsgnjn_d(second, first); + } else { + std::cout << "Fsgnjn only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fsgnjn"; } +}; + +template +class Fsgnjx { +public: + static R apply(T first, T second, R) { + if (sizeof(T) == 4) { + return rv_fsgnjx_s(second, first); + } else if (sizeof(T) == 8) { + return 
rv_fsgnjx_d(second, first); + } else { + std::cout << "Fsgnjx only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fsgnjx"; } +}; + +template +class Fcvt { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + switch (first) { + case 0b00000: // vfcvt.xu.f.v + return rv_ftou_s(second, frm, &fflags); + case 0b00001: // vfcvt.x.f.v + return rv_ftoi_s(second, frm, &fflags); + case 0b00010: // vfcvt.f.xu.v + return rv_utof_s(second, frm, &fflags); + case 0b00011: // vfcvt.f.x.v + return rv_itof_s(second, frm, &fflags); + case 0b00110: // vfcvt.rtz.xu.f.v + return rv_ftou_s(second, 1, &fflags); + case 0b00111: // vfcvt.rtz.x.f.v + return rv_ftoi_s(second, 1, &fflags); + case 0b01000: // vfwcvt.xu.f.v + return rv_ftolu_s(second, frm, &fflags); + case 0b01001: // vfwcvt.x.f.v + return rv_ftol_s(second, frm, &fflags); + case 0b01010: // vfwcvt.f.xu.v + return rv_utof_d(second, frm, &fflags); + case 0b01011: // vfwcvt.f.x.v + return rv_itof_d(second, frm, &fflags); + case 0b01100: // vfwcvt.f.f.v + return rv_ftod(second); + case 0b01110: // vfwcvt.rtz.xu.f.v + return rv_ftolu_s(second, 1, &fflags); + case 0b01111: // vfwcvt.rtz.x.f.v + return rv_ftol_s(second, 1, &fflags); + default: + std::cout << "Fcvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else if (sizeof(T) == 8) { + switch (first) { + case 0b00000: // vfcvt.xu.f.v + return rv_ftolu_d(second, frm, &fflags); + case 0b00001: // vfcvt.x.f.v + return rv_ftol_d(second, frm, &fflags); + case 0b00010: // vfcvt.f.xu.v + return rv_lutof_d(second, frm, &fflags); + case 0b00011: // vfcvt.f.x.v + return rv_ltof_d(second, frm, &fflags); + case 0b00110: // vfcvt.rtz.xu.f.v + return rv_ftolu_d(second, 1, &fflags); + case 0b00111: // vfcvt.rtz.x.f.v + return rv_ftol_d(second, 1, &fflags); + case 0b01000: // 
vfwcvt.xu.f.v + case 0b01001: // vfwcvt.x.f.v + case 0b01010: // vfwcvt.f.xu.v + case 0b01011: // vfwcvt.f.x.v + case 0b01100: // vfwcvt.f.f.v + case 0b01110: // vfwcvt.rtz.xu.f.v + case 0b01111: // vfwcvt.rtz.x.f.v + std::cout << "Fwcvt only supports f32" << std::endl; + std::abort(); + default: + std::cout << "Fcvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Fcvt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static R apply(T first, T second, uint32_t vxrm, uint32_t &) { // saturation argument is unused + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 8) { + switch (first) { + case 0b10000: // vfncvt.xu.f.w + return rv_ftou_d(second, vxrm, &fflags); + case 0b10001: // vfncvt.x.f.w + return rv_ftoi_d(second, vxrm, &fflags); + case 0b10010: // vfncvt.f.xu.w + return rv_lutof_s(second, vxrm, &fflags); + case 0b10011: // vfncvt.f.x.w + return rv_ltof_s(second, vxrm, &fflags); + case 0b10100: // vfncvt.f.f.w + return rv_dtof_r(second, vxrm); + case 0b10101: // vfncvt.rod.f.f.w + return rv_dtof_r(second, 6); + case 0b10110: // vfncvt.rtz.xu.f.w + return rv_ftou_d(second, 1, &fflags); + case 0b10111: // vfncvt.rtz.x.f.w + return rv_ftoi_d(second, 1, &fflags); + default: + std::cout << "Fncvt has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Fncvt only supports f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fcvt"; } +}; + +template +class Funary1 { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + switch (first) { + case 0b00000: // vfsqrt.v + return rv_fsqrt_s(second, frm, &fflags); + case 0b00100: // vfrsqrt7.v + return rv_frsqrt7_s(second, frm, &fflags); + case 0b00101: // vfrec7.v + return rv_frecip7_s(second, frm, &fflags); + case 0b10000: // 
vfclass.v + return rv_fclss_s(second); + default: + std::cout << "Funary1 has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else if (sizeof(T) == 8) { + switch (first) { + case 0b00000: // vfsqrt.v + return rv_fsqrt_d(second, frm, &fflags); + case 0b00100: // vfrsqrt7.v + return rv_frsqrt7_d(second, frm, &fflags); + case 0b00101: // vfrec7.v + return rv_frecip7_d(second, frm, &fflags); + case 0b10000: // vfclass.v + return rv_fclss_d(second); + default: + std::cout << "Funary1 has unsupported value for first: " << first << std::endl; + std::abort(); + } + } else { + std::cout << "Funary1 only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Funary1"; } +}; + +template +class Xunary0 { +public: + static R apply(T, T second, T) { + return second; + } + static std::string name() { return "Xunary0"; } +}; + +template +class Feq { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_feq_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_feq_d(second, first, &fflags); + } else { + std::cout << "Feq only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Feq"; } +}; + +template +class Fle { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fle_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_fle_d(second, first, &fflags); + } else { + std::cout << "Fle only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fle"; } +}; + +template +class Flt { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_flt_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return rv_flt_d(second, first, &fflags); + } else { + 
std::cout << "Flt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Flt"; } +}; + +template +class Fne { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return !rv_feq_s(second, first, &fflags); + } else if (sizeof(T) == 8) { + return !rv_feq_d(second, first, &fflags); + } else { + std::cout << "Fne only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fne"; } +}; + +template +class Fgt { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_flt_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_flt_d(first, second, &fflags); + } else { + std::cout << "Fgt only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fgt"; } +}; + +template +class Fge { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + if (sizeof(T) == 4) { + return rv_fle_s(first, second, &fflags); + } else if (sizeof(T) == 8) { + return rv_fle_d(first, second, &fflags); + } else { + std::cout << "Fge only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fge"; } +}; + +template +class Fdiv { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fdiv_s(second, first, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fdiv_d(second, first, frm, &fflags); + } else { + std::cout << "Fdiv only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fdiv"; } +}; + +template +class Frdiv { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring 
rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fdiv_s(first, second, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fdiv_d(first, second, frm, &fflags); + } else { + std::cout << "Frdiv only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Frdiv"; } +}; + +template +class Fmul { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(R) == 4) { + return rv_fmul_s(first, second, frm, &fflags); + } else if (sizeof(R) == 8) { + uint64_t first_d = sizeof(T) == 8 ? first : rv_ftod(first); + uint64_t second_d = sizeof(T) == 8 ? second : rv_ftod(second); + return rv_fmul_d(first_d, second_d, frm, &fflags); + } else { + std::cout << "Fmul only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Fmul"; } +}; + +template +class Frsub { +public: + static R apply(T first, T second, R) { + // ignoring flags for now + uint32_t fflags = 0; + // ignoring rounding mode for now + uint32_t frm = 0; + if (sizeof(T) == 4) { + return rv_fsub_s(first, second, frm, &fflags); + } else if (sizeof(T) == 8) { + return rv_fsub_d(first, second, frm, &fflags); + } else { + std::cout << "Frsub only supports f32 and f64" << std::endl; + std::abort(); + } + } + static std::string name() { return "Frsub"; } +}; + +template +class Clip { +public: + static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) { + // The low lg2(2*SEW) bits of the vector or scalar shift-amount value (e.g., the low 6 bits for a SEW=64-bit to + // SEW=32-bit narrowing operation) are used to control the right shift amount, which provides the scaling. 
+ R firstValid = first & (sizeof(T) * 8 - 1); + T unclippedResult = (second >> firstValid) + roundBit(second, firstValid, vxrm); + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() { return "Clip"; } +}; + +template +class Smul { +public: + static R apply(T first, T second, uint32_t vxrm, uint32_t &vxsat_) { + R shift = sizeof(R) * 8 - 1; + T unshiftedResult = first * second; + T unclippedResult = (unshiftedResult >> shift) + roundBit(unshiftedResult, shift, vxrm); + R clippedResult = std::clamp(unclippedResult, (T)std::numeric_limits::min(), (T)std::numeric_limits::max()); + vxsat_ |= clippedResult != unclippedResult; + return clippedResult; + } + static std::string name() { return "Smul"; } +}; + +/////////////////////////////////////////////////////////////////////////////// + +bool isMasked(std::vector> &vreg_file, uint32_t maskVreg, uint32_t byteI, bool vmask) { + auto &mask = vreg_file.at(maskVreg); + uint8_t emask = *(uint8_t *)(mask.data() + byteI / 8); + uint8_t value = (emask >> (byteI % 8)) & 0x1; + DP(4, "Masking enabled: " << +!vmask << " mask element: " << +value); + return !vmask && value == 0; +} + +template +uint32_t getVreg(uint32_t baseVreg, uint32_t byteI) { + uint32_t vsew = sizeof(DT) * 8; + return (baseVreg + (byteI / (VLEN / vsew))) % 32; +} + +template +DT &getVregData(std::vector &baseVregVec, uint32_t byteI) { + uint32_t vsew = sizeof(DT) * 8; + return *(DT *)(baseVregVec.data() + (byteI % (VLEN / vsew)) * vsew / 8); +} + +template +DT &getVregData(std::vector> &vreg_file, uint32_t baseVreg, uint32_t byteI) { + auto &vr1 = vreg_file.at(getVreg
(baseVreg, byteI)); + return getVregData
(vr1, byteI); +} + +template +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + if (nfields * emul > 8) { + std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl; + std::abort(); + } + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) + continue; + + uint32_t nfields_strided = strided ? nfields : 1; + Word mem_addr = (base_addr & 0xFFFFFFFC) + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT); + Word mem_data = 0; + emul_->dcache_read(&mem_data, mem_addr, vsew / 8); + DP(4, "Loading data " << mem_data << " from: " << mem_addr << " to vec reg: " << getVreg
(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + DT &result = getVregData
(vreg_file, rdest + (i % nfields) * emul, i / nfields); + DP(4, "Previous data: " << +result); + result = (DT)mem_data; + } +} + +void vector_op_vix_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rdest, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vix_load(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 16: + vector_op_vix_load(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 32: + vector_op_vix_load(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + case 64: + vector_op_vix_load(vreg_file, emul_, base_addr, rdest, vl, strided, stride, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VLE for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 
1 : 1 << (lmul & 0b11); + if (nfields * emul > 8) { + std::cout << "NFIELDS * EMUL = " << nfields * lmul << " but it should be <= 8" << std::endl; + std::abort(); + } + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) + continue; + + Word offset = 0; + switch (iSew) { + case 8: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 16: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 32: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 64: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + default: + std::cout << "Unsupported iSew: " << iSew << std::endl; + std::abort(); + } + + Word mem_addr = (base_addr & 0xFFFFFFFC) + offset + (i % nfields) * sizeof(DT); + Word mem_data = 0; + emul_->dcache_read(&mem_data, mem_addr, vsew / 8); + DP(4, "VLUX/VLOX - Loading data " << mem_data << " from: " << mem_addr << " with offset: " << std::dec << offset << " to vec reg: " << getVreg
(rdest + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + DT &result = getVregData
(vreg_file, rdest + (i % nfields) * emul, i / nfields); + DP(4, "Previous data: " << +result); + result = (DT)mem_data; + } +} + +void vector_op_vv_load(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rdest, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vv_load(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 16: + vector_op_vv_load(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 32: + vector_op_vv_load(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + case 64: + vector_op_vv_load(vreg_file, emul_, base_addr, rsrc1, rdest, iSew, vl, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VLUX/VLOX for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) + continue; + + uint32_t nfields_strided = strided ? nfields : 1; + Word mem_addr = base_addr + (i / nfields_strided) * stride + (i % nfields_strided) * sizeof(DT); + Word mem_data = getVregData
(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields); + DP(4, "Storing: " << std::hex << mem_data << " at: " << mem_addr << " from vec reg: " << getVreg
(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + emul_->dcache_write(&mem_data, mem_addr, vsew / 8); + } +} + +void vector_op_vix_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc3, uint32_t vsew, uint32_t vl, bool strided, WordI stride, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vix_store(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 16: + vector_op_vix_store(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 32: + vector_op_vix_store(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + case 64: + vector_op_vix_store(vreg_file, emul_, base_addr, rsrc3, vl, strided, stride, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VSE for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + uint32_t vsew = sizeof(DT) * 8; + uint32_t emul = lmul >> 2 ? 1 : 1 << (lmul & 0b11); + for (uint32_t i = 0; i < vl * nfields; i++) { + if (isMasked(vreg_file, 0, i / nfields, vmask)) + continue; + + Word offset = 0; + switch (iSew) { + case 8: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 16: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 32: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + case 64: + offset = getVregData(vreg_file, rsrc1, i / nfields); + break; + default: + std::cout << "Unsupported iSew: " << iSew << std::endl; + std::abort(); + } + + Word mem_addr = base_addr + offset + (i % nfields) * sizeof(DT); + Word mem_data = getVregData
(vreg_file, rsrc3 + (i % nfields) * emul, i / nfields); + DP(4, "VSUX/VSOX - Storing: " << std::hex << mem_data << " at: " << mem_addr << " with offset: " << std::dec << offset << " from vec reg: " << getVreg
(rsrc3 + (i % nfields) * emul, i / nfields) << " i: " << i / nfields); + emul_->dcache_write(&mem_data, mem_addr, vsew / 8); + } +} + +void vector_op_vv_store(std::vector> &vreg_file, vortex::Emulator *emul_, WordI base_addr, uint32_t rsrc1, uint32_t rsrc3, uint32_t vsew, uint32_t iSew, uint32_t vl, uint32_t nfields, uint32_t lmul, uint32_t vmask) { + switch (vsew) { + case 8: + vector_op_vv_store(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 16: + vector_op_vv_store(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 32: + vector_op_vv_store(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + case 64: + vector_op_vv_store(vreg_file, emul_, base_addr, rsrc1, rsrc3, iSew, vl, nfields, lmul, vmask); + break; + default: + std::cout << "Failed to execute VSUX/VSOX for vsew: " << vsew << std::endl; + std::abort(); + } +} + +template