Merge remote-tracking branch 'upstream/master' into develop-documentation

This commit is contained in:
Udit Subramanya 2024-09-20 08:26:08 -04:00
commit ff9d52c162
1687 changed files with 1955358 additions and 205650 deletions

View file

@ -1,3 +0,0 @@
# Path globs to ignore. The file name is not shown in this view; presumably
# this is the coverage-service configuration - confirm before reusing.
ignore:
- "./examples/*"
- "./tests/*"

272
.github/workflows/ci.yml vendored Normal file
View file

@ -0,0 +1,272 @@
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Workflow "CI": triggered by every push and every pull request.
name: CI
on: [push, pull_request]
# Job graph visible in this file: setup -> build_vm -> test_vm -> complete.
jobs:
# Job: populates the two caches (tools/ and third_party/) that the later jobs
# restore with the same keys.
setup:
runs-on: ubuntu-20.04
steps:
# The checkout in this job fetches submodules recursively.
- name: Checkout code
uses: actions/checkout@v4
with:
submodules: recursive
# Toolchain cache. The key ends in a hand-maintained version tag (v0.1).
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v4
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
# Third-party cache, keyed the same way as the toolchain cache.
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v4
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
# Runs only when at least one of the two caches did not report a hit.
- name: Install Dependencies
if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
sudo bash ./ci/system_updates.sh
# Skipped on a toolchain cache hit. Configures a build/ directory that points
# at $PWD/tools, then runs ci/toolchain_install.sh relative to build/.
# NOTE(review): presumably configure generates build/ci/ - confirm.
- name: Setup Toolchain
if: steps.cache-toolchain.outputs.cache-hit != 'true'
run: |
TOOLDIR=$PWD/tools
mkdir -p build
cd build
../configure --tooldir=$TOOLDIR
ci/toolchain_install.sh --all
# Skipped on a third-party cache hit. Standard output of the build is discarded.
- name: Setup Third Party
if: steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
make -C third_party > /dev/null
# build:
# runs-on: ubuntu-20.04
# needs: setup
# strategy:
# matrix:
# xlen: [32, 64]
# steps:
# - name: Checkout code
# uses: actions/checkout@v2
# - name: Install Dependencies
# run: |
# sudo bash ./ci/system_updates.sh
# - name: Cache Toolchain Directory
# id: cache-toolchain
# uses: actions/cache@v2
# with:
# path: tools
# key: ${{ runner.os }}-toolchain-v0.1
# restore-keys: |
# ${{ runner.os }}-toolchain-
# - name: Cache Third Party Directory
# id: cache-thirdparty
# uses: actions/cache@v2
# with:
# path: third_party
# key: ${{ runner.os }}-thirdparty-v0.1
# restore-keys: |
# ${{ runner.os }}-thirdparty-
# - name: Run Build
# run: |
# TOOLDIR=$PWD/tools
# mkdir -p build${{ matrix.xlen }}
# cd build${{ matrix.xlen }}
# ../configure --tooldir=$TOOLDIR --xlen=${{ matrix.xlen }}
# source ci/toolchain_env.sh
# make software -s > /dev/null
# make tests -s > /dev/null
# - name: Upload Build Artifact
# uses: actions/upload-artifact@v2
# with:
# name: build-${{ matrix.xlen }}
# path: build${{ matrix.xlen }}
# tests:
# runs-on: ubuntu-20.04
# needs: build
# strategy:
# matrix:
# name: [regression, opencl, config1, config2, debug, stress]
# xlen: [32, 64]
# steps:
# - name: Checkout code
# uses: actions/checkout@v2
# - name: Install Dependencies
# run: |
# sudo bash ./ci/system_updates.sh
# - name: Cache Toolchain Directory
# id: cache-toolchain
# uses: actions/cache@v2
# with:
# path: tools
# key: ${{ runner.os }}-toolchain-v0.1
# restore-keys: |
# ${{ runner.os }}-toolchain-
# - name: Cache Third Party Directory
# id: cache-thirdparty
# uses: actions/cache@v2
# with:
# path: third_party
# key: ${{ runner.os }}-thirdparty-v0.1
# restore-keys: |
# ${{ runner.os }}-thirdparty-
# - name: Download Build Artifact
# uses: actions/download-artifact@v2
# with:
# name: build-${{ matrix.xlen }}
# path: build${{ matrix.xlen }}
# - name: Run tests
# run: |
# cd build${{ matrix.xlen }}
# source ci/toolchain_env.sh
# chmod -R +x . # Ensure all files have executable permissions
# if [ "${{ matrix.name }}" == "regression" ]; then
# ./ci/regression.sh --unittest
# ./ci/regression.sh --isa
# ./ci/regression.sh --kernel
# ./ci/regression.sh --synthesis
# ./ci/regression.sh --regression
# else
# ./ci/regression.sh --${{ matrix.name }}
# fi
# Job: for each XLEN, configures a build<xlen>-vm directory with --vm_enable=1,
# builds the "software" and "tests" targets, and uploads the directory as the
# artifact build-<xlen>-vm.
build_vm:
runs-on: ubuntu-20.04
needs: setup
strategy:
matrix:
xlen: [32, 64]
steps:
# Unlike the setup job, this checkout does not request submodules.
- name: Checkout code
uses: actions/checkout@v4
# System packages are installed unconditionally in this job.
- name: Install Dependencies
run: |
sudo bash ./ci/system_updates.sh
# Restores tools/ using the same key the setup job uses.
# NOTE(review): no step in this job rebuilds the toolchain on a cache miss.
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v4
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
# Restores third_party/ using the same key the setup job uses.
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v4
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
# Standard output of both make invocations is discarded.
- name: Run Build
run: |
TOOLDIR=$PWD/tools
mkdir -p build${{ matrix.xlen }}-vm
cd build${{ matrix.xlen }}-vm
../configure --tooldir=$TOOLDIR --xlen=${{ matrix.xlen }} --vm_enable=1
source ci/toolchain_env.sh
make software -s > /dev/null
make tests -s > /dev/null
# The artifact name must match the one test_vm downloads.
- name: Upload Build Artifact
uses: actions/upload-artifact@v4
with:
name: build-${{ matrix.xlen }}-vm
path: build${{ matrix.xlen }}-vm
# Job: runs the VM regression suite once per XLEN against the matching
# build_vm artifact.
# Only `xlen` is a matrix axis: no step reads any other matrix value and the
# suite invocation is always `./ci/regression.sh --vm`, so any additional axis
# would only repeat the identical run.
test_vm:
  runs-on: ubuntu-20.04
  needs: build_vm
  strategy:
    # Let the other XLEN finish even when one of them fails.
    fail-fast: false
    matrix:
      xlen: [32, 64]
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
    - name: Install Dependencies
      run: |
        sudo bash ./ci/system_updates.sh
    # Same cache keys as the setup job, so tools/ is restored from its cache.
    - name: Cache Toolchain Directory
      id: cache-toolchain
      uses: actions/cache@v4
      with:
        path: tools
        key: ${{ runner.os }}-toolchain-v0.1
        restore-keys: |
          ${{ runner.os }}-toolchain-
    - name: Cache Third Party Directory
      id: cache-thirdparty
      uses: actions/cache@v4
      with:
        path: third_party
        key: ${{ runner.os }}-thirdparty-v0.1
        restore-keys: |
          ${{ runner.os }}-thirdparty-
    # Artifact name and path must match what build_vm uploaded.
    - name: Download Build Artifact
      uses: actions/download-artifact@v4
      with:
        name: build-${{ matrix.xlen }}-vm
        path: build${{ matrix.xlen }}-vm
    - name: Run tests
      run: |
        cd build${{ matrix.xlen }}-vm
        source ci/toolchain_env.sh
        chmod -R +x . # Ensure all files have executable permissions
        ./ci/regression.sh --vm
# Job: single gate that summarises the test_vm matrix.
# `if: always()` makes this job run even when test_vm failed or was cancelled.
# Without it the job would be skipped in those cases, and a skipped job is
# reported as successful to required status checks.
complete:
  runs-on: ubuntu-20.04
  needs: test_vm
  if: always()
  steps:
    - name: Check Completion
      run: |
        # needs.test_vm.result is one of: success, failure, cancelled, skipped.
        if [ "${{ needs.test_vm.result }}" != "success" ]; then
          echo "test_vm finished with result: ${{ needs.test_vm.result }}"
          exit 1
        fi
        echo "All matrix jobs passed"

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
# Build directories at the repository root (the leading slash anchors the
# pattern there), e.g. build, build32, build64-vm.
/build*
# Editor settings directory at the repository root.
/.vscode
# Cache files anywhere in the tree (the CI scripts create blackbox.*.cache).
*.cache
# Editor workspace files anywhere in the tree.
*.code-workspace

2
.gitmodules vendored
View file

@ -6,4 +6,4 @@
url = https://github.com/ucb-bar/berkeley-softfloat-3.git url = https://github.com/ucb-bar/berkeley-softfloat-3.git
[submodule "third_party/ramulator"] [submodule "third_party/ramulator"]
path = third_party/ramulator path = third_party/ramulator
url = https://github.com/CMU-SAFARI/ramulator.git url = https://github.com/CMU-SAFARI/ramulator2.git

View file

@ -1,90 +0,0 @@
# Travis CI configuration: C++ project built with gcc on Ubuntu focal.
language: cpp
dist: focal
os: linux
compiler: gcc
addons:
apt:
packages:
- build-essential
- valgrind
- libstdc++6
env:
global:
- TOOLDIR=$HOME/tools
# Directories preserved between builds: the toolchain and both prepared trees.
cache:
directories:
- $TOOLDIR
- $HOME/build32
- $HOME/build64
# Install the toolchain only when $TOOLDIR is missing or empty, then load its
# environment into the job shell.
before_install:
- if [ ! -d "$TOOLDIR" ] || [ -z "$(ls -A $TOOLDIR)" ]; then
mkdir -p $TOOLDIR;
OSDIR=ubuntu/focal ./ci/toolchain_install.sh --all;
fi
- source ./ci/toolchain_env.sh
# Two stages: "setup" prepares the cached trees, "test" consumes them.
stages:
- setup
- test
jobs:
include:
# setup: copy the checkout to $HOME/build32 and $HOME/build64 and build each
# copy (the second with XLEN=64).
- stage: setup
script:
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
- make -C $HOME/build32
- XLEN=64 make -C $HOME/build64
# test: every job copies one cached tree to ./build and runs a single
# regression.sh suite through ci/travis_run.py; the *64 jobs use the build64
# tree with XLEN=64.
- stage: test
name: unittest
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
- stage: test
name: isa
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test
name: isa64
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test
name: regression
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test
name: regression64
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test
name: opencl
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl
- stage: test
name: cluster
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --cluster
- stage: test
name: config
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --config
- stage: test
name: debug
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --debug
- stage: test
name: stress0
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --stress0
- stage: test
name: stress1
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --stress1
- stage: test
name: synthesis
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --synthesis
- stage: test
name: synthesis64
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --synthesis
# Runs only after a successful job.
after_success:
# Gather code coverage
- lcov --directory runtime --capture --output-file runtime.cov # capture trace
- lcov --directory sim --capture --output-file sim.cov # capture trace
- lcov --list runtime.cov # output coverage data for debugging
- lcov --list sim.cov # output coverage data for debugging
# Upload coverage report
- bash <(curl -s https://codecov.io/bash) -f runtime.cov
- bash <(curl -s https://codecov.io/bash) -f sim.cov

View file

@ -1,28 +0,0 @@
# Build every component in order: third_party, hw, sim, kernel, runtime, tests.
all:
$(MAKE) -C third_party
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C kernel
$(MAKE) -C runtime
$(MAKE) -C tests
# Clean every component except third_party.
clean:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean
# Like "clean", but also cleans third_party and invokes the tests directory's
# own clean-all target instead of its clean target.
clean-all:
$(MAKE) -C third_party clean
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean-all
# Shortcut: clean the sim directory only.
crtlsim:
$(MAKE) -C sim clean
# Shortcut: build the sim directory only.
brtlsim:
$(MAKE) -C sim

82
Makefile.in Normal file
View file

@ -0,0 +1,82 @@
include config.mk
# None of these targets produce a file of the same name, so declare them all
# phony: a stray file or directory called e.g. "all", "install" or "tests"
# must not make the target look up to date.
.PHONY: vm all build software tests clean-build clean install
# Virtual-memory build: same sequence as "all", except that sim is built with
# its simx target and runtime with its vm target.
# NOTE(review): this is the first target in the file, so a bare `make` selects
# it instead of "all" unless config.mk defines an earlier target or sets
# .DEFAULT_GOAL - confirm this is intended.
vm:
$(MAKE) -C $(VORTEX_HOME)/third_party
$(MAKE) -C hw
$(MAKE) -C sim simx
$(MAKE) -C kernel
$(MAKE) -C runtime vm
$(MAKE) -C tests
# Full build: third_party from the source tree ($(VORTEX_HOME)), then hw, sim,
# kernel, runtime and tests relative to the directory make runs in.
all:
$(MAKE) -C $(VORTEX_HOME)/third_party
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C kernel
$(MAKE) -C runtime
$(MAKE) -C tests
# Same as "all" but without the third_party step.
build:
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C kernel
$(MAKE) -C runtime
$(MAKE) -C tests
# Software-only build: hw, kernel and the runtime stub; sim and tests are not
# built here.
software:
$(MAKE) -C hw
$(MAKE) -C kernel
$(MAKE) -C runtime/stub
# Build the tests directory only.
tests:
$(MAKE) -C tests
# Clean every component built by "build"; third_party is left untouched.
clean-build:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean
# Run clean-build first (prerequisite), then also clean third_party in the
# source tree.
clean: clean-build
$(MAKE) -C $(VORTEX_HOME)/third_party clean
# Install setup
# Destination layout under $(INSTALLDIR). Kernel libraries go to a per-XLEN
# directory (lib$(XLEN)); runtime libraries go to a single lib directory.
# NOTE(review): INSTALLDIR, XLEN and VORTEX_HOME are not defined in this file;
# presumably they come from config.mk - confirm.
KERNEL_INC_DST = $(INSTALLDIR)/kernel/include
KERNEL_LIB_DST = $(INSTALLDIR)/kernel/lib$(XLEN)
RUNTIME_INC_DST = $(INSTALLDIR)/runtime/include
RUNTIME_LIB_DST = $(INSTALLDIR)/runtime/lib
# Headers are taken from the source tree; libraries (*.a, *.so) from the
# kernel/ and runtime/ directories relative to where make runs.
KERNEL_HEADERS = $(wildcard $(VORTEX_HOME)/kernel/include/*.h)
KERNEL_LIBS = $(wildcard kernel/*.a)
RUNTIME_HEADERS = $(wildcard $(VORTEX_HOME)/runtime/include/*.h)
RUNTIME_LIBS = $(wildcard runtime/*.so)
INSTALL_DIRS = $(KERNEL_LIB_DST) $(RUNTIME_LIB_DST) $(KERNEL_INC_DST) $(RUNTIME_INC_DST)
# One rule creates whichever destination directory is requested.
$(INSTALL_DIRS):
mkdir -p $@
# Copy rules. The part after "|" is an order-only prerequisite: the directory
# must exist first, but its timestamp never forces a re-copy.
# VX_types.h is the one header copied from hw/ rather than kernel/include.
$(KERNEL_INC_DST)/VX_types.h: hw/VX_types.h | $(KERNEL_INC_DST)
cp $< $@
$(KERNEL_INC_DST)/%.h: $(VORTEX_HOME)/kernel/include/%.h | $(KERNEL_INC_DST)
cp $< $@
$(RUNTIME_INC_DST)/%.h: $(VORTEX_HOME)/runtime/include/%.h | $(RUNTIME_INC_DST)
cp $< $@
$(KERNEL_LIB_DST)/%.a: kernel/%.a | $(KERNEL_LIB_DST)
cp $< $@
$(RUNTIME_LIB_DST)/%.so: runtime/%.so | $(RUNTIME_LIB_DST)
cp $< $@
# install has no recipe; it only lists every destination file as a
# prerequisite. The wildcard lists are expanded when this rule is read, so
# libraries that do not exist yet at that point are not installed - run the
# build before `make install`.
install: $(INSTALL_DIRS) \
$(KERNEL_INC_DST)/VX_types.h \
$(KERNEL_HEADERS:$(VORTEX_HOME)/kernel/include/%=$(KERNEL_INC_DST)/%) \
$(RUNTIME_HEADERS:$(VORTEX_HOME)/runtime/include/%=$(RUNTIME_INC_DST)/%) \
$(KERNEL_LIBS:kernel/%=$(KERNEL_LIB_DST)/%) \
$(RUNTIME_LIBS:runtime/%=$(RUNTIME_LIB_DST)/%)

View file

@ -1,6 +1,3 @@
[![Build Status](https://travis-ci.com/vortexgpgpu/vortex.svg?branch=master)](https://travis-ci.com/vortexgpgpu/vortex)
[![codecov](https://codecov.io/gh/vortexgpgpu/vortex/branch/master/graph/badge.svg)](https://codecov.io/gh/vortexgpgpu/vortex)
# Vortex GPGPU # Vortex GPGPU
Vortex is a full-stack open-source RISC-V GPGPU. Vortex is a full-stack open-source RISC-V GPGPU.
@ -12,7 +9,7 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- configurable number of cores, warps, and threads. - configurable number of cores, warps, and threads.
- configurable number of ALU, FPU, LSU, and SFU units per core. - configurable number of ALU, FPU, LSU, and SFU units per core.
- configurable pipeline issue width. - configurable pipeline issue width.
- optional shared memory, L1, L2, and L3 caches. - optional local memory, L1, L2, and L3 caches.
- Software: - Software:
- OpenCL 1.2 Support. - OpenCL 1.2 Support.
- Supported FPGAs: - Supported FPGAs:
@ -33,8 +30,9 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- `miscs`: Miscellaneous resources. - `miscs`: Miscellaneous resources.
## Build Instructions ## Build Instructions
More detailed build instructions can be found [here](docs/install_vortex.md).
### Supported OS Platforms ### Supported OS Platforms
- Ubuntu 18.04 - Ubuntu 18.04, 20.04
- Centos 7 - Centos 7
### Toolchain Dependencies ### Toolchain Dependencies
- [POCL](http://portablecl.org/) - [POCL](http://portablecl.org/)
@ -47,18 +45,66 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- [Yosys](https://github.com/YosysHQ/yosys) - [Yosys](https://github.com/YosysHQ/yosys)
- [Sv2v](https://github.com/zachjs/sv2v) - [Sv2v](https://github.com/zachjs/sv2v)
### Install development tools ### Install development tools
$ sudo apt-get install build-essential ```sh
$ sudo apt-get install git sudo apt-get install build-essential
sudo apt-get install binutils
sudo apt-get install python
sudo apt-get install uuid-dev
sudo apt-get install git
```
### Install Vortex codebase ### Install Vortex codebase
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git ```sh
$ cd Vortex git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
```
### Configure your build folder
```sh
mkdir build
cd build
# for 32bit
../configure --xlen=32 --tooldir=$HOME/tools
# for 64bit
../configure --xlen=64 --tooldir=$HOME/tools
```
### Install prebuilt toolchain ### Install prebuilt toolchain
By default, the toolchain will install to /opt folder. ```sh
You can install the toolchain to a different directory by overriding TOOLDIR (e.g. export TOOLDIR=$HOME/tools). ./ci/toolchain_install.sh --all
```
$ ./ci/toolchain_install.sh --all ### set environment variables
$ source ./ci/toolchain_env.sh ```sh
### Build Vortex sources # should always run before using the toolchain!
$ make -s source ./ci/toolchain_env.sh
```
### Building Vortex
```sh
make -s
```
### Quick demo running vecadd OpenCL kernel on 2 cores ### Quick demo running vecadd OpenCL kernel on 2 cores
$ ./ci/blackbox.sh --cores=2 --app=vecadd ```sh
./ci/blackbox.sh --cores=2 --app=vecadd
```
### Common Developer Tips
- Installing Vortex kernel and runtime libraries to use with external tools requires passing --prefix=<install-path> to the configure script.
```sh
../configure --xlen=32 --tooldir=$HOME/tools --prefix=<install-path>
make -s
make install
```
- Building Vortex 64-bit simply requires using the --xlen=64 configure option.
```sh
../configure --xlen=64 --tooldir=$HOME/tools
```
- Sourcing "./ci/toolchain_env.sh" is required every time you start a new terminal. We recommend adding "source <build-path>/ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login.
```sh
echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder.
```sh
../configure
```
- To debug the GPU, you can generate a "run.log" trace. See /docs/debugging.md for more information.
```sh
./ci/blackbox.sh --app=demo --debug=3
```
- For additional information, check out the /docs.

View file

@ -1,4 +0,0 @@
Release Notes!
* 07/01/2020 - LKG FPGA build - Passed basic, demo, vecadd kernels.

23
TODO
View file

@ -1,23 +0,0 @@
Functionality:
1) vx_cl_warpSpawn()
-> To be used by pocl->ops->run
2) newlib Integration (LoadFile(""))
-> To be used by the Rodinia benchmarks
3) POCL OPS Vortex Suite
Performance:
1) Icache doesn't need SEND_MEM_REQUEST Stage
-> Blocks are never dirty, so why not evict right away
2) Branch not taken speculation
3) Runtime -O2 not running on RTL, and -O3 not running on RTL and Emulator
Vector:
1) Cycle accurate simulator (would require Cache Simulator)

View file

@ -16,11 +16,21 @@
show_usage() show_usage()
{ {
echo "Vortex BlackBox Test Driver v1.0" echo "Vortex BlackBox Test Driver v1.0"
echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--rebuild=0|1] [--log=logfile] [--help]]" echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--rebuild=#n] [--log=logfile] [--help]]"
}
# Print the usage line, then the accepted values for the options that take an
# enumerated argument. Option and driver names match the ones the argument
# parser and the driver case statement accept (--perf=#class, opae).
show_help()
{
    show_usage
    echo " where"
    echo "--driver: gpu, simx, rtlsim, opae, xrt"
    echo "--app: any subfolder test under regression or opencl"
    echo "--perf: 0=disable, 1=pipeline, 2=memsys"
    echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
}
SCRIPT_DIR=$(dirname "$0") SCRIPT_DIR=$(dirname "$0")
VORTEX_HOME=$SCRIPT_DIR/.. ROOT_DIR=$SCRIPT_DIR/..
DRIVER=simx DRIVER=simx
APP=sgemm APP=sgemm
@ -36,6 +46,7 @@ SCOPE=0
HAS_ARGS=0 HAS_ARGS=0
PERF_CLASS=0 PERF_CLASS=0
REBUILD=2 REBUILD=2
TEMPBUILD=0
LOGFILE=run.log LOGFILE=run.log
for i in "$@" for i in "$@"
@ -102,7 +113,7 @@ case $i in
shift shift
;; ;;
--help) --help)
show_usage show_help
exit 0 exit 0
;; ;;
*) *)
@ -112,18 +123,27 @@ case $i in
esac esac
done done
if [ $REBUILD -eq 3 ];
then
REBUILD=1
TEMPBUILD=1
fi
case $DRIVER in case $DRIVER in
gpu)
DRIVER_PATH=
;;
simx) simx)
DRIVER_PATH=$VORTEX_HOME/runtime/simx DRIVER_PATH=$ROOT_DIR/runtime/simx
;; ;;
rtlsim) rtlsim)
DRIVER_PATH=$VORTEX_HOME/runtime/rtlsim DRIVER_PATH=$ROOT_DIR/runtime/rtlsim
;; ;;
opae) opae)
DRIVER_PATH=$VORTEX_HOME/runtime/opae DRIVER_PATH=$ROOT_DIR/runtime/opae
;; ;;
xrt) xrt)
DRIVER_PATH=$VORTEX_HOME/runtime/xrt DRIVER_PATH=$ROOT_DIR/runtime/xrt
;; ;;
*) *)
echo "invalid driver: $DRIVER" echo "invalid driver: $DRIVER"
@ -131,17 +151,34 @@ case $DRIVER in
;; ;;
esac esac
if [ -d "$VORTEX_HOME/tests/opencl/$APP" ]; if [ -d "$ROOT_DIR/tests/opencl/$APP" ];
then then
APP_PATH=$VORTEX_HOME/tests/opencl/$APP APP_PATH=$ROOT_DIR/tests/opencl/$APP
elif [ -d "$VORTEX_HOME/tests/regression/$APP" ]; elif [ -d "$ROOT_DIR/tests/regression/$APP" ];
then then
APP_PATH=$VORTEX_HOME/tests/regression/$APP APP_PATH=$ROOT_DIR/tests/regression/$APP
else else
echo "Application folder not found: $APP" echo "Application folder not found: $APP"
exit -1 exit -1
fi fi
# The gpu driver has an empty DRIVER_PATH, so there is nothing to build here:
# run the application's run-gpu target directly and leave the script before
# the configuration and driver-build logic that follows.
if [ "$DRIVER" = "gpu" ]; then
    if [ $HAS_ARGS -eq 1 ]; then
        echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
        OPTS=$ARGS make -C $APP_PATH run-$DRIVER
    else
        echo "running: make -C $APP_PATH run-$DRIVER"
        make -C $APP_PATH run-$DRIVER
    fi
    # $? is the status of the make command that ran in whichever branch was taken.
    exit $?
fi
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS" CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS" echo "CONFIGS=$CONFIGS"
@ -156,31 +193,63 @@ then
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
then then
make -C $DRIVER_PATH clean > /dev/null make -C $DRIVER_PATH clean-driver > /dev/null
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
fi fi
fi fi
# export performance monitor class identifier # export performance monitor class identifier
export PERF_CLASS=$PERF_CLASS export VORTEX_PROFILING=$PERF_CLASS
status=0 status=0
# ensure config update # ensure config update
make -C $VORTEX_HOME/hw config > /dev/null make -C $ROOT_DIR/hw config > /dev/null
# ensure the stub driver is present # ensure the stub driver is present
make -C $VORTEX_HOME/runtime/stub > /dev/null make -C $ROOT_DIR/runtime/stub > /dev/null
if [ $DEBUG -ne 0 ] if [ $DEBUG -ne 0 ]
then then
# running application
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization # driver initialization
if [ $SCOPE -eq 1 ] if [ $SCOPE -eq 1 ]
then then
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH" echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else else
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH" echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi fi
@ -188,26 +257,59 @@ then
if [ $HAS_ARGS -eq 1 ] if [ $HAS_ARGS -eq 1 ]
then then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$? status=$?
else else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1" echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1 DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$? status=$?
fi fi
fi
if [ -f "$APP_PATH/trace.vcd" ] if [ -f "$APP_PATH/trace.vcd" ]
then then
mv -f $APP_PATH/trace.vcd . mv -f $APP_PATH/trace.vcd .
fi fi
else else
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization # driver initialization
if [ $SCOPE -eq 1 ] if [ $SCOPE -eq 1 ]
then then
echo "running: SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH" echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER
status=$?
fi
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else else
echo "running: CONFIGS="$CONFIGS" make -C $DRIVER_PATH" echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH"
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi fi
@ -223,5 +325,6 @@ else
status=$? status=$?
fi fi
fi fi
fi
exit $status exit $status

41
ci/datagen.py Executable file
View file

@ -0,0 +1,41 @@
#!/usr/bin/env python3
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import struct
import random
import sys
def create_binary_file(n, filename):
    """Write a count header followed by ``n`` random floats to ``filename``.

    File layout: one C ``int`` holding ``n``, then ``n`` C ``float`` values,
    all in the machine's native byte order (``struct`` formats ``'i'`` and
    ``'f'``). The values come from ``random.random()``.

    Args:
        n: Number of floats to generate; must be >= 0 and fit in a C ``int``.
        filename: Path of the file to create; an existing file is overwritten.

    Raises:
        ValueError: If ``n`` is negative.
        struct.error: If ``n`` does not fit in a C ``int``.
    """
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    # Pack the header before opening the file so that a count which cannot be
    # encoded fails without leaving an empty file behind.
    header = struct.pack('i', n)
    # Compile the float format once instead of re-parsing it per value.
    pack_float = struct.Struct('f').pack

    with open(filename, 'wb') as f:
        f.write(header)
        # Stream the values; nothing proportional to n is held in memory.
        f.writelines(pack_float(random.random()) for _ in range(n))
if __name__ == "__main__":
    # Expect exactly two arguments: the float count and the output path.
    if len(sys.argv) != 3:
        # Report the name the script was actually invoked with.
        print(f"Usage: {sys.argv[0]} N filename")
        sys.exit(1)

    try:
        n = int(sys.argv[1])
    except ValueError:
        # A non-integer count is a usage error, not a crash.
        print(f"Error: N must be an integer, got '{sys.argv[1]}'", file=sys.stderr)
        sys.exit(1)
    filename = sys.argv[2]

    create_binary_file(n, filename)
    print(f"Created binary file '{filename}' containing {n} floats.")

View file

@ -1,322 +0,0 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# exit when any command fails
set -e
# clear blackbox cache
rm -f blackbox.*.cache
# Runs the "run" target of tests/unittest, then the default target of
# hw/unittest. The script runs under `set -e`, so the first failure aborts.
unittest()
{
make -C tests/unittest run
make -C hw/unittest
}
# Runs the RISC-V ISA tests on simx and rtlsim, then repeats the rtlsim
# floating-point targets after rebuilding sim/rtlsim with each FPU define.
# Statement order matters: every "clean && CONFIGS=... make" line replaces the
# rtlsim build that the following test line uses.
isa()
{
echo "begin isa tests..."
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
# Same rtlsim run again with -DDPI_DISABLE passed through CONFIGS.
CONFIGS="-DDPI_DISABLE" make -C tests/riscv/isa run-rtlsim
# 32-bit float target against FPU_FPNEW, FPU_DPI and FPU_DSP builds in turn.
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-32f
# 64-bit variants. XLEN is not assigned in the visible part of this script;
# presumably the caller exports it - confirm.
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
# "|| true": a failure of the 64d run is ignored and does not stop the script.
make -C tests/riscv/isa run-rtlsim-64d || true
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64fx
fi
# Leave behind an rtlsim build made without any CONFIGS override.
make -C sim/rtlsim clean && make -C sim/rtlsim
echo "isa tests done!"
}
# Runs the kernel and regression test directories on simx and rtlsim, then
# the dogfood app through ci/blackbox.sh for the FPU variants and the
# local/global barrier tests.
regression()
{
echo "begin regression tests..."
make -C tests/kernel run-simx
make -C tests/kernel run-rtlsim
make -C tests/regression run-simx
make -C tests/regression run-rtlsim
# test FPU hardware implementations
CONFIGS="-DFPU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t19"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t19"
# test global barrier (built with -DGBAR_ENABLE, run on 2 cores)
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t20" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t20" --cores=2
# test FPU core
# NOTE(review): no command follows the heading above.
echo "regression tests done!"
}
# Runs the OpenCL test directory on simx, then on rtlsim.
opencl()
{
echo "begin opencl tests..."
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
echo "opencl tests done!"
}
# Runs the diverge app (and io_addr once) through ci/blackbox.sh over a grid
# of warp/thread counts, core/cluster counts and L2/L3 cache options, on both
# rtlsim and simx.
cluster()
{
echo "begin clustering tests..."
# warp/threads configurations
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
echo "clustering tests done!"
}
# Compares simx and rtlsim execution traces, then exercises the --perf,
# --debug and --scope options of ci/blackbox.sh.
debug()
{
echo "begin debugging tests..."
# test CSV trace generation
# Both simulators are rebuilt with DEBUG=3 (rtlsim also with -DGPR_RESET),
# their 32im ISA runs are logged, and each log is converted to CSV.
make -C sim/simx clean && DEBUG=3 make -C sim/simx
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
# diff exits non-zero when the two CSV files differ, which aborts the script
# under `set -e`.
diff trace_rtlsim.csv trace_simx.csv
# Rebuild both simulators without DEBUG before the remaining runs.
make -C sim/simx clean && make -C sim/simx
make -C sim/rtlsim clean && make -C sim/rtlsim
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
echo "debugging tests done!"
}
# Configuration suite: runs blackbox tests under a series of CONFIGS
# overrides and blackbox options, grouped below by the feature under test.
config()
{
echo "begin configuration tests..."
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# issue width
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
# dispatch size
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
# FPU scaling
# NOTE(review): these pair NUM_ALU_BLOCK with NUM_FPU_LANES; confirm whether
# NUM_FPU_BLOCK was intended.
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
# custom program startup address
# dogfood is rebuilt with STARTUP_ADDR set, tested on simx and rtlsim, then
# cleaned and rebuilt without the override
make -C tests/regression/dogfood clean-all
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-all
make -C tests/regression/dogfood
# disabling M extension
CONFIGS="-DEXT_M_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
# disabling F extension
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf=1
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf=1
# disable shared memory
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf=1
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf=1
# disable L1 cache
CONFIGS="-DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
# multiple L1 caches per cluster
CONFIGS="-DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --cores=8 --warps=1 --threads=2
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
# adjust l1 block size to match l2
CONFIGS="-DL1_LINE_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
# test cache banking
CONFIGS="-DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemm
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
echo "configuration tests done!"
}
# Stress suite 0: runs three opae blackbox tests with
# -DVERILATOR_RESET_VALUE=1.
stress0()
{
    local reset_cfg="-DVERILATOR_RESET_VALUE=1"
    local app

    echo "begin stress0 tests..."
    # test verilator reset values
    # the first two apps share the same 2-core / 2-cluster / L2+L3 topology
    for app in dogfood io_addr; do
        CONFIGS="$reset_cfg" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=$app
    done
    CONFIGS="$reset_cfg" ./ci/blackbox.sh --driver=opae --app=printf
    echo "stress0 tests done!"
}
# Stress suite 1: a single sgemm run on rtlsim with -n128 and the L2 cache
# enabled.
stress1()
{
    local -a run_opts=(--driver=rtlsim --app=sgemm --args="-n128" --l2cache)

    echo "begin stress1 tests..."
    ./ci/blackbox.sh "${run_opts[@]}"
    echo "stress1 tests done!"
}
# Synthesis suite: cleans the yosys flow, then runs its `elaborate` target
# with DPI and the F extension disabled.
synthesis()
{
    local yosys_dir=hw/syn/yosys

    echo "begin synthesis tests..."
    PREFIX=build_base make -C "$yosys_dir" clean
    PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C "$yosys_dir" elaborate
    echo "synthesis tests done!"
}
# Print the script name and the list of accepted command-line options.
show_usage()
{
    echo "Vortex Regression Test"
    # the help switch accepted by the option parser is -h, not --h
    echo "Usage: $0 [--unittest] [--isa] [--regression] [--opencl] [--cluster] [--debug] [--config] [--stress[#n]] [--synthesis] [--all] [-h|--help]"
}
# Entry point: run the suite named by each argument, in command-line order,
# then report the elapsed wall-clock time.
start=$SECONDS
while [ -n "$1" ]; do
    case "$1" in
        --unittest)   unittest ;;
        --isa)        isa ;;
        --regression) regression ;;
        --opencl)     opencl ;;
        --cluster)    cluster ;;
        --debug)      debug ;;
        --config)     config ;;
        --stress0)    stress0 ;;
        --stress1)    stress1 ;;
        --stress)
            stress0
            stress1
            ;;
        --synthesis)  synthesis ;;
        --all)
            unittest
            isa
            regression
            opencl
            cluster
            debug
            config
            stress0
            stress1
            synthesis
            ;;
        -h | --help)
            show_usage
            exit
            ;;
        *)
            # unknown option: print usage and fail
            show_usage
            exit 1
            ;;
    esac
    shift
done
echo "Regression completed!"
duration=$(( SECONDS - start ))
# whole seconds split into hours, minutes and seconds
printf "Elapsed Time: %d:%02d:%02d\n" $(( duration / 3600 )) $(( duration / 60 % 60 )) $(( duration % 60 ))

450
ci/regression.sh.in Executable file
View file

@ -0,0 +1,450 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort the run as soon as any command fails.
set -o errexit

# Clear blackbox cache files left in the current directory.
rm -f blackbox.*.cache

# HW: add a test "VM Test" to make sure VM feature is enabled

# XLEN can be overridden from the environment; otherwise the templated
# default on the next line is used.
: "${XLEN:=@XLEN@}"
# XLEN expressed in bytes
XSIZE=$(( XLEN / 8 ))

printf '%s\n' "Vortex Regression Test: XLEN=$XLEN"
# Unit tests: runs the software unit tests, then builds the hardware unit
# tests with their output discarded.
unittest()
{
    local sw_tests=tests/unittest
    local hw_tests=hw/unittest

    make -C "$sw_tests" run
    make -C "$hw_tests" >/dev/null
}
# Rebuild sim/rtlsim from scratch with the CONFIGS string given in $1.
# Build output is discarded. The clean and the build are separate commands
# on purpose: in `clean && build`, `set -e` ignores a failing clean and the
# following test would run against a stale simulator.
isa_rebuild_rtlsim()
{
    make -C sim/rtlsim clean
    CONFIGS="$1" make -C sim/rtlsim > /dev/null
}

# ISA suite: runs the RISC-V ISA tests on simx and rtlsim, then repeats the
# floating-point subsets on rtlsim for each FPU configuration. The 64-bit
# subsets run only when XLEN is 64.
isa()
{
    local fpu

    echo "begin isa tests..."
    make -C sim/simx
    make -C sim/rtlsim
    make -C tests/riscv/isa run-simx
    make -C tests/riscv/isa run-rtlsim

    # 32-bit float tests, once per FPU implementation
    for fpu in FPU_FPNEW FPU_DPI FPU_DSP; do
        isa_rebuild_rtlsim "-D$fpu"
        make -C tests/riscv/isa run-rtlsim-32f
    done

    if [ "$XLEN" == "64" ]
    then
        # double-precision tests
        isa_rebuild_rtlsim "-DFPU_FPNEW"
        make -C tests/riscv/isa run-rtlsim-64d
        isa_rebuild_rtlsim "-DFPU_DPI"
        make -C tests/riscv/isa run-rtlsim-64d
        # single-precision tests with the D extension disabled
        isa_rebuild_rtlsim "-DFPU_DPI -DEXT_D_DISABLE"
        make -C tests/riscv/isa run-rtlsim-64f
        isa_rebuild_rtlsim "-DFPU_FPNEW -DEXT_D_DISABLE"
        make -C tests/riscv/isa run-rtlsim-64f
        isa_rebuild_rtlsim "-DFPU_DSP -DEXT_D_DISABLE"
        make -C tests/riscv/isa run-rtlsim-64fx
    fi

    # clean build
    make -C sim/rtlsim clean
    echo "isa tests done!"
}
# Kernel suite: builds both simulators, then runs the kernel tests on each.
kernel()
{
    local sim

    echo "begin kernel tests..."
    # build everything first, then run, as the suite always has
    for sim in simx rtlsim; do
        make -C "sim/$sim"
    done
    for sim in simx rtlsim; do
        make -C tests/kernel "run-$sim"
    done
    echo "kernel tests done!"
}
# Regression suite: builds the simx and rtlsim runtimes, runs the regression
# tests on both, then runs the dogfood barrier tests on simx and opae.
regression()
{
    local drv

    echo "begin regression tests..."
    for drv in simx rtlsim; do
        make -C "runtime/$drv"
    done
    for drv in simx rtlsim; do
        make -C tests/regression "run-$drv"
    done

    # test global barrier
    for drv in simx opae; do
        CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=$drv --app=dogfood --args="-n1 -tgbar" --cores=2
    done

    # test local barrier
    for drv in simx opae; do
        ./ci/blackbox.sh --driver=$drv --app=dogfood --args="-n1 -tbar"
    done
    echo "regression tests done!"
}
# OpenCL suite: builds the simx and rtlsim runtimes, runs the OpenCL tests
# on both, then runs lbm with 8 warps on each driver.
opencl()
{
    local drv

    echo "begin opencl tests..."
    for drv in simx rtlsim; do
        make -C "runtime/$drv"
    done
    for drv in simx rtlsim; do
        make -C tests/opencl "run-$drv"
    done
    for drv in simx rtlsim; do
        ./ci/blackbox.sh --driver=$drv --app=lbm --warps=8
    done
    echo "opencl tests done!"
}
# VM suite: runs the kernel, regression and OpenCL tests on simx only.
vm()
{
    local blackbox=./ci/blackbox.sh

    echo "begin vm tests..."
    make -C sim/simx
    make -C runtime/simx
    make -C tests/kernel run-simx

    # Regression tests
    make -C tests/regression run-simx
    # test global barrier
    CONFIGS="-DGBAR_ENABLE" "$blackbox" --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
    # test local barrier
    "$blackbox" --driver=simx --app=dogfood --args="-n1 -tbar"

    # OpenCL tests
    make -C tests/opencl run-simx
    "$blackbox" --driver=simx --app=lbm --warps=8
    echo "vm tests done!"
}
# Cache suite: runs blackbox tests across local-memory, L1, line-size,
# associativity, banking, writeback and L2/L3 configurations.
cache()
{
    echo "begin cache tests..."

    # disable local memory
    CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
    CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1

    # disable L1 cache
    CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx

    # reduce l1 line size
    CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
    CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr
    CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx

    # test cache ways
    CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx

    # test cache banking
    CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
    CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
    CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx

    # test writeback
    CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
    CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
    CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
    CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress

    # cache clustering
    CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2

    # L2/L3
    CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=diverge --args="-n1"
    CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=diverge --args="-n1"
    CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
    CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"

    # closing message; the other suites end with "<name> tests done!"
    echo "cache tests done!"
}
# Configuration suite 1: runs blackbox tests across warp/thread counts, core
# clustering, issue width, and ALU/FPU/LSU scaling settings.
config1()
{
echo "begin configuration-1 tests..."
# warp/threads
./ci/blackbox.sh --driver=rtlsim --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
# issue width
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=4" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=4" ./ci/blackbox.sh --driver=simx --app=diverge
# ALU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=4 -DNUM_ALU_BLOCK=4 -DNUM_ALU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2 -DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=4 -DNUM_ALU_BLOCK=4 -DNUM_ALU_LANES=4" ./ci/blackbox.sh --driver=simx --app=diverge
# FPU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# FPU's PE scaling
# each *_PE_RATIO override is run against a dogfood test selected with -t (rtlsim only)
CONFIGS="-DFMA_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfmadd"
CONFIGS="-DFCVT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tftoi"
CONFIGS="-DFDIV_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfdiv"
CONFIGS="-DFSQRT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfsqrt"
CONFIGS="-DFNCP_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfclamp"
# LSU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
echo "configuration-1 tests done!"
}
# Configuration suite 2: covers the opae driver, DPI-less builds, a custom
# startup address, disabled ISA extensions, the AXI bus, memory block sizes
# and platform DRAM parameters.
config2()
{
    echo "begin configuration-2 tests..."

    # test opaesim
    ./ci/blackbox.sh --driver=opae --app=printf
    ./ci/blackbox.sh --driver=opae --app=diverge

    # disable DPI
    CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
    CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood

    # custom program startup address
    make -C tests/regression/dogfood clean-kernel
    if [ "$XLEN" == "64" ]; then
        STARTUP_ADDR=0x180000000 make -C tests/regression/dogfood
    else
        STARTUP_ADDR=0x80000000 make -C tests/regression/dogfood
    fi
    ./ci/blackbox.sh --driver=simx --app=dogfood
    ./ci/blackbox.sh --driver=rtlsim --app=dogfood
    # drop the kernel built with the custom address
    make -C tests/regression/dogfood clean-kernel

    # disabling M & F extensions
    # clean and build are separate commands: in `clean && build`, `set -e`
    # ignores a failing clean and the ISA run below would use a stale rtlsim
    make -C sim/rtlsim clean
    CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
    make -C tests/riscv/isa run-rtlsim-32i
    make -C sim/rtlsim clean

    # disabling ZICOND extension
    CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo

    # test AXI bus
    AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress

    # test 128-bit MEM block
    CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress

    # test XLEN-bit MEM block
    CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress
    CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress

    # test memory coalescing
    CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
    CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8

    # test single-bank DRAM
    CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress

    # test 27-bit DRAM address
    CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress

    echo "configuration-2 tests done!"
}
# Builds simx and rtlsim with DEBUG=3, runs the 32im ISA tests on both,
# converts each log to CSV and fails (through diff's exit status) when the
# two traces differ.
test_csv_trace()
{
    # test CSV trace generation
    # clean and build are separate commands: in `clean && build`, `set -e`
    # ignores a failing clean and the traces would come from stale binaries
    make -C sim/simx clean
    DEBUG=3 make -C sim/simx > /dev/null
    make -C sim/rtlsim clean
    DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null

    # capture the run output of each simulator
    make -C tests/riscv/isa run-simx-32im > run_simx.log
    make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log

    # convert the logs and compare the resulting traces
    ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
    ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
    diff trace_rtlsim.csv trace_simx.csv

    # clean build
    make -C sim/simx clean
    make -C sim/rtlsim clean
}
# Debugging suite: runs the CSV trace comparison, then the demo app with
# --debug and --perf on opae and simx, and finally a --scope run on opae.
debug()
{
    local drv

    echo "begin debugging tests..."
    test_csv_trace
    for drv in opae simx; do
        CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=$drv --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
    done
    ./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
    echo "debugging tests done!"
}
# Stress suite: a writeback-enabled multi-cluster dogfood run on opae and a
# large sgemmx run on rtlsim, both with -DVERILATOR_RESET_VALUE=1.
stress()
{
    local reset_cfg="-DVERILATOR_RESET_VALUE=1"
    local writeback_cfg="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1"

    echo "begin stress tests..."
    # test verilator reset values
    CONFIGS="$reset_cfg $writeback_cfg" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
    CONFIGS="$reset_cfg" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
    echo "stress tests done!"
}
# Synthesis suite: cleans the yosys flow, then runs its `synthesis` target
# with DPI and the F extension disabled.
synthesis()
{
    local yosys_dir=hw/syn/yosys

    echo "begin synthesis tests..."
    PREFIX=build_base make -C "$yosys_dir" clean
    PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C "$yosys_dir" synthesis
    echo "synthesis tests done!"
}
# Print the script name and every option accepted by the argument parser.
show_usage()
{
    echo "Vortex Regression Test"
    # --vm is accepted by the parser, and the short help switch is -h
    echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--vm] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [-h|--help]"
}
# Entry point. Arguments are first collected into an ordered list of suites;
# the suites run only after the whole command line has been parsed.
declare -a tests=()
clean=0
while [ -n "$1" ]; do
    case "$1" in
        --clean)
            clean=1
            ;;
        --unittest | --isa | --kernel | --regression | --opencl | --cache | \
        --vm | --config1 | --config2 | --debug | --stress | --synthesis)
            # each suite function is named after its option without the "--"
            tests+=("${1#--}")
            ;;
        --all)
            # replaces anything queued so far; vm is not part of this list
            tests=(unittest isa kernel regression opencl cache config1 config2 debug stress synthesis)
            ;;
        -h | --help)
            show_usage
            exit
            ;;
        *)
            # unknown option: print usage and fail
            show_usage
            exit 1
            ;;
    esac
    shift
done

# --clean requests a full rebuild before any suite runs
if [ "$clean" -eq 1 ]; then
    make clean
    make -s
fi

# timing starts after the optional rebuild
start=$SECONDS
for test in "${tests[@]}"; do
    "$test"
done
echo "Regression completed!"
duration=$(( SECONDS - start ))
# whole seconds split into hours, minutes and seconds
printf "Elapsed Time: %d:%02d:%02d\n" $(( duration / 3600 )) $(( duration / 60 % 60 )) $(( duration % 60 ))

27
ci/system_updates.sh Executable file
View file

@ -0,0 +1,27 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Stop at the first failing command.
set -e

# Refresh the package index, register the ubuntu-toolchain-r/test PPA, then
# refresh again so its packages are visible.
apt-get update -y
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update

# Install GCC/G++ 11 and register each as an alternative with priority 100.
apt-get install -y g++-11 gcc-11
for tool in g++ gcc; do
    update-alternatives --install "/usr/bin/${tool}" "${tool}" "/usr/bin/${tool}-11" 100
done

# Remaining build and test dependencies.
apt-get install -y \
    build-essential valgrind libstdc++6 binutils python uuid-dev ccache

8
ci/toolchain_env.sh → ci/toolchain_env.sh.in Normal file → Executable file
View file

@ -1,6 +1,6 @@
#!/bin/sh #!/bin/sh
# Copyright 2023 blaise # Copyright 2019-2023
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -14,10 +14,10 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
TOOLDIR=${TOOLDIR:=/opt} TOOLDIR=${TOOLDIR:=@TOOLDIR@}
export VERILATOR_ROOT=$TOOLDIR/verilator # export VERILATOR_ROOT=$TOOLDIR/verilator
export PATH=$VERILATOR_ROOT/bin:$PATH # export PATH=$VERILATOR_ROOT/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v export SV2V_PATH=$TOOLDIR/sv2v
export PATH=$SV2V_PATH/bin:$PATH export PATH=$SV2V_PATH/bin:$PATH

View file

@ -1,184 +0,0 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort as soon as any command fails.
set -o errexit

# Base URL of the prebuilt toolchain archives.
REPOSITORY=https://github.com/vortexgpgpu/vortex-toolchain-prebuilt/raw/master

# Install prefix and OS sub-directory; both can be overridden from the
# environment.
: "${TOOLDIR:=/opt}"
: "${OSDIR:=ubuntu/bionic}"
# OS is not referenced by any function in this script.
: "${OS:=ubuntu/bionic}"
# Download the split riscv-gnu-toolchain archive for $OSDIR, reassemble and
# unpack it, copy the result into $TOOLDIR, then remove the local files.
riscv()
{
    local pkg=riscv-gnu-toolchain
    local archive=${pkg}.tar.bz2
    local letters letter

    # centos/7 is published in fewer parts than the other platforms
    if [ "$OSDIR" = "centos/7" ]; then
        letters="a b c d e f g h"
    else
        letters="a b c d e f g h i j"
    fi

    rm -f ${archive}.parta*
    for letter in $letters; do
        wget $REPOSITORY/$pkg/$OSDIR/${archive}.parta$letter
    done
    cat ${archive}.parta* > $archive
    tar -xvf $archive
    cp -r $pkg $TOOLDIR
    rm -f ${archive}*
    rm -rf $pkg
}
# Download the split riscv64-gnu-toolchain archive for $OSDIR, reassemble
# and unpack it, copy the result into $TOOLDIR, then remove the local files.
riscv64()
{
    local pkg=riscv64-gnu-toolchain
    local archive=${pkg}.tar.bz2
    local letters letter

    # centos/7 is published in fewer parts than the other platforms
    if [ "$OSDIR" = "centos/7" ]; then
        letters="a b c d e f g h"
    else
        letters="a b c d e f g h i j"
    fi

    rm -f ${archive}.parta*
    for letter in $letters; do
        wget $REPOSITORY/$pkg/$OSDIR/${archive}.parta$letter
    done
    cat ${archive}.parta* > $archive
    tar -xvf $archive
    cp -r $pkg $TOOLDIR
    rm -f ${archive}*
    rm -rf $pkg
}
# Download the two-part llvm-vortex archive for $OSDIR, reassemble and
# unpack it, copy the result into $TOOLDIR, then remove the local files.
llvm-vortex()
{
    local pkg=llvm-vortex
    local archive=${pkg}.tar.bz2
    # every platform uses the same two parts
    local letters="a b"
    local letter

    echo $letters
    rm -f ${archive}.parta*
    for letter in $letters; do
        wget $REPOSITORY/$pkg/$OSDIR/${archive}.parta$letter
    done
    cat ${archive}.parta* > $archive
    tar -xvf $archive
    cp -r $pkg $TOOLDIR
    rm -f ${archive}*
    rm -rf $pkg
}
# Download the two-part llvm-pocl archive for $OSDIR, reassemble and unpack
# it, copy the result into $TOOLDIR, then remove the local files.
llvm-pocl()
{
    local pkg=llvm-pocl
    local archive=${pkg}.tar.bz2
    # every platform uses the same two parts
    local letters="a b"
    local letter

    echo $letters
    rm -f ${archive}.parta*
    for letter in $letters; do
        wget $REPOSITORY/$pkg/$OSDIR/${archive}.parta$letter
    done
    cat ${archive}.parta* > $archive
    tar -xvf $archive
    cp -r $pkg $TOOLDIR
    rm -f ${archive}*
    rm -rf $pkg
}
# Download and unpack the pocl archive for $OSDIR, copy it into $TOOLDIR,
# then remove the local files.
pocl()
{
    local pkg=pocl
    local archive=${pkg}.tar.bz2

    wget $REPOSITORY/$pkg/$OSDIR/$archive
    tar -xvf $archive
    rm -f $archive
    cp -r $pkg $TOOLDIR
    rm -rf $pkg
}
# Download and unpack the verilator archive for $OSDIR, copy it into
# $TOOLDIR, then remove the local files.
verilator()
{
    local pkg=verilator
    local archive=${pkg}.tar.bz2

    wget $REPOSITORY/$pkg/$OSDIR/$archive
    tar -xvf $archive
    cp -r $pkg $TOOLDIR
    rm -f $archive
    rm -rf $pkg
}
# Download and unpack the sv2v archive for $OSDIR, copy it into $TOOLDIR,
# then remove the local files.
sv2v()
{
    local pkg=sv2v
    local archive=${pkg}.tar.bz2

    wget $REPOSITORY/$pkg/$OSDIR/$archive
    tar -xvf $archive
    rm -f $archive
    cp -r $pkg $TOOLDIR
    rm -rf $pkg
}
# Download the three-part yosys archive for $OSDIR, reassemble and unpack
# it, copy the result into $TOOLDIR, then remove the local files.
yosys()
{
    local pkg=yosys
    local archive=${pkg}.tar.bz2
    # every platform uses the same three parts
    local letters="a b c"
    local letter

    echo $letters
    rm -f ${archive}.parta*
    for letter in $letters; do
        wget $REPOSITORY/$pkg/$OSDIR/${archive}.parta$letter
    done
    cat ${archive}.parta* > $archive
    tar -xvf $archive
    cp -r $pkg $TOOLDIR
    rm -f ${archive}*
    rm -rf $pkg
}
# Print a short description and the accepted command-line options.
show_usage()
{
    printf '%s\n' "Install Pre-built Vortex Toolchain"
    printf '%s\n' "Usage: $0 [[--riscv] [--riscv64] [--llvm-vortex] [--llvm-pocl] [--pocl] [--verilator] [--sv2v] [--yosys] [--all] [-h|--help]]"
}
# Entry point: install the package named by each argument, in command-line
# order.
while [ -n "$1" ]; do
    case "$1" in
        --pocl | --verilator | --riscv | --riscv64 | --llvm-vortex | --llvm-pocl | --sv2v | --yosys)
            # each installer function is named after its option without the "--"
            "${1#--}"
            ;;
        --all)
            # llvm-pocl is not part of --all
            for tool in pocl verilator sv2v yosys llvm-vortex riscv riscv64; do
                "$tool"
            done
            ;;
        -h | --help)
            show_usage
            exit
            ;;
        *)
            # unknown option: print usage and fail
            show_usage
            exit 1
            ;;
    esac
    shift
done

199
ci/toolchain_install.sh.in Executable file
View file

@ -0,0 +1,199 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort as soon as any command fails.
set -o errexit

# Base URL of the prebuilt toolchain archives.
REPOSITORY=https://github.com/vortexgpgpu/vortex-toolchain-prebuilt/raw/master

# Install prefix and OS sub-directory: environment values win, otherwise the
# templated defaults on the following lines are used.
: "${TOOLDIR:=@TOOLDIR@}"
: "${OSVERSION:=@OSVERSION@}"
# Download the split riscv32-gnu-toolchain archive for $OSVERSION,
# reassemble and unpack it, and install it under $TOOLDIR, replacing any
# previous copy.
riscv32()
{
    # the number of archive parts depends on the platform
    case $OSVERSION in
        "centos/7")     parts=$(eval echo {a..h}) ;;
        "ubuntu/focal") parts=$(eval echo {a..k}) ;;
        *)              parts=$(eval echo {a..j}) ;;
    esac
    # remove parts left behind by an earlier run before downloading again
    rm -f riscv32-gnu-toolchain.tar.bz2.parta*
    for x in $parts
    do
        wget $REPOSITORY/riscv32-gnu-toolchain/$OSVERSION/riscv32-gnu-toolchain.tar.bz2.parta$x
    done
    cat riscv32-gnu-toolchain.tar.bz2.parta* > riscv32-gnu-toolchain.tar.bz2
    tar -xvf riscv32-gnu-toolchain.tar.bz2
    # Separate commands rather than an && chain: `set -e` ignores a failure
    # of any command but the last in such a chain, which would let this
    # function succeed without installing anything.
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/riscv32-gnu-toolchain"
    mv riscv32-gnu-toolchain "$TOOLDIR"
    rm -rf riscv32-gnu-toolchain.tar.bz2*
}
# Download the split riscv64-gnu-toolchain archive for $OSVERSION,
# reassemble and unpack it, and install it under $TOOLDIR, replacing any
# previous copy.
riscv64()
{
    # the number of archive parts depends on the platform
    case $OSVERSION in
        "centos/7") parts=$(eval echo {a..h}) ;;
        *)          parts=$(eval echo {a..j}) ;;
    esac
    # remove parts left behind by an earlier run before downloading again
    rm -f riscv64-gnu-toolchain.tar.bz2.parta*
    for x in $parts
    do
        wget $REPOSITORY/riscv64-gnu-toolchain/$OSVERSION/riscv64-gnu-toolchain.tar.bz2.parta$x
    done
    cat riscv64-gnu-toolchain.tar.bz2.parta* > riscv64-gnu-toolchain.tar.bz2
    tar -xvf riscv64-gnu-toolchain.tar.bz2
    # Separate commands rather than an && chain: `set -e` ignores a failure
    # of any command but the last in such a chain, which would let this
    # function succeed without installing anything.
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/riscv64-gnu-toolchain"
    mv riscv64-gnu-toolchain "$TOOLDIR"
    rm -rf riscv64-gnu-toolchain riscv64-gnu-toolchain.tar.bz2*
}
# Download the split llvm-vortex2 archive for $OSVERSION, reassemble and
# unpack it, and install the extracted llvm-vortex directory under $TOOLDIR,
# replacing any previous copy.
llvm()
{
    # both branches currently list the same two parts
    case $OSVERSION in
        "centos/7") parts=$(eval echo {a..b}) ;;
        *)          parts=$(eval echo {a..b}) ;;
    esac
    echo $parts
    # remove parts left behind by an earlier run before downloading again
    rm -f llvm-vortex2.tar.bz2.parta*
    for x in $parts
    do
        wget $REPOSITORY/llvm-vortex/$OSVERSION/llvm-vortex2.tar.bz2.parta$x
    done
    cat llvm-vortex2.tar.bz2.parta* > llvm-vortex2.tar.bz2
    tar -xvf llvm-vortex2.tar.bz2
    # Separate commands rather than an && chain: `set -e` ignores a failure
    # of any command but the last in such a chain, which would let this
    # function succeed without installing anything.
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/llvm-vortex"
    mv llvm-vortex "$TOOLDIR"
    rm -rf llvm-vortex llvm-vortex2.tar.bz2*
}
# Download and unpack the libcrt32 archive and install it under $TOOLDIR,
# replacing any previous copy.
libcrt32()
{
    # a stale archive from an interrupted run would make wget save the new
    # download as "libcrt32.tar.bz2.1" while tar unpacks the stale file
    rm -f libcrt32.tar.bz2
    wget $REPOSITORY/libcrt32/libcrt32.tar.bz2
    tar -xvf libcrt32.tar.bz2
    # separate commands: in an && chain `set -e` ignores every failure but
    # the last command's
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/libcrt32"
    mv libcrt32 "$TOOLDIR"
    rm -rf libcrt32 libcrt32.tar.bz2
}
# Download and unpack the libcrt64 archive and install it under $TOOLDIR,
# replacing any previous copy.
libcrt64()
{
    # a stale archive from an interrupted run would make wget save the new
    # download as "libcrt64.tar.bz2.1" while tar unpacks the stale file
    rm -f libcrt64.tar.bz2
    wget $REPOSITORY/libcrt64/libcrt64.tar.bz2
    tar -xvf libcrt64.tar.bz2
    # separate commands: in an && chain `set -e` ignores every failure but
    # the last command's
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/libcrt64"
    mv libcrt64 "$TOOLDIR"
    rm -rf libcrt64 libcrt64.tar.bz2
}
# Download and unpack the libc32 archive and install it under $TOOLDIR,
# replacing any previous copy.
libc32()
{
    # a stale archive from an interrupted run would make wget save the new
    # download as "libc32.tar.bz2.1" while tar unpacks the stale file
    rm -f libc32.tar.bz2
    wget $REPOSITORY/libc32/libc32.tar.bz2
    tar -xvf libc32.tar.bz2
    # separate commands: in an && chain `set -e` ignores every failure but
    # the last command's
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/libc32"
    mv libc32 "$TOOLDIR"
    rm -rf libc32 libc32.tar.bz2
}
# Download and unpack the libc64 archive and install it under $TOOLDIR,
# replacing any previous copy.
libc64()
{
    # a stale archive from an interrupted run would make wget save the new
    # download as "libc64.tar.bz2.1" while tar unpacks the stale file
    rm -f libc64.tar.bz2
    wget $REPOSITORY/libc64/libc64.tar.bz2
    tar -xvf libc64.tar.bz2
    # separate commands: in an && chain `set -e` ignores every failure but
    # the last command's
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/libc64"
    mv libc64 "$TOOLDIR"
    rm -rf libc64 libc64.tar.bz2
}
# Download and unpack the pocl2 archive for $OSVERSION and install the
# extracted pocl directory under $TOOLDIR, replacing any previous copy.
pocl()
{
    # a stale archive from an interrupted run would make wget save the new
    # download as "pocl2.tar.bz2.1" while tar unpacks the stale file
    rm -f pocl2.tar.bz2
    wget $REPOSITORY/pocl/$OSVERSION/pocl2.tar.bz2
    tar -xvf pocl2.tar.bz2
    # separate commands: in an && chain `set -e` ignores every failure but
    # the last command's
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/pocl"
    mv pocl "$TOOLDIR"
    rm -rf pocl2 pocl2.tar.bz2
}
# Download and unpack the verilator archive for $OSVERSION and install it
# under $TOOLDIR, replacing any previous copy.
verilator()
{
    # a stale archive from an interrupted run would make wget save the new
    # download as "verilator.tar.bz2.1" while tar unpacks the stale file
    rm -f verilator.tar.bz2
    wget $REPOSITORY/verilator/$OSVERSION/verilator.tar.bz2
    tar -xvf verilator.tar.bz2
    # separate commands: in an && chain `set -e` ignores every failure but
    # the last command's
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/verilator"
    mv verilator "$TOOLDIR"
    rm -rf verilator verilator.tar.bz2
}
# Download and unpack the sv2v archive for $OSVERSION and install it under
# $TOOLDIR, replacing any previous copy.
sv2v()
{
    # a stale archive from an interrupted run would make wget save the new
    # download as "sv2v.tar.bz2.1" while tar unpacks the stale file
    rm -f sv2v.tar.bz2
    wget $REPOSITORY/sv2v/$OSVERSION/sv2v.tar.bz2
    tar -xvf sv2v.tar.bz2
    # separate commands: in an && chain `set -e` ignores every failure but
    # the last command's
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/sv2v"
    mv sv2v "$TOOLDIR"
    rm -rf sv2v sv2v.tar.bz2
}
# Download the split yosys archive for $OSVERSION, reassemble and unpack it,
# and install it under $TOOLDIR, replacing any previous copy.
yosys()
{
    # both branches currently list the same three parts
    case $OSVERSION in
        "centos/7") parts=$(eval echo {a..c}) ;;
        *)          parts=$(eval echo {a..c}) ;;
    esac
    echo $parts
    # remove parts left behind by an earlier run before downloading again
    rm -f yosys.tar.bz2.parta*
    for x in $parts
    do
        wget $REPOSITORY/yosys/$OSVERSION/yosys.tar.bz2.parta$x
    done
    cat yosys.tar.bz2.parta* > yosys.tar.bz2
    tar -xvf yosys.tar.bz2
    # Separate commands rather than an && chain: `set -e` ignores a failure
    # of any command but the last in such a chain, which would let this
    # function succeed without installing anything.
    mkdir -p "$TOOLDIR"
    rm -rf "$TOOLDIR/yosys"
    mv yosys "$TOOLDIR"
    # the directory was previously listed twice here
    rm -rf yosys yosys.tar.bz2*
}
# Print a short description and the accepted command-line options.
show_usage()
{
    printf '%s\n' "Install Pre-built Vortex Toolchain"
    printf '%s\n' "Usage: $0 [--pocl] [--verilator] [--riscv32] [--riscv64] [--llvm] [--libcrt32] [--libcrt64] [--libc32] [--libc64] [--sv2v] [--yosys] [--all] [-h|--help]"
}
# Entry point: install the package named by each argument, in command-line
# order.
while [ -n "$1" ]; do
    case "$1" in
        --pocl | --verilator | --riscv32 | --riscv64 | --llvm | \
        --libcrt32 | --libcrt64 | --libc32 | --libc64 | --sv2v | --yosys)
            # each installer function is named after its option without the "--"
            "${1#--}"
            ;;
        --all)
            for tool in pocl verilator llvm libcrt32 libcrt64 libc32 libc64 riscv32 riscv64 sv2v yosys; do
                "$tool"
            done
            ;;
        -h | --help)
            show_usage
            exit
            ;;
        *)
            # unknown option: print usage and fail
            show_usage
            exit 1
            ;;
    esac
    shift
done

View file

@ -1,128 +0,0 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort as soon as any command fails.
set -o errexit

# Directory holding the installed tools and the OS sub-directory used for
# the packaged output; both can be overridden from the environment.
: "${TOOLDIR:=/opt}"
: "${OSDIR:=ubuntu/bionic}"
# Archive $TOOLDIR/riscv-gnu-toolchain, split the archive into 50M parts and
# move the parts into ./riscv-gnu-toolchain/$OSDIR.
riscv()
{
    echo "prebuilt riscv-gnu-toolchain..."
    tar -C $TOOLDIR -cvjf riscv-gnu-toolchain.tar.bz2 riscv-gnu-toolchain
    split -b 50M riscv-gnu-toolchain.tar.bz2 "riscv-gnu-toolchain.tar.bz2.part"
    # make sure the destination exists; mv does not create it
    mkdir -p ./riscv-gnu-toolchain/$OSDIR
    mv riscv-gnu-toolchain.tar.bz2.part* ./riscv-gnu-toolchain/$OSDIR
    rm riscv-gnu-toolchain.tar.bz2
}
# Archive $TOOLDIR/riscv64-gnu-toolchain, split the archive into 50M parts
# and move the parts into ./riscv64-gnu-toolchain/$OSDIR.
riscv64()
{
    echo "prebuilt riscv64-gnu-toolchain..."
    tar -C $TOOLDIR -cvjf riscv64-gnu-toolchain.tar.bz2 riscv64-gnu-toolchain
    split -b 50M riscv64-gnu-toolchain.tar.bz2 "riscv64-gnu-toolchain.tar.bz2.part"
    # make sure the destination exists; mv does not create it
    mkdir -p ./riscv64-gnu-toolchain/$OSDIR
    mv riscv64-gnu-toolchain.tar.bz2.part* ./riscv64-gnu-toolchain/$OSDIR
    rm riscv64-gnu-toolchain.tar.bz2
}
# Archive $TOOLDIR/llvm-vortex, split the archive into 50M parts and move
# the parts into ./llvm-vortex/$OSDIR.
llvm-vortex()
{
    echo "prebuilt llvm-vortex..."
    tar -C $TOOLDIR -cvjf llvm-vortex.tar.bz2 llvm-vortex
    split -b 50M llvm-vortex.tar.bz2 "llvm-vortex.tar.bz2.part"
    # make sure the destination exists; mv does not create it
    mkdir -p ./llvm-vortex/$OSDIR
    mv llvm-vortex.tar.bz2.part* ./llvm-vortex/$OSDIR
    rm llvm-vortex.tar.bz2
}
# Archive $TOOLDIR/llvm-pocl, split the archive into 50M parts and move the
# parts into ./llvm-pocl/$OSDIR.
llvm-pocl()
{
    echo "prebuilt llvm-pocl..."
    tar -C $TOOLDIR -cvjf llvm-pocl.tar.bz2 llvm-pocl
    split -b 50M llvm-pocl.tar.bz2 "llvm-pocl.tar.bz2.part"
    # make sure the destination exists; mv does not create it
    mkdir -p ./llvm-pocl/$OSDIR
    mv llvm-pocl.tar.bz2.part* ./llvm-pocl/$OSDIR
    rm llvm-pocl.tar.bz2
}
# Archive $TOOLDIR/pocl and move the archive into ./pocl/$OSDIR.
pocl()
{
    echo "prebuilt pocl..."
    tar -C $TOOLDIR -cvjf pocl.tar.bz2 pocl
    # without the directory, mv would rename the archive to a file named
    # after the last path component instead of placing it inside
    mkdir -p ./pocl/$OSDIR
    mv pocl.tar.bz2 ./pocl/$OSDIR
}
# Archive $TOOLDIR/verilator and move the archive into ./verilator/$OSDIR.
verilator()
{
    echo "prebuilt verilator..."
    tar -C $TOOLDIR -cvjf verilator.tar.bz2 verilator
    # without the directory, mv would rename the archive to a file named
    # after the last path component instead of placing it inside
    mkdir -p ./verilator/$OSDIR
    mv verilator.tar.bz2 ./verilator/$OSDIR
}
# Archive $TOOLDIR/sv2v and move the archive into ./sv2v/$OSDIR.
sv2v()
{
    echo "prebuilt sv2v..."
    tar -C $TOOLDIR -cvjf sv2v.tar.bz2 sv2v
    # without the directory, mv would rename the archive to a file named
    # after the last path component instead of placing it inside
    mkdir -p ./sv2v/$OSDIR
    mv sv2v.tar.bz2 ./sv2v/$OSDIR
}
# Archive $TOOLDIR/yosys, split the archive into 50M parts and move the
# parts into ./yosys/$OSDIR.
yosys()
{
    echo "prebuilt yosys..."
    tar -C $TOOLDIR -cvjf yosys.tar.bz2 yosys
    split -b 50M yosys.tar.bz2 "yosys.tar.bz2.part"
    # make sure the destination exists; mv does not create it
    mkdir -p ./yosys/$OSDIR
    mv yosys.tar.bz2.part* ./yosys/$OSDIR
    rm yosys.tar.bz2
}
# Print a short description and every option accepted by the argument
# parser.
show_usage()
{
    echo "Setup Pre-built Vortex Toolchain"
    # --riscv64 is accepted by the parser, and the yosys option is --yosys
    echo "Usage: $0 [[--riscv] [--riscv64] [--llvm-vortex] [--llvm-pocl] [--pocl] [--verilator] [--sv2v] [--yosys] [--all] [-h|--help]]"
}
# Entry point: package the tool named by each argument, in command-line
# order.
while [ -n "$1" ]; do
    case "$1" in
        --pocl | --verilator | --riscv | --riscv64 | --llvm-vortex | --llvm-pocl | --sv2v | --yosys)
            # each packaging function is named after its option without the "--"
            "${1#--}"
            ;;
        --all)
            for tool in riscv riscv64 llvm-vortex llvm-pocl pocl verilator sv2v yosys; do
                "$tool"
            done
            ;;
        -h | --help)
            show_usage
            exit
            ;;
        *)
            # unknown option: print usage and fail
            show_usage
            exit 1
            ;;
    esac
    shift
done

167
ci/toolchain_prebuilt.sh.in Executable file
View file

@ -0,0 +1,167 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Abort the script as soon as any command returns a non-zero status.
set -e

# Each setting keeps a non-empty value inherited from the environment and
# otherwise falls back to the text substituted for the @...@ placeholder.
OSVERSION=${OSVERSION:-@OSVERSION@}
TOOLDIR=${TOOLDIR:-@TOOLDIR@}
# Package $TOOLDIR/riscv32-gnu-toolchain into 50 MiB archive parts.
#
# Reads:  TOOLDIR (parent directory of the toolchain), OSVERSION (destination
#         sub-directory, e.g. "ubuntu/focal").
# Writes: ./riscv32-gnu-toolchain/$OSVERSION/riscv32-gnu-toolchain.tar.bz2.part*
riscv32()
{
    echo "prebuilt riscv32-gnu-toolchain..."
    # expansions are quoted so paths containing spaces are not word-split
    tar -C "$TOOLDIR" -cvjf riscv32-gnu-toolchain.tar.bz2 riscv32-gnu-toolchain
    # split appends aa, ab, ... to the given prefix
    split -b 50M riscv32-gnu-toolchain.tar.bz2 "riscv32-gnu-toolchain.tar.bz2.part"
    mkdir -p "./riscv32-gnu-toolchain/$OSVERSION"
    mv riscv32-gnu-toolchain.tar.bz2.part* "./riscv32-gnu-toolchain/$OSVERSION"
    # the unsplit archive was only an intermediate
    rm riscv32-gnu-toolchain.tar.bz2
}
# Package $TOOLDIR/riscv64-gnu-toolchain into 50 MiB archive parts.
#
# Reads:  TOOLDIR (parent directory of the toolchain), OSVERSION (destination
#         sub-directory, e.g. "ubuntu/focal").
# Writes: ./riscv64-gnu-toolchain/$OSVERSION/riscv64-gnu-toolchain.tar.bz2.part*
riscv64()
{
    echo "prebuilt riscv64-gnu-toolchain..."
    # expansions are quoted so paths containing spaces are not word-split
    tar -C "$TOOLDIR" -cvjf riscv64-gnu-toolchain.tar.bz2 riscv64-gnu-toolchain
    split -b 50M riscv64-gnu-toolchain.tar.bz2 "riscv64-gnu-toolchain.tar.bz2.part"
    mkdir -p "./riscv64-gnu-toolchain/$OSVERSION"
    mv riscv64-gnu-toolchain.tar.bz2.part* "./riscv64-gnu-toolchain/$OSVERSION"
    # the unsplit archive was only an intermediate
    rm riscv64-gnu-toolchain.tar.bz2
}
# Package $TOOLDIR/llvm-vortex into 50 MiB archive parts.
#
# The archive is named "llvm-vortex2" although the packaged directory and the
# destination folder are both "llvm-vortex".
# NOTE(review): presumably this matches the name the install script expects;
# confirm before renaming.
#
# Reads:  TOOLDIR, OSVERSION
# Writes: ./llvm-vortex/$OSVERSION/llvm-vortex2.tar.bz2.part*
llvm()
{
    echo "prebuilt llvm-vortex2..."
    # expansions are quoted so paths containing spaces are not word-split
    tar -C "$TOOLDIR" -cvjf llvm-vortex2.tar.bz2 llvm-vortex
    split -b 50M llvm-vortex2.tar.bz2 "llvm-vortex2.tar.bz2.part"
    mkdir -p "./llvm-vortex/$OSVERSION"
    mv llvm-vortex2.tar.bz2.part* "./llvm-vortex/$OSVERSION"
    # the unsplit archive was only an intermediate
    rm llvm-vortex2.tar.bz2
}
# Package $TOOLDIR/libcrt32 as a single bzip2 tarball.
# Unlike the toolchain archives, the result is stored directly in ./libcrt32
# with no per-OS sub-directory.
#
# Reads:  TOOLDIR
# Writes: ./libcrt32/libcrt32.tar.bz2
libcrt32()
{
    echo "prebuilt libcrt32..."
    # quoted so that a TOOLDIR with spaces stays a single argument
    tar -C "$TOOLDIR" -cvjf libcrt32.tar.bz2 libcrt32
    mkdir -p ./libcrt32
    mv libcrt32.tar.bz2 ./libcrt32
}
# Package $TOOLDIR/libcrt64 as a single bzip2 tarball.
# The result is stored directly in ./libcrt64 with no per-OS sub-directory.
#
# Reads:  TOOLDIR
# Writes: ./libcrt64/libcrt64.tar.bz2
libcrt64()
{
    echo "prebuilt libcrt64..."
    # quoted so that a TOOLDIR with spaces stays a single argument
    tar -C "$TOOLDIR" -cvjf libcrt64.tar.bz2 libcrt64
    mkdir -p ./libcrt64
    mv libcrt64.tar.bz2 ./libcrt64
}
# Package $TOOLDIR/libc32 as a single bzip2 tarball.
# The result is stored directly in ./libc32 with no per-OS sub-directory.
#
# Reads:  TOOLDIR
# Writes: ./libc32/libc32.tar.bz2
libc32()
{
    echo "prebuilt libc32..."
    # quoted so that a TOOLDIR with spaces stays a single argument
    tar -C "$TOOLDIR" -cvjf libc32.tar.bz2 libc32
    mkdir -p ./libc32
    mv libc32.tar.bz2 ./libc32
}
# Package $TOOLDIR/libc64 as a single bzip2 tarball.
# The result is stored directly in ./libc64 with no per-OS sub-directory.
#
# Reads:  TOOLDIR
# Writes: ./libc64/libc64.tar.bz2
libc64()
{
    echo "prebuilt libc64..."
    # quoted so that a TOOLDIR with spaces stays a single argument
    tar -C "$TOOLDIR" -cvjf libc64.tar.bz2 libc64
    mkdir -p ./libc64
    mv libc64.tar.bz2 ./libc64
}
# Package $TOOLDIR/pocl as a single bzip2 tarball named pocl2.tar.bz2.
# NOTE(review): the "2" suffix presumably matches the name the install script
# downloads; confirm before renaming.
#
# Reads:  TOOLDIR, OSVERSION
# Writes: ./pocl/$OSVERSION/pocl2.tar.bz2
pocl()
{
    echo "prebuilt pocl..."
    # expansions are quoted so paths containing spaces are not word-split
    tar -C "$TOOLDIR" -cvjf pocl2.tar.bz2 pocl
    mkdir -p "./pocl/$OSVERSION"
    mv pocl2.tar.bz2 "./pocl/$OSVERSION"
}
# Package $TOOLDIR/verilator as a single bzip2 tarball.
#
# Reads:  TOOLDIR, OSVERSION
# Writes: ./verilator/$OSVERSION/verilator.tar.bz2
verilator()
{
    echo "prebuilt verilator..."
    # expansions are quoted so paths containing spaces are not word-split
    tar -C "$TOOLDIR" -cvjf verilator.tar.bz2 verilator
    mkdir -p "./verilator/$OSVERSION"
    mv verilator.tar.bz2 "./verilator/$OSVERSION"
}
# Package $TOOLDIR/sv2v as a single bzip2 tarball.
#
# Reads:  TOOLDIR, OSVERSION
# Writes: ./sv2v/$OSVERSION/sv2v.tar.bz2
sv2v()
{
    echo "prebuilt sv2v..."
    # expansions are quoted so paths containing spaces are not word-split
    tar -C "$TOOLDIR" -cvjf sv2v.tar.bz2 sv2v
    mkdir -p "./sv2v/$OSVERSION"
    mv sv2v.tar.bz2 "./sv2v/$OSVERSION"
}
# Package $TOOLDIR/yosys into 50 MiB archive parts.
#
# Reads:  TOOLDIR, OSVERSION
# Writes: ./yosys/$OSVERSION/yosys.tar.bz2.part*
yosys()
{
    echo "prebuilt yosys..."
    # expansions are quoted so paths containing spaces are not word-split
    tar -C "$TOOLDIR" -cvjf yosys.tar.bz2 yosys
    split -b 50M yosys.tar.bz2 "yosys.tar.bz2.part"
    mkdir -p "./yosys/$OSVERSION"
    mv yosys.tar.bz2.part* "./yosys/$OSVERSION"
    # the unsplit archive was only an intermediate
    rm yosys.tar.bz2
}
# Print the command-line synopsis to stdout.
# The yosys flag is spelled with two dashes because the argument parser only
# accepts "--yosys".
show_usage()
{
    echo "Setup Pre-built Vortex Toolchain"
    echo "Usage: $0 [--pocl] [--verilator] [--riscv32] [--riscv64] [--llvm] [--libcrt32] [--libcrt64] [--libc32] [--libc64] [--sv2v] [--yosys] [--all] [-h|--help]"
}
# Walk the command line from left to right and run the packaging function
# selected by each flag. Processing stops at the first empty argument or when
# the arguments are exhausted.
while [ -n "$1" ]; do
    case "$1" in
        --pocl)      pocl ;;
        --verilator) verilator ;;
        --riscv32)   riscv32 ;;
        --riscv64)   riscv64 ;;
        --llvm)      llvm ;;
        --libcrt32)  libcrt32 ;;
        --libcrt64)  libcrt64 ;;
        --libc32)    libc32 ;;
        --libc64)    libc64 ;;
        --sv2v)      sv2v ;;
        --yosys)     yosys ;;
        --all)
            # package every component, in this fixed order
            pocl; verilator
            riscv32; riscv64
            llvm
            libcrt32; libcrt64
            libc32; libc64
            sv2v; yosys
            ;;
        -h|--help)
            show_usage
            exit
            ;;
        *)
            # unknown flag: print the synopsis and report failure
            show_usage
            exit 1
            ;;
    esac
    shift
done

View file

@ -17,6 +17,9 @@ import sys
import argparse import argparse
import csv import csv
import re import re
import inspect
configs = None
def parse_args(): def parse_args():
parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.') parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.')
@ -25,7 +28,25 @@ def parse_args():
parser.add_argument('log', help='Input log file') parser.add_argument('log', help='Input log file')
return parser.parse_args() return parser.parse_args()
def parse_simx(log_filename): def load_config(filename):
config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=0x([0-9a-fA-F]+), num_barriers=(\d+)"
with open(filename, 'r') as file:
for line in file:
config_match = re.search(config_pattern, line)
if config_match:
config = {
'num_threads': int(config_match.group(1)),
'num_warps': int(config_match.group(2)),
'num_cores': int(config_match.group(3)),
'num_clusters': int(config_match.group(4)),
'socket_size': int(config_match.group(5)),
'local_mem_base': int(config_match.group(6), 16),
'num_barriers': int(config_match.group(7)),
}
return config
return None
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)" pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"Instr (0x[0-9a-fA-F]+):" instr_pattern = r"Instr (0x[0-9a-fA-F]+):"
opcode_pattern = r"Instr 0x[0-9a-fA-F]+: ([0-9a-zA-Z_\.]+)" opcode_pattern = r"Instr 0x[0-9a-fA-F]+: ([0-9a-zA-Z_\.]+)"
@ -36,19 +57,19 @@ def parse_simx(log_filename):
destination_pattern = r"Dest Reg: (.+)" destination_pattern = r"Dest Reg: (.+)"
uuid_pattern = r"#(\d+)" uuid_pattern = r"#(\d+)"
entries = [] entries = []
with open(log_filename, 'r') as log_file:
instr_data = None instr_data = None
for lineno, line in enumerate(log_file, start=1): for lineno, line in enumerate(log_lines, start=1):
try:
if line.startswith("DEBUG Fetch:"): if line.startswith("DEBUG Fetch:"):
if instr_data: if instr_data:
entries.append(instr_data) entries.append(instr_data)
instr_data = {} instr_data = {}
instr_data["lineno"] = lineno instr_data["lineno"] = lineno
instr_data["PC"] = re.search(pc_pattern, line).group(1) instr_data["PC"] = re.search(pc_pattern, line).group(1)
instr_data["core_id"] = re.search(core_id_pattern, line).group(1) instr_data["core_id"] = int(re.search(core_id_pattern, line).group(1))
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1) instr_data["warp_id"] = int(re.search(warp_id_pattern, line).group(1))
instr_data["tmask"] = re.search(tmask_pattern, line).group(1) instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
instr_data["uuid"] = re.search(uuid_pattern, line).group(1) instr_data["uuid"] = int(re.search(uuid_pattern, line).group(1))
elif line.startswith("DEBUG Instr"): elif line.startswith("DEBUG Instr"):
instr_data["instr"] = re.search(instr_pattern, line).group(1) instr_data["instr"] = re.search(instr_pattern, line).group(1)
instr_data["opcode"] = re.search(opcode_pattern, line).group(1) instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
@ -57,6 +78,9 @@ def parse_simx(log_filename):
instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg
elif line.startswith("DEBUG Dest"): elif line.startswith("DEBUG Dest"):
instr_data["destination"] = re.search(destination_pattern, line).group(1) instr_data["destination"] = re.search(destination_pattern, line).group(1)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
instr_data = None
if instr_data: if instr_data:
entries.append(instr_data) entries.append(instr_data)
return entries return entries
@ -78,13 +102,6 @@ def append_reg(text, value, sep):
sep = True sep = True
return text, sep return text, sep
def append_imm(text, value, sep):
if sep:
text += ", "
text += value
sep = True
return text, sep
def append_value(text, reg, value, tmask_arr, sep): def append_value(text, reg, value, tmask_arr, sep):
text, sep = append_reg(text, reg, sep) text, sep = append_reg(text, reg, sep)
text += "={" text += "={"
@ -98,8 +115,9 @@ def append_value(text, reg, value, tmask_arr, sep):
text += "}" text += "}"
return text, sep return text, sep
def parse_rtlsim(log_filename): def parse_rtlsim(log_lines):
line_pattern = r"\d+: core(\d+)-(decode|issue|commit)" global configs
line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)"
pc_pattern = r"PC=(0x[0-9a-fA-F]+)" pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"instr=(0x[0-9a-fA-F]+)" instr_pattern = r"instr=(0x[0-9a-fA-F]+)"
ex_pattern = r"ex=([a-zA-Z]+)" ex_pattern = r"ex=([a-zA-Z]+)"
@ -108,8 +126,6 @@ def parse_rtlsim(log_filename):
tmask_pattern = r"tmask=(\d+)" tmask_pattern = r"tmask=(\d+)"
wb_pattern = r"wb=(\d)" wb_pattern = r"wb=(\d)"
opds_pattern = r"opds=(\d+)" opds_pattern = r"opds=(\d+)"
use_imm_pattern = r"use_imm=(\d)"
imm_pattern = r"imm=(0x[0-9a-fA-F]+)"
rd_pattern = r"rd=(\d+)" rd_pattern = r"rd=(\d+)"
rs1_pattern = r"rs1=(\d+)" rs1_pattern = r"rs1=(\d+)"
rs2_pattern = r"rs2=(\d+)" rs2_pattern = r"rs2=(\d+)"
@ -121,22 +137,27 @@ def parse_rtlsim(log_filename):
eop_pattern = r"eop=(\d)" eop_pattern = r"eop=(\d)"
uuid_pattern = r"#(\d+)" uuid_pattern = r"#(\d+)"
entries = [] entries = []
with open(log_filename, 'r') as log_file:
instr_data = {} instr_data = {}
for lineno, line in enumerate(log_file, start=1): num_cores = configs['num_cores']
socket_size = configs['socket_size']
num_sockets = (num_cores + socket_size - 1) // socket_size
for lineno, line in enumerate(log_lines, start=1):
try:
line_match = re.search(line_pattern, line) line_match = re.search(line_pattern, line)
if line_match: if line_match:
PC = re.search(pc_pattern, line).group(1) PC = re.search(pc_pattern, line).group(1)
warp_id = re.search(warp_id_pattern, line).group(1) warp_id = int(re.search(warp_id_pattern, line).group(1))
tmask = re.search(tmask_pattern, line).group(1) tmask = re.search(tmask_pattern, line).group(1)
uuid = re.search(uuid_pattern, line).group(1) uuid = int(re.search(uuid_pattern, line).group(1))
core_id = line_match.group(1) cluster_id = int(line_match.group(1))
stage = line_match.group(2) socket_id = int(line_match.group(2))
core_id = int(line_match.group(3))
stage = line_match.group(4)
if stage == "decode": if stage == "decode":
trace = {} trace = {}
trace["uuid"] = uuid trace["uuid"] = uuid
trace["PC"] = PC trace["PC"] = PC
trace["core_id"] = core_id trace["core_id"] = ((((cluster_id * num_sockets) + socket_id) * socket_size) + core_id)
trace["warp_id"] = warp_id trace["warp_id"] = warp_id
trace["tmask"] = reverse_binary(tmask) trace["tmask"] = reverse_binary(tmask)
trace["instr"] = re.search(instr_pattern, line).group(1) trace["instr"] = re.search(instr_pattern, line).group(1)
@ -146,8 +167,6 @@ def parse_rtlsim(log_filename):
trace["rs1"] = re.search(rs1_pattern, line).group(1) trace["rs1"] = re.search(rs1_pattern, line).group(1)
trace["rs2"] = re.search(rs2_pattern, line).group(1) trace["rs2"] = re.search(rs2_pattern, line).group(1)
trace["rs3"] = re.search(rs3_pattern, line).group(1) trace["rs3"] = re.search(rs3_pattern, line).group(1)
trace["use_imm"] = re.search(use_imm_pattern, line).group(1) == "1"
trace["imm"] = re.search(imm_pattern, line).group(1)
instr_data[uuid] = trace instr_data[uuid] = trace
elif stage == "issue": elif stage == "issue":
if uuid in instr_data: if uuid in instr_data:
@ -205,41 +224,65 @@ def parse_rtlsim(log_filename):
del trace["rs1"] del trace["rs1"]
del trace["rs2"] del trace["rs2"]
del trace["rs3"] del trace["rs3"]
del trace["use_imm"]
del trace["imm"]
del trace["issued"] del trace["issued"]
del instr_data[uuid] del instr_data[uuid]
entries.append(trace) entries.append(trace)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
return entries return entries
def write_csv(log_filename, csv_filename, log_type): def write_csv(sublogs, csv_filename, log_type):
with open(csv_filename, 'w', newline='') as csv_file:
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "destination", "operands"]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for sublog in sublogs:
entries = None entries = None
# parse log file # parse sublog
if log_type == "rtlsim": if log_type == "rtlsim":
entries = parse_rtlsim(log_filename) entries = parse_rtlsim(sublog)
elif log_type == "simx": elif log_type == "simx":
entries = parse_simx(log_filename) entries = parse_simx(sublog)
else: else:
print('Error: invalid log type') print('Error: invalid log type')
sys.exit() sys.exit()
# sort entries by uuid # sort entries by uuid
entries.sort(key=lambda x: (int(x['core_id']), int(x['warp_id']), int(x['lineno']))) entries.sort(key=lambda x: (int(x['uuid'])))
for entry in entries: for entry in entries:
del entry['lineno'] del entry['lineno']
# write to CSV
with open(csv_filename, 'w', newline='') as csv_file:
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "operands", "destination"]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for entry in entries: for entry in entries:
writer.writerow(entry) writer.writerow(entry)
def split_log_file(log_filename):
with open(log_filename, 'r') as log_file:
log_lines = log_file.readlines()
sublogs = []
current_sublog = None
for line in log_lines:
if line.startswith("[VXDRV] START"):
if current_sublog is not None:
sublogs.append(current_sublog)
current_sublog = [line]
elif current_sublog is not None:
current_sublog.append(line)
if current_sublog is not None:
sublogs.append(current_sublog)
return sublogs
def main(): def main():
global configs
args = parse_args() args = parse_args()
write_csv(args.log, args.csv, args.type) configs = load_config(args.log)
sublogs = split_log_file(args.log)
write_csv(sublogs, args.csv, args.type)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
# Copyright © 2019-2023 # Copyright 2019-2023
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@ -18,27 +18,33 @@ import time
import threading import threading
import subprocess import subprocess
# This script executes a long-running command while outputing "still running ..." periodically # This script executes a long-running command while printing "still running ..." periodically
# to notify Travis build system that the program has not hanged # to notify Travis build system that the program has not hanged
PING_INTERVAL=300 # 5 minutes PING_INTERVAL=300 # 5 minutes
SLEEP_INTERVAL=1 # 1 second
def monitor(stop): def monitor(stop_event):
wait_time = 0 wait_time = 0
while True: elapsed_time = 0
time.sleep(PING_INTERVAL) while not stop_event.is_set():
wait_time += PING_INTERVAL time.sleep(SLEEP_INTERVAL)
elapsed_time += SLEEP_INTERVAL
if elapsed_time >= PING_INTERVAL:
wait_time += elapsed_time
print(" + still running (" + str(wait_time) + "s) ...") print(" + still running (" + str(wait_time) + "s) ...")
sys.stdout.flush() sys.stdout.flush()
if stop(): elapsed_time = 0
break
def execute(command): def execute(command):
process = subprocess.Popen(command, stdout=subprocess.PIPE) process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while True: while True:
output = process.stdout.readline() output = process.stdout.readline()
if output: if output:
line = output.decode('ascii').rstrip() try:
line = output.decode('utf-8').rstrip()
except UnicodeDecodeError:
line = repr(output) # Safely print raw binary data
print(">>> " + line) print(">>> " + line)
process.stdout.flush() process.stdout.flush()
ret = process.poll() ret = process.poll()
@ -47,19 +53,21 @@ def execute(command):
return -1 return -1
def main(argv): def main(argv):
if not argv:
print("Usage: travis_run.py <command>")
sys.exit(1)
# start monitoring thread # start monitoring thread
stop_monitor = False stop_event = threading.Event()
t = threading.Thread(target = monitor, args =(lambda : stop_monitor, )) t = threading.Thread(target=monitor, args=(stop_event,))
t.start() t.start()
# execute command # execute command
exitcode = execute(argv) exitcode = execute(argv)
print(" + exitcode="+str(exitcode)) print(" + exitcode="+str(exitcode))
sys.stdout.flush()
# terminate monitoring thread # terminate monitoring thread
stop_monitor = True stop_event.set()
t.join() t.join()
sys.exit(exitcode) sys.exit(exitcode)

39
config.mk.in Normal file
View file

@ -0,0 +1,39 @@
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Build settings template. Each @NAME@ placeholder is replaced with a concrete
# value by the configure script. All variables use '?=' so that a value already
# defined (environment or make command line) is not overwritten here.

# Directory containing the configure script (root of the source tree)
VORTEX_HOME ?= @VORTEX_HOME@
# Selects the riscv$(XLEN) toolchain, libc$(XLEN) and libcrt$(XLEN) below
XLEN ?= @XLEN@
# Parent directory of the installed tools referenced below
TOOLDIR ?= @TOOLDIR@
# OS identifier chosen at configure time (e.g. ubuntu/focal)
OSVERSION ?= @OSVERSION@
# Installation directory; filled from configure's --prefix option
INSTALLDIR ?= @INSTALLDIR@
# Tool locations derived from TOOLDIR and XLEN
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LIBC_VORTEX ?= $(TOOLDIR)/libc$(XLEN)
LIBCRT_VORTEX ?= $(TOOLDIR)/libcrt$(XLEN)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv$(XLEN)-gnu-toolchain
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
# Source-tree locations derived from VORTEX_HOME
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party
# Virtual memory support switch; filled from configure's --vm_enable option
VM_ENABLE ?= @VM_ENABLE@

179
configure vendored Executable file
View file

@ -0,0 +1,179 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Remember the directory configure was invoked from: it is the target of the
# generated files and the default installation prefix.
CURRENT_DIR="$(pwd)"
# Print the identifier of the host OS on stdout.
# Known results are "ubuntu/bionic", "ubuntu/focal" and "centos/7"; any other
# system, or a missing /etc/os-release, yields "unsupported".
detect_osversion() {
    local detected="unsupported"

    if [ ! -f /etc/os-release ]; then
        echo "$detected"
        return
    fi

    # os-release is a shell fragment defining ID, VERSION_ID, VERSION_CODENAME
    . /etc/os-release

    if [ "$ID" = "ubuntu" ]; then
        case "$VERSION_CODENAME" in
            bionic|focal) detected="ubuntu/$VERSION_CODENAME" ;;
            # Add new versions as needed
        esac
    elif [ "$ID" = "centos" ]; then
        # Add new versions as needed
        if [ "$VERSION_ID" = "7" ]; then
            detected="centos/7"
        fi
    fi

    echo "$detected"
}
# Populate target_dir with the build scaffolding found under source_dir.
#
# Arguments:
#   $1  source_dir  root of the tree to scan
#   $2  target_dir  root of the tree to populate
# Globals read:
#   SUBDIRS   array of directory patterns relative to source_dir; a leading
#             '!' requests a copy of every file in the matching directories,
#             otherwise only Makefile, common.mk and *.in files are handled
#   SCRIPT_DIR, XLEN, TOOLDIR, OSVERSION, PREFIX, VM_ENABLE
#             values substituted for the @...@ placeholders of *.in templates
#
# "*.in" templates are written without the ".in" suffix after placeholder
# substitution and made executable when their first line is a bash shebang.
# Other files are copied verbatim, unless source and target are the same
# directory, in which case they are already in place.
copy_files() {
    local source_dir="$1"
    local target_dir="$2"

    local same_dir=0
    if [ "$(realpath "$source_dir")" == "$(realpath "$target_dir")" ]; then
        same_dir=1
    fi

    # Process every regular file matching "<src_dir>/<glob>" into dest_dir.
    # The directory is passed separately from the glob so it can stay quoted:
    # only the glob part is subject to pathname expansion, which keeps
    # directories containing whitespace intact.
    copy_and_update() {
        local src_dir="$1"
        local glob="$2"
        local dest_dir="$3"
        local file filename dest_file firstline

        for file in "$src_dir"/$glob; do
            # an unmatched glob stays literal; skip it along with directories
            [ -f "$file" ] || continue
            if [[ "$file" == *.in ]]; then
                filename=$(basename -- "$file")
                dest_file="$dest_dir/${filename%.in}"
                mkdir -p "$dest_dir"
                sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@VM_ENABLE@|$VM_ENABLE|g" "$file" > "$dest_file"
                # apply permissions to bash scripts
                read -r firstline < "$dest_file"
                if [[ "$firstline" =~ ^#!.*bash ]]; then
                    chmod +x "$dest_file"
                fi
            elif [ $same_dir -eq 0 ]; then
                mkdir -p "$dest_dir"
                cp -p "$file" "$dest_dir"
            fi
        done
    }

    local pattern full_copy source_pattern
    for pattern in "${SUBDIRS[@]}"; do
        full_copy=0
        if [[ "$pattern" == !* ]]; then
            full_copy=1
            pattern=${pattern:1}
        fi

        source_pattern="$source_dir/$pattern"
        if [[ "$pattern" == "." ]]; then
            source_pattern=$source_dir
        fi

        # IFS= and -r keep leading/trailing blanks and backslashes of the
        # directory names exactly as find printed them
        find "$source_dir" -type d -path "$source_pattern" 2>/dev/null | while IFS= read -r dir; do
            # path of the directory relative to source_dir, without leading '/'
            local rel_path="${dir#"$source_dir"}"
            rel_path="${rel_path#/}"
            local full_target_dir="$target_dir/$rel_path"

            if [ $full_copy -eq 1 ]; then
                copy_and_update "$dir" "*" "$full_target_dir"
            else
                copy_and_update "$dir" "Makefile" "$full_target_dir"
                copy_and_update "$dir" "common.mk" "$full_target_dir"
                copy_and_update "$dir" "*.in" "$full_target_dir"
            fi
        done
    done
}
###############################################################################
# default configuration parameters
default_xlen=32
default_tooldir=$HOME/tools
default_osversion=$(detect_osversion)
default_prefix=$CURRENT_DIR
default_vm=0

# Load defaults from an existing config.mk in the current directory so that a
# re-run of configure keeps the previously chosen settings.
# Lines have the form "NAME ?= value"; splitting on '=' leaves "NAME ?" in key.
if [ -f "config.mk" ]; then
    # '|| [ -n "$key" ]' also processes a last line without trailing newline
    while IFS='=' read -r key value || [ -n "$key" ]; do
        value=${value//[@]/} # Remove placeholder characters
        value="${value#"${value%%[![:space:]]*}"}" # Remove leading whitespace
        value="${value%"${value##*[![:space:]]}"}" # Remove trailing whitespace
        case $key in
            XLEN\ ?*)      default_xlen=${value//\?=/} ;;
            TOOLDIR\ ?*)   default_tooldir=${value//\?=/} ;;
            OSVERSION\ ?*) default_osversion=${value//\?=/} ;;
            # config.mk stores the prefix under INSTALLDIR; PREFIX is still
            # accepted for files that use that spelling
            INSTALLDIR\ ?* | PREFIX\ ?*) default_prefix=${value//\?=/} ;;
            VM_ENABLE\ ?*) default_vm=${value//\?=/} ;;
        esac
    done < config.mk
fi

# set configuration parameters: a non-empty environment value wins over the
# default computed above
XLEN=${XLEN:-$default_xlen}
TOOLDIR=${TOOLDIR:-$default_tooldir}
OSVERSION=${OSVERSION:-$default_osversion}
PREFIX=${PREFIX:-$default_prefix}
VM_ENABLE=${VM_ENABLE:-$default_vm}
# parse command line arguments

# Print the option summary to stdout and terminate the script with status 1.
# The synopsis lists every option accepted by the parser, including --prefix,
# --vm_enable and -h|--help.
usage() {
    echo "Usage: $0 [--xlen=<value>] [--tooldir=<path>] [--osversion=<version>] [--prefix=<path>] [--vm_enable=<value>] [-h|--help]"
    echo " --xlen=<value> Set the XLEN value (default: 32)"
    echo " --tooldir=<path> Set the TOOLDIR path (default: $HOME/tools)"
    echo " --osversion=<version> Set the OS Version (default: $(detect_osversion))"
    echo " --prefix=<path> Set installation directory (default: current directory)"
    echo " --vm_enable=<value> Enable Virtual Memory support (default: 0)"
    exit 1
}
# Apply the command-line options, one "--name=value" argument at a time.
# They are processed after the environment/config.mk defaults and therefore
# override them. usage() terminates the script, so -h and unknown arguments
# never reach the shift.
until [[ "$#" -eq 0 ]]; do
    case "$1" in
        --xlen=*)
            XLEN="${1#*=}"
            ;;
        --tooldir=*)
            TOOLDIR="${1#*=}"
            ;;
        --osversion=*)
            OSVERSION="${1#*=}"
            ;;
        --prefix=*)
            PREFIX="${1#*=}"
            ;;
        --vm_enable=*)
            VM_ENABLE="${1#*=}"
            ;;
        -h|--help)
            usage
            ;;
        *)
            echo "Unknown parameter passed: $1"
            usage
            ;;
    esac
    shift
done
# Stop when OSVERSION is still the literal "unsupported", i.e. neither
# detection, config.mk, the environment nor --osversion supplied a real value.
[ "$OSVERSION" != "unsupported" ] || {
    echo "Error: Unsupported OS."
    exit -1
}

# Absolute directory containing this script (the source tree root)
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
THIRD_PARTY_DIR="$SCRIPT_DIR/third_party"

# Directory patterns handed to copy_files; a leading '!' marks directories
# whose files are all copied
SUBDIRS=(
    "."
    "!ci"
    "!perf"
    "hw*"
    "kernel*"
    "runtime*"
    "sim*"
    "tests*"
)

# Generate the build tree in the directory configure was invoked from
copy_files "$SCRIPT_DIR" "$CURRENT_DIR"

79
docs/altera_fpga_guide.md Normal file
View file

@ -0,0 +1,79 @@
# FPGA Startup and Configuration Guide
OPAE Environment Setup
----------------------
$ source /opt/inteldevstack/init_env_user.sh
$ export OPAE_HOME=/opt/opae/1.1.2
$ export PATH=$OPAE_HOME/bin:$PATH
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
OPAE Build
------------------
The FPGA has the following configuration options:
- DEVICE_FAMILY=arria10 | stratix10
- NUM_CORES=#n
Command line:
$ cd hw/syn/altera/opae
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
Setting TARGET=ase will build the project for simulation using Intel ASE.
OPAE Build Configuration
------------------------
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per core
- `NUM_THREADS`: Number of threads per warp
- `PERF_ENABLE`: enable the use of all profile counters
You configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
OPAE Build Progress
-------------------
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 <build_dir>/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean
The bitstream file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
Signing the bitstream and Programming the FPGA
----------------------------------------------
$ cd <build_dir>
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
Sample FPGA Run Test
--------------------
Ensure you have the correct opae runtime for the FPGA target
$ make -C runtime/opae clean
$ TARGET=FPGA make -C runtime/opae
Run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"

Binary file not shown.

Before

Width:  |  Height:  |  Size: 60 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 207 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 77 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 67 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 463 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 517 KiB

View file

@ -2,69 +2,26 @@
The Vortex Cache Sub-system has the following main properties: The Vortex Cache Sub-system has the following main properties:
- High-bandwidth with bank parallelism - High-bandwidth transfer with Multi-bank parallelism
- Snoop protocol to flush data for CPU access - Non-blocking pipelined write-through cache architecture with per-bank MSHR
- Generic design: Dcache, Icache, Shared Memory, L2 cache, L3 cache - Configurable design: Dcache, Icache, L2 cache, L3 cache
### Cache Hierarchy ### Cache Microarchitecture
![Image of Cache Hierarchy](./assets/img/cache_hierarchy.png) ![Image of Cache Hierarchy](./assets/img/cache_microarchitecture.png)
- Cache can be configured to be any level in the hierarchy The Vortex cache is comprised of multiple parallel banks. It is comprised of the following modules:
- Caches communicate via snooping - **Bank request dispatch crossbar**: assigns a bank to incoming requests and resolve collision using stalls.
- Cache flush from AFU is passed down the hierarchy - **Bank response merge crossbar**: merges result from banks and forward to the core response.
- **Memory request multiplexer**: arbitrates bank memory requests
- **Memory response demultiplexer**: forwards memory response to the corresponding bank.
- **Flush Unit**: performs tag memory initialization.
### VX_cache.v (Top Module) Incoming requests entering the cache are sent to a dispatch crossbar that selects the corresponding bank for each request, resolving bank collisions with stalls. The result output of each bank is merged back into the outgoing response port via a merge crossbar. Each bank integrates a non-blocking pipeline with a local Miss Status Holding Register (MSHR) to reduce the miss rate. The bank pipeline consists of the following stages:
VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory. - **Schedule**: Selects the next request into the pipeline from the incoming core request, memory fill, or the MSHR entry, with priority given to the latter.
- **Tag Access**: single-port read/write access to the tag store.
- **Data Access**: Single-port read/write access to the data store.
- **Response Handling**: Core response back to the core.
![Image of Vortex Cache](./assets/img/vortex_cache_top_module.png) Deadlocks inside the cache can occur when the MSHR is full and a new request is already in the pipeline. It can also occur when the memory request queue is full, and there is an incoming memory response. The cache mitigates MSHR deadlocks by using an early full signal before a new request is issued and similarly mitigates memory deadlocks by ensuring that its request queue never fills up.
- Configurable (Cache size, number of banks, bank line size, etc.)
- I/O signals
- Core Request
- Core Rsp
- DRAM Req
- DRAM Rsp
- Snoop Rsp
- Snoop Rsp
- Snoop Forwarding Out
- Snoop Forwarding In
- Bank Select
- Assigns valid and ready signals for each bank
- Snoop Forwarder
- DRAM Request Arbiter
- Prepares cache response for communication with DRAM
- Snoop Response Arbiter
- Sends snoop response
- Core Response Merge
- Cache accesses one line at a time. As a result, each request may not come back in the same response. This module tries to recombine the responses by thread ID.
### VX_cache_bank.v
VX_cache_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
![Image of Vortex Cache Bank](./assets/img/vortex_bank.png)
- Allows for high throughput
- Each bank contains queues to hold requests to the cache
- I/O signals
- Core request
- Core Response
- DRAM Fill Requests
- DRAM Fill Response
- DRAM WB Requests
- Snp Request
- Snp Response
- Request Priority: DRAM fill, miss reserve, core request, snoop request
- Snoop Request Queue
- DRAM Fill Queue
- Core Req Arbiter
- Requests to be processed by the bank
- Tag Data Store
- Registers for valid, dirty, dirtyb, tag, and data
- Length of registers determined by lines in the bank
- Tag Data Access:
- I/O: stall, snoop info, force request miss
- Writes to cache or sends read response; hit or miss determined here
- A missed request goes to the miss reserve if it is not a snoop request or DRAM fill

View file

@ -61,5 +61,8 @@ We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can u
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log $ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log
$ ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv $ ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
The first column in the CSV trace is UUID (universal unique identifier) of the instruction and the content is sorted by the UUID. You can use the UUID to trace the same instruction running on either the RTL hw or SimX simulator. $ diff trace_rtlsim.csv trace_simx.csv
The first column in the CSV trace is UUID (universal unique identifier) of the instruction and the content is sorted by the UUID.
You can use the UUID to trace the same instruction running on either the RTL hw or SimX simulator.
This can be very effective if you want to use SimX to debugging your RTL hardware by comparing CSV traces. This can be very effective if you want to use SimX to debugging your RTL hardware by comparing CSV traces.

View file

@ -1,78 +1,51 @@
# Environment Setup # Environment Setup
These instructions apply to the development vortex repo using the _updated toolchain_. The updated toolchain is considered to be any commit of `master` pulled from _July 2, 2023_ onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`. These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
> Note: As it stands right now, there a few test suites which are not working due to this toolchain migration. We are working to determine an exact list of which ones are working and which ones are not. For now, if the repo builds at a minimum, then you can consider all these steps to have worked successfully. ## Set Up on Your Own System
## Choosing an Development Environment The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md).
There are three primary environments you can use. Each has its own pros and cons. Refer to this section to help you determine which environment best suits your needs. ## Servers for Georgia Tech Students and Collaborators
1. Volvo
2. Docker
3. Local
### Volvo ### Volvo
Volvo is a server provided by Georgia Tech. As such, it provides high performance compute, but you need valid credentials to access it. If you don't already have credentials, you can get in contact with your mentor to ask about setting your account up. Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up.
Pros: Setup on Volvo:
1. Native x86_64 architecture, AMD EPYC 7702P 64-Core Processor (_fast_) 1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. Packages and difficult configurations are already done for you 2. `ssh volvo.cc.gatech.edu`
3. Consistent environment as others, allowing for easier troubleshooting 3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
4. Just need to SSH into Volvo, minimal impact on local computer resources 4. `source /nethome/software/set_vortex_env.sh` to set up the necessary environment variables.
5. VScode remote development tools are phenomenal over SSH 5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
Cons: ### Nio
1. Volvo is accessed via gatech vpn, external contributors might encounter issues with it -- especially from other university networks Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio.
2. Account creation is not immediate and is subject to processing time
3. Volvo might have outtages (_pretty uncommon_)
4. SSH development requires internet and other remote development tools (_vscode works!_)
### Docker Setup on Nio:
Docker allows for isolated pre-built environments to be created, shared and used. They are much more resource efficient than a Virtual Machine, and have great tooling and support available. The main motivation for Docker is bringing a consistent development environment to your local computer, across all platforms. 1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh nio.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
4. `source /opt/set_vortex_env_dev.sh` to set up the necessary environment variables.
5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
Pros: ## Docker (Experimental)
1. If you are native to x86_64, the container will also run natively, yielding better performance. However, if you have aarch64 (arm) processor, you can still run the Docker container without configuration changes. Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported.
2. Consistent environment as others, allowing for easier troubleshooting
3. Works out of the box, just have a working installation of Docker
4. Vortex uses a build system, so once you build the repo once, only new code changes need to be recompiled
5. Docker offers helpful tools and extensions to monitor the performance of your container
Cons: ### Setup with Docker
1. If you are using an arm processor, the container will be run in emulation mode, so it will inherently run slower, as it needs to translate all the x86_64 instructions. It's still usable on Apple Silicon, however. 1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
2. Limited to your computer's performance, and Vortex is a large repo to build 2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo.
3. Will utilize a few gigabytes of storage on your computer for saving binaries to run the container 3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .`
4. Run a container based on the image: `docker run --rm -v ./:/root/vortex/ -it --name vtx-dev --privileged=true --platform=linux/amd64 vortex`
### Local
You can reverse engineer the Dockerfile and scripts above to get a working environment setup locally. This option is for experienced users, who have already considered the pros and cons of Volvo and Docker.
## Setup on Volvo
1. Clone Repo Recursively: `git clone --recursive https://github.com/vortexgpgpu/vortex-dev.git`
2. Source `/opt/set_vortex_env_dev.sh` to initialize pre-installed toolchain
3. `make -s` in `vortex-dev` root directory
4. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
## Setup with Docker
Currently the Dockerfile is not included with the official vortex-dev repository, however you can quickly add it to repo and get started.
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex-dev.git`
2. Download a copy of `Dockerfile.dev` and place it in the root of the repo.
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex-dev -f Dockerfile.dev .`
4. Run a container based on the image: `docker run --rm -v ./:/root/vortex-dev/ -it --name vtx-dev --privileged=true --platform=linux/amd64 vortex-dev`
5. Install the toolchain `./ci/toolchain_install.sh --all` (once per container) 5. Install the toolchain `./ci/toolchain_install.sh --all` (once per container)
6. `make -s` in `vortex-dev` root directory 6. `make -s` in `vortex` root directory
7. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood` 7. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
### Additional Docker Commands You may exit from a container and resume a container you have exited or start a second terminal session `docker exec -it <container-name> bash`
- Exit from a container (does not stop or remove it)
- Resume a container you have exited or start a second terminal session `docker exec -it <container-name> bash`

View file

@ -55,7 +55,7 @@ If the build fails and you need to restart it, clean up the build folder using t
The file `vortex_afu.gbs` should exist when the build is done: The file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/vortex_afu.gbs $ ls -lsa <build_dir>/synth/vortex_afu.gbs
Signing the bitstream and Programming the FPGA Signing the bitstream and Programming the FPGA
@ -70,5 +70,5 @@ FPGA sample test running OpenCL sgemm kernel
Run the following from the Vortex root directory Run the following from the Vortex root directory
$ ./ci/blackbox.sh --driver=fpga --app=sgemm --args="-n64" $ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"

View file

@ -7,13 +7,15 @@
- [Cache Subsystem](cache_subsystem.md) - [Cache Subsystem](cache_subsystem.md)
- [Software](software.md) - [Software](software.md)
- [Simulation](simulation.md) - [Simulation](simulation.md)
- [FPGA Setup Guide](fpga_setup.md) - [Altera FPGA Setup Guide](altera_fpga_guide.md)
- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md)
- [Debugging](debugging.md) - [Debugging](debugging.md)
- [Useful Links](references.md) - [Useful Links](references.md)
## Installation ## Installation
- Refer to the build instructions in [README](../README.md). - For the different environments Vortex supports, [read this document](environment_setup.md).
- To install on your own system, [follow this document](install_vortex.md).
## Quick Start Scenarios ## Quick Start Scenarios

81
docs/install_vortex.md Normal file
View file

@ -0,0 +1,81 @@
# Installing and Setting Up the Vortex Environment
## Ubuntu 18.04, 20.04
1. Install the following dependencies:
```
sudo apt-get install build-essential zlib1g-dev libtinfo-dev libncurses5 uuid-dev libboost-serialization-dev libpng-dev libhwloc-dev
```
2. Upgrade GCC to 11:
```
sudo apt-get install gcc-11 g++-11
```
Multiple gcc versions on Ubuntu can be managed with update-alternatives, e.g.:
```
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11
```
3. Download the Vortex codebase:
```
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
```
4. Build Vortex
```
$ cd vortex
$ mkdir -p build
$ cd build
$ ../configure --xlen=32 --tooldir=$HOME/tools
$ ./ci/toolchain_install.sh --all
$ source ./ci/toolchain_env.sh
$ make -s
```
## RHEL 8
Note: depending on the system, some of the toolchain may need to be recompiled for non-Ubuntu Linux. The source for the tools can be found [here](https://github.com/vortexgpgpu/).
1. Install the following dependencies:
```
sudo yum install libpng-devel boost boost-devel boost-serialization libuuid-devel opencl-headers hwloc hwloc-devel gmp-devel compat-hwloc1
```
2. Upgrade GCC to 11:
```
sudo yum install gcc-toolset-11
```
Multiple gcc versions on Red Hat can be managed with `scl`, e.g. `scl enable gcc-toolset-11 bash` starts a shell that uses GCC 11.
3. Install MPFR 4.2.0:
Download [the source](https://ftp.gnu.org/gnu/mpfr/) and follow [the installation documentation](https://www.mpfr.org/mpfr-current/mpfr.html#How-to-Install).
4. Download the Vortex codebase:
```
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
```
5. Build Vortex
```
$ cd vortex
$ mkdir -p build
$ cd build
$ ../configure --xlen=32 --tooldir=$HOME/tools
$ ./ci/toolchain_install.sh --all
$ source ./ci/toolchain_env.sh
$ make -s
```

View file

@ -24,71 +24,57 @@ Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with
- Control the number of warps to activate during execution - Control the number of warps to activate during execution
- `WSPAWN` *count, addr*: activate count warps and jump to addr location - `WSPAWN` *count, addr*: activate count warps and jump to addr location
- **Control-Flow Divergence** - **Control-Flow Divergence**
- Control threads to activate when a branch diverges - Control thread activation when a branch diverges
- `SPLIT` *predicate*: apply 'taken' predicate thread mask adn save 'not-taken' into IPDOM stack - `SPLIT` *taken, predicate*: apply predicate thread mask and save current state into IPDOM stack
- `JOIN`: restore 'not-taken' thread mask - `JOIN`: pop IPDOM stack to restore thread mask
- `PRED` *predicate, restore_mask*: thread predicate instruction
- **Warp Synchronization** - **Warp Synchronization**
- `BAR` *id, count*: stall warps entering barrier *id* until count is reached - `BAR` *id, count*: stall warps entering barrier *id* until count is reached
### Vortex Pipeline/Datapath ### Vortex Pipeline/Datapath
![Image of Vortex Microarchitecture](./assets/img/vortex_microarchitecture_v2.png) ![Image of Vortex Microarchitecture](./assets/img/vortex_microarchitecture.png)
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB. Vortex has a 6-stage pipeline:
- **Schedule**
- Warp Scheduler
- Schedule the next PC into the pipeline
- Track stalled, active warps
- IPDOM Stack
- Save split/join states for divergent threads
- Inflight Tracker
- Track in-flight instructions
- **Fetch** - **Fetch**
- Warp Scheduler - Retrieve instructions from memory
- Track stalled & active warps, resolve branches and barriers, maintain split/join IPDOM stack - Handle I-cache requests/responses
- Instruction Cache
- Retrieve instruction from cache, issue I-cache requests/responses
- **Decode** - **Decode**
- Decode fetched instructions, notify warp scheduler when the following instructions are decoded: - Decode fetched instructions
- Branch, tmc, split/join, wspawn - Notify warp scheduler on control instructions
- Precompute used_regs mask (needed for Issue stage)
- **Issue** - **Issue**
- Scheduling
- In-order issue (operands/execute unit ready), out-of-order commit
- IBuffer - IBuffer
- Store fetched instructions, separate queues per-warp, selects next warp through round-robin scheduling - Store decoded instructions in separate per-warp queues
- Scoreboard - Scoreboard
- Track in-use registers - Track in-use registers
- GPRs (General-Purpose Registers) stage - Check register use for decoded instructions
- Fetch issued instruction operands and send operands to execute unit - Operands Collector
- Fetch the operands for issued instructions from the register file
- **Execute** - **Execute**
- ALU Unit - ALU Unit
- Single-cycle operations (+,-,>>,<<,&,|,^), Branch instructions (Share ALU resources) - Handle arithmetic and branch operations
- MULDIV Unit
- Multiplier - done in 2 cycles
- Divider - division and remainder, done in 32 cycles
- Implements serial alogrithm (Stalls the pipeline)
- FPU Unit - FPU Unit
- Multi-cycle operations, uses `FPnew` Library on ASIC, uses hard DSPs on FPGA - Handle floating-point operations
- CSR Unit
- Store constant status registers - device caps, FPU status flags, performance counters
- Handle external CSR requests (requests from host CPU)
- LSU Unit - LSU Unit
- Handle load/store operations, issue D-cache requests, handle D-cache responses - Handle load/store operations
- Commit load responses - saves storage, Scoreboard tracks completion - SFU Unit
- GPGPU Unit - Handle warp control operations
- Handle GPGPU instructions - Handle Control Status Registers (CSRs) operations
- TMC, WSPAWN, SPLIT, BAR
- JOIN is handled by Warp Scheduler (upon SPLIT response)
- **Commit** - **Commit**
- Commit - Write result back to the register file and update the Scoreboard.
- Update CSR flags, update performance counters
- Writeback ### Vortex clustering architecture
- Write result back to GPRs, notify Scoreboard (release in-use register), select candidate instruction (ALU unit has highest priority) - Sockets
- **Clustering** - Grouping multiple cores sharing L1 cache
- Group mulitple cores into clusters (optionally share L2 cache) - Clusters
- Group multiple clusters (optionally share L3 cache) - Grouping of sockets sharing L2 cache
- Configurable at build time
- Default configuration:
- #Clusters = 1
- #Cores = 4
- #Warps = 4
- #Threads = 4
- **FPGA AFU Interface**
- Manage CPU-GPU comunication
- Query devices caps, load kernel instructions and resource buffers, start kernel execution, read destination buffers
- Local Memory - GPU access to local DRAM
- Reserved I/O addresses - redirect to host CPU, console output

View file

@ -20,7 +20,7 @@ Running tests under specific drivers (rtlsim,simx,fpga) is done using the script
- *Cores* - used to specify the number of cores (processing element containing multiple warps) within a configuration. - *Cores* - used to specify the number of cores (processing element containing multiple warps) within a configuration.
- *Warps* - used to specify the number of warps (collection of concurrent hardware threads) within a configuration. - *Warps* - used to specify the number of warps (collection of concurrent hardware threads) within a configuration.
- *Threads* - used to specify the number of threads (smallest unit of computation) within a configuration. - *Threads* - used to specify the number of threads (smallest unit of computation) within a configuration.
- *L2cache* - used to enable the shard l2cache among the Vortex cores. - *L2cache* - used to enable the shared l2cache among the Vortex cores.
- *L3cache* - used to enable the shared l3cache among the Vortex clusters. - *L3cache* - used to enable the shared l3cache among the Vortex clusters.
- *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, opae, xrt, simx). - *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, opae, xrt, simx).
- *Debug* - used to enable debug mode for the Vortex simulation. - *Debug* - used to enable debug mode for the Vortex simulation.

View file

@ -32,16 +32,21 @@ You can execute the default opncl suite by running the following commands at the
$ make -C tests/opencl run-simx $ make -C tests/opencl run-simx
$ make -C tests/opencl run-rtlsim $ make -C tests/opencl run-rtlsim
## Creating Your Own Regression Tests ## Creating Your Own Regression Test
- Inside `test/` you will find a series of folders which are named based on what they test
- You can view the tests to see which ones have tests similar to what you are trying to create new tests for
- once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test
- `testcases.h` contains each of the test case templates
- `main.cpp` contains the implementation of each of the test cases and builds a test suite of all the tests cases you want
Compile the test case: `make -C tests/regression/<testcase-name>/ clean-all && make -C tests/regression/<testcase-name>/` Inside `tests/regression` you will find a series of folders which are named based on what they test.
You can browse the existing tests to find one that is similar to the test you want to create.
Once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test.
A regression test typically implements the following files:
- ***kernel.cpp*** contains the GPU kernel code.
- ***main.cpp*** contains the host CPU code.
- ***Makefile*** defines the compiler build commands for the CPU and GPU binaries.
Run the test case: `./ci/blackbox.sh --driver=simx --cores=4 --app=<testcase-name> --debug` Sync your build folder: `$ ../configure`
Compile your test: `$ make -C tests/regression/<test-name>`
Run your test: `$ ./ci/blackbox.sh --driver=simx --app=<test-name> --debug`
## Adding Your Tests to the CI Pipeline ## Adding Your Tests to the CI Pipeline
see `continuous_integration.md` See `continuous_integration.md`

36
docs/xilinx_fpga_guide.md Normal file
View file

@ -0,0 +1,36 @@
# FPGA Startup and Configuration Guide
XRT Environment Setup
----------------------
$ source /opt/xilinx/Vitis/2023.1/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
Check Installed FPGA Platforms
------------------------------
$ platforminfo -l
Build FPGA image
----------------
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make
This will run the synthesis under a new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
Sample FPGA Run Test
--------------------
Ensure you have the correct XRT runtime for the FPGA target
$ make -C runtime/xrt clean
$ TARGET=hw make -C runtime/xrt
Run the following from your Vortex build directory
$ TARGET=hw FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128"

2
hw/.gitignore vendored
View file

@ -1,2 +0,0 @@
VX_config.h
VX_types.h

View file

@ -1,5 +1,9 @@
RTL_DIR=./rtl ROOT_DIR := $(realpath ..)
SCRIPT_DIR=./scripts include $(ROOT_DIR)/config.mk
HW_DIR := $(VORTEX_HOME)/hw
SCRIPT_DIR := $(HW_DIR)/scripts
RTL_DIR := $(HW_DIR)/rtl
all: config all: config
@ -12,6 +16,7 @@ VX_types.h: $(RTL_DIR)/VX_types.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h $(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
clean: clean:
$(MAKE) -C unittest clean
rm -f VX_config.h VX_types.h rm -f VX_config.h VX_types.h
.PHONY: VX_config.h VX_types.h .PHONY: VX_config.h VX_types.h

View file

@ -54,7 +54,7 @@ extern "C" {
} }
inline uint64_t nan_box(uint32_t value) { inline uint64_t nan_box(uint32_t value) {
#ifdef FPU_RV64F #ifdef XLEN_64
return value | 0xffffffff00000000; return value | 0xffffffff00000000;
#else #else
return value; return value;
@ -62,7 +62,7 @@ inline uint64_t nan_box(uint32_t value) {
} }
inline bool is_nan_boxed(uint64_t value) { inline bool is_nan_boxed(uint64_t value) {
#ifdef FPU_RV64F #ifdef XLEN_64
return (uint32_t(value >> 32) == 0xffffffff); return (uint32_t(value >> 32) == 0xffffffff);
#else #else
__unused (value); __unused (value);
@ -71,10 +71,9 @@ inline bool is_nan_boxed(uint64_t value) {
} }
inline int64_t check_boxing(int64_t a) { inline int64_t check_boxing(int64_t a) {
if (!is_nan_boxed(a)) { if (is_nan_boxed(a))
return nan_box(0x7fc00000); // NaN
}
return a; return a;
return nan_box(0x7fc00000); // NaN
} }
void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) { void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {

View file

@ -14,8 +14,6 @@
`ifndef FLOAT_DPI_VH `ifndef FLOAT_DPI_VH
`define FLOAT_DPI_VH `define FLOAT_DPI_VH
`include "VX_config.vh"
import "DPI-C" function void dpi_fadd(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags); import "DPI-C" function void dpi_fadd(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsub(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags); import "DPI-C" function void dpi_fsub(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmul(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags); import "DPI-C" function void dpi_fmul(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);

View file

@ -21,8 +21,6 @@
#include "svdpi.h" #include "svdpi.h"
#include "verilated_vpi.h" #include "verilated_vpi.h"
#include "uuid_gen.h"
#ifdef XLEN_64 #ifdef XLEN_64
#define iword_t int64_t #define iword_t int64_t
#define uword_t uint64_t #define uword_t uint64_t
@ -50,7 +48,7 @@ extern "C" {
void dpi_trace_start(); void dpi_trace_start();
void dpi_trace_stop(); void dpi_trace_stop();
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC); uint64_t dpi_uuid_gen(bool reset, int wid);
} }
bool sim_trace_enabled(); bool sim_trace_enabled();
@ -209,24 +207,14 @@ void dpi_trace_stop() {
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
std::unordered_map<uint32_t, std::shared_ptr<vortex::UUIDGenerator>> g_uuid_gens; std::unordered_map<uint32_t, uint32_t> g_uuid_gens;
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC) { uint64_t dpi_uuid_gen(bool reset, int wid) {
if (reset) { if (reset) {
g_uuid_gens.clear(); g_uuid_gens.clear();
return 0; return 0;
} }
std::shared_ptr<vortex::UUIDGenerator> uuid_gen; uint32_t instr_uuid = g_uuid_gens[wid]++;
auto it = g_uuid_gens.find(wid); uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid;
if (it == g_uuid_gens.end()) {
uuid_gen = std::make_shared<vortex::UUIDGenerator>();
g_uuid_gens.emplace(wid, uuid_gen);
} else {
uuid_gen = it->second;
}
uint32_t instr_uuid = uuid_gen->get_uuid(PC);
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (wid << 16) | instr_id;
return uuid; return uuid;
} }

View file

@ -14,8 +14,6 @@
`ifndef UTIL_DPI_VH `ifndef UTIL_DPI_VH
`define UTIL_DPI_VH `define UTIL_DPI_VH
`include "VX_config.vh"
`ifdef XLEN_64 `ifdef XLEN_64
`define INT_TYPE longint `define INT_TYPE longint
`else `else
@ -32,6 +30,6 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve
import "DPI-C" function void dpi_trace_start(); import "DPI-C" function void dpi_trace_start();
import "DPI-C" function void dpi_trace_stop(); import "DPI-C" function void dpi_trace_stop();
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid, input longint PC); import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid);
`endif `endif

View file

@ -14,7 +14,8 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_cluster import VX_gpu_pkg::*; #( module VX_cluster import VX_gpu_pkg::*; #(
parameter CLUSTER_ID = 0 parameter CLUSTER_ID = 0,
parameter `STRING INSTANCE_ID = ""
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
@ -32,17 +33,22 @@ module VX_cluster import VX_gpu_pkg::*; #(
// Memory // Memory
VX_mem_bus_if.master mem_bus_if, VX_mem_bus_if.master mem_bus_if,
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status // Status
output wire busy output wire busy
); );
`ifdef SCOPE `ifdef SCOPE
localparam scope_socket = 0; localparam scope_socket = 0;
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS); `SCOPE_IO_SWITCH (`NUM_SOCKETS);
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
@ -54,7 +60,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_gbar_arb #( VX_gbar_arb #(
.NUM_REQS (`NUM_SOCKETS), .NUM_REQS (`NUM_SOCKETS),
.OUT_REG ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure .OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
) gbar_arb ( ) gbar_arb (
.clk (clk), .clk (clk),
.reset (gbar_reset), .reset (gbar_reset),
@ -69,18 +75,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (gbar_reset), .reset (gbar_reset),
.gbar_bus_if (gbar_bus_if) .gbar_bus_if (gbar_bus_if)
); );
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_l2cache;
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l2cache = perf_l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(
@ -91,7 +86,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
`RESET_RELAY (l2_reset, reset); `RESET_RELAY (l2_reset, reset);
VX_cache_wrap #( VX_cache_wrap #(
.INSTANCE_ID ("l2cache"), .INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)),
.CACHE_SIZE (`L2_CACHE_SIZE), .CACHE_SIZE (`L2_CACHE_SIZE),
.LINE_SIZE (`L2_LINE_SIZE), .LINE_SIZE (`L2_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS), .NUM_BANKS (`L2_NUM_BANKS),
@ -101,19 +96,21 @@ module VX_cluster import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`L2_CRSQ_SIZE), .CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE), .MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE), .MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE), .MREQ_SIZE (`L2_WRITEBACK ? `L2_MSHR_SIZE : `L2_MREQ_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH), .TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1), .WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH), .UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2), .CORE_OUT_BUF (2),
.MEM_OUT_REG (2), .MEM_OUT_BUF (2),
.NC_ENABLE (1), .NC_ENABLE (1),
.PASSTHRU (!`L2_ENABLED) .PASSTHRU (!`L2_ENABLED)
) l2cache ( ) l2cache (
.clk (clk), .clk (clk),
.reset (l2_reset), .reset (l2_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_l2cache), .cache_perf (mem_perf_tmp_if.l2cache),
`endif `endif
.core_bus_if (per_socket_mem_bus_if), .core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if) .mem_bus_if (mem_bus_if)
@ -121,13 +118,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
wire [`NUM_SOCKETS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_socket_sim_wb_value;
assign sim_ebreak = per_socket_sim_ebreak[0];
assign sim_wb_value = per_socket_sim_wb_value[0];
`UNUSED_VAR (per_socket_sim_ebreak)
`UNUSED_VAR (per_socket_sim_wb_value)
VX_dcr_bus_if socket_dcr_bus_tmp_if(); VX_dcr_bus_if socket_dcr_bus_tmp_if();
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END); assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr; assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
@ -135,17 +125,20 @@ module VX_cluster import VX_gpu_pkg::*; #(
wire [`NUM_SOCKETS-1:0] per_socket_busy; wire [`NUM_SOCKETS-1:0] per_socket_busy;
VX_dcr_bus_if socket_dcr_bus_if();
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1)); `BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
// Generate all sockets // Generate all sockets
for (genvar i = 0; i < `NUM_SOCKETS; ++i) begin for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : sockets
`RESET_RELAY (socket_reset, reset); `RESET_RELAY (socket_reset, reset);
VX_socket #( VX_socket #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i) .SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
.INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id))
) socket ( ) socket (
`SCOPE_IO_BIND (scope_socket+i) `SCOPE_IO_BIND (scope_socket+socket_id)
.clk (clk), .clk (clk),
.reset (socket_reset), .reset (socket_reset),
@ -155,18 +148,16 @@ module VX_cluster import VX_gpu_pkg::*; #(
.dcr_bus_if (socket_dcr_bus_if), .dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[i]), .mem_bus_if (per_socket_mem_bus_if[socket_id]),
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]), .gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
`endif `endif
.sim_ebreak (per_socket_sim_ebreak[i]), .busy (per_socket_busy[socket_id])
.sim_wb_value (per_socket_sim_wb_value[i]),
.busy (per_socket_busy[i])
); );
end end
`BUFFER_BUSY (busy, (| per_socket_busy), (`NUM_SOCKETS > 1)); `BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
endmodule endmodule

View file

@ -14,6 +14,8 @@
`ifndef VX_CONFIG_VH `ifndef VX_CONFIG_VH
`define VX_CONFIG_VH `define VX_CONFIG_VH
`ifndef MIN `ifndef MIN
`define MIN(x, y) (((x) < (y)) ? (x) : (y)) `define MIN(x, y) (((x) < (y)) ? (x) : (y))
`endif `endif
@ -31,7 +33,6 @@
`endif `endif
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`ifndef EXT_M_DISABLE `ifndef EXT_M_DISABLE
`define EXT_M_ENABLE `define EXT_M_ENABLE
`endif `endif
@ -40,6 +41,18 @@
`define EXT_F_ENABLE `define EXT_F_ENABLE
`endif `endif
`ifdef XLEN_64
`ifndef FPU_DSP
`ifndef EXT_D_DISABLE
`define EXT_D_ENABLE
`endif
`endif
`endif
`ifndef EXT_ZICOND_DISABLE
`define EXT_ZICOND_ENABLE
`endif
`ifndef XLEN_32 `ifndef XLEN_32
`ifndef XLEN_64 `ifndef XLEN_64
`define XLEN_32 `define XLEN_32
@ -91,13 +104,12 @@
`endif `endif
`ifndef NUM_BARRIERS `ifndef NUM_BARRIERS
`define NUM_BARRIERS 4 `define NUM_BARRIERS `UP(`NUM_WARPS/2)
`endif `endif
`ifndef SOCKET_SIZE `ifndef SOCKET_SIZE
`define SOCKET_SIZE `MIN(4, `NUM_CORES) `define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif `endif
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
`ifdef L2_ENABLE `ifdef L2_ENABLE
`define L2_ENABLED 1 `define L2_ENABLED 1
@ -129,45 +141,77 @@
`endif `endif
`ifndef L1_LINE_SIZE `ifndef L1_LINE_SIZE
`ifdef L1_DISABLE `define L1_LINE_SIZE `MEM_BLOCK_SIZE
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 4 : `MEM_BLOCK_SIZE)
`else
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 16 : `MEM_BLOCK_SIZE)
`endif `endif
`ifndef L2_LINE_SIZE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`endif
`ifndef L3_LINE_SIZE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`endif `endif
`ifdef XLEN_64 `ifdef XLEN_64
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 64'h1FFFF0000
`endif
`ifndef STARTUP_ADDR `ifndef STARTUP_ADDR
`define STARTUP_ADDR 64'h180000000 `define STARTUP_ADDR 64'h180000000
`endif `endif
`ifndef STACK_BASE_ADDR `ifndef USER_BASE_ADDR
`define STACK_BASE_ADDR 64'h1FF000000 `define USER_BASE_ADDR 64'h000010000
`endif `endif
`else `ifndef IO_BASE_ADDR
`define IO_BASE_ADDR 64'h000000040
`endif
`ifdef VM_ENABLE
`ifndef PAGE_TABLE_BASE_ADDR
`define PAGE_TABLE_BASE_ADDR 64'h1F0000000
`endif
`endif
`else // XLEN_32
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 32'hFFFF0000
`endif
`ifndef STARTUP_ADDR `ifndef STARTUP_ADDR
`define STARTUP_ADDR 32'h80000000 `define STARTUP_ADDR 32'h80000000
`endif `endif
`ifndef STACK_BASE_ADDR `ifndef USER_BASE_ADDR
`define STACK_BASE_ADDR 32'hFF000000 `define USER_BASE_ADDR 32'h00010000
`endif
`endif
`ifndef SMEM_BASE_ADDR
`define SMEM_BASE_ADDR `STACK_BASE_ADDR
`endif
`ifndef SMEM_LOG_SIZE
`define SMEM_LOG_SIZE 14
`endif `endif
`ifndef IO_BASE_ADDR `ifndef IO_BASE_ADDR
`define IO_BASE_ADDR (`SMEM_BASE_ADDR + (1 << `SMEM_LOG_SIZE)) `define IO_BASE_ADDR 32'h00000040
`endif
`ifdef VM_ENABLE
`ifndef PAGE_TABLE_BASE_ADDR
`define PAGE_TABLE_BASE_ADDR 32'hF0000000
`endif
`endif
`endif
`define IO_END_ADDR `USER_BASE_ADDR
`ifndef LMEM_LOG_SIZE
`define LMEM_LOG_SIZE 14
`endif
`ifndef LMEM_BASE_ADDR
`define LMEM_BASE_ADDR `STACK_BASE_ADDR
`endif `endif
`ifndef IO_COUT_ADDR `ifndef IO_COUT_ADDR
@ -175,10 +219,10 @@
`endif `endif
`define IO_COUT_SIZE `MEM_BLOCK_SIZE `define IO_COUT_SIZE `MEM_BLOCK_SIZE
`ifndef IO_CSR_ADDR `ifndef IO_MPM_ADDR
`define IO_CSR_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE) `define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
`endif `endif
`define IO_CSR_SIZE (4 * 64 * `NUM_CORES * `NUM_CLUSTERS) `define IO_MPM_SIZE (8 * 32 * `NUM_CORES * `NUM_CLUSTERS)
`ifndef STACK_LOG2_SIZE `ifndef STACK_LOG2_SIZE
`define STACK_LOG2_SIZE 13 `define STACK_LOG2_SIZE 13
@ -191,13 +235,21 @@
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED))) `define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif `endif
`ifndef SV_DPI
`define DPI_DISABLE
`endif
`ifndef FPU_FPNEW `ifndef FPU_FPNEW
`ifndef FPU_DSP `ifndef FPU_DSP
`ifndef FPU_DPI `ifndef FPU_DPI
`ifdef SYNTHESIS `ifndef SYNTHESIS
`define FPU_DSP `ifndef DPI_DISABLE
`else
`define FPU_DPI `define FPU_DPI
`else
`define FPU_DSP
`endif
`else
`define FPU_DSP
`endif `endif
`endif `endif
`endif `endif
@ -214,11 +266,64 @@
`define DEBUG_LEVEL 3 `define DEBUG_LEVEL 3
`endif `endif
`ifndef MEM_PAGE_SIZE
`define MEM_PAGE_SIZE (4096)
`endif
`ifndef MEM_PAGE_LOG2_SIZE
`define MEM_PAGE_LOG2_SIZE (12)
`endif
// Virtual Memory Configuration ///////////////////////////////////////////////////////
`ifdef VM_ENABLE
`ifdef XLEN_32
`ifndef VM_ADDR_MODE
`define VM_ADDR_MODE SV32 //or BARE
`endif
`ifndef PT_LEVEL
`define PT_LEVEL (2)
`endif
`ifndef PTE_SIZE
`define PTE_SIZE (4)
`endif
`ifndef NUM_PTE_ENTRY
`define NUM_PTE_ENTRY (1024)
`endif
`ifndef PT_SIZE_LIMIT
`define PT_SIZE_LIMIT (1<<23)
`endif
`else
`ifndef VM_ADDR_MODE
`define VM_ADDR_MODE SV39 //or BARE
`endif
`ifndef PT_LEVEL
`define PT_LEVEL (3)
`endif
`ifndef PTE_SIZE
`define PTE_SIZE (8)
`endif
`ifndef NUM_PTE_ENTRY
`define NUM_PTE_ENTRY (512)
`endif
`ifndef PT_SIZE_LIMIT
`define PT_SIZE_LIMIT (1<<25)
`endif
`endif
// Default page-table size: falls back to the MEM_PAGE_SIZE macro unless the
// build overrides PT_SIZE. The macro reference must carry the leading
// backtick; without it the expansion is the bare identifier MEM_PAGE_SIZE
// rather than the macro's value.
`ifndef PT_SIZE
`define PT_SIZE `MEM_PAGE_SIZE
`endif
`ifndef TLB_SIZE
`define TLB_SIZE (32)
`endif
`endif
// Pipeline Configuration ///////////////////////////////////////////////////// // Pipeline Configuration /////////////////////////////////////////////////////
// Issue width // Issue width
`ifndef ISSUE_WIDTH `ifndef ISSUE_WIDTH
`define ISSUE_WIDTH `MIN(`NUM_WARPS, 4) `define ISSUE_WIDTH `UP(`NUM_WARPS / 8)
`endif `endif
// Number of ALU units // Number of ALU units
@ -239,29 +344,38 @@
// Number of LSU units // Number of LSU units
`ifndef NUM_LSU_LANES `ifndef NUM_LSU_LANES
`define NUM_LSU_LANES `MIN(`NUM_THREADS, 4) `define NUM_LSU_LANES `NUM_THREADS
`endif
`ifndef NUM_LSU_BLOCKS
`define NUM_LSU_BLOCKS 1
`endif `endif
// Number of SFU units // Number of SFU units
`ifndef NUM_SFU_LANES `ifndef NUM_SFU_LANES
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4) `define NUM_SFU_LANES `NUM_THREADS
`endif
`ifndef NUM_SFU_BLOCKS
`define NUM_SFU_BLOCKS 1
`endif `endif
// Size of Instruction Buffer // Size of Instruction Buffer
`ifndef IBUF_SIZE `ifndef IBUF_SIZE
`define IBUF_SIZE (2 * (`NUM_WARPS / `ISSUE_WIDTH)) `define IBUF_SIZE 4
`endif `endif
// Size of LSU Request Queue // LSU line size
`ifndef LSUQ_SIZE `ifndef LSU_LINE_SIZE
`define LSUQ_SIZE (2 * (`NUM_THREADS / `NUM_LSU_LANES)) `define LSU_LINE_SIZE `MIN(`NUM_LSU_LANES * (`XLEN / 8), `L1_LINE_SIZE)
`endif `endif
// LSU Duplicate Address Check // Size of LSU Core Request Queue
`ifdef LSU_DUP `ifndef LSUQ_IN_SIZE
`define LSU_DUP_ENABLED 1 `define LSUQ_IN_SIZE (2 * (`NUM_THREADS / `NUM_LSU_LANES))
`else `endif
`define LSU_DUP_ENABLED 0
// Size of LSU Memory Request Queue
`ifndef LSUQ_OUT_SIZE
`define LSUQ_OUT_SIZE `MAX(`LSUQ_IN_SIZE, `LSU_LINE_SIZE / (`XLEN / 8))
`endif `endif
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
@ -285,8 +399,8 @@
// Floating-Point Units /////////////////////////////////////////////////////// // Floating-Point Units ///////////////////////////////////////////////////////
// Size of FPU Request Queue // Size of FPU Request Queue
`ifndef FPU_REQ_QUEUE_SIZE `ifndef FPUQ_SIZE
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES)) `define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
`endif `endif
// FNCP Latency // FNCP Latency
@ -362,6 +476,31 @@
`define LATENCY_FCVT 5 `define LATENCY_FCVT 5
`endif `endif
// FMA Bandwidth ratio
`ifndef FMA_PE_RATIO
`define FMA_PE_RATIO 1
`endif
// FDIV Bandwidth ratio
`ifndef FDIV_PE_RATIO
`define FDIV_PE_RATIO 8
`endif
// FSQRT Bandwidth ratio
`ifndef FSQRT_PE_RATIO
`define FSQRT_PE_RATIO 8
`endif
// FCVT Bandwidth ratio
`ifndef FCVT_PE_RATIO
`define FCVT_PE_RATIO 8
`endif
// FNCP Bandwidth ratio
`ifndef FNCP_PE_RATIO
`define FNCP_PE_RATIO 2
`endif
// Icache Configurable Knobs ////////////////////////////////////////////////// // Icache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable // Cache Enable
@ -377,7 +516,7 @@
// Number of Cache Units // Number of Cache Units
`ifndef NUM_ICACHES `ifndef NUM_ICACHES
`define NUM_ICACHES `UP(`NUM_CORES / 4) `define NUM_ICACHES `UP(`SOCKET_SIZE / 4)
`endif `endif
// Cache Size // Cache Size
@ -426,7 +565,7 @@
// Number of Cache Units // Number of Cache Units
`ifndef NUM_DCACHES `ifndef NUM_DCACHES
`define NUM_DCACHES `UP(`NUM_CORES / 4) `define NUM_DCACHES `UP(`SOCKET_SIZE / 4)
`endif `endif
// Cache Size // Cache Size
@ -436,7 +575,7 @@
// Number of Banks // Number of Banks
`ifndef DCACHE_NUM_BANKS `ifndef DCACHE_NUM_BANKS
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES) `define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
`endif `endif
// Core Response Queue Size // Core Response Queue Size
@ -464,22 +603,27 @@
`define DCACHE_NUM_WAYS 1 `define DCACHE_NUM_WAYS 1
`endif `endif
// SM Configurable Knobs ////////////////////////////////////////////////////// // Enable Cache Writeback
`ifndef DCACHE_WRITEBACK
`ifndef SM_DISABLE `define DCACHE_WRITEBACK 0
`define SM_ENABLE
`endif `endif
`ifdef SM_ENABLE // LMEM Configurable Knobs ////////////////////////////////////////////////////
`define SM_ENABLED 1
`ifndef LMEM_DISABLE
`define LMEM_ENABLE
`endif
`ifdef LMEM_ENABLE
`define LMEM_ENABLED 1
`else `else
`define SM_ENABLED 0 `define LMEM_ENABLED 0
`define SMEM_NUM_BANKS 1 `define LMEM_NUM_BANKS 1
`endif `endif
// Number of Banks // Number of Banks
`ifndef SMEM_NUM_BANKS `ifndef LMEM_NUM_BANKS
`define SMEM_NUM_BANKS (`NUM_LSU_LANES) `define LMEM_NUM_BANKS `NUM_LSU_LANES
`endif `endif
// L2cache Configurable Knobs ///////////////////////////////////////////////// // L2cache Configurable Knobs /////////////////////////////////////////////////
@ -523,6 +667,11 @@
`define L2_NUM_WAYS 2 `define L2_NUM_WAYS 2
`endif `endif
// Enable Cache Writeback
`ifndef L2_WRITEBACK
`define L2_WRITEBACK 0
`endif
// L3cache Configurable Knobs ///////////////////////////////////////////////// // L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size // Cache Size
@ -536,7 +685,7 @@
// Number of Banks // Number of Banks
`ifndef L3_NUM_BANKS `ifndef L3_NUM_BANKS
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS) `define L3_NUM_BANKS `MIN(8, `NUM_CLUSTERS)
`endif `endif
// Core Response Queue Size // Core Response Queue Size
@ -564,6 +713,20 @@
`define L3_NUM_WAYS 4 `define L3_NUM_WAYS 4
`endif `endif
// Enable Cache Writeback
`ifndef L3_WRITEBACK
`define L3_WRITEBACK 0
`endif
`ifndef MEMORY_BANKS
`define MEMORY_BANKS 8
`endif
// Number of Memory Ports from LLC
`ifndef NUM_MEM_PORTS
`define NUM_MEM_PORTS `MIN(`MEMORY_BANKS, `L3_NUM_BANKS)
`endif
// ISA Extensions ///////////////////////////////////////////////////////////// // ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE `ifdef EXT_A_ENABLE
@ -596,6 +759,12 @@
`define EXT_M_ENABLED 0 `define EXT_M_ENABLED 0
`endif `endif
`ifdef EXT_ZICOND_ENABLE
`define EXT_ZICOND_ENABLED 1
`else
`define EXT_ZICOND_ENABLED 0
`endif
`define ISA_STD_A 0 `define ISA_STD_A 0
`define ISA_STD_C 2 `define ISA_STD_C 2
`define ISA_STD_D 3 `define ISA_STD_D 3
@ -612,13 +781,15 @@
`define ISA_EXT_DCACHE 1 `define ISA_EXT_DCACHE 1
`define ISA_EXT_L2CACHE 2 `define ISA_EXT_L2CACHE 2
`define ISA_EXT_L3CACHE 3 `define ISA_EXT_L3CACHE 3
`define ISA_EXT_SMEM 4 `define ISA_EXT_LMEM 4
`define ISA_EXT_ZICOND 5
`define MISA_EXT (`ICACHE_ENABLED << `ISA_EXT_ICACHE) \ `define MISA_EXT (`ICACHE_ENABLED << `ISA_EXT_ICACHE) \
| (`DCACHE_ENABLED << `ISA_EXT_DCACHE) \ | (`DCACHE_ENABLED << `ISA_EXT_DCACHE) \
| (`L2_ENABLED << `ISA_EXT_L2CACHE) \ | (`L2_ENABLED << `ISA_EXT_L2CACHE) \
| (`L3_ENABLED << `ISA_EXT_L3CACHE) \ | (`L3_ENABLED << `ISA_EXT_L3CACHE) \
| (`SM_ENABLED << `ISA_EXT_SMEM) | (`LMEM_ENABLED << `ISA_EXT_LMEM) \
| (`EXT_ZICOND_ENABLED << `ISA_EXT_ZICOND)
`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \ `define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \ | (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \

View file

@ -44,6 +44,9 @@
`define NR_BITS `CLOG2(`NUM_REGS) `define NR_BITS `CLOG2(`NUM_REGS)
`define DV_STACK_SIZE `UP(`NUM_THREADS-1)
`define DV_STACK_SIZEW `UP(`CLOG2(`DV_STACK_SIZE))
`define PERF_CTR_BITS 44 `define PERF_CTR_BITS 44
`ifndef NDEBUG `ifndef NDEBUG
@ -52,15 +55,29 @@
`define UUID_WIDTH 1 `define UUID_WIDTH 1
`endif `endif
`define PC_BITS (`XLEN-1)
`define OFFSET_BITS 12
`define IMM_BITS `XLEN
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define EX_ALU 0 `define EX_ALU 0
`define EX_LSU 1 `define EX_LSU 1
`define EX_SFU 2 `define EX_SFU 2
`define EX_FPU 3 `define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED) `define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
`define EX_BITS `CLOG2(`NUM_EX_UNITS) `define EX_BITS `CLOG2(`NUM_EX_UNITS)
`define EX_WIDTH `UP(`EX_BITS)
`define SFU_CSRS 0
`define SFU_WCTL 1
`define NUM_SFU_UNITS (2)
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
`define SFU_WIDTH `UP(`SFU_BITS)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -94,6 +111,10 @@
`define INST_EXT3 7'b1011011 // 0x5B `define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT4 7'b1111011 // 0x7B `define INST_EXT4 7'b1111011 // 0x7B
// Opcode extensions
`define INST_R_F7_MUL 7'b0000001
`define INST_R_F7_ZICOND 7'b0000111
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define INST_FRM_RNE 3'b000 // round to nearest even `define INST_FRM_RNE 3'b000 // round to nearest even
@ -107,31 +128,40 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define INST_OP_BITS 4 `define INST_OP_BITS 4
`define INST_MOD_BITS 3 `define INST_ARGS_BITS $bits(op_args_t)
`define INST_FMT_BITS 2 `define INST_FMT_BITS 2
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define INST_ALU_ADD 4'b0000 `define INST_ALU_ADD 4'b0000
//`define INST_ALU_UNUSED 4'b0001
`define INST_ALU_LUI 4'b0010 `define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011 `define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLTU 4'b0100 `define INST_ALU_SLTU 4'b0100
`define INST_ALU_SLT 4'b0101 `define INST_ALU_SLT 4'b0101
//`define INST_ALU_UNUSED 4'b0110
`define INST_ALU_SUB 4'b0111 `define INST_ALU_SUB 4'b0111
`define INST_ALU_SRL 4'b1000 `define INST_ALU_SRL 4'b1000
`define INST_ALU_SRA 4'b1001 `define INST_ALU_SRA 4'b1001
`define INST_ALU_CZEQ 4'b1010
`define INST_ALU_CZNE 4'b1011
`define INST_ALU_AND 4'b1100 `define INST_ALU_AND 4'b1100
`define INST_ALU_OR 4'b1101 `define INST_ALU_OR 4'b1101
`define INST_ALU_XOR 4'b1110 `define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111 `define INST_ALU_SLL 4'b1111
`define INST_ALU_OTHER 4'b0111
`define ALU_TYPE_BITS 2
`define ALU_TYPE_ARITH 0
`define ALU_TYPE_BRANCH 1
`define ALU_TYPE_MULDIV 2
`define ALU_TYPE_OTHER 3
`define INST_ALU_BITS 4 `define INST_ALU_BITS 4
`define INST_ALU_CLASS(op) op[3:2] `define INST_ALU_CLASS(op) op[3:2]
`define INST_ALU_SIGNED(op) op[0] `define INST_ALU_SIGNED(op) op[0]
`define INST_ALU_IS_SUB(op) op[1] `define INST_ALU_IS_SUB(op) op[1]
`define INST_ALU_IS_BR(mod) mod[0] `define INST_ALU_IS_CZERO(op) (op[3:1] == 3'b101)
`define INST_ALU_IS_M(mod) mod[1]
`define INST_ALU_IS_W(mod) mod[2]
`define INST_BR_EQ 4'b0000 `define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010 `define INST_BR_NE 4'b0010
@ -202,9 +232,9 @@
`define INST_FPU_MUL 4'b0010 `define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011 `define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100 `define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2 `define INST_FPU_CMP 4'b0101 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110 `define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7 `define INST_FPU_MISC 4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000 `define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001 `define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010 `define INST_FPU_I2F 4'b1010
@ -214,9 +244,8 @@
`define INST_FPU_NMSUB 4'b1110 `define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111 `define INST_FPU_NMADD 4'b1111
`define INST_FPU_BITS 4 `define INST_FPU_BITS 4
`define INST_FPU_IS_W(mod) (mod[4]) `define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3)
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3) `define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4)
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
`define INST_SFU_TMC 4'h0 `define INST_SFU_TMC 4'h0
`define INST_SFU_WSPAWN 4'h1 `define INST_SFU_WSPAWN 4'h1
@ -227,7 +256,6 @@
`define INST_SFU_CSRRW 4'h6 `define INST_SFU_CSRRW 4'h6
`define INST_SFU_CSRRS 4'h7 `define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8 `define INST_SFU_CSRRC 4'h8
`define INST_SFU_CMOV 4'h9
`define INST_SFU_BITS 4 `define INST_SFU_BITS 4
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1) `define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
`define INST_SFU_IS_WCTL(op) (op <= 5) `define INST_SFU_IS_WCTL(op) (op <= 5)
@ -235,31 +263,18 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// non-cacheable tag bits `define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2(`CDIV(I, O)) : 0)
`define NC_TAG_BITS 1
// cache address type bits
`ifdef SM_ENABLE
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`else
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \ `define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS) (`CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \ `define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS) (`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \ `define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) (`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -272,29 +287,27 @@
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \ `define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches) `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ `define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches) `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \ `define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches) `CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`ifdef L2_ENABLE `ifdef ICACHE_ENABLE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE `define L1_ENABLE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`endif `endif
`ifdef L3_ENABLE `ifdef DCACHE_ENABLE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE `define L1_ENABLE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`endif `endif
// Address-type identifiers (presumably bit positions within a request's
// address-type field - confirm against the users of req_data.atype).
// ADDR_TYPE_WIDTH is derived from the last index plus LMEM_ENABLED (0 or 1),
// so it evaluates to 2 without local memory and 3 with it.
`define ADDR_TYPE_FLUSH 0
`define ADDR_TYPE_IO 1
`define ADDR_TYPE_LOCAL 2 // should be last since optional
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE `define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE)) `define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8) `define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
@ -307,32 +320,37 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`define BUFFER_BUSY(dst, src, enable) \ `define BUFFER_EX(dst, src, ena, latency) \
logic __busy; \ VX_pipe_register #( \
if (enable) begin \ .DATAW ($bits(dst)), \
always @(posedge clk) begin \ .RESETW ($bits(dst)), \
if (reset) begin \ .DEPTH (latency) \
__busy <= 1'b0; \ ) __``dst``__ ( \
end else begin \ .clk (clk), \
__busy <= src; \ .reset (reset), \
end \ .enable (ena), \
end \ .data_in (src), \
end else begin \ .data_out (dst) \
assign __busy = src; \ )
end \
assign dst = __busy `define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
`define POP_COUNT_EX(out, in, model) \ `define POP_COUNT_EX(out, in, model) \
VX_popcount #( \ VX_popcount #( \
.N ($bits(in)), \ .N ($bits(in)), \
.MODEL (model) \ .MODEL (model) \
) __``out ( \ ) __``out``__ ( \
.data_in (in), \ .data_in (in), \
.data_out (out) \ .data_out (out) \
) )
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1) `define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
`define ASSIGN_VX_IF(dst, src) \
assign dst.valid = src.valid; \
assign dst.data = src.data; \
assign src.ready = dst.ready
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \ `define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \ assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \ assign dst.req_data = src.req_data; \
@ -346,6 +364,7 @@
assign dst.req_data.rw = src.req_data.rw; \ assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \ assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.addr = src.req_data.addr; \ assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.atype = src.req_data.atype; \
assign dst.req_data.data = src.req_data.data; \ assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \ if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \ assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
@ -357,47 +376,52 @@
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \ assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \ `define ASSIGN_VX_LSU_MEM_IF(dst, src) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \ assign dst.req_valid = src.req_valid; \
if (enable) begin \ assign dst.req_data = src.req_data; \
always @(posedge clk) begin \ assign src.req_ready = dst.req_ready; \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \ assign src.rsp_valid = dst.rsp_valid; \
end \ assign src.rsp_data = dst.rsp_data; \
end else begin \ assign dst.rsp_ready = src.rsp_ready
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
`define PERF_REDUCE(dst, src, field, width, count) \ `define BUFFER_DCR_BUS_IF(dst, src, enable) \
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \ if (enable) begin \
wire [width-1:0] __reduce_add_o_``dst``field; \ reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \
reg [width-1:0] __reduce_add_r_``dst``field; \ always @(posedge clk) begin \
for (genvar __i = 0; __i < count; ++__i) begin \ __dst <= {src.write_valid, src.write_addr, src.write_data}; \
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
end \ end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \ assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \
__reduce_add_i_``src``field, \ end else begin \
__reduce_add_o_``dst``field \ assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \
end
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
if (count > 1) begin \
wire [count-1:0][width-1:0] __reduce_add_i_field; \
wire [width-1:0] __reduce_add_o_field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \ ); \
if (reg_enable) begin \
reg [width-1:0] __reduce_add_r_field; \
always @(posedge clk) begin \ always @(posedge clk) begin \
if (reset) begin \ if (reset) begin \
__reduce_add_r_``dst``field <= '0; \ __reduce_add_r_field <= '0; \
end else begin \ end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \ __reduce_add_r_field <= __reduce_add_o_field; \
end \ end \
end \ end \
assign ``dst.``field = __reduce_add_r_``dst``field assign dst.``field = __reduce_add_r_field; \
end else begin \
`define PERF_CACHE_REDUCE(dst, src, count) \ assign dst.``field = __reduce_add_o_field; \
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \ end \
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \ end else begin \
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \ assign dst.``field = src[0].``field; \
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \ end
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \ `define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
if (block_size != 1) begin \ if (block_size != 1) begin \
@ -410,9 +434,4 @@
assign dst = src; \ assign dst = src; \
end end
`define TO_DISPATCH_DATA(data, tid) \
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH `endif // VX_DEFINE_VH

View file

@ -26,7 +26,7 @@ package VX_gpu_pkg;
typedef struct packed { typedef struct packed {
logic valid; logic valid;
logic [`NUM_WARPS-1:0] wmask; logic [`NUM_WARPS-1:0] wmask;
logic [`XLEN-1:0] pc; logic [`PC_BITS-1:0] pc;
} wspawn_t; } wspawn_t;
typedef struct packed { typedef struct packed {
@ -34,12 +34,12 @@ package VX_gpu_pkg;
logic is_dvg; logic is_dvg;
logic [`NUM_THREADS-1:0] then_tmask; logic [`NUM_THREADS-1:0] then_tmask;
logic [`NUM_THREADS-1:0] else_tmask; logic [`NUM_THREADS-1:0] else_tmask;
logic [`XLEN-1:0] next_pc; logic [`PC_BITS-1:0] next_pc;
} split_t; } split_t;
typedef struct packed { typedef struct packed {
logic valid; logic valid;
logic is_dvg; logic [`DV_STACK_SIZEW-1:0] stack_ptr;
} join_t; } join_t;
typedef struct packed { typedef struct packed {
@ -51,13 +51,17 @@ package VX_gpu_pkg;
`else `else
logic [`NW_WIDTH-1:0] size_m1; logic [`NW_WIDTH-1:0] size_m1;
`endif `endif
logic is_noop;
} barrier_t; } barrier_t;
typedef struct packed { typedef struct packed {
logic [`XLEN-1:0] startup_addr; logic [`XLEN-1:0] startup_addr;
logic [`XLEN-1:0] startup_arg;
logic [7:0] mpm_class; logic [7:0] mpm_class;
} base_dcrs_t; } base_dcrs_t;
//////////////////////////// Perf counter types ///////////////////////////
typedef struct packed { typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads; logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes; logic [`PERF_CTR_BITS-1:0] writes;
@ -75,7 +79,72 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] latency; logic [`PERF_CTR_BITS-1:0] latency;
} mem_perf_t; } mem_perf_t;
/* verilator lint_off UNUSED */ typedef struct packed {
logic [`PERF_CTR_BITS-1:0] idles;
logic [`PERF_CTR_BITS-1:0] stalls;
} sched_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] ibf_stalls;
logic [`PERF_CTR_BITS-1:0] scb_stalls;
logic [`PERF_CTR_BITS-1:0] opd_stalls;
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
logic use_PC;
logic use_imm;
logic is_w;
logic [`ALU_TYPE_BITS-1:0] xtype;
logic [`IMM_BITS-1:0] imm;
} alu_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [`INST_FRM_BITS-1:0] frm;
logic [`INST_FMT_BITS-1:0] fmt;
} fpu_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic is_store;
logic is_float;
logic [`OFFSET_BITS-1:0] offset;
} lsu_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic use_imm;
logic [`VX_CSR_ADDR_BITS-1:0] addr;
logic [4:0] imm;
} csr_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-1)-1:0] __padding;
logic is_neg;
} wctl_args_t;
typedef union packed {
alu_args_t alu;
fpu_args_t fpu;
lsu_args_t lsu;
csr_args_t csr;
wctl_args_t wctl;
} op_args_t;
`IGNORE_UNUSED_BEGIN
///////////////////////// LSU memory Parameters ///////////////////////////
// Word size in bytes (`XLEN bits / 8).
localparam LSU_WORD_SIZE = `XLEN / 8;
// Word-granular address width: memory address bits minus the bits needed to
// index a byte within one LSU word.
localparam LSU_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(LSU_WORD_SIZE));
// Fixed at 1, so `CLOG2(LSU_MEM_BATCHES) contributes 0 bits to the tag id below.
localparam LSU_MEM_BATCHES = 1;
// Tag id bits: index into a queue of `LSUQ_IN_SIZE entries plus batch select.
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
// Full tag: UUID followed by the tag id.
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
// Total request count: LSU blocks times lanes per block.
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
////////////////////////// Icache Parameters ////////////////////////////// ////////////////////////// Icache Parameters //////////////////////////////
@ -99,44 +168,38 @@ package VX_gpu_pkg;
`ifdef ICACHE_ENABLE `ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES); localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else `else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES); localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif `endif
////////////////////////// Dcache Parameters ////////////////////////////// ////////////////////////// Dcache Parameters //////////////////////////////
// Word size in bytes // Word size in bytes
localparam DCACHE_WORD_SIZE = (`XLEN / 8); localparam DCACHE_WORD_SIZE = `LSU_LINE_SIZE;
localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE)); localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));
// Block size in bytes // Block size in bytes
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE; localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size // Input request size
localparam DCACHE_NUM_REQS = `MAX(`DCACHE_NUM_BANKS, `SMEM_NUM_BANKS); localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
// Memory request size
localparam LSU_MEM_REQS = `NUM_LSU_LANES;
// Batch select bits
localparam DCACHE_NUM_BATCHES = ((LSU_MEM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
localparam DCACHE_BATCH_SEL_BITS = `CLOG2(DCACHE_NUM_BATCHES);
// Core request tag Id bits // Core request tag Id bits
localparam LSUQ_TAG_BITS = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS); localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE;
localparam DCACHE_TAG_ID_BITS = (LSUQ_TAG_BITS + `CACHE_ADDR_TYPE_BITS); localparam DCACHE_MEM_BATCHES = `CDIV(DCACHE_MERGED_REQS, DCACHE_CHANNELS);
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
// Core request tag bits // Core request tag bits
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS); localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
// Memory request data bits // Memory request data bits
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8); localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
// Memory request tag bits // Memory request tag bits
`ifdef DCACHE_ENABLE `ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES); localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else `else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES); localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif `endif
/////////////////////////////// L1 Parameters ///////////////////////////// /////////////////////////////// L1 Parameters /////////////////////////////
@ -146,6 +209,9 @@ package VX_gpu_pkg;
/////////////////////////////// L2 Parameters ///////////////////////////// /////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
// Word size in bytes // Word size in bytes
localparam L2_WORD_SIZE = `L1_LINE_SIZE; localparam L2_WORD_SIZE = `L1_LINE_SIZE;
@ -162,7 +228,7 @@ package VX_gpu_pkg;
`ifdef L2_ENABLE `ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH); localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`else `else
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH); localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif `endif
/////////////////////////////// L3 Parameters ///////////////////////////// /////////////////////////////// L3 Parameters /////////////////////////////
@ -183,50 +249,65 @@ package VX_gpu_pkg;
`ifdef L3_ENABLE `ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH); localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`else `else
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH); localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif `endif
/* verilator lint_on UNUSED */
/////////////////////////////// Issue parameters ////////////////////////// /////////////////////////////// Issue parameters //////////////////////////
localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH); localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH; localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO); localparam PER_ISSUE_WARPS = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO)); localparam ISSUE_WIS = `CLOG2(PER_ISSUE_WARPS);
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN
function logic [ISSUE_IDX_W-1:0] wid_to_isw(
input logic [`NW_WIDTH-1:0] wid
);
if (`ISSUE_WIDTH > 1) begin
wid_to_isw = ISSUE_IDX_W'(wid);
end else begin
wid_to_isw = 0;
end
endfunction
`IGNORE_UNUSED_END
function logic [`NW_WIDTH-1:0] wis_to_wid( function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis, input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_IDX_W-1:0] isw input logic [ISSUE_ISW_W-1:0] isw
); );
wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH))); if (ISSUE_WIS == 0) begin
wis_to_wid = `NW_WIDTH'(isw);
end else if (ISSUE_ISW == 0) begin
wis_to_wid = `NW_WIDTH'(wis);
end else begin
wis_to_wid = `NW_WIDTH'({wis, isw});
end
endfunction
function logic [ISSUE_ISW_W-1:0] wid_to_isw(
input logic [`NW_WIDTH-1:0] wid
);
if (ISSUE_ISW != 0) begin
wid_to_isw = wid[ISSUE_ISW_W-1:0];
end else begin
wid_to_isw = 0;
end
endfunction endfunction
function logic [ISSUE_WIS_W-1:0] wid_to_wis( function logic [ISSUE_WIS_W-1:0] wid_to_wis(
input logic [`NW_WIDTH-1:0] wid input logic [`NW_WIDTH-1:0] wid
); );
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH)); if (ISSUE_WIS != 0) begin
wid_to_wis = ISSUE_WIS_W'(wid >> ISSUE_ISW);
end else begin
wid_to_wis = 0;
end
endfunction endfunction
function logic [ISSUE_ADDRW-1:0] wis_to_addr( ///////////////////////// Miscellaneous functions ////////////////////////
input logic [`NR_BITS-1:0] rid,
input logic [ISSUE_WIS_W-1:0] wis function logic [`SFU_WIDTH-1:0] op_to_sfu_type(
input logic [`INST_OP_BITS-1:0] op_type
); );
wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO))); case (op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: op_to_sfu_type = `SFU_CSRS;
default: op_to_sfu_type = `SFU_WCTL;
endcase
endfunction endfunction
`IGNORE_UNUSED_END
endpackage endpackage
`endif // VX_GPU_PKG_VH `endif // VX_GPU_PKG_VH

View file

@ -14,7 +14,7 @@
`ifndef VX_PLATFORM_VH `ifndef VX_PLATFORM_VH
`define VX_PLATFORM_VH `define VX_PLATFORM_VH
`ifndef SYNTHESIS `ifdef SV_DPI
`include "util_dpi.vh" `include "util_dpi.vh"
`endif `endif
@ -47,7 +47,7 @@
`define UNUSED_VAR(x) `define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x () `define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x `define UNUSED_ARG(x) x
`define TRACE(level, args) $write args `define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`else `else
`ifdef VERILATOR `ifdef VERILATOR
`define TRACING_ON /* verilator tracing_on */ `define TRACING_ON /* verilator tracing_on */
@ -77,7 +77,8 @@
/* verilator lint_off IMPLICIT */ \ /* verilator lint_off IMPLICIT */ \
/* verilator lint_off PINMISSING */ \ /* verilator lint_off PINMISSING */ \
/* verilator lint_off IMPORTSTAR */ \ /* verilator lint_off IMPORTSTAR */ \
/* verilator lint_off UNSIGNED */ /* verilator lint_off UNSIGNED */ \
/* verilator lint_off SYMRSVDWORD */
`define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \ `define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \
/* verilator lint_on PINCONNECTEMPTY */ \ /* verilator lint_on PINCONNECTEMPTY */ \
@ -88,7 +89,8 @@
/* verilator lint_on IMPLICIT */ \ /* verilator lint_on IMPLICIT */ \
/* verilator lint_off PINMISSING */ \ /* verilator lint_off PINMISSING */ \
/* verilator lint_on IMPORTSTAR */ \ /* verilator lint_on IMPORTSTAR */ \
/* verilator lint_on UNSIGNED */ /* verilator lint_on UNSIGNED */ \
/* verilator lint_on SYMRSVDWORD */
`define UNUSED_PARAM(x) /* verilator lint_off UNUSED */ \ `define UNUSED_PARAM(x) /* verilator lint_off UNUSED */ \
localparam __``x = x; \ localparam __``x = x; \
@ -110,8 +112,14 @@
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \ `define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
x \ x \
/* verilator lint_on UNUSED */ /* verilator lint_on UNUSED */
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`endif `endif
`ifdef SV_DPI
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`else
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`endif
`endif `endif
`ifdef SIMULATION `ifdef SIMULATION
@ -140,21 +148,21 @@
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
`ifdef QUARTUS `ifdef QUARTUS
`define MAX_FANOUT 4 `define MAX_FANOUT 8
`define IF_DATA_SIZE(x) $bits(x.data) `define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *) `define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *) `define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
`define DISABLE_BRAM (* ramstyle = "logic" *) `define DISABLE_BRAM (* ramstyle = "logic" *)
`define PRESERVE_NET (* preserve *) `define PRESERVE_NET (* preserve *)
`elsif VIVADO `elsif VIVADO
`define MAX_FANOUT 4 `define MAX_FANOUT 8
`define IF_DATA_SIZE(x) $bits(x.data) `define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ram_style = "distributed" *) `define USE_FAST_BRAM (* ram_style = "distributed" *)
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *) `define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
`define DISABLE_BRAM (* ram_style = "registers" *) `define DISABLE_BRAM (* ram_style = "registers" *)
`define PRESERVE_NET (* keep = "true" *) `define PRESERVE_NET (* keep = "true" *)
`else `else
`define MAX_FANOUT 4 `define MAX_FANOUT 8
`define IF_DATA_SIZE(x) x.DATA_WIDTH `define IF_DATA_SIZE(x) x.DATA_WIDTH
`define USE_FAST_BRAM `define USE_FAST_BRAM
`define NO_RW_RAM_CHECK `define NO_RW_RAM_CHECK
@ -169,7 +177,8 @@
`define CLOG2(x) $clog2(x) `define CLOG2(x) $clog2(x)
`define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0)) `define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0))
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1) `define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
`define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1)))) `define IS_POW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
`define IS_DIVISBLE(n, d) (((n) % (d)) == 0)
`define ABS(x) (((x) < 0) ? (-(x)) : (x)); `define ABS(x) (((x) < 0) ? (-(x)) : (x));
@ -181,34 +190,35 @@
`define MAX(x, y) (((x) > (y)) ? (x) : (y)) `define MAX(x, y) (((x) > (y)) ? (x) : (y))
`endif `endif
`ifndef CLAMP
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x))) `define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
`endif
`ifndef UP
`define UP(x) (((x) != 0) ? (x) : 1) `define UP(x) (((x) != 0) ? (x) : 1)
`endif
// Ceiling division: rounds n/d up to the next integer (intended for
// non-negative n and positive d, where integer division truncates).
// Every use of an argument is parenthesized so that expression arguments
// (e.g. `a | b` or a ternary) expand with the intended precedence.
`define CDIV(n,d) (((n) + (d) - 1) / (d))
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)] `define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
`define LTRIM(x, s) x[s-1:0] `define LTRIM(x, s) x[s-1:0]
`define TRACE_ARRAY1D(lvl, arr, m) \ `define SEXT(len, x) {{(len-$bits(x)+1){x[$bits(x)-1]}}, x[$bits(x)-2:0]}
`define TRACE_ARRAY1D(lvl, fmt, arr, n) \
`TRACE(lvl, ("{")); \ `TRACE(lvl, ("{")); \
for (integer __i = (m-1); __i >= 0; --__i) begin \ for (integer __i = (n-1); __i >= 0; --__i) begin \
if (__i != (m-1)) `TRACE(lvl, (", ")); \ if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("0x%0h", arr[__i])); \ `TRACE(lvl, (fmt, arr[__i])); \
end \ end \
`TRACE(lvl, ("}")); `TRACE(lvl, ("}"));
`define TRACE_ARRAY2D(lvl, arr, m, n) \ `define TRACE_ARRAY2D(lvl, fmt, arr, m, n) \
`TRACE(lvl, ("{")); \ `TRACE(lvl, ("{")); \
for (integer __i = n-1; __i >= 0; --__i) begin \ for (integer __i = n-1; __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \ if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("{")); \ `TRACE(lvl, ("{")); \
for (integer __j = (m-1); __j >= 0; --__j) begin \ for (integer __j = (m-1); __j >= 0; --__j) begin \
if (__j != (m-1)) `TRACE(lvl, (", "));\ if (__j != (m-1)) `TRACE(lvl, (", "));\
`TRACE(lvl, ("0x%0h", arr[__i][__j])); \ `TRACE(lvl, (fmt, arr[__i][__j])); \
end \ end \
`TRACE(lvl, ("}")); \ `TRACE(lvl, ("}")); \
end \ end \
@ -228,11 +238,11 @@
`define RESET_RELAY(dst, src) \ `define RESET_RELAY(dst, src) \
`RESET_RELAY_EX (dst, src, 1, 0) `RESET_RELAY_EX (dst, src, 1, 0)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2 // size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2, 5 -> 2
`define OUT_REG_TO_EB_SIZE(out_reg) `MIN(out_reg, 2) `define TO_OUT_BUF_SIZE(s) `MIN(s, 2)
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2 // reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 -> 3
`define OUT_REG_TO_EB_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1)) `define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2))
`define REPEAT(n,f,s) `_REPEAT_``n(f,s) `define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s) `define _REPEAT_0(f,s)

View file

@ -14,7 +14,8 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_socket import VX_gpu_pkg::*; #( module VX_socket import VX_gpu_pkg::*; #(
parameter SOCKET_ID = 0 parameter SOCKET_ID = 0,
parameter `STRING INSTANCE_ID = ""
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
@ -36,15 +37,15 @@ module VX_socket import VX_gpu_pkg::*; #(
// Barrier // Barrier
VX_gbar_bus_if.master gbar_bus_if, VX_gbar_bus_if.master gbar_bus_if,
`endif `endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status // Status
output wire busy output wire busy
); );
`ifdef SCOPE
localparam scope_core = 0;
`SCOPE_IO_SWITCH (`SOCKET_SIZE);
`endif
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE](); VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
@ -52,7 +53,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_gbar_arb #( VX_gbar_arb #(
.NUM_REQS (`SOCKET_SIZE), .NUM_REQS (`SOCKET_SIZE),
.OUT_REG ((`SOCKET_SIZE > 1) ? 2 : 0) .OUT_BUF ((`SOCKET_SIZE > 1) ? 2 : 0)
) gbar_arb ( ) gbar_arb (
.clk (clk), .clk (clk),
.reset (gbar_arb_reset), .reset (gbar_arb_reset),
@ -65,59 +66,12 @@ module VX_socket import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if(); VX_mem_perf_if mem_perf_tmp_if();
cache_perf_t perf_icache;
cache_perf_t perf_dcache;
assign mem_perf_tmp_if.icache = perf_icache;
assign mem_perf_tmp_if.dcache = perf_dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x; assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem; assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) cache_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) mem_bus_tmp_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (cache_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #( VX_mem_bus_if #(
@ -125,10 +79,15 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (ICACHE_TAG_WIDTH) .TAG_WIDTH (ICACHE_TAG_WIDTH)
) per_core_icache_bus_if[`SOCKET_SIZE](); ) per_core_icache_bus_if[`SOCKET_SIZE]();
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
`RESET_RELAY (icache_reset, reset); `RESET_RELAY (icache_reset, reset);
VX_cache_cluster #( VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)), .INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)),
.NUM_UNITS (`NUM_ICACHES), .NUM_UNITS (`NUM_ICACHES),
.NUM_INPUTS (`SOCKET_SIZE), .NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0), .TAG_SEL_IDX (0),
@ -145,11 +104,12 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (ICACHE_TAG_WIDTH), .TAG_WIDTH (ICACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH), .UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (0), .WRITE_ENABLE (0),
.CORE_OUT_REG (2), .NC_ENABLE (0),
.MEM_OUT_REG (2) .CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)
) icache ( ) icache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_icache), .cache_perf (mem_perf_tmp_if.icache),
`endif `endif
.clk (clk), .clk (clk),
.reset (icache_reset), .reset (icache_reset),
@ -161,16 +121,21 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE), .DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) .TAG_WIDTH (DCACHE_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS](); ) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
`RESET_RELAY (dcache_reset, reset); `RESET_RELAY (dcache_reset, reset);
VX_cache_cluster #( VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)), .INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)),
.NUM_UNITS (`NUM_DCACHES), .NUM_UNITS (`NUM_DCACHES),
.NUM_INPUTS (`SOCKET_SIZE), .NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (1), .TAG_SEL_IDX (0),
.CACHE_SIZE (`DCACHE_SIZE), .CACHE_SIZE (`DCACHE_SIZE),
.LINE_SIZE (DCACHE_LINE_SIZE), .LINE_SIZE (DCACHE_LINE_SIZE),
.NUM_BANKS (`DCACHE_NUM_BANKS), .NUM_BANKS (`DCACHE_NUM_BANKS),
@ -180,16 +145,18 @@ module VX_socket import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE), .CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE), .MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE), .MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_MREQ_SIZE), .MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH), .TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH), .UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (1), .WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_WRITEBACK),
.NC_ENABLE (1), .NC_ENABLE (1),
.CORE_OUT_REG (`SM_ENABLED ? 2 : 1), .CORE_OUT_BUF (2),
.MEM_OUT_REG (2) .MEM_OUT_BUF (2)
) dcache ( ) dcache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_dcache), .cache_perf (mem_perf_tmp_if.dcache),
`endif `endif
.clk (clk), .clk (clk),
.reset (dcache_reset), .reset (dcache_reset),
@ -199,28 +166,53 @@ module VX_socket import VX_gpu_pkg::*; #(
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak; VX_mem_bus_if #(
wire [`SOCKET_SIZE-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_core_sim_wb_value; .DATA_SIZE (`L1_LINE_SIZE),
assign sim_ebreak = per_core_sim_ebreak[0]; .TAG_WIDTH (L1_MEM_TAG_WIDTH)
assign sim_wb_value = per_core_sim_wb_value[0]; ) l1_mem_bus_if[2]();
`UNUSED_VAR (per_core_sim_ebreak)
`UNUSED_VAR (per_core_sim_wb_value) VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("R"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_busy; wire [`SOCKET_SIZE-1:0] per_core_busy;
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1)); `BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
`SCOPE_IO_SWITCH (`SOCKET_SIZE)
// Generate all cores // Generate all cores
for (genvar i = 0; i < `SOCKET_SIZE; ++i) begin for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : cores
`RESET_RELAY (core_reset, reset); `RESET_RELAY (core_reset, reset);
VX_core #( VX_core #(
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + i) .CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id),
.INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id))
) core ( ) core (
`SCOPE_IO_BIND (i) `SCOPE_IO_BIND (scope_core + core_id)
.clk (clk), .clk (clk),
.reset (core_reset), .reset (core_reset),
@ -231,20 +223,18 @@ module VX_socket import VX_gpu_pkg::*; #(
.dcr_bus_if (core_dcr_bus_if), .dcr_bus_if (core_dcr_bus_if),
.dcache_bus_if (per_core_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]), .dcache_bus_if (per_core_dcache_bus_if[core_id * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.icache_bus_if (per_core_icache_bus_if[i]), .icache_bus_if (per_core_icache_bus_if[core_id]),
`ifdef GBAR_ENABLE `ifdef GBAR_ENABLE
.gbar_bus_if (per_core_gbar_bus_if[i]), .gbar_bus_if (per_core_gbar_bus_if[core_id]),
`endif `endif
.sim_ebreak (per_core_sim_ebreak[i]), .busy (per_core_busy[core_id])
.sim_wb_value (per_core_sim_wb_value[i]),
.busy (per_core_busy[i])
); );
end end
`BUFFER_BUSY (busy, (| per_core_busy), (`SOCKET_SIZE > 1)); `BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
endmodule endmodule

View file

@ -14,7 +14,7 @@
`ifndef VX_TYPES_VH `ifndef VX_TYPES_VH
`define VX_TYPES_VH `define VX_TYPES_VH
// Device configuration registers // Device configuration registers /////////////////////////////////////////////
`define VX_CSR_ADDR_BITS 12 `define VX_CSR_ADDR_BITS 12
`define VX_DCR_ADDR_BITS 12 `define VX_DCR_ADDR_BITS 12
@ -22,19 +22,21 @@
`define VX_DCR_BASE_STATE_BEGIN 12'h001 `define VX_DCR_BASE_STATE_BEGIN 12'h001
`define VX_DCR_BASE_STARTUP_ADDR0 12'h001 `define VX_DCR_BASE_STARTUP_ADDR0 12'h001
`define VX_DCR_BASE_STARTUP_ADDR1 12'h002 `define VX_DCR_BASE_STARTUP_ADDR1 12'h002
`define VX_DCR_BASE_MPM_CLASS 12'h003 `define VX_DCR_BASE_STARTUP_ARG0 12'h003
`define VX_DCR_BASE_STATE_END 12'h004 `define VX_DCR_BASE_STARTUP_ARG1 12'h004
`define VX_DCR_BASE_MPM_CLASS 12'h005
`define VX_DCR_BASE_STATE_END 12'h006
`define VX_DCR_BASE_STATE(addr) ((addr) - `VX_DCR_BASE_STATE_BEGIN) `define VX_DCR_BASE_STATE(addr) ((addr) - `VX_DCR_BASE_STATE_BEGIN)
`define VX_DCR_BASE_STATE_COUNT (`VX_DCR_BASE_STATE_END-`VX_DCR_BASE_STATE_BEGIN) `define VX_DCR_BASE_STATE_COUNT (`VX_DCR_BASE_STATE_END-`VX_DCR_BASE_STATE_BEGIN)
// Machine Performance-monitoring counters classes // Machine Performance-monitoring counters classes ////////////////////////////
`define VX_DCR_MPM_CLASS_NONE 0 `define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_CORE 1 `define VX_DCR_MPM_CLASS_CORE 1
`define VX_DCR_MPM_CLASS_MEM 2 `define VX_DCR_MPM_CLASS_MEM 2
// User Floating-Point CSRs // User Floating-Point CSRs ///////////////////////////////////////////////////
`define VX_CSR_FFLAGS 12'h001 `define VX_CSR_FFLAGS 12'h001
`define VX_CSR_FRM 12'h002 `define VX_CSR_FRM 12'h002
@ -52,7 +54,9 @@
`define VX_CSR_MIE 12'h304 `define VX_CSR_MIE 12'h304
`define VX_CSR_MTVEC 12'h305 `define VX_CSR_MTVEC 12'h305
`define VX_CSR_MSCRATCH 12'h340
`define VX_CSR_MEPC 12'h341 `define VX_CSR_MEPC 12'h341
`define VX_CSR_MCAUSE 12'h342
`define VX_CSR_MNSTATUS 12'h744 `define VX_CSR_MNSTATUS 12'h744
@ -61,52 +65,54 @@
`define VX_CSR_MPM_USER 12'hB03 `define VX_CSR_MPM_USER 12'hB03
`define VX_CSR_MPM_USER_H 12'hB83 `define VX_CSR_MPM_USER_H 12'hB83
// Machine Performance-monitoring core counters // Machine Performance-monitoring core counters (Standard) ////////////////////
// PERF: Standard
`define VX_CSR_MCYCLE 12'hB00 `define VX_CSR_MCYCLE 12'hB00
`define VX_CSR_MCYCLE_H 12'hB80 `define VX_CSR_MCYCLE_H 12'hB80
`define VX_CSR_MPM_RESERVED 12'hB01 `define VX_CSR_MPM_RESERVED 12'hB01
`define VX_CSR_MPM_RESERVED_H 12'hB81 `define VX_CSR_MPM_RESERVED_H 12'hB81
`define VX_CSR_MINSTRET 12'hB02 `define VX_CSR_MINSTRET 12'hB02
`define VX_CSR_MINSTRET_H 12'hB82 `define VX_CSR_MINSTRET_H 12'hB82
// Machine Performance-monitoring core counters (class 1) /////////////////////
// PERF: pipeline // PERF: pipeline
`define VX_CSR_MPM_SCHED_ST 12'hB03 `define VX_CSR_MPM_SCHED_ID 12'hB03
`define VX_CSR_MPM_SCHED_ST_H 12'hB83 `define VX_CSR_MPM_SCHED_ID_H 12'hB83
`define VX_CSR_MPM_FETCH_ST 12'hB04 `define VX_CSR_MPM_SCHED_ST 12'hB04
`define VX_CSR_MPM_FETCH_ST_H 12'hB84 `define VX_CSR_MPM_SCHED_ST_H 12'hB84
`define VX_CSR_MPM_IBUF_ST 12'hB05 `define VX_CSR_MPM_IBUF_ST 12'hB05
`define VX_CSR_MPM_IBUF_ST_H 12'hB85 `define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_SCRB_ST 12'hB06 `define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_SCRB_ST_H 12'hB86 `define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_ALU_ST 12'hB07 `define VX_CSR_MPM_OPDS_ST 12'hB07
`define VX_CSR_MPM_ALU_ST_H 12'hB87 `define VX_CSR_MPM_OPDS_ST_H 12'hB87
`define VX_CSR_MPM_LSU_ST 12'hB08 `define VX_CSR_MPM_SCRB_ALU 12'hB08
`define VX_CSR_MPM_LSU_ST_H 12'hB88 `define VX_CSR_MPM_SCRB_ALU_H 12'hB88
`define VX_CSR_MPM_FPU_ST 12'hB09 `define VX_CSR_MPM_SCRB_FPU 12'hB09
`define VX_CSR_MPM_FPU_ST_H 12'hB89 `define VX_CSR_MPM_SCRB_FPU_H 12'hB89
`define VX_CSR_MPM_SFU_ST 12'hB0A `define VX_CSR_MPM_SCRB_LSU 12'hB0A
`define VX_CSR_MPM_SFU_ST_H 12'hB8A `define VX_CSR_MPM_SCRB_LSU_H 12'hB8A
`define VX_CSR_MPM_SCRB_ALU 12'hB0B `define VX_CSR_MPM_SCRB_SFU 12'hB0B
`define VX_CSR_MPM_SCRB_ALU_H 12'hB8B `define VX_CSR_MPM_SCRB_SFU_H 12'hB8B
`define VX_CSR_MPM_SCRB_FPU 12'hB0C `define VX_CSR_MPM_SCRB_CSRS 12'hB0C
`define VX_CSR_MPM_SCRB_FPU_H 12'hB8C `define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_SCRB_LSU 12'hB0D `define VX_CSR_MPM_SCRB_WCTL 12'hB0D
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8D `define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
`define VX_CSR_MPM_SCRB_SFU 12'hB0E
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8E
// PERF: memory // PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0F `define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8F `define VX_CSR_MPM_IFETCHES_H 12'hB8E
`define VX_CSR_MPM_LOADS 12'hB10 `define VX_CSR_MPM_LOADS 12'hB0F
`define VX_CSR_MPM_LOADS_H 12'hB90 `define VX_CSR_MPM_LOADS_H 12'hB8F
`define VX_CSR_MPM_STORES 12'hB11 `define VX_CSR_MPM_STORES 12'hB10
`define VX_CSR_MPM_STORES_H 12'hB91 `define VX_CSR_MPM_STORES_H 12'hB90
`define VX_CSR_MPM_IFETCH_LAT 12'hB12 `define VX_CSR_MPM_IFETCH_LT 12'hB11
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB92 `define VX_CSR_MPM_IFETCH_LT_H 12'hB91
`define VX_CSR_MPM_LOAD_LAT 12'hB13 `define VX_CSR_MPM_LOAD_LT 12'hB12
`define VX_CSR_MPM_LOAD_LAT_H 12'hB93 `define VX_CSR_MPM_LOAD_LT_H 12'hB92
// Machine Performance-monitoring memory counters (class 2) ///////////////////
// Machine Performance-monitoring memory counters
// PERF: icache // PERF: icache
`define VX_CSR_MPM_ICACHE_READS 12'hB03 // total reads `define VX_CSR_MPM_ICACHE_READS 12'hB03 // total reads
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83 `define VX_CSR_MPM_ICACHE_READS_H 12'hB83
@ -158,17 +164,24 @@
`define VX_CSR_MPM_MEM_READS_H 12'hB98 `define VX_CSR_MPM_MEM_READS_H 12'hB98
`define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes `define VX_CSR_MPM_MEM_WRITES 12'hB19 // total writes
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99 `define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LAT 12'hB1A // memory latency `define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LAT_H 12'hB9A `define VX_CSR_MPM_MEM_LT_H 12'hB9A
// PERF: smem `define VX_CSR_MPM_MEM_BANK_CNTR 12'hB1E // memory bank requests
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads `define VX_CSR_MPM_MEM_BANK_CNTR_H 12'hB9E
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B `define VX_CSR_MPM_MEM_BANK_TICK 12'hB1F // memory ticks
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes `define VX_CSR_MPM_MEM_BANK_TICK_H 12'hB9F
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C // PERF: lmem
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts `define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D `define VX_CSR_MPM_LMEM_READS_H 12'hB9B
`define VX_CSR_MPM_LMEM_WRITES 12'hB1C // memory writes
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
// Machine Information Registers // Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..hB1F, hB83..hB9F>
// Machine Information Registers //////////////////////////////////////////////
`define VX_CSR_MVENDORID 12'hF11 `define VX_CSR_MVENDORID 12'hF11
`define VX_CSR_MARCHID 12'hF12 `define VX_CSR_MARCHID 12'hF12
@ -180,11 +193,12 @@
`define VX_CSR_THREAD_ID 12'hCC0 `define VX_CSR_THREAD_ID 12'hCC0
`define VX_CSR_WARP_ID 12'hCC1 `define VX_CSR_WARP_ID 12'hCC1
`define VX_CSR_CORE_ID 12'hCC2 `define VX_CSR_CORE_ID 12'hCC2
`define VX_CSR_WARP_MASK 12'hCC3 `define VX_CSR_ACTIVE_WARPS 12'hCC3
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM `define VX_CSR_ACTIVE_THREADS 12'hCC4 // warning! this value is also used in LLVM
`define VX_CSR_NUM_THREADS 12'hFC0 `define VX_CSR_NUM_THREADS 12'hFC0
`define VX_CSR_NUM_WARPS 12'hFC1 `define VX_CSR_NUM_WARPS 12'hFC1
`define VX_CSR_NUM_CORES 12'hFC2 `define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
`endif // VX_TYPES_VH `endif // VX_TYPES_VH

View file

@ -44,17 +44,17 @@ module Vortex import VX_gpu_pkg::*; (
output wire busy output wire busy
); );
`ifdef SCOPE
localparam scope_cluster = 0;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS);
`endif
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); VX_mem_perf_if mem_perf_if();
cache_perf_t perf_l3cache;
mem_perf_t mem_perf;
assign mem_perf_if.icache = 'x; assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x; assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x; assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.l3cache = perf_l3cache; assign mem_perf_if.lmem = 'x;
assign mem_perf_if.smem = 'x;
assign mem_perf_if.mem = mem_perf;
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(
@ -80,12 +80,14 @@ module Vortex import VX_gpu_pkg::*; (
.CRSQ_SIZE (`L3_CRSQ_SIZE), .CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE), .MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE), .MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE), .MREQ_SIZE (`L3_WRITEBACK ? `L3_MSHR_SIZE : `L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH), .TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1), .WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH), .UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2), .CORE_OUT_BUF (2),
.MEM_OUT_REG (2), .MEM_OUT_BUF (2),
.NC_ENABLE (1), .NC_ENABLE (1),
.PASSTHRU (!`L3_ENABLED) .PASSTHRU (!`L3_ENABLED)
) l3cache ( ) l3cache (
@ -93,7 +95,7 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset), .reset (l3_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (perf_l3cache), .cache_perf (mem_perf_if.l3cache),
`endif `endif
.core_bus_if (per_cluster_mem_bus_if), .core_bus_if (per_cluster_mem_bus_if),
@ -107,6 +109,7 @@ module Vortex import VX_gpu_pkg::*; (
assign mem_req_data = mem_bus_if.req_data.data; assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag; assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready; assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
assign mem_bus_if.rsp_valid = mem_rsp_valid; assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data; assign mem_bus_if.rsp_data.data = mem_rsp_data;
@ -118,15 +121,6 @@ module Vortex import VX_gpu_pkg::*; (
`UNUSED_VAR (mem_req_fire) `UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire) `UNUSED_VAR (mem_rsp_fire)
wire sim_ebreak /* verilator public */;
wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */;
wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak;
wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value;
assign sim_ebreak = per_cluster_sim_ebreak[0];
assign sim_wb_value = per_cluster_sim_wb_value[0];
`UNUSED_VAR (per_cluster_sim_ebreak)
`UNUSED_VAR (per_cluster_sim_wb_value)
VX_dcr_bus_if dcr_bus_if(); VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_wr_valid; assign dcr_bus_if.write_valid = dcr_wr_valid;
assign dcr_bus_if.write_addr = dcr_wr_addr; assign dcr_bus_if.write_addr = dcr_wr_addr;
@ -134,19 +128,19 @@ module Vortex import VX_gpu_pkg::*; (
wire [`NUM_CLUSTERS-1:0] per_cluster_busy; wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS)
// Generate all clusters // Generate all clusters
for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : clusters
`RESET_RELAY (cluster_reset, reset); `RESET_RELAY (cluster_reset, reset);
VX_dcr_bus_if cluster_dcr_bus_if();
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1)); `BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
VX_cluster #( VX_cluster #(
.CLUSTER_ID (i) .CLUSTER_ID (cluster_id),
.INSTANCE_ID ($sformatf("cluster%0d", cluster_id))
) cluster ( ) cluster (
`SCOPE_IO_BIND (i) `SCOPE_IO_BIND (scope_cluster + cluster_id)
.clk (clk), .clk (clk),
.reset (cluster_reset), .reset (cluster_reset),
@ -157,20 +151,18 @@ module Vortex import VX_gpu_pkg::*; (
.dcr_bus_if (cluster_dcr_bus_if), .dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[i]), .mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
.sim_ebreak (per_cluster_sim_ebreak[i]), .busy (per_cluster_busy[cluster_id])
.sim_wb_value (per_cluster_sim_wb_value[i]),
.busy (per_cluster_busy[i])
); );
end end
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1)); `BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
@ -181,32 +173,32 @@ module Vortex import VX_gpu_pkg::*; (
end end
end end
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
mem_perf <= '0; mem_perf <= '0;
end else begin end else begin
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(1); mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
end
if (mem_req_fire && mem_bus_if.req_data.rw) begin
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(1);
end
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads; mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end end
end end
assign mem_perf_if.mem = mem_perf;
`endif `endif
`ifdef DBG_TRACE_CORE_MEM `ifdef DBG_TRACE_MEM
always @(posedge clk) begin always @(posedge clk) begin
if (mem_req_fire) begin if (mem_req_fire) begin
if (mem_req_rw) if (mem_req_rw)
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data)); `TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else else
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen)); `TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
end end
if (mem_rsp_fire) begin if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data)); `TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data));
end end
end end
`endif `endif

View file

@ -15,7 +15,7 @@
module Vortex_axi import VX_gpu_pkg::*; #( module Vortex_axi import VX_gpu_pkg::*; #(
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH, parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = `XLEN, parameter AXI_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH, parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
parameter AXI_NUM_BANKS = 1 parameter AXI_NUM_BANKS = 1
)( )(
@ -83,7 +83,7 @@ module Vortex_axi import VX_gpu_pkg::*; #(
output wire busy output wire busy
); );
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH)) `STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `XLEN), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH)) `STATIC_ASSERT((AXI_ADDR_WIDTH >= `MEM_ADDR_WIDTH), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH))
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH)) //`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
wire mem_req_valid; wire mem_req_valid;
@ -99,8 +99,8 @@ module Vortex_axi import VX_gpu_pkg::*; #(
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag; wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready; wire mem_rsp_ready;
wire [`XLEN-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS]; wire [`MEM_ADDR_WIDTH-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
wire [`XLEN-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS]; wire [`MEM_ADDR_WIDTH-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS]; wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS]; wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
@ -109,8 +109,8 @@ module Vortex_axi import VX_gpu_pkg::*; #(
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS]; wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
assign m_axi_awaddr[i] = `XLEN'(m_axi_awaddr_unqual[i]); assign m_axi_awaddr[i] = `MEM_ADDR_WIDTH'(m_axi_awaddr_unqual[i]);
assign m_axi_araddr[i] = `XLEN'(m_axi_araddr_unqual[i]); assign m_axi_araddr[i] = `MEM_ADDR_WIDTH'(m_axi_araddr_unqual[i]);
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]); assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]); assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
@ -121,10 +121,10 @@ module Vortex_axi import VX_gpu_pkg::*; #(
VX_axi_adapter #( VX_axi_adapter #(
.DATA_WIDTH (`VX_MEM_DATA_WIDTH), .DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.ADDR_WIDTH (`XLEN), .ADDR_WIDTH (`MEM_ADDR_WIDTH),
.TAG_WIDTH (`VX_MEM_TAG_WIDTH), .TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.NUM_BANKS (AXI_NUM_BANKS), .NUM_BANKS (AXI_NUM_BANKS),
.OUT_REG_RSP((AXI_NUM_BANKS > 1) ? 2 : 0) .RSP_OUT_BUF((AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter ( ) axi_adapter (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),

View file

@ -5,6 +5,7 @@
// To be done: // To be done:
// Check how to run this with OPAE. Looks like setup issue // Check how to run this with OPAE. Looks like setup issue
`ifndef NOPAE
`include "platform_if.vh" `include "platform_if.vh"
@ -121,3 +122,5 @@ module ccip_std_afu #(
); );
endmodule endmodule
`endif

View file

@ -97,7 +97,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [127:0] afu_id = `AFU_ACCEL_UUID; wire [127:0] afu_id = `AFU_ACCEL_UUID;
wire [63:0] dev_caps = {16'b0, wire [63:0] dev_caps = {16'b0,
8'(`SM_ENABLED ? `SMEM_LOG_SIZE : 0), 8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS), 16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS), 8'(`NUM_WARPS),
8'(`NUM_THREADS), 8'(`NUM_THREADS),
@ -240,13 +240,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
MMIO_CMD_ARG0: begin MMIO_CMD_ARG0: begin
cmd_args[0] <= 64'(cp2af_sRxPort.c0.data); cmd_args[0] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data))); `TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif `endif
end end
MMIO_CMD_ARG1: begin MMIO_CMD_ARG1: begin
cmd_args[1] <= 64'(cp2af_sRxPort.c0.data); cmd_args[1] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data))); `TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif `endif
end end
MMIO_CMD_ARG2: begin MMIO_CMD_ARG2: begin
@ -263,13 +263,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
`ifdef SCOPE `ifdef SCOPE
MMIO_SCOPE_WRITE: begin MMIO_SCOPE_WRITE: begin
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%0h\n", $time, cmd_scope_wdata)); `TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%h\n", $time, cmd_scope_wdata));
`endif `endif
end end
`endif `endif
default: begin default: begin
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data))); `TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)));
`endif `endif
end end
endcase endcase
@ -305,14 +305,14 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
MMIO_SCOPE_READ: begin MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata; mmio_tx.data <= cmd_scope_rdata;
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%0h\n", $time, cmd_scope_rdata)); `TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata));
`endif `endif
end end
`endif `endif
MMIO_DEV_CAPS: begin MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps; mmio_tx.data <= dev_caps;
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%0h\n", $time, dev_caps)); `TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps));
`endif `endif
end end
MMIO_ISA_CAPS: begin MMIO_ISA_CAPS: begin
@ -475,6 +475,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.TAG_WIDTH (AVS_REQ_TAGW) .TAG_WIDTH (AVS_REQ_TAGW)
) cci_vx_mem_bus_if[2](); ) cci_vx_mem_bus_if[2]();
`RESET_RELAY (cci_adapter_reset, reset);
VX_mem_adapter #( VX_mem_adapter #(
.SRC_DATA_WIDTH (CCI_DATA_WIDTH), .SRC_DATA_WIDTH (CCI_DATA_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH), .DST_DATA_WIDTH (LMEM_DATA_WIDTH),
@ -482,11 +484,11 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH), .DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.SRC_TAG_WIDTH (CCI_ADDR_WIDTH), .SRC_TAG_WIDTH (CCI_ADDR_WIDTH),
.DST_TAG_WIDTH (AVS_REQ_TAGW), .DST_TAG_WIDTH (AVS_REQ_TAGW),
.OUT_REG_REQ (0), .REQ_OUT_BUF (0),
.OUT_REG_RSP (0) .RSP_OUT_BUF (0)
) cci_mem_adapter ( ) cci_mem_adapter (
.clk (clk), .clk (clk),
.reset (reset), .reset (cci_adapter_reset),
.mem_req_valid_in (cci_mem_req_valid), .mem_req_valid_in (cci_mem_req_valid),
.mem_req_addr_in (cci_mem_req_addr), .mem_req_addr_in (cci_mem_req_addr),
@ -515,6 +517,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_rsp_ready_out (cci_vx_mem_bus_if[1].rsp_ready) .mem_rsp_ready_out (cci_vx_mem_bus_if[1].rsp_ready)
); );
assign cci_vx_mem_bus_if[1].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[1].req_data.atype)
//-- //--
wire vx_mem_is_cout; wire vx_mem_is_cout;
@ -523,6 +528,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
assign vx_mem_req_valid_qual = vx_mem_req_valid && ~vx_mem_is_cout; assign vx_mem_req_valid_qual = vx_mem_req_valid && ~vx_mem_is_cout;
`RESET_RELAY (vx_adapter_reset, reset);
VX_mem_adapter #( VX_mem_adapter #(
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH), .SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH), .DST_DATA_WIDTH (LMEM_DATA_WIDTH),
@ -530,11 +537,11 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH), .DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH), .SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.DST_TAG_WIDTH (AVS_REQ_TAGW), .DST_TAG_WIDTH (AVS_REQ_TAGW),
.OUT_REG_REQ (0), .REQ_OUT_BUF (0),
.OUT_REG_RSP (2) .RSP_OUT_BUF (2)
) vx_mem_adapter ( ) vx_mem_adapter (
.clk (clk), .clk (clk),
.reset (reset), .reset (vx_adapter_reset),
.mem_req_valid_in (vx_mem_req_valid_qual), .mem_req_valid_in (vx_mem_req_valid_qual),
.mem_req_addr_in (vx_mem_req_addr), .mem_req_addr_in (vx_mem_req_addr),
@ -563,6 +570,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_rsp_ready_out (cci_vx_mem_bus_if[0].rsp_ready) .mem_rsp_ready_out (cci_vx_mem_bus_if[0].rsp_ready)
); );
assign cci_vx_mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[0].req_data.atype)
//-- //--
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (LMEM_DATA_SIZE), .DATA_SIZE (LMEM_DATA_SIZE),
@ -570,19 +580,17 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.TAG_WIDTH (AVS_REQ_TAGW+1) .TAG_WIDTH (AVS_REQ_TAGW+1)
) mem_bus_if[1](); ) mem_bus_if[1]();
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #( VX_mem_arb #(
.NUM_INPUTS (2), .NUM_INPUTS (2),
.DATA_SIZE (LMEM_DATA_SIZE), .DATA_SIZE (LMEM_DATA_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH), .ADDR_WIDTH (LMEM_ADDR_WIDTH),
.TAG_WIDTH (AVS_REQ_TAGW), .TAG_WIDTH (AVS_REQ_TAGW),
.ARBITER ("P"), .ARBITER ("P"), // prioritize VX requests
.OUT_REG_REQ (0), .REQ_OUT_BUF (0),
.OUT_REG_RSP (0) .RSP_OUT_BUF (0)
) mem_arb ( ) mem_arb (
.clk (clk), .clk (clk),
.reset (mem_arb_reset), .reset (reset),
.bus_in_if (cci_vx_mem_bus_if), .bus_in_if (cci_vx_mem_bus_if),
.bus_out_if (mem_bus_if) .bus_out_if (mem_bus_if)
); );
@ -598,8 +606,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.NUM_BANKS (NUM_LOCAL_MEM_BANKS), .NUM_BANKS (NUM_LOCAL_MEM_BANKS),
.TAG_WIDTH (AVS_REQ_TAGW + 1), .TAG_WIDTH (AVS_REQ_TAGW + 1),
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE), .RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE),
.OUT_REG_REQ (2), .REQ_OUT_BUF (2),
.OUT_REG_RSP (0) .RSP_OUT_BUF (0)
) avs_adapter ( ) avs_adapter (
.clk (clk), .clk (clk),
.reset (avs_adapter_reset), .reset (avs_adapter_reset),
@ -631,6 +639,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.avs_readdatavalid(avs_readdatavalid) .avs_readdatavalid(avs_readdatavalid)
); );
assign mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (mem_bus_if[0].req_data.atype)
// CCI-P Read Request /////////////////////////////////////////////////////////// // CCI-P Read Request ///////////////////////////////////////////////////////////
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr; reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr;
@ -679,9 +690,11 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.reset (reset), .reset (reset),
.incr (cci_rd_req_fire), .incr (cci_rd_req_fire),
.decr (cci_rdq_pop), .decr (cci_rdq_pop),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_reads_full), .full (cci_pending_reads_full),
.size (cci_pending_reads), `UNUSED_PIN (alm_full),
`UNUSED_PIN (empty) .size (cci_pending_reads)
); );
`UNUSED_VAR (cci_pending_reads) `UNUSED_VAR (cci_pending_reads)
@ -745,7 +758,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE); cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end end
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data)); `TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data));
`endif `endif
end end
@ -763,14 +776,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
end end
end end
`RESET_RELAY (cci_rdq_reset, reset);
VX_fifo_queue #( VX_fifo_queue #(
.DATAW (CCI_RD_QUEUE_DATAW), .DATAW (CCI_RD_QUEUE_DATAW),
.DEPTH (CCI_RD_QUEUE_SIZE) .DEPTH (CCI_RD_QUEUE_SIZE)
) cci_rd_req_queue ( ) cci_rd_req_queue (
.clk (clk), .clk (clk),
.reset (cci_rdq_reset), .reset (reset),
.push (cci_rdq_push), .push (cci_rdq_push),
.pop (cci_rdq_pop), .pop (cci_rdq_pop),
.data_in (cci_rdq_din), .data_in (cci_rdq_din),
@ -839,7 +850,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.incr (cci_mem_rd_rsp_fire), .incr (cci_mem_rd_rsp_fire),
.decr (cci_wr_rsp_fire), .decr (cci_wr_rsp_fire),
.empty (cci_pending_writes_empty), .empty (cci_pending_writes_empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_writes_full), .full (cci_pending_writes_full),
`UNUSED_PIN (alm_full),
.size (cci_pending_writes) .size (cci_pending_writes)
); );
@ -889,7 +902,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_wr_req_done <= 1; cci_wr_req_done <= 1;
end end
`ifdef DBG_TRACE_AFU `ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data)); `TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data));
`endif `endif
end end
@ -997,7 +1010,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
// SCOPE ////////////////////////////////////////////////////////////////////// // SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU `ifdef DBG_SCOPE_AFU
`ifdef SCOPE
wire mem_req_fire = mem_bus_if[0].req_valid && mem_bus_if[0].req_ready; wire mem_req_fire = mem_bus_if[0].req_valid && mem_bus_if[0].req_ready;
wire mem_rsp_fire = mem_bus_if[0].rsp_valid && mem_bus_if[0].rsp_ready; wire mem_rsp_fire = mem_bus_if[0].rsp_valid && mem_bus_if[0].rsp_ready;
wire avs_write_fire = avs_write[0] && ~avs_waitrequest[0]; wire avs_write_fire = avs_write[0] && ~avs_waitrequest[0];
@ -1067,7 +1079,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.bus_in(scope_bus_in_w[0]), .bus_in(scope_bus_in_w[0]),
.bus_out(scope_bus_out_w[0]) .bus_out(scope_bus_out_w[0])
); );
`endif
`else `else
`SCOPE_IO_UNUSED_W(0) `SCOPE_IO_UNUSED_W(0)
`endif `endif
@ -1078,13 +1089,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
always @(posedge clk) begin always @(posedge clk) begin
for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin
if (avs_write[i] && ~avs_waitrequest[i]) begin if (avs_write[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i])); `TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i]));
end end
if (avs_read[i] && ~avs_waitrequest[i]) begin if (avs_read[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i])); `TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i]));
end end
if (avs_readdatavalid[i]) begin if (avs_readdatavalid[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h\n", $time, i, avs_readdata[i])); `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h\n", $time, i, avs_readdata[i]));
end end
end end
end end

View file

@ -110,23 +110,25 @@ module VX_afu_ctrl #(
ADDR_DEV_0 = 8'h10, ADDR_DEV_0 = 8'h10,
ADDR_DEV_1 = 8'h14, ADDR_DEV_1 = 8'h14,
ADDR_DEV_CTRL = 8'h18, //ADDR_DEV_CTRL = 8'h18,
ADDR_ISA_0 = 8'h1C, ADDR_ISA_0 = 8'h1C,
ADDR_ISA_1 = 8'h20, ADDR_ISA_1 = 8'h20,
ADDR_ISA_CTRL = 8'h24, //ADDR_ISA_CTRL = 8'h24,
ADDR_DCR_0 = 8'h28, ADDR_DCR_0 = 8'h28,
ADDR_DCR_1 = 8'h2C, ADDR_DCR_1 = 8'h2C,
ADDR_DCR_CTRL = 8'h30, //ADDR_DCR_CTRL = 8'h30,
`ifdef SCOPE
ADDR_SCP_0 = 8'h34, ADDR_SCP_0 = 8'h34,
ADDR_SCP_1 = 8'h38, ADDR_SCP_1 = 8'h38,
ADDR_SCP_CTRL = 8'h3C, //ADDR_SCP_CTRL = 8'h3C,
`endif
ADDR_MEM_0 = 8'h40, ADDR_MEM_0 = 8'h40,
ADDR_MEM_1 = 8'h44, ADDR_MEM_1 = 8'h44,
ADDR_MEM_CTRL = 8'h48, //ADDR_MEM_CTRL = 8'h48,
ADDR_BITS = 8; ADDR_BITS = 8;
@ -141,7 +143,7 @@ module VX_afu_ctrl #(
// device caps // device caps
wire [63:0] dev_caps = {16'b0, wire [63:0] dev_caps = {16'b0,
8'(`SM_ENABLED ? `SMEM_LOG_SIZE : 0), 8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS), 16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS), 8'(`NUM_WARPS),
8'(`NUM_THREADS), 8'(`NUM_THREADS),
@ -317,10 +319,10 @@ module VX_afu_ctrl #(
end end
default: begin default: begin
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
if (waddr == (ADDR_MEM_0 + i * 12)) begin if (waddr == (ADDR_MEM_0 + 8'(i) * 8'd12)) begin
mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask); mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
end end
if (waddr == (ADDR_MEM_1 + i * 12)) begin if (waddr == (ADDR_MEM_1 + 8'(i) * 8'd12)) begin
mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask); mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
end end
end end

View file

@ -16,9 +16,9 @@
module VX_afu_wrap #( module VX_afu_wrap #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8, parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32, parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = 16, parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ADDR_WIDTH = 32, parameter C_M_AXI_MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = 512 parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
) ( ) (
// System signals // System signals
input wire ap_clk, input wire ap_clk,
@ -82,7 +82,6 @@ module VX_afu_wrap #(
// convert memory interface to array // convert memory interface to array
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON); `REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
wire clk = ap_clk;
wire reset = ~ap_rst_n; wire reset = ~ap_rst_n;
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr; reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
@ -237,8 +236,8 @@ module VX_afu_wrap #(
.dcr_wr_data (dcr_wr_data) .dcr_wr_data (dcr_wr_data)
); );
wire [`XLEN-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS]; wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`XLEN-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS]; wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]); assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
@ -249,7 +248,7 @@ module VX_afu_wrap #(
Vortex_axi #( Vortex_axi #(
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH), .AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.AXI_ADDR_WIDTH (`XLEN), .AXI_ADDR_WIDTH (`MEM_ADDR_WIDTH),
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH), .AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS) .AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) vortex_axi ( ) vortex_axi (
@ -312,7 +311,6 @@ module VX_afu_wrap #(
// SCOPE ////////////////////////////////////////////////////////////////////// // SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU `ifdef DBG_SCOPE_AFU
`ifdef SCOPE
`define TRIGGERS { \ `define TRIGGERS { \
reset, \ reset, \
ap_start, \ ap_start, \
@ -342,24 +340,6 @@ module VX_afu_wrap #(
.bus_in (scope_bus_in_w[0]), .bus_in (scope_bus_in_w[0]),
.bus_out (scope_bus_out_w[0]) .bus_out (scope_bus_out_w[0])
); );
`endif
`ifdef CHIPSCOPE
ila_afu ila_afu_inst (
.clk (ap_clk),
.probe0 ({
ap_start,
ap_done,
ap_idle,
interrupt
}),
.probe1 ({
vx_pending_writes,
vx_busy_wait,
vx_busy,
vx_running
})
);
`endif
`else `else
`SCOPE_IO_UNUSED_W(0) `SCOPE_IO_UNUSED_W(0)
`endif `endif
@ -397,13 +377,13 @@ module VX_afu_wrap #(
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i])); `TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
end end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i])); `TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]));
end end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i])); `TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
end end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i])); `TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
end end
end end
end end

127
hw/rtl/cache/VX_bank_flush.sv vendored Normal file
View file

@ -0,0 +1,127 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Per-bank flush sequencer.
//
// Out of reset the FSM starts in STATE_INIT and steps a counter through every
// line index with flush_init asserted, then parks in STATE_IDLE. When
// flush_begin is seen in STATE_IDLE, it waits for mshr_empty, then steps the
// counter with flush_valid asserted (advancing only on flush_ready), and
// finally raises flush_end for a single cycle before returning to idle.
// Banks other than bank 0 additionally wait for bank_empty before signalling
// completion.
module VX_bank_flush #(
    parameter BANK_ID    = 0,

    // Size of cache in bytes
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE  = 64,
    // Number of banks
    parameter NUM_BANKS  = 1,
    // Number of associative ways
    parameter NUM_WAYS   = 1,
    // Enable cache writeback
    parameter WRITEBACK  = 0
) (
    input wire clk,
    input wire reset,
    input wire flush_begin,
    output wire flush_end,
    output wire flush_init,
    output wire flush_valid,
    output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
    output wire [NUM_WAYS-1:0] flush_way,
    input wire flush_ready,
    input wire mshr_empty,
    input wire bank_empty
);
    // The counter only needs to cover the ways as well as the lines when
    // writeback (eviction) is enabled; otherwise iterating lines is enough.
    localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);

    // FSM state encoding
    localparam STATE_IDLE  = 0;
    localparam STATE_INIT  = 1;
    localparam STATE_WAIT1 = 2;
    localparam STATE_FLUSH = 3;
    localparam STATE_WAIT2 = 4;
    localparam STATE_DONE  = 5;

    reg [2:0] fsm_state_r, fsm_state_n;
    reg [CTR_WIDTH-1:0] flush_ctr_r;

    // last line index reached during the post-reset initialization walk
    wire init_last  = (flush_ctr_r == ((2 ** `CS_LINE_SEL_BITS)-1));
    // last flush entry reached and accepted downstream
    wire flush_last = (flush_ctr_r == ((2 ** CTR_WIDTH)-1) && flush_ready);
    // counter advances every cycle during init, and on accepted flush beats
    wire ctr_advance = (fsm_state_r == STATE_INIT)
                    || ((fsm_state_r == STATE_FLUSH) && flush_ready);

    // next-state logic
    always @(*) begin
        fsm_state_n = fsm_state_r; // hold the current state by default
        case (fsm_state_r)
        STATE_INIT: begin
            if (init_last) begin
                fsm_state_n = STATE_IDLE;
            end
        end
        STATE_IDLE: begin
            if (flush_begin) begin
                fsm_state_n = STATE_WAIT1;
            end
        end
        STATE_WAIT1: begin
            // hold off until all pending requests have completed
            if (mshr_empty) begin
                fsm_state_n = STATE_FLUSH;
            end
        end
        STATE_FLUSH: begin
            if (flush_last) begin
                fsm_state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
            end
        end
        STATE_WAIT2: begin
            // Make sure the bank has drained before notifying the cache flush
            // unit: the flush request to lower caches only goes through bank0,
            // and it is important that this request is sent out last.
            if (bank_empty) begin
                fsm_state_n = STATE_DONE;
            end
        end
        STATE_DONE: begin
            // single-cycle completion pulse
            fsm_state_n = STATE_IDLE;
        end
        default: begin
            // unused encodings: keep the current state
        end
        endcase
    end

    // state register
    always @(posedge clk) begin
        if (reset) begin
            fsm_state_r <= STATE_INIT;
        end else begin
            fsm_state_r <= fsm_state_n;
        end
    end

    // walk counter: cleared while idle, stepped during init/flush
    always @(posedge clk) begin
        if (reset) begin
            flush_ctr_r <= '0;
        end else begin
            if (fsm_state_r != STATE_IDLE) begin
                if (ctr_advance) begin
                    flush_ctr_r <= flush_ctr_r + CTR_WIDTH'(1);
                end
            end else begin
                flush_ctr_r <= '0;
            end
        end
    end

    // status outputs decode directly from the current state
    assign flush_init  = (fsm_state_r == STATE_INIT);
    assign flush_valid = (fsm_state_r == STATE_FLUSH);
    assign flush_end   = (fsm_state_r == STATE_DONE);

    // the low counter bits select the line
    assign flush_line = flush_ctr_r[`CS_LINE_SEL_BITS-1:0];

    if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
        // the upper counter bits select a single way (one-hot)
        wire [`CS_WAY_SEL_BITS-1:0] way_sel = flush_ctr_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS];
        reg [NUM_WAYS-1:0] way_onehot;
        always @(*) begin
            way_onehot = '0;
            way_onehot[way_sel] = 1'b1;
        end
        assign flush_way = way_onehot;
    end else begin
        // no per-way iteration: address all ways at once
        assign flush_way = {NUM_WAYS{1'b1}};
    end

endmodule

View file

@ -42,6 +42,12 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable cache writeable // Enable cache writeable
parameter WRITE_ENABLE = 1, parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier // Request debug identifier
parameter UUID_WIDTH = 0, parameter UUID_WIDTH = 0,
@ -49,10 +55,10 @@ module VX_cache import VX_gpu_pkg::*; #(
parameter TAG_WIDTH = UUID_WIDTH + 1, parameter TAG_WIDTH = UUID_WIDTH + 1,
// Core response output register // Core response output register
parameter CORE_OUT_REG = 0, parameter CORE_OUT_BUF = 0,
// Memory request output register // Memory request output register
parameter MEM_OUT_REG = 0 parameter MEM_OUT_BUF = 0
) ( ) (
// PERF // PERF
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
@ -66,8 +72,13 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if VX_mem_bus_if.master mem_bus_if
); );
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter")) `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter")) `STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
// In writeback mode, memory fill response may issue a new memory request to handle evicted blocks.
// We need to ensure that the memory request queue never fills up to avoid deadlock.
`STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS); localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS); localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
@ -79,35 +90,43 @@ module VX_cache import VX_gpu_pkg::*; #(
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS); localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS); localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS); localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH; localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH; localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1); localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1); localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank; wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
wire [NUM_BANKS-1:0] perf_write_miss_per_bank; wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank; wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
`endif `endif
wire [NUM_REQS-1:0] core_req_valid; VX_mem_bus_if #(
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr; .DATA_SIZE (WORD_SIZE),
wire [NUM_REQS-1:0] core_req_rw; .TAG_WIDTH (TAG_WIDTH)
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen; ) core_bus2_if[NUM_REQS]();
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin wire [NUM_BANKS-1:0] per_bank_flush_begin;
assign core_req_valid[i] = core_bus_if[i].req_valid; wire [NUM_BANKS-1:0] per_bank_flush_end;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_rw[i] = core_bus_if[i].req_data.rw; wire [NUM_BANKS-1:0] per_bank_core_req_fire;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_data[i] = core_bus_if[i].req_data.data; VX_cache_flush #(
assign core_req_tag[i] = core_bus_if[i].req_data.tag; .NUM_REQS (NUM_REQS),
assign core_bus_if[i].req_ready = core_req_ready[i]; .NUM_BANKS (NUM_BANKS),
end .BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_begin (per_bank_flush_begin),
.flush_end (per_bank_flush_end)
);
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
@ -117,23 +136,23 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s; wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s; wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_REQS; ++i) begin `RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT);
`RESET_RELAY (core_rsp_reset, reset); for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH), .DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_REQ_BUF_ENABLE ? `OUT_REG_TO_EB_SIZE(CORE_OUT_REG) : 0), .SIZE (CORE_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(CORE_OUT_REG)) .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf ( ) core_rsp_buf (
.clk (clk), .clk (clk),
.reset (core_rsp_reset), .reset (core_rsp_reset[i]),
.valid_in (core_rsp_valid_s[i]), .valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]), .ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}), .data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}), .data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid), .valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready) .ready_out (core_bus2_if[i].rsp_ready)
); );
end end
@ -146,25 +165,28 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [LINE_SIZE-1:0] mem_req_byteen_s; wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s; wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s; wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_flush_s;
wire mem_req_ready_s; wire mem_req_ready_s;
`RESET_RELAY (mem_req_buf_reset, reset); wire mem_bus_if_flush;
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH), .DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_BUF_ENABLE ? `OUT_REG_TO_EB_SIZE(MEM_OUT_REG) : 0), .SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(MEM_OUT_REG)) .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf ( ) mem_req_buf (
.clk (clk), .clk (clk),
.reset (mem_req_buf_reset), .reset (reset),
.valid_in (mem_req_valid_s), .valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s), .ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}), .data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}), .data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}),
.valid_out (mem_bus_if.req_valid), .valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready) .ready_out (mem_bus_if.req_ready)
); );
assign mem_bus_if.req_data.atype = mem_bus_if_flush ? `ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0;
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
// Memory response buffering // Memory response buffering
@ -173,15 +195,13 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s; wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s; wire mem_rsp_ready_s;
`RESET_RELAY (mem_rsp_reset, reset);
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH), .DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE), .SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2) .OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue ( ) mem_rsp_queue (
.clk (clk), .clk (clk),
.reset (mem_rsp_reset), .reset (reset),
.valid_in (mem_bus_if.rsp_valid), .valid_in (mem_bus_if.rsp_valid),
.ready_in (mem_bus_if.rsp_ready), .ready_in (mem_bus_if.rsp_ready),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}), .data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
@ -190,26 +210,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.ready_out (mem_rsp_ready_s) .ready_out (mem_rsp_ready_s)
); );
/////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
wire [`CS_LINE_SEL_BITS-1:0] init_line_sel;
wire init_enable;
`RESET_RELAY (init_reset, reset);
VX_cache_init #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS)
) cache_init (
.clk (clk),
.reset (init_reset),
.addr_out (init_line_sel),
.valid_out (init_enable)
);
///////////////////////////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid; wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr; wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
@ -219,6 +220,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data; wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag; wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx; wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
wire [NUM_BANKS-1:0] per_bank_core_req_ready; wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid; wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
@ -230,14 +232,16 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0] per_bank_mem_req_valid; wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr; wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw; wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_mem_req_wsel; wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen; wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id; wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready; wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready; wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
if (NUM_BANKS == 1) begin if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready; assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin end else begin
@ -246,12 +250,33 @@ module VX_cache import VX_gpu_pkg::*; #(
// Bank requests dispatch // Bank requests dispatch
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in; wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out; wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_flush;
wire [NUM_REQS-1:0] core_req_ready;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr; wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] core_req_bid; wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] core_req_bid;
wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0] core_req_wsel; wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0] core_req_wsel;
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus2_if[i].req_valid;
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
assign core_req_data[i] = core_bus2_if[i].req_data.data;
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
assign core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
assign core_bus2_if[i].req_ready = core_req_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin if (WORDS_PER_LINE > 1) begin
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS]; assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
@ -276,7 +301,9 @@ module VX_cache import VX_gpu_pkg::*; #(
core_req_wsel[i], core_req_wsel[i],
core_req_byteen[i], core_req_byteen[i],
core_req_data[i], core_req_data[i],
core_req_tag[i]}; core_req_tag[i],
core_req_flush[i]
};
end end
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
@ -289,7 +316,9 @@ module VX_cache import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_REQS), .NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS), .NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW), .DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS) .PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("F"),
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar ( ) req_xbar (
.clk (clk), .clk (clk),
.reset (req_xbar_reset), .reset (req_xbar_reset),
@ -315,25 +344,27 @@ module VX_cache import VX_gpu_pkg::*; #(
per_bank_core_req_wsel[i], per_bank_core_req_wsel[i],
per_bank_core_req_byteen[i], per_bank_core_req_byteen[i],
per_bank_core_req_data[i], per_bank_core_req_data[i],
per_bank_core_req_tag[i]} = core_req_data_out[i]; per_bank_core_req_tag[i],
per_bank_core_req_flush[i]
} = core_req_data_out[i];
end end
// Banks access // Banks access
for (genvar i = 0; i < NUM_BANKS; ++i) begin for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr; wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid; wire curr_bank_mem_rsp_valid;
if (NUM_BANKS == 1) begin if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s; assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
end else begin end else begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == i); assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == bank_id);
end end
`RESET_RELAY (bank_reset, reset); `RESET_RELAY (bank_reset, reset);
VX_cache_bank #( VX_cache_bank #(
.BANK_ID (i), .BANK_ID (bank_id),
.INSTANCE_ID (INSTANCE_ID), .INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)),
.CACHE_SIZE (CACHE_SIZE), .CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE), .LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS), .NUM_BANKS (NUM_BANKS),
@ -344,63 +375,65 @@ module VX_cache import VX_gpu_pkg::*; #(
.MSHR_SIZE (MSHR_SIZE), .MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE), .MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE), .WRITE_ENABLE (WRITE_ENABLE),
.DIRTY_BYTES (DIRTY_BYTES),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH), .UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH), .TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_REG (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_REG), .CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_REG) .MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
) bank ( ) bank (
.clk (clk), .clk (clk),
.reset (bank_reset), .reset (bank_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.perf_read_misses (perf_read_miss_per_bank[i]), .perf_read_misses (perf_read_miss_per_bank[bank_id]),
.perf_write_misses (perf_write_miss_per_bank[i]), .perf_write_misses (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[i]), .perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
`endif `endif
// Core request // Core request
.core_req_valid (per_bank_core_req_valid[i]), .core_req_valid (per_bank_core_req_valid[bank_id]),
.core_req_addr (per_bank_core_req_addr[i]), .core_req_addr (per_bank_core_req_addr[bank_id]),
.core_req_rw (per_bank_core_req_rw[i]), .core_req_rw (per_bank_core_req_rw[bank_id]),
.core_req_wsel (per_bank_core_req_wsel[i]), .core_req_wsel (per_bank_core_req_wsel[bank_id]),
.core_req_byteen (per_bank_core_req_byteen[i]), .core_req_byteen (per_bank_core_req_byteen[bank_id]),
.core_req_data (per_bank_core_req_data[i]), .core_req_data (per_bank_core_req_data[bank_id]),
.core_req_tag (per_bank_core_req_tag[i]), .core_req_tag (per_bank_core_req_tag[bank_id]),
.core_req_idx (per_bank_core_req_idx[i]), .core_req_idx (per_bank_core_req_idx[bank_id]),
.core_req_ready (per_bank_core_req_ready[i]), .core_req_flush (per_bank_core_req_flush[bank_id]),
.core_req_ready (per_bank_core_req_ready[bank_id]),
// Core response // Core response
.core_rsp_valid (per_bank_core_rsp_valid[i]), .core_rsp_valid (per_bank_core_rsp_valid[bank_id]),
.core_rsp_data (per_bank_core_rsp_data[i]), .core_rsp_data (per_bank_core_rsp_data[bank_id]),
.core_rsp_tag (per_bank_core_rsp_tag[i]), .core_rsp_tag (per_bank_core_rsp_tag[bank_id]),
.core_rsp_idx (per_bank_core_rsp_idx[i]), .core_rsp_idx (per_bank_core_rsp_idx[bank_id]),
.core_rsp_ready (per_bank_core_rsp_ready[i]), .core_rsp_ready (per_bank_core_rsp_ready[bank_id]),
// Memory request // Memory request
.mem_req_valid (per_bank_mem_req_valid[i]), .mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr), .mem_req_addr (curr_bank_mem_req_addr),
.mem_req_rw (per_bank_mem_req_rw[i]), .mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_wsel (per_bank_mem_req_wsel[i]), .mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[i]), .mem_req_data (per_bank_mem_req_data[bank_id]),
.mem_req_data (per_bank_mem_req_data[i]), .mem_req_id (per_bank_mem_req_id[bank_id]),
.mem_req_id (per_bank_mem_req_id[i]), .mem_req_flush (per_bank_mem_req_flush[bank_id]),
.mem_req_ready (per_bank_mem_req_ready[i]), .mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response // Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid), .mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s), .mem_rsp_data (mem_rsp_data_s),
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)), .mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[i]), .mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// initialization .flush_begin (per_bank_flush_begin[bank_id]),
.init_enable (init_enable), .flush_end (per_bank_flush_end[bank_id])
.init_line_sel (init_line_sel)
); );
if (NUM_BANKS == 1) begin if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr; assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
end else begin end else begin
assign per_bank_mem_req_addr[i] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i); assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
end end
end end
@ -418,7 +451,8 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_stream_xbar #( VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS), .NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS), .NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW) .DATAW (CORE_RSP_DATAW),
.ARBITER ("F")
) rsp_xbar ( ) rsp_xbar (
.clk (clk), .clk (clk),
.reset (rsp_xbar_reset), .reset (rsp_xbar_reset),
@ -442,39 +476,39 @@ module VX_cache import VX_gpu_pkg::*; #(
wire mem_req_valid_p; wire mem_req_valid_p;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p; wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
wire mem_req_rw_p; wire mem_req_rw_p;
wire [WORD_SEL_WIDTH-1:0] mem_req_wsel_p; wire [LINE_SIZE-1:0] mem_req_byteen_p;
wire [WORD_SIZE-1:0] mem_req_byteen_p; wire [`CS_LINE_WIDTH-1:0] mem_req_data_p;
wire [`CS_WORD_WIDTH-1:0] mem_req_data_p;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p; wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p; wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
wire mem_req_flush_p;
wire mem_req_ready_p; wire mem_req_ready_p;
// Memory request arbitration // Memory request arbitration
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH)-1:0] data_in; wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {per_bank_mem_req_addr[i], assign data_in[i] = {
per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i], per_bank_mem_req_rw[i],
per_bank_mem_req_wsel[i],
per_bank_mem_req_byteen[i], per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i], per_bank_mem_req_data[i],
per_bank_mem_req_id[i]}; per_bank_mem_req_id[i],
per_bank_mem_req_flush[i]
};
end end
`RESET_RELAY (mem_req_arb_reset, reset);
VX_stream_arb #( VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS), .NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + MSHR_ADDR_WIDTH), .DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
.ARBITER ("R") .ARBITER ("F")
) mem_req_arb ( ) mem_req_arb (
.clk (clk), .clk (clk),
.reset (mem_req_arb_reset), .reset (reset),
.valid_in (per_bank_mem_req_valid), .valid_in (per_bank_mem_req_valid),
.ready_in (per_bank_mem_req_ready), .ready_in (per_bank_mem_req_ready),
.data_in (data_in), .data_in (data_in),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_wsel_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p}), .data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}),
.valid_out (mem_req_valid_p), .valid_out (mem_req_valid_p),
.ready_out (mem_req_ready_p), .ready_out (mem_req_ready_p),
`UNUSED_PIN (sel_out) `UNUSED_PIN (sel_out)
@ -492,31 +526,15 @@ module VX_cache import VX_gpu_pkg::*; #(
assign mem_req_valid_s = mem_req_valid_p; assign mem_req_valid_s = mem_req_valid_p;
assign mem_req_addr_s = mem_req_addr_p; assign mem_req_addr_s = mem_req_addr_p;
assign mem_req_tag_s = mem_req_tag_p; assign mem_req_tag_s = mem_req_tag_p;
assign mem_req_flush_s = mem_req_flush_p;
assign mem_req_ready_p = mem_req_ready_s; assign mem_req_ready_p = mem_req_ready_s;
if (WRITE_ENABLE != 0) begin if (WRITE_ENABLE != 0) begin
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] mem_req_byteen_r;
reg [`CS_LINE_WIDTH-1:0] mem_req_data_r;
always @(*) begin
mem_req_byteen_r = '0;
mem_req_data_r = 'x;
mem_req_byteen_r[mem_req_wsel_p * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p;
mem_req_data_r[mem_req_wsel_p * `CS_WORD_WIDTH +: `CS_WORD_WIDTH] = mem_req_data_p;
end
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_r;
assign mem_req_data_s = mem_req_data_r;
end else begin
`UNUSED_VAR (mem_req_wsel_p)
assign mem_req_rw_s = mem_req_rw_p; assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p; assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p; assign mem_req_data_s = mem_req_data_p;
end
end else begin end else begin
`UNUSED_VAR (mem_req_byteen_p) `UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_wsel_p)
`UNUSED_VAR (mem_req_data_p) `UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p) `UNUSED_VAR (mem_req_rw_p)
@ -530,14 +548,17 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req = core_req_valid & core_req_ready & ~core_req_rw; wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req = core_req_valid & core_req_ready & core_req_rw; wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls // per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle; wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle; wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req); `POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req); `POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
@ -547,7 +568,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0] perf_crsp_stall_per_req; wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready; assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req); `POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);

View file

@ -41,17 +41,23 @@ module VX_cache_bank #(
// Enable cache writeable // Enable cache writeable
parameter WRITE_ENABLE = 1, parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier // Request debug identifier
parameter UUID_WIDTH = 0, parameter UUID_WIDTH = 0,
// core request tag size // core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1, parameter TAG_WIDTH = UUID_WIDTH + 1,
// Core response output register // Core response output buffer
parameter CORE_OUT_REG = 0, parameter CORE_OUT_BUF = 0,
// Memory request output register // Memory request output buffer
parameter MEM_OUT_REG = 0, parameter MEM_OUT_BUF = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE), parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS), parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
@ -69,12 +75,13 @@ module VX_cache_bank #(
// Core Request // Core Request
input wire core_req_valid, input wire core_req_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr, input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
input wire core_req_rw, input wire core_req_rw, // write enable
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, // select the word in a cacheline, e.g. word size = 4 bytes, cacheline size = 64 bytes, it should have log(64/4)= 4 bits
input wire [WORD_SIZE-1:0] core_req_byteen, input wire [WORD_SIZE-1:0] core_req_byteen,// which bytes in data to write
input wire [`CS_WORD_WIDTH-1:0] core_req_data, input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
input wire [TAG_WIDTH-1:0] core_req_tag, input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
input wire core_req_flush, // flush enable
output wire core_req_ready, output wire core_req_ready,
// Core Response // Core Response
@ -88,10 +95,10 @@ module VX_cache_bank #(
output wire mem_req_valid, output wire mem_req_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr, output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire mem_req_rw, output wire mem_req_rw,
output wire [WORD_SEL_WIDTH-1:0] mem_req_wsel, output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [WORD_SIZE-1:0] mem_req_byteen, output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [`CS_WORD_WIDTH-1:0] mem_req_data, output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, output wire mem_req_flush,
input wire mem_req_ready, input wire mem_req_ready,
// Memory response // Memory response
@ -100,18 +107,21 @@ module VX_cache_bank #(
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id, input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
output wire mem_rsp_ready, output wire mem_rsp_ready,
// initialization // flush
input wire init_enable, input wire flush_begin,
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel output wire flush_end
); );
localparam PIPELINE_STAGES = 2;
`IGNORE_UNUSED_BEGIN `IGNORE_UNUSED_BEGIN
wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1; wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1;
`IGNORE_UNUSED_END `IGNORE_UNUSED_END
wire crsq_stall; wire crsp_queue_stall;
wire mshr_alm_full; wire mshr_alm_full;
wire mreq_alm_full; wire mreq_queue_empty;
wire mreq_queue_alm_full;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr; wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
@ -126,102 +136,148 @@ module VX_cache_bank #(
wire [MSHR_ADDR_WIDTH-1:0] replay_id; wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready; wire replay_ready;
wire is_init_st0, is_init_st1;
wire is_flush_st0, is_flush_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1; wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire rw_st0, rw_st1; wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_st0, wsel_st1; wire rw_sel, rw_st0, rw_st1;
wire [WORD_SIZE-1:0] byteen_st0, byteen_st1; wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_st0, req_idx_st1; wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
wire [TAG_WIDTH-1:0] tag_st0, tag_st1; wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1; wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1; wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1; wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1; wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1; wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1; wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1; wire is_replay_st0, is_replay_st1;
wire creq_flush_sel, creq_flush_st0, creq_flush_st1;
wire evict_dirty_st0, evict_dirty_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0; wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_tail_st0, mshr_tail_st1; wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
wire mshr_pending_st0, mshr_pending_st1; wire mshr_pending_st0, mshr_pending_st1;
wire mshr_empty;
wire rdw_hazard_st0; wire flush_valid;
reg rdw_hazard_st1; wire init_valid;
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
wire [NUM_WAYS-1:0] flush_way;
wire flush_ready;
wire pipe_stall = crsq_stall || rdw_hazard_st1; // ensure we have no pending memory request in the bank
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
// flush unit
VX_bank_flush #(
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WRITEBACK (WRITEBACK)
) flush_unit (
.clk (clk),
.reset (reset),
.flush_begin (flush_begin),
.flush_end (flush_end),
.flush_init (init_valid),
.flush_valid (flush_valid),
.flush_line (flush_sel),
.flush_way (flush_way),
.flush_ready (flush_ready),
.mshr_empty (mshr_empty),
.bank_empty (no_pending_req)
);
wire rdw_hazard1_sel;
wire rdw_hazard2_sel;
reg rdw_hazard3_st1;
wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1;
// inputs arbitration: // inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss. // mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss. // handle memory responses next to prevent deadlock with potential memory request from a miss.
wire replay_grant = ~init_enable; // flush has precedence over core requests to ensure that the cache is in a consistent state.
wire replay_grant = ~init_valid;
wire replay_enable = replay_grant && replay_valid; wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~init_enable && ~replay_enable; wire fill_grant = ~init_valid && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid; wire fill_enable = fill_grant && mem_rsp_valid;
wire creq_grant = ~init_enable && ~replay_enable && ~fill_enable; wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && flush_valid;
wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_enable = creq_grant && core_req_valid; wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant assign replay_ready = replay_grant
&& ~rdw_hazard_st0 && ~rdw_hazard1_sel
&& ~pipe_stall; && ~pipe_stall;
assign mem_rsp_ready = fill_grant assign mem_rsp_ready = fill_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign flush_ready = flush_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall; && ~pipe_stall;
assign core_req_ready = creq_grant assign core_req_ready = creq_grant
&& ~mreq_alm_full && ~mreq_queue_alm_full
&& ~mshr_alm_full && ~mshr_alm_full
&& ~pipe_stall; && ~pipe_stall;
wire init_fire = init_enable; wire init_fire = init_valid;
wire replay_fire = replay_valid && replay_ready; wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready; wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = flush_valid && flush_ready;
wire core_req_fire = core_req_valid && core_req_ready; wire core_req_fire = core_req_valid && core_req_ready;
wire [TAG_WIDTH-1:0] mshr_creq_tag = replay_enable ? replay_tag : core_req_tag; assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
assign creq_flush_sel = core_req_valid && core_req_flush;
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
if (WRITE_ENABLE) begin
assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data);
end else begin
assign data_sel[`CS_WORD_WIDTH-1:0] = mem_rsp_data[`CS_WORD_WIDTH-1:0];
`UNUSED_VAR (core_req_data)
`UNUSED_VAR (replay_data)
end
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
if (UUID_WIDTH != 0) begin if (UUID_WIDTH != 0) begin
assign req_uuid_sel = mshr_creq_tag[TAG_WIDTH-1 -: UUID_WIDTH]; assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin end else begin
assign req_uuid_sel = 0; assign req_uuid_sel = 0;
end end
`UNUSED_VAR (mshr_creq_tag)
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || core_req_fire;
assign addr_sel = init_enable ? `CS_LINE_ADDR_WIDTH'(init_line_sel) :
(replay_valid ? replay_addr :
(mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign data_sel[`CS_WORD_WIDTH-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : (replay_valid ? replay_data : core_req_data);
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i];
end
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH), .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1) .RESETW (1)
) pipe_reg0 ( ) pipe_reg0 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~pipe_stall), .enable (~pipe_stall),
.data_in ({ .data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
valid_sel, .data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
init_enable,
replay_enable,
fill_enable,
creq_enable,
addr_sel,
data_sel,
replay_valid ? replay_rw : core_req_rw,
replay_valid ? replay_byteen : core_req_byteen,
replay_valid ? replay_wsel : core_req_wsel,
replay_valid ? replay_idx : core_req_idx,
replay_valid ? replay_tag : core_req_tag,
replay_id
}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_creq_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
); );
if (UUID_WIDTH != 0) begin if (UUID_WIDTH != 0) begin
@ -230,59 +286,81 @@ module VX_cache_bank #(
assign req_uuid_st0 = 0; assign req_uuid_st0 = 0;
end end
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_init_st0 = valid_st0 && is_init_st0; wire do_init_st0 = valid_st0 && is_init_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0); wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0;
wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0;
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0]; wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1; assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
`RESET_RELAY (tag_reset, reset); wire [NUM_WAYS-1:0] evict_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
VX_cache_tags #( VX_cache_tags #(
.INSTANCE_ID(INSTANCE_ID), .INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
.BANK_ID (BANK_ID), .BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE), .CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE), .LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS), .NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS), .NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE), .WORD_SIZE (WORD_SIZE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH) .UUID_WIDTH (UUID_WIDTH)
) cache_tags ( ) cache_tags (
.clk (clk), .clk (clk),
.reset (tag_reset), .reset (reset),
.req_uuid (req_uuid_st0), .req_uuid (req_uuid_st0),
.stall (pipe_stall), .stall (pipe_stall),
// read/Fill // init/flush/fill/write/lookup
.init (do_init_st0),
.flush (do_flush_st0),
.fill (do_fill_st0),
.write (do_cache_wr_st0),
.lookup (do_lookup_st0), .lookup (do_lookup_st0),
.line_addr (addr_st0), .line_addr (addr_st0),
.fill (do_fill_st0), .way_sel (flush_way_st0),
.init (do_init_st0), .tag_matches(tag_matches_st0),
.way_sel (way_sel_st0),
.tag_matches(tag_matches_st0) // replacement
.evict_dirty(evict_dirty_st0),
.evict_way (evict_way_st0),
.evict_tag (evict_tag_st0)
); );
wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0;
wire is_flush2_st0 = WRITEBACK && is_flush_st0;
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0; assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0;
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1), .DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
.RESETW (1) .RESETW (1)
) pipe_reg1 ( ) pipe_reg1 (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (~pipe_stall), .enable (~pipe_stall),
.data_in ({valid_st0, is_replay_st0, is_fill_st0, is_creq_st0, rw_st0, addr_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_tail_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}), .data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_replay_st1, is_fill_st1, is_creq_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_tail_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1}) .data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
); );
// we have a tag hit // we have a tag hit
wire is_hit_st1 = (| tag_matches_st1); wire is_hit_st1 = (| way_sel_st1);
if (UUID_WIDTH != 0) begin if (UUID_WIDTH != 0) begin
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH]; assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
@ -290,9 +368,15 @@ module VX_cache_bank #(
assign req_uuid_st1 = 0; assign req_uuid_st1 = 0;
end end
wire do_creq_rd_st1 = valid_st1 && is_creq_st1 && ~rw_st1; wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire do_creq_wr_st1 = valid_st1 && is_creq_st1 && rw_st1; wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_init_st1 = valid_st1 && is_init_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1; wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1; wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1; wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
@ -302,25 +386,46 @@ module VX_cache_bank #(
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1; wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1; wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0];
`UNUSED_VAR (do_write_miss_st1) `UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always gets a hit // ensure mshr replay always gets a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay")); `RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay"));
// detect BRAM's read-during-write hazard // both tag and data stores use BRAM with no read-during-write protection.
assign rdw_hazard_st0 = do_fill_st0; // after a fill // we need to stall the pipeline to prevent read-after-write hazards.
assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill
assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceding write
always @(posedge clk) begin always @(posedge clk) begin
rdw_hazard_st1 <= (do_creq_rd_st0 && do_write_hit_st1 && (addr_st0 == addr_st1)) // stall reads following writes to same line address
&& ~rdw_hazard_st1; // after a write to same address rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1)
&& ~rdw_hazard3_st1; // release pipeline stall
end end
wire [`CS_WORD_WIDTH-1:0] write_data_st1 = data_st1[`CS_WORD_WIDTH-1:0]; wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1; wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
wire [LINE_SIZE-1:0] write_byteen_st1;
`RESET_RELAY (data_reset, reset); wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] write_byteen_r;
always @(*) begin
write_byteen_r = '0;
write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1;
end
assign write_byteen_st1 = write_byteen_r;
end else begin
assign write_byteen_st1 = byteen_st1;
end
VX_cache_data #( VX_cache_data #(
.INSTANCE_ID (INSTANCE_ID), .INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
.BANK_ID (BANK_ID), .BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE), .CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE), .LINE_SIZE (LINE_SIZE),
@ -328,32 +433,49 @@ module VX_cache_bank #(
.NUM_WAYS (NUM_WAYS), .NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE), .WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE), .WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH) .UUID_WIDTH (UUID_WIDTH)
) cache_data ( ) cache_data (
.clk (clk), .clk (clk),
.reset (data_reset), .reset (reset),
.req_uuid (req_uuid_st1), .req_uuid (req_uuid_st1),
.stall (pipe_stall), .stall (pipe_stall),
.read (do_read_hit_st1 || do_replay_rd_st1), .init (do_init_st1),
.read (do_cache_rd_st1),
.fill (do_fill_st1), .fill (do_fill_st1),
.write (do_write_hit_st1 || do_replay_wr_st1), .flush (do_flush_st1),
.way_sel (way_sel_st1 | tag_matches_st1), .write (do_cache_wr_st1),
.way_sel (way_sel_st1),
.line_addr (addr_st1), .line_addr (addr_st1),
.wsel (wsel_st1), .wsel (wsel_st1),
.byteen (byteen_st1),
.fill_data (fill_data_st1), .fill_data (fill_data_st1),
.write_data (write_data_st1), .write_data (write_data_st1),
.read_data (read_data_st1) .write_byteen(write_byteen_st1),
.read_data (read_data_st1),
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
); );
wire [MSHR_SIZE-1:0] mshr_matches_st0; wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0;
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall; wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0; wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall; wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
wire mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
// release allocated mshr entry if we had a hit
wire mshr_release_st1;
if (WRITEBACK) begin
assign mshr_release_st1 = is_hit_st1;
end else begin
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
// this can happen when writes are sent late, when the fill was already in flight.
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
end
VX_pending_size #( VX_pending_size #(
.SIZE (MSHR_SIZE) .SIZE (MSHR_SIZE)
@ -362,15 +484,15 @@ module VX_cache_bank #(
.reset (reset), .reset (reset),
.incr (core_req_fire), .incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)), .decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
.empty (mshr_empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full), .full (mshr_alm_full),
`UNUSED_PIN (size), `UNUSED_PIN (alm_full),
`UNUSED_PIN (empty) `UNUSED_PIN (size)
); );
`RESET_RELAY (mshr_reset, reset);
VX_cache_mshr #( VX_cache_mshr #(
.INSTANCE_ID (INSTANCE_ID), .INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
.BANK_ID (BANK_ID), .BANK_ID (BANK_ID),
.LINE_SIZE (LINE_SIZE), .LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS), .NUM_BANKS (NUM_BANKS),
@ -379,7 +501,7 @@ module VX_cache_bank #(
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH) .DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
) cache_mshr ( ) cache_mshr (
.clk (clk), .clk (clk),
.reset (mshr_reset), .reset (reset),
.deq_req_uuid (req_uuid_sel), .deq_req_uuid (req_uuid_sel),
.lkp_req_uuid (req_uuid_st0), .lkp_req_uuid (req_uuid_st0),
@ -404,104 +526,128 @@ module VX_cache_bank #(
.allocate_rw (rw_st0), .allocate_rw (rw_st0),
.allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}), .allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}),
.allocate_id (mshr_alloc_id_st0), .allocate_id (mshr_alloc_id_st0),
.allocate_tail (mshr_tail_st0), .allocate_prev (mshr_prev_st0),
`UNUSED_PIN (allocate_ready), `UNUSED_PIN (allocate_ready),
// lookup // lookup
.lookup_valid (mshr_lookup_st0), .lookup_valid (mshr_lookup_st0),
.lookup_addr (addr_st0), .lookup_addr (addr_st0),
.lookup_matches (mshr_matches_st0), .lookup_pending (mshr_lookup_pending_st0),
.lookup_rw (mshr_lookup_rw_st0),
// finalize // finalize
.finalize_valid (mshr_finalize_st1), .finalize_valid (mshr_finalize_st1),
.finalize_release(mshr_release_st1), .finalize_release(mshr_release_st1),
.finalize_pending(mshr_pending_st1), .finalize_pending(mshr_pending_st1),
.finalize_id (mshr_id_st1), .finalize_id (mshr_id_st1),
.finalize_tail (mshr_tail_st1) .finalize_prev (mshr_prev_st1)
); );
// ignore allocated id from mshr matches // check if there are pending requests to same line in the MSHR
wire [MSHR_SIZE-1:0] lookup_matches; wire [MSHR_SIZE-1:0] lookup_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = (i != mshr_alloc_id_st0) && mshr_matches_st0[i]; assign lookup_matches[i] = mshr_lookup_pending_st0[i]
&& (i != mshr_alloc_id_st0) // exclude current mshr id
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
end end
assign mshr_pending_st0 = (| lookup_matches); assign mshr_pending_st0 = (| lookup_matches);
// schedule core response // schedule core response
wire crsq_valid, crsq_ready; wire crsp_queue_valid, crsp_queue_ready;
wire [`CS_WORD_WIDTH-1:0] crsq_data; wire [`CS_WORD_WIDTH-1:0] crsp_queue_data;
wire [REQ_SEL_WIDTH-1:0] crsq_idx; wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
wire [TAG_WIDTH-1:0] crsq_tag; wire [TAG_WIDTH-1:0] crsp_queue_tag;
assign crsq_valid = do_read_hit_st1 || do_replay_rd_st1; assign crsp_queue_valid = do_cache_rd_st1;
assign crsq_idx = req_idx_st1; assign crsp_queue_idx = req_idx_st1;
assign crsq_data = read_data_st1; assign crsp_queue_data = read_data_st1;
assign crsq_tag = tag_st1; assign crsp_queue_tag = tag_st1;
`RESET_RELAY (crsp_reset, reset);
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH), .DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
.SIZE (CRSQ_SIZE), .SIZE (CRSQ_SIZE),
.OUT_REG (CORE_OUT_REG) .OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_queue ( ) core_rsp_queue (
.clk (clk), .clk (clk),
.reset (crsp_reset), .reset (reset),
.valid_in (crsq_valid && ~rdw_hazard_st1), .valid_in (crsp_queue_valid && ~rdw_hazard3_st1),
.ready_in (crsq_ready), .ready_in (crsp_queue_ready),
.data_in ({crsq_tag, crsq_data, crsq_idx}), .data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}), .data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
.valid_out (core_rsp_valid), .valid_out (core_rsp_valid),
.ready_out (core_rsp_ready) .ready_out (core_rsp_ready)
); );
assign crsq_stall = crsq_valid && ~crsq_ready; assign crsp_queue_stall = crsp_queue_valid && ~crsp_queue_ready;
// schedule memory request // schedule memory request
wire mreq_push, mreq_pop, mreq_empty; wire mreq_queue_push, mreq_queue_pop;
wire [`CS_WORD_WIDTH-1:0] mreq_data; wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [WORD_SIZE-1:0] mreq_byteen; wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [WORD_SEL_WIDTH-1:0] mreq_wsel; wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_addr; wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
wire [MSHR_ADDR_WIDTH-1:0] mreq_id; wire mreq_queue_rw;
wire mreq_rw; wire mreq_queue_flush;
assign mreq_push = (do_read_miss_st1 && ~mshr_pending_st1) wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1;
|| do_creq_wr_st1; wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1;
assign mreq_pop = mem_req_valid && mem_req_ready; if (WRITEBACK) begin
if (DIRTY_BYTES) begin
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| dirty_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID)));
end
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~rdw_hazard3_st1;
end else begin
`UNUSED_VAR (do_writeback_st1)
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1)
&& ~rdw_hazard3_st1;
end
assign mreq_rw = WRITE_ENABLE && rw_st1; assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_addr = addr_st1; assign mreq_queue_addr = addr_st1;
assign mreq_id = mshr_id_st1; assign mreq_queue_id = mshr_id_st1;
assign mreq_wsel = wsel_st1; assign mreq_queue_flush = creq_flush_st1;
assign mreq_byteen = byteen_st1;
assign mreq_data = write_data_st1;
`RESET_RELAY (mreq_reset, reset); if (WRITE_ENABLE) begin
assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1;
assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1;
assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1;
end else begin
assign mreq_queue_rw = 0;
assign mreq_queue_data = 0;
assign mreq_queue_byteen = 0;
`UNUSED_VAR (dirty_data_st1)
`UNUSED_VAR (dirty_byteen_st1)
end
VX_fifo_queue #( VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH), .DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
.DEPTH (MREQ_SIZE), .DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2), .ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
.OUT_REG (MEM_OUT_REG) .OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_queue ( ) mem_req_queue (
.clk (clk), .clk (clk),
.reset (mreq_reset), .reset (reset),
.push (mreq_push), .push (mreq_queue_push),
.pop (mreq_pop), .pop (mreq_queue_pop),
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}), .data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}), .data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}),
.empty (mreq_empty), .empty (mreq_queue_empty),
.alm_full (mreq_alm_full), .alm_full (mreq_queue_alm_full),
`UNUSED_PIN (full), `UNUSED_PIN (full),
`UNUSED_PIN (alm_empty), `UNUSED_PIN (alm_empty),
`UNUSED_PIN (size) `UNUSED_PIN (size)
); );
assign mem_req_valid = ~mreq_empty; assign mem_req_valid = ~mreq_queue_empty;
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -511,37 +657,36 @@ module VX_cache_bank #(
assign perf_mshr_stalls = mshr_alm_full; assign perf_mshr_stalls = mshr_alm_full;
`endif `endif
`ifdef DBG_TRACE_CACHE_BANK `ifdef DBG_TRACE_CACHE
wire crsq_fire = crsq_valid && crsq_ready; wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid) wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire); && ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
always @(posedge clk) begin always @(posedge clk) begin
if (pipeline_stall) begin if (input_stall || pipe_stall) begin
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full)); `TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1));
end
if (init_enable) begin
`TRACE(2, ("%d: %s-bank%0d init: addr=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
end end
if (mem_rsp_fire) begin if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s-bank%0d fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data)); `TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
end end
if (replay_fire) begin if (replay_fire) begin
`TRACE(2, ("%d: %s-bank%0d mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel)); `TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
end end
if (core_req_fire) begin if (core_req_fire) begin
if (core_req_rw) if (core_req_rw)
`TRACE(2, ("%d: %s-bank%0d core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel)); `TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else else
`TRACE(2, ("%d: %s-bank%0d core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel)); `TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
end end
if (crsq_fire) begin if (crsp_queue_fire) begin
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_idx, crsq_data, req_uuid_st1)); `TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
end end
if (mreq_push) begin if (mreq_queue_push) begin
if (do_creq_wr_st1) if (do_creq_wr_st1 && !WRITEBACK)
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_byteen, mreq_data, req_uuid_st1)); `TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else if (do_writeback_st1)
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
else else
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_id, req_uuid_st1)); `TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
end end
end end
`endif `endif

View file

@ -11,211 +11,176 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
`include "VX_platform.vh" `include "VX_cache_define.vh"
module VX_cache_bypass #( module VX_cache_bypass #(
parameter NUM_REQS = 1, parameter NUM_REQS = 1,
parameter NC_TAG_BIT = 0, parameter TAG_SEL_IDX = 0,
parameter NC_ENABLE = 0,
parameter PASSTHRU = 0, parameter PASSTHRU = 0,
parameter NC_ENABLE = 0,
parameter WORD_SIZE = 1,
parameter LINE_SIZE = 1,
parameter CORE_ADDR_WIDTH = 1, parameter CORE_ADDR_WIDTH = 1,
parameter CORE_DATA_SIZE = 1,
parameter CORE_TAG_IN_WIDTH = 1, parameter CORE_TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1, parameter MEM_ADDR_WIDTH = 1,
parameter MEM_DATA_SIZE = 1,
parameter MEM_TAG_IN_WIDTH = 1, parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1, parameter MEM_TAG_OUT_WIDTH = 1,
parameter UUID_WIDTH = 0, parameter UUID_WIDTH = 0,
parameter CORE_DATA_WIDTH = CORE_DATA_SIZE * 8, parameter CORE_OUT_BUF = 0,
parameter MEM_DATA_WIDTH = MEM_DATA_SIZE * 8, parameter MEM_OUT_BUF = 0,
parameter CORE_TAG_OUT_WIDTH= CORE_TAG_IN_WIDTH - NC_ENABLE
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
// Core request in // Core request in
input wire [NUM_REQS-1:0] core_req_valid_in, VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
input wire [NUM_REQS-1:0] core_req_rw_in,
input wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_in,
input wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_in,
input wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_req_tag_in,
output wire [NUM_REQS-1:0] core_req_ready_in,
// Core request out // Core request out
output wire [NUM_REQS-1:0] core_req_valid_out, VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
output wire [NUM_REQS-1:0] core_req_rw_out,
output wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_out,
output wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_out,
output wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_req_tag_out,
input wire [NUM_REQS-1:0] core_req_ready_out,
// Core response in
input wire [NUM_REQS-1:0] core_rsp_valid_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_in,
input wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_rsp_tag_in,
output wire [NUM_REQS-1:0] core_rsp_ready_in,
// Core response out
output wire [NUM_REQS-1:0] core_rsp_valid_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_out,
output wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out,
input wire [NUM_REQS-1:0] core_rsp_ready_out,
// Memory request in // Memory request in
input wire mem_req_valid_in, VX_mem_bus_if.slave mem_bus_in_if,
input wire mem_req_rw_in,
input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in,
input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
// Memory request out // Memory request out
output wire mem_req_valid_out, VX_mem_bus_if.master mem_bus_out_if
output wire mem_req_rw_out,
output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out,
output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
// Memory response in
input wire mem_rsp_valid_in,
input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_in,
input wire [MEM_TAG_OUT_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_ready_in,
// Memory response out
output wire mem_rsp_valid_out,
output wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_out,
output wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_out,
input wire mem_rsp_ready_out
); );
`UNUSED_VAR (clk) localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
`UNUSED_VAR (reset)
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS); localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam MUX_DATAW = CORE_TAG_IN_WIDTH + CORE_DATA_WIDTH + CORE_DATA_SIZE + CORE_ADDR_WIDTH + 1; localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam WORDS_PER_LINE = MEM_DATA_SIZE / CORE_DATA_SIZE; localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE); localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam CORE_TAG_ID_BITS = CORE_TAG_IN_WIDTH - UUID_WIDTH; localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS; localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
localparam MEM_TAG_OUT_NC_WIDTH = MEM_TAG_OUT_WIDTH - 1 + NC_ENABLE; `STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
// core request handling // handle core requests ///////////////////////////////////////////////////
wire [NUM_REQS-1:0] core_req_valid_in_nc; wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_valids;
wire [NUM_REQS-1:0] core_req_nc_idxs; wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx; wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel; wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_valid; wire core_req_nc_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU != 0) begin if (PASSTHRU != 0) begin
assign core_req_nc_idxs[i] = 1'b1; assign core_req_nc_idxs[i] = 1'b1;
end else if (NC_ENABLE) begin
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO];
end else begin end else begin
assign core_req_nc_idxs[i] = core_req_tag_in[i][NC_TAG_BIT]; assign core_req_nc_idxs[i] = 1'b0;
end end
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
end end
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_idxs;
wire core_req_in_fire = | (core_req_valid_in & core_req_ready_in);
VX_generic_arbiter #( VX_generic_arbiter #(
.NUM_REQS (NUM_REQS), .NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P"), .TYPE (PASSTHRU ? "R" : "P")
.LOCK_ENABLE (1) ) core_req_nc_arb (
) req_arb (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.unlock (core_req_in_fire), .requests (core_req_nc_valids),
.requests (core_req_valid_in_nc),
.grant_index (core_req_nc_idx), .grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel), .grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid) .grant_valid (core_req_nc_valid),
.grant_ready (core_req_nc_ready)
); );
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_idxs;
assign core_req_rw_out = core_req_rw_in;
assign core_req_addr_out = core_req_addr_in;
assign core_req_byteen_out = core_req_byteen_in;
assign core_req_data_out = core_req_data_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_bits_remove #( assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
.N (CORE_TAG_IN_WIDTH), assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
.S (NC_ENABLE), assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
.POS (NC_TAG_BIT) : core_bus_out_if[i].req_ready;
) core_req_tag_nc_remove (
.data_in (core_req_tag_in[i]),
.data_out (core_req_tag_out[i])
);
end end
for (genvar i = 0; i < NUM_REQS; ++i) begin // handle memory requests /////////////////////////////////////////////////
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i])
: core_req_ready_out[i];
end
// memory request handling wire mem_req_out_valid;
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid; wire core_req_nc_sel_rw;
assign mem_req_ready_in = mem_req_ready_out; wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel; wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype;
wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel; wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel; wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel;
wire core_req_rw_in_sel;
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in; wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]}; assign core_req_nc_mux_in[i] = {
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.atype,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.tag
};
end end
assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = core_req_nc_mux_in[core_req_nc_idx];
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_tag_in_sel[CORE_TAG_ID_BITS-1:0]; assign {
core_req_nc_sel_rw,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_atype,
core_req_nc_sel_data,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel; assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype;
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass; wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
if (WORDS_PER_LINE > 1) begin if (WORDS_PER_LINE > 1) begin
reg [WORDS_PER_LINE-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r; reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r; reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [WSEL_BITS-1:0] req_wsel = core_req_addr_in_sel[WSEL_BITS-1:0]; wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
always @(*) begin always @(*) begin
mem_req_byteen_in_r = '0; mem_req_byteen_in_r = '0;
mem_req_byteen_in_r[req_wsel] = core_req_byteen_in_sel; mem_req_byteen_in_r[req_wsel] = core_req_nc_sel_byteen;
mem_req_data_in_r = 'x; mem_req_data_in_r = 'x;
mem_req_data_in_r[req_wsel] = core_req_data_in_sel; mem_req_data_in_r[req_wsel] = core_req_nc_sel_data;
end end
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r; assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r; assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
if (NUM_REQS > 1) begin if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id}); assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id}); assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
end end
end else begin end else begin
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel; assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel; assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
if (NUM_REQS > 1) begin if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id}); assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin end else begin
@ -223,75 +188,82 @@ module VX_cache_bypass #(
end end
end end
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_req_tag_bypass; wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin if (UUID_WIDTH != 0) begin
assign mem_req_tag_bypass = {core_req_tag_in_sel[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass}; assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
end else begin end else begin
assign mem_req_tag_bypass = mem_req_tag_id_bypass; assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end end
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_bypass_nc; if (PASSTHRU != 0) begin
wire [(MEM_TAG_IN_WIDTH + 1)-1:0] mem_req_tag_in_nc; assign mem_req_out_tag = mem_req_tag_bypass;
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
end else begin
if (NC_ENABLE) begin
VX_bits_insert #( VX_bits_insert #(
.N (MEM_TAG_OUT_NC_WIDTH), .N (MEM_TAG_OUT_WIDTH-1),
.S (NC_ENABLE ? 0 : 1), .S (1),
.POS (NC_TAG_BIT) .POS (TAG_SEL_IDX)
) mem_req_tag_bypass_nc_insert (
.data_in (mem_req_tag_bypass),
.sel_in (1'b0),
.data_out (mem_req_tag_bypass_nc)
);
VX_bits_insert #(
.N (MEM_TAG_IN_WIDTH),
.POS (NC_TAG_BIT)
) mem_req_tag_in_nc_insert ( ) mem_req_tag_in_nc_insert (
.data_in (mem_req_tag_in), .data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
.sel_in (1'b0), .ins_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_tag_in_nc) .data_out (mem_req_out_tag)
);
end else begin
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
end
end
assign mem_bus_in_if.req_ready = mem_req_out_ready;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.ready_out (mem_bus_out_if.req_ready)
); );
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : mem_req_tag_bypass_nc; // handle core responses //////////////////////////////////////////////////
// core response handling wire [NUM_REQS-1:0] core_rsp_in_valid;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_in_nc; wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
wire [NUM_REQS-1:0] core_rsp_in_ready;
wire is_mem_rsp_nc; wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin if (PASSTHRU != 0) begin
assign is_mem_rsp_nc = mem_rsp_valid_in; assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
end else begin end else begin
assign is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT]; if (NC_ENABLE) begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin
assign is_mem_rsp_nc = 1'b0;
end
end end
for (genvar i = 0; i < NUM_REQS; ++i) begin wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
VX_bits_insert #(
.N (CORE_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (NC_TAG_BIT)
) core_rsp_tag_in_nc_insert (
.data_in (core_rsp_tag_in[i]),
.sel_in ('0),
.data_out (core_rsp_tag_in_nc[i])
);
end
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_rsp_tag_in_nc;
VX_bits_remove #( VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH), .N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE ? 0 : 1), .S (NC_ENABLE),
.POS (NC_TAG_BIT) .POS (TAG_SEL_IDX)
) mem_rsp_tag_in_nc_remove ( ) mem_rsp_tag_in_nc_remove (
.data_in (mem_rsp_tag_in), .data_in (mem_bus_out_if.rsp_data.tag),
.data_out (mem_rsp_tag_in_nc) .data_out (mem_rsp_tag_id_nc)
); );
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx; wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
if (NUM_REQS > 1) begin if (NUM_REQS > 1) begin
assign rsp_idx = mem_rsp_tag_in_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS]; assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin end else begin
assign rsp_idx = 1'b0; assign rsp_idx = 1'b0;
end end
@ -302,47 +274,78 @@ module VX_cache_bypass #(
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc; rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
end end
assign core_rsp_valid_out = core_rsp_valid_in | rsp_nc_valid_r; for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_ready_in = core_rsp_ready_out; assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || rsp_nc_valid_r[i];
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
end
if (WORDS_PER_LINE > 1) begin if (WORDS_PER_LINE > 1) begin
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_in_nc[CORE_TAG_ID_BITS +: WSEL_BITS]; wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
core_rsp_data_in[i] : mem_rsp_data_in[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH]; core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end end
end else begin end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in; assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
end end
end end
for (genvar i = 0; i < NUM_REQS; ++i) begin wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
if (UUID_WIDTH != 0) begin if (UUID_WIDTH != 0) begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : {mem_rsp_tag_in_nc[MEM_TAG_OUT_NC_WIDTH-1 -: UUID_WIDTH], mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0]}; assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
end else begin end else begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0]; assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU) begin
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
end else if (NC_ENABLE) begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
end else begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
end end
end end
// memory response handling for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_in_valid[i]),
.ready_in (core_rsp_in_ready[i]),
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
.valid_out (core_bus_in_if[i].rsp_valid),
.ready_out (core_bus_in_if[i].rsp_ready)
);
end
// handle memory responses ////////////////////////////////////////////////
if (PASSTHRU != 0) begin if (PASSTHRU != 0) begin
assign mem_rsp_valid_out = 1'b0; assign mem_bus_in_if.rsp_valid = 1'b0;
assign mem_bus_in_if.rsp_data.data = '0;
assign mem_bus_in_if.rsp_data.tag = '0;
end else if (NC_ENABLE) begin
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
end else begin end else begin
assign mem_rsp_valid_out = mem_rsp_valid_in && ~mem_rsp_tag_in[NC_TAG_BIT]; assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
end end
assign mem_rsp_data_out = mem_rsp_data_in; wire [NUM_REQS-1:0] core_rsp_out_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
end
VX_bits_remove #( assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? (~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready;
.N (MEM_TAG_IN_WIDTH + 1),
.POS (NC_TAG_BIT)
) mem_rsp_tag_out_remove (
.data_in (mem_rsp_tag_in[(MEM_TAG_IN_WIDTH + 1)-1:0]),
.data_out (mem_rsp_tag_out)
);
assign mem_rsp_ready_in = is_mem_rsp_nc ? (~core_rsp_valid_in[rsp_idx] && core_rsp_ready_out[rsp_idx]) : mem_rsp_ready_out;
endmodule endmodule

View file

@ -46,6 +46,12 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable cache writeable // Enable cache writeable
parameter WRITE_ENABLE = 1, parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier // Request debug identifier
parameter UUID_WIDTH = 0, parameter UUID_WIDTH = 0,
@ -55,11 +61,11 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// enable bypass for non-cacheable addresses // enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0, parameter NC_ENABLE = 0,
// Core response output register // Core response output buffer
parameter CORE_OUT_REG = 0, parameter CORE_OUT_BUF = 0,
// Memory request output register // Memory request output buffer
parameter MEM_OUT_REG = 0 parameter MEM_OUT_BUF = 0
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
@ -75,8 +81,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
localparam NUM_CACHES = `UP(NUM_UNITS); localparam NUM_CACHES = `UP(NUM_UNITS);
localparam PASSTHRU = (NUM_UNITS == 0); localparam PASSTHRU = (NUM_UNITS == 0);
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES); localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : (NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)); `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
@ -84,7 +89,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
cache_perf_t perf_cache_unit[NUM_CACHES]; cache_perf_t perf_cache_unit[NUM_CACHES];
`PERF_CACHE_REDUCE (cache_perf, perf_cache_unit, NUM_CACHES); `PERF_CACHE_ADD (cache_perf, perf_cache_unit, NUM_CACHES)
`endif `endif
VX_mem_bus_if #( VX_mem_bus_if #(
@ -97,6 +102,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH) .TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS](); ) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
`RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE), .DATA_SIZE (WORD_SIZE),
@ -112,8 +119,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]); `ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end end
`RESET_RELAY (cache_arb_reset, reset);
VX_mem_arb #( VX_mem_arb #(
.NUM_INPUTS (NUM_INPUTS), .NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_CACHES), .NUM_OUTPUTS (NUM_CACHES),
@ -121,11 +126,11 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH), .TAG_WIDTH (TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX), .TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"), .ARBITER ("R"),
.OUT_REG_REQ ((NUM_INPUTS != NUM_CACHES) ? 2 : 0), .REQ_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
.OUT_REG_RSP ((NUM_INPUTS != NUM_CACHES) ? 2 : 0) .RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb ( ) cache_arb (
.clk (clk), .clk (clk),
.reset (cache_arb_reset), .reset (cache_arb_reset[i]),
.bus_in_if (core_bus_tmp_if), .bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if) .bus_out_if (arb_core_bus_tmp_if)
); );
@ -135,7 +140,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
end end
end end
for (genvar i = 0; i < NUM_CACHES; ++i) begin for (genvar i = 0; i < NUM_CACHES; ++i) begin : caches
`RESET_RELAY (cache_reset, reset); `RESET_RELAY (cache_reset, reset);
@ -152,10 +157,13 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE), .MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE), .MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE), .WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH), .UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH), .TAG_WIDTH (ARB_TAG_WIDTH),
.CORE_OUT_REG ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_REG), .TAG_SEL_IDX (TAG_SEL_IDX),
.MEM_OUT_REG ((NUM_CACHES > 1) ? 2 : MEM_OUT_REG), .CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
.NC_ENABLE (NC_ENABLE), .NC_ENABLE (NC_ENABLE),
.PASSTHRU (PASSTHRU) .PASSTHRU (PASSTHRU)
) cache_wrap ( ) cache_wrap (
@ -169,8 +177,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
); );
end end
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE), .DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)) .TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
@ -180,13 +186,13 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_CACHES), .NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE), .DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH), .TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag .TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"), .ARBITER ("R"),
.OUT_REG_REQ ((NUM_CACHES > 1) ? 2 : 0), .REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
.OUT_REG_RSP ((NUM_CACHES > 1) ? 2 : 0) .RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb ( ) mem_arb (
.clk (clk), .clk (clk),
.reset (mem_arb_reset), .reset (reset),
.bus_in_if (cache_mem_bus_if), .bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if) .bus_out_if (mem_bus_tmp_if)
); );

View file

@ -28,6 +28,10 @@ module VX_cache_data #(
parameter WORD_SIZE = 1, parameter WORD_SIZE = 1,
// Enable cache writeable // Enable cache writeable
parameter WRITE_ENABLE = 1, parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier // Request debug identifier
parameter UUID_WIDTH = 0 parameter UUID_WIDTH = 0
) ( ) (
@ -40,61 +44,102 @@ module VX_cache_data #(
input wire stall, input wire stall,
input wire init,
input wire read, input wire read,
input wire fill, input wire fill,
input wire flush,
input wire write, input wire write,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel, input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
input wire [WORD_SIZE-1:0] byteen,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data, input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
input wire [`CS_WORD_WIDTH-1:0] write_data, input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel, input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire [`CS_WORD_WIDTH-1:0] read_data output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
); );
`UNUSED_SPARAM (INSTANCE_ID) `UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID) `UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE) `UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset) `UNUSED_VAR (stall)
`UNUSED_VAR (line_addr) `UNUSED_VAR (line_addr)
`UNUSED_VAR (init)
`UNUSED_VAR (read) `UNUSED_VAR (read)
`UNUSED_VAR (flush)
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1; localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata; wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [BYTEENW-1:0] wren;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata;
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata;
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]);
assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]);
end
VX_sp_ram #(
.DATAW (LINE_SIZE * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK)
) byteen_store (
.clk (clk),
.reset (reset),
.read (write || fill || flush),
.write (init || write || fill || flush),
.wren (1'b1),
.addr (line_sel),
.wdata (bs_wdata),
.rdata (bs_rdata)
);
assign dirty_byteen = bs_rdata[way_idx];
end else begin
assign dirty_byteen = {LINE_SIZE{1'b1}};
end
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign flipped_rdata[j][i] = line_rdata[i][j];
end
end
assign dirty_data = flipped_rdata[way_idx];
end else begin
assign dirty_byteen = '0;
assign dirty_data = '0;
end
// order the data layout to perform ways multiplexing last.
// this allows converting way index to binary in parallel with BRAM readaccess and way selection.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [BYTEENW-1:0] line_wren;
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
reg [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] wdata_r;
reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
always @(*) begin
wdata_r = {`CS_WORDS_PER_LINE{write_data}};
wren_r = '0;
wren_r[wsel] = byteen;
end
// order the data layout to perform ways multiplexing last
// this allows performing onehot encoding of the way index in parallel with BRAM read.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w; wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = fill ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{wdata_r[i]}};
for (genvar j = 0; j < NUM_WAYS; ++j) begin for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign wren_w[i][j] = (fill ? {WORD_SIZE{1'b1}} : wren_r[i]) assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i];
& {WORD_SIZE{((NUM_WAYS == 1) || way_sel[j])}}; assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end end
end end
assign wren = wren_w; assign line_wren = wren_w;
end else begin end else begin
`UNUSED_VAR (write) `UNUSED_VAR (write)
`UNUSED_VAR (byteen) `UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data) `UNUSED_VAR (write_data)
assign wdata = fill_data; assign line_wdata = fill_data;
assign wren = fill; assign line_wren = fill;
end end
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
VX_onehot_encoder #( VX_onehot_encoder #(
.N (NUM_WAYS) .N (NUM_WAYS)
) way_enc ( ) way_enc (
@ -103,48 +148,50 @@ module VX_cache_data #(
`UNUSED_PIN (valid_out) `UNUSED_PIN (valid_out)
); );
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata; wire line_read = (read && ~stall)
|| (WRITEBACK && (fill || flush));
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; wire line_write = write || fill;
VX_sp_ram #( VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH * NUM_WAYS), .DATAW (`CS_LINE_WIDTH * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK), .SIZE (`CS_LINES_PER_BANK),
.WRENW (BYTEENW), .WRENW (BYTEENW),
.NO_RWCHECK (1) .NO_RWCHECK (1),
.RW_ASSERT (1)
) data_store ( ) data_store (
.clk (clk), .clk (clk),
.read (1'b1), .reset (reset),
.write (write || fill), .read (line_read),
.wren (wren), .write (line_write),
.wren (line_wren),
.addr (line_sel), .addr (line_sel),
.wdata (wdata), .wdata (line_wdata),
.rdata (rdata) .rdata (line_rdata)
); );
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata; wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = rdata[wsel]; assign per_way_rdata = line_rdata[wsel];
end else begin end else begin
`UNUSED_VAR (wsel) `UNUSED_VAR (wsel)
assign per_way_rdata = rdata; assign per_way_rdata = line_rdata;
end end
assign read_data = per_way_rdata[way_idx]; assign read_data = per_way_rdata[way_idx];
`UNUSED_VAR (stall) `ifdef DBG_TRACE_CACHE
`ifdef DBG_TRACE_CACHE_DATA
always @(posedge clk) begin always @(posedge clk) begin
if (fill && ~stall) begin if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data)); `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data));
end end
if (read && ~stall) begin if (read && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-read: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, read_data, req_uuid)); `TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
end end
if (write && ~stall) begin if (write && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, byteen, write_data, req_uuid)); `TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
end end
end end
`endif `endif

View file

@ -50,7 +50,7 @@
`define CS_TAG_SEL_ADDR_START (1+`CS_LINE_SEL_ADDR_END) `define CS_TAG_SEL_ADDR_START (1+`CS_LINE_SEL_ADDR_END)
`define CS_TAG_SEL_ADDR_END (`CS_WORD_ADDR_WIDTH-1) `define CS_TAG_SEL_ADDR_END (`CS_WORD_ADDR_WIDTH-1)
`define CS_LINE_TAG_ADDR(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS] `define CS_LINE_ADDR_TAG(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
@ -62,4 +62,16 @@
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))} `define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)} `define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
// PERF_CACHE_ADD(dst, src, count)
// Expands to one PERF_COUNTER_ADD invocation per cache performance field:
// reads, writes, read_misses, write_misses, bank_stalls, mshr_stalls,
// mem_stalls and crsp_stalls. Every invocation forwards dst, src and count
// unchanged, uses PERF_CTR_BITS as the width argument and passes (count > 1)
// as the last argument.
// NOTE(review): PERF_COUNTER_ADD and PERF_CTR_BITS are defined outside this
// header; presumably this sums `count` per-source counters from src into dst
// and (count > 1) selects a registered/reduced form - confirm at the definition.
`define PERF_CACHE_ADD(dst, src, count) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
`endif // VX_CACHE_DEFINE_VH `endif // VX_CACHE_DEFINE_VH

165
hw/rtl/cache/VX_cache_flush.sv vendored Normal file
View file

@ -0,0 +1,165 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Cache flush sequencer.
//
// Sits between the core-side request buses (core_bus_in_if) and the cache
// (core_bus_out_if). A lane carries a flush request when its req_valid is high
// and the ADDR_TYPE_FLUSH bit of req_data.atype is set. While at least one
// flush request is present, request handshakes are blocked on every lane
// whose lock_released bit is clear. The state machine then:
//   IDLE  -> WAIT1 (only when BANK_SEL_LATENCY != 0) : wait for no_inflight_reqs
//         -> FLUSH : flush_begin is driven high on all banks for this state
//         -> WAIT2 : accumulate flush_end until every bank has reported
//         -> DONE  : only the lanes that held a flush request are released;
//                    once all released lanes have seen req_ready, go to IDLE
// Responses are passed through untouched.
module VX_cache_flush #(
    // Number of Word requests per cycle
    parameter NUM_REQS = 4,
    // Number of banks
    parameter NUM_BANKS = 1,
    // Bank select latency
    parameter BANK_SEL_LATENCY = 1
) (
    input wire clk,
    input wire reset,
    VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
    VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
    input wire [NUM_BANKS-1:0] bank_req_fire,
    output wire [NUM_BANKS-1:0] flush_begin,
    input wire [NUM_BANKS-1:0] flush_end
);
    // sequencer state encoding
    localparam STATE_IDLE  = 0;
    localparam STATE_WAIT1 = 1;
    localparam STATE_FLUSH = 2;
    localparam STATE_WAIT2 = 3;
    localparam STATE_DONE  = 4;

    // registered state and its combinational next value
    reg [2:0]           state, state_n;
    reg [NUM_BANKS-1:0] flush_done, flush_done_n;       // banks that already signalled flush_end
    reg [NUM_REQS-1:0]  lock_released, lock_released_n; // lanes allowed through while a flush is pending

    wire [NUM_REQS-1:0] flush_req_mask;     // lanes currently presenting a flush request
    wire [NUM_REQS-1:0] core_bus_out_ready; // req_ready of each output lane
    wire flush_req_enable = (| flush_req_mask);

    // per-lane wiring: flush detection, request gating, response passthrough
    for (genvar r = 0; r < NUM_REQS; ++r) begin
        // a lane passes requests when no flush is pending or when it was explicitly released
        wire input_enable = lock_released[r] || ~flush_req_enable;

        assign flush_req_mask[r]     = core_bus_in_if[r].req_valid && core_bus_in_if[r].req_data.atype[`ADDR_TYPE_FLUSH];
        assign core_bus_out_ready[r] = core_bus_out_if[r].req_ready;

        // request channel (gated in both directions)
        assign core_bus_out_if[r].req_valid = core_bus_in_if[r].req_valid && input_enable;
        assign core_bus_out_if[r].req_data  = core_bus_in_if[r].req_data;
        assign core_bus_in_if[r].req_ready  = core_bus_out_if[r].req_ready && input_enable;

        // response channel (straight passthrough)
        assign core_bus_in_if[r].rsp_valid  = core_bus_out_if[r].rsp_valid;
        assign core_bus_in_if[r].rsp_data   = core_bus_out_if[r].rsp_data;
        assign core_bus_out_if[r].rsp_ready = core_bus_in_if[r].rsp_ready;
    end

    // track in-flight core requests
    wire no_inflight_reqs;
    if (BANK_SEL_LATENCY == 0) begin
        // STATE_WAIT1 is never entered in this configuration (IDLE jumps to FLUSH),
        // so this constant is not consumed by any reachable transition.
        assign no_inflight_reqs = 0;
        `UNUSED_VAR (bank_req_fire)
    end else begin
        localparam NUM_REQS_W  = `CLOG2(NUM_REQS+1);
        localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);

        // handshakes completed on the cache-facing request bus this cycle
        wire [NUM_REQS-1:0] core_bus_out_fire;
        for (genvar r = 0; r < NUM_REQS; ++r) begin
            assign core_bus_out_fire[r] = core_bus_out_if[r].req_ready && core_bus_out_if[r].req_valid;
        end

        wire [NUM_REQS_W-1:0]  core_bus_out_cnt;
        wire [NUM_BANKS_W-1:0] bank_req_cnt;
        `POP_COUNT(core_bus_out_cnt, core_bus_out_fire);
        `POP_COUNT(bank_req_cnt, bank_req_fire);
        `UNUSED_VAR (core_bus_out_cnt)

        // NOTE(review): VX_pending_size is defined elsewhere; presumably it keeps a
        // running count (incr minus decr) and raises `empty` at zero - confirm.
        // The issued-request count is cast to the bank-count width before use.
        VX_pending_size #(
            .SIZE  (BANK_SEL_LATENCY * NUM_BANKS),
            .INCRW (NUM_BANKS_W),
            .DECRW (NUM_BANKS_W)
        ) pending_size (
            .clk   (clk),
            .reset (reset),
            .incr  (NUM_BANKS_W'(core_bus_out_cnt)),
            .decr  (bank_req_cnt),
            .empty (no_inflight_reqs),
            `UNUSED_PIN (alm_empty),
            `UNUSED_PIN (full),
            `UNUSED_PIN (alm_full),
            `UNUSED_PIN (size)
        );
    end

    // next-state logic
    always @(*) begin
        // hold everything unless a state below overrides it
        lock_released_n = lock_released;
        flush_done_n    = flush_done;
        state_n         = state;
        case (state)
        STATE_IDLE: begin
            if (flush_req_enable) begin
                if (BANK_SEL_LATENCY != 0) begin
                    state_n = STATE_WAIT1;
                end else begin
                    state_n = STATE_FLUSH;
                end
            end
        end
        STATE_WAIT1: begin
            if (no_inflight_reqs) begin
                state_n = STATE_FLUSH;
            end
        end
        STATE_FLUSH: begin
            // single-cycle state: produces the flush request pulse
            state_n = STATE_WAIT2;
        end
        STATE_WAIT2: begin
            // collect per-bank completion until all banks have reported
            flush_done_n = flush_done | flush_end;
            if (flush_done_n == {NUM_BANKS{1'b1}}) begin
                // release only the lanes holding a flush request;
                // ordinary requests stay blocked
                lock_released_n = flush_req_mask;
                flush_done_n    = '0;
                state_n         = STATE_DONE;
            end
        end
        STATE_DONE: begin
            // drop the release bit of every lane whose output is ready;
            // going back to IDLE is what unblocks the remaining lanes
            lock_released_n = lock_released & ~core_bus_out_ready;
            if (lock_released_n == 0) begin
                state_n = STATE_IDLE;
            end
        end
        endcase
    end

    // state registers (synchronous reset)
    always @(posedge clk) begin
        if (reset) begin
            lock_released <= '0;
            flush_done    <= '0;
            state         <= STATE_IDLE;
        end else begin
            lock_released <= lock_released_n;
            flush_done    <= flush_done_n;
            state         <= state_n;
        end
    end

    // the same begin pulse is broadcast to every bank
    wire flush_pulse = (state == STATE_FLUSH);
    assign flush_begin = {NUM_BANKS{flush_pulse}};

endmodule

View file

@ -1,51 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Cache initialization sequencer.
//
// After reset, valid_out is asserted and addr_out starts at zero, then
// advances by one each clock. When addr_out reaches the last index
// ((2 ** CS_LINE_SEL_BITS) - 1), valid_out is cleared on the following clock
// edge and the index wraps back to zero, where it stays until the next reset.
// NOTE(review): the parameters are not referenced directly in this body;
// presumably the CS_LINE_SEL_BITS macro from VX_cache_define.vh derives its
// value from them - confirm against the header.
module VX_cache_init #(
    // Size of cache in bytes
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE = 16,
    // Number of banks
    parameter NUM_BANKS = 1,
    // Number of associative ways
    parameter NUM_WAYS = 1
) (
    input wire clk,
    input wire reset,
    output wire [`CS_LINE_SEL_BITS-1:0] addr_out,
    output wire valid_out
);
    reg init_active;                       // high while the sweep is running
    reg [`CS_LINE_SEL_BITS-1:0] line_idx;  // line index currently presented

    // true on the final index of the sweep
    wire at_last_line = (line_idx == ((2 ** `CS_LINE_SEL_BITS)-1));

    always @(posedge clk) begin
        if (reset) begin
            line_idx    <= '0;
            init_active <= 1;
        end else if (init_active) begin
            // step to the next line; stop after the last one has been presented
            line_idx <= line_idx + `CS_LINE_SEL_BITS'(1);
            if (at_last_line) begin
                init_active <= 0;
            end
        end
    end

    assign valid_out = init_active;
    assign addr_out  = line_idx;

endmodule

View file

@ -13,19 +13,47 @@
`include "VX_cache_define.vh" `include "VX_cache_define.vh"
// this is an implementation of a pipelined multi-banked cache // This is an implementation of a MSHR for pipelined multi-banked cache.
// we allocate a free slot from the MSHR before processing a core request // We allocate a free slot from the MSHR before processing a core request
// and release the slot when we get a cache hit. // and release the slot when we get a cache hit. This ensures that we do not
// during a memory fill response we initiate the replay sequence // enter the cache bank pipeline when the MSHR is full.
// and dequeue all associated pending entries. // During a memory fill response, we initiate the replay sequence
// and dequeue all pending entries for the given cache line.
//
// Pending core requests stored in the MSHR are sorted by the order of
// arrival and are dequeued in the same order.
// Each entry has a next pointer to the next entry pending for the same cache line.
//
// During the fill operation, the MSHR will release the MSHR entry at fill_id
// which represents the first request in the pending list that initiated the memory fill.
//
// The dequeue operation directly follows the fill operation and will release
// all the subsequent entries linked to fill_id (pending the same cache line).
//
// During the allocation operation, the MSHR will allocate the next free slot
// for the incoming core request. We return the allocated slot id as well as
// the slot id of the previous entry for the same cache line. This is used to
// link the new entry to the pending list during finalization.
//
// The lookup operation is used to find all pending entries for a given cache line.
// This is used by the cache bank to determine if a cache miss is already pending
// and therefore avoid issuing a memory fill request.
//
// The finalize operation is used to release the allocated MSHR entry if we had a hit.
// If we had a miss and finalize_pending is true, we link the allocated entry to
// its corresponding pending list (via finalize_prev).
//
// Warning: This MSHR implementation is strongly coupled with the bank pipeline // Warning: This MSHR implementation is strongly coupled with the bank pipeline
// and as such changes to either module requires careful evaluation. // and as such changes to either module requires careful evaluation.
// This implementation makes the following assumptions: //
// (1) two-cycle pipeline: st0 and st1. // This architecture implements three pipeline stages:
// (2) core request flow: st0: allocate / lookup, st1: finalize. // - Arbitration: cache bank arbitration before entering pipeline.
// (3) the first dequeue after the fill should happen in st0, when the fill is in st1 // fill and dequeue operations are executed at this stage.
// this is enforced inside the bank by "rdw_hazard_st0". // - stage 0: cache bank tag access stage.
// allocate and lookup operations are executed at this stage.
// - stage 1: cache bank tdatag access stage.
// finalize operation is executed at this stage.
//
module VX_cache_mshr #( module VX_cache_mshr #(
parameter `STRING INSTANCE_ID= "", parameter `STRING INSTANCE_ID= "",
@ -51,20 +79,6 @@ module VX_cache_mshr #(
input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid, input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid,
`IGNORE_UNUSED_END `IGNORE_UNUSED_END
// allocate
input wire allocate_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] allocate_addr,
input wire allocate_rw,
input wire [DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_tail,
output wire allocate_ready,
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_matches,
// memory fill // memory fill
input wire fill_valid, input wire fill_valid,
input wire [MSHR_ADDR_WIDTH-1:0] fill_id, input wire [MSHR_ADDR_WIDTH-1:0] fill_id,
@ -78,12 +92,27 @@ module VX_cache_mshr #(
output wire [MSHR_ADDR_WIDTH-1:0] dequeue_id, output wire [MSHR_ADDR_WIDTH-1:0] dequeue_id,
input wire dequeue_ready, input wire dequeue_ready,
// allocate
input wire allocate_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] allocate_addr,
input wire allocate_rw,
input wire [DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_prev,
output wire allocate_ready,
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_pending,
output wire [MSHR_SIZE-1:0] lookup_rw,
// finalize // finalize
input wire finalize_valid, input wire finalize_valid,
input wire finalize_release, input wire finalize_release,
input wire finalize_pending, input wire finalize_pending,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id, input wire [MSHR_ADDR_WIDTH-1:0] finalize_id,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_tail input wire [MSHR_ADDR_WIDTH-1:0] finalize_prev
); );
`UNUSED_PARAM (BANK_ID) `UNUSED_PARAM (BANK_ID)
@ -100,7 +129,7 @@ module VX_cache_mshr #(
reg dequeue_val, dequeue_val_n; reg dequeue_val, dequeue_val_n;
reg [MSHR_ADDR_WIDTH-1:0] dequeue_id_r, dequeue_id_n; reg [MSHR_ADDR_WIDTH-1:0] dequeue_id_r, dequeue_id_n;
wire [MSHR_ADDR_WIDTH-1:0] tail_idx; wire [MSHR_ADDR_WIDTH-1:0] prev_idx;
wire allocate_fire = allocate_valid && allocate_ready; wire allocate_fire = allocate_valid && allocate_ready;
wire dequeue_fire = dequeue_valid && dequeue_ready; wire dequeue_fire = dequeue_valid && dequeue_ready;
@ -121,9 +150,9 @@ module VX_cache_mshr #(
VX_onehot_encoder #( VX_onehot_encoder #(
.N (MSHR_SIZE) .N (MSHR_SIZE)
) tail_sel ( ) prev_sel (
.data_in (addr_matches & ~next_table_x), .data_in (addr_matches & ~next_table_x),
.data_out (tail_idx), .data_out (prev_idx),
`UNUSED_PIN (valid_out) `UNUSED_PIN (valid_out)
); );
@ -152,7 +181,7 @@ module VX_cache_mshr #(
valid_table_n[finalize_id] = 0; valid_table_n[finalize_id] = 0;
end end
if (finalize_pending) begin if (finalize_pending) begin
next_table_x[finalize_tail] = 1; next_table_x[finalize_prev] = 1;
end end
end end
@ -180,7 +209,7 @@ module VX_cache_mshr #(
end end
if (finalize_valid && finalize_pending) begin if (finalize_valid && finalize_pending) begin
next_index[finalize_tail] <= finalize_id; next_index[finalize_prev] <= finalize_id;
end end
dequeue_id_r <= dequeue_id_n; dequeue_id_r <= dequeue_id_n;
@ -188,13 +217,13 @@ module VX_cache_mshr #(
next_table <= next_table_n; next_table <= next_table_n;
end end
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s-bank%0d inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID, `RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid)) `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s-bank%0d invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID, `RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid)) `CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s-bank%0d invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, BANK_ID, `RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id)) `CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
VX_dp_ram #( VX_dp_ram #(
@ -203,9 +232,10 @@ module VX_cache_mshr #(
.LUTRAM (1) .LUTRAM (1)
) entries ( ) entries (
.clk (clk), .clk (clk),
.reset (reset),
.read (1'b1), .read (1'b1),
.write (allocate_valid), .write (allocate_valid),
`UNUSED_PIN (wren), .wren (1'b1),
.waddr (allocate_id_r), .waddr (allocate_id_r),
.wdata (allocate_data), .wdata (allocate_data),
.raddr (dequeue_id_r), .raddr (dequeue_id_r),
@ -216,18 +246,20 @@ module VX_cache_mshr #(
assign allocate_ready = allocate_rdy; assign allocate_ready = allocate_rdy;
assign allocate_id = allocate_id_r; assign allocate_id = allocate_id_r;
assign allocate_tail = tail_idx; assign allocate_prev = prev_idx;
assign dequeue_valid = dequeue_val; assign dequeue_valid = dequeue_val;
assign dequeue_addr = addr_table[dequeue_id_r]; assign dequeue_addr = addr_table[dequeue_id_r];
assign dequeue_rw = write_table[dequeue_id_r]; assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r; assign dequeue_id = dequeue_id_r;
assign lookup_matches = addr_matches & ~write_table; // return pending entries for the given cache line
assign lookup_pending = addr_matches;
assign lookup_rw = write_table;
`UNUSED_VAR (lookup_valid) `UNUSED_VAR (lookup_valid)
`ifdef DBG_TRACE_CACHE_MSHR `ifdef DBG_TRACE_CACHE
reg show_table; reg show_table;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
@ -236,22 +268,22 @@ module VX_cache_mshr #(
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire; show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
end end
if (allocate_fire) if (allocate_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-allocate: addr=0x%0h, tail=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `TRACE(3, ("%d: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_tail, allocate_id, lkp_req_uuid)); `CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid));
if (lookup_valid) if (lookup_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `TRACE(3, ("%d: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_matches, lkp_req_uuid)); `CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid));
if (finalize_valid) if (finalize_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-finalize release=%b, pending=%b, tail=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `TRACE(3, ("%d: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
finalize_release, finalize_pending, finalize_tail, finalize_id, fin_req_uuid)); finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid));
if (fill_valid) if (fill_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, BANK_ID, `TRACE(3, ("%d: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id)); `CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
if (dequeue_fire) if (dequeue_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `TRACE(3, ("%d: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid)); `CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
if (show_table) begin if (show_table) begin
`TRACE(3, ("%d: %s-bank%0d mshr-table", $time, INSTANCE_ID, BANK_ID)); `TRACE(3, ("%d: %s table", $time, INSTANCE_ID));
for (integer i = 0; i < MSHR_SIZE; ++i) begin for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin if (valid_table[i]) begin
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID))); `TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));

View file

@ -26,6 +26,8 @@ module VX_cache_tags #(
parameter NUM_WAYS = 1, parameter NUM_WAYS = 1,
// Size of a word in bytes // Size of a word in bytes
parameter WORD_SIZE = 1, parameter WORD_SIZE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier // Request debug identifier
parameter UUID_WIDTH = 0 parameter UUID_WIDTH = 0
) ( ) (
@ -38,76 +40,134 @@ module VX_cache_tags #(
input wire stall, input wire stall,
// read/fill // init/fill/lookup
input wire init,
input wire flush,
input wire fill,
input wire write,
input wire lookup, input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr, input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire fill, input wire [NUM_WAYS-1:0] way_sel,
input wire init, output wire [NUM_WAYS-1:0] tag_matches,
output wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches // eviction
output wire evict_dirty,
output wire [NUM_WAYS-1:0] evict_way,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
); );
`UNUSED_SPARAM (INSTANCE_ID) `UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID) `UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup) `UNUSED_VAR (lookup)
localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS; // valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0]; wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_TAG_ADDR(line_addr); wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
wire [NUM_WAYS-1:0] read_dirty;
if (NUM_WAYS > 1) begin if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] repl_way; reg [NUM_WAYS-1:0] evict_way_r;
// cyclic assignment of replacement way // cyclic assignment of replacement way
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
repl_way <= 1; evict_way_r <= 1;
end else if (~stall) begin // hold the value on stalls prevent filling different slots twice end else if (~stall) begin // holding the value on stalls prevents filling different slots twice
repl_way <= {repl_way[NUM_WAYS-2:0], repl_way[NUM_WAYS-1]}; evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]};
end end
end end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign way_sel[i] = fill && repl_way[i]; assign evict_way = fill ? evict_way_r : way_sel;
end
end else begin VX_onehot_mux #(
`UNUSED_VAR (stall) .DATAW (`CS_TAG_SEL_BITS),
assign way_sel = fill; .N (NUM_WAYS)
end ) evict_tag_sel (
.data_in (read_tag),
.sel_in (evict_way),
.data_out (evict_tag)
);
end else begin
`UNUSED_VAR (stall)
assign evict_way = 1'b1;
assign evict_tag = read_tag;
end
// fill and flush need to also read in writeback mode
wire fill_s = fill && (!WRITEBACK || ~stall);
wire flush_s = flush && (!WRITEBACK || ~stall);
for (genvar i = 0; i < NUM_WAYS; ++i) begin for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [`CS_TAG_SEL_BITS-1:0] read_tag;
wire read_valid; wire do_fill = fill_s && evict_way[i];
wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i];
wire line_read = (WRITEBACK && (fill_s || flush_s));
wire line_write = init || do_fill || do_flush || do_write;
wire line_valid = ~(init || flush);
wire [TAG_WIDTH-1:0] line_wdata;
wire [TAG_WIDTH-1:0] line_rdata;
if (WRITEBACK) begin
assign line_wdata = {line_valid, write, line_tag};
assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata;
end else begin
assign line_wdata = {line_valid, line_tag};
assign {read_valid[i], read_tag[i]} = line_rdata;
assign read_dirty[i] = 1'b0;
end
VX_sp_ram #( VX_sp_ram #(
.DATAW (TAG_WIDTH), .DATAW (TAG_WIDTH),
.SIZE (`CS_LINES_PER_BANK), .SIZE (`CS_LINES_PER_BANK),
.NO_RWCHECK (1) .NO_RWCHECK (1),
.RW_ASSERT (1)
) tag_store ( ) tag_store (
.clk (clk), .clk (clk),
.read (1'b1), .reset (reset),
.write (way_sel[i] || init), .read (line_read),
`UNUSED_PIN (wren), .write (line_write),
.wren (1'b1),
.addr (line_sel), .addr (line_sel),
.wdata ({~init, line_tag}), .wdata (line_wdata),
.rdata ({read_valid, read_tag}) .rdata (line_rdata)
); );
assign tag_matches[i] = read_valid && (line_tag == read_tag);
end end
`ifdef DBG_TRACE_CACHE_TAG for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
assign evict_dirty = | (read_dirty & evict_way);
`ifdef DBG_TRACE_CACHE
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel};
always @(posedge clk) begin always @(posedge clk) begin
if (fill && ~stall) begin if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d tag-fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag)); `TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID)));
end end
if (init) begin if (init) begin
`TRACE(3, ("%d: %s-bank%0d tag-init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel)); `TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty));
end end
if (lookup && ~stall) begin if (lookup && ~stall) begin
if (tag_matches != 0) begin if (tag_matches != 0) begin
`TRACE(3, ("%d: %s-bank%0d tag-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag, req_uuid)); if (write)
`TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
end else begin end else begin
`TRACE(3, ("%d: %s-bank%0d tag-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid)); if (write)
`TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
end end
end end
end end

View file

@ -13,7 +13,7 @@
`include "VX_cache_define.vh" `include "VX_cache_define.vh"
module VX_cache_top #( module VX_cache_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "", parameter `STRING INSTANCE_ID = "",
// Number of Word requests per cycle // Number of Word requests per cycle
@ -42,17 +42,23 @@ module VX_cache_top #(
// Enable cache writeable // Enable cache writeable
parameter WRITE_ENABLE = 1, parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier // Request debug identifier
parameter UUID_WIDTH = 0, parameter UUID_WIDTH = 0,
// core request tag size // core request tag size
parameter TAG_WIDTH = 16, parameter TAG_WIDTH = 16,
// Core response output register // Core response output buffer
parameter CORE_OUT_REG = 2, parameter CORE_OUT_BUF = 2,
// Memory request output register // Memory request output buffer
parameter MEM_OUT_REG = 2, parameter MEM_OUT_BUF = 2,
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS) parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
) ( ) (
@ -69,6 +75,7 @@ module VX_cache_top #(
input wire [NUM_REQS-1:0] core_req_rw, input wire [NUM_REQS-1:0] core_req_rw,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen, input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr, input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype,
input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data, input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag, input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_REQS-1:0] core_req_ready, output wire [NUM_REQS-1:0] core_req_ready,
@ -110,6 +117,7 @@ module VX_cache_top #(
assign core_bus_if[i].req_data.rw = core_req_rw[i]; assign core_bus_if[i].req_data.rw = core_req_rw[i];
assign core_bus_if[i].req_data.byteen = core_req_byteen[i]; assign core_bus_if[i].req_data.byteen = core_req_byteen[i];
assign core_bus_if[i].req_data.addr = core_req_addr[i]; assign core_bus_if[i].req_data.addr = core_req_addr[i];
assign core_bus_if[i].req_data.atype = core_req_atype[i];
assign core_bus_if[i].req_data.data = core_req_data[i]; assign core_bus_if[i].req_data.data = core_req_data[i];
assign core_bus_if[i].req_data.tag = core_req_tag[i]; assign core_bus_if[i].req_data.tag = core_req_tag[i];
assign core_req_ready[i] = core_bus_if[i].req_ready; assign core_req_ready[i] = core_bus_if[i].req_ready;
@ -131,6 +139,7 @@ module VX_cache_top #(
assign mem_req_data = mem_bus_if.req_data.data; assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag; assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready; assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
// Memory response // Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid; assign mem_bus_if.rsp_valid = mem_rsp_valid;
@ -153,8 +162,10 @@ module VX_cache_top #(
.TAG_WIDTH (TAG_WIDTH), .TAG_WIDTH (TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH), .UUID_WIDTH (UUID_WIDTH),
.WRITE_ENABLE (WRITE_ENABLE), .WRITE_ENABLE (WRITE_ENABLE),
.CORE_OUT_REG (CORE_OUT_REG), .WRITEBACK (WRITEBACK),
.MEM_OUT_REG (MEM_OUT_REG) .DIRTY_BYTES (DIRTY_BYTES),
.CORE_OUT_BUF (CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache ( ) cache (
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (cache_perf), .cache_perf (cache_perf),

View file

@ -16,9 +16,12 @@
module VX_cache_wrap import VX_gpu_pkg::*; #( module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "", parameter `STRING INSTANCE_ID = "",
parameter TAG_SEL_IDX = 0,
// Number of Word requests per cycle // Number of Word requests per cycle
parameter NUM_REQS = 4, parameter NUM_REQS = 4,
// Size of cache in bytes // Size of cache in bytes
parameter CACHE_SIZE = 4096, parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes // Size of line inside a bank in bytes
@ -42,6 +45,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable cache writeable // Enable cache writeable
parameter WRITE_ENABLE = 1, parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier // Request debug identifier
parameter UUID_WIDTH = 0, parameter UUID_WIDTH = 0,
@ -49,17 +58,16 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter TAG_WIDTH = UUID_WIDTH + 1, parameter TAG_WIDTH = UUID_WIDTH + 1,
// enable bypass for non-cacheable addresses // enable bypass for non-cacheable addresses
parameter NC_TAG_BIT = 0,
parameter NC_ENABLE = 0, parameter NC_ENABLE = 0,
// Force bypass for all requests // Force bypass for all requests
parameter PASSTHRU = 0, parameter PASSTHRU = 0,
// Core response output register // Core response output buffer
parameter CORE_OUT_REG = 0, parameter CORE_OUT_BUF = 0,
// Memory request output register // Memory request output buffer
parameter MEM_OUT_REG = 0 parameter MEM_OUT_BUF = 0
) ( ) (
input wire clk, input wire clk,
@ -74,283 +82,91 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if VX_mem_bus_if.master mem_bus_if
); );
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter: NUM_BANKS=%d, NUM_REQS=%d", NUM_BANKS, NUM_REQS))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter")) `STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE); localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CORE_TAG_X_WIDTH = TAG_WIDTH - NC_ENABLE; localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_X_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) : localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) : (NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)); `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam NC_BYPASS = (NC_ENABLE || PASSTHRU); localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
wire [NUM_REQS-1:0] core_req_valid; VX_mem_bus_if #(
wire [NUM_REQS-1:0] core_req_rw; .DATA_SIZE (WORD_SIZE),
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr; .TAG_WIDTH (TAG_WIDTH)
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen; ) core_bus_cache_if[NUM_REQS]();
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin VX_mem_bus_if #(
assign core_req_valid[i] = core_bus_if[i].req_valid; .DATA_SIZE (LINE_SIZE),
assign core_req_rw[i] = core_bus_if[i].req_data.rw; .TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
assign core_req_addr[i] = core_bus_if[i].req_data.addr; ) mem_bus_cache_if();
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
end
/////////////////////////////////////////////////////////////////////////// if (NC_OR_BYPASS) begin
// Core response buffering
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_REQS; ++i) begin
`RESET_RELAY (core_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE ((NC_BYPASS && !DIRECT_PASSTHRU) ? `OUT_REG_TO_EB_SIZE(CORE_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(CORE_OUT_REG))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready)
);
end
///////////////////////////////////////////////////////////////////////////
// Memory request buffering
wire mem_req_valid_s;
wire mem_req_rw_s;
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_ready_s;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE ((NC_BYPASS && !DIRECT_PASSTHRU) ? `OUT_REG_TO_EB_SIZE(MEM_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(MEM_OUT_REG))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
///////////////////////////////////////////////////////////////////////////
// Core request
wire [NUM_REQS-1:0] core_req_valid_b;
wire [NUM_REQS-1:0] core_req_rw_b;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr_b;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_b;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data_b;
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_b;
wire [NUM_REQS-1:0] core_req_ready_b;
// Core response
wire [NUM_REQS-1:0] core_rsp_valid_b;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_b;
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_b;
wire [NUM_REQS-1:0] core_rsp_ready_b;
// Memory request
wire mem_req_valid_b;
wire mem_req_rw_b;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_b;
wire [LINE_SIZE-1:0] mem_req_byteen_b;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_b;
wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag_b;
wire mem_req_ready_b;
// Memory response
wire mem_rsp_valid_b;
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_b;
wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag_b;
wire mem_rsp_ready_b;
if (NC_BYPASS) begin
`RESET_RELAY (nc_bypass_reset, reset); `RESET_RELAY (nc_bypass_reset, reset);
VX_cache_bypass #( VX_cache_bypass #(
.NUM_REQS (NUM_REQS), .NUM_REQS (NUM_REQS),
.NC_TAG_BIT (NC_TAG_BIT), .TAG_SEL_IDX (TAG_SEL_IDX),
.NC_ENABLE (NC_ENABLE),
.PASSTHRU (PASSTHRU), .PASSTHRU (PASSTHRU),
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (LINE_SIZE),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH), .CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_DATA_SIZE (WORD_SIZE), .CORE_TAG_WIDTH (TAG_WIDTH),
.CORE_TAG_IN_WIDTH (TAG_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH), .MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_DATA_SIZE (LINE_SIZE), .MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_IN_WIDTH (MEM_TAG_X_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH), .MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH) .UUID_WIDTH (UUID_WIDTH),
.CORE_OUT_BUF (CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache_bypass ( ) cache_bypass (
.clk (clk), .clk (clk),
.reset (nc_bypass_reset), .reset (nc_bypass_reset),
// Core request in .core_bus_in_if (core_bus_if),
.core_req_valid_in (core_req_valid), .core_bus_out_if(core_bus_cache_if),
.core_req_rw_in (core_req_rw),
.core_req_byteen_in (core_req_byteen),
.core_req_addr_in (core_req_addr),
.core_req_data_in (core_req_data),
.core_req_tag_in (core_req_tag),
.core_req_ready_in (core_req_ready),
// Core request out .mem_bus_in_if (mem_bus_cache_if),
.core_req_valid_out (core_req_valid_b), .mem_bus_out_if (mem_bus_if)
.core_req_rw_out (core_req_rw_b),
.core_req_byteen_out(core_req_byteen_b),
.core_req_addr_out (core_req_addr_b),
.core_req_data_out (core_req_data_b),
.core_req_tag_out (core_req_tag_b),
.core_req_ready_out (core_req_ready_b),
// Core response in
.core_rsp_valid_in (core_rsp_valid_b),
.core_rsp_data_in (core_rsp_data_b),
.core_rsp_tag_in (core_rsp_tag_b),
.core_rsp_ready_in (core_rsp_ready_b),
// Core response out
.core_rsp_valid_out (core_rsp_valid_s),
.core_rsp_data_out (core_rsp_data_s),
.core_rsp_tag_out (core_rsp_tag_s),
.core_rsp_ready_out (core_rsp_ready_s),
// Memory request in
.mem_req_valid_in (mem_req_valid_b),
.mem_req_rw_in (mem_req_rw_b),
.mem_req_addr_in (mem_req_addr_b),
.mem_req_byteen_in (mem_req_byteen_b),
.mem_req_data_in (mem_req_data_b),
.mem_req_tag_in (mem_req_tag_b),
.mem_req_ready_in (mem_req_ready_b),
// Memory request out
.mem_req_valid_out (mem_req_valid_s),
.mem_req_addr_out (mem_req_addr_s),
.mem_req_rw_out (mem_req_rw_s),
.mem_req_byteen_out (mem_req_byteen_s),
.mem_req_data_out (mem_req_data_s),
.mem_req_tag_out (mem_req_tag_s),
.mem_req_ready_out (mem_req_ready_s),
// Memory response in
.mem_rsp_valid_in (mem_bus_if.rsp_valid),
.mem_rsp_data_in (mem_bus_if.rsp_data.data),
.mem_rsp_tag_in (mem_bus_if.rsp_data.tag),
.mem_rsp_ready_in (mem_bus_if.rsp_ready),
// Memory response out
.mem_rsp_valid_out (mem_rsp_valid_b),
.mem_rsp_data_out (mem_rsp_data_b),
.mem_rsp_tag_out (mem_rsp_tag_b),
.mem_rsp_ready_out (mem_rsp_ready_b)
); );
end else begin end else begin
assign core_req_valid_b = core_req_valid;
assign core_req_rw_b = core_req_rw;
assign core_req_addr_b = core_req_addr;
assign core_req_byteen_b= core_req_byteen;
assign core_req_data_b = core_req_data;
assign core_req_tag_b = core_req_tag;
assign core_req_ready = core_req_ready_b;
assign core_rsp_valid_s = core_rsp_valid_b; for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_s = core_rsp_data_b; `ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
assign core_rsp_tag_s = core_rsp_tag_b; end
assign core_rsp_ready_b = core_rsp_ready_s;
assign mem_req_valid_s = mem_req_valid_b; `ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
assign mem_req_addr_s = mem_req_addr_b;
assign mem_req_rw_s = mem_req_rw_b;
assign mem_req_byteen_s = mem_req_byteen_b;
assign mem_req_data_s = mem_req_data_b;
assign mem_req_ready_b = mem_req_ready_s;
// Add explicit NC=0 flag to the memory request tag
VX_bits_insert #(
.N (MEM_TAG_WIDTH-1),
.POS (NC_TAG_BIT)
) mem_req_tag_insert (
.data_in (mem_req_tag_b),
.sel_in (1'b0),
.data_out (mem_req_tag_s)
);
assign mem_rsp_valid_b = mem_bus_if.rsp_valid;
assign mem_rsp_data_b = mem_bus_if.rsp_data.data;
assign mem_bus_if.rsp_ready = mem_rsp_ready_b;
// Remove NC flag from the memory response tag
VX_bits_remove #(
.N (MEM_TAG_WIDTH),
.POS (NC_TAG_BIT)
) mem_rsp_tag_remove (
.data_in (mem_bus_if.rsp_data.tag),
.data_out (mem_rsp_tag_b)
);
end end
if (PASSTHRU != 0) begin if (PASSTHRU != 0) begin
`UNUSED_VAR (core_req_valid_b) for (genvar i = 0; i < NUM_REQS; ++i) begin
`UNUSED_VAR (core_req_rw_b) `UNUSED_VAR (core_bus_cache_if[i].req_valid)
`UNUSED_VAR (core_req_addr_b) `UNUSED_VAR (core_bus_cache_if[i].req_data)
`UNUSED_VAR (core_req_byteen_b) assign core_bus_cache_if[i].req_ready = 0;
`UNUSED_VAR (core_req_data_b)
`UNUSED_VAR (core_req_tag_b)
assign core_req_ready_b = '0;
assign core_rsp_valid_b = '0; assign core_bus_cache_if[i].rsp_valid = 0;
assign core_rsp_data_b = '0; assign core_bus_cache_if[i].rsp_data = '0;
assign core_rsp_tag_b = '0; `UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
`UNUSED_VAR (core_rsp_ready_b) end
assign mem_req_valid_b = 0; assign mem_bus_cache_if.req_valid = 0;
assign mem_req_addr_b = '0; assign mem_bus_cache_if.req_data = '0;
assign mem_req_rw_b = '0; `UNUSED_VAR (mem_bus_cache_if.req_ready)
assign mem_req_byteen_b = '0;
assign mem_req_data_b = '0;
assign mem_req_tag_b = '0;
`UNUSED_VAR (mem_req_ready_b)
`UNUSED_VAR (mem_rsp_valid_b) `UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_rsp_data_b) `UNUSED_VAR (mem_bus_cache_if.rsp_data)
`UNUSED_VAR (mem_rsp_tag_b) assign mem_bus_cache_if.rsp_ready = 0;
assign mem_rsp_ready_b = 0;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
assign cache_perf = '0; assign cache_perf = '0;
@ -358,46 +174,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
end else begin end else begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_X_WIDTH)
) core_bus_wrap_if[NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_X_WIDTH)
) mem_bus_wrap_if();
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_wrap_if[i].req_valid = core_req_valid_b[i];
assign core_bus_wrap_if[i].req_data.rw = core_req_rw_b[i];
assign core_bus_wrap_if[i].req_data.addr = core_req_addr_b[i];
assign core_bus_wrap_if[i].req_data.byteen = core_req_byteen_b[i];
assign core_bus_wrap_if[i].req_data.data = core_req_data_b[i];
assign core_bus_wrap_if[i].req_data.tag = core_req_tag_b[i];
assign core_req_ready_b[i] = core_bus_wrap_if[i].req_ready;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_valid_b[i] = core_bus_wrap_if[i].rsp_valid;
assign core_rsp_data_b[i] = core_bus_wrap_if[i].rsp_data.data;
assign core_rsp_tag_b[i] = core_bus_wrap_if[i].rsp_data.tag;
assign core_bus_wrap_if[i].rsp_ready = core_rsp_ready_b[i];
end
assign mem_req_valid_b = mem_bus_wrap_if.req_valid;
assign mem_req_addr_b = mem_bus_wrap_if.req_data.addr;
assign mem_req_rw_b = mem_bus_wrap_if.req_data.rw;
assign mem_req_byteen_b = mem_bus_wrap_if.req_data.byteen;
assign mem_req_data_b = mem_bus_wrap_if.req_data.data;
assign mem_req_tag_b = mem_bus_wrap_if.req_data.tag;
assign mem_bus_wrap_if.req_ready = mem_req_ready_b;
assign mem_bus_wrap_if.rsp_valid = mem_rsp_valid_b;
assign mem_bus_wrap_if.rsp_data.data = mem_rsp_data_b;
assign mem_bus_wrap_if.rsp_data.tag = mem_rsp_tag_b;
assign mem_rsp_ready_b = mem_bus_wrap_if.rsp_ready;
`RESET_RELAY (cache_reset, reset); `RESET_RELAY (cache_reset, reset);
VX_cache #( VX_cache #(
@ -413,25 +189,25 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE), .MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE), .MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE), .WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH), .UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (CORE_TAG_X_WIDTH), .TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_REG (NC_BYPASS ? 1 : CORE_OUT_REG), .CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
.MEM_OUT_REG (NC_BYPASS ? 1 : MEM_OUT_REG) .MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
) cache ( ) cache (
.clk (clk), .clk (clk),
.reset (cache_reset), .reset (cache_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (cache_perf), .cache_perf (cache_perf),
`endif `endif
.core_bus_if (core_bus_cache_if),
.core_bus_if (core_bus_wrap_if), .mem_bus_if (mem_bus_cache_if)
.mem_bus_if (mem_bus_wrap_if)
); );
end end
`ifdef DBG_TRACE_CACHE_BANK `ifdef DBG_TRACE_CACHE
for (genvar i = 0; i < NUM_REQS; ++i) begin for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid; wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
@ -451,12 +227,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (core_req_fire) begin if (core_req_fire) begin
if (core_bus_if[i].req_data.rw) if (core_bus_if[i].req_data.rw)
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid)); `TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
else else
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid)); `TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
end end
if (core_rsp_fire) begin if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid)); `TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end end
end end
end end
@ -464,7 +240,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid; wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid; wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_BYPASS != 0)) begin if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH]; assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH]; assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin end else begin
@ -478,14 +254,14 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (mem_req_fire) begin if (mem_req_fire) begin
if (mem_bus_if.req_data.rw) if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", `TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid)); $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n", `TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid)); $time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
end end
if (mem_rsp_fire) begin if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n", `TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid)); $time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
end end
end end

View file

@ -13,8 +13,8 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_int_unit #( module VX_alu_int #(
parameter CORE_ID = 0, parameter `STRING INSTANCE_ID = "",
parameter BLOCK_IDX = 0, parameter BLOCK_IDX = 0,
parameter NUM_LANES = 1 parameter NUM_LANES = 1
) ( ) (
@ -29,7 +29,7 @@ module VX_int_unit #(
VX_branch_ctl_if.master branch_ctl_if VX_branch_ctl_if.master branch_ctl_if
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES); localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS); localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -40,7 +40,7 @@ module VX_int_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] add_result; wire [NUM_LANES-1:0][`XLEN-1:0] add_result;
wire [NUM_LANES-1:0][`XLEN:0] sub_result; // +1 bit for branch compare wire [NUM_LANES-1:0][`XLEN:0] sub_result; // +1 bit for branch compare
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result; reg [NUM_LANES-1:0][`XLEN-1:0] shr_zic_result;
reg [NUM_LANES-1:0][`XLEN-1:0] msc_result; reg [NUM_LANES-1:0][`XLEN-1:0] msc_result;
wire [NUM_LANES-1:0][`XLEN-1:0] add_result_w; wire [NUM_LANES-1:0][`XLEN-1:0] add_result_w;
@ -52,16 +52,14 @@ module VX_int_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] alu_result_r; wire [NUM_LANES-1:0][`XLEN-1:0] alu_result_r;
`ifdef XLEN_64 `ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod); wire is_alu_w = execute_if.data.op_args.alu.is_w;
`else `else
wire is_alu_w = 0; wire is_alu_w = 0;
`endif `endif
`UNUSED_VAR (execute_if.data.op_mod)
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(execute_if.data.op_type); wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(execute_if.data.op_type);
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(execute_if.data.op_type); wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(execute_if.data.op_type);
wire is_br_op = `INST_ALU_IS_BR(execute_if.data.op_mod); wire is_br_op = (execute_if.data.op_args.alu.xtype == `ALU_TYPE_BRANCH);
wire is_sub_op = `INST_ALU_IS_SUB(alu_op); wire is_sub_op = `INST_ALU_IS_SUB(alu_op);
wire is_signed = `INST_ALU_SIGNED(alu_op); wire is_signed = `INST_ALU_SIGNED(alu_op);
wire [1:0] op_class = is_br_op ? `INST_BR_CLASS(alu_op) : `INST_ALU_CLASS(alu_op); wire [1:0] op_class = is_br_op ? `INST_BR_CLASS(alu_op) : `INST_ALU_CLASS(alu_op);
@ -69,9 +67,9 @@ module VX_int_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data; wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data; wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.use_PC ? {NUM_LANES{execute_if.data.PC}} : alu_in1; wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.op_args.alu.use_PC ? {NUM_LANES{execute_if.data.PC, 1'd0}} : alu_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.use_imm ? {NUM_LANES{execute_if.data.imm}} : alu_in2; wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.use_imm && ~is_br_op) ? {NUM_LANES{execute_if.data.imm}} : alu_in2; wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i]; assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
@ -87,7 +85,18 @@ module VX_int_unit #(
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]}; wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
assign shr_result[i] = `XLEN'($signed(shr_in1) >>> alu_in2_imm[i][SHIFT_IMM_BITS-1:0]); always @(*) begin
case (alu_op[1:0])
`ifdef EXT_ZICOND_ENABLE
2'b10, 2'b11: begin // CZERO
shr_zic_result[i] = alu_in1[i] & {`XLEN{alu_op[0] ^ (| alu_in2[i])}};
end
`endif
default: begin // SRL, SRA, SRLI, SRAI
shr_zic_result[i] = `XLEN'($signed(shr_in1) >>> alu_in2_imm[i][SHIFT_IMM_BITS-1:0]);
end
endcase
end
wire [32:0] shr_in1_w = {is_signed && alu_in1[i][31], alu_in1[i][31:0]}; wire [32:0] shr_in1_w = {is_signed && alu_in1[i][31], alu_in1[i][31:0]};
wire [31:0] shr_res_w = 32'($signed(shr_in1_w) >>> alu_in2_imm[i][4:0]); wire [31:0] shr_res_w = 32'($signed(shr_in1_w) >>> alu_in2_imm[i][4:0]);
assign shr_result_w[i] = `XLEN'($signed(shr_res_w)); assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
@ -102,7 +111,7 @@ module VX_int_unit #(
2'b11: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL 2'b11: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL
endcase endcase
end end
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW
end end
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
@ -112,7 +121,7 @@ module VX_int_unit #(
case ({is_alu_w, op_class}) case ({is_alu_w, op_class})
3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC 3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR* 3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR*
3'b010: alu_result[i] = shr_result[i]; // SRL, SRA, SRLI, SRAI 3'b010: alu_result[i] = shr_zic_result[i]; // SRL, SRA, SRLI, SRAI, CZERO*
3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI 3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI
3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW 3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW
3'b101: alu_result[i] = sub_result_w[i]; // SUBW 3'b101: alu_result[i] = sub_result_w[i]; // SUBW
@ -124,11 +133,14 @@ module VX_int_unit #(
// branch // branch
wire [`XLEN-1:0] PC_r, imm_r; wire [`PC_BITS-1:0] PC_r;
wire [`INST_BR_BITS-1:0] br_op_r; wire [`INST_BR_BITS-1:0] br_op_r;
wire [`PC_BITS-1:0] cbr_dest, cbr_dest_r;
wire [LANE_WIDTH-1:0] tid, tid_r; wire [LANE_WIDTH-1:0] tid, tid_r;
wire is_br_op_r; wire is_br_op_r;
assign cbr_dest = add_result[0][1 +: `PC_BITS];
if (LANE_BITS != 0) begin if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS]; assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin end else begin
@ -136,14 +148,14 @@ module VX_int_unit #(
end end
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH) .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `PC_BITS + `PC_BITS + 1 + `INST_BR_BITS + LANE_WIDTH)
) rsp_buf ( ) rsp_buf (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.valid_in (execute_if.valid), .valid_in (execute_if.valid),
.ready_in (execute_if.ready), .ready_in (execute_if.ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}), .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, cbr_dest, is_br_op, br_op, tid}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}), .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, cbr_dest_r, is_br_op_r, br_op_r, tid_r}),
.valid_out (commit_if.valid), .valid_out (commit_if.valid),
.ready_out (commit_if.ready) .ready_out (commit_if.ready)
); );
@ -152,19 +164,19 @@ module VX_int_unit #(
wire is_br_neg = `INST_BR_IS_NEG(br_op_r); wire is_br_neg = `INST_BR_IS_NEG(br_op_r);
wire is_br_less = `INST_BR_IS_LESS(br_op_r); wire is_br_less = `INST_BR_IS_LESS(br_op_r);
wire is_br_static = `INST_BR_IS_STATIC(br_op_r); wire is_br_static = `INST_BR_IS_STATIC(br_op_r);
wire [`XLEN-1:0] br_result = alu_result_r[tid_r];
wire [`XLEN-1:0] br_result = alu_result_r[tid_r];
wire is_less = br_result[0]; wire is_less = br_result[0];
wire is_equal = br_result[1]; wire is_equal = br_result[1];
wire br_enable = is_br_op_r && commit_if.valid && commit_if.ready && commit_if.data.eop; wire br_enable = is_br_op_r && commit_if.valid && commit_if.ready && commit_if.data.eop;
wire br_taken = ((is_br_less ? is_less : is_equal) ^ is_br_neg) | is_br_static; wire br_taken = ((is_br_less ? is_less : is_equal) ^ is_br_neg) | is_br_static;
wire [`XLEN-1:0] br_dest = is_br_static ? br_result : (PC_r + imm_r); wire [`PC_BITS-1:0] br_dest = is_br_static ? br_result[1 +: `PC_BITS] : cbr_dest_r;
wire [`NW_WIDTH-1:0] br_wid; wire [`NW_WIDTH-1:0] br_wid;
`ASSIGN_BLOCKED_WID (br_wid, commit_if.data.wid, BLOCK_IDX, `NUM_ALU_BLOCKS) `ASSIGN_BLOCKED_WID (br_wid, commit_if.data.wid, BLOCK_IDX, `NUM_ALU_BLOCKS)
VX_pipe_register #( VX_pipe_register #(
.DATAW (1 + `NW_WIDTH + 1 + `XLEN) .DATAW (1 + `NW_WIDTH + 1 + `PC_BITS)
) branch_reg ( ) branch_reg (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
@ -174,16 +186,16 @@ module VX_int_unit #(
); );
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? (PC_r + 4) : alu_result_r[i]; assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i];
end end
assign commit_if.data.PC = PC_r; assign commit_if.data.PC = PC_r;
`ifdef DBG_TRACE_CORE_PIPELINE `ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (branch_ctl_if.valid) begin if (br_enable) begin
`TRACE(1, ("%d: core%0d-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n", `TRACE(1, ("%d: %s-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, CORE_ID, branch_ctl_if.wid, commit_if.data.PC, branch_ctl_if.taken, branch_ctl_if.dest, commit_if.data.uuid)); $time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid));
end end
end end
`endif `endif

View file

@ -13,8 +13,8 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_muldiv_unit #( module VX_alu_muldiv #(
parameter CORE_ID = 0, parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1 parameter NUM_LANES = 1
) ( ) (
input wire clk, input wire clk,
@ -26,10 +26,10 @@ module VX_muldiv_unit #(
// Outputs // Outputs
VX_commit_if.master commit_if VX_commit_if.master commit_if
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); localparam PID_WIDTH = `UP(PID_BITS);
localparam TAGW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + PID_WIDTH + 1 + 1; localparam TAG_WIDTH = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data) `UNUSED_VAR (execute_if.data.rs3_data)
@ -38,7 +38,7 @@ module VX_muldiv_unit #(
wire is_mulx_op = `INST_M_IS_MULX(muldiv_op); wire is_mulx_op = `INST_M_IS_MULX(muldiv_op);
wire is_signed_op = `INST_M_SIGNED(muldiv_op); wire is_signed_op = `INST_M_SIGNED(muldiv_op);
`ifdef XLEN_64 `ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod); wire is_alu_w = execute_if.data.op_args.alu.is_w;
`else `else
wire is_alu_w = 0; wire is_alu_w = 0;
`endif `endif
@ -47,7 +47,7 @@ module VX_muldiv_unit #(
wire [`UUID_WIDTH-1:0] mul_uuid_out; wire [`UUID_WIDTH-1:0] mul_uuid_out;
wire [`NW_WIDTH-1:0] mul_wid_out; wire [`NW_WIDTH-1:0] mul_wid_out;
wire [NUM_LANES-1:0] mul_tmask_out; wire [NUM_LANES-1:0] mul_tmask_out;
wire [`XLEN-1:0] mul_PC_out; wire [`PC_BITS-1:0] mul_PC_out;
wire [`NR_BITS-1:0] mul_rd_out; wire [`NR_BITS-1:0] mul_rd_out;
wire mul_wb_out; wire mul_wb_out;
wire [PID_WIDTH-1:0] mul_pid_out; wire [PID_WIDTH-1:0] mul_pid_out;
@ -69,7 +69,7 @@ module VX_muldiv_unit #(
wire mul_fire_in = mul_valid_in && mul_ready_in; wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] mul_resultl, mul_resulth; reg [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i]; wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i]; wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin always @(*) begin
@ -79,7 +79,7 @@ module VX_muldiv_unit #(
end end
VX_shift_register #( VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)), .DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL), .DEPTH (`LATENCY_IMUL),
.RESETW (1) .RESETW (1)
) mul_shift_reg ( ) mul_shift_reg (
@ -138,7 +138,7 @@ module VX_muldiv_unit #(
.result (mul_result_tmp) .result (mul_result_tmp)
); );
reg [TAGW+2-1:0] mul_tag_r; reg [TAG_WIDTH+2-1:0] mul_tag_r;
always @(posedge clk) begin always @(posedge clk) begin
if (mul_valid_in && mul_ready_in) begin if (mul_valid_in && mul_ready_in) begin
mul_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_mulh_in, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}; mul_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_mulh_in, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
@ -169,7 +169,7 @@ module VX_muldiv_unit #(
end end
VX_shift_register #( VX_shift_register #(
.DATAW (1 + TAGW + 1 + 1), .DATAW (1 + TAG_WIDTH + 1 + 1),
.DEPTH (`LATENCY_IMUL), .DEPTH (`LATENCY_IMUL),
.RESETW (1) .RESETW (1)
) mul_shift_reg ( ) mul_shift_reg (
@ -203,7 +203,7 @@ module VX_muldiv_unit #(
wire [`UUID_WIDTH-1:0] div_uuid_out; wire [`UUID_WIDTH-1:0] div_uuid_out;
wire [`NW_WIDTH-1:0] div_wid_out; wire [`NW_WIDTH-1:0] div_wid_out;
wire [NUM_LANES-1:0] div_tmask_out; wire [NUM_LANES-1:0] div_tmask_out;
wire [`XLEN-1:0] div_PC_out; wire [`PC_BITS-1:0] div_PC_out;
wire [`NR_BITS-1:0] div_rd_out; wire [`NR_BITS-1:0] div_rd_out;
wire div_wb_out; wire div_wb_out;
wire [PID_WIDTH-1:0] div_pid_out; wire [PID_WIDTH-1:0] div_pid_out;
@ -235,7 +235,7 @@ module VX_muldiv_unit #(
wire div_fire_in = div_valid_in && div_ready_in; wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] div_quotient, div_remainder; reg [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder); dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
end end
@ -244,7 +244,7 @@ module VX_muldiv_unit #(
end end
VX_shift_register #( VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)), .DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL), .DEPTH (`LATENCY_IMUL),
.RESETW (1) .RESETW (1)
) div_shift_reg ( ) div_shift_reg (
@ -297,7 +297,7 @@ module VX_muldiv_unit #(
.remainder (div_remainder) .remainder (div_remainder)
); );
reg [TAGW+2-1:0] div_tag_r; reg [TAG_WIDTH+2-1:0] div_tag_r;
always @(posedge clk) begin always @(posedge clk) begin
if (div_valid_in && div_ready_in) begin if (div_valid_in && div_ready_in) begin
div_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_rem_op, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}; div_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_rem_op, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
@ -323,8 +323,9 @@ module VX_muldiv_unit #(
VX_stream_arb #( VX_stream_arb #(
.NUM_INPUTS (2), .NUM_INPUTS (2),
.DATAW (TAGW + (NUM_LANES * `XLEN)), .DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.OUT_REG (1) .ARBITER ("F"),
.OUT_BUF (1)
) rsp_buf ( ) rsp_buf (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),

View file

@ -14,7 +14,7 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_alu_unit #( module VX_alu_unit #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
@ -27,60 +27,58 @@ module VX_alu_unit #(
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS] VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS]
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS; localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
localparam NUM_LANES = `NUM_ALU_LANES; localparam NUM_LANES = `NUM_ALU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED; localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #( VX_execute_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE](); ) per_block_execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit #( VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE), .BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 1 : 0) .OUT_BUF (PARTIAL_BW ? 1 : 0)
) dispatch_unit ( ) dispatch_unit (
.clk (clk), .clk (clk),
.reset (dispatch_reset), .reset (reset),
.dispatch_if(dispatch_if), .dispatch_if(dispatch_if),
.execute_if (execute_if) .execute_if (per_block_execute_if)
); );
VX_commit_if #( VX_commit_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE](); ) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
wire is_muldiv_op; `RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1));
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
VX_execute_if #( VX_execute_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) int_execute_if(); ) int_execute_if();
assign int_execute_if.valid = execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = execute_if[block_idx].data;
VX_commit_if #( VX_commit_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) int_commit_if(); ) int_commit_if();
`RESET_RELAY (int_reset, reset); assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = per_block_execute_if[block_idx].data;
VX_int_unit #( VX_alu_int #(
.CORE_ID (CORE_ID), .INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
.BLOCK_IDX (block_idx), .BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) int_unit ( ) alu_int (
.clk (clk), .clk (clk),
.reset (int_reset), .reset (block_reset),
.execute_if (int_execute_if), .execute_if (int_execute_if),
.branch_ctl_if (branch_ctl_if[block_idx]), .branch_ctl_if (branch_ctl_if[block_idx]),
.commit_if (int_commit_if) .commit_if (int_commit_if)
@ -88,84 +86,78 @@ module VX_alu_unit #(
`ifdef EXT_M_ENABLE `ifdef EXT_M_ENABLE
assign is_muldiv_op = `INST_ALU_IS_M(execute_if[block_idx].data.op_mod);
`RESET_RELAY (mdv_reset, reset);
VX_execute_if #( VX_execute_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) mdv_execute_if(); ) muldiv_execute_if();
assign mdv_execute_if.valid = execute_if[block_idx].valid && is_muldiv_op;
assign mdv_execute_if.data = execute_if[block_idx].data;
VX_commit_if #( VX_commit_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) mdv_commit_if(); ) muldiv_commit_if();
VX_muldiv_unit #( assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
.CORE_ID (CORE_ID), assign muldiv_execute_if.data = per_block_execute_if[block_idx].data;
VX_alu_muldiv #(
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) mdv_unit ( ) muldiv_unit (
.clk (clk), .clk (clk),
.reset (mdv_reset), .reset (block_reset),
.execute_if (mdv_execute_if), .execute_if (muldiv_execute_if),
.commit_if (mdv_commit_if) .commit_if (muldiv_commit_if)
); );
assign execute_if[block_idx].ready = is_muldiv_op ? mdv_execute_if.ready : int_execute_if.ready;
`else
assign is_muldiv_op = 0;
assign execute_if[block_idx].ready = int_execute_if.ready;
`endif `endif
assign per_block_execute_if[block_idx].ready =
`ifdef EXT_M_ENABLE
is_muldiv_op ? muldiv_execute_if.ready :
`endif
int_execute_if.ready;
// send response // send response
VX_stream_arb #( VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE), .NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW), .DATAW (RSP_ARB_DATAW),
.OUT_REG (PARTIAL_BW ? 1 : 3) .OUT_BUF (PARTIAL_BW ? 1 : 3),
.ARBITER ("F")
) rsp_arb ( ) rsp_arb (
.clk (clk), .clk (clk),
.reset (reset), .reset (block_reset),
.valid_in ({ .valid_in ({
`ifdef EXT_M_ENABLE `ifdef EXT_M_ENABLE
mdv_commit_if.valid, muldiv_commit_if.valid,
`endif `endif
int_commit_if.valid int_commit_if.valid
}), }),
.ready_in ({ .ready_in ({
`ifdef EXT_M_ENABLE `ifdef EXT_M_ENABLE
mdv_commit_if.ready, muldiv_commit_if.ready,
`endif `endif
int_commit_if.ready int_commit_if.ready
}), }),
.data_in ({ .data_in ({
`ifdef EXT_M_ENABLE `ifdef EXT_M_ENABLE
mdv_commit_if.data, muldiv_commit_if.data,
`endif `endif
int_commit_if.data int_commit_if.data
}), }),
.data_out (commit_block_if[block_idx].data), .data_out (per_block_commit_if[block_idx].data),
.valid_out (commit_block_if[block_idx].valid), .valid_out (per_block_commit_if[block_idx].valid),
.ready_out (commit_block_if[block_idx].ready), .ready_out (per_block_commit_if[block_idx].ready),
`UNUSED_PIN (sel_out) `UNUSED_PIN (sel_out)
); );
end end
`RESET_RELAY (commit_reset, reset);
VX_gather_unit #( VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE), .BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 3 : 0) .OUT_BUF (PARTIAL_BW ? 3 : 0)
) gather_unit ( ) gather_unit (
.clk (clk), .clk (clk),
.reset (commit_reset), .reset (reset),
.commit_in_if (commit_block_if), .commit_in_if (per_block_commit_if),
.commit_out_if (commit_if) .commit_out_if (commit_if)
); );

View file

@ -13,88 +13,69 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_commit import VX_gpu_pkg::*; #( module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
// inputs // inputs
VX_commit_if.slave alu_commit_if [`ISSUE_WIDTH], VX_commit_if.slave commit_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
VX_commit_if.slave lsu_commit_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if [`ISSUE_WIDTH],
`endif
VX_commit_if.slave sfu_commit_if [`ISSUE_WIDTH],
// outputs // outputs
VX_writeback_if.master writeback_if [`ISSUE_WIDTH], VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
VX_commit_csr_if.master commit_csr_if, VX_commit_csr_if.master commit_csr_if,
VX_commit_sched_if.master commit_sched_if, VX_commit_sched_if.master commit_sched_if
// simulation helper signals
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1; localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1); localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1; localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
// commit arbitration // commit arbitration
VX_commit_if commit_if[`ISSUE_WIDTH](); VX_commit_if commit_arb_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire; wire [`ISSUE_WIDTH-1:0] per_issue_commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid; wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] per_issue_commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask; wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop; wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [`NUM_EX_UNITS-1:0] valid_in;
wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
wire [`NUM_EX_UNITS-1:0] ready_in;
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid;
assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data;
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
end
`RESET_RELAY (arb_reset, reset); `RESET_RELAY (arb_reset, reset);
VX_stream_arb #( VX_stream_arb #(
.NUM_INPUTS (`NUM_EX_UNITS), .NUM_INPUTS (`NUM_EX_UNITS),
.DATAW (DATAW), .DATAW (DATAW),
.ARBITER ("R"), .ARBITER ("R"),
.OUT_REG (1) .OUT_BUF (1)
) commit_arb ( ) commit_arb (
.clk (clk), .clk (clk),
.reset (arb_reset), .reset (arb_reset),
.valid_in ({ .valid_in (valid_in),
sfu_commit_if[i].valid, .ready_in (ready_in),
`ifdef EXT_F_ENABLE .data_in (data_in),
fpu_commit_if[i].valid, .data_out (commit_arb_if[i].data),
`endif .valid_out (commit_arb_if[i].valid),
alu_commit_if[i].valid, .ready_out (commit_arb_if[i].ready),
lsu_commit_if[i].valid
}),
.ready_in ({
sfu_commit_if[i].ready,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].ready,
`endif
alu_commit_if[i].ready,
lsu_commit_if[i].ready
}),
.data_in ({
sfu_commit_if[i].data,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].data,
`endif
alu_commit_if[i].data,
lsu_commit_if[i].data
}),
.data_out (commit_if[i].data),
.valid_out (commit_if[i].valid),
.ready_out (commit_if[i].ready),
`UNUSED_PIN (sel_out) `UNUSED_PIN (sel_out)
); );
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready; assign per_issue_commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask; assign per_issue_commit_tmask[i]= {`NUM_THREADS{per_issue_commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid; assign per_issue_commit_wid[i] = commit_arb_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop; assign per_issue_commit_eop[i] = commit_arb_if[i].data.eop;
end end
// CSRs update // CSRs update
@ -103,11 +84,11 @@ module VX_commit import VX_gpu_pkg::*; #(
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr; wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr; wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire); assign commit_fire_any = (| per_issue_commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] count; wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, commit_tmask[i]); `POP_COUNT(count, per_issue_commit_tmask[i]);
assign commit_size[i] = count; assign commit_size[i] = count;
end end
@ -155,69 +136,56 @@ module VX_commit import VX_gpu_pkg::*; #(
end end
assign commit_csr_if.instret = instret; assign commit_csr_if.instret = instret;
// Committed instructions // Track committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop; reg [`NUM_WARPS-1:0] committed_warps;
always @(*) begin
committed_warps = 0;
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
if (per_issue_commit_fire[i] && per_issue_commit_eop[i]) begin
committed_warps[per_issue_commit_wid[i]] = 1;
end
end
end
VX_pipe_register #( VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)), .DATAW (`NUM_WARPS),
.RESETW (`ISSUE_WIDTH) .RESETW (`NUM_WARPS)
) committed_pipe_reg ( ) committed_pipe_reg (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.enable (1'b1), .enable (1'b1),
.data_in ({committed, commit_wid}), .data_in (committed_warps),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid}) .data_out ({commit_sched_if.committed_warps})
); );
// Writeback // Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb; assign writeback_if[i].valid = commit_arb_if[i].valid && commit_arb_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_if[i].data.uuid; assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid); assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC; assign writeback_if[i].data.PC = commit_arb_if[i].data.PC;
assign writeback_if[i].data.tmask= commit_if[i].data.tmask; assign writeback_if[i].data.tmask= commit_arb_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd; assign writeback_if[i].data.rd = commit_arb_if[i].data.rd;
assign writeback_if[i].data.data = commit_if[i].data.data; assign writeback_if[i].data.data = commit_arb_if[i].data.data;
assign writeback_if[i].data.sop = commit_if[i].data.sop; assign writeback_if[i].data.sop = commit_arb_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop; assign writeback_if[i].data.eop = commit_arb_if[i].data.eop;
assign commit_if[i].ready = 1'b1; // writeback has no backpressure assign commit_arb_if[i].ready = 1'b1; // writeback has no backpressure
end end
// simulation helper signal to get RISC-V tests Pass/Fail status `ifdef DBG_TRACE_PIPELINE
reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;
always @(posedge clk) begin
if (writeback_if[0].valid) begin
sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
end
end
assign sim_wb_value = sim_wb_value_r;
`ifdef DBG_TRACE_CORE_PIPELINE
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
always @(posedge clk) begin always @(posedge clk) begin
if (alu_commit_if[i].valid && alu_commit_if[i].ready) begin if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.tmask, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop)); `TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
`TRACE_ARRAY1D(1, alu_commit_if[i].data.data, `NUM_THREADS); trace_ex_type(1, j);
`TRACE(1, (" (#%0d)\n", alu_commit_if[i].data.uuid)); `TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop));
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid));
end end
if (lsu_commit_if[i].valid && lsu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, lsu_commit_if[i].data.wid, lsu_commit_if[i].data.PC, lsu_commit_if[i].data.tmask, lsu_commit_if[i].data.wb, lsu_commit_if[i].data.rd, lsu_commit_if[i].data.sop, lsu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, lsu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", lsu_commit_if[i].data.uuid));
end
`ifdef EXT_F_ENABLE
if (fpu_commit_if[i].valid && fpu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, fpu_commit_if[i].data.wid, fpu_commit_if[i].data.PC, fpu_commit_if[i].data.tmask, fpu_commit_if[i].data.wb, fpu_commit_if[i].data.rd, fpu_commit_if[i].data.sop, fpu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, fpu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", fpu_commit_if[i].data.uuid));
end
`endif
if (sfu_commit_if[i].valid && sfu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=SFU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, sfu_commit_if[i].data.wid, sfu_commit_if[i].data.PC, sfu_commit_if[i].data.tmask, sfu_commit_if[i].data.wb, sfu_commit_if[i].data.rd, sfu_commit_if[i].data.sop, sfu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, sfu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", sfu_commit_if[i].data.uuid));
end end
end end
end end

View file

@ -18,7 +18,8 @@
`endif `endif
module VX_core import VX_gpu_pkg::*; #( module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = ""
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
@ -40,10 +41,6 @@ module VX_core import VX_gpu_pkg::*; #(
VX_gbar_bus_if.master gbar_bus_if, VX_gbar_bus_if.master gbar_bus_if,
`endif `endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status // Status
output wire busy output wire busy
); );
@ -57,39 +54,24 @@ module VX_core import VX_gpu_pkg::*; #(
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS](); VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
VX_warp_ctl_if warp_ctl_if(); VX_warp_ctl_if warp_ctl_if();
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH](); VX_dispatch_if dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
VX_commit_if alu_commit_if[`ISSUE_WIDTH](); VX_commit_if commit_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
`ifdef EXT_F_ENABLE
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
`endif
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH](); VX_writeback_if writeback_if[`ISSUE_WIDTH]();
VX_mem_bus_if #( VX_lsu_mem_if #(
.DATA_SIZE (DCACHE_WORD_SIZE), .NUM_LANES (`NUM_LSU_LANES),
.TAG_WIDTH (DCACHE_TAG_WIDTH) .DATA_SIZE (LSU_WORD_SIZE),
) dcache_bus_tmp_if[DCACHE_NUM_REQS](); .TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_pipeline_perf_if pipeline_perf_if();
VX_mem_perf_if mem_perf_tmp_if(); VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache; assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache; assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache; assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache; assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
`ifdef SM_ENABLE
cache_perf_t smem_perf;
assign mem_perf_tmp_if.smem = smem_perf;
`else
assign mem_perf_tmp_if.smem = '0;
`endif
assign mem_perf_tmp_if.mem = mem_perf_if.mem; assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif `endif
@ -113,19 +95,21 @@ module VX_core import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (3) `SCOPE_IO_SWITCH (3)
VX_schedule #( VX_schedule #(
.INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)),
.CORE_ID (CORE_ID) .CORE_ID (CORE_ID)
) schedule ( ) schedule (
.clk (clk), .clk (clk),
.reset (schedule_reset), .reset (schedule_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule), .sched_perf (pipeline_perf_if.sched),
`endif `endif
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if), .branch_ctl_if (branch_ctl_if),
.decode_sched_if(decode_sched_if), .decode_sched_if(decode_sched_if),
.commit_sched_if(commit_sched_if), .commit_sched_if(commit_sched_if),
@ -139,7 +123,7 @@ module VX_core import VX_gpu_pkg::*; #(
); );
VX_fetch #( VX_fetch #(
.CORE_ID (CORE_ID) .INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID))
) fetch ( ) fetch (
`SCOPE_IO_BIND (0) `SCOPE_IO_BIND (0)
.clk (clk), .clk (clk),
@ -150,7 +134,7 @@ module VX_core import VX_gpu_pkg::*; #(
); );
VX_decode #( VX_decode #(
.CORE_ID (CORE_ID) .INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID))
) decode ( ) decode (
.clk (clk), .clk (clk),
.reset (decode_reset), .reset (decode_reset),
@ -160,7 +144,7 @@ module VX_core import VX_gpu_pkg::*; #(
); );
VX_issue #( VX_issue #(
.CORE_ID (CORE_ID) .INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID))
) issue ( ) issue (
`SCOPE_IO_BIND (1) `SCOPE_IO_BIND (1)
@ -168,21 +152,16 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (issue_reset), .reset (issue_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue), .issue_perf (pipeline_perf_if.issue),
`endif `endif
.decode_if (decode_if), .decode_if (decode_if),
.writeback_if (writeback_if), .writeback_if (writeback_if),
.dispatch_if (dispatch_if)
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
`endif
.sfu_dispatch_if(sfu_dispatch_if)
); );
VX_execute #( VX_execute #(
.INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)),
.CORE_ID (CORE_ID) .CORE_ID (CORE_ID)
) execute ( ) execute (
`SCOPE_IO_BIND (2) `SCOPE_IO_BIND (2)
@ -190,89 +169,175 @@ module VX_core import VX_gpu_pkg::*; #(
.clk (clk), .clk (clk),
.reset (execute_reset), .reset (execute_reset),
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if), .mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf_if(pipeline_perf_if),
`endif `endif
.dcache_bus_if (dcache_bus_tmp_if), .base_dcrs (base_dcrs),
`ifdef EXT_F_ENABLE .lsu_mem_if (lsu_mem_if),
.fpu_dispatch_if(fpu_dispatch_if),
.fpu_commit_if (fpu_commit_if), .dispatch_if (dispatch_if),
`endif .commit_if (commit_if),
.commit_csr_if (commit_csr_if), .commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if), .sched_csr_if (sched_csr_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
.sfu_dispatch_if(sfu_dispatch_if),
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if), .branch_ctl_if (branch_ctl_if)
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if),
.sfu_commit_if (sfu_commit_if),
.sim_ebreak (sim_ebreak)
); );
VX_commit #( VX_commit #(
.CORE_ID (CORE_ID) .INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID))
) commit ( ) commit (
.clk (clk), .clk (clk),
.reset (commit_reset), .reset (commit_reset),
.alu_commit_if (alu_commit_if), .commit_if (commit_if),
.lsu_commit_if (lsu_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.sfu_commit_if (sfu_commit_if),
.writeback_if (writeback_if), .writeback_if (writeback_if),
.commit_csr_if (commit_csr_if), .commit_csr_if (commit_csr_if),
.commit_sched_if(commit_sched_if), .commit_sched_if(commit_sched_if)
.sim_wb_value (sim_wb_value)
); );
`ifdef SM_ENABLE VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
VX_smem_unit #( `ifdef LMEM_ENABLE
.CORE_ID (CORE_ID)
) smem_unit ( `RESET_RELAY (lmem_unit_reset, reset);
VX_lmem_unit #(
.INSTANCE_ID (INSTANCE_ID)
) lmem_unit (
.clk (clk), .clk (clk),
.reset (reset), .reset (lmem_unit_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.cache_perf (smem_perf), .cache_perf (mem_perf_tmp_if.lmem),
`endif `endif
.dcache_bus_in_if (dcache_bus_tmp_if), .lsu_mem_in_if (lsu_mem_if),
.dcache_bus_out_if (dcache_bus_if) .lsu_mem_out_if (lsu_dcache_if)
); );
`else `else
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]); `ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end end
`endif `endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if();
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
`RESET_RELAY (mem_coalescer_reset, reset);
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) mem_coalescer (
.clk (clk),
.reset (mem_coalescer_reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_atype (lsu_dcache_if[i].req_data.atype),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if.req_valid),
.out_req_mask (dcache_coalesced_if.req_data.mask),
.out_req_rw (dcache_coalesced_if.req_data.rw),
.out_req_byteen (dcache_coalesced_if.req_data.byteen),
.out_req_addr (dcache_coalesced_if.req_data.addr),
.out_req_atype (dcache_coalesced_if.req_data.atype),
.out_req_data (dcache_coalesced_if.req_data.data),
.out_req_tag (dcache_coalesced_if.req_data.tag),
.out_req_ready (dcache_coalesced_if.req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if.rsp_valid),
.out_rsp_mask (dcache_coalesced_if.rsp_data.mask),
.out_rsp_data (dcache_coalesced_if.rsp_data.data),
.out_rsp_tag (dcache_coalesced_if.rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if.rsp_ready)
);
end else begin
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if, lsu_dcache_if[i]);
end
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
`RESET_RELAY (lsu_adapter_reset, reset);
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (lsu_adapter_reset),
.lsu_mem_if (dcache_coalesced_if),
.mem_bus_if (dcache_bus_tmp_if)
);
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [1:0] perf_icache_pending_read_cycle; wire [1:0] perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle; wire [`CLOG2(LSU_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads; reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
@ -281,19 +346,26 @@ module VX_core import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] perf_loads; reg [`PERF_CTR_BITS-1:0] perf_loads;
reg [`PERF_CTR_BITS-1:0] perf_stores; reg [`PERF_CTR_BITS-1:0] perf_stores;
wire perf_icache_req_fire = icache_bus_if.req_valid & icache_bus_if.req_ready; wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid & icache_bus_if.rsp_ready; wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_wr_req_fire, perf_dcache_rsp_fire; wire [LSU_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && ~dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_data.rw && dcache_bus_if[i].req_ready; assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready; assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;
end
end end
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire); `BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire); `BUFFER(perf_dcache_wr_req_fire_r, perf_dcache_wr_req_fire);
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire); `POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire; assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;

View file

@ -32,13 +32,14 @@ module VX_core_top import VX_gpu_pkg::*; #(
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw, output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen, output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr, output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data, output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_req_tag, output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready, input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
input wire [DCACHE_NUM_REQS-1:0] dcache_rsp_valid, input wire [DCACHE_NUM_REQS-1:0] dcache_rsp_valid,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_rsp_data, input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_rsp_data,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_rsp_tag, input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_rsp_tag,
output wire [DCACHE_NUM_REQS-1:0] dcache_rsp_ready, output wire [DCACHE_NUM_REQS-1:0] dcache_rsp_ready,
output wire icache_req_valid, output wire icache_req_valid,
@ -63,11 +64,6 @@ module VX_core_top import VX_gpu_pkg::*; #(
input wire gbar_rsp_valid, input wire gbar_rsp_valid,
input wire [`NB_WIDTH-1:0] gbar_rsp_id, input wire [`NB_WIDTH-1:0] gbar_rsp_id,
`endif `endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status // Status
output wire busy output wire busy
); );
@ -92,7 +88,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
VX_mem_bus_if #( VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE), .DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH) .TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_if[DCACHE_NUM_REQS](); ) dcache_bus_if[DCACHE_NUM_REQS]();
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
@ -100,6 +96,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw; assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen; assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr; assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype;
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data; assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag; assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
assign dcache_bus_if[i].req_ready = dcache_req_ready[i]; assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
@ -122,6 +119,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_req_data = icache_bus_if.req_data.data; assign icache_req_data = icache_bus_if.req_data.data;
assign icache_req_tag = icache_bus_if.req_data.tag; assign icache_req_tag = icache_bus_if.req_data.tag;
assign icache_bus_if.req_ready = icache_req_ready; assign icache_bus_if.req_ready = icache_req_ready;
`UNUSED_VAR (icache_bus_if.req_data.atype)
assign icache_bus_if.rsp_valid = icache_rsp_valid; assign icache_bus_if.rsp_valid = icache_rsp_valid;
assign icache_bus_if.rsp_data.tag = icache_rsp_tag; assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
@ -130,6 +128,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if(); VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.lmem = '0;
assign mem_perf_if.mem = '0;
`endif `endif
`ifdef SCOPE `ifdef SCOPE
@ -140,6 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
`endif `endif
VX_core #( VX_core #(
.INSTANCE_ID ($sformatf("core")),
.CORE_ID (CORE_ID) .CORE_ID (CORE_ID)
) core ( ) core (
`SCOPE_IO_BIND (0) `SCOPE_IO_BIND (0)
@ -160,8 +165,6 @@ module VX_core_top import VX_gpu_pkg::*; #(
.gbar_bus_if (gbar_bus_if), .gbar_bus_if (gbar_bus_if),
`endif `endif
.sim_ebreak (sim_ebreak),
.sim_wb_value (sim_wb_value),
.busy (busy) .busy (busy)
); );

View file

@ -17,12 +17,22 @@
`include "VX_fpu_define.vh" `include "VX_fpu_define.vh"
`endif `endif
`ifdef XLEN_64
`define CSR_READ_64(addr, dst, src) \
addr : dst = `XLEN'(src)
`else
`define CSR_READ_64(addr, dst, src) \
addr : dst = src[31:0]; \
addr+12'h80 : dst = 32'(src[$bits(src)-1:32])
`endif
module VX_csr_data module VX_csr_data
import VX_gpu_pkg::*; import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
import VX_fpu_pkg::*; import VX_fpu_pkg::*;
`endif `endif
#( #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0 parameter CORE_ID = 0
) ( ) (
input wire clk, input wire clk,
@ -33,13 +43,12 @@ import VX_fpu_pkg::*;
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif `endif
VX_commit_csr_if.slave commit_csr_if, VX_commit_csr_if.slave commit_csr_if,
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS], VX_fpu_csr_if.slave fpu_csr_if [`NUM_FPU_BLOCKS],
`endif `endif
input wire [`PERF_CTR_BITS-1:0] cycles, input wire [`PERF_CTR_BITS-1:0] cycles,
@ -50,14 +59,14 @@ import VX_fpu_pkg::*;
input wire [`UUID_WIDTH-1:0] read_uuid, input wire [`UUID_WIDTH-1:0] read_uuid,
input wire [`NW_WIDTH-1:0] read_wid, input wire [`NW_WIDTH-1:0] read_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] read_addr, input wire [`VX_CSR_ADDR_BITS-1:0] read_addr,
output wire [31:0] read_data_ro, output wire [`XLEN-1:0] read_data_ro,
output wire [31:0] read_data_rw, output wire [`XLEN-1:0] read_data_rw,
input wire write_enable, input wire write_enable,
input wire [`UUID_WIDTH-1:0] write_uuid, input wire [`UUID_WIDTH-1:0] write_uuid,
input wire [`NW_WIDTH-1:0] write_wid, input wire [`NW_WIDTH-1:0] write_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] write_addr, input wire [`VX_CSR_ADDR_BITS-1:0] write_addr,
input wire [31:0] write_data input wire [`XLEN-1:0] write_data
); );
`UNUSED_VAR (reset) `UNUSED_VAR (reset)
@ -66,16 +75,20 @@ import VX_fpu_pkg::*;
// CSRs Write ///////////////////////////////////////////////////////////// // CSRs Write /////////////////////////////////////////////////////////////
reg [`XLEN-1:0] mscratch;
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n; reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable; wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid; wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags; fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_write_enable[i] = fpu_to_csr_if[i].write_enable; assign fpu_write_enable[i] = fpu_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_to_csr_if[i].write_wid; assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_to_csr_if[i].write_fflags; assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
end end
always @(*) begin always @(*) begin
fcsr_n = fcsr; fcsr_n = fcsr;
for (integer i = 0; i < `NUM_FPU_BLOCKS; ++i) begin for (integer i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
@ -95,7 +108,7 @@ import VX_fpu_pkg::*;
end end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_to_csr_if[i].read_frm = fcsr[fpu_to_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]; assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end end
always @(posedge clk) begin always @(posedge clk) begin
@ -108,6 +121,9 @@ import VX_fpu_pkg::*;
`endif `endif
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin
mscratch <= base_dcrs.startup_arg;
end
if (write_enable) begin if (write_enable) begin
case (write_addr) case (write_addr)
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
@ -124,9 +140,14 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC, `VX_CSR_MTVEC,
`VX_CSR_MEPC, `VX_CSR_MEPC,
`VX_CSR_PMPCFG0, `VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0: /* do nothing!*/; `VX_CSR_PMPADDR0: begin
// do nothing!
end
`VX_CSR_MSCRATCH: begin
mscratch <= write_data;
end
default: begin default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid)); `ASSERT(0, ("%t: *** %s invalid CSR write address: %0h (#%0d)", $time, INSTANCE_ID, write_addr, write_uuid));
end end
endcase endcase
end end
@ -134,8 +155,8 @@ import VX_fpu_pkg::*;
// CSRs read ////////////////////////////////////////////////////////////// // CSRs read //////////////////////////////////////////////////////////////
reg [31:0] read_data_ro_r; reg [`XLEN-1:0] read_data_ro_r;
reg [31:0] read_data_rw_r; reg [`XLEN-1:0] read_data_rw_r;
reg read_addr_valid_r; reg read_addr_valid_r;
always @(*) begin always @(*) begin
@ -143,28 +164,32 @@ import VX_fpu_pkg::*;
read_data_rw_r = '0; read_data_rw_r = '0;
read_addr_valid_r = 1; read_addr_valid_r = 1;
case (read_addr) case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = 32'(`VENDOR_ID); `VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = 32'(`ARCHITECTURE_ID); `VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = 32'(`IMPLEMENTATION_ID); `VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = (((`CLOG2(`XLEN)-4) << (`XLEN-2)) | `MISA_STD); `VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_r = 32'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]); `VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]); `VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = 32'(fcsr[read_wid]); `VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`endif `endif
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid); `VX_CSR_MSCRATCH : read_data_rw_r = mscratch;
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]); `VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps); `VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS); `VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_NUM_WARPS : read_data_ro_r = 32'(`NUM_WARPS); `VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_NUM_CORES : read_data_ro_r = 32'(`NUM_CORES * `NUM_CLUSTERS); `VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
`VX_CSR_MCYCLE : read_data_ro_r = 32'(cycles[31:0]); `VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
`VX_CSR_MCYCLE_H : read_data_ro_r = 32'(cycles[`PERF_CTR_BITS-1:32]); `VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x; `VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x; `VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`VX_CSR_MINSTRET : read_data_ro_r = 32'(commit_csr_if.instret[31:0]);
`VX_CSR_MINSTRET_H : read_data_ro_r = 32'(commit_csr_if.instret[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
`VX_CSR_SATP, `VX_CSR_SATP,
`VX_CSR_MSTATUS, `VX_CSR_MSTATUS,
@ -175,7 +200,7 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC, `VX_CSR_MTVEC,
`VX_CSR_MEPC, `VX_CSR_MEPC,
`VX_CSR_PMPCFG0, `VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_r = 32'(0); `VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0);
default: begin default: begin
read_addr_valid_r = 0; read_addr_valid_r = 0;
@ -187,116 +212,65 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin `VX_DCR_MPM_CLASS_CORE: begin
case (read_addr) case (read_addr)
// PERF: pipeline // PERF: pipeline
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0]; `CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles);
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
`VX_CSR_MPM_FETCH_ST : read_data_ro_r = pipeline_perf_if.fetch_stalls[31:0]; `CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
`VX_CSR_MPM_FETCH_ST_H : read_data_ro_r = 32'(pipeline_perf_if.fetch_stalls[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0]; `CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ALU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_ALU][31:0];
`VX_CSR_MPM_ALU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_ALU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LSU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_LSU][31:0];
`VX_CSR_MPM_LSU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_LSU][`PERF_CTR_BITS-1:32]);
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`VX_CSR_MPM_FPU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_FPU][31:0]; `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_FPU][`PERF_CTR_BITS-1:32]);
`else `else
`VX_CSR_MPM_FPU_ST : read_data_ro_r = '0; `CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0));
`VX_CSR_MPM_FPU_ST_H : read_data_ro_r = '0;
`endif `endif
`VX_CSR_MPM_SFU_ST : read_data_ro_r = pipeline_perf_if.dsp_stalls[`EX_SFU][31:0]; `CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`VX_CSR_MPM_SFU_ST_H : read_data_ro_r = 32'(pipeline_perf_if.dsp_stalls[`EX_SFU][`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_ALU][`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_ALU][31:0]; `CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_FPU][31:0];
`else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`endif
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = 32'(pipeline_perf_if.scb_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = pipeline_perf_if.scb_uses[`EX_SFU][31:0];
// PERF: memory // PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0]; `CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0]; `CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0]; `CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LAT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LAT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LAT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
default:; default:;
endcase endcase
end end
`VX_DCR_MPM_CLASS_MEM: begin `VX_DCR_MPM_CLASS_MEM: begin
case (read_addr) case (read_addr)
// PERF: icache // PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0]; `CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads);
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0]; `CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls);
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: dcache // PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0]; `CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads);
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0]; `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses);
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0]; `CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0]; // PERF: lmem
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0]; `CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
`VX_CSR_MPM_DCACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache // PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0]; `CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0]; `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
`VX_CSR_MPM_L2CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0]; `CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
`VX_CSR_MPM_L2CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l3cache // PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0]; `CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0]; `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses);
`VX_CSR_MPM_L3CACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0]; `CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls);
`VX_CSR_MPM_L3CACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H : read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: memory // PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0]; `CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads);
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]); `CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0]; `CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency);
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LAT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LAT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
default:; default:;
endcase endcase
end end
@ -316,10 +290,8 @@ import VX_fpu_pkg::*;
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid)) `RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_wctl_stalls = sfu_perf_if.wctl_stalls;
`UNUSED_VAR (perf_wctl_stalls);
`UNUSED_VAR (mem_perf_if.icache); `UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.smem); `UNUSED_VAR (mem_perf_if.lmem);
`endif `endif
endmodule endmodule

View file

@ -14,6 +14,7 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_csr_unit import VX_gpu_pkg::*; #( module VX_csr_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0, parameter CORE_ID = 0,
parameter NUM_LANES = 1 parameter NUM_LANES = 1
) ( ) (
@ -25,11 +26,10 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if,
VX_sfu_perf_if.slave sfu_perf_if,
`endif `endif
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS], VX_fpu_csr_if.slave fpu_csr_if [`NUM_FPU_BLOCKS],
`endif `endif
VX_commit_csr_if.slave commit_csr_if, VX_commit_csr_if.slave commit_csr_if,
@ -37,40 +37,43 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
VX_execute_if.slave execute_if, VX_execute_if.slave execute_if,
VX_commit_if.master commit_if VX_commit_if.master commit_if
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1; localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data) `UNUSED_VAR (execute_if.data.rs3_data)
reg [NUM_LANES-1:0][31:0] csr_read_data; reg [NUM_LANES-1:0][`XLEN-1:0] csr_read_data;
reg [31:0] csr_write_data; reg [`XLEN-1:0] csr_write_data;
wire [31:0] csr_read_data_ro, csr_read_data_rw; wire [`XLEN-1:0] csr_read_data_ro, csr_read_data_rw;
wire [31:0] csr_req_data; wire [`XLEN-1:0] csr_req_data;
reg csr_rd_enable; reg csr_rd_enable;
wire csr_wr_enable; wire csr_wr_enable;
wire csr_req_ready; wire csr_req_ready;
// wait for all pending instructions to complete wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.op_args.csr.addr;
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.op_args.csr.imm;
wire is_fpu_csr = (csr_addr <= `VX_CSR_FCSR);
// wait for all pending instructions for current warp to complete
assign sched_csr_if.alm_empty_wid = execute_if.data.wid; assign sched_csr_if.alm_empty_wid = execute_if.data.wid;
wire no_pending_instr = sched_csr_if.alm_empty; wire no_pending_instr = sched_csr_if.alm_empty || ~is_fpu_csr;
wire csr_req_valid = execute_if.valid && no_pending_instr; wire csr_req_valid = execute_if.valid && no_pending_instr;
assign execute_if.ready = csr_req_ready && no_pending_instr; assign execute_if.ready = csr_req_ready && no_pending_instr;
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.imm[`VX_CSR_ADDR_BITS-1:0]; wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.imm[`VX_CSR_ADDR_BITS +: `NRI_BITS];
wire [NUM_LANES-1:0][31:0] rs1_data;
`UNUSED_VAR (rs1_data) `UNUSED_VAR (rs1_data)
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
assign rs1_data[i] = execute_if.data.rs1_data[i][31:0]; assign rs1_data[i] = execute_if.data.rs1_data[i];
end end
wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW); wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW);
VX_csr_data #( VX_csr_data #(
.INSTANCE_ID (INSTANCE_ID),
.CORE_ID (CORE_ID) .CORE_ID (CORE_ID)
) csr_data ( ) csr_data (
.clk (clk), .clk (clk),
@ -81,7 +84,6 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if), .pipeline_perf_if(pipeline_perf_if),
.sfu_perf_if (sfu_perf_if),
`endif `endif
.commit_csr_if (commit_csr_if), .commit_csr_if (commit_csr_if),
@ -90,7 +92,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.thread_masks (sched_csr_if.thread_masks), .thread_masks (sched_csr_if.thread_masks),
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if), .fpu_csr_if (fpu_csr_if),
`endif `endif
.read_enable (csr_req_valid && csr_rd_enable), .read_enable (csr_req_valid && csr_rd_enable),
@ -109,15 +111,15 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
// CSR read // CSR read
wire [NUM_LANES-1:0][31:0] wtid, gtid; wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid;
for (genvar i = 0; i < NUM_LANES; ++i) begin for (genvar i = 0; i < NUM_LANES; ++i) begin
if (PID_BITS != 0) begin if (PID_BITS != 0) begin
assign wtid[i] = 32'(execute_if.data.pid * NUM_LANES + i); assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i);
end else begin end else begin
assign wtid[i] = 32'(i); assign wtid[i] = `XLEN'(i);
end end
assign gtid[i] = (32'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (32'(execute_if.data.wid) << `NT_BITS) + wtid[i]; assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
end end
always @(*) begin always @(*) begin
@ -134,8 +136,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
// CSR write // CSR write
assign csr_req_data = execute_if.data.use_imm ? 32'(csr_imm) : rs1_data[0]; assign csr_req_data = execute_if.data.op_args.csr.use_imm ? `XLEN'(csr_imm) : rs1_data[0];
assign csr_wr_enable = (csr_write_enable || (| csr_req_data)); assign csr_wr_enable = (csr_write_enable || (| csr_req_data));
always @(*) begin always @(*) begin
@ -154,12 +155,9 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
end end
// unlock the warp // unlock the warp
assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop; assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop && is_fpu_csr;
assign sched_csr_if.unlock_wid = execute_if.data.wid; assign sched_csr_if.unlock_wid = execute_if.data.wid;
// send response
wire [NUM_LANES-1:0][31:0] csr_commit_data;
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (DATAW), .DATAW (DATAW),
.SIZE (2) .SIZE (2)
@ -169,13 +167,9 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.valid_in (csr_req_valid), .valid_in (csr_req_valid),
.ready_in (csr_req_ready), .ready_in (csr_req_ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}), .data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}), .data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
.valid_out (commit_if.valid), .valid_out (commit_if.valid),
.ready_out (commit_if.ready) .ready_out (commit_if.ready)
); );
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = `XLEN'(csr_commit_data[i]);
end
endmodule endmodule

View file

@ -12,9 +12,8 @@
// limitations under the License. // limitations under the License.
`include "VX_define.vh" `include "VX_define.vh"
`include "VX_trace.vh"
module VX_dcr_data import VX_gpu_pkg::*; ( module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
input wire clk, input wire clk,
input wire reset, input wire reset,
@ -35,6 +34,10 @@ module VX_dcr_data import VX_gpu_pkg::*; (
`VX_DCR_BASE_STARTUP_ADDR0 : dcrs.startup_addr[31:0] <= dcr_bus_if.write_data; `VX_DCR_BASE_STARTUP_ADDR0 : dcrs.startup_addr[31:0] <= dcr_bus_if.write_data;
`ifdef XLEN_64 `ifdef XLEN_64
`VX_DCR_BASE_STARTUP_ADDR1 : dcrs.startup_addr[63:32] <= dcr_bus_if.write_data; `VX_DCR_BASE_STARTUP_ADDR1 : dcrs.startup_addr[63:32] <= dcr_bus_if.write_data;
`endif
`VX_DCR_BASE_STARTUP_ARG0 : dcrs.startup_arg[31:0] <= dcr_bus_if.write_data;
`ifdef XLEN_64
`VX_DCR_BASE_STARTUP_ARG1 : dcrs.startup_arg[63:32] <= dcr_bus_if.write_data;
`endif `endif
`VX_DCR_BASE_MPM_CLASS : dcrs.mpm_class <= dcr_bus_if.write_data[7:0]; `VX_DCR_BASE_MPM_CLASS : dcrs.mpm_class <= dcr_bus_if.write_data[7:0];
default:; default:;
@ -44,12 +47,12 @@ module VX_dcr_data import VX_gpu_pkg::*; (
assign base_dcrs = dcrs; assign base_dcrs = dcrs;
`ifdef DBG_TRACE_CORE_PIPELINE `ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (dcr_bus_if.write_valid) begin if (dcr_bus_if.write_valid) begin
`TRACE(1, ("%d: base-dcr: state=", $time)); `TRACE(1, ("%d: base-dcr: state=", $time));
trace_base_dcr(1, dcr_bus_if.write_addr); trace_base_dcr(1, dcr_bus_if.write_addr);
`TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data)); `TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data));
end end
end end
`endif `endif

View file

@ -12,7 +12,6 @@
// limitations under the License. // limitations under the License.
`include "VX_define.vh" `include "VX_define.vh"
`include "VX_trace.vh"
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`define USED_IREG(x) \ `define USED_IREG(x) \
@ -28,8 +27,8 @@
use_``x = 1 use_``x = 1
`endif `endif
module VX_decode #( module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
@ -42,18 +41,17 @@ module VX_decode #(
VX_decode_sched_if.master decode_sched_if VX_decode_sched_if.master decode_sched_if
); );
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1; localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4);
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (clk) `UNUSED_VAR (clk)
`UNUSED_VAR (reset) `UNUSED_VAR (reset)
reg [`EX_BITS-1:0] ex_type; reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type; reg [`INST_OP_BITS-1:0] op_type;
reg [`INST_MOD_BITS-1:0] op_mod; op_args_t op_args;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r; reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`XLEN-1:0] imm; reg use_rd, use_rs1, use_rs2, use_rs3;
reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm;
reg is_wstall; reg is_wstall;
wire [31:0] instr = fetch_if.data.instr; wire [31:0] instr = fetch_if.data.instr;
@ -78,6 +76,7 @@ module VX_decode #(
`UNUSED_VAR (use_rs3) `UNUSED_VAR (use_rs3)
wire is_itype_sh = func3[0] && ~func3[1]; wire is_itype_sh = func3[0] && ~func3[1];
wire is_fpu_csr = (u_12 <= `VX_CSR_FCSR);
wire [19:0] ui_imm = instr[31:12]; wire [19:0] ui_imm = instr[31:12];
`ifdef XLEN_64 `ifdef XLEN_64
@ -145,18 +144,21 @@ module VX_decode #(
end end
`endif `endif
`STATIC_ASSERT($bits(alu_args_t) == $bits(op_args_t), ("alu_args_t size mismatch: current=%0d, expected=%0d", $bits(alu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(fpu_args_t) == $bits(op_args_t), ("fpu_args_t size mismatch: current=%0d, expected=%0d", $bits(fpu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(lsu_args_t) == $bits(op_args_t), ("lsu_args_t size mismatch: current=%0d, expected=%0d", $bits(lsu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(csr_args_t) == $bits(op_args_t), ("csr_args_t size mismatch: current=%0d, expected=%0d", $bits(csr_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(wctl_args_t) == $bits(op_args_t), ("wctl_args_t size mismatch: current=%0d, expected=%0d", $bits(wctl_args_t), $bits(op_args_t)));
always @(*) begin always @(*) begin
ex_type = '0; ex_type = '0;
op_type = 'x; op_type = 'x;
op_mod = '0; op_args = 'x;
rd_r = '0; rd_r = '0;
rs1_r = '0; rs1_r = '0;
rs2_r = '0; rs2_r = '0;
rs3_r = '0; rs3_r = '0;
imm = 'x;
use_imm = 0;
use_PC = 0;
use_rd = 0; use_rd = 0;
use_rs1 = 0; use_rs1 = 0;
use_rs2 = 0; use_rs2 = 0;
@ -167,137 +169,174 @@ module VX_decode #(
`INST_I: begin `INST_I: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type); op_type = `INST_OP_BITS'(r_type);
op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, i_imm);
use_rd = 1; use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){i_imm[11]}}, i_imm};
`USED_IREG (rd); `USED_IREG (rd);
`USED_IREG (rs1); `USED_IREG (rs1);
end end
`INST_R: begin `INST_R: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE op_args.alu.is_w = 0;
if (func7[0]) begin op_args.alu.use_PC = 0;
op_type = `INST_OP_BITS'(m_type); op_args.alu.use_imm = 0;
op_mod[1] = 1;
end else
`endif
begin
op_type = `INST_OP_BITS'(r_type);
end
use_rd = 1; use_rd = 1;
`USED_IREG (rd); `USED_IREG (rd);
`USED_IREG (rs1); `USED_IREG (rs1);
`USED_IREG (rs2); `USED_IREG (rs2);
case (func7)
`ifdef EXT_M_ENABLE
`INST_R_F7_MUL: begin
// MUL, MULH, MULHSU, MULHU
op_type = `INST_OP_BITS'(m_type);
op_args.alu.xtype = `ALU_TYPE_MULDIV;
end
`endif
`ifdef EXT_ZICOND_ENABLE
`INST_R_F7_ZICOND: begin
// CZERO-EQZ, CZERO-NEZ
op_type = func3[1] ? `INST_OP_BITS'(`INST_ALU_CZNE) : `INST_OP_BITS'(`INST_ALU_CZEQ);
op_args.alu.xtype = `ALU_TYPE_ARITH;
end
`endif
default: begin
op_type = `INST_OP_BITS'(r_type);
op_args.alu.xtype = `ALU_TYPE_ARITH;
end
endcase
end end
`ifdef XLEN_64 `ifdef XLEN_64
`INST_I_W: begin `INST_I_W: begin
// ADDIW, SLLIW, SRLIW, SRAIW // ADDIW, SLLIW, SRLIW, SRAIW
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type); op_type = `INST_OP_BITS'(r_type);
op_mod[2] = 1; op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 1;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, iw_imm);
use_rd = 1; use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){iw_imm[11]}}, iw_imm};
`USED_IREG (rd); `USED_IREG (rd);
`USED_IREG (rs1); `USED_IREG (rs1);
end end
`INST_R_W: begin `INST_R_W: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE op_args.alu.is_w = 1;
if (func7[0]) begin op_args.alu.use_PC = 0;
// MULW, DIVW, DIVUW, REMW, REMUW op_args.alu.use_imm = 0;
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
`endif
begin
// ADDW, SUBW, SLLW, SRLW, SRAW
op_type = `INST_OP_BITS'(r_type);
end
op_mod[2] = 1;
use_rd = 1; use_rd = 1;
`USED_IREG (rd); `USED_IREG (rd);
`USED_IREG (rs1); `USED_IREG (rs1);
`USED_IREG (rs2); `USED_IREG (rs2);
case (func7)
`ifdef EXT_M_ENABLE
`INST_R_F7_MUL: begin
// MULW, DIVW, DIVUW, REMW, REMUW
op_type = `INST_OP_BITS'(m_type);
op_args.alu.xtype = `ALU_TYPE_MULDIV;
end
`endif
default: begin
// ADDW, SUBW, SLLW, SRLW, SRAW
op_type = `INST_OP_BITS'(r_type);
op_args.alu.xtype = `ALU_TYPE_ARITH;
end
endcase
end end
`endif `endif
`INST_LUI: begin `INST_LUI: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_LUI); op_type = `INST_OP_BITS'(`INST_ALU_LUI);
op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
use_rd = 1; use_rd = 1;
use_imm = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd); `USED_IREG (rd);
end end
`INST_AUIPC: begin `INST_AUIPC: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC); op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 1;
op_args.alu.use_imm = 1;
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
use_rd = 1; use_rd = 1;
use_imm = 1;
use_PC = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd); `USED_IREG (rd);
end end
`INST_JAL: begin `INST_JAL: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JAL); op_type = `INST_OP_BITS'(`INST_BR_JAL);
op_mod[0] = 1; op_args.alu.xtype = `ALU_TYPE_BRANCH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 1;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, jal_imm);
use_rd = 1; use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1; is_wstall = 1;
imm = {{(`XLEN-21){jal_imm[20]}}, jal_imm};
`USED_IREG (rd); `USED_IREG (rd);
end end
`INST_JALR: begin `INST_JALR: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JALR); op_type = `INST_OP_BITS'(`INST_BR_JALR);
op_mod[0] = 1; op_args.alu.xtype = `ALU_TYPE_BRANCH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, u_12);
use_rd = 1; use_rd = 1;
use_imm = 1;
is_wstall = 1; is_wstall = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
`USED_IREG (rd); `USED_IREG (rd);
`USED_IREG (rs1); `USED_IREG (rs1);
end end
`INST_B: begin `INST_B: begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(b_type); op_type = `INST_OP_BITS'(b_type);
op_mod[0] = 1; op_args.alu.xtype = `ALU_TYPE_BRANCH;
use_imm = 1; op_args.alu.is_w = 0;
use_PC = 1; op_args.alu.use_PC = 1;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, b_imm);
is_wstall = 1; is_wstall = 1;
imm = {{(`XLEN-13){b_imm[12]}}, b_imm};
`USED_IREG (rs1); `USED_IREG (rs1);
`USED_IREG (rs2); `USED_IREG (rs2);
end end
`INST_FENCE: begin `INST_FENCE: begin
ex_type = `EX_LSU; ex_type = `EX_LSU;
op_type = `INST_LSU_FENCE; op_type = `INST_LSU_FENCE;
op_args.lsu.is_store = 0;
op_args.lsu.is_float = 0;
op_args.lsu.offset = 0;
end end
`INST_SYS : begin `INST_SYS : begin
if (func3[1:0] != 0) begin if (func3[1:0] != 0) begin
ex_type = `EX_SFU; ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0])); op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0]));
op_args.csr.addr = u_12;
op_args.csr.use_imm = func3[2];
use_rd = 1; use_rd = 1;
is_wstall = 1; is_wstall = is_fpu_csr; // only stall for FPU CSRs
use_imm = func3[2];
imm[`VX_CSR_ADDR_BITS-1:0] = u_12; // addr
`USED_IREG (rd); `USED_IREG (rd);
if (func3[2]) begin if (func3[2]) begin
imm[`VX_CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm op_args.csr.imm = rs1;
end else begin end else begin
`USED_IREG (rs1); `USED_IREG (rs1);
end end
end else begin end else begin
ex_type = `EX_ALU; ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(s_type); op_type = `INST_OP_BITS'(s_type);
op_mod[0] = 1; op_args.alu.xtype = `ALU_TYPE_BRANCH;
op_args.alu.is_w = 0;
op_args.alu.use_imm = 1;
op_args.alu.use_PC = 1;
op_args.alu.imm = `IMM_BITS'd4;
use_rd = 1; use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1; is_wstall = 1;
imm = `XLEN'd4;
`USED_IREG (rd); `USED_IREG (rd);
end end
end end
@ -307,9 +346,10 @@ module VX_decode #(
`INST_L: begin `INST_L: begin
ex_type = `EX_LSU; ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b0, func3}); op_type = `INST_OP_BITS'({1'b0, func3});
op_args.lsu.is_store = 0;
op_args.lsu.is_float = opcode[2];
op_args.lsu.offset = u_12;
use_rd = 1; use_rd = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
use_imm = 1;
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
if (opcode[2]) begin if (opcode[2]) begin
`USED_FREG (rd); `USED_FREG (rd);
@ -324,8 +364,9 @@ module VX_decode #(
`INST_S: begin `INST_S: begin
ex_type = `EX_LSU; ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b1, func3}); op_type = `INST_OP_BITS'({1'b1, func3});
imm = {{(`XLEN-12){s_imm[11]}}, s_imm}; op_args.lsu.is_store = 1;
use_imm = 1; op_args.lsu.is_float = opcode[2];
op_args.lsu.offset = s_imm;
`USED_IREG (rs1); `USED_IREG (rs1);
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
if (opcode[2]) begin if (opcode[2]) begin
@ -341,8 +382,8 @@ module VX_decode #(
`INST_FNMADD: begin `INST_FNMADD: begin
ex_type = `EX_FPU; ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]}); op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_mod = `INST_MOD_BITS'(func3); op_args.fpu.frm = func3;
imm[0] = func2[0]; // destination is double? op_args.fpu.fmt[0] = func2[0]; // float / double
use_rd = 1; use_rd = 1;
`USED_FREG (rd); `USED_FREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
@ -351,10 +392,9 @@ module VX_decode #(
end end
`INST_FCI: begin `INST_FCI: begin
ex_type = `EX_FPU; ex_type = `EX_FPU;
op_mod = `INST_MOD_BITS'(func3); op_args.fpu.frm = func3;
`ifdef FLEN_64 op_args.fpu.fmt[0] = func2[0]; // float / double
imm[0] = func2[0]; // destination is double? op_args.fpu.fmt[1] = rs2[1]; // int32 / int64
`endif
use_rd = 1; use_rd = 1;
case (func5) case (func5)
5'b00000, // FADD 5'b00000, // FADD
@ -369,7 +409,7 @@ module VX_decode #(
5'b00100: begin 5'b00100: begin
// NCP: FSGNJ=0, FSGNJN=1, FSGNJX=2 // NCP: FSGNJ=0, FSGNJN=1, FSGNJX=2
op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = `INST_MOD_BITS'(func3[1:0]); op_args.fpu.frm = `INST_FRM_BITS'(func3[1:0]);
`USED_FREG (rd); `USED_FREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
`USED_FREG (rs2); `USED_FREG (rs2);
@ -377,47 +417,41 @@ module VX_decode #(
5'b00101: begin 5'b00101: begin
// NCP: FMIN=6, FMAX=7 // NCP: FMIN=6, FMAX=7
op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = func3[0] ? 7 : 6; op_args.fpu.frm = `INST_FRM_BITS'(func3[0] ? 7 : 6);
`USED_FREG (rd); `USED_FREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
`USED_FREG (rs2); `USED_FREG (rs2);
end end
`ifdef FLEN_64 `ifdef FLEN_64
5'b01000: begin 5'b01000: begin
// CVT.S.D, CVT.D.S // FCVT.S.D, FCVT.D.S
op_type = `INST_OP_BITS'(`INST_FPU_F2F); op_type = `INST_OP_BITS'(`INST_FPU_F2F);
`USED_FREG (rd); `USED_FREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
end end
`endif `endif
5'b01011: begin 5'b01011: begin
// SQRT // FSQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT); op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
`USED_FREG (rd); `USED_FREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
end end
5'b10100: begin 5'b10100: begin
// CMP // FCMP
op_type = `INST_OP_BITS'(`INST_FPU_CMP); op_type = `INST_OP_BITS'(`INST_FPU_CMP);
`USED_IREG (rd); `USED_IREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
`USED_FREG (rs2); `USED_FREG (rs2);
end end
5'b11000: begin 5'b11000: begin
// CVT.W.X, CVT.WU.X // FCVT.W.X, FCVT.WU.X
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_F2U) : `INST_OP_BITS'(`INST_FPU_F2I); op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_F2U) : `INST_OP_BITS'(`INST_FPU_F2I);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_IREG (rd); `USED_IREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
end end
5'b11010: begin 5'b11010: begin
// CVT.X.W, CVT.X.WU // FCVT.X.W, FCVT.X.WU
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_U2F) : `INST_OP_BITS'(`INST_FPU_I2F); op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_U2F) : `INST_OP_BITS'(`INST_FPU_I2F);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_FREG (rd); `USED_FREG (rd);
`USED_IREG (rs1); `USED_IREG (rs1);
end end
@ -425,11 +459,11 @@ module VX_decode #(
if (func3[0]) begin if (func3[0]) begin
// NCP: FCLASS=3 // NCP: FCLASS=3
op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 3; op_args.fpu.frm = `INST_FRM_BITS'(3);
end else begin end else begin
// NCP: FMV.X.W=4 // NCP: FMV.X.W=4
op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 4; op_args.fpu.frm = `INST_FRM_BITS'(4);
end end
`USED_IREG (rd); `USED_IREG (rd);
`USED_FREG (rs1); `USED_FREG (rs1);
@ -437,7 +471,7 @@ module VX_decode #(
5'b11110: begin 5'b11110: begin
// NCP: FMV.W.X=5 // NCP: FMV.W.X=5
op_type = `INST_OP_BITS'(`INST_FPU_MISC); op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 5; op_args.fpu.frm = `INST_FRM_BITS'(5);
`USED_FREG (rd); `USED_FREG (rd);
`USED_IREG (rs1); `USED_IREG (rs1);
end end
@ -463,6 +497,7 @@ module VX_decode #(
3'h2: begin // SPLIT 3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_SFU_SPLIT); op_type = `INST_OP_BITS'(`INST_SFU_SPLIT);
use_rd = 1; use_rd = 1;
op_args.wctl.is_neg = rs2[0];
`USED_IREG (rs1); `USED_IREG (rs1);
`USED_IREG (rd); `USED_IREG (rd);
end end
@ -477,6 +512,7 @@ module VX_decode #(
end end
3'h5: begin // PRED 3'h5: begin // PRED
op_type = `INST_OP_BITS'(`INST_SFU_PRED); op_type = `INST_OP_BITS'(`INST_SFU_PRED);
op_args.wctl.is_neg = rd[0];
`USED_IREG (rs1); `USED_IREG (rs1);
`USED_IREG (rs2); `USED_IREG (rs2);
end end
@ -486,25 +522,6 @@ module VX_decode #(
default:; default:;
endcase endcase
end end
`INST_EXT2: begin
case (func3)
3'h1: begin
case (func2)
2'h0: begin // CMOV
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CMOV);
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`USED_IREG (rs3);
end
default:;
endcase
end
default:;
endcase
end
default:; default:;
endcase endcase
end end
@ -520,8 +537,8 @@ module VX_decode #(
.reset (reset), .reset (reset),
.valid_in (fetch_if.valid), .valid_in (fetch_if.valid),
.ready_in (fetch_if.ready), .ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, use_PC, imm, use_imm, wb, rd_r, rs1_r, rs2_r, rs3_r}), .data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.use_PC, decode_if.data.imm, decode_if.data.use_imm, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}), .data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid), .valid_out (decode_if.valid),
.ready_out (decode_if.ready) .ready_out (decode_if.ready)
); );
@ -533,18 +550,21 @@ module VX_decode #(
assign decode_sched_if.valid = fetch_fire; assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid; assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall; assign decode_sched_if.is_wstall = is_wstall;
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop; assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif
`ifdef DBG_TRACE_CORE_PIPELINE `ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%d: core%0d-decode: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, CORE_ID, decode_if.data.wid, decode_if.data.PC, instr)); `TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
trace_ex_type(1, decode_if.data.ex_type); trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op=")); `TRACE(1, (", op="));
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.rd, decode_if.data.rs2, decode_if.data.use_imm, decode_if.data.imm); trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=0x%0h, opds=%b%b%b%b, use_pc=%b, use_imm=%b (#%0d)\n", `TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b",
decode_if.data.op_mod, decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, decode_if.data.imm, use_rd, use_rs1, use_rs2, use_rs3, decode_if.data.use_PC, decode_if.data.use_imm, decode_if.data.uuid)); decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3));
trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid));
end end
end end
`endif `endif

View file

@ -14,7 +14,7 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_dispatch import VX_gpu_pkg::*; #( module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
@ -23,205 +23,85 @@ module VX_dispatch import VX_gpu_pkg::*; #(
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS], output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
`endif `endif
// inputs // inputs
VX_operands_if.slave operands_if [`ISSUE_WIDTH], VX_operands_if.slave operands_if,
// outputs // outputs
VX_dispatch_if.master alu_dispatch_if [`ISSUE_WIDTH], VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH; localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
wire [`ISSUE_WIDTH-1:0][`NT_WIDTH-1:0] last_active_tid;
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids; wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign tids[i] = `NT_WIDTH'(i); assign tids[i] = `NT_WIDTH'(i);
end end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin wire [`NT_WIDTH-1:0] last_active_tid;
VX_find_first #( VX_find_first #(
.N (`NUM_THREADS), .N (`NUM_THREADS),
.DATAW (`NT_WIDTH), .DATAW (`NT_WIDTH),
.REVERSE (1) .REVERSE (1)
) last_tid_select ( ) last_tid_select (
.valid_in (operands_if[i].data.tmask), .valid_in (operands_if.data.tmask),
.data_in (tids), .data_in (tids),
.data_out (last_active_tid[i]), .data_out (last_active_tid),
`UNUSED_PIN (valid_out) `UNUSED_PIN (valid_out)
); );
end
// ALU dispatch wire [`NUM_EX_UNITS-1:0] operands_reset;
assign operands_if.ready = operands_reset[operands_if.data.ex_type];
VX_operands_if alu_operands_if[`ISSUE_WIDTH](); for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin `RESET_RELAY (buffer_reset, reset);
assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU);
assign alu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (alu_reset, reset);
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (DATAW), .DATAW (DATAW),
.SIZE (2), .SIZE (2),
.OUT_REG (2) .OUT_REG (2), // 2-cycle EB for area reduction
) alu_buffer ( .LUTRAM (1)
) buffer (
.clk (clk), .clk (clk),
.reset (alu_reset), .reset (buffer_reset),
.valid_in (alu_operands_if[i].valid), .valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
.ready_in (alu_operands_if[i].ready), .ready_in (operands_reset[i]),
.data_in (`TO_DISPATCH_DATA(alu_operands_if[i].data, last_active_tid[i])), .data_in ({
.data_out (alu_dispatch_if[i].data), operands_if.data.uuid,
.valid_out (alu_dispatch_if[i].valid), operands_if.data.wis,
.ready_out (alu_dispatch_if[i].ready) operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.wb,
operands_if.data.rd,
last_active_tid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.data_out (dispatch_if[i].data),
.valid_out (dispatch_if[i].valid),
.ready_out (dispatch_if[i].ready)
); );
end end
// LSU dispatch
VX_operands_if lsu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU);
assign lsu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (lsu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) lsu_buffer (
.clk (clk),
.reset (lsu_reset),
.valid_in (lsu_operands_if[i].valid),
.ready_in (lsu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(lsu_operands_if[i].data, last_active_tid[i])),
.data_out (lsu_dispatch_if[i].data),
.valid_out (lsu_dispatch_if[i].valid),
.ready_out (lsu_dispatch_if[i].ready)
);
end
// FPU dispatch
`ifdef EXT_F_ENABLE
VX_operands_if fpu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU);
assign fpu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (fpu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) fpu_buffer (
.clk (clk),
.reset (fpu_reset),
.valid_in (fpu_operands_if[i].valid),
.ready_in (fpu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(fpu_operands_if[i].data, last_active_tid[i])),
.data_out (fpu_dispatch_if[i].data),
.valid_out (fpu_dispatch_if[i].valid),
.ready_out (fpu_dispatch_if[i].ready)
);
end
`endif
// SFU dispatch
VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign sfu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_SFU);
assign sfu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (sfu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) sfu_buffer (
.clk (clk),
.reset (sfu_reset),
.valid_in (sfu_operands_if[i].valid),
.ready_in (sfu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(sfu_operands_if[i].data, last_active_tid[i])),
.data_out (sfu_dispatch_if[i].data),
.valid_out (sfu_dispatch_if[i].valid),
.ready_out (sfu_dispatch_if[i].ready)
);
end
// can take next request?
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU))
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
`ifdef EXT_F_ENABLE
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
`endif
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
end
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_n, perf_stalls_r; reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
wire [`ISSUE_WIDTH-1:0] operands_stall;
wire [`ISSUE_WIDTH-1:0][`EX_BITS-1:0] operands_ex_type;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin wire operands_if_stall = operands_if.valid && ~operands_if.ready;
assign operands_stall[i] = operands_if[i].valid && ~operands_if[i].ready;
assign operands_ex_type[i] = operands_if[i].data.ex_type;
end
always @(*) begin
perf_stalls_n = perf_stalls_r;
for (integer i=0; i < `ISSUE_WIDTH; ++i) begin
if (operands_stall[i]) begin
perf_stalls_n[operands_ex_type[i]] += `PERF_CTR_BITS'(1);
end
end
end
always @(posedge clk) begin
if (reset) begin
perf_stalls_r <= '0;
end else begin
perf_stalls_r <= perf_stalls_n;
end
end
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;
end else begin
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(operands_if_stall && operands_if.data.ex_type == `EX_BITS'(i));
end
end
assign perf_stalls[i] = perf_stalls_r[i]; assign perf_stalls[i] = perf_stalls_r[i];
end end
`endif `endif
`ifdef DBG_TRACE_CORE_PIPELINE
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
trace_ex_type(1, operands_if[i].data.ex_type);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
`TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, operands_if[i].data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, operands_if[i].data.rs3_data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
end
end
end
`endif
endmodule endmodule

View file

@ -16,7 +16,7 @@
module VX_dispatch_unit import VX_gpu_pkg::*; #( module VX_dispatch_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1, parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1, parameter NUM_LANES = 1,
parameter OUT_REG = 0, parameter OUT_BUF = 0,
parameter MAX_FANOUT = `MAX_FANOUT parameter MAX_FANOUT = `MAX_FANOUT
) ( ) (
input wire clk, input wire clk,
@ -29,7 +29,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
VX_execute_if.master execute_if [BLOCK_SIZE] VX_execute_if.master execute_if [BLOCK_SIZE]
); );
`STATIC_ASSERT ((`NUM_THREADS == NUM_LANES * (`NUM_THREADS / NUM_LANES)), ("invalid parameter")) `STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE); localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES; localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
localparam PID_BITS = `CLOG2(NUM_PACKETS); localparam PID_BITS = `CLOG2(NUM_PACKETS);
@ -37,8 +38,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE; localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT); localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH); localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN); localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1; localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2)); localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2));
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS); localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
@ -70,8 +71,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (reset) begin
batch_idx <= '0; batch_idx <= '0;
end else if (batch_done) begin end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(1); batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end end
end end
end else begin end else begin
@ -84,6 +85,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx); wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx; assign issue_indices[block_idx] = issue_idx;
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
wire valid_p, ready_p; wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin if (`NUM_THREADS != NUM_LANES) begin
@ -99,7 +102,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire fire_eop = fire_p && is_last_p; wire fire_eop = fire_p && is_last_p;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (block_reset) begin
sent_mask_p <= '0; sent_mask_p <= '0;
is_first_p <= 1; is_first_p <= 1;
end else begin end else begin
@ -203,28 +206,26 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_done[block_idx] = ~valid_p || ready_p; assign block_done[block_idx] = ~valid_p || ready_p;
end end
wire [ISSUE_IDX_W-1:0] wsi; wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin if (BLOCK_SIZE != 1) begin
assign wsi = {batch_idx, BLOCK_SIZE_W'(block_idx)}; assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin end else begin
assign wsi = batch_idx; assign isw = batch_idx;
end end
end else begin end else begin
assign wsi = block_idx; assign isw = block_idx;
end end
`RESET_RELAY(buf_out_reset, reset); wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], wsi);
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (OUT_DATAW), .DATAW (OUT_DATAW),
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG)) .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out ( ) buf_out (
.clk (clk), .clk (clk),
.reset (buf_out_reset), .reset (block_reset),
.valid_in (valid_p), .valid_in (valid_p),
.ready_in (ready_p), .ready_in (ready_p),
.data_in ({ .data_in ({

View file

@ -14,6 +14,7 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_execute import VX_gpu_pkg::*; #( module VX_execute import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0 parameter CORE_ID = 0
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
@ -21,44 +22,33 @@ module VX_execute import VX_gpu_pkg::*; #(
input wire clk, input wire clk,
input wire reset, input wire reset,
input base_dcrs_t base_dcrs,
// Dcache interface
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
// commit interface
VX_commit_csr_if.slave commit_csr_if,
// fetch interface
VX_sched_csr_if.slave sched_csr_if,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if, VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if, VX_pipeline_perf_if.slave pipeline_perf_if,
`endif `endif
`ifdef EXT_F_ENABLE input base_dcrs_t base_dcrs,
VX_dispatch_if.slave fpu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master fpu_commit_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH], // Dcache interface
VX_commit_if.master alu_commit_if [`ISSUE_WIDTH], VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS],
// dispatch interface
VX_dispatch_if.slave dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
// commit interface
VX_commit_if.master commit_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
// scheduler interfaces
VX_sched_csr_if.slave sched_csr_if,
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS], VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],
VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],
VX_dispatch_if.slave sfu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
VX_warp_ctl_if.master warp_ctl_if, VX_warp_ctl_if.master warp_ctl_if,
// simulation helper signals // commit interface
output wire sim_ebreak VX_commit_csr_if.slave commit_csr_if
); );
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS](); VX_fpu_csr_if fpu_csr_if[`NUM_FPU_BLOCKS]();
`endif `endif
`RESET_RELAY (alu_reset, reset); `RESET_RELAY (alu_reset, reset);
@ -66,72 +56,61 @@ module VX_execute import VX_gpu_pkg::*; #(
`RESET_RELAY (sfu_reset, reset); `RESET_RELAY (sfu_reset, reset);
VX_alu_unit #( VX_alu_unit #(
.CORE_ID (CORE_ID) .INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID))
) alu_unit ( ) alu_unit (
.clk (clk), .clk (clk),
.reset (alu_reset), .reset (alu_reset),
.dispatch_if (alu_dispatch_if), .dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.branch_ctl_if (branch_ctl_if), .commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (alu_commit_if) .branch_ctl_if (branch_ctl_if)
); );
`SCOPE_IO_SWITCH (1) `SCOPE_IO_SWITCH (1)
VX_lsu_unit #( VX_lsu_unit #(
.CORE_ID (CORE_ID) .INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID))
) lsu_unit ( ) lsu_unit (
`SCOPE_IO_BIND (0) `SCOPE_IO_BIND (0)
.clk (clk), .clk (clk),
.reset (lsu_reset), .reset (lsu_reset),
.cache_bus_if (dcache_bus_if), .dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.dispatch_if (lsu_dispatch_if), .commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (lsu_commit_if) .lsu_mem_if (lsu_mem_if)
); );
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
`RESET_RELAY (fpu_reset, reset); `RESET_RELAY (fpu_reset, reset);
VX_fpu_unit #( VX_fpu_unit #(
.CORE_ID (CORE_ID) .INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID))
) fpu_unit ( ) fpu_unit (
.clk (clk), .clk (clk),
.reset (fpu_reset), .reset (fpu_reset),
.dispatch_if (fpu_dispatch_if), .dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.fpu_to_csr_if (fpu_to_csr_if), .commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (fpu_commit_if) .fpu_csr_if (fpu_csr_if)
); );
`endif `endif
VX_sfu_unit #( VX_sfu_unit #(
.INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)),
.CORE_ID (CORE_ID) .CORE_ID (CORE_ID)
) sfu_unit ( ) sfu_unit (
.clk (clk), .clk (clk),
.reset (sfu_reset), .reset (sfu_reset),
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if), .mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if), .pipeline_perf_if (pipeline_perf_if),
`endif `endif
.base_dcrs (base_dcrs), .base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.dispatch_if (sfu_dispatch_if), .commit_if (commit_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
`ifdef EXT_F_ENABLE `ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if), .fpu_csr_if (fpu_csr_if),
`endif `endif
.commit_csr_if (commit_csr_if), .commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if), .sched_csr_if (sched_csr_if),
.warp_ctl_if (warp_ctl_if), .warp_ctl_if (warp_ctl_if)
.commit_if (sfu_commit_if)
); );
// simulation helper signal to get RISC-V tests Pass/Fail status
assign sim_ebreak = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready
&& alu_dispatch_if[0].data.wis == 0
&& `INST_ALU_IS_BR(alu_dispatch_if[0].data.op_mod)
&& (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_EBREAK
|| `INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_ECALL);
endmodule endmodule

View file

@ -14,7 +14,7 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_fetch import VX_gpu_pkg::*; #( module VX_fetch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
@ -30,9 +30,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
// outputs // outputs
VX_fetch_if.master fetch_if VX_fetch_if.master fetch_if
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (reset) `UNUSED_VAR (reset)
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
wire icache_req_valid; wire icache_req_valid;
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr; wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
@ -44,56 +43,61 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire icache_req_fire = icache_req_valid && icache_req_ready; wire icache_req_fire = icache_req_valid && icache_req_ready;
wire [ISW_WIDTH-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
assign req_tag = schedule_if.data.wid; assign req_tag = schedule_if.data.wid;
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag; assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
wire [`XLEN-1:0] rsp_PC; wire [`PC_BITS-1:0] rsp_PC;
wire [`NUM_THREADS-1:0] rsp_tmask; wire [`NUM_THREADS-1:0] rsp_tmask;
VX_dp_ram #( VX_dp_ram #(
.DATAW (`XLEN + `NUM_THREADS), .DATAW (`PC_BITS + `NUM_THREADS),
.SIZE (`NUM_WARPS), .SIZE (`NUM_WARPS),
.LUTRAM (1) .LUTRAM (1)
) tag_store ( ) tag_store (
.clk (clk), .clk (clk),
.reset (reset),
.read (1'b1), .read (1'b1),
.write (icache_req_fire), .write (icache_req_fire),
`UNUSED_PIN (wren), .wren (1'b1),
.waddr (req_tag), .waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}), .wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.raddr (rsp_tag), .raddr (rsp_tag),
.rdata ({rsp_PC, rsp_tmask}) .rdata ({rsp_PC, rsp_tmask})
); );
`ifndef L1_ENABLE
// Ensure that the ibuffer doesn't fill up. // Ensure that the ibuffer doesn't fill up.
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request. // This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests.
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus. // This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full; wire [`NUM_WARPS-1:0] pending_ibuf_full;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #( VX_pending_size #(
.SIZE (`IBUF_SIZE) .SIZE (`IBUF_SIZE)
) pending_reads ( ) pending_reads (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.incr (icache_req_fire && schedule_isw == i), .incr (icache_req_fire && schedule_if.data.wid == i),
.decr (fetch_if.ibuf_pop[i]), .decr (fetch_if.ibuf_pop[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (pending_ibuf_full[i]), .full (pending_ibuf_full[i]),
`UNUSED_PIN (size), `UNUSED_PIN (alm_full),
`UNUSED_PIN (empty) `UNUSED_PIN (size)
); );
end end
wire ibuf_ready = ~pending_ibuf_full[schedule_if.data.wid];
`else
wire ibuf_ready = 1'b1;
`endif
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0), `RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid)) ("%t: *** %s invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, INSTANCE_ID, {schedule_if.data.PC, 1'b0}, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
// Icache Request // Icache Request
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
assign icache_req_valid = schedule_if.valid && ibuf_ready; assign icache_req_valid = schedule_if.valid && ibuf_ready;
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2]; assign icache_req_addr = schedule_if.data.PC[1 +: ICACHE_ADDR_WIDTH];
assign icache_req_tag = {schedule_if.data.uuid, req_tag}; assign icache_req_tag = {schedule_if.data.uuid, req_tag};
assign schedule_if.ready = icache_req_ready && ibuf_ready; assign schedule_if.ready = icache_req_ready && ibuf_ready;
@ -112,6 +116,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
.ready_out (icache_bus_if.req_ready) .ready_out (icache_bus_if.req_ready)
); );
assign icache_bus_if.req_data.atype = '0;
assign icache_bus_if.req_data.rw = 0; assign icache_bus_if.req_data.rw = 0;
assign icache_bus_if.req_data.byteen = 4'b1111; assign icache_bus_if.req_data.byteen = 4'b1111;
assign icache_bus_if.req_data.data = '0; assign icache_bus_if.req_data.data = '0;
@ -127,14 +132,14 @@ module VX_fetch import VX_gpu_pkg::*; #(
assign icache_bus_if.rsp_ready = fetch_if.ready; assign icache_bus_if.rsp_ready = fetch_if.ready;
`ifdef DBG_SCOPE_FETCH `ifdef DBG_SCOPE_FETCH
if (CORE_ID == 0) begin
`ifdef SCOPE
wire schedule_fire = schedule_if.valid && schedule_if.ready; wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready; wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #( VX_scope_tap #(
.SCOPE_ID (1), .SCOPE_ID (1),
.TRIGGERW (4), .TRIGGERW (4),
.PROBEW (3*`UUID_WIDTH + 108) .PROBEW (`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
ICACHE_TAG_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
(ICACHE_WORD_SIZE*8) + ICACHE_TAG_WIDTH)
) scope_tap ( ) scope_tap (
.clk (clk), .clk (clk),
.reset (scope_reset), .reset (scope_reset),
@ -154,29 +159,19 @@ module VX_fetch import VX_gpu_pkg::*; #(
.bus_in (scope_bus_in), .bus_in (scope_bus_in),
.bus_out (scope_bus_out) .bus_out (scope_bus_out)
); );
`endif
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
);
`endif
end
`else `else
`SCOPE_IO_UNUSED() `SCOPE_IO_UNUSED()
`endif `endif
`ifdef DBG_TRACE_CORE_ICACHE `ifdef DBG_TRACE_MEM
wire schedule_fire = schedule_if.valid && schedule_if.ready; wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready; wire fetch_fire = fetch_if.valid && fetch_if.ready;
always @(posedge clk) begin always @(posedge clk) begin
if (schedule_fire) begin if (schedule_fire) begin
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid)); `TRACE(1, ("%d: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
end end
if (fetch_fire) begin if (fetch_fire) begin
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid)); `TRACE(1, ("%d: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
end end
end end
`endif `endif

View file

@ -11,54 +11,53 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
`include "VX_define.vh"
`include "VX_fpu_define.vh" `include "VX_fpu_define.vh"
module VX_fpu_unit import VX_fpu_pkg::*; #( module VX_fpu_unit import VX_fpu_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
VX_fpu_to_csr_if.master fpu_to_csr_if[`NUM_FPU_BLOCKS],
VX_commit_if.master commit_if [`ISSUE_WIDTH] // Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_fpu_csr_if.master fpu_csr_if[`NUM_FPU_BLOCKS]
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS; localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
localparam NUM_LANES = `NUM_FPU_LANES; localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `LOG2UP(`FPU_REQ_QUEUE_SIZE); localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #( VX_execute_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE](); ) per_block_execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit #( VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE), .BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 1 : 0) .OUT_BUF (PARTIAL_BW ? 1 : 0)
) dispatch_unit ( ) dispatch_unit (
.clk (clk), .clk (clk),
.reset (dispatch_reset), .reset (reset),
.dispatch_if(dispatch_if), .dispatch_if(dispatch_if),
.execute_if (execute_if) .execute_if (per_block_execute_if)
); );
VX_commit_if #( VX_commit_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE](); ) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`UNUSED_VAR (execute_if[block_idx].data.tid) `UNUSED_VAR (per_block_execute_if[block_idx].data.tid)
`UNUSED_VAR (execute_if[block_idx].data.wb) `UNUSED_VAR (per_block_execute_if[block_idx].data.wb)
`UNUSED_VAR (execute_if[block_idx].data.use_PC)
`UNUSED_VAR (execute_if[block_idx].data.use_imm) `RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
// Store request info // Store request info
wire fpu_req_valid, fpu_req_ready; wire fpu_req_valid, fpu_req_ready;
@ -70,7 +69,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire [`UUID_WIDTH-1:0] fpu_rsp_uuid; wire [`UUID_WIDTH-1:0] fpu_rsp_uuid;
wire [`NW_WIDTH-1:0] fpu_rsp_wid; wire [`NW_WIDTH-1:0] fpu_rsp_wid;
wire [NUM_LANES-1:0] fpu_rsp_tmask; wire [NUM_LANES-1:0] fpu_rsp_tmask;
wire [`XLEN-1:0] fpu_rsp_PC; wire [`PC_BITS-1:0] fpu_rsp_PC;
wire [`NR_BITS-1:0] fpu_rsp_rd; wire [`NR_BITS-1:0] fpu_rsp_rd;
wire [PID_WIDTH-1:0] fpu_rsp_pid; wire [PID_WIDTH-1:0] fpu_rsp_pid;
wire fpu_rsp_sop; wire fpu_rsp_sop;
@ -79,21 +78,21 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag; wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
wire mdata_full; wire mdata_full;
wire [`INST_FMT_BITS-1:0] fpu_fmt = execute_if[block_idx].data.imm[`INST_FMT_BITS-1:0]; wire [`INST_FMT_BITS-1:0] fpu_fmt = per_block_execute_if[block_idx].data.op_args.fpu.fmt;
wire [`INST_FRM_BITS-1:0] fpu_frm = execute_if[block_idx].data.op_mod[`INST_FRM_BITS-1:0]; wire [`INST_FRM_BITS-1:0] fpu_frm = per_block_execute_if[block_idx].data.op_args.fpu.frm;
wire execute_fire = execute_if[block_idx].valid && execute_if[block_idx].ready; wire execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready;
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready; wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
VX_index_buffer #( VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1), .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPU_REQ_QUEUE_SIZE) .SIZE (`FPUQ_SIZE)
) tag_store ( ) tag_store (
.clk (clk), .clk (clk),
.reset (reset), .reset (block_reset),
.acquire_en (execute_fire), .acquire_en (execute_fire),
.write_addr (fpu_req_tag), .write_addr (fpu_req_tag),
.write_data ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.rd, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}), .write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}), .read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.read_addr (fpu_rsp_tag), .read_addr (fpu_rsp_tag),
.release_en (fpu_rsp_fire), .release_en (fpu_rsp_fire),
@ -103,35 +102,33 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
// resolve dynamic FRM from CSR // resolve dynamic FRM from CSR
wire [`INST_FRM_BITS-1:0] fpu_req_frm; wire [`INST_FRM_BITS-1:0] fpu_req_frm;
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].read_wid, execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS) `ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_req_frm = (execute_if[block_idx].data.op_type != `INST_FPU_MISC assign fpu_req_frm = (per_block_execute_if[block_idx].data.op_type != `INST_FPU_MISC
&& fpu_frm == `INST_FRM_DYN) ? fpu_to_csr_if[block_idx].read_frm : fpu_frm; && fpu_frm == `INST_FRM_DYN) ? fpu_csr_if[block_idx].read_frm : fpu_frm;
// submit FPU request // submit FPU request
assign fpu_req_valid = execute_if[block_idx].valid && ~mdata_full; assign fpu_req_valid = per_block_execute_if[block_idx].valid && ~mdata_full;
assign execute_if[block_idx].ready = fpu_req_ready && ~mdata_full; assign per_block_execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
`RESET_RELAY (fpu_reset, reset);
`ifdef FPU_DPI `ifdef FPU_DPI
VX_fpu_dpi #( VX_fpu_dpi #(
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH), .TAG_WIDTH (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3) .OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dpi ( ) fpu_dpi (
.clk (clk), .clk (clk),
.reset (fpu_reset), .reset (block_reset),
.valid_in (fpu_req_valid), .valid_in (fpu_req_valid),
.op_type (execute_if[block_idx].data.op_type), .mask_in (per_block_execute_if[block_idx].data.tmask),
.lane_mask (execute_if[block_idx].data.tmask), .op_type (per_block_execute_if[block_idx].data.op_type),
.fmt (fpu_fmt), .fmt (fpu_fmt),
.frm (fpu_req_frm), .frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data), .dataa (per_block_execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data), .datab (per_block_execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data), .datac (per_block_execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag), .tag_in (fpu_req_tag),
.ready_in (fpu_req_ready), .ready_in (fpu_req_ready),
@ -147,20 +144,20 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_fpu_fpnew #( VX_fpu_fpnew #(
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH), .TAG_WIDTH (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3) .OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_fpnew ( ) fpu_fpnew (
.clk (clk), .clk (clk),
.reset (fpu_reset), .reset (block_reset),
.valid_in (fpu_req_valid), .valid_in (fpu_req_valid),
.op_type (execute_if[block_idx].data.op_type), .mask_in (per_block_execute_if[block_idx].data.tmask),
.lane_mask (execute_if[block_idx].data.tmask), .op_type (per_block_execute_if[block_idx].data.op_type),
.fmt (fpu_fmt), .fmt (fpu_fmt),
.frm (fpu_req_frm), .frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data), .dataa (per_block_execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data), .datab (per_block_execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data), .datac (per_block_execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag), .tag_in (fpu_req_tag),
.ready_in (fpu_req_ready), .ready_in (fpu_req_ready),
@ -176,20 +173,20 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_fpu_dsp #( VX_fpu_dsp #(
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH), .TAG_WIDTH (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3) .OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dsp ( ) fpu_dsp (
.clk (clk), .clk (clk),
.reset (fpu_reset), .reset (block_reset),
.valid_in (fpu_req_valid), .valid_in (fpu_req_valid),
.lane_mask (execute_if[block_idx].data.tmask), .mask_in (per_block_execute_if[block_idx].data.tmask),
.op_type (execute_if[block_idx].data.op_type), .op_type (per_block_execute_if[block_idx].data.op_type),
.fmt (fpu_fmt), .fmt (fpu_fmt),
.frm (fpu_req_frm), .frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data), .dataa (per_block_execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data), .datab (per_block_execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data), .datac (per_block_execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag), .tag_in (fpu_req_tag),
.ready_in (fpu_req_ready), .ready_in (fpu_req_ready),
@ -210,7 +207,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
if (PID_BITS != 0) begin if (PID_BITS != 0) begin
fflags_t fpu_rsp_fflags_r; fflags_t fpu_rsp_fflags_r;
always @(posedge clk) begin always @(posedge clk) begin
if (reset) begin if (block_reset) begin
fpu_rsp_fflags_r <= '0; fpu_rsp_fflags_r <= '0;
end else if (fpu_rsp_fire) begin end else if (fpu_rsp_fire) begin
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags); fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
@ -221,38 +218,36 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
assign fpu_rsp_fflags_q = fpu_rsp_fflags; assign fpu_rsp_fflags_q = fpu_rsp_fflags;
end end
assign fpu_to_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags; assign fpu_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS) `ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_to_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q; assign fpu_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
// send response // send response
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1), .DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (0) .SIZE (0)
) rsp_buf ( ) rsp_buf (
.clk (clk), .clk (clk),
.reset (reset), .reset (block_reset),
.valid_in (fpu_rsp_valid), .valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready), .ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}), .data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.data_out ({commit_block_if[block_idx].data.uuid, commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.tmask, commit_block_if[block_idx].data.PC, commit_block_if[block_idx].data.rd, commit_block_if[block_idx].data.data, commit_block_if[block_idx].data.pid, commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop}), .data_out ({per_block_commit_if[block_idx].data.uuid, per_block_commit_if[block_idx].data.wid, per_block_commit_if[block_idx].data.tmask, per_block_commit_if[block_idx].data.PC, per_block_commit_if[block_idx].data.rd, per_block_commit_if[block_idx].data.data, per_block_commit_if[block_idx].data.pid, per_block_commit_if[block_idx].data.sop, per_block_commit_if[block_idx].data.eop}),
.valid_out (commit_block_if[block_idx].valid), .valid_out (per_block_commit_if[block_idx].valid),
.ready_out (commit_block_if[block_idx].ready) .ready_out (per_block_commit_if[block_idx].ready)
); );
assign commit_block_if[block_idx].data.wb = 1'b1; assign per_block_commit_if[block_idx].data.wb = 1'b1;
end end
`RESET_RELAY (commit_reset, reset);
VX_gather_unit #( VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE), .BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 3 : 0) .OUT_BUF (PARTIAL_BW ? 3 : 0)
) gather_unit ( ) gather_unit (
.clk (clk), .clk (clk),
.reset (commit_reset), .reset (reset),
.commit_in_if (commit_block_if), .commit_in_if (per_block_commit_if),
.commit_out_if (commit_if) .commit_out_if (commit_if)
); );

View file

@ -16,7 +16,7 @@
module VX_gather_unit import VX_gpu_pkg::*; #( module VX_gather_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1, parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1, parameter NUM_LANES = 1,
parameter OUT_REG = 0 parameter OUT_BUF = 0
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
@ -28,16 +28,18 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
VX_commit_if.master commit_out_if [`ISSUE_WIDTH] VX_commit_if.master commit_out_if [`ISSUE_WIDTH]
); );
`STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE); localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES); localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH); localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);
wire [BLOCK_SIZE-1:0] commit_in_valid; wire [BLOCK_SIZE-1:0] commit_in_valid;
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data; wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
wire [BLOCK_SIZE-1:0] commit_in_ready; wire [BLOCK_SIZE-1:0] commit_in_ready;
wire [BLOCK_SIZE-1:0][ISSUE_IDX_W-1:0] commit_in_wsi; wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_valid[i] = commit_in_if[i].valid; assign commit_in_valid[i] = commit_in_if[i].valid;
@ -45,12 +47,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
assign commit_in_if[i].ready = commit_in_ready[i]; assign commit_in_if[i].ready = commit_in_ready[i];
if (BLOCK_SIZE != `ISSUE_WIDTH) begin if (BLOCK_SIZE != `ISSUE_WIDTH) begin
if (BLOCK_SIZE != 1) begin if (BLOCK_SIZE != 1) begin
assign commit_in_wsi[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_IDX_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)}; assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
end else begin end else begin
assign commit_in_wsi[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_IDX_W]; assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
end end
end else begin end else begin
assign commit_in_wsi[i] = BLOCK_SIZE_W'(i); assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
end end
end end
@ -64,12 +66,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
commit_out_data[i] = 'x; commit_out_data[i] = 'x;
end end
for (integer i = 0; i < BLOCK_SIZE; ++i) begin for (integer i = 0; i < BLOCK_SIZE; ++i) begin
commit_out_valid[commit_in_wsi[i]] = commit_in_valid[i]; commit_out_valid[commit_in_isw[i]] = commit_in_valid[i];
commit_out_data[commit_in_wsi[i]] = commit_in_data[i]; commit_out_data[commit_in_isw[i]] = commit_in_data[i];
end end
end end
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_ready[i] = commit_out_ready[commit_in_wsi[i]]; assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
end end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
@ -77,15 +79,13 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) commit_tmp_if(); ) commit_tmp_if();
`RESET_RELAY(commit_out_reset, reset);
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (DATAW), .DATAW (DATAW),
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)), .SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG)) .OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf ( ) out_buf (
.clk (clk), .clk (clk),
.reset (commit_out_reset), .reset (reset),
.valid_in (commit_out_valid[i]), .valid_in (commit_out_valid[i]),
.ready_in (commit_out_ready[i]), .ready_in (commit_out_ready[i]),
.data_in (commit_out_data[i]), .data_in (commit_out_data[i]),

286
hw/rtl/core/VX_gpr_slice.sv Normal file
View file

@ -0,0 +1,286 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_gpr_slice: register-file slice with a sequential operand-fetch FSM.
//
// For the instruction presented on scoreboard_if, this module gathers the
// rs1/rs2/rs3 operand values (one register read per fetch state) and then
// forwards the instruction on operands_if together with the operand data.
// Register storage is one VX_dp_ram per thread lane, written from
// writeback_if. When CACHE_ENABLE != 0, a one-entry cache per wis index
// holds the most recently written-back register and can supply an operand
// without going through a fetch state.
//
// Parameters:
//   CORE_ID      - not used by the logic (marked unused below).
//   CACHE_ENABLE - non-zero enables the writeback cache lookup/update.
// Ports:
//   clk, reset    - clock and synchronous reset (reset is sampled in the
//                   posedge-clk block below).
//   writeback_if  - register write port; also feeds the cache.
//   scoreboard_if - incoming instruction (register indices, tmask, etc.).
//   operands_if   - outgoing instruction plus rs1/rs2/rs3 data.
module VX_gpr_slice import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
) (
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_operands_if.master operands_if
);
`UNUSED_PARAM (CORE_ID)
// Width of the instruction metadata carried through the toggle buffer; the
// terms follow the field order of the data_in/data_out concatenations below
// (uuid, wis, tmask, PC, wb, ex_type, op_type, op_args, rd).
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS;
// Address width of each per-lane RAM (`NUM_REGS * ISSUE_RATIO entries).
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
// FSM encoding: IDLE decides which operands need a fetch; FETCHn captures
// the read data for rsN.
localparam STATE_IDLE = 2'd0;
localparam STATE_FETCH1 = 2'd1;
localparam STATE_FETCH2 = 2'd2;
localparam STATE_FETCH3 = 2'd3;
localparam STATE_BITS = 2;
// Read data from the per-lane RAMs (one XLEN word per thread).
wire [`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
// Register index / wis of the read currently being issued. The *_n signals
// are the combinational next-state values of the registers of the same name.
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
// Writeback cache, one entry per wis index: data, register index, the thread
// mask accumulated for the cached value, and whether the last cached
// writeback carried eop.
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
// Captured operand values; these drive operands_if.data.rsN_data directly.
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
reg [STATE_BITS-1:0] state, state_n;
// rs2/rs3 indices and "already resolved" flags latched in IDLE so the fetch
// states know which operands still need a read.
reg [`NR_BITS-1:0] rs2, rs2_n;
reg [`NR_BITS-1:0] rs3, rs3_n;
reg rs2_ready, rs2_ready_n;
reg rs3_ready, rs3_ready_n;
// High once every operand of the current instruction has been resolved.
reg data_ready, data_ready_n;
wire stg_valid_in, stg_ready_in;
// A source register index of 0 is supplied as zero without a fetch.
wire is_rs1_zero = (scoreboard_if.data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if.data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if.data.rs3 == 0);
// Next-state logic. Blocking assignments are order-sensitive here: later
// statements intentionally override earlier ones.
always @(*) begin
// Default: hold every register.
state_n = state;
rs2_n = rs2;
rs3_n = rs3;
rs2_ready_n = rs2_ready;
rs3_ready_n = rs3_ready;
rs1_data_n = rs1_data;
rs2_data_n = rs2_data;
rs3_data_n = rs3_data;
cache_data_n = cache_data;
cache_reg_n = cache_reg;
cache_tmask_n= cache_tmask;
cache_eop_n = cache_eop;
gpr_rd_rid_n = gpr_rd_rid;
gpr_rd_wis_n = gpr_rd_wis;
data_ready_n = data_ready;
case (state)
STATE_IDLE: begin
// The previous instruction left on operands_if: its operands are consumed.
if (operands_if.valid && operands_if.ready) begin
data_ready_n = 0;
end
// Uses data_ready_n (not data_ready) so a new lookup can start in the same
// cycle the previous instruction leaves.
if (scoreboard_if.valid && data_ready_n == 0) begin
// Assume everything resolves immediately; each miss below clears this.
data_ready_n = 1;
// The operands are examined in the order rs3, rs2, rs1. Because each miss
// overwrites gpr_rd_rid_n/state_n, the first fetch performed is for rs1 if
// it misses, otherwise rs2, otherwise rs3.
// An operand resolves without a fetch when its index is 0, or (cache
// enabled) it matches the cached register and the cached thread mask covers
// every thread requested by the instruction.
if (is_rs3_zero || (CACHE_ENABLE != 0 &&
scoreboard_if.data.rs3 == cache_reg[scoreboard_if.data.wis] &&
(scoreboard_if.data.tmask & cache_tmask[scoreboard_if.data.wis]) == scoreboard_if.data.tmask)) begin
rs3_data_n = (is_rs3_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if.data.wis];
rs3_ready_n = 1;
end else begin
rs3_ready_n = 0;
gpr_rd_rid_n = scoreboard_if.data.rs3;
data_ready_n = 0;
state_n = STATE_FETCH3;
end
if (is_rs2_zero || (CACHE_ENABLE != 0 &&
scoreboard_if.data.rs2 == cache_reg[scoreboard_if.data.wis] &&
(scoreboard_if.data.tmask & cache_tmask[scoreboard_if.data.wis]) == scoreboard_if.data.tmask)) begin
rs2_data_n = (is_rs2_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if.data.wis];
rs2_ready_n = 1;
end else begin
rs2_ready_n = 0;
gpr_rd_rid_n = scoreboard_if.data.rs2;
data_ready_n = 0;
state_n = STATE_FETCH2;
end
// rs1 needs no ready flag: when it misses it is always the first fetch.
if (is_rs1_zero || (CACHE_ENABLE != 0 &&
scoreboard_if.data.rs1 == cache_reg[scoreboard_if.data.wis] &&
(scoreboard_if.data.tmask & cache_tmask[scoreboard_if.data.wis]) == scoreboard_if.data.tmask)) begin
rs1_data_n = (is_rs1_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if.data.wis];
end else begin
gpr_rd_rid_n = scoreboard_if.data.rs1;
data_ready_n = 0;
state_n = STATE_FETCH1;
end
end
// Latched on every IDLE cycle so the fetch states can use them.
gpr_rd_wis_n = scoreboard_if.data.wis;
rs2_n = scoreboard_if.data.rs2;
rs3_n = scoreboard_if.data.rs3;
end
STATE_FETCH1: begin
// Capture rs1, then move on to whichever of rs2/rs3 is still unresolved.
rs1_data_n = gpr_rd_data;
if (~rs2_ready) begin
gpr_rd_rid_n = rs2;
state_n = STATE_FETCH2;
end else if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH2: begin
// Capture rs2, then fetch rs3 if it is still unresolved.
rs2_data_n = gpr_rd_data;
if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH3: begin
// rs3 is always the last fetch.
rs3_data_n = gpr_rd_data;
data_ready_n = 1;
state_n = STATE_IDLE;
end
endcase
// Cache update, evaluated in every state. The entry for the written wis is
// updated when the writeback targets the cached register, or when the
// previously cached writeback had eop and this one has sop (in which case
// the entry is re-targeted to the new rd).
if (CACHE_ENABLE != 0 && writeback_if.valid) begin
if ((cache_reg[writeback_if.data.wis] == writeback_if.data.rd)
|| (cache_eop[writeback_if.data.wis] && writeback_if.data.sop)) begin
// Only lanes enabled in the writeback mask are overwritten.
for (integer j = 0; j < `NUM_THREADS; ++j) begin
if (writeback_if.data.tmask[j]) begin
cache_data_n[writeback_if.data.wis][j] = writeback_if.data.data[j];
end
end
cache_reg_n[writeback_if.data.wis] = writeback_if.data.rd;
cache_eop_n[writeback_if.data.wis] = writeback_if.data.eop;
// sop restarts the valid-thread mask; otherwise masks accumulate.
cache_tmask_n[writeback_if.data.wis] = writeback_if.data.sop ? writeback_if.data.tmask :
(cache_tmask_n[writeback_if.data.wis] | writeback_if.data.tmask);
end
end
end
// State registers. Only state, cache_eop and data_ready are reset; all other
// registers below are updated unconditionally every clock.
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
// All-ones so that a writeback with sop satisfies the "eop && sop"
// allocation condition above after reset.
cache_eop <= {ISSUE_RATIO{1'b1}};
data_ready <= 0;
end else begin
state <= state_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
end
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
end
// The instruction is offered to the output stage, and accepted from the
// scoreboard, only once its operands have been resolved.
assign stg_valid_in = scoreboard_if.valid && data_ready;
assign scoreboard_if.ready = stg_ready_in && data_ready;
// Output stage for the instruction metadata. Operand data does not pass
// through this buffer; it is driven from the rsN_data registers below.
VX_toggle_buffer #(
.DATAW (DATAW)
) toggle_buffer (
.clk (clk),
.reset (reset),
.valid_in (stg_valid_in),
.data_in ({
scoreboard_if.data.uuid,
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd
}),
.ready_in (stg_ready_in),
.valid_out (operands_if.valid),
.data_out ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.wb,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd
}),
.ready_out (operands_if.ready)
);
assign operands_if.data.rs1_data = rs1_data;
assign operands_if.data.rs2_data = rs2_data;
assign operands_if.data.rs3_data = rs3_data;
// GPR banks
// The read address is registered from the next-state (*_n) values, so it is
// updated on the same clock edge at which the FSM enters the fetch state.
// NOTE(review): this presumes VX_dp_ram presents rdata for raddr within that
// same cycle -- confirm against VX_dp_ram.
reg [RAM_ADDRW-1:0] gpr_rd_addr;
wire [RAM_ADDRW-1:0] gpr_wr_addr;
// With a non-zero ISSUE_WIS the address is {wis, register index}; otherwise
// the register index alone addresses the RAM.
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if.data.wis, writeback_if.data.rd};
always @(posedge clk) begin
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
end
end else begin
assign gpr_wr_addr = writeback_if.data.rd;
always @(posedge clk) begin
gpr_rd_addr <= gpr_rd_rid_n;
end
end
`ifdef GPR_RESET
// wr_enabled starts at 0 and is set (and stays set) once reset has been
// observed; it gates the RAM write enable below.
// NOTE(review): presumably this keeps the initialized RAM contents intact
// until the first reset -- confirm intent.
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`endif
// One RAM per thread lane; all lanes share the read/write addresses and each
// lane's write is qualified by its bit of the writeback thread mask.
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
`ifdef GPR_RESET
.INIT_ENABLE (1),
.INIT_VALUE (0),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.read (1'b1),
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if.valid && writeback_if.data.tmask[j]),
`else
.write (writeback_if.valid && writeback_if.data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data[j]),
.raddr (gpr_rd_addr),
.rdata (gpr_rd_data[j])
);
end
endmodule

View file

@ -14,60 +14,73 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_ibuffer import VX_gpu_pkg::*; #( module VX_ibuffer import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
// inputs // inputs
VX_decode_if.slave decode_if, VX_decode_if.slave decode_if,
// outputs // outputs
VX_ibuffer_if.master ibuffer_if [`ISSUE_WIDTH] VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH); localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);
wire [`ISSUE_WIDTH-1:0] ibuf_ready_in; wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
wire [ISW_WIDTH-1:0] decode_isw = wid_to_isw(decode_if.data.wid); for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
assign decode_if.ready = ibuf_ready_in[decode_isw];
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_elastic_buffer #( VX_elastic_buffer #(
.DATAW (DATAW), .DATAW (DATAW),
.SIZE (`IBUF_SIZE), .SIZE (`IBUF_SIZE),
.OUT_REG (1) .OUT_REG (2) // 2-cycle EB for area reduction
) instr_buf ( ) instr_buf (
.clk (clk), .clk (clk),
.reset (reset), .reset (reset),
.valid_in (decode_if.valid && decode_isw == i), .valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(w)),
.ready_in (ibuf_ready_in[i]),
.data_in ({ .data_in ({
decode_if.data.uuid, decode_if.data.uuid,
decode_wis,
decode_if.data.tmask, decode_if.data.tmask,
decode_if.data.PC,
decode_if.data.ex_type, decode_if.data.ex_type,
decode_if.data.op_type, decode_if.data.op_type,
decode_if.data.op_mod, decode_if.data.op_args,
decode_if.data.wb, decode_if.data.wb,
decode_if.data.use_PC,
decode_if.data.use_imm,
decode_if.data.PC,
decode_if.data.imm,
decode_if.data.rd, decode_if.data.rd,
decode_if.data.rs1, decode_if.data.rs1,
decode_if.data.rs2, decode_if.data.rs2,
decode_if.data.rs3}), decode_if.data.rs3
.data_out(ibuffer_if[i].data), }),
.valid_out (ibuffer_if[i].valid), .ready_in (ibuf_ready_in[w]),
.ready_out(ibuffer_if[i].ready) .valid_out(ibuffer_if[w].valid),
.data_out (ibuffer_if[w].data),
.ready_out(ibuffer_if[w].ready)
); );
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready; assign decode_if.ibuf_pop[w] = ibuffer_if[w].valid && ibuffer_if[w].ready;
`endif
end end
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_if_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_if_stall);
end
end
assign perf_stalls = perf_ibf_stalls;
`endif
endmodule endmodule

View file

@ -25,6 +25,7 @@ module VX_ipdom_stack #(
input wire [WIDTH-1:0] q1, input wire [WIDTH-1:0] q1,
output wire [WIDTH-1:0] d, output wire [WIDTH-1:0] d,
output wire d_set, output wire d_set,
output wire [ADDRW-1:0] q_ptr,
input wire push, input wire push,
input wire pop, input wire pop,
output wire empty, output wire empty,
@ -71,9 +72,10 @@ module VX_ipdom_stack #(
.LUTRAM (OUT_REG ? 0 : 1) .LUTRAM (OUT_REG ? 0 : 1)
) store ( ) store (
.clk (clk), .clk (clk),
.reset (reset),
.read (1'b1), .read (1'b1),
.write (push), .write (push),
`UNUSED_PIN (wren), .wren (1'b1),
.waddr (wr_ptr), .waddr (wr_ptr),
.wdata ({q1, q0}), .wdata ({q1, q0}),
.raddr (rd_ptr), .raddr (rd_ptr),
@ -89,6 +91,7 @@ module VX_ipdom_stack #(
end end
wire d_set_r; wire d_set_r;
VX_pipe_register #( VX_pipe_register #(
.DATAW (1), .DATAW (1),
.DEPTH (OUT_REG) .DEPTH (OUT_REG)
@ -102,6 +105,7 @@ module VX_ipdom_stack #(
assign d = d_set_r ? d0 : d1; assign d = d_set_r ? d0 : d1;
assign d_set = ~d_set_r; assign d_set = ~d_set_r;
assign q_ptr = wr_ptr;
assign empty = empty_r; assign empty = empty_r;
assign full = full_r; assign full = full_r;

View file

@ -12,10 +12,9 @@
// limitations under the License. // limitations under the License.
`include "VX_define.vh" `include "VX_define.vh"
`include "VX_trace.vh"
module VX_issue #( module VX_issue import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
@ -23,150 +22,81 @@ module VX_issue #(
input wire reset, input wire reset,
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
VX_pipeline_perf_if.issue perf_issue_if, output issue_perf_t issue_perf,
`endif `endif
VX_decode_if.slave decode_if, VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
VX_dispatch_if.master alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
); );
VX_ibuffer_if ibuffer_if [`ISSUE_WIDTH]();
VX_ibuffer_if scoreboard_if [`ISSUE_WIDTH]();
VX_operands_if operands_if [`ISSUE_WIDTH]();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.CORE_ID (CORE_ID)
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.CORE_ID (CORE_ID)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_scb_uses (perf_issue_if.scb_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
VX_operands #(
.CORE_ID (CORE_ID)
) operands (
.clk (clk),
.reset (operands_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
VX_dispatch #(
.CORE_ID (CORE_ID)
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
.perf_stalls (perf_issue_if.dsp_stalls),
`endif
.operands_if (operands_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
`endif
.sfu_dispatch_if(sfu_dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
if (CORE_ID == 0) begin
`ifdef SCOPE
wire operands_if_fire = operands_if[0].valid && operands_if[0].ready;
wire operands_if_not_ready = ~operands_if[0].ready;
wire writeback_if_valid = writeback_if[0].valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS +
1 + `NR_BITS + `XLEN + 1 + 1 + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes({
operands_if[0].data.uuid,
operands_if[0].data.tmask,
operands_if[0].data.ex_type,
operands_if[0].data.op_type,
operands_if[0].data.op_mod,
operands_if[0].data.wb,
operands_if[0].data.rd,
operands_if[0].data.imm,
operands_if[0].data.use_PC,
operands_if[0].data.use_imm,
operands_if[0].data.rs1_data,
operands_if[0].data.rs2_data,
operands_if[0].data.rs3_data,
writeback_if[0].data.uuid,
writeback_if[0].data.tmask,
writeback_if[0].data.rd,
writeback_if[0].data.data,
writeback_if[0].data.eop
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({operands_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, operands_if.PC, operands_if.tmask, operands_if.wid, operands_if.ex_type, operands_if.op_type, operands_if.ready, operands_if.valid}),
.probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef PERF_ENABLE `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls; issue_perf_t per_issue_perf [`ISSUE_WIDTH];
always @(posedge clk) begin `PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
if (reset) begin `PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
perf_ibf_stalls <= '0; `PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end else begin for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
if (decode_if.valid && ~decode_if.ready) begin `PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(1);
end end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end end
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
`endif `endif
wire [ISSUE_ISW_W-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
assign decode_if.ready = decode_ready_in[decode_isw];
`SCOPE_IO_SWITCH (`ISSUE_WIDTH)
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : issue_slices
VX_decode_if #(
.NUM_WARPS (PER_ISSUE_WARPS)
) per_issue_decode_if();
VX_dispatch_if per_issue_dispatch_if[`NUM_EX_UNITS]();
assign per_issue_decode_if.valid = decode_if.valid && (decode_isw == ISSUE_ISW_W'(issue_id));
assign per_issue_decode_if.data.uuid = decode_if.data.uuid;
assign per_issue_decode_if.data.wid = decode_wis;
assign per_issue_decode_if.data.tmask = decode_if.data.tmask;
assign per_issue_decode_if.data.PC = decode_if.data.PC;
assign per_issue_decode_if.data.ex_type = decode_if.data.ex_type;
assign per_issue_decode_if.data.op_type = decode_if.data.op_type;
assign per_issue_decode_if.data.op_args = decode_if.data.op_args;
assign per_issue_decode_if.data.wb = decode_if.data.wb;
assign per_issue_decode_if.data.rd = decode_if.data.rd;
assign per_issue_decode_if.data.rs1 = decode_if.data.rs1;
assign per_issue_decode_if.data.rs2 = decode_if.data.rs2;
assign per_issue_decode_if.data.rs3 = decode_if.data.rs3;
assign decode_ready_in[issue_id] = per_issue_decode_if.ready;
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop;
`endif
`RESET_RELAY (slice_reset, reset);
VX_issue_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
.ISSUE_ID (issue_id)
) issue_slice (
`SCOPE_IO_BIND(issue_id)
.clk (clk),
.reset (slice_reset),
`ifdef PERF_ENABLE
.issue_perf (per_issue_perf[issue_id]),
`endif
.decode_if (per_issue_decode_if),
.writeback_if (writeback_if[issue_id]),
.dispatch_if (per_issue_dispatch_if)
);
// Assign transposed dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
end
end
endmodule endmodule

View file

@ -0,0 +1,159 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_issue_slice: one slice of the issue stage.
//
// Chains four sub-units, as wired below:
//   decode_if -> ibuffer -> scoreboard -> operands -> dispatch -> dispatch_if
// writeback_if is fed to both the scoreboard and the operands unit.
//
// Parameters:
//   INSTANCE_ID - name prefix passed (with a per-unit suffix) to the
//                 sub-units and printed by the pipeline trace.
//   ISSUE_ID    - index of this slice; only referenced by the pipeline trace
//                 (wis_to_wid) at the bottom of the module.
// Ports:
//   clk, reset   - clock and reset; reset is relayed per sub-unit below.
//   issue_perf   - (PERF_ENABLE only) counters driven by the sub-units.
//   decode_if    - decoded instructions entering the slice.
//   writeback_if - register writebacks.
//   dispatch_if  - one output interface per execution unit (`NUM_EX_UNITS).
module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter ISSUE_ID = 0
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if,
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
// ISSUE_ID is only used when DBG_TRACE_PIPELINE is defined.
`UNUSED_PARAM (ISSUE_ID)
// Internal links between the sub-units: one ibuffer output per warp of the
// slice, then a single scoreboard output and a single operands output.
VX_ibuffer_if ibuffer_if [PER_ISSUE_WARPS]();
VX_scoreboard_if scoreboard_if();
VX_operands_if operands_if();
// One reset net per sub-unit, each derived from `reset`.
// NOTE(review): the exact behaviour of `RESET_RELAY (e.g. whether it adds a
// register stage) is defined elsewhere -- check the macro definition.
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
// Stage 1: instruction buffer (decode_if -> ibuffer_if[]).
VX_ibuffer #(
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.ibf_stalls),
`endif
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
// Stage 2: scoreboard (ibuffer_if[] + writeback_if -> scoreboard_if).
VX_scoreboard #(
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
.perf_sfu_uses (issue_perf.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
// Stage 3: operand collection (scoreboard_if + writeback_if -> operands_if).
VX_operands #(
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
) operands (
.clk (clk),
.reset (operands_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.opd_stalls),
`endif
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
// Stage 4: dispatch (operands_if -> dispatch_if[]). Its stall counter is
// intentionally left unconnected; issue_perf has no field driven from it here.
VX_dispatch #(
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
// Debug scope tap observing the operands output and the writeback input.
// scope_reset / scope_bus_in / scope_bus_out are not declared in this module
// body; presumably they come from `SCOPE_IO_DECL -- confirm.
wire operands_if_fire = operands_if.valid && operands_if.ready;
wire operands_if_not_ready = ~operands_if.ready;
wire writeback_if_valid = writeback_if.valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
// PROBEW must equal the total width of the `probes` concatenation below;
// the terms are listed in the same order as the probed fields.
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes ({
operands_if.data.uuid,
operands_if.data.tmask,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.wb,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data,
writeback_if.data.uuid,
writeback_if.data.tmask,
writeback_if.data.rd,
writeback_if.data.data,
writeback_if.data.eop
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef DBG_TRACE_PIPELINE
// Simulation trace: prints one record for every instruction that leaves the
// operands stage (valid && ready).
always @(posedge clk) begin
if (operands_if.valid && operands_if.ready) begin
// The PC is printed with a zero bit appended.
// NOTE(review): presumably operands_if.data.PC omits its lowest bit -- confirm.
`TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}));
trace_ex_type(1, operands_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid));
end
end
`endif
endmodule

132
hw/rtl/core/VX_issue_top.sv Normal file
View file

@ -0,0 +1,132 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_issue_top
//
// Flat-port wrapper around VX_issue. Every field of the decode, writeback and
// dispatch interfaces is exposed as a plain wire (or unpacked array of wires)
// and bridged to a locally instantiated interface, which is then handed to
// the wrapped VX_issue instance.
// NOTE(review): presumably this exists so the issue stage can be built as a
// stand-alone top level — confirm against the build scripts.
//
// Parameters:
//   INSTANCE_ID - string forwarded unchanged to the VX_issue instance.
module VX_issue_top import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = "issue"
) (
    // Clock and reset
    input wire                              clk,
    input wire                              reset,

    // Decode stage input (single channel, valid/ready handshake)
    input wire                              decode_valid,
    input wire [`UUID_WIDTH-1:0]            decode_uuid,
    input wire [`NW_WIDTH-1:0]              decode_wid,
    input wire [`NUM_THREADS-1:0]           decode_tmask,
    input wire [`PC_BITS-1:0]               decode_PC,
    input wire [`EX_BITS-1:0]               decode_ex_type,
    input wire [`INST_OP_BITS-1:0]          decode_op_type,
    input op_args_t                         decode_op_args,
    input wire                              decode_wb,
    input wire [`NR_BITS-1:0]               decode_rd,
    input wire [`NR_BITS-1:0]               decode_rs1,
    input wire [`NR_BITS-1:0]               decode_rs2,
    input wire [`NR_BITS-1:0]               decode_rs3,
    output wire                             decode_ready,

    // Writeback input, one channel per issue slot (valid only, no ready)
    input wire                              writeback_valid[`ISSUE_WIDTH],
    input wire [`UUID_WIDTH-1:0]            writeback_uuid[`ISSUE_WIDTH],
    input wire [ISSUE_WIS_W-1:0]            writeback_wis[`ISSUE_WIDTH],
    input wire [`NUM_THREADS-1:0]           writeback_tmask[`ISSUE_WIDTH],
    input wire [`PC_BITS-1:0]               writeback_PC[`ISSUE_WIDTH],
    input wire [`NR_BITS-1:0]               writeback_rd[`ISSUE_WIDTH],
    input wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data[`ISSUE_WIDTH],
    input wire                              writeback_sop[`ISSUE_WIDTH],
    input wire                              writeback_eop[`ISSUE_WIDTH],

    // Dispatch output, one channel per (execution unit, issue slot) pair
    // NOTE(review): op_type is INST_ALU_BITS wide here while the decode side
    // uses INST_OP_BITS — confirm both match the dispatch interface field.
    output wire                             dispatch_valid[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`UUID_WIDTH-1:0]           dispatch_uuid[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [ISSUE_WIS_W-1:0]           dispatch_wis[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0]          dispatch_tmask[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`PC_BITS-1:0]              dispatch_PC[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`INST_ALU_BITS-1:0]        dispatch_op_type[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output op_args_t                        dispatch_op_args[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire                             dispatch_wb[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NR_BITS-1:0]              dispatch_rd[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NT_WIDTH-1:0]             dispatch_tid[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
    input wire                              dispatch_ready[`NUM_EX_UNITS * `ISSUE_WIDTH]
);
    // Interface instances that the wrapped core connects to
    VX_decode_if    decode_if();
    VX_writeback_if writeback_if[`ISSUE_WIDTH]();
    VX_dispatch_if  dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();

    // ---- decode: flat ports -> interface --------------------------------
    assign decode_ready           = decode_if.ready;
    assign decode_if.valid        = decode_valid;
    assign decode_if.data.uuid    = decode_uuid;
    assign decode_if.data.wid     = decode_wid;
    assign decode_if.data.tmask   = decode_tmask;
    assign decode_if.data.PC      = decode_PC;
    assign decode_if.data.ex_type = decode_ex_type;
    assign decode_if.data.op_type = decode_op_type;
    assign decode_if.data.op_args = decode_op_args;
    assign decode_if.data.wb      = decode_wb;
    assign decode_if.data.rd      = decode_rd;
    assign decode_if.data.rs1     = decode_rs1;
    assign decode_if.data.rs2     = decode_rs2;
    assign decode_if.data.rs3     = decode_rs3;

    // ---- writeback: flat ports -> interface, per issue slot --------------
    for (genvar slot = 0; slot < `ISSUE_WIDTH; ++slot) begin
        assign writeback_if[slot].valid      = writeback_valid[slot];
        assign writeback_if[slot].data.uuid  = writeback_uuid[slot];
        assign writeback_if[slot].data.wis   = writeback_wis[slot];
        assign writeback_if[slot].data.tmask = writeback_tmask[slot];
        assign writeback_if[slot].data.PC    = writeback_PC[slot];
        assign writeback_if[slot].data.rd    = writeback_rd[slot];
        assign writeback_if[slot].data.data  = writeback_data[slot];
        assign writeback_if[slot].data.sop   = writeback_sop[slot];
        assign writeback_if[slot].data.eop   = writeback_eop[slot];
    end

    // ---- dispatch: interface -> flat ports, per dispatch channel ---------
    for (genvar ch = 0; ch < `NUM_EX_UNITS * `ISSUE_WIDTH; ++ch) begin
        // back-pressure travels from the flat port into the interface
        assign dispatch_if[ch].ready = dispatch_ready[ch];

        assign dispatch_valid[ch]    = dispatch_if[ch].valid;
        assign dispatch_uuid[ch]     = dispatch_if[ch].data.uuid;
        assign dispatch_wis[ch]      = dispatch_if[ch].data.wis;
        assign dispatch_tmask[ch]    = dispatch_if[ch].data.tmask;
        assign dispatch_PC[ch]       = dispatch_if[ch].data.PC;
        assign dispatch_op_type[ch]  = dispatch_if[ch].data.op_type;
        assign dispatch_op_args[ch]  = dispatch_if[ch].data.op_args;
        assign dispatch_wb[ch]       = dispatch_if[ch].data.wb;
        assign dispatch_rd[ch]       = dispatch_if[ch].data.rd;
        assign dispatch_tid[ch]      = dispatch_if[ch].data.tid;
        assign dispatch_rs1_data[ch] = dispatch_if[ch].data.rs1_data;
        assign dispatch_rs2_data[ch] = dispatch_if[ch].data.rs2_data;
        assign dispatch_rs3_data[ch] = dispatch_if[ch].data.rs3_data;
    end

`ifdef PERF_ENABLE
    // Performance counters are bound to the core but not exported by this wrapper
    issue_perf_t issue_perf = '0;
`endif

    // ---- wrapped issue stage ---------------------------------------------
    VX_issue #(
        .INSTANCE_ID (INSTANCE_ID)
    ) issue (
        `SCOPE_IO_BIND (0)

        .clk          (clk),
        .reset        (reset),

        .decode_if    (decode_if),
        .writeback_if (writeback_if),
    `ifdef PERF_ENABLE
        .issue_perf   (issue_perf),
    `endif
        .dispatch_if  (dispatch_if)
    );
endmodule

201
hw/rtl/core/VX_lmem_unit.sv Normal file
View file

@ -0,0 +1,201 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_lmem_unit
//
// Splits each LSU block's memory request stream into a "local" part, served
// by the VX_local_mem instance at the bottom of this module, and a "global"
// part, forwarded to lsu_mem_out_if. The split is per lane, based on the
// ADDR_TYPE_LOCAL bit of each lane's atype field. Responses from both paths
// are merged back onto lsu_mem_in_if by a two-input stream arbiter.
//
// Parameters:
//   INSTANCE_ID - prefix used to build the local memory instance name.
// Ports:
//   cache_perf     - (PERF_ENABLE only) perf counters driven by VX_local_mem.
//   lsu_mem_in_if  - request/response channels coming from the LSU blocks.
//   lsu_mem_out_if - request/response channels for the non-local traffic.
module VX_lmem_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS],
VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
);
// The local memory size must be a multiple of MEM_BLOCK_SIZE, and its base
// address must be aligned to its size.
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
// Width of a buffered request: lane mask + rw bit + per-lane
// {byteen, addr, atype, data} + tag. This matches the concatenations passed
// to the two request buffers below.
localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `ADDR_TYPE_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
// Width of a response: lane mask + per-lane data + tag.
// NOTE(review): assumed to equal the packed width of rsp_data in
// VX_lsu_mem_if — confirm against the interface definition.
localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
// Word address width of the local memory (byte address bits minus the
// in-word offset bits).
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
// Per-block channel carrying only the local-memory part of the traffic
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_switch_if[`NUM_LSU_BLOCKS]();
`RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1);
// Per-block request switch and response merge
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
// One bit per lane: set when that lane's address type is flagged local
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
end
// At least one active lane targets global (resp. local) memory.
// Both can be true at once when the active lanes are mixed.
wire is_addr_global = | (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask);
wire is_addr_local = | (lsu_mem_in_if[i].req_data.mask & is_addr_local_mask);
wire req_global_ready;
wire req_local_ready;
// Global path: buffered (SIZE=2, registered output). The lane mask is
// restricted to the non-local lanes; all other fields pass unchanged.
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (2),
.OUT_REG (1)
) req_global_buf (
.clk (clk),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_global),
.data_in ({
lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_global_ready),
.valid_out (lsu_mem_out_if[i].req_valid),
.data_out ({
lsu_mem_out_if[i].req_data.mask,
lsu_mem_out_if[i].req_data.rw,
lsu_mem_out_if[i].req_data.byteen,
lsu_mem_out_if[i].req_data.addr,
lsu_mem_out_if[i].req_data.atype,
lsu_mem_out_if[i].req_data.data,
lsu_mem_out_if[i].req_data.tag
}),
.ready_out (lsu_mem_out_if[i].req_ready)
);
// Local path: SIZE=0 / OUT_REG=0 (presumably a pure pass-through in
// VX_elastic_buffer — confirm). The lane mask is restricted to the
// local lanes.
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (0),
.OUT_REG (0)
) req_local_buf (
.clk (clk),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_local),
.data_in ({
lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_local_ready),
.valid_out (lsu_switch_if[i].req_valid),
.data_out ({
lsu_switch_if[i].req_data.mask,
lsu_switch_if[i].req_data.rw,
lsu_switch_if[i].req_data.byteen,
lsu_switch_if[i].req_data.addr,
lsu_switch_if[i].req_data.atype,
lsu_switch_if[i].req_data.data,
lsu_switch_if[i].req_data.tag
}),
.ready_out (lsu_switch_if[i].req_ready)
);
// The input is ready when a targeted buffer is ready.
// NOTE(review): this is an OR. For a request whose active lanes mix local
// and global addresses, both buffers see valid_in, yet ready is asserted
// as soon as EITHER of them is ready — the half destined for the other
// buffer would not be captured. Confirm that mixed requests cannot reach
// this point or are handled elsewhere. Also note that a request with an
// all-zero mask never sees ready.
assign lsu_mem_in_if[i].req_ready = (req_global_ready && is_addr_global)
|| (req_local_ready && is_addr_local);
// Merge local and global responses back onto the input channel.
// Input 0 is the global path, input 1 the local path.
// ARBITER "R" is presumably round-robin — see VX_stream_arb.
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.OUT_BUF (1)
) rsp_arb (
.clk (clk),
.reset (block_reset[i]),
.valid_in ({
lsu_switch_if[i].rsp_valid,
lsu_mem_out_if[i].rsp_valid
}),
.ready_in ({
lsu_switch_if[i].rsp_ready,
lsu_mem_out_if[i].rsp_ready
}),
.data_in ({
lsu_switch_if[i].rsp_data,
lsu_mem_out_if[i].rsp_data
}),
.data_out (lsu_mem_in_if[i].rsp_data),
.valid_out (lsu_mem_in_if[i].rsp_valid),
.ready_out (lsu_mem_in_if[i].rsp_ready),
`UNUSED_PIN (sel_out)
);
end
// Flat array of single-lane buses feeding the local memory.
// NOTE(review): the indexing below assumes
// LSU_NUM_REQS == NUM_LSU_BLOCKS * NUM_LSU_LANES — confirm in VX_gpu_pkg.
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
// Per block: convert the multi-lane local channel into NUM_LSU_LANES
// single-lane buses and place them at offset i * NUM_LSU_LANES.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (block_reset[i]),
.lsu_mem_if (lsu_switch_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
`RESET_RELAY (lmem_reset, reset);
// The local memory itself: (1 << LMEM_LOG_SIZE) bytes, LMEM_NUM_BANKS banks,
// one request port per entry of lmem_bus_if.
VX_local_mem #(
.INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
.reset (lmem_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.mem_bus_if (lmem_bus_if)
);
endmodule

View file

@ -0,0 +1,121 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_lsu_adapter
//
// Bridges one multi-lane LSU memory channel (lsu_mem_if) to NUM_LANES
// single-lane memory buses (mem_bus_if):
//   * requests go through VX_stream_unpack, which receives the lane mask,
//     the per-lane payloads and the shared tag, and drives one valid/ready
//     channel per lane;
//   * responses go through VX_stream_pack, which collects the per-lane
//     responses into a single masked response on lsu_mem_if.
//
// Parameters:
//   NUM_LANES    - number of lanes / single-lane buses.
//   DATA_SIZE    - word size in bytes (data is DATA_SIZE * 8 bits per lane).
//   TAG_WIDTH    - width of the request/response tag.
//   TAG_SEL_BITS - forwarded to VX_stream_pack.
//   ARBITER      - forwarded to VX_stream_pack.
//   REQ_OUT_BUF  - output buffering option of the request unpacker.
//   RSP_OUT_BUF  - output buffering option of the response packer.
module VX_lsu_adapter import VX_gpu_pkg::*; #(
    parameter NUM_LANES         = 1,
    parameter DATA_SIZE         = 1,
    parameter TAG_WIDTH         = 1,
    parameter TAG_SEL_BITS      = 0,
    parameter `STRING ARBITER   = "P",
    parameter REQ_OUT_BUF       = 0,
    parameter RSP_OUT_BUF       = 0
) (
    input wire              clk,
    input wire              reset,

    VX_lsu_mem_if.slave     lsu_mem_if,
    VX_mem_bus_if.master    mem_bus_if [NUM_LANES]
);
    // Per-lane request payload layout: {rw, byteen, addr, atype, data}
    localparam REQ_ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE);
    localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `ADDR_TYPE_WIDTH + DATA_SIZE * 8;
    localparam RSP_DATA_WIDTH = DATA_SIZE * 8;

    // ---------------------------------------------------------------------
    // Request path: lsu_mem_if -> stream_unpack -> mem_bus_if[*]
    // ---------------------------------------------------------------------

    wire [NUM_LANES-1:0][REQ_DATA_WIDTH-1:0] lane_req_packed;   // unpacker input
    wire [NUM_LANES-1:0]                     lane_req_valid;    // unpacker outputs
    wire [NUM_LANES-1:0][REQ_DATA_WIDTH-1:0] lane_req_payload;
    wire [NUM_LANES-1:0][TAG_WIDTH-1:0]      lane_req_tag;
    wire [NUM_LANES-1:0]                     lane_req_ready;

    // Build one payload per lane; the rw bit is shared by all lanes
    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign lane_req_packed[lane] = {
            lsu_mem_if.req_data.rw,
            lsu_mem_if.req_data.byteen[lane],
            lsu_mem_if.req_data.addr[lane],
            lsu_mem_if.req_data.atype[lane],
            lsu_mem_if.req_data.data[lane]
        };
    end

    // Drive each single-lane bus from the unpacker outputs
    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign lane_req_ready[lane]          = mem_bus_if[lane].req_ready;
        assign mem_bus_if[lane].req_valid    = lane_req_valid[lane];
        assign mem_bus_if[lane].req_data.tag = lane_req_tag[lane];
        // same field order as the payload built above
        assign {
            mem_bus_if[lane].req_data.rw,
            mem_bus_if[lane].req_data.byteen,
            mem_bus_if[lane].req_data.addr,
            mem_bus_if[lane].req_data.atype,
            mem_bus_if[lane].req_data.data
        } = lane_req_payload[lane];
    end

    VX_stream_unpack #(
        .NUM_REQS   (NUM_LANES),
        .DATA_WIDTH (REQ_DATA_WIDTH),
        .TAG_WIDTH  (TAG_WIDTH),
        .OUT_BUF    (REQ_OUT_BUF)
    ) stream_unpack (
        .clk        (clk),
        .reset      (reset),
        // multi-lane side
        .valid_in   (lsu_mem_if.req_valid),
        .mask_in    (lsu_mem_if.req_data.mask),
        .data_in    (lane_req_packed),
        .tag_in     (lsu_mem_if.req_data.tag),
        .ready_in   (lsu_mem_if.req_ready),
        // per-lane side
        .valid_out  (lane_req_valid),
        .data_out   (lane_req_payload),
        .tag_out    (lane_req_tag),
        .ready_out  (lane_req_ready)
    );

    // ---------------------------------------------------------------------
    // Response path: mem_bus_if[*] -> stream_pack -> lsu_mem_if
    // ---------------------------------------------------------------------

    wire [NUM_LANES-1:0]                     lane_rsp_valid;
    wire [NUM_LANES-1:0][RSP_DATA_WIDTH-1:0] lane_rsp_data;
    wire [NUM_LANES-1:0][TAG_WIDTH-1:0]      lane_rsp_tag;
    wire [NUM_LANES-1:0]                     lane_rsp_ready;

    // Gather the per-lane responses into packed vectors for the packer
    for (genvar lane = 0; lane < NUM_LANES; ++lane) begin
        assign mem_bus_if[lane].rsp_ready = lane_rsp_ready[lane];
        assign lane_rsp_valid[lane]       = mem_bus_if[lane].rsp_valid;
        assign lane_rsp_data[lane]        = mem_bus_if[lane].rsp_data.data;
        assign lane_rsp_tag[lane]         = mem_bus_if[lane].rsp_data.tag;
    end

    VX_stream_pack #(
        .NUM_REQS     (NUM_LANES),
        .DATA_WIDTH   (RSP_DATA_WIDTH),
        .TAG_WIDTH    (TAG_WIDTH),
        .TAG_SEL_BITS (TAG_SEL_BITS),
        .ARBITER      (ARBITER),
        .OUT_BUF      (RSP_OUT_BUF)
    ) stream_pack (
        .clk        (clk),
        .reset      (reset),
        // per-lane side
        .valid_in   (lane_rsp_valid),
        .data_in    (lane_rsp_data),
        .tag_in     (lane_rsp_tag),
        .ready_in   (lane_rsp_ready),
        // multi-lane side
        .valid_out  (lsu_mem_if.rsp_valid),
        .mask_out   (lsu_mem_if.rsp_data.mask),
        .data_out   (lsu_mem_if.rsp_data.data),
        .tag_out    (lsu_mem_if.rsp_data.tag),
        .ready_out  (lsu_mem_if.rsp_ready)
    );
endmodule

557
hw/rtl/core/VX_lsu_slice.sv Normal file
View file

@ -0,0 +1,557 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_lsu_slice
//
// One load/store unit slice. Instructions arrive on execute_if, are turned
// into memory requests on lsu_mem_if through a VX_mem_scheduler, and their
// results leave on commit_if. In order, the module:
//   1. computes per-lane addresses (rs1_data + sign-extended offset);
//   2. classifies each address into atype bits (flush / I/O / local memory);
//   3. builds per-lane byte enables and byte-shifted store data;
//   4. packs instruction metadata into the request tag so it can be
//      recovered when the response returns;
//   5. tracks start/end of packet for responses when an instruction is
//      split over several packets (PID_BITS != 0);
//   6. formats load data (byte/half/word select, sign or zero extension);
//   7. commits responses through rsp_buf, and requests flagged by
//      no_rsp_buf_enable through no_rsp_buf, merging both with a priority
//      arbiter.
//
// Parameters:
//   INSTANCE_ID - string used in trace messages and to name the scheduler.
// Ports:
//   execute_if - incoming LSU instructions (slave side).
//   commit_if  - outgoing commits (master side).
//   lsu_mem_if - memory request/response channel (master side).
module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if,
VX_lsu_mem_if.master lsu_mem_if
);
localparam NUM_LANES = `NUM_LSU_LANES;
// Packet id bits: log2 of NUM_THREADS / NUM_LANES; PID_WIDTH is forced to
// at least one bit through UP().
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
// Width of a commit record as seen by the final arbiter:
// uuid + wid + tmask + PC + rd + wb + data + pid + sop + eop
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
// Byte-offset bits inside one LSU word / inside one memory block
localparam REQ_ASHIFT = `CLOG2(LSU_WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
// tag_id = wid + PC + wb + rd + op_type + align + pid + pkt_addr + fence
localparam TAG_ID_WIDTH = `NW_WIDTH + `PC_BITS + 1 + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW + 1;
// tag = uuid + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
// Commit path fed by memory responses
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_rsp_if();
// Commit path fed directly from execute_if (see no_rsp_buf below)
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_no_rsp_if();
`UNUSED_VAR (execute_if.data.rs3_data)
`UNUSED_VAR (execute_if.data.tid)
// full address calculation
wire req_is_fence, rsp_is_fence;
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset);
end
// address type calculation
// The comparisons are done at memory-block granularity (address bits above
// MEM_ASHIFT); each range is half-open: [start, end).
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
// the flush bit simply mirrors "this instruction is a fence"
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
`ifdef LMEM_ENABLE
// is local memory address
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
// schedule memory request
// mem_req_* / mem_rsp_* are the core-side ports of the memory scheduler
wire mem_req_valid;
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] mem_req_addr;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [NUM_LANES-1:0] mem_rsp_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_sop;
wire mem_rsp_eop;
wire mem_rsp_ready;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
// start/end-of-packet flags actually forwarded to commit (see the
// SOP/EOP tracking section below)
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
wire no_rsp_buf_valid, no_rsp_buf_ready;
// fence handling
// fence_lock is set when the last packet (eop) of a fence is sent to memory
// and cleared when the matching fence response's last packet returns. While
// it is set, no new request is issued and execute_if is stalled.
// If both conditions hold in the same cycle the clear wins, because it is
// the later non-blocking assignment.
reg fence_lock;
assign req_is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
always @(posedge clk) begin
if (reset) begin
fence_lock <= 0;
end else begin
if (mem_req_fire && req_is_fence && execute_if.data.eop) begin
fence_lock <= 1;
end
if (mem_rsp_fire && rsp_is_fence && mem_rsp_eop_pkt) begin
fence_lock <= 0;
end
end
end
// Only the last packet of a fence is sent to memory; earlier packets of the
// same fence are skipped and committed through no_rsp_buf instead.
wire req_skip = req_is_fence && ~execute_if.data.eop;
// Requests committed through no_rsp_buf: stores with wb=0, and skipped
// fence packets.
// NOTE(review): presumably these are the requests for which no response
// reaches rsp_buf — confirm against VX_mem_scheduler.
wire no_rsp_buf_enable = (mem_req_rw && ~execute_if.data.wb) || req_skip;
// A request is offered to the scheduler unless it is skipped, the side
// buffer it also needs is full, or a fence is pending.
assign mem_req_valid = execute_if.valid
&& ~req_skip
&& ~(no_rsp_buf_enable && ~no_rsp_buf_ready)
&& ~fence_lock;
// The side buffer is written in the same cycle the request is accepted by
// the scheduler (or immediately, for skipped packets).
assign no_rsp_buf_valid = execute_if.valid
&& no_rsp_buf_enable
&& (req_skip || mem_req_ready)
&& ~fence_lock;
assign execute_if.ready = (mem_req_ready || req_skip)
&& ~(no_rsp_buf_enable && ~no_rsp_buf_ready)
&& ~fence_lock;
assign mem_req_mask = execute_if.data.tmask;
assign mem_req_rw = execute_if.data.op_args.lsu.is_store;
// address formatting
// req_align: byte offset inside the LSU word; mem_req_addr: word address
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
// The access size selects how many bytes are enabled, starting at the
// size-aligned offset inside the word. Sizes not listed (including 32 bit
// when XLEN_64 is not defined) enable the whole word.
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_r;
always @(*) begin
mem_req_byteen_r = '0;
case (`INST_LSU_WSIZE(execute_if.data.op_type))
0: begin // 8-bit
mem_req_byteen_r[req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
// 3: 64 bit
default : mem_req_byteen_r = {LSU_WORD_SIZE{1'b1}};
endcase
end
assign mem_req_byteen[i] = mem_req_byteen_r;
end
// memory misalignment not supported!
// Checked per active lane when the instruction is accepted; fences are
// exempt from the check.
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire lsu_req_fire = execute_if.valid && execute_if.ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
end
// store data formatting
// rs2_data is shifted left by req_align bytes so the value lands on the
// enabled byte lanes. The bytes below the shift keep the unshifted rs2_data
// value from the default assignment.
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_data[i] = execute_if.data.rs2_data[i];
case (req_align[i])
1: mem_req_data[i][`XLEN-1:8] = execute_if.data.rs2_data[i][`XLEN-9:0];
2: mem_req_data[i][`XLEN-1:16] = execute_if.data.rs2_data[i][`XLEN-17:0];
3: mem_req_data[i][`XLEN-1:24] = execute_if.data.rs2_data[i][`XLEN-25:0];
`ifdef XLEN_64
4: mem_req_data[i][`XLEN-1:32] = execute_if.data.rs2_data[i][`XLEN-33:0];
5: mem_req_data[i][`XLEN-1:40] = execute_if.data.rs2_data[i][`XLEN-41:0];
6: mem_req_data[i][`XLEN-1:48] = execute_if.data.rs2_data[i][`XLEN-49:0];
7: mem_req_data[i][`XLEN-1:56] = execute_if.data.rs2_data[i][`XLEN-57:0];
`endif
default:;
endcase
end
end
// track SOP/EOP for out-of-order memory responses
// Only needed when PID_BITS != 0. Each tracking entry (indexed by the
// pkt_waddr carried in the tag) holds:
//   pkt_ctr - read requests sent minus end-of-packet responses received;
//   pkt_sop - set by a read with sop, cleared by the first response;
//   pkt_eop - set once the read with eop has been sent.
// The commit-level eop is raised only for the response that brings the
// counter from 1 to 0 after the last request has been sent.
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
if (PID_BITS != 0) begin
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if.data.sop;
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if.data.eop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire full;
// A new entry is acquired once the last read of an instruction has been
// sent (so all reads of one instruction share the same pkt_waddr); it is
// released when the final response has been received.
VX_allocator #(
.SIZE (`LSUQ_IN_SIZE)
) pkt_allocator (
.clk (clk),
.reset (reset),
.acquire_en (mem_req_rd_eop_fire),
.acquire_addr(pkt_waddr),
.release_en (mem_rsp_eop_pkt),
.release_addr(pkt_raddr),
`UNUSED_PIN (empty),
.full (full)
);
// request and response hit the same entry in the same cycle: the increment
// and the decrement cancel out, so the counter is left untouched
wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
always @(posedge clk) begin
if (reset) begin
pkt_ctr <= '0;
pkt_sop <= '0;
pkt_eop <= '0;
end else begin
if (mem_req_rd_sop_fire) begin
pkt_sop[pkt_waddr] <= 1;
end
if (mem_req_rd_eop_fire) begin
pkt_eop[pkt_waddr] <= 1;
end
if (mem_rsp_fire) begin
pkt_sop[pkt_raddr] <= 0;
end
if (mem_rsp_eop_pkt) begin
pkt_eop[pkt_raddr] <= 0;
end
if (~rd_during_wr) begin
if (mem_req_rd_fire) begin
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
end
if (mem_rsp_eop_fire) begin
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
end
end
end
end
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`UNUSED_VAR (mem_rsp_sop)
end else begin
// single-packet case: the scheduler's own sop/eop are used as is
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
`UNUSED_VAR (pkt_raddr)
end
// pack memory request tag
// The field order here must match the unpacking of mem_rsp_tag further down.
assign mem_req_tag = {
execute_if.data.uuid,
execute_if.data.wid,
execute_if.data.PC,
execute_if.data.wb,
execute_if.data.rd,
execute_if.data.op_type,
req_align,
execute_if.data.pid,
pkt_waddr,
req_is_fence
};
// Memory-side ports of the scheduler; wired 1:1 to lsu_mem_if below
wire lsu_mem_req_valid;
wire lsu_mem_req_rw;
wire [NUM_LANES-1:0] lsu_mem_req_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
wire lsu_mem_req_ready;
wire lsu_mem_rsp_valid;
wire [NUM_LANES-1:0] lsu_mem_rsp_mask;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_rsp_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
wire lsu_mem_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
// Memory scheduler: NUM_LANES core requests onto NUM_LANES memory channels,
// with LINE_SIZE equal to WORD_SIZE, and RSP_PARTIAL enabled.
// NOTE(review): the core-side tag is TAG_WIDTH bits while the memory-side
// tag is LSU_TAG_WIDTH bits; the relation between the two is defined outside
// this file — confirm in VX_gpu_pkg / VX_mem_scheduler.
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)),
.CORE_REQS (NUM_LANES),
.MEM_CHANNELS(NUM_LANES),
.WORD_SIZE (LSU_WORD_SIZE),
.LINE_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
.UUID_WIDTH (`UUID_WIDTH),
.RSP_PARTIAL (1),
.MEM_OUT_BUF (0),
.CORE_OUT_BUF(0)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
// Input request
.core_req_valid (mem_req_valid),
.core_req_rw (mem_req_rw),
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_atype (mem_req_atype),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
`UNUSED_PIN (core_req_empty),
`UNUSED_PIN (core_req_sent),
// Output response
.core_rsp_valid (mem_rsp_valid),
.core_rsp_mask (mem_rsp_mask),
.core_rsp_data (mem_rsp_data),
.core_rsp_tag (mem_rsp_tag),
.core_rsp_sop (mem_rsp_sop),
.core_rsp_eop (mem_rsp_eop),
.core_rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (lsu_mem_req_valid),
.mem_req_rw (lsu_mem_req_rw),
.mem_req_mask (lsu_mem_req_mask),
.mem_req_byteen (lsu_mem_req_byteen),
.mem_req_addr (lsu_mem_req_addr),
.mem_req_atype (lsu_mem_req_atype),
.mem_req_data (lsu_mem_req_data),
.mem_req_tag (lsu_mem_req_tag),
.mem_req_ready (lsu_mem_req_ready),
// Memory response
.mem_rsp_valid (lsu_mem_rsp_valid),
.mem_rsp_mask (lsu_mem_rsp_mask),
.mem_rsp_data (lsu_mem_rsp_data),
.mem_rsp_tag (lsu_mem_rsp_tag),
.mem_rsp_ready (lsu_mem_rsp_ready)
);
// Scheduler memory side <-> lsu_mem_if
assign lsu_mem_if.req_valid = lsu_mem_req_valid;
assign lsu_mem_if.req_data.mask = lsu_mem_req_mask;
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
assign lsu_mem_rsp_valid = lsu_mem_if.rsp_valid;
assign lsu_mem_rsp_mask = lsu_mem_if.rsp_data.mask;
assign lsu_mem_rsp_data = lsu_mem_if.rsp_data.data;
assign lsu_mem_rsp_tag = lsu_mem_if.rsp_data.tag;
assign lsu_mem_if.rsp_ready = lsu_mem_rsp_ready;
// Instruction metadata recovered from the response tag
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] rsp_wid;
wire [`PC_BITS-1:0] rsp_pc;
wire rsp_wb;
wire [`NR_BITS-1:0] rsp_rd;
wire [`INST_LSU_BITS-1:0] rsp_op_type;
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
wire [PID_WIDTH-1:0] rsp_pid;
`UNUSED_VAR (rsp_op_type)
// unpack memory response tag
assign {
rsp_uuid,
rsp_wid,
rsp_pc,
rsp_wb,
rsp_rd,
rsp_op_type,
rsp_align,
rsp_pid,
pkt_raddr,
rsp_is_fence
} = mem_rsp_tag;
// load response formatting
// The addressed byte/half/word is selected from the returned word using
// the alignment bits saved in the tag, then sign- or zero-extended to XLEN
// according to the load format.
reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
`ifdef XLEN_64
`ifdef EXT_F_ENABLE
// apply nan-boxing to flw outputs
// NOTE(review): bit 5 of rd presumably marks a floating-point destination
// register — confirm against the register numbering.
wire rsp_is_float = rsp_rd[5];
`else
wire rsp_is_float = 0;
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
`ifdef XLEN_64
wire [63:0] rsp_data64 = mem_rsp_data[i];
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
`else
wire [31:0] rsp_data32 = mem_rsp_data[i];
`endif
wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
wire [7:0] rsp_data8 = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`INST_LSU_FMT(rsp_op_type))
`INST_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
`ifdef XLEN_64
`INST_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
`INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
`INST_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
`else
`INST_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
`endif
default: rsp_data[i] = 'x;
endcase
end
end
// commit
// Response path: every scheduler response is buffered and turned into a
// commit record carrying the formatted load data.
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (2)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_rsp_valid),
.ready_in (mem_rsp_ready),
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_wb, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_out ({commit_rsp_if.data.uuid, commit_rsp_if.data.wid, commit_rsp_if.data.tmask, commit_rsp_if.data.PC, commit_rsp_if.data.wb, commit_rsp_if.data.rd, commit_rsp_if.data.data, commit_rsp_if.data.pid, commit_rsp_if.data.sop, commit_rsp_if.data.eop}),
.valid_out (commit_rsp_if.valid),
.ready_out (commit_rsp_if.ready)
);
// No-response path: commit record taken straight from execute_if. It
// carries no destination register or data, so rd/wb are tied off below.
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + PID_WIDTH + 1 + 1),
.SIZE (2)
) no_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (no_rsp_buf_valid),
.ready_in (no_rsp_buf_ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_no_rsp_if.data.uuid, commit_no_rsp_if.data.wid, commit_no_rsp_if.data.tmask, commit_no_rsp_if.data.PC, commit_no_rsp_if.data.pid, commit_no_rsp_if.data.sop, commit_no_rsp_if.data.eop}),
.valid_out (commit_no_rsp_if.valid),
.ready_out (commit_no_rsp_if.ready)
);
assign commit_no_rsp_if.data.rd = '0;
assign commit_no_rsp_if.data.wb = 1'b0;
// The data field is irrelevant on this path (wb is 0); it is tied to the
// other path's data instead of a constant.
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
// Merge both commit paths; input 0 is commit_rsp_if, input 1 is
// commit_no_rsp_if.
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("P"), // prioritize commit_rsp_if
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in ({commit_no_rsp_if.valid, commit_rsp_if.valid}),
.ready_in ({commit_no_rsp_if.ready, commit_rsp_if.ready}),
.data_in ({commit_no_rsp_if.data, commit_rsp_if.data}),
.data_out (commit_if.data),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready),
`UNUSED_PIN (sel_out)
);
`ifdef DBG_TRACE_MEM
// Debug trace: fence stalls, accepted read/write requests, and responses.
// The response line prints the scheduler's raw sop/eop, not the
// packet-level flags forwarded to commit.
always @(posedge clk) begin
if (execute_if.valid && fence_lock) begin
`TRACE(1, ("%d: *** %s fence wait\n", $time, INSTANCE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
end else begin
`TRACE(1, ("%d: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
end
end
`endif
`ifdef DBG_SCOPE_LSU
// Scope tap: triggers on reset, request fire and response fire.
// PROBEW must equal the total width of the probes concatenation:
// rw + full_addr + byteen + store data + request uuid + load data + rsp uuid.
// NOTE(review): the load-data term is sized with LSU_WORD_SIZE*8 while
// rsp_data is XLEN wide per lane — these agree only if LSU_WORD_SIZE*8 equals
// XLEN; confirm.
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (1 + NUM_LANES*(`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE*8) + `UUID_WIDTH + NUM_LANES*LSU_WORD_SIZE*8 + `UUID_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes ({mem_req_rw, full_addr, mem_req_byteen, mem_req_data, execute_if.data.uuid, rsp_data, rsp_uuid}),
.bus_in (scope_bus_in),
.bus_out(scope_bus_out)
);
`else
`SCOPE_IO_UNUSED()
`endif
endmodule

View file

@ -14,634 +14,71 @@
`include "VX_define.vh" `include "VX_define.vh"
module VX_lsu_unit import VX_gpu_pkg::*; #( module VX_lsu_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0 parameter `STRING INSTANCE_ID = ""
) ( ) (
`SCOPE_IO_DECL `SCOPE_IO_DECL
input wire clk, input wire clk,
input wire reset, input wire reset,
// Dcache interface // Inputs
VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS],
// inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// outputs // Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH] VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS]
); );
localparam BLOCK_SIZE = 1; localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES; localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS); `ifdef SCOPE
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1; `SCOPE_IO_SWITCH (BLOCK_SIZE);
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_SIZE); `endif
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
localparam REQ_ASHIFT = `CLOG2(DCACHE_WORD_SIZE);
localparam CACHE_TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + LSUQ_TAG_BITS;
VX_execute_if #( VX_execute_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE](); ) per_block_execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
VX_dispatch_unit #( VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE), .BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.OUT_REG (1) .OUT_BUF (1)
) dispatch_unit ( ) dispatch_unit (
.clk (clk), .clk (clk),
.reset (dispatch_reset), .reset (reset),
.dispatch_if(dispatch_if), .dispatch_if(dispatch_if),
.execute_if (execute_if) .execute_if (per_block_execute_if)
); );
VX_commit_if #( VX_commit_if #(
.NUM_LANES (NUM_LANES) .NUM_LANES (NUM_LANES)
) commit_st_if(); ) per_block_commit_if[BLOCK_SIZE]();
VX_commit_if #( for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : lsu_slices
.NUM_LANES (NUM_LANES)
) commit_ld_if();
`UNUSED_VAR (execute_if[0].data.op_mod) `RESET_RELAY (slice_reset, reset);
`UNUSED_VAR (execute_if[0].data.use_PC)
`UNUSED_VAR (execute_if[0].data.use_imm)
`UNUSED_VAR (execute_if[0].data.rs3_data)
`UNUSED_VAR (execute_if[0].data.tid)
`ifdef SM_ENABLE VX_lsu_slice #(
`STATIC_ASSERT((1 << `SMEM_LOG_SIZE) == `MEM_BLOCK_SIZE * ((1 << `SMEM_LOG_SIZE) / `MEM_BLOCK_SIZE), ("invalid parameter")) .INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter")) ) lsu_slice(
localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT); `SCOPE_IO_BIND (block_idx)
localparam SMEM_END_B = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
`endif
// tag = uuid + addr_type + wid + PC + tmask + rd + op_type + align + is_dup + pid + pkt_addr
localparam TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + `NW_WIDTH + `XLEN + NUM_LANES + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + `LSU_DUP_ENABLED + PID_WIDTH + LSUQ_SIZEW;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type;
// full address calculation
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign full_addr[i] = execute_if[0].data.rs1_data[i][`XLEN-1:0] + execute_if[0].data.imm;
end
// detect duplicate addresses
wire lsu_is_dup;
`ifdef LSU_DUP
if (NUM_LANES > 1) begin
wire [NUM_LANES-2:0] addr_matches;
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
assign addr_matches[i] = (execute_if[0].data.rs1_data[i+1] == execute_if[0].data.rs1_data[0]) || ~execute_if[0].data.tmask[i+1];
end
assign lsu_is_dup = execute_if[0].data.tmask[0] && (& addr_matches);
end else begin
assign lsu_is_dup = 0;
end
`else
assign lsu_is_dup = 0;
`endif
// detect address type
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] full_addr_b = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is non-cacheable I/O address
wire is_addr_io = (full_addr_b >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
`ifdef SM_ENABLE
// is shared memory address
wire is_addr_sm = (full_addr_b >= SMEM_START_B) && (full_addr_b < SMEM_END_B);
assign lsu_addr_type[i] = {is_addr_io, is_addr_sm};
`else
assign lsu_addr_type[i] = is_addr_io;
`endif
end
wire mem_req_empty;
wire st_rsp_ready;
wire lsu_valid, lsu_ready;
// fence: stall the pipeline until all pending requests are sent
wire is_fence = `INST_LSU_IS_FENCE(execute_if[0].data.op_type);
wire fence_wait = is_fence && ~mem_req_empty;
assign lsu_valid = execute_if[0].valid && ~fence_wait;
assign execute_if[0].ready = lsu_ready && ~fence_wait;
// schedule memory request
wire mem_req_valid;
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
reg [NUM_LANES-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [NUM_LANES-1:0] mem_rsp_mask;
wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_sop;
wire mem_rsp_eop;
wire mem_rsp_ready;
assign mem_req_valid = lsu_valid;
assign lsu_ready = mem_req_ready
&& (~mem_req_rw || st_rsp_ready); // writes commit directly
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign mem_req_mask[i] = execute_if[0].data.tmask[i] && (~lsu_is_dup || (i == 0));
end
assign mem_req_rw = ~execute_if[0].data.wb;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
// address formatting
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_byteen[i] = '0;
case (`INST_LSU_WSIZE(execute_if[0].data.op_type))
0: begin // 8-bit
mem_req_byteen[i][req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {DCACHE_WORD_SIZE{1'b1}};
endcase
end
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire lsu_req_fire = execute_if[0].valid && execute_if[0].ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if[0].data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if[0].data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if[0].data.wid, execute_if[0].data.PC, full_addr[i], `INST_LSU_WSIZE(execute_if[0].data.op_type), execute_if[0].data.uuid));
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_data[i] = execute_if[0].data.rs2_data[i];
case (req_align[i])
1: mem_req_data[i][`XLEN-1:8] = execute_if[0].data.rs2_data[i][`XLEN-9:0];
2: mem_req_data[i][`XLEN-1:16] = execute_if[0].data.rs2_data[i][`XLEN-17:0];
3: mem_req_data[i][`XLEN-1:24] = execute_if[0].data.rs2_data[i][`XLEN-25:0];
`ifdef XLEN_64
4: mem_req_data[i][`XLEN-1:32] = execute_if[0].data.rs2_data[i][`XLEN-33:0];
5: mem_req_data[i][`XLEN-1:40] = execute_if[0].data.rs2_data[i][`XLEN-41:0];
6: mem_req_data[i][`XLEN-1:48] = execute_if[0].data.rs2_data[i][`XLEN-49:0];
7: mem_req_data[i][`XLEN-1:56] = execute_if[0].data.rs2_data[i][`XLEN-57:0];
`endif
default:;
endcase
end
end
// track SOP/EOP for out-of-order memory responses
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
if (PID_BITS != 0) begin
reg [`LSUQ_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_SIZE-1:0] pkt_sop, pkt_eop;
wire mem_req_rd_fire = mem_req_fire && execute_if[0].data.wb;
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if[0].data.sop;
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if[0].data.eop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire full;
VX_allocator #(
.SIZE (`LSUQ_SIZE)
) pkt_allocator (
.clk (clk), .clk (clk),
.reset (reset), .reset (slice_reset),
.acquire_en (mem_req_rd_eop_fire), .execute_if (per_block_execute_if[block_idx]),
.acquire_addr(pkt_waddr), .commit_if (per_block_commit_if[block_idx]),
.release_en (mem_rsp_eop_pkt), .lsu_mem_if (lsu_mem_if[block_idx])
.release_addr(pkt_raddr),
`UNUSED_PIN (empty),
.full (full)
); );
wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
always @(posedge clk) begin
if (reset) begin
pkt_ctr <= '0;
pkt_sop <= '0;
pkt_eop <= '0;
end else begin
if (mem_req_rd_sop_fire) begin
pkt_sop[pkt_waddr] <= 1;
end end
if (mem_req_rd_eop_fire) begin
pkt_eop[pkt_waddr] <= 1;
end
if (mem_rsp_fire) begin
pkt_sop[pkt_raddr] <= 0;
end
if (mem_rsp_eop_pkt) begin
pkt_eop[pkt_raddr] <= 0;
end
if (~rd_during_wr) begin
if (mem_req_rd_fire) begin
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
end
if (mem_rsp_eop_fire) begin
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
end
end
end
end
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`UNUSED_VAR (mem_rsp_sop)
end else begin
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
`UNUSED_VAR (pkt_raddr)
end
assign mem_req_tag = {
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
`ifdef LSU_DUP
, lsu_is_dup
`endif
};
wire [DCACHE_NUM_REQS-1:0] cache_req_valid;
wire [DCACHE_NUM_REQS-1:0] cache_req_rw;
wire [DCACHE_NUM_REQS-1:0][(`XLEN/8)-1:0] cache_req_byteen;
wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_req_data;
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_NUM_REQS-1:0] cache_req_ready;
wire [DCACHE_NUM_REQS-1:0] cache_rsp_valid;
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_rsp_data;
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_rsp_tag;
wire [DCACHE_NUM_REQS-1:0] cache_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched", CORE_ID)),
.NUM_REQS (LSU_MEM_REQS),
.NUM_BANKS (DCACHE_NUM_REQS),
.ADDR_WIDTH (DCACHE_ADDR_WIDTH),
.DATA_WIDTH (`XLEN),
.QUEUE_SIZE (`LSUQ_SIZE),
.TAG_WIDTH (TAG_WIDTH),
.MEM_TAG_ID (`UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS)),
.UUID_WIDTH (`UUID_WIDTH),
.RSP_PARTIAL (1),
.MEM_OUT_REG (2)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
// Input request
.req_valid (mem_req_valid),
.req_rw (mem_req_rw),
.req_mask (mem_req_mask),
.req_byteen (mem_req_byteen),
.req_addr (mem_req_addr),
.req_data (mem_req_data),
.req_tag (mem_req_tag),
.req_empty (mem_req_empty),
.req_ready (mem_req_ready),
`UNUSED_PIN (write_notify),
// Output response
.rsp_valid (mem_rsp_valid),
.rsp_mask (mem_rsp_mask),
.rsp_data (mem_rsp_data),
.rsp_tag (mem_rsp_tag),
.rsp_sop (mem_rsp_sop),
.rsp_eop (mem_rsp_eop),
.rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (cache_req_valid),
.mem_req_rw (cache_req_rw),
.mem_req_byteen (cache_req_byteen),
.mem_req_addr (cache_req_addr),
.mem_req_data (cache_req_data),
.mem_req_tag (cache_req_tag),
.mem_req_ready (cache_req_ready),
// Memory response
.mem_rsp_valid (cache_rsp_valid),
.mem_rsp_data (cache_rsp_data),
.mem_rsp_tag (cache_rsp_tag),
.mem_rsp_ready (cache_rsp_ready)
);
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign cache_bus_if[i].req_valid = cache_req_valid[i];
assign cache_bus_if[i].req_data.rw = cache_req_rw[i];
assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
assign cache_bus_if[i].req_data.data = cache_req_data[i];
assign cache_req_ready[i] = cache_bus_if[i].req_ready;
assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
end
// cache tag formatting: <uuid, tag, type>
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
wire [`UUID_WIDTH-1:0] cache_req_uuid, cache_rsp_uuid;
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type, cache_rsp_type;
wire [`CLOG2(`LSUQ_SIZE)-1:0] cache_req_tag_x, cache_rsp_tag_x;
if (DCACHE_NUM_BATCHES > 1) begin
wire [DCACHE_NUM_BATCHES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_b, cache_rsp_type_b;
wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_bi, cache_rsp_type_bi;
wire [DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
assign {cache_req_uuid, cache_req_type, cache_req_bid, cache_req_tag_x} = cache_req_tag[i];
assign cache_req_type_bi = cache_req_type_b[cache_req_bid];
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_bid, cache_req_tag_x, cache_req_type_bi};
assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag_x, cache_rsp_type_bi} = cache_bus_if[i].rsp_data.tag;
assign cache_rsp_type_b = {DCACHE_NUM_BATCHES{cache_rsp_type_bi}};
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_bid, cache_rsp_tag_x};
for (genvar j = 0; j < DCACHE_NUM_BATCHES; ++j) begin
localparam k = j * DCACHE_NUM_REQS + i;
if (k < NUM_LANES) begin
assign cache_req_type_b[j] = cache_req_type[k];
assign cache_rsp_type[k] = cache_rsp_type_b[j];
end else begin
assign cache_req_type_b[j] = '0;
`UNUSED_VAR (cache_rsp_type_b[j])
end
end
end else begin
assign {cache_req_uuid, cache_req_type, cache_req_tag_x} = cache_req_tag[i];
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_tag_x, cache_req_type[i]};
assign {cache_rsp_uuid, cache_rsp_tag_x, cache_rsp_type[i]} = cache_bus_if[i].rsp_data.tag;
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_tag_x};
for (genvar j = 0; j < DCACHE_NUM_REQS; ++j) begin
if (i != j) begin
`UNUSED_VAR (cache_req_type[j])
assign cache_rsp_type[j] = '0;
end
end
end
end
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] rsp_addr_type;
wire [`NW_WIDTH-1:0] rsp_wid;
wire [NUM_LANES-1:0] rsp_tmask_uq;
wire [`XLEN-1:0] rsp_pc;
wire [`NR_BITS-1:0] rsp_rd;
wire [`INST_LSU_BITS-1:0] rsp_op_type;
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
wire [PID_WIDTH-1:0] rsp_pid;
wire rsp_is_dup;
`ifndef LSU_DUP
assign rsp_is_dup = 0;
`endif
assign {
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
`ifdef LSU_DUP
, rsp_is_dup
`endif
} = mem_rsp_tag;
`UNUSED_VAR (rsp_addr_type)
`UNUSED_VAR (rsp_op_type)
// load response formatting
reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
wire [NUM_LANES-1:0] rsp_tmask;
`ifdef XLEN_64
`ifdef EXT_F_ENABLE
// apply nan-boxing to flw outputs
wire rsp_is_float = rsp_rd[5];
`else
wire rsp_is_float = 0;
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
`ifdef XLEN_64
wire [63:0] rsp_data64 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? (rsp_align[0][2] ? mem_rsp_data[0][63:32] : mem_rsp_data[0][31:0]) :
(rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
`else
wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
`endif
wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
wire [7:0] rsp_data8 = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`INST_LSU_FMT(rsp_op_type))
`INST_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
`ifdef XLEN_64
`INST_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
`INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
`INST_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
`else
`INST_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
`endif
default: rsp_data[i] = 'x;
endcase
end
end
assign rsp_tmask = rsp_is_dup ? rsp_tmask_uq : mem_rsp_mask;
// load commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (2)
) ld_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_rsp_valid),
.ready_in (mem_rsp_ready),
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
.valid_out (commit_ld_if.valid),
.ready_out (commit_ld_if.ready)
);
assign commit_ld_if.data.wb = 1'b1;
// store commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
.SIZE (2)
) st_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_fire && mem_req_rw),
.ready_in (st_rsp_ready),
.data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
.valid_out (commit_st_if.valid),
.ready_out (commit_st_if.ready)
);
assign commit_st_if.data.rd = '0;
assign commit_st_if.data.wb = 1'b0;
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
// lsu commit
`RESET_RELAY (commit_reset, reset);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_arb_if[1]();
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.OUT_REG (1)
) rsp_arb (
.clk (clk),
.reset (commit_reset),
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
.data_in ({commit_st_if.data, commit_ld_if.data}),
.data_out (commit_arb_if[0].data),
.valid_out (commit_arb_if[0].valid),
.ready_out (commit_arb_if[0].ready),
`UNUSED_PIN (sel_out)
);
VX_gather_unit #( VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE), .BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES), .NUM_LANES (NUM_LANES),
.OUT_REG (3) .OUT_BUF (3)
) gather_unit ( ) gather_unit (
.clk (clk), .clk (clk),
.reset (commit_reset), .reset (reset),
.commit_in_if (commit_arb_if), .commit_in_if (per_block_commit_if),
.commit_out_if (commit_if) .commit_out_if (commit_if)
); );
`ifdef DBG_SCOPE_LSU
if (CORE_ID == 0) begin
`ifdef SCOPE
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (`UUID_WIDTH+NUM_LANES*(`XLEN+4+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes({execute_if[0].data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
wire [31:0] full_addr_0 = full_addr[0];
wire [31:0] mem_req_data_0 = mem_req_data[0];
wire [31:0] rsp_data_0 = rsp_data[0];
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({mem_req_data_0, execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, rsp_tmask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
.probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
.probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef DBG_TRACE_CORE_DCACHE
always @(posedge clk) begin
if (execute_if[0].valid && fence_wait) begin
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
`TRACE(1, (", data="));
`TRACE_ARRAY1D(1, mem_req_data, NUM_LANES);
`TRACE(1, (", is_dup=%b (#%0d)\n", lsu_is_dup, execute_if[0].data.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
`TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, tag=0x%0h, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, mem_rsp_tag, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, mem_rsp_data, NUM_LANES);
`TRACE(1, (", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid));
end
end
`endif
endmodule endmodule

View file

@ -13,189 +13,241 @@
`include "VX_define.vh" `include "VX_define.vh"
// reset all GPRs in debug mode
`ifdef SIMULATION
`ifndef NDEBUG
`define GPR_RESET
`endif
`endif
module VX_operands import VX_gpu_pkg::*; #( module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0, parameter `STRING INSTANCE_ID = "",
parameter CACHE_ENABLE = 0 parameter NUM_BANKS = 4,
parameter OUT_BUF = 4 // using 2-cycle EB for area reduction
) ( ) (
input wire clk, input wire clk,
input wire reset, input wire reset,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH], `ifdef PERF_ENABLE
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH], output wire [`PERF_CTR_BITS-1:0] perf_stalls,
VX_operands_if.master operands_if [`ISSUE_WIDTH] `endif
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_operands_if.master operands_if
); );
`UNUSED_PARAM (CORE_ID) `UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS; localparam NUM_SRC_REGS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
localparam BYTEENW = `NUM_THREADS * XLEN_SIZE;
localparam STATE_IDLE = 2'd0; `UNUSED_VAR (writeback_if.data.sop)
localparam STATE_FETCH1 = 2'd1;
localparam STATE_FETCH2 = 2'd2;
localparam STATE_FETCH3 = 2'd3;
localparam STATE_BITS = 2;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin wire [NUM_SRC_REGS-1:0] src_valid;
wire [`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data; wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n; wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n; wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0]; wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0]; wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0]; wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0]; wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2;
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0]; wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n; wire pipe_valid_st1, pipe_ready_st1;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n; wire pipe_valid_st2, pipe_ready_st2;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n; wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [STATE_BITS-1:0] state, state_n; reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
reg [`NR_BITS-1:0] rs2, rs2_n; wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
reg [`NR_BITS-1:0] rs3, rs3_n;
reg rs2_ready, rs2_ready_n;
reg rs3_ready, rs3_ready_n;
reg data_ready, data_ready_n;
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0); reg [NUM_SRC_REGS-1:0] data_fetched_n;
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0); wire [NUM_SRC_REGS-1:0] data_fetched_st1;
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
VX_operands_if staging_if(); reg has_collision_n;
wire has_collision_st1;
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
scoreboard_if.data.rs1};
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
if (ISSUE_WIS != 0) begin
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
end
if (NUM_BANKS != 1) begin
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
end else begin
assign req_bank_idx[i] = '0;
end
end
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
end
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
.reset (reset),
`UNUSED_PIN(collisions),
.valid_in (req_in_valid),
.data_in (req_in_data),
.sel_in (req_bank_idx),
.ready_in (req_in_ready),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out (gpr_rd_ready)
);
wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1;
assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}};
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
always @(*) begin always @(*) begin
state_n = state; has_collision_n = 0;
rs2_n = rs2; for (integer i = 0; i < NUM_SRC_REGS; ++i) begin
rs3_n = rs3; for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin
rs2_ready_n = rs2_ready; has_collision_n |= src_valid[i]
rs3_ready_n = rs3_ready; && src_valid[j+i]
rs1_data_n = rs1_data; && (req_bank_idx[i] == req_bank_idx[j+i]);
rs2_data_n = rs2_data;
rs3_data_n = rs3_data;
cache_data_n = cache_data;
cache_reg_n = cache_reg;
cache_tmask_n= cache_tmask;
cache_eop_n = cache_eop;
gpr_rd_rid_n = gpr_rd_rid;
gpr_rd_wis_n = gpr_rd_wis;
data_ready_n = data_ready;
case (state)
STATE_IDLE: begin
if (staging_if.valid && staging_if.ready) begin
data_ready_n = 0;
end
if (scoreboard_if[i].valid && data_ready_n == 0) begin
data_ready_n = 1;
if (is_rs3_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs3 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs3_data_n = (is_rs3_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs3_ready_n = 1;
end else begin
rs3_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs3;
data_ready_n = 0;
state_n = STATE_FETCH3;
end
if (is_rs2_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs2 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs2_data_n = (is_rs2_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs2_ready_n = 1;
end else begin
rs2_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs2;
data_ready_n = 0;
state_n = STATE_FETCH2;
end
if (is_rs1_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs1 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs1_data_n = (is_rs1_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
end else begin
gpr_rd_rid_n = scoreboard_if[i].data.rs1;
data_ready_n = 0;
state_n = STATE_FETCH1;
end
end
gpr_rd_wis_n = scoreboard_if[i].data.wis;
rs2_n = scoreboard_if[i].data.rs2;
rs3_n = scoreboard_if[i].data.rs3;
end
STATE_FETCH1: begin
rs1_data_n = gpr_rd_data;
if (~rs2_ready) begin
gpr_rd_rid_n = rs2;
state_n = STATE_FETCH2;
end else if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH2: begin
rs2_data_n = gpr_rd_data;
if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH3: begin
rs3_data_n = gpr_rd_data;
data_ready_n = 1;
state_n = STATE_IDLE;
end
endcase
if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin
if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd)
|| (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin
for (integer j = 0; j < `NUM_THREADS; ++j) begin
if (writeback_if[i].data.tmask[j]) begin
cache_data_n[writeback_if[i].data.wis][j] = writeback_if[i].data.data[j];
end
end
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
end end
end end
end end
always @(posedge clk) begin always @(*) begin
if (reset) begin data_fetched_n = data_fetched_st1;
state <= STATE_IDLE; if (scoreboard_if.ready) begin
gpr_rd_rid <= '0; data_fetched_n = '0;
gpr_rd_wis <= '0;
cache_eop <= {ISSUE_RATIO{1'b1}};
data_ready <= 0;
end else begin end else begin
state <= state_n; data_fetched_n = data_fetched_st1 | req_in_ready;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
end end
end end
// Pack the per-instruction metadata taken from scoreboard_if into a single
// vector (pipe_data) so it can travel through the pipeline registers as one
// field.
// NOTE: the `assign` opener previously shared a line with a leading `//`
// comment, which commented it out and left the field list dangling; it now
// sits on its own line.
// The field order must match the order in which out_buf's data_out unpacks
// the metadata into operands_if.data (wis, tmask, PC, wb, ex_type, op_type,
// op_args, rd, uuid).
assign pipe_data = {
    scoreboard_if.data.wis,
    scoreboard_if.data.tmask,
    scoreboard_if.data.PC,
    scoreboard_if.data.wb,
    scoreboard_if.data.ex_type,
    scoreboard_if.data.op_type,
    scoreboard_if.data.op_args,
    scoreboard_if.data.rd,
    scoreboard_if.data.uuid
};
// Pipeline register, stage 1.
// Latches, in this order: the scoreboard valid bit, the data_fetched_n mask,
// the per-bank read valids, the packed metadata (pipe_data), the collision
// flag, and the per-bank read addresses and request indices.
// The DATAW terms appear to correspond one-to-one with the data_in fields
// (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + per-bank addr/idx);
// keep both in sync when adding or removing a field.
// RESETW covers the leading valid bit plus the NUM_SRC_REGS-wide
// data_fetched field - presumably the only bits cleared on reset; confirm
// against VX_pipe_register.
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_REGS)
) pipe_reg1 (
.clk (clk),
.reset (reset),
// the stage only captures new input while pipe_in_ready is asserted
.enable (pipe_in_ready),
.data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1})
);
// Stage 1 is ready when stage 2 is ready or holds no valid entry.
assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2;
// Source-operand data fed into stage 2: zeroed when pipe_fire_st2 is set,
// otherwise the merged value src_data_n is carried forward.
assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n;
// A stage-1 entry is passed on as valid only when no collision was flagged
// for it (has_collision_st1 is low).
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
// Pipeline register, stage 2.
// Latches, in this order: the collision-qualified valid bit, the source
// operand data, the per-bank read valids, the per-bank read data, the packed
// metadata, and the per-bank request indices.
// The DATAW terms appear to correspond one-to-one with the data_in fields;
// keep both in sync when adding or removing a field.
// RESETW covers the valid bit plus the whole source-data field
// (NUM_SRC_REGS * REGS_DATAW), hence the dedicated reset relay below.
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
) pipe_reg2 (
.clk (clk),
.reset (pipe2_reset),
// advances whenever stage 1 is allowed to move forward
.enable (pipe_ready_st1),
.data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
.data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2})
);
// Merge the register data returned by the banks into the source-operand
// array. Starts from the value held in stage 2 and, for every bank that
// reports a valid read, overwrites the operand slot selected by that bank's
// request index with the bank's read data.
always @(*) begin
    // default: keep what has already been collected
    src_data_n = src_data_st2;
    for (integer bank = 0; bank < NUM_BANKS; ++bank) begin
        if (gpr_rd_valid_st2[bank]) begin
            src_data_n[gpr_rd_req_idx_st2[bank]] = gpr_rd_data_st2[bank];
        end
    end
end
// Output buffer between pipeline stage 2 and operands_if.
// Input is the stage-2 metadata followed by the three merged source operands
// (src_data_n[0..2]); output is unpacked field by field into operands_if.data.
// The data_out field order must mirror the packing order of pipe_data
// followed by rs1/rs2/rs3 data.
// SIZE and OUT_REG are derived from the OUT_BUF parameter through the
// TO_OUT_BUF_* macros; the meaning of LUTRAM is defined by
// VX_elastic_buffer (presumably selects LUT-based storage - confirm).
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (1)
) out_buf (
.clk (clk),
.reset (reset),
// upstream handshake: stage-2 valid in, stage-2 ready back out
.valid_in (pipe_valid_st2),
.ready_in (pipe_ready_st2),
.data_in ({
pipe_data_st2,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.data_out ({
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.wb,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.uuid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
// downstream handshake with the consumer of operands_if
.valid_out (operands_if.valid),
.ready_out (operands_if.ready)
);
// Write address inside a register bank.
// Uses the bits of the destination register rd above the low BANK_SEL_BITS
// bits; when ISSUE_WIS is non-zero the writeback warp index (wis) is
// appended as the low-order part of the address.
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
end else begin
// ISSUE_WIS == 0: the address is the upper rd bits alone
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
end
// Index of the bank targeted by a writeback.
// With more than one bank it is the low BANK_SEL_BITS bits of the
// destination register rd; with a single bank it is tied to zero.
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
if (NUM_BANKS != 1) begin
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
end else begin
assign gpr_wr_bank_idx = '0;
end
`ifdef GPR_RESET `ifdef GPR_RESET
reg wr_enabled = 0; reg wr_enabled = 0;
@ -208,94 +260,52 @@ module VX_operands import VX_gpu_pkg::*; #(
wire wr_enabled = 1; wire wr_enabled = 1;
`endif `endif
for (genvar j = 0; j < `NUM_THREADS; ++j) begin for (genvar b = 0; b < NUM_BANKS; ++b) begin
// Write enable for this bank (b is the genvar of the enclosing per-bank loop).
// Requires wr_enabled and a valid writeback; when bank-select bits exist,
// the writeback's bank index must also equal this bank's index.
wire gpr_wr_enabled;
if (BANK_SEL_BITS != 0) begin
assign gpr_wr_enabled = wr_enabled
&& writeback_if.valid
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
end else begin
// no bank-select bits: every valid writeback targets this bank
assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
end
// Write-enable mask for the bank RAM: each thread's tmask bit is replicated
// XLEN_SIZE times, so only the slices of active threads are written.
// Presumably one mask bit per byte of the thread's register word
// (BYTEENW = NUM_THREADS * XLEN_SIZE) - confirm against the declarations.
wire [BYTEENW-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
VX_dp_ram #( VX_dp_ram #(
.DATAW (`XLEN), .DATAW (REGS_DATAW),
.SIZE (`NUM_REGS * ISSUE_RATIO), .SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.WRENW (BYTEENW),
`ifdef GPR_RESET `ifdef GPR_RESET
.INIT_ENABLE (1), .RESET_RAM (1),
.INIT_VALUE (0),
`endif `endif
.NO_RWCHECK (1) .NO_RWCHECK (1)
) gpr_ram ( ) gpr_ram (
.clk (clk), .clk (clk),
.read (1'b1), .reset (reset),
`UNUSED_PIN (wren), .read (pipe_fire_st1),
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]), .wren (wren),
.waddr (wis_to_addr(writeback_if[i].data.rd, writeback_if[i].data.wis)), .write (gpr_wr_enabled),
.wdata (writeback_if[i].data.data[j]), .waddr (gpr_wr_addr),
.raddr (wis_to_addr(gpr_rd_rid, gpr_rd_wis)), .wdata (writeback_if.data.data),
.rdata (gpr_rd_data[j]) .raddr (gpr_rd_addr_st1[b]),
.rdata (gpr_rd_data_st1[b])
); );
end end
// staging buffer `ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] collisions_r;
`RESET_RELAY (stg_buf_reset, reset); always @(posedge clk) begin
if (reset) begin
VX_elastic_buffer #( collisions_r <= '0;
.DATAW (DATAW) end else begin
) stg_buf ( collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_in_ready && has_collision_n);
.clk (clk),
.reset (stg_buf_reset),
.valid_in (scoreboard_if[i].valid),
.ready_in (scoreboard_if[i].ready),
.data_in ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd}),
.data_out ({
staging_if.data.uuid,
staging_if.data.wis,
staging_if.data.tmask,
staging_if.data.PC,
staging_if.data.wb,
staging_if.data.ex_type,
staging_if.data.op_type,
staging_if.data.op_mod,
staging_if.data.use_PC,
staging_if.data.use_imm,
staging_if.data.imm,
staging_if.data.rd}),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
// Drive the three source-operand fields of the staging interface from the
// rs1_data / rs2_data / rs3_data signals.
assign staging_if.data.rs1_data = rs1_data;
assign staging_if.data.rs2_data = rs2_data;
assign staging_if.data.rs3_data = rs3_data;
// output buffer
// Handshake between the staging interface and the output buffer, gated by
// data_ready in both directions: the staged entry is only presented as valid,
// and only acknowledged as consumed, while data_ready is set.
wire valid_stg, ready_stg;
assign valid_stg = staging_if.valid && data_ready;
assign staging_if.ready = ready_stg && data_ready;
// Output buffer from the staging interface to operands_if[i], with its own
// relayed reset.
// The payload width is DATAW plus three operand vectors of
// NUM_THREADS * XLEN bits each.
// SIZE and OUT_REG are fixed at 2; their exact meaning is defined by
// VX_elastic_buffer - confirm there before changing them.
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
// input side uses the data_ready-gated staging handshake
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (operands_if[i].data),
.valid_out (operands_if[i].valid),
.ready_out (operands_if[i].ready)
);
end end
end
assign perf_stalls = collisions_r;
`endif
endmodule endmodule

Some files were not shown because too many files have changed in this diff Show more