Merge branch 'master' into master

This commit is contained in:
tinebp 2024-08-07 18:09:27 -07:00 committed by GitHub
commit aad3b26332
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
1631 changed files with 1923326 additions and 186050 deletions

View file

@ -1,3 +0,0 @@
ignore:
- "./examples/*"
- "./tests/*"

176
.github/workflows/ci.yml vendored Normal file
View file

@ -0,0 +1,176 @@
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: CI
on: [push, pull_request]
jobs:
setup:
runs-on: ubuntu-20.04
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
submodules: recursive
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
- name: Install Dependencies
if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
sudo bash ./ci/system_updates.sh
- name: Setup Toolchain
if: steps.cache-toolchain.outputs.cache-hit != 'true'
run: |
TOOLDIR=$PWD/tools
mkdir -p build
cd build
../configure --tooldir=$TOOLDIR
ci/toolchain_install.sh --all
- name: Setup Third Party
if: steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
make -C third_party > /dev/null
build:
runs-on: ubuntu-20.04
needs: setup
strategy:
matrix:
xlen: [32, 64]
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Install Dependencies
run: |
sudo bash ./ci/system_updates.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
- name: Run Build
run: |
TOOLDIR=$PWD/tools
mkdir -p build${{ matrix.xlen }}
cd build${{ matrix.xlen }}
../configure --tooldir=$TOOLDIR --xlen=${{ matrix.xlen }}
source ci/toolchain_env.sh
make software -s > /dev/null
make tests -s > /dev/null
- name: Upload Build Artifact
uses: actions/upload-artifact@v2
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
tests:
runs-on: ubuntu-20.04
needs: build
strategy:
fail-fast: false
matrix:
name: [regression, opencl, cache, config1, config2, debug, stress]
xlen: [32, 64]
steps:
- name: Checkout code
uses: actions/checkout@v2
- name: Install Dependencies
run: |
sudo bash ./ci/system_updates.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
restore-keys: |
${{ runner.os }}-toolchain-
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
restore-keys: |
${{ runner.os }}-thirdparty-
- name: Download Build Artifact
uses: actions/download-artifact@v2
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
- name: Run tests
run: |
cd build${{ matrix.xlen }}
source ci/toolchain_env.sh
chmod -R +x . # Ensure all files have executable permissions
if [ "${{ matrix.name }}" == "regression" ]; then
./ci/regression.sh --unittest
./ci/regression.sh --isa
./ci/regression.sh --kernel
./ci/regression.sh --synthesis
./ci/regression.sh --regression
else
./ci/regression.sh --${{ matrix.name }}
fi
complete:
runs-on: ubuntu-20.04
needs: tests
steps:
- name: Check Completion
run: echo "All matrix jobs passed"

3
.gitignore vendored Normal file
View file

@ -0,0 +1,3 @@
/build*
/.vscode
*.cache

2
.gitmodules vendored
View file

@ -6,4 +6,4 @@
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
[submodule "third_party/ramulator"]
path = third_party/ramulator
url = https://github.com/CMU-SAFARI/ramulator.git
url = https://github.com/CMU-SAFARI/ramulator2.git

View file

@ -1,90 +0,0 @@
language: cpp
dist: focal
os: linux
compiler: gcc
addons:
apt:
packages:
- build-essential
- valgrind
- libstdc++6
env:
global:
- TOOLDIR=$HOME/tools
cache:
directories:
- $TOOLDIR
- $HOME/build32
- $HOME/build64
before_install:
- if [ ! -d "$TOOLDIR" ] || [ -z "$(ls -A $TOOLDIR)" ]; then
mkdir -p $TOOLDIR;
OSDIR=ubuntu/focal ./ci/toolchain_install.sh --all;
fi
- source ./ci/toolchain_env.sh
stages:
- setup
- test
jobs:
include:
- stage: setup
script:
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
- make -C $HOME/build32
- XLEN=64 make -C $HOME/build64
- stage: test
name: unittest
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
- stage: test
name: isa
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test
name: isa64
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test
name: regression
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test
name: regression64
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test
name: opencl
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl
- stage: test
name: cluster
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --cluster
- stage: test
name: config
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --config
- stage: test
name: debug
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --debug
- stage: test
name: stress0
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --stress0
- stage: test
name: stress1
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --stress1
- stage: test
name: synthesis
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --synthesis
- stage: test
name: synthesis64
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --synthesis
after_success:
# Gather code coverage
- lcov --directory runtime --capture --output-file runtime.cov # capture trace
- lcov --directory sim --capture --output-file sim.cov # capture trace
- lcov --list runtime.cov # output coverage data for debugging
- lcov --list sim.cov # output coverage data for debugging
# Upload coverage report
- bash <(curl -s https://codecov.io/bash) -f runtime.cov
- bash <(curl -s https://codecov.io/bash) -f sim.cov

View file

@ -1,28 +0,0 @@
all:
$(MAKE) -C third_party
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C kernel
$(MAKE) -C runtime
$(MAKE) -C tests
clean:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean
clean-all:
$(MAKE) -C third_party clean
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean-all
crtlsim:
$(MAKE) -C sim clean
brtlsim:
$(MAKE) -C sim

74
Makefile.in Normal file
View file

@ -0,0 +1,74 @@
include config.mk
.PHONY: build software tests
all:
$(MAKE) -C $(VORTEX_HOME)/third_party
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C kernel
$(MAKE) -C runtime
$(MAKE) -C tests
build:
$(MAKE) -C hw
$(MAKE) -C sim
$(MAKE) -C kernel
$(MAKE) -C runtime
$(MAKE) -C tests
software:
$(MAKE) -C hw
$(MAKE) -C kernel
$(MAKE) -C runtime/stub
tests:
$(MAKE) -C tests
clean-build:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean
clean: clean-build
$(MAKE) -C $(VORTEX_HOME)/third_party clean
# Install setup
KERNEL_INC_DST = $(INSTALLDIR)/kernel/include
KERNEL_LIB_DST = $(INSTALLDIR)/kernel/lib$(XLEN)
RUNTIME_INC_DST = $(INSTALLDIR)/runtime/include
RUNTIME_LIB_DST = $(INSTALLDIR)/runtime/lib
KERNEL_HEADERS = $(wildcard $(VORTEX_HOME)/kernel/include/*.h)
KERNEL_LIBS = $(wildcard kernel/*.a)
RUNTIME_HEADERS = $(wildcard $(VORTEX_HOME)/runtime/include/*.h)
RUNTIME_LIBS = $(wildcard runtime/*.so)
INSTALL_DIRS = $(KERNEL_LIB_DST) $(RUNTIME_LIB_DST) $(KERNEL_INC_DST) $(RUNTIME_INC_DST)
$(INSTALL_DIRS):
mkdir -p $@
$(KERNEL_INC_DST)/VX_types.h: hw/VX_types.h | $(KERNEL_INC_DST)
cp $< $@
$(KERNEL_INC_DST)/%.h: $(VORTEX_HOME)/kernel/include/%.h | $(KERNEL_INC_DST)
cp $< $@
$(RUNTIME_INC_DST)/%.h: $(VORTEX_HOME)/runtime/include/%.h | $(RUNTIME_INC_DST)
cp $< $@
$(KERNEL_LIB_DST)/%.a: kernel/%.a | $(KERNEL_LIB_DST)
cp $< $@
$(RUNTIME_LIB_DST)/%.so: runtime/%.so | $(RUNTIME_LIB_DST)
cp $< $@
install: $(INSTALL_DIRS) \
$(KERNEL_INC_DST)/VX_types.h \
$(KERNEL_HEADERS:$(VORTEX_HOME)/kernel/include/%=$(KERNEL_INC_DST)/%) \
$(RUNTIME_HEADERS:$(VORTEX_HOME)/runtime/include/%=$(RUNTIME_INC_DST)/%) \
$(KERNEL_LIBS:kernel/%=$(KERNEL_LIB_DST)/%) \
$(RUNTIME_LIBS:runtime/%=$(RUNTIME_LIB_DST)/%)

View file

@ -1,6 +1,3 @@
[![Build Status](https://travis-ci.com/vortexgpgpu/vortex.svg?branch=master)](https://travis-ci.com/vortexgpgpu/vortex)
[![codecov](https://codecov.io/gh/vortexgpgpu/vortex/branch/master/graph/badge.svg)](https://codecov.io/gh/vortexgpgpu/vortex)
# Vortex GPGPU
Vortex is a full-stack open-source RISC-V GPGPU.
@ -12,10 +9,10 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- configurable number of cores, warps, and threads.
- configurable number of ALU, FPU, LSU, and SFU units per core.
- configurable pipeline issue width.
- optional shared memory, L1, L2, and L3 caches.
- Software:
- optional local memory, L1, L2, and L3 caches.
- Software:
- OpenCL 1.2 Support.
- Supported FPGAs:
- Supported FPGAs:
- Altera Arria 10
- Altera Stratix 10
- Xilinx Alveo U50, U250, U280
@ -33,8 +30,9 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- `miscs`: Miscellaneous resources.
## Build Instructions
More detailed build instructions can be found [here](docs/install_vortex.md).
### Supported OS Platforms
- Ubuntu 18.04
- Ubuntu 18.04, 20.04
- Centos 7
### Toolchain Dependencies
- [POCL](http://portablecl.org/)
@ -46,19 +44,64 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- [Ramulator](https://github.com/CMU-SAFARI/ramulator.git)
- [Yosys](https://github.com/YosysHQ/yosys)
- [Sv2v](https://github.com/zachjs/sv2v)
### Install development tools
$ sudo apt-get install build-essential
$ sudo apt-get install git
### Install development tools
```sh
sudo apt-get install build-essential
sudo apt-get install binutils
sudo apt-get install python
sudo apt-get install uuid-dev
sudo apt-get install git
```
### Install Vortex codebase
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git
$ cd Vortex
```sh
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
```
### Configure your build folder
```sh
mkdir build
cd build
../configure --xlen=32 --tooldir=$HOME/tools
```
### Install prebuilt toolchain
By default, the toolchain will install to /opt folder.
You can install the toolchain to a different directory by overriding TOOLDIR (e.g. export TOOLDIR=$HOME/tools).
$ ./ci/toolchain_install.sh --all
$ source ./ci/toolchain_env.sh
### Build Vortex sources
$ make -s
```sh
./ci/toolchain_install.sh --all
```
### Set environment variables
```sh
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
```
### Building Vortex
```sh
make -s
```
### Quick demo running vecadd OpenCL kernel on 2 cores
$ ./ci/blackbox.sh --cores=2 --app=vecadd
```sh
./ci/blackbox.sh --cores=2 --app=vecadd
```
### Common Developer Tips
- Installing Vortex kernel and runtime libraries to use with external tools requires passing --prefix=<install-path> to the configure script.
```sh
../configure --xlen=32 --tooldir=$HOME/tools --prefix=<install-path>
make -s
make install
```
- Building Vortex 64-bit simply requires using --xlen=64 configure option.
```sh
../configure --xlen=32 --tooldir=$HOME/tools
```
- Sourcing "./ci/toolchain_env.sh" is required everytime you start a new terminal. we recommend adding "source <build-path>/ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login.
```sh
echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder.
```sh
../configure
```
- To debug the GPU, you can generate a "run.log" trace. see /docs/debugging.md for more information.
```sh
./ci/blackbox.sh --app=demo --debug=3
```
- For additional information, check out the /docs.

View file

@ -1,4 +0,0 @@
Release Notes!
* 07/01/2020 - LKG FPGA build - Passed basic, demo, vecadd kernels.

23
TODO
View file

@ -1,23 +0,0 @@
Functionality:
1) vx_cl_warpSpawn()
-> To be used by pocl->ops->run
2) newlib Integration (LoadFile(""))
-> To be used by the Rhinio benchmarks
3) POCL OPS Vortex Suite
Performance:
1) Icache doesn't need SEND_MEM_REQUEST Stage
-> Blocks are never dirty, so why not evict right away
2) Branch not taken speculation
3) Runtime -02 not running on RTL, and -03 not running on RTL and Emulator
Vector:
1) Cycle accurate simulator (would require Cache Simulator)

View file

@ -1,12 +1,12 @@
#!/bin/sh
# Copyright © 2019-2023
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -23,14 +23,14 @@ show_help()
{
show_usage
echo " where"
echo "--driver: simx, rtlsim, oape, xrt"
echo "--driver: gpu, simx, rtlsim, oape, xrt"
echo "--app: any subfolder test under regression or opencl"
echo "--class: 0=disable, 1=pipeline, 2=memsys"
echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
}
SCRIPT_DIR=$(dirname "$0")
VORTEX_HOME=$SCRIPT_DIR/..
ROOT_DIR=$SCRIPT_DIR/..
DRIVER=simx
APP=sgemm
@ -91,12 +91,12 @@ case $i in
;;
--scope)
SCOPE=1
CORES=1
CORES=1
shift
;;
--perf=*)
PERF_FLAG=-DPERF_ENABLE
PERF_CLASS=${i#*=}
PERF_CLASS=${i#*=}
shift
;;
--args=*)
@ -117,8 +117,8 @@ case $i in
exit 0
;;
*)
show_usage
exit -1
show_usage
exit -1
;;
esac
done
@ -130,17 +130,20 @@ then
fi
case $DRIVER in
gpu)
DRIVER_PATH=
;;
simx)
DRIVER_PATH=$VORTEX_HOME/runtime/simx
DRIVER_PATH=$ROOT_DIR/runtime/simx
;;
rtlsim)
DRIVER_PATH=$VORTEX_HOME/runtime/rtlsim
DRIVER_PATH=$ROOT_DIR/runtime/rtlsim
;;
opae)
DRIVER_PATH=$VORTEX_HOME/runtime/opae
DRIVER_PATH=$ROOT_DIR/runtime/opae
;;
xrt)
DRIVER_PATH=$VORTEX_HOME/runtime/xrt
DRIVER_PATH=$ROOT_DIR/runtime/xrt
;;
*)
echo "invalid driver: $DRIVER"
@ -148,49 +151,66 @@ case $DRIVER in
;;
esac
if [ -d "$VORTEX_HOME/tests/opencl/$APP" ];
if [ -d "$ROOT_DIR/tests/opencl/$APP" ];
then
APP_PATH=$VORTEX_HOME/tests/opencl/$APP
elif [ -d "$VORTEX_HOME/tests/regression/$APP" ];
APP_PATH=$ROOT_DIR/tests/opencl/$APP
elif [ -d "$ROOT_DIR/tests/regression/$APP" ];
then
APP_PATH=$VORTEX_HOME/tests/regression/$APP
APP_PATH=$ROOT_DIR/tests/regression/$APP
else
echo "Application folder not found: $APP"
exit -1
fi
if [ "$DRIVER" = "gpu" ];
then
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
fi
exit $status
fi
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS"
if [ $REBUILD -ne 0 ]
if [ $REBUILD -ne 0 ]
then
BLACKBOX_CACHE=blackbox.$DRIVER.cache
BLACKBOX_CACHE=blackbox.$DRIVER.cache
if [ -f "$BLACKBOX_CACHE" ]
then
then
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
fi
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
then
make -C $DRIVER_PATH clean > /dev/null
make -C $DRIVER_PATH clean-driver > /dev/null
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
fi
fi
# export performance monitor class identifier
export PERF_CLASS=$PERF_CLASS
export VORTEX_PROFILING=$PERF_CLASS
status=0
# ensure config update
make -C $VORTEX_HOME/hw config > /dev/null
make -C $ROOT_DIR/hw config > /dev/null
# ensure the stub driver is present
make -C $VORTEX_HOME/runtime/stub > /dev/null
make -C $ROOT_DIR/runtime/stub > /dev/null
if [ $DEBUG -ne 0 ]
then
then
# running application
if [ $TEMPBUILD -eq 1 ]
then
@ -212,11 +232,11 @@ then
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
@ -237,26 +257,26 @@ then
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
fi
if [ -f "$APP_PATH/trace.vcd" ]
then
then
mv -f $APP_PATH/trace.vcd .
fi
else
else
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
@ -266,7 +286,7 @@ else
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
@ -282,7 +302,7 @@ else
# cleanup temp directory
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then

41
ci/datagen.py Executable file
View file

@ -0,0 +1,41 @@
#!/usr/bin/env python3
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import struct
import random
import sys
def create_binary_file(n, filename):
# Open the file in binary write mode
with open(filename, 'wb') as f:
# Write the integer N as 4 bytes
f.write(struct.pack('i', n))
# Generate and write N floating-point numbers
for _ in range(n):
# Generate a random float between 0 and 1
num = random.random()
# Write the float in IEEE 754 format (4 bytes)
f.write(struct.pack('f', num))
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: script.py N filename")
sys.exit(1)
n = int(sys.argv[1])
filename = sys.argv[2]
create_binary_file(n, filename)
print(f"Created binary file '{filename}' containing {n} floats.")

View file

@ -1,322 +0,0 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# exit when any command fails
set -e
# clear blackbox cache
rm -f blackbox.*.cache
unittest()
{
make -C tests/unittest run
make -C hw/unittest
}
isa()
{
echo "begin isa tests..."
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
CONFIGS="-DDPI_DISABLE" make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-32f
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64d || true
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
make -C tests/riscv/isa run-rtlsim-64fx
fi
make -C sim/rtlsim clean && make -C sim/rtlsim
echo "isa tests done!"
}
regression()
{
echo "begin regression tests..."
make -C tests/kernel run-simx
make -C tests/kernel run-rtlsim
make -C tests/regression run-simx
make -C tests/regression run-rtlsim
# test FPU hardware implementations
CONFIGS="-DFPU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t19"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t19"
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t20" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t20" --cores=2
# test FPU core
echo "regression tests done!"
}
opencl()
{
echo "begin opencl tests..."
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
echo "opencl tests done!"
}
cluster()
{
echo "begin clustering tests..."
# warp/threads configurations
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
echo "clustering tests done!"
}
debug()
{
echo "begin debugging tests..."
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
diff trace_rtlsim.csv trace_simx.csv
make -C sim/simx clean && make -C sim/simx
make -C sim/rtlsim clean && make -C sim/rtlsim
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
echo "debugging tests done!"
}
config()
{
echo "begin configuration tests..."
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# issue width
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
# dispatch size
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
# FPU scaling
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
# custom program startup address
make -C tests/regression/dogfood clean-all
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-all
make -C tests/regression/dogfood
# disabling M extension
CONFIGS="-DEXT_M_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
# disabling F extension
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf=1
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf=1
# disable shared memory
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf=1
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf=1
# disable L1 cache
CONFIGS="-DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
# multiple L1 caches per cluster
CONFIGS="-DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --cores=8 --warps=1 --threads=2
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
# adjust l1 block size to match l2
CONFIGS="-DL1_LINE_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
# test cache banking
CONFIGS="-DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemm
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
echo "configuration tests done!"
}
stress0()
{
echo "begin stress0 tests..."
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
echo "stress0 tests done!"
}
stress1()
{
echo "begin stress1 tests..."
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
echo "stress1 tests done!"
}
synthesis()
{
echo "begin synthesis tests..."
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
echo "synthesis tests done!"
}
show_usage()
{
echo "Vortex Regression Test"
echo "Usage: $0 [--unittest] [--isa] [--regression] [--opencl] [--cluster] [--debug] [--config] [--stress[#n]] [--synthesis] [--all] [--h|--help]"
}
start=$SECONDS
while [ "$1" != "" ]; do
case $1 in
--unittest ) unittest
;;
--isa ) isa
;;
--regression ) regression
;;
--opencl ) opencl
;;
--cluster ) cluster
;;
--debug ) debug
;;
--config ) config
;;
--stress0 ) stress0
;;
--stress1 ) stress1
;;
--stress ) stress0
stress1
;;
--synthesis ) synthesis
;;
--all ) unittest
isa
regression
opencl
cluster
debug
config
stress0
stress1
synthesis
;;
-h | --help ) show_usage
exit
;;
* ) show_usage
exit 1
esac
shift
done
echo "Regression completed!"
duration=$(( SECONDS - start ))
awk -v t=$duration 'BEGIN{t=int(t*1000); printf "Elapsed Time: %d:%02d:%02d\n", t/3600000, t/60000%60, t/1000%60}'

417
ci/regression.sh.in Executable file
View file

@ -0,0 +1,417 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# exit when any command fails
set -e
# clear blackbox cache
rm -f blackbox.*.cache
XLEN=${XLEN:=@XLEN@}
XSIZE=$((XLEN / 8))
echo "Vortex Regression Test: XLEN=$XLEN"
unittest()
{
make -C tests/unittest run
make -C hw/unittest > /dev/null
}
isa()
{
echo "begin isa tests..."
make -C sim/simx
make -C sim/rtlsim
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64fx
fi
# clean build
make -C sim/rtlsim clean
echo "isa tests done!"
}
kernel()
{
echo "begin kernel tests..."
make -C sim/simx
make -C sim/rtlsim
make -C tests/kernel run-simx
make -C tests/kernel run-rtlsim
echo "kernel tests done!"
}
regression()
{
echo "begin regression tests..."
make -C runtime/simx
make -C runtime/rtlsim
make -C tests/regression run-simx
make -C tests/regression run-rtlsim
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar"
echo "regression tests done!"
}
opencl()
{
echo "begin opencl tests..."
make -C runtime/simx
make -C runtime/rtlsim
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
./ci/blackbox.sh --driver=simx --app=lbm --warps=8
./ci/blackbox.sh --driver=rtlsim --app=lbm --warps=8
echo "opencl tests done!"
}
cache()
{
echo "begin cache tests..."
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test writeback
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
# cache clustering
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2
# L2/L3
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=diverge --args="-n1"
echo "begin cache tests..."
}
config1()
{
echo "begin configuration-1 tests..."
# warp/threads
./ci/blackbox.sh --driver=rtlsim --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --app=diverge --args="-n1"
# issue width
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=4" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=4" ./ci/blackbox.sh --driver=simx --app=diverge
# ALU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=4 -DNUM_ALU_BLOCK=4 -DNUM_ALU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2 -DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=4 -DNUM_ALU_BLOCK=4 -DNUM_ALU_LANES=4" ./ci/blackbox.sh --driver=simx --app=diverge
# FPU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=2 -DNUM_FPU_BLOCK=1 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_FPU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# FPU's PE scaling
CONFIGS="-DFMA_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfmadd"
CONFIGS="-DFCVT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tftoi"
CONFIGS="-DFDIV_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfdiv"
CONFIGS="-DFSQRT_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfsqrt"
CONFIGS="-DFNCP_PE_RATIO=2" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-tfclamp"
# LSU scaling
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=vecaddx
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
echo "configuration-1 tests done!"
}
config2()
{
echo "begin configuration-2 tests..."
# test opaesim
./ci/blackbox.sh --driver=opae --app=printf
./ci/blackbox.sh --driver=opae --app=diverge
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# custom program startup address
make -C tests/regression/dogfood clean-kernel
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
./ci/blackbox.sh --driver=simx --app=dogfood
./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-kernel
# disabling M & F extensions
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean
# disabling ZICOND extension
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
# test XLEN-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress
# test memory coalescing
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress
echo "configuration-2 tests done!"
}
test_csv_trace()
{
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
diff trace_rtlsim.csv trace_simx.csv
# clean build
make -C sim/simx clean
make -C sim/rtlsim clean
}
debug()
{
echo "begin debugging tests..."
test_csv_trace
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
echo "debugging tests done!"
}
stress()
{
echo "begin stress tests..."
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1 -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
echo "stress tests done!"
}
synthesis()
{
echo "begin synthesis tests..."
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis
echo "synthesis tests done!"
}
show_usage()
{
echo "Vortex Regression Test"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
}
declare -a tests=()
clean=0
while [ "$1" != "" ]; do
case $1 in
--clean )
clean=1
;;
--unittest )
tests+=("unittest")
;;
--isa )
tests+=("isa")
;;
--kernel )
tests+=("kernel")
;;
--regression )
tests+=("regression")
;;
--opencl )
tests+=("opencl")
;;
--cache )
tests+=("cache")
;;
--config1 )
tests+=("config1")
;;
--config2 )
tests+=("config2")
;;
--debug )
tests+=("debug")
;;
--stress )
tests+=("stress")
;;
--synthesis )
tests+=("synthesis")
;;
--all )
tests=()
tests+=("unittest")
tests+=("isa")
tests+=("kernel")
tests+=("regression")
tests+=("opencl")
tests+=("cache")
tests+=("config1")
tests+=("config2")
tests+=("debug")
tests+=("stress")
tests+=("synthesis")
;;
-h | --help )
show_usage
exit
;;
* )
show_usage
exit 1
esac
shift
done
if [ $clean -eq 1 ];
then
make clean
make -s
fi
start=$SECONDS
for test in "${tests[@]}"; do
$test
done
echo "Regression completed!"
duration=$(( SECONDS - start ))
awk -v t=$duration 'BEGIN{t=int(t*1000); printf "Elapsed Time: %d:%02d:%02d\n", t/3600000, t/60000%60, t/1000%60}'

27
ci/system_updates.sh Executable file
View file

@ -0,0 +1,27 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
apt-get update -y
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update
apt-get install -y g++-11 gcc-11
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 100
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 100
apt-get install -y build-essential valgrind libstdc++6 binutils python uuid-dev ccache

13
ci/toolchain_env.sh → ci/toolchain_env.sh.in Normal file → Executable file
View file

@ -1,23 +1,22 @@
#!/bin/sh
# Copyright 2023 blaise
#
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
TOOLDIR=${TOOLDIR:=/opt}
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
export VERILATOR_ROOT=$TOOLDIR/verilator
export PATH=$VERILATOR_ROOT/bin:$PATH
export PATH=$TOOLDIR/verilator/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v
export PATH=$SV2V_PATH/bin:$PATH

View file

@ -1,184 +0,0 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# exit when any command fails
set -e
REPOSITORY=https://github.com/vortexgpgpu/vortex-toolchain-prebuilt/raw/master
TOOLDIR=${TOOLDIR:=/opt}
OSDIR=${OSDIR:=ubuntu/bionic}
OS="${OS:=ubuntu/bionic}"
riscv()
{
case $OSDIR in
"centos/7") parts=$(eval echo {a..h}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv-gnu-toolchain.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/riscv-gnu-toolchain/$OSDIR/riscv-gnu-toolchain.tar.bz2.parta$x
done
cat riscv-gnu-toolchain.tar.bz2.parta* > riscv-gnu-toolchain.tar.bz2
tar -xvf riscv-gnu-toolchain.tar.bz2
cp -r riscv-gnu-toolchain $TOOLDIR
rm -f riscv-gnu-toolchain.tar.bz2*
rm -rf riscv-gnu-toolchain
}
riscv64()
{
case $OSDIR in
"centos/7") parts=$(eval echo {a..h}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv64-gnu-toolchain.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/riscv64-gnu-toolchain/$OSDIR/riscv64-gnu-toolchain.tar.bz2.parta$x
done
cat riscv64-gnu-toolchain.tar.bz2.parta* > riscv64-gnu-toolchain.tar.bz2
tar -xvf riscv64-gnu-toolchain.tar.bz2
cp -r riscv64-gnu-toolchain $TOOLDIR
rm -f riscv64-gnu-toolchain.tar.bz2*
rm -rf riscv64-gnu-toolchain
}
llvm-vortex()
{
case $OSDIR in
"centos/7") parts=$(eval echo {a..b}) ;;
*) parts=$(eval echo {a..b}) ;;
esac
echo $parts
rm -f llvm-vortex.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/llvm-vortex/$OSDIR/llvm-vortex.tar.bz2.parta$x
done
cat llvm-vortex.tar.bz2.parta* > llvm-vortex.tar.bz2
tar -xvf llvm-vortex.tar.bz2
cp -r llvm-vortex $TOOLDIR
rm -f llvm-vortex.tar.bz2*
rm -rf llvm-vortex
}
llvm-pocl()
{
case $OSDIR in
"centos/7") parts=$(eval echo {a..b}) ;;
*) parts=$(eval echo {a..b}) ;;
esac
echo $parts
rm -f llvm-pocl.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/llvm-pocl/$OSDIR/llvm-pocl.tar.bz2.parta$x
done
cat llvm-pocl.tar.bz2.parta* > llvm-pocl.tar.bz2
tar -xvf llvm-pocl.tar.bz2
cp -r llvm-pocl $TOOLDIR
rm -f llvm-pocl.tar.bz2*
rm -rf llvm-pocl
}
pocl()
{
wget $REPOSITORY/pocl/$OSDIR/pocl.tar.bz2
tar -xvf pocl.tar.bz2
rm -f pocl.tar.bz2
cp -r pocl $TOOLDIR
rm -rf pocl
}
verilator()
{
wget $REPOSITORY/verilator/$OSDIR/verilator.tar.bz2
tar -xvf verilator.tar.bz2
cp -r verilator $TOOLDIR
rm -f verilator.tar.bz2
rm -rf verilator
}
sv2v()
{
wget $REPOSITORY/sv2v/$OSDIR/sv2v.tar.bz2
tar -xvf sv2v.tar.bz2
rm -f sv2v.tar.bz2
cp -r sv2v $TOOLDIR
rm -rf sv2v
}
yosys()
{
case $OSDIR in
"centos/7") parts=$(eval echo {a..c}) ;;
*) parts=$(eval echo {a..c}) ;;
esac
echo $parts
rm -f yosys.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/yosys/$OSDIR/yosys.tar.bz2.parta$x
done
cat yosys.tar.bz2.parta* > yosys.tar.bz2
tar -xvf yosys.tar.bz2
cp -r yosys $TOOLDIR
rm -f yosys.tar.bz2*
rm -rf yosys
}
show_usage()
{
echo "Install Pre-built Vortex Toolchain"
echo "Usage: $0 [[--riscv] [--riscv64] [--llvm-vortex] [--llvm-pocl] [--pocl] [--verilator] [--sv2v] [--yosys] [--all] [-h|--help]]"
}
while [ "$1" != "" ]; do
case $1 in
--pocl ) pocl
;;
--verilator ) verilator
;;
--riscv ) riscv
;;
--riscv64 ) riscv64
;;
--llvm-vortex ) llvm-vortex
;;
--llvm-pocl ) llvm-pocl
;;
--sv2v ) sv2v
;;
--yosys ) yosys
;;
--all ) pocl
verilator
sv2v
yosys
llvm-vortex
riscv
riscv64
;;
-h | --help ) show_usage
exit
;;
* ) show_usage
exit 1
esac
shift
done

199
ci/toolchain_install.sh.in Executable file
View file

@ -0,0 +1,199 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# exit when any command fails
set -e
REPOSITORY=https://github.com/vortexgpgpu/vortex-toolchain-prebuilt/raw/master
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
OSVERSION=${OSVERSION:=@OSVERSION@}
riscv32()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..h}) ;;
"ubuntu/focal") parts=$(eval echo {a..k}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv32-gnu-toolchain.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/riscv32-gnu-toolchain/$OSVERSION/riscv32-gnu-toolchain.tar.bz2.parta$x
done
cat riscv32-gnu-toolchain.tar.bz2.parta* > riscv32-gnu-toolchain.tar.bz2
tar -xvf riscv32-gnu-toolchain.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/riscv32-gnu-toolchain && mv riscv32-gnu-toolchain $TOOLDIR
rm -rf riscv32-gnu-toolchain.tar.bz2*
}
riscv64()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..h}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv64-gnu-toolchain.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/riscv64-gnu-toolchain/$OSVERSION/riscv64-gnu-toolchain.tar.bz2.parta$x
done
cat riscv64-gnu-toolchain.tar.bz2.parta* > riscv64-gnu-toolchain.tar.bz2
tar -xvf riscv64-gnu-toolchain.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/riscv64-gnu-toolchain && mv riscv64-gnu-toolchain $TOOLDIR
rm -rf riscv64-gnu-toolchain riscv64-gnu-toolchain.tar.bz2*
}
llvm()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..b}) ;;
*) parts=$(eval echo {a..b}) ;;
esac
echo $parts
rm -f llvm-vortex2.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/llvm-vortex/$OSVERSION/llvm-vortex2.tar.bz2.parta$x
done
cat llvm-vortex2.tar.bz2.parta* > llvm-vortex2.tar.bz2
tar -xvf llvm-vortex2.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/llvm-vortex && mv llvm-vortex $TOOLDIR
rm -rf llvm-vortex llvm-vortex2.tar.bz2*
}
libcrt32()
{
wget $REPOSITORY/libcrt32/libcrt32.tar.bz2
tar -xvf libcrt32.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/libcrt32 && mv libcrt32 $TOOLDIR
rm -rf libcrt32 libcrt32.tar.bz2
}
libcrt64()
{
wget $REPOSITORY/libcrt64/libcrt64.tar.bz2
tar -xvf libcrt64.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/libcrt64 && mv libcrt64 $TOOLDIR
rm -rf libcrt64 libcrt64.tar.bz2
}
libc32()
{
wget $REPOSITORY/libc32/libc32.tar.bz2
tar -xvf libc32.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/libc32 && mv libc32 $TOOLDIR
rm -rf libc32 libc32.tar.bz2
}
libc64()
{
wget $REPOSITORY/libc64/libc64.tar.bz2
tar -xvf libc64.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/libc64 && mv libc64 $TOOLDIR
rm -rf libc64 libc64.tar.bz2
}
pocl()
{
wget $REPOSITORY/pocl/$OSVERSION/pocl2.tar.bz2
tar -xvf pocl2.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/pocl && mv pocl $TOOLDIR
rm -rf pocl2 pocl2.tar.bz2
}
verilator()
{
wget $REPOSITORY/verilator/$OSVERSION/verilator.tar.bz2
tar -xvf verilator.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/verilator && mv verilator $TOOLDIR
rm -rf verilator verilator.tar.bz2
}
sv2v()
{
wget $REPOSITORY/sv2v/$OSVERSION/sv2v.tar.bz2
tar -xvf sv2v.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/sv2v && mv sv2v $TOOLDIR
rm -rf sv2v sv2v.tar.bz2
}
yosys()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..c}) ;;
*) parts=$(eval echo {a..c}) ;;
esac
echo $parts
rm -f yosys.tar.bz2.parta*
for x in $parts
do
wget $REPOSITORY/yosys/$OSVERSION/yosys.tar.bz2.parta$x
done
cat yosys.tar.bz2.parta* > yosys.tar.bz2
tar -xvf yosys.tar.bz2
mkdir -p $TOOLDIR && rm -rf $TOOLDIR/yosys && mv yosys $TOOLDIR
rm -rf yosys yosys.tar.bz2* yosys
}
show_usage()
{
echo "Install Pre-built Vortex Toolchain"
echo "Usage: $0 [--pocl] [--verilator] [--riscv32] [--riscv64] [--llvm] [--libcrt32] [--libcrt64] [--libc32] [--libc64] [--sv2v] [--yosys] [--all] [-h|--help]"
}
while [ "$1" != "" ]; do
case $1 in
--pocl ) pocl
;;
--verilator ) verilator
;;
--riscv32 ) riscv32
;;
--riscv64 ) riscv64
;;
--llvm ) llvm
;;
--libcrt32 ) libcrt32
;;
--libcrt64 ) libcrt64
;;
--libc32 ) libc32
;;
--libc64 ) libc64
;;
--sv2v ) sv2v
;;
--yosys ) yosys
;;
--all ) pocl
verilator
llvm
libcrt32
libcrt64
libc32
libc64
riscv32
riscv64
sv2v
yosys
;;
-h | --help ) show_usage
exit
;;
* ) show_usage
exit 1
esac
shift
done

View file

@ -1,128 +0,0 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# exit when any command fails
set -e
TOOLDIR=${TOOLDIR:=/opt}
OSDIR=${OSDIR:=ubuntu/bionic}
riscv()
{
echo "prebuilt riscv-gnu-toolchain..."
tar -C $TOOLDIR -cvjf riscv-gnu-toolchain.tar.bz2 riscv-gnu-toolchain
split -b 50M riscv-gnu-toolchain.tar.bz2 "riscv-gnu-toolchain.tar.bz2.part"
mv riscv-gnu-toolchain.tar.bz2.part* ./riscv-gnu-toolchain/$OSDIR
rm riscv-gnu-toolchain.tar.bz2
}
riscv64()
{
echo "prebuilt riscv64-gnu-toolchain..."
tar -C $TOOLDIR -cvjf riscv64-gnu-toolchain.tar.bz2 riscv64-gnu-toolchain
split -b 50M riscv64-gnu-toolchain.tar.bz2 "riscv64-gnu-toolchain.tar.bz2.part"
mv riscv64-gnu-toolchain.tar.bz2.part* ./riscv64-gnu-toolchain/$OSDIR
rm riscv64-gnu-toolchain.tar.bz2
}
llvm-vortex()
{
echo "prebuilt llvm-vortex..."
tar -C $TOOLDIR -cvjf llvm-vortex.tar.bz2 llvm-vortex
split -b 50M llvm-vortex.tar.bz2 "llvm-vortex.tar.bz2.part"
mv llvm-vortex.tar.bz2.part* ./llvm-vortex/$OSDIR
rm llvm-vortex.tar.bz2
}
llvm-pocl()
{
echo "prebuilt llvm-pocl..."
tar -C $TOOLDIR -cvjf llvm-pocl.tar.bz2 llvm-pocl
split -b 50M llvm-pocl.tar.bz2 "llvm-pocl.tar.bz2.part"
mv llvm-pocl.tar.bz2.part* ./llvm-pocl/$OSDIR
rm llvm-pocl.tar.bz2
}
pocl()
{
echo "prebuilt pocl..."
tar -C $TOOLDIR -cvjf pocl.tar.bz2 pocl
mv pocl.tar.bz2 ./pocl/$OSDIR
}
verilator()
{
echo "prebuilt verilator..."
tar -C $TOOLDIR -cvjf verilator.tar.bz2 verilator
mv verilator.tar.bz2 ./verilator/$OSDIR
}
sv2v()
{
echo "prebuilt sv2v..."
tar -C $TOOLDIR -cvjf sv2v.tar.bz2 sv2v
mv sv2v.tar.bz2 ./sv2v/$OSDIR
}
yosys()
{
echo "prebuilt yosys..."
tar -C $TOOLDIR -cvjf yosys.tar.bz2 yosys
split -b 50M yosys.tar.bz2 "yosys.tar.bz2.part"
mv yosys.tar.bz2.part* ./yosys/$OSDIR
rm yosys.tar.bz2
}
show_usage()
{
echo "Setup Pre-built Vortex Toolchain"
echo "Usage: $0 [[--riscv] [--llvm-vortex] [--llvm-pocl] [--pocl] [--verilator] [--sv2v] [-yosys] [--all] [-h|--help]]"
}
while [ "$1" != "" ]; do
case $1 in
--pocl ) pocl
;;
--verilator ) verilator
;;
--riscv ) riscv
;;
--riscv64 ) riscv64
;;
--llvm-vortex ) llvm-vortex
;;
--llvm-pocl ) llvm-pocl
;;
--sv2v ) sv2v
;;
--yosys ) yosys
;;
--all ) riscv
riscv64
llvm-vortex
llvm-pocl
pocl
verilator
sv2v
yosys
;;
-h | --help ) show_usage
exit
;;
* ) show_usage
exit 1
esac
shift
done

167
ci/toolchain_prebuilt.sh.in Executable file
View file

@ -0,0 +1,167 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# exit when any command fails
set -e
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
OSVERSION=${OSVERSION:=@OSVERSION@}
riscv32()
{
echo "prebuilt riscv32-gnu-toolchain..."
tar -C $TOOLDIR -cvjf riscv32-gnu-toolchain.tar.bz2 riscv32-gnu-toolchain
split -b 50M riscv32-gnu-toolchain.tar.bz2 "riscv32-gnu-toolchain.tar.bz2.part"
mkdir -p ./riscv32-gnu-toolchain/$OSVERSION
mv riscv32-gnu-toolchain.tar.bz2.part* ./riscv32-gnu-toolchain/$OSVERSION
rm riscv32-gnu-toolchain.tar.bz2
}
riscv64()
{
echo "prebuilt riscv64-gnu-toolchain..."
tar -C $TOOLDIR -cvjf riscv64-gnu-toolchain.tar.bz2 riscv64-gnu-toolchain
split -b 50M riscv64-gnu-toolchain.tar.bz2 "riscv64-gnu-toolchain.tar.bz2.part"
mkdir -p ./riscv64-gnu-toolchain/$OSVERSION
mv riscv64-gnu-toolchain.tar.bz2.part* ./riscv64-gnu-toolchain/$OSVERSION
rm riscv64-gnu-toolchain.tar.bz2
}
llvm()
{
echo "prebuilt llvm-vortex2..."
tar -C $TOOLDIR -cvjf llvm-vortex2.tar.bz2 llvm-vortex
split -b 50M llvm-vortex2.tar.bz2 "llvm-vortex2.tar.bz2.part"
mkdir -p ./llvm-vortex/$OSVERSION
mv llvm-vortex2.tar.bz2.part* ./llvm-vortex/$OSVERSION
rm llvm-vortex2.tar.bz2
}
libcrt32()
{
echo "prebuilt libcrt32..."
tar -C $TOOLDIR -cvjf libcrt32.tar.bz2 libcrt32
mkdir -p ./libcrt32
mv libcrt32.tar.bz2 ./libcrt32
}
libcrt64()
{
echo "prebuilt libcrt64..."
tar -C $TOOLDIR -cvjf libcrt64.tar.bz2 libcrt64
mkdir -p ./libcrt64
mv libcrt64.tar.bz2 ./libcrt64
}
libc32()
{
echo "prebuilt libc32..."
tar -C $TOOLDIR -cvjf libc32.tar.bz2 libc32
mkdir -p ./libc32
mv libc32.tar.bz2 ./libc32
}
libc64()
{
echo "prebuilt libc64..."
tar -C $TOOLDIR -cvjf libc64.tar.bz2 libc64
mkdir -p ./libc64
mv libc64.tar.bz2 ./libc64
}
pocl()
{
echo "prebuilt pocl..."
tar -C $TOOLDIR -cvjf pocl2.tar.bz2 pocl
mkdir -p ./pocl/$OSVERSION
mv pocl2.tar.bz2 ./pocl/$OSVERSION
}
verilator()
{
echo "prebuilt verilator..."
tar -C $TOOLDIR -cvjf verilator.tar.bz2 verilator
mkdir -p ./verilator/$OSVERSION
mv verilator.tar.bz2 ./verilator/$OSVERSION
}
sv2v()
{
echo "prebuilt sv2v..."
tar -C $TOOLDIR -cvjf sv2v.tar.bz2 sv2v
mkdir -p ./sv2v/$OSVERSION
mv sv2v.tar.bz2 ./sv2v/$OSVERSION
}
yosys()
{
echo "prebuilt yosys..."
tar -C $TOOLDIR -cvjf yosys.tar.bz2 yosys
split -b 50M yosys.tar.bz2 "yosys.tar.bz2.part"
mkdir -p ./yosys/$OSVERSION
mv yosys.tar.bz2.part* ./yosys/$OSVERSION
rm yosys.tar.bz2
}
show_usage()
{
echo "Setup Pre-built Vortex Toolchain"
echo "Usage: $0 [--pocl] [--verilator] [--riscv32] [--riscv64] [--llvm] [--libcrt32] [--libcrt64] [--libc32] [--libc64] [--sv2v] [-yosys] [--all] [-h|--help]"
}
while [ "$1" != "" ]; do
case $1 in
--pocl ) pocl
;;
--verilator ) verilator
;;
--riscv32 ) riscv32
;;
--riscv64 ) riscv64
;;
--llvm ) llvm
;;
--libcrt32 ) libcrt32
;;
--libcrt64 ) libcrt64
;;
--libc32 ) libc32
;;
--libc64 ) libc64
;;
--sv2v ) sv2v
;;
--yosys ) yosys
;;
--all ) pocl
verilator
riscv32
riscv64
llvm
libcrt32
libcrt64
libc32
libc64
sv2v
yosys
;;
-h | --help ) show_usage
exit
;;
* ) show_usage
exit 1
esac
shift
done

View file

@ -1,12 +1,12 @@
#!/usr/bin/env python3
# Copyright © 2019-2023
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,7 +17,10 @@ import sys
import argparse
import csv
import re
import inspect
configs = None
def parse_args():
parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.')
parser.add_argument('-t', '--type', default='simx', help='log type (rtlsim or simx)')
@ -25,7 +28,25 @@ def parse_args():
parser.add_argument('log', help='Input log file')
return parser.parse_args()
def parse_simx(log_filename):
def load_config(filename):
config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=0x([0-9a-fA-F]+), num_barriers=(\d+)"
with open(filename, 'r') as file:
for line in file:
config_match = re.search(config_pattern, line)
if config_match:
config = {
'num_threads': int(config_match.group(1)),
'num_warps': int(config_match.group(2)),
'num_cores': int(config_match.group(3)),
'num_clusters': int(config_match.group(4)),
'socket_size': int(config_match.group(5)),
'local_mem_base': int(config_match.group(6), 16),
'num_barriers': int(config_match.group(7)),
}
return config
return None
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"Instr (0x[0-9a-fA-F]+):"
opcode_pattern = r"Instr 0x[0-9a-fA-F]+: ([0-9a-zA-Z_\.]+)"
@ -36,19 +57,19 @@ def parse_simx(log_filename):
destination_pattern = r"Dest Reg: (.+)"
uuid_pattern = r"#(\d+)"
entries = []
with open(log_filename, 'r') as log_file:
instr_data = None
for lineno, line in enumerate(log_file, start=1):
instr_data = None
for lineno, line in enumerate(log_lines, start=1):
try:
if line.startswith("DEBUG Fetch:"):
if instr_data:
entries.append(instr_data)
instr_data = {}
instr_data = {}
instr_data["lineno"] = lineno
instr_data["PC"] = re.search(pc_pattern, line).group(1)
instr_data["core_id"] = re.search(core_id_pattern, line).group(1)
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1)
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
instr_data["uuid"] = re.search(uuid_pattern, line).group(1)
instr_data["core_id"] = int(re.search(core_id_pattern, line).group(1))
instr_data["warp_id"] = int(re.search(warp_id_pattern, line).group(1))
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
instr_data["uuid"] = int(re.search(uuid_pattern, line).group(1))
elif line.startswith("DEBUG Instr"):
instr_data["instr"] = re.search(instr_pattern, line).group(1)
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
@ -57,16 +78,19 @@ def parse_simx(log_filename):
instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg
elif line.startswith("DEBUG Dest"):
instr_data["destination"] = re.search(destination_pattern, line).group(1)
if instr_data:
entries.append(instr_data)
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
instr_data = None
if instr_data:
entries.append(instr_data)
return entries
def reverse_binary(bin_str):
return bin_str[::-1]
def bin_to_array(bin_str):
return [int(bit) for bit in bin_str]
def append_reg(text, value, sep):
if sep:
text += ", "
@ -77,14 +101,7 @@ def append_reg(text, value, sep):
text += "x" + value
sep = True
return text, sep
def append_imm(text, value, sep):
if sep:
text += ", "
text += value
sep = True
return text, sep
def append_value(text, reg, value, tmask_arr, sep):
text, sep = append_reg(text, reg, sep)
text += "={"
@ -97,9 +114,10 @@ def append_value(text, reg, value, tmask_arr, sep):
text +="-"
text += "}"
return text, sep
def parse_rtlsim(log_filename):
line_pattern = r"\d+: core(\d+)-(decode|issue|commit)"
def parse_rtlsim(log_lines):
global configs
line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)"
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"instr=(0x[0-9a-fA-F]+)"
ex_pattern = r"ex=([a-zA-Z]+)"
@ -108,8 +126,6 @@ def parse_rtlsim(log_filename):
tmask_pattern = r"tmask=(\d+)"
wb_pattern = r"wb=(\d)"
opds_pattern = r"opds=(\d+)"
use_imm_pattern = r"use_imm=(\d)"
imm_pattern = r"imm=(0x[0-9a-fA-F]+)"
rd_pattern = r"rd=(\d+)"
rs1_pattern = r"rs1=(\d+)"
rs2_pattern = r"rs2=(\d+)"
@ -120,24 +136,29 @@ def parse_rtlsim(log_filename):
rd_data_pattern = r"data=\{(.+?)\}"
eop_pattern = r"eop=(\d)"
uuid_pattern = r"#(\d+)"
entries = []
with open(log_filename, 'r') as log_file:
instr_data = {}
for lineno, line in enumerate(log_file, start=1):
entries = []
instr_data = {}
num_cores = configs['num_cores']
socket_size = configs['socket_size']
num_sockets = (num_cores + socket_size - 1) // socket_size
for lineno, line in enumerate(log_lines, start=1):
try:
line_match = re.search(line_pattern, line)
if line_match:
PC = re.search(pc_pattern, line).group(1)
warp_id = re.search(warp_id_pattern, line).group(1)
warp_id = int(re.search(warp_id_pattern, line).group(1))
tmask = re.search(tmask_pattern, line).group(1)
uuid = re.search(uuid_pattern, line).group(1)
core_id = line_match.group(1)
stage = line_match.group(2)
if stage == "decode":
uuid = int(re.search(uuid_pattern, line).group(1))
cluster_id = int(line_match.group(1))
socket_id = int(line_match.group(2))
core_id = int(line_match.group(3))
stage = line_match.group(4)
if stage == "decode":
trace = {}
trace["uuid"] = uuid
trace["PC"] = PC
trace["core_id"] = core_id
trace["warp_id"] = warp_id
trace["PC"] = PC
trace["core_id"] = ((((cluster_id * num_sockets) + socket_id) * socket_size) + core_id)
trace["warp_id"] = warp_id
trace["tmask"] = reverse_binary(tmask)
trace["instr"] = re.search(instr_pattern, line).group(1)
trace["opcode"] = re.search(op_pattern, line).group(1)
@ -146,8 +167,6 @@ def parse_rtlsim(log_filename):
trace["rs1"] = re.search(rs1_pattern, line).group(1)
trace["rs2"] = re.search(rs2_pattern, line).group(1)
trace["rs3"] = re.search(rs3_pattern, line).group(1)
trace["use_imm"] = re.search(use_imm_pattern, line).group(1) == "1"
trace["imm"] = re.search(imm_pattern, line).group(1)
instr_data[uuid] = trace
elif stage == "issue":
if uuid in instr_data:
@ -162,7 +181,7 @@ def parse_rtlsim(log_filename):
trace["rs3_data"] = re.search(rs3_data_pattern, line).group(1).split(', ')[::-1]
trace["issued"] = True
instr_data[uuid] = trace
elif stage == "commit":
elif stage == "commit":
if uuid in instr_data:
trace = instr_data[uuid]
if "issued" in trace:
@ -205,41 +224,65 @@ def parse_rtlsim(log_filename):
del trace["rs1"]
del trace["rs2"]
del trace["rs3"]
del trace["use_imm"]
del trace["imm"]
del trace["issued"]
del trace["issued"]
del instr_data[uuid]
entries.append(trace)
return entries
except Exception as e:
print("Error at line {}: {}".format(lineno, e))
return entries
def write_csv(log_filename, csv_filename, log_type):
entries = None
# parse log file
if log_type == "rtlsim":
entries = parse_rtlsim(log_filename)
elif log_type == "simx":
entries = parse_simx(log_filename)
else:
print('Error: invalid log type')
sys.exit()
# sort entries by uuid
entries.sort(key=lambda x: (int(x['core_id']), int(x['warp_id']), int(x['lineno'])))
for entry in entries:
del entry['lineno']
# write to CSV
def write_csv(sublogs, csv_filename, log_type):
with open(csv_filename, 'w', newline='') as csv_file:
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "operands", "destination"]
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "destination", "operands"]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for entry in entries:
writer.writerow(entry)
for sublog in sublogs:
entries = None
# parse sublog
if log_type == "rtlsim":
entries = parse_rtlsim(sublog)
elif log_type == "simx":
entries = parse_simx(sublog)
else:
print('Error: invalid log type')
sys.exit()
# sort entries by uuid
entries.sort(key=lambda x: (int(x['uuid'])))
for entry in entries:
del entry['lineno']
for entry in entries:
writer.writerow(entry)
def split_log_file(log_filename):
with open(log_filename, 'r') as log_file:
log_lines = log_file.readlines()
sublogs = []
current_sublog = None
for line in log_lines:
if line.startswith("[VXDRV] START"):
if current_sublog is not None:
sublogs.append(current_sublog)
current_sublog = [line]
elif current_sublog is not None:
current_sublog.append(line)
if current_sublog is not None:
sublogs.append(current_sublog)
return sublogs
def main():
global configs
args = parse_args()
write_csv(args.log, args.csv, args.type)
configs = load_config(args.log)
sublogs = split_log_file(args.log)
write_csv(sublogs, args.csv, args.type)
if __name__ == "__main__":
main()

View file

@ -1,12 +1,12 @@
#!/usr/bin/env python
# Copyright © 2019-2023
#
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -18,48 +18,56 @@ import time
import threading
import subprocess
# This script executes a long-running command while outputing "still running ..." periodically
# This script executes a long-running command while printing "still running ..." periodically
# to notify Travis build system that the program has not hanged
PING_INTERVAL=300 # 5 minutes
SLEEP_INTERVAL=1 # 1 second
def monitor(stop):
wait_time = 0
while True:
time.sleep(PING_INTERVAL)
wait_time += PING_INTERVAL
print(" + still running (" + str(wait_time) + "s) ...")
sys.stdout.flush()
if stop():
break
def monitor(stop_event):
wait_time = 0
elapsed_time = 0
while not stop_event.is_set():
time.sleep(SLEEP_INTERVAL)
elapsed_time += SLEEP_INTERVAL
if elapsed_time >= PING_INTERVAL:
wait_time += elapsed_time
print(" + still running (" + str(wait_time) + "s) ...")
sys.stdout.flush()
elapsed_time = 0
def execute(command):
process = subprocess.Popen(command, stdout=subprocess.PIPE)
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while True:
output = process.stdout.readline()
if output:
line = output.decode('ascii').rstrip()
try:
line = output.decode('utf-8').rstrip()
except UnicodeDecodeError:
line = repr(output) # Safely print raw binary data
print(">>> " + line)
process.stdout.flush()
ret = process.poll()
if ret is not None:
return ret
return ret
return -1
def main(argv):
if not argv:
print("Usage: travis_run.py <command>")
sys.exit(1)
# start monitoring thread
stop_monitor = False
t = threading.Thread(target = monitor, args =(lambda : stop_monitor, ))
stop_event = threading.Event()
t = threading.Thread(target=monitor, args=(stop_event,))
t.start()
# execute command
exitcode = execute(argv)
exitcode = execute(argv)
print(" + exitcode="+str(exitcode))
sys.stdout.flush()
# terminate monitoring thread
stop_monitor = True
stop_event.set()
t.join()
sys.exit(exitcode)

37
config.mk.in Normal file
View file

@ -0,0 +1,37 @@
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
VORTEX_HOME ?= @VORTEX_HOME@
XLEN ?= @XLEN@
TOOLDIR ?= @TOOLDIR@
OSVERSION ?= @OSVERSION@
INSTALLDIR ?= @INSTALLDIR@
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LIBC_VORTEX ?= $(TOOLDIR)/libc$(XLEN)
LIBCRT_VORTEX ?= $(TOOLDIR)/libcrt$(XLEN)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv$(XLEN)-gnu-toolchain
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party

174
configure vendored Executable file
View file

@ -0,0 +1,174 @@
#!/bin/bash
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Determine the current working directory
CURRENT_DIR=$(pwd)
# Function to detect current OS
detect_osversion() {
local osversion="unsupported"
if [ -f /etc/os-release ]; then
. /etc/os-release # Source the os-release file to get OS information
case "$ID" in
ubuntu)
case "$VERSION_CODENAME" in
bionic) osversion="ubuntu/bionic";;
focal) osversion="ubuntu/focal";;
# Add new versions as needed
esac
;;
centos)
case "$VERSION_ID" in
7) osversion="centos/7";;
# Add new versions as needed
esac
;;
esac
fi
echo "$osversion"
}
# Function to recursively copy files, skipping the current directory
copy_files() {
local source_dir="$1"
local target_dir="$2"
#echo "source_dir=$source_dir, target_dir=$target_dir"
local same_dir=0
if [ "$(realpath "$source_dir")" == "$(realpath "$target_dir")" ]; then
same_dir=1
fi
# Function to copy and update file
copy_and_update() {
local src_pattern="$1"
local dest_dir="$2"
for file in $src_pattern; do
#echo "*** $file > $dest_dir"
if [ -f "$file" ]; then
if [[ "$file" == *.in ]]; then
filename=$(basename -- "$file")
filename_no_ext="${filename%.in}"
dest_file="$dest_dir/$filename_no_ext"
mkdir -p "$dest_dir"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g" "$file" > "$dest_file"
# apply permissions to bash scripts
read -r firstline < "$dest_file"
if [[ "$firstline" =~ ^#!.*bash ]]; then
chmod +x "$dest_file"
fi
else
if [ $same_dir -eq 0 ]; then
mkdir -p "$dest_dir"
cp -p "$file" "$dest_dir"
fi
fi
fi
done
}
for pattern in "${SUBDIRS[@]}"; do
local full_copy=0
if [[ "$pattern" == !* ]]; then
full_copy=1
pattern=${pattern:1}
fi
local source_pattern="$source_dir/$pattern"
if [[ "$pattern" == "." ]]; then
source_pattern=$source_dir
fi
find "$source_dir" -type d -path "$source_pattern" 2>/dev/null | while read dir; do
# Compute the relative path of the directory
local rel_path="${dir#$source_dir}"
rel_path="${rel_path#/}" # Remove leading slash, if present
local full_target_dir="$target_dir/$rel_path"
# Copy and update Makefile and common.mk if they exist
if [ $full_copy -eq 1 ]; then
copy_and_update "$dir/*" "$full_target_dir"
else
copy_and_update "$dir/Makefile" "$full_target_dir"
copy_and_update "$dir/common.mk" "$full_target_dir"
copy_and_update "$dir/*.in" "$full_target_dir"
fi
done
done
}
###############################################################################
# default configuration parameters
default_xlen=32
default_tooldir=$HOME/tools
default_osversion=$(detect_osversion)
default_prefix=$CURRENT_DIR
# load default configuration parameters from existing config.mk
if [ -f "config.mk" ]; then
while IFS='=' read -r key value; do
value=${value//[@]/} # Remove placeholder characters
value="${value#"${value%%[![:space:]]*}"}" # Remove leading whitespace
value="${value%"${value##*[![:space:]]}"}" # Remove trailing whitespace
case $key in
XLEN\ ?*) default_xlen=${value//\?=/} ;;
TOOLDIR\ ?*) default_tooldir=${value//\?=/} ;;
OSVERSION\ ?*) default_osversion=${value//\?=/} ;;
PREFIX\ ?*) default_prefix=${value//\?=/} ;;
esac
done < config.mk
fi
# set configuration parameters
XLEN=${XLEN:=$default_xlen}
TOOLDIR=${TOOLDIR:=$default_tooldir}
OSVERSION=${OSVERSION:=$default_osversion}
PREFIX=${PREFIX:=$default_prefix}
# parse command line arguments
usage() {
echo "Usage: $0 [--xlen=<value>] [--tooldir=<path>] [--osversion=<version>]"
echo " --xlen=<value> Set the XLEN value (default: 32)"
echo " --tooldir=<path> Set the TOOLDIR path (default: $HOME/tools)"
echo " --osversion=<version> Set the OS Version (default: $(detect_osversion))"
echo " --prefix=<path> Set installation directory"
exit 1
}
while [[ "$#" -gt 0 ]]; do
case $1 in
--xlen=*) XLEN="${1#*=}" ;;
--tooldir=*) TOOLDIR="${1#*=}" ;;
--osversion=*) OSVERSION="${1#*=}" ;;
--prefix=*) PREFIX="${1#*=}" ;;
-h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;;
esac
shift
done
# check OS
if [ "$OSVERSION" == "unsupported" ]; then
echo "Error: Unsupported OS."
exit -1
fi
# project subdirectories to build
SUBDIRS=("." "!ci" "!perf" "hw*" "kernel*" "runtime*" "sim*" "tests*")
# Get the directory of the script
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
THIRD_PARTY_DIR=$SCRIPT_DIR/third_party
copy_files "$SCRIPT_DIR" "$CURRENT_DIR"

View file

@ -1,4 +1,4 @@
# FPGA Startup and Configuration Guide
# FPGA Startup and Configuration Guide
OPAE Environment Setup
----------------------
@ -53,9 +53,9 @@ If the build fails and you need to restart it, clean up the build folder using t
$ make clean
The file `vortex_afu.gbs` should exist when the build is done:
The bitstream file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/vortex_afu.gbs
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
Signing the bitstream and Programming the FPGA
@ -65,10 +65,15 @@ Signing the bitstream and Programming the FPGA
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
FPGA sample test running OpenCL sgemm kernel
--------------------------------------------
Sample FPGA Run Test
--------------------
Run the following from the Vortex root directory
Ensure you have the correct opae runtime for the FPGA target
$ ./ci/blackbox.sh --driver=fpga --app=sgemm --args="-n64"
$ make -C runtime/opae clean
$ TARGET=FPGA make -C runtime/opae
Run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"

View file

@ -3,7 +3,7 @@
The Vortex Cache Sub-system has the following main properties:
- High-bandwidth transfer with Multi-bank parallelism
- Non-blocking pipelined architecture with local MSHR
- Non-blocking pipelined write-through cache architecture with per-bank MSHR
- Configurable design: Dcache, Icache, L2 cache, L3 cache
### Cache Microarchitecture
@ -11,16 +11,16 @@ The Vortex Cache Sub-system has the following main properties:
![Image of Cache Hierarchy](./assets/img/cache_microarchitecture.png)
The Vortex cache is comprised of multiple parallel banks. It is comprised of the following modules:
- **Bank request dispatch crossbar**: assign a bank to incoming requests and resolve collision using stalls.
- **Bank response merge crossbar**: merge result from banks and forward to the core response.
- **Memory request multiplexer**: arbitrate bank memory requests
- **Memory response demultiplexer**: forward memory response to the corresponding bank.
- **Flush Unit**: perform tag memory initialization.
- **Bank request dispatch crossbar**: assigns a bank to incoming requests and resolve collision using stalls.
- **Bank response merge crossbar**: merges result from banks and forward to the core response.
- **Memory request multiplexer**: arbitrates bank memory requests
- **Memory response demultiplexer**: forwards memory response to the corresponding bank.
- **Flush Unit**: performs tag memory initialization.
Incoming requests entering the cache are sent to a dispatch crossbar that select the corresponding bank for each request, resolving bank collisions with stalls. The result output of each bank is merge back into outgoing response port via merger crossbar. Each bank intergates a non-blocking pipeline with a local Miss Status Holding Register (MSHR) to reduce the miss rate. The bank pipeline consists of the following stages:
- **Schedule**: Selects the next request into the pipeline from the incoming core request, memory fill, or the MSHR entry, with priority given to the latter.
- **Tag Access**: A single-port read/write access to the tag store.
- **Tag Access**: single-port read/write access to the tag store.
- **Data Access**: Single-port read/write access to the data store.
- **Response Handling**: Core response back to the core.

View file

@ -3,14 +3,14 @@
## Testing changes to the RTL or simulator GPU driver.
The Blackbox utility script will not pick up your changes if the h/w configuration is the same as during teh last run.
To force the utility to build the driver, you need pass the --rebuild=1 option when running tests.
To force the utility to build the driver, you need pass the --rebuild=1 option when running tests.
Using --rebuild=0 will prevent the rebuild even if the h/w configuration is different from last run.
$ ./ci/blackbox.sh --driver=simx --app=demo --rebuild=1
## SimX Debugging
SimX cycle-approximate simulator allows faster debugging of Vortex kernels' execution.
SimX cycle-approximate simulator allows faster debugging of Vortex kernels' execution.
The recommended method to enable debugging is to pass the `--debug=<level>` flag to `blackbox` tool when running a program.
// Running demo program on SimX in debug mode
@ -61,5 +61,8 @@ We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can u
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log
$ ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
The first column in the CSV trace is UUID (universal unique identifier) of the instruction and the content is sorted by the UUID. You can use the UUID to trace the same instruction running on either the RTL hw or SimX simulator.
$ diff trace_rtlsim.csv trace_simx.csv
The first column in the CSV trace is UUID (universal unique identifier) of the instruction and the content is sorted by the UUID.
You can use the UUID to trace the same instruction running on either the RTL hw or SimX simulator.
This can be very effective if you want to use SimX to debugging your RTL hardware by comparing CSV traces.

View file

@ -1,71 +1,45 @@
# Environment Setup# Vortex Dev Environment Setup
These instructions apply to the development vortex repo using the *updated toolchain*. The updated toolchain is considered to be any commit of `master` pulled from *July 2, 2023* onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
# Environment Setup
These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
> Note: As it stands right now, there a few test suites which are not working due to this toolchain migration. We are working to determine an exact list of which ones are working and which ones are not. For now, if the repo builds at a minimum, then you can consider all these steps to have worked successfully.
## Choosing an Development Environment
There are three primary environments you can use. Each has its own pros and cons. Refer to this section to help you determine which environment best suits your needs.
1. Volvo
2. Docker
3. Local
## Set Up on Your Own System
The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md).
## Servers for Georgia Tech Students and Collaborators
### Volvo
Volvo is a server provided by Georgia Tech. As such, it provides high performance compute, but you need valid credentials to access it. If you don't already have credentials, you can get in contact with your mentor to ask about setting your account up.
Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up.
Pros:
Setup on Volvo:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh volvo.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
4. `source /nethome/software/set_vortex_env.sh` to set up the necessary environment variables.
5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
1. Native x86_64 architecture, AMD EPYC 7702P 64-Core Processor (*fast*)
2. Packages and difficult configurations are already done for you
3. Consistent environment as others, allowing for easier troubleshooting
4. Just need to SSH into Volvo, minimal impact on local computer resources
5. VScode remote development tools are phenomenal over SSH
### Nio
Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio.
Cons:
1. Volvo is accessed via gatech vpn, external contributors might encounter issues with it -- especially from other university networks
2. Account creation is not immediate and is subject to processing time
3. Volvo might have outtages (*pretty uncommon*)
5. SSH development requires internet and other remote development tools (*vscode works!*)
### Docker
Docker allows for isolated pre-built environments to be created, shared and used. They are much more resource efficient than a Virtual Machine, and have great tooling and support available. The main motivation for Docker is bringing a consistent development environment to your local computer, across all platforms.
Pros:
1. If you are native to x86_64, the container will also run natively, yielding better performance. However, if you have aarch64 (arm) processor, you can still run the Docker container without configuration changes.
2. Consistent environment as others, allowing for easier troubleshooting
3. Works out of the box, just have a working installation of Docker
4. Vortex uses a build system, so once you build the repo once, only new code changes need to be recompiled
5. Docker offers helpful tools and extensions to monitor the performance of your container
Cons:
1. If you are using an arm processor, the container will be run in emulation mode, so it will inherently run slower, as it needs to translate all the x86_64 instructions. It's still usable on Apple Silicon, however.
2. Limited to your computer's performance, and Vortex is a large repo to build
3. Will utilize a few gigabytes of storage on your computer for saving binaries to run the container
Setup on Nio:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh nio.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
4. `source /opt/set_vortex_env_dev.sh` to set up the necessary environment variables.
5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
### Local
You can reverse engineer the Dockerfile and scripts above to get a working environment setup locally. This option is for experienced users, who have already considered the pros and cons of Volvo and Docker.
## Docker (Experimental)
Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported.
## Setup on Volvo
1. Clone Repo Recursively: `git clone --recursive https://github.com/vortexgpgpu/vortex-dev.git`
2. Source `/opt/set_vortex_env_dev.sh` to initialize pre-installed toolchain
3. `make -s` in `vortex-dev` root directory
4. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
## Setup with Docker
Currently the Dockerfile is not included with the official vortex-dev repository, however you can quickly add it to repo and get started.
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex-dev.git`
2. Download a copy of `Dockerfile.dev` and place it in the root of the repo.
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex-dev -f Dockerfile.dev .`
4. Run a container based on the image: `docker run --rm -v ./:/root/vortex-dev/ -it --name vtx-dev --privileged=true --platform=linux/amd64 vortex-dev`
### Setup with Docker
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo.
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .`
4. Run a container based on the image: `docker run --rm -v ./:/root/vortex/ -it --name vtx-dev --privileged=true --platform=linux/amd64 vortex`
5. Install the toolchain `./ci/toolchain_install.sh --all` (once per container)
6. `make -s` in `vortex-dev` root directory
6. `make -s` in `vortex` root directory
7. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
### Additional Docker Commands
- Exit from a container (does not stop or remove it)
- Resume a container you have exited or start a second terminal session `docker exec -it <container-name> bash`
You may exit from a container and resume a container you have exited or start a second terminal session `docker exec -it <container-name> bash`

View file

@ -7,13 +7,15 @@
- [Cache Subsystem](cache_subsystem.md)
- [Software](software.md)
- [Simulation](simulation.md)
- [FPGA Setup Guide](fpga_setup.md)
- [Altera FPGA Setup Guide](altera_fpga_guide.md)
- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md)
- [Debugging](debugging.md)
- [Useful Links](references.md)
## Installation
- Refer to the build instructions in [README](../README.md).
- For the different environments Vortex supports, [read this document](environment_setup.md).
- To install on your own system, [follow this document](install_vortex.md).
## Quick Start Scenarios
@ -26,6 +28,6 @@ Running Vortex simulators with different configurations:
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood

81
docs/install_vortex.md Normal file
View file

@ -0,0 +1,81 @@
# Installing and Setting Up the Vortex Environment
## Ubuntu 18.04, 20.04
1. Install the following dependencies:
```
sudo apt-get install build-essential zlib1g-dev libtinfo-dev libncurses5 uuid-dev libboost-serialization-dev libpng-dev libhwloc-dev
```
2. Upgrade GCC to 11:
```
sudo apt-get install gcc-11 g++-11
```
Multiple gcc versions on Ubuntu can be managed with update-alternatives, e.g.:
```
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-9 9
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-9 9
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11
```
3. Download the Vortex codebase:
```
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
```
4. Build Vortex
```
$ cd vortex
$ mkdir -p build
$ cd build
$ ../configure --xlen=32 --tooldir=$HOME/tools
$ ./ci/toolchain_install.sh --all
$ source ./ci/toolchain_env.sh
$ make -s
```
## RHEL 8
Note: depending on the system, some of the toolchain may need to be recompiled for non-Ubuntu Linux. The source for the tools can be found [here](https://github.com/vortexgpgpu/).
1. Install the following dependencies:
```
sudo yum install libpng-devel boost boost-devel boost-serialization libuuid-devel opencl-headers hwloc hwloc-devel gmp-devel compat-hwloc1
```
2. Upgrade GCC to 11:
```
sudo yum install gcc-toolset-11
```
Multiple gcc versions on Red Hat can be managed with scl
3. Install MPFR 4.2.0:
Download [the source](https://ftp.gnu.org/gnu/mpfr/) and follow [the installation documentation](https://www.mpfr.org/mpfr-current/mpfr.html#How-to-Install).
4. Download the Vortex codebase:
```
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
```
5. Build Vortex
```
$ cd vortex
$ mkdir -p build
$ cd build
$ ../configure --xlen=32 --tooldir=$HOME/tools
$ ./ci/toolchain_install.sh --all
$ source ./ci/toolchain_env.sh
$ make -s
```

View file

@ -20,7 +20,7 @@ Running tests under specific drivers (rtlsim,simx,fpga) is done using the script
- *Cores* - used to specify the number of cores (processing element containing multiple warps) within a configuration.
- *Warps* - used to specify the number of warps (collection of concurrent hardware threads) within a configuration.
- *Threads* - used to specify the number of threads (smallest unit of computation) within a configuration.
- *L2cache* - used to enable the shard l2cache among the Vortex cores.
- *L2cache* - used to enable the shared l2cache among the Vortex cores.
- *L3cache* - used to enable the shared l3cache among the Vortex clusters.
- *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, opae, xrt, simx).
- *Debug* - used to enable debug mode for the Vortex simulation.

View file

@ -15,7 +15,7 @@ You can execute the same application of a GPU architecture with 2 cores:
$ ./ci/blackbox.sh --core=2 --driver=simx --app=sgemm --args="-n10"
When excuting, Blackbox needs to recompile the driver if the desired architecture changes.
When excuting, Blackbox needs to recompile the driver if the desired architecture changes.
It tracks the latest configuration in a file under the current directory blackbox.<driver>.cache.
To avoid having to rebuild the driver all the time, Blackbox checks if the latest cached configuration matches the current.
@ -24,24 +24,29 @@ To avoid having to rebuild the driver all the time, Blackbox checks if the lates
The Vortex test suite is located under the /test/ folder
You can execute the default regression suite by running the following commands at the root folder.
$ make -C tests/regression run-simx
$ make -C tests/regression run-simx
$ make -C tests/regression run-rtlsim
You can execute the default opncl suite by running the following commands at the root folder.
$ make -C tests/opencl run-simx
$ make -C tests/opencl run-simx
$ make -C tests/opencl run-rtlsim
## Creating Your Own Regression Tests
- Inside `test/` you will find a series of folders which are named based on what they test
- You can view the tests to see which ones have tests similar to what you are trying to create new tests for
- once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test
- `testcases.h` contains each of the test case templates
- `main.cpp` contains the implementation of each of the test cases and builds a test suite of all the tests cases you want
## Creating Your Own Regression Test
Compile the test case: `make -C tests/regression/<testcase-name>/ clean-all && make -C tests/regression/<testcase-name>/`
Inside `tests/regression` you will find a series of folders which are named based on what they test.
You can view the tests to see which ones have tests similar to what you are trying to create new tests for.
Once you have found a similar baseline, you can copy the folder and rename it to what you are planning to test.
A regression test typically implements the following files:
- ***kernel.cpp*** contains the GPU kernel code.
- ***main.cpp*** contains the host CPU code.
- ***Makefile*** defines the compiler build commands for the CPU and GPU binaries.
Run the test case: `./ci/blackbox.sh --driver=simx --cores=4 --app=<testcase-name> --debug`
Sync your build folder: `$ ../configure`
Compile your test: `$ make -C tests/regression/<test-name>`
Run your test: `$ ./ci/blackbox.sh --driver=simx --app=<test-name> --debug`
## Adding Your Tests to the CI Pipeline
see `continuous_integration.md`
See `continuous_integration.md`

36
docs/xilinx_fpga_guide.md Normal file
View file

@ -0,0 +1,36 @@
# FPGA Startup and Configuration Guide
XRT Environment Setup
----------------------
$ source /opt/xilinx/Vitis/2023.1/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
Check Installed FPGA Platforms
------------------------------
$ platforminfo -l
Build FPGA image
----------------
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make
Will run the synthesis under new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
Sample FPGA Run Test
--------------------
Ensure you have the correct opae runtime for the FPGA target
$ make -C runtime/xrt clean
$ TARGET=hw make -C runtime/xrt
Run the following from your Vortex build directory
$ TARGET=hw FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128"

2
hw/.gitignore vendored
View file

@ -1,2 +0,0 @@
VX_config.h
VX_types.h

View file

@ -1,17 +1,22 @@
RTL_DIR=./rtl
SCRIPT_DIR=./scripts
ROOT_DIR := $(realpath ..)
include $(ROOT_DIR)/config.mk
HW_DIR := $(VORTEX_HOME)/hw
SCRIPT_DIR := $(HW_DIR)/scripts
RTL_DIR := $(HW_DIR)/rtl
all: config
config: VX_config.h VX_types.h
VX_config.h: $(RTL_DIR)/VX_config.vh
VX_config.h: $(RTL_DIR)/VX_config.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_config.vh -o VX_config.h
VX_types.h: $(RTL_DIR)/VX_types.vh
VX_types.h: $(RTL_DIR)/VX_types.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
clean:
$(MAKE) -C unittest clean
rm -f VX_config.h VX_types.h
.PHONY: VX_config.h VX_types.h

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -40,7 +40,7 @@ extern "C" {
void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result);
void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result);
void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
@ -54,15 +54,15 @@ extern "C" {
}
inline uint64_t nan_box(uint32_t value) {
#ifdef FPU_RV64F
#ifdef XLEN_64
return value | 0xffffffff00000000;
#else
#else
return value;
#endif
}
inline bool is_nan_boxed(uint64_t value) {
#ifdef FPU_RV64F
#ifdef XLEN_64
return (uint32_t(value >> 32) == 0xffffffff);
#else
__unused (value);
@ -70,15 +70,14 @@ inline bool is_nan_boxed(uint64_t value) {
#endif
}
inline int64_t check_boxing(int64_t a) {
if (!is_nan_boxed(a)) {
return nan_box(0x7fc00000); // NaN
}
return a;
inline int64_t check_boxing(int64_t a) {
if (is_nan_boxed(a))
return a;
return nan_box(0x7fc00000); // NaN
}
void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fadd_d(a, b, (*frm & 0x7), fflags);
@ -88,7 +87,7 @@ void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal*
}
void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fsub_d(a, b, (*frm & 0x7), fflags);
@ -98,19 +97,19 @@ void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal*
}
void dpi_fmul(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fmul_d(a, b, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fmul_d(a, b, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fmul_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
}
}
void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (dst_fmt) {
*result = rv_fmadd_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@ -118,9 +117,9 @@ void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
}
void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (dst_fmt) {
*result = rv_fmsub_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@ -128,9 +127,9 @@ void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
}
void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (dst_fmt) {
*result = rv_fnmadd_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fnmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@ -138,9 +137,9 @@ void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
}
void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (dst_fmt) {
*result = rv_fnmsub_d(a, b, c, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fnmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
@ -148,36 +147,36 @@ void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const
}
void dpi_fdiv(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fdiv_d(a, b, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fdiv_d(a, b, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fdiv_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
}
}
void dpi_fsqrt(bool enable, int dst_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fsqrt_d(a, (*frm & 0x7), fflags);
if (dst_fmt) {
*result = rv_fsqrt_d(a, (*frm & 0x7), fflags);
} else {
*result = nan_box(rv_fsqrt_s(check_boxing(a), (*frm & 0x7), fflags));
}
}
void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (src_fmt) {
if (src_fmt) {
*result = rv_ftol_d(a, (*frm & 0x7), fflags);
} else {
*result = rv_ftol_s(check_boxing(a), (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
} else {
if (src_fmt) {
*result = sext<uint64_t>(rv_ftoi_d(a, (*frm & 0x7), fflags), 32);
} else {
*result = sext<uint64_t>(rv_ftoi_s(check_boxing(a), (*frm & 0x7), fflags), 32);
@ -186,61 +185,61 @@ void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVa
}
void dpi_ftou(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (src_fmt) {
if (src_fmt) {
*result = rv_ftolu_d(a, (*frm & 0x7), fflags);
} else {
*result = rv_ftolu_s(check_boxing(a), (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
} else {
if (src_fmt) {
*result = sext<uint64_t>(rv_ftou_d(a, (*frm & 0x7), fflags), 32);
} else {
*result = sext<uint64_t>(rv_ftou_s(check_boxing(a), (*frm & 0x7), fflags), 32);
*result = sext<uint64_t>(rv_ftou_s(check_boxing(a), (*frm & 0x7), fflags), 32);
}
}
}
void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (src_fmt) {
if (src_fmt) {
*result = rv_ltof_d(a, (*frm & 0x7), fflags);
} else {
} else {
*result = rv_itof_d(a, (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
*result = nan_box(rv_ltof_s(a, (*frm & 0x7), fflags));
} else {
*result = nan_box(rv_itof_s(a, (*frm & 0x7), fflags));
if (src_fmt) {
*result = nan_box(rv_ltof_s(a, (*frm & 0x7), fflags));
} else {
*result = nan_box(rv_itof_s(a, (*frm & 0x7), fflags));
}
}
}
void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
if (src_fmt) {
if (src_fmt) {
*result = rv_lutof_d(a, (*frm & 0x7), fflags);
} else {
} else {
*result = rv_utof_d(a, (*frm & 0x7), fflags);
}
} else {
if (src_fmt) {
if (src_fmt) {
*result = nan_box(rv_lutof_s(a, (*frm & 0x7), fflags));
} else {
} else {
*result = nan_box(rv_utof_s(a, (*frm & 0x7), fflags));
}
}
}
void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_ftod((int32_t)check_boxing(a));
@ -250,90 +249,90 @@ void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result) {
}
void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fclss_d(a);
} else {
*result = rv_fclss_s(check_boxing(a));
if (dst_fmt) {
*result = rv_fclss_d(a);
} else {
*result = rv_fclss_s(check_boxing(a));
}
}
void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fsgnj_d(a, b);
if (dst_fmt) {
*result = rv_fsgnj_d(a, b);
} else {
*result = nan_box(rv_fsgnj_s(check_boxing(a), check_boxing(b)));
}
}
void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fsgnjn_d(a, b);
if (dst_fmt) {
*result = rv_fsgnjn_d(a, b);
} else {
*result = nan_box(rv_fsgnjn_s(check_boxing(a), check_boxing(b)));
}
}
void dpi_fsgnjx(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fsgnjx_d(a, b);
if (dst_fmt) {
*result = rv_fsgnjx_d(a, b);
} else {
*result = nan_box(rv_fsgnjx_s(check_boxing(a), check_boxing(b)));
}
}
void dpi_flt(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_flt_d(a, b, fflags);
*result = rv_flt_d(a, b, fflags);
} else {
*result = rv_flt_s(check_boxing(a), check_boxing(b), fflags);
}
}
void dpi_fle(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fle_d(a, b, fflags);
if (dst_fmt) {
*result = rv_fle_d(a, b, fflags);
} else {
*result = rv_fle_s(check_boxing(a), check_boxing(b), fflags);
}
}
void dpi_feq(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_feq_d(a, b, fflags);
if (dst_fmt) {
*result = rv_feq_d(a, b, fflags);
} else {
*result = rv_feq_s(check_boxing(a), check_boxing(b), fflags);
}
}
void dpi_fmin(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fmin_d(a, b, fflags);
if (dst_fmt) {
*result = rv_fmin_d(a, b, fflags);
} else {
*result = nan_box(rv_fmin_s(check_boxing(a), check_boxing(b), fflags));
}
}
void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
if (!enable)
if (!enable)
return;
if (dst_fmt) {
*result = rv_fmax_d(a, b, fflags);
if (dst_fmt) {
*result = rv_fmax_d(a, b, fflags);
} else {
*result = nan_box(rv_fmax_s(check_boxing(a), check_boxing(b), fflags));
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,8 +14,6 @@
`ifndef FLOAT_DPI_VH
`define FLOAT_DPI_VH
`include "VX_config.vh"
import "DPI-C" function void dpi_fadd(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsub(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmul(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,8 +21,6 @@
#include "svdpi.h"
#include "verilated_vpi.h"
#include "uuid_gen.h"
#ifdef XLEN_64
#define iword_t int64_t
#define uword_t uint64_t
@ -50,7 +48,7 @@ extern "C" {
void dpi_trace_start();
void dpi_trace_stop();
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC);
uint64_t dpi_uuid_gen(bool reset, int wid);
}
bool sim_trace_enabled();
@ -70,7 +68,7 @@ public:
void push(int value, bool enable) {
if (!enable)
return;
return;
for (unsigned i = 0; i < depth_-1; ++i) {
buffer_[i] = buffer_[i+1];
}
@ -85,7 +83,7 @@ private:
std::vector<int> buffer_;
bool init_;
unsigned depth_;
unsigned depth_;
};
class Instances {
@ -95,9 +93,9 @@ public:
}
int allocate() {
mutex_.lock();
mutex_.lock();
int inst = instances_.size();
instances_.resize(inst + 1);
instances_.resize(inst + 1);
mutex_.unlock();
return inst;
}
@ -135,7 +133,7 @@ void dpi_imul(bool enable, bool is_signed_a, bool is_signed_b, iword_t a, iword_
udword_t second = *(uword_t*)&b;
udword_t mask = udword_t(-1) << (8 * sizeof(iword_t));
if (is_signed_a && a < 0) {
first |= mask;
}
@ -171,11 +169,11 @@ void dpi_idiv(bool enable, bool is_signed, iword_t a, iword_t b, iword_t* quotie
} else if (dividen == inf_neg && divisor == -1) {
*remainder = 0;
*quotient = dividen;
} else {
} else {
*quotient = (iword_t)dividen / (iword_t)divisor;
*remainder = (iword_t)dividen % (iword_t)divisor;
*remainder = (iword_t)dividen % (iword_t)divisor;
}
} else {
} else {
if (b == 0) {
*quotient = -1;
*remainder = dividen;
@ -188,45 +186,35 @@ void dpi_idiv(bool enable, bool is_signed, iword_t a, iword_t b, iword_t* quotie
///////////////////////////////////////////////////////////////////////////////
void dpi_trace(int level, const char* format, ...) {
void dpi_trace(int level, const char* format, ...) {
if (level > DEBUG_LEVEL)
return;
if (!sim_trace_enabled())
return;
va_list va;
va_start(va, format);
va_start(va, format);
vprintf(format, va);
va_end(va);
va_end(va);
}
void dpi_trace_start() {
void dpi_trace_start() {
sim_trace_enable(true);
}
void dpi_trace_stop() {
void dpi_trace_stop() {
sim_trace_enable(false);
}
///////////////////////////////////////////////////////////////////////////////
std::unordered_map<uint32_t, std::shared_ptr<vortex::UUIDGenerator>> g_uuid_gens;
std::unordered_map<uint32_t, uint32_t> g_uuid_gens;
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC) {
uint64_t dpi_uuid_gen(bool reset, int wid) {
if (reset) {
g_uuid_gens.clear();
return 0;
}
std::shared_ptr<vortex::UUIDGenerator> uuid_gen;
auto it = g_uuid_gens.find(wid);
if (it == g_uuid_gens.end()) {
uuid_gen = std::make_shared<vortex::UUIDGenerator>();
g_uuid_gens.emplace(wid, uuid_gen);
} else {
uuid_gen = it->second;
}
uint32_t instr_uuid = uuid_gen->get_uuid(PC);
uint32_t instr_id = instr_uuid & 0xffff;
uint32_t instr_ref = instr_uuid >> 16;
uint64_t uuid = (uint64_t(instr_ref) << 32) | (wid << 16) | instr_id;
uint32_t instr_uuid = g_uuid_gens[wid]++;
uint64_t uuid = (uint64_t(wid) << 32) | instr_uuid;
return uuid;
}

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,8 +14,6 @@
`ifndef UTIL_DPI_VH
`define UTIL_DPI_VH
`include "VX_config.vh"
`ifdef XLEN_64
`define INT_TYPE longint
`else
@ -32,6 +30,6 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve
import "DPI-C" function void dpi_trace_start();
import "DPI-C" function void dpi_trace_stop();
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid, input longint PC);
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid);
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,8 +14,9 @@
`include "VX_define.vh"
module VX_cluster import VX_gpu_pkg::*; #(
parameter CLUSTER_ID = 0
) (
parameter CLUSTER_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
// Clock
@ -32,27 +33,23 @@ module VX_cluster import VX_gpu_pkg::*; #(
// Memory
VX_mem_bus_if.master mem_bus_if,
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
`ifdef SCOPE
localparam scope_socket = 0;
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
`endif
`SCOPE_IO_SWITCH (`NUM_SOCKETS);
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
`endif
`ifdef GBAR_ENABLE
@ -63,7 +60,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_gbar_arb #(
.NUM_REQS (`NUM_SOCKETS),
.OUT_REG ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
.OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
) gbar_arb (
.clk (clk),
.reset (gbar_reset),
@ -89,7 +86,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
`RESET_RELAY (l2_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID ("l2cache"),
.INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)),
.CACHE_SIZE (`L2_CACHE_SIZE),
.LINE_SIZE (`L2_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS),
@ -99,12 +96,14 @@ module VX_cluster import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
.MREQ_SIZE (`L2_MREQ_SIZE),
.MREQ_SIZE (`L2_WRITEBACK ? `L2_MSHR_SIZE : `L2_MREQ_SIZE),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2),
.MEM_OUT_REG (2),
.WRITEBACK (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.NC_ENABLE (1),
.PASSTHRU (!`L2_ENABLED)
) l2cache (
@ -119,13 +118,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
wire [`NUM_SOCKETS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_socket_sim_wb_value;
assign sim_ebreak = per_socket_sim_ebreak[0];
assign sim_wb_value = per_socket_sim_wb_value[0];
`UNUSED_VAR (per_socket_sim_ebreak)
`UNUSED_VAR (per_socket_sim_wb_value)
VX_dcr_bus_if socket_dcr_bus_tmp_if();
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
@ -133,17 +125,19 @@ module VX_cluster import VX_gpu_pkg::*; #(
wire [`NUM_SOCKETS-1:0] per_socket_busy;
VX_dcr_bus_if socket_dcr_bus_if();
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
// Generate all sockets
for (genvar i = 0; i < `NUM_SOCKETS; ++i) begin
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : sockets
`RESET_RELAY (socket_reset, reset);
VX_socket #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
.INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id))
) socket (
`SCOPE_IO_BIND (scope_socket+i)
`SCOPE_IO_BIND (scope_socket+socket_id)
.clk (clk),
.reset (socket_reset),
@ -151,18 +145,16 @@ module VX_cluster import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
`endif
.dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[i]),
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]),
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
`endif
.sim_ebreak (per_socket_sim_ebreak[i]),
.sim_wb_value (per_socket_sim_wb_value[i]),
.busy (per_socket_busy[i])
.busy (per_socket_busy[socket_id])
);
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -40,6 +40,18 @@
`define EXT_F_ENABLE
`endif
`ifdef XLEN_64
`ifndef FPU_DSP
`ifndef EXT_D_DISABLE
`define EXT_D_ENABLE
`endif
`endif
`endif
`ifndef EXT_ZICOND_DISABLE
`define EXT_ZICOND_ENABLE
`endif
`ifndef XLEN_32
`ifndef XLEN_64
`define XLEN_32
@ -91,13 +103,12 @@
`endif
`ifndef NUM_BARRIERS
`define NUM_BARRIERS 4
`define NUM_BARRIERS `UP(`NUM_WARPS/2)
`endif
`ifndef SOCKET_SIZE
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
`ifdef L2_ENABLE
`define L2_ENABLED 1
@ -129,66 +140,88 @@
`endif
`ifndef L1_LINE_SIZE
`ifdef L1_DISABLE
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 4 : `MEM_BLOCK_SIZE)
`else
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 16 : `MEM_BLOCK_SIZE)
`define L1_LINE_SIZE `MEM_BLOCK_SIZE
`endif
`ifndef L2_LINE_SIZE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`endif
`ifndef L3_LINE_SIZE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`endif
`ifdef XLEN_64
`ifndef STARTUP_ADDR
`define STARTUP_ADDR 64'h180000000
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 64'h1FFFF0000
`endif
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 64'h1FF000000
`ifndef STARTUP_ADDR
`define STARTUP_ADDR 64'h080000000
`endif
`ifndef USER_BASE_ADDR
`define USER_BASE_ADDR 64'h000010000
`endif
`ifndef IO_BASE_ADDR
`define IO_BASE_ADDR 64'h000000040
`endif
`else
`ifndef STARTUP_ADDR
`define STARTUP_ADDR 32'h80000000
`endif
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 32'hFF000000
`define STACK_BASE_ADDR 32'hFFFF0000
`endif
`ifndef STARTUP_ADDR
`define STARTUP_ADDR 32'h80000000
`endif
`ifndef SMEM_BASE_ADDR
`define SMEM_BASE_ADDR `STACK_BASE_ADDR
`endif
`ifndef SMEM_LOG_SIZE
`define SMEM_LOG_SIZE 14
`ifndef USER_BASE_ADDR
`define USER_BASE_ADDR 32'h00010000
`endif
`ifndef IO_BASE_ADDR
`define IO_BASE_ADDR (`SMEM_BASE_ADDR + (1 << `SMEM_LOG_SIZE))
`define IO_BASE_ADDR 32'h00000040
`endif
`endif
`define IO_END_ADDR `USER_BASE_ADDR
`ifndef LMEM_LOG_SIZE
`define LMEM_LOG_SIZE 14
`endif
`ifndef LMEM_BASE_ADDR
`define LMEM_BASE_ADDR `STACK_BASE_ADDR
`endif
`ifndef IO_COUT_ADDR
`define IO_COUT_ADDR `IO_BASE_ADDR
`define IO_COUT_ADDR `IO_BASE_ADDR
`endif
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
`define IO_COUT_SIZE 64
`ifndef IO_CSR_ADDR
`define IO_CSR_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
`ifndef IO_MPM_ADDR
`define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
`endif
`define IO_CSR_SIZE (4 * 64 * `NUM_CORES * `NUM_CLUSTERS)
`define IO_MPM_SIZE (8 * 32 * `NUM_CORES * `NUM_CLUSTERS)
`ifndef STACK_LOG2_SIZE
`define STACK_LOG2_SIZE 13
`endif
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
`define RESET_DELAY 8
`ifndef STALL_TIMEOUT
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif
`ifndef SV_DPI
`define DPI_DISABLE
`endif
`ifndef FPU_FPNEW
@ -222,7 +255,7 @@
// Issue width
`ifndef ISSUE_WIDTH
`define ISSUE_WIDTH `MIN(`NUM_WARPS, 4)
`define ISSUE_WIDTH `UP(`NUM_WARPS / 8)
`endif
// Number of ALU units
@ -243,32 +276,38 @@
// Number of LSU units
`ifndef NUM_LSU_LANES
`define NUM_LSU_LANES `MIN(`NUM_THREADS, 4)
`define NUM_LSU_LANES `NUM_THREADS
`endif
`ifndef NUM_LSU_BLOCKS
`define NUM_LSU_BLOCKS 1
`endif
// Number of SFU units
`ifndef NUM_SFU_LANES
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
`define NUM_SFU_LANES `NUM_THREADS
`endif
`ifndef NUM_SFU_BLOCKS
`define NUM_SFU_BLOCKS 1
`endif
// Size of Instruction Buffer
`ifndef IBUF_SIZE
`define IBUF_SIZE (2 * (`NUM_WARPS / `ISSUE_WIDTH))
`define IBUF_SIZE 4
`endif
// Size of LSU Request Queue
`ifndef LSUQ_SIZE
`define LSUQ_SIZE (2 * (`NUM_THREADS / `NUM_LSU_LANES))
// LSU line size
`ifndef LSU_LINE_SIZE
`define LSU_LINE_SIZE `MIN(`NUM_LSU_LANES * (`XLEN / 8), `L1_LINE_SIZE)
`endif
// LSU Duplicate Address Check
`ifndef LSU_DUP_DISABLE
`define LSU_DUP_ENABLE
// Size of LSU Core Request Queue
`ifndef LSUQ_IN_SIZE
`define LSUQ_IN_SIZE (2 * (`NUM_THREADS / `NUM_LSU_LANES))
`endif
`ifdef LSU_DUP_ENABLE
`define LSU_DUP_ENABLED 1
`else
`define LSU_DUP_ENABLED 0
// Size of LSU Memory Request Queue
`ifndef LSUQ_OUT_SIZE
`define LSUQ_OUT_SIZE `MAX(`LSUQ_IN_SIZE, `LSU_LINE_SIZE / (`XLEN / 8))
`endif
`ifdef GBAR_ENABLE
@ -304,20 +343,20 @@
// FMA Latency
`ifndef LATENCY_FMA
`ifdef FPU_DPI
`define LATENCY_FMA 4
`define LATENCY_FMA 4
`endif
`ifdef FPU_FPNEW
`define LATENCY_FMA 4
`define LATENCY_FMA 4
`endif
`ifdef FPU_DSP
`ifdef QUARTUS
`define LATENCY_FMA 4
`endif
`ifdef VIVADO
`define LATENCY_FMA 16
`define LATENCY_FMA 16
`endif
`ifndef LATENCY_FMA
`define LATENCY_FMA 4
`define LATENCY_FMA 4
`endif
`endif
`endif
@ -325,17 +364,17 @@
// FDIV Latency
`ifndef LATENCY_FDIV
`ifdef FPU_DPI
`define LATENCY_FDIV 15
`define LATENCY_FDIV 15
`endif
`ifdef FPU_FPNEW
`define LATENCY_FDIV 16
`define LATENCY_FDIV 16
`endif
`ifdef FPU_DSP
`ifdef QUARTUS
`define LATENCY_FDIV 15
`endif
`ifdef VIVADO
`define LATENCY_FDIV 28
`define LATENCY_FDIV 28
`endif
`ifndef LATENCY_FDIV
`define LATENCY_FDIV 16
@ -346,20 +385,20 @@
// FSQRT Latency
`ifndef LATENCY_FSQRT
`ifdef FPU_DPI
`define LATENCY_FSQRT 10
`define LATENCY_FSQRT 10
`endif
`ifdef FPU_FPNEW
`define LATENCY_FSQRT 16
`define LATENCY_FSQRT 16
`endif
`ifdef FPU_DSP
`ifdef QUARTUS
`define LATENCY_FSQRT 10
`endif
`ifdef VIVADO
`define LATENCY_FSQRT 28
`define LATENCY_FSQRT 28
`endif
`ifndef LATENCY_FSQRT
`define LATENCY_FSQRT 16
`define LATENCY_FSQRT 16
`endif
`endif
`endif
@ -369,6 +408,31 @@
`define LATENCY_FCVT 5
`endif
// FMA Bandwidth ratio
`ifndef FMA_PE_RATIO
`define FMA_PE_RATIO 1
`endif
// FDIV Bandwidth ratio
`ifndef FDIV_PE_RATIO
`define FDIV_PE_RATIO 8
`endif
// FSQRT Bandwidth ratio
`ifndef FSQRT_PE_RATIO
`define FSQRT_PE_RATIO 8
`endif
// FCVT Bandwidth ratio
`ifndef FCVT_PE_RATIO
`define FCVT_PE_RATIO 8
`endif
// FNCP Bandwidth ratio
`ifndef FNCP_PE_RATIO
`define FNCP_PE_RATIO 2
`endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Cache Enable
@ -471,22 +535,27 @@
`define DCACHE_NUM_WAYS 1
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
`ifndef SM_DISABLE
`define SM_ENABLE
// Enable Cache Writeback
`ifndef DCACHE_WRITEBACK
`define DCACHE_WRITEBACK 0
`endif
`ifdef SM_ENABLE
`define SM_ENABLED 1
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
`define LMEM_ENABLE
`endif
`ifdef LMEM_ENABLE
`define LMEM_ENABLED 1
`else
`define SM_ENABLED 0
`define SMEM_NUM_BANKS 1
`define LMEM_ENABLED 0
`define LMEM_NUM_BANKS 1
`endif
// Number of Banks
`ifndef SMEM_NUM_BANKS
`define SMEM_NUM_BANKS (`NUM_LSU_LANES)
`ifndef LMEM_NUM_BANKS
`define LMEM_NUM_BANKS `NUM_LSU_LANES
`endif
// L2cache Configurable Knobs /////////////////////////////////////////////////
@ -530,6 +599,11 @@
`define L2_NUM_WAYS 2
`endif
// Enable Cache Writeback
`ifndef L2_WRITEBACK
`define L2_WRITEBACK 0
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
@ -571,6 +645,11 @@
`define L3_NUM_WAYS 4
`endif
// Enable Cache Writeback
`ifndef L3_WRITEBACK
`define L3_WRITEBACK 0
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE
@ -603,6 +682,12 @@
`define EXT_M_ENABLED 0
`endif
`ifdef EXT_ZICOND_ENABLE
`define EXT_ZICOND_ENABLED 1
`else
`define EXT_ZICOND_ENABLED 0
`endif
`define ISA_STD_A 0
`define ISA_STD_C 2
`define ISA_STD_D 3
@ -619,13 +704,15 @@
`define ISA_EXT_DCACHE 1
`define ISA_EXT_L2CACHE 2
`define ISA_EXT_L3CACHE 3
`define ISA_EXT_SMEM 4
`define ISA_EXT_LMEM 4
`define ISA_EXT_ZICOND 5
`define MISA_EXT (`ICACHE_ENABLED << `ISA_EXT_ICACHE) \
| (`DCACHE_ENABLED << `ISA_EXT_DCACHE) \
| (`L2_ENABLED << `ISA_EXT_L2CACHE) \
| (`L3_ENABLED << `ISA_EXT_L3CACHE) \
| (`SM_ENABLED << `ISA_EXT_SMEM)
| (`LMEM_ENABLED << `ISA_EXT_LMEM) \
| (`EXT_ZICOND_ENABLED << `ISA_EXT_ZICOND)
`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -44,6 +44,9 @@
`define NR_BITS `CLOG2(`NUM_REGS)
`define DV_STACK_SIZE `UP(`NUM_THREADS-1)
`define DV_STACK_SIZEW `UP(`CLOG2(`DV_STACK_SIZE))
`define PERF_CTR_BITS 44
`ifndef NDEBUG
@ -52,6 +55,12 @@
`define UUID_WIDTH 1
`endif
`define PC_BITS (`XLEN-1)
`define OFFSET_BITS 12
`define IMM_BITS `XLEN
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
///////////////////////////////////////////////////////////////////////////////
`define EX_ALU 0
@ -90,10 +99,10 @@
`define INST_FL 7'b0000111 // float load instruction
`define INST_FS 7'b0100111 // float store instruction
`define INST_FMADD 7'b1000011
`define INST_FMADD 7'b1000011
`define INST_FMSUB 7'b1000111
`define INST_FNMSUB 7'b1001011
`define INST_FNMADD 7'b1001111
`define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions
// Custom extension opcodes
@ -102,6 +111,10 @@
`define INST_EXT3 7'b1011011 // 0x5B
`define INST_EXT4 7'b1111011 // 0x7B
// Opcode extensions
`define INST_R_F7_MUL 7'b0000001
`define INST_R_F7_ZICOND 7'b0000111
///////////////////////////////////////////////////////////////////////////////
`define INST_FRM_RNE 3'b000 // round to nearest even
@ -115,36 +128,45 @@
///////////////////////////////////////////////////////////////////////////////
`define INST_OP_BITS 4
`define INST_MOD_BITS 3
`define INST_ARGS_BITS $bits(op_args_t)
`define INST_FMT_BITS 2
///////////////////////////////////////////////////////////////////////////////
`define INST_ALU_ADD 4'b0000
//`define INST_ALU_UNUSED 4'b0001
`define INST_ALU_LUI 4'b0010
`define INST_ALU_AUIPC 4'b0011
`define INST_ALU_SLTU 4'b0100
`define INST_ALU_SLT 4'b0101
//`define INST_ALU_UNUSED 4'b0110
`define INST_ALU_SUB 4'b0111
`define INST_ALU_SRL 4'b1000
`define INST_ALU_SRA 4'b1001
`define INST_ALU_CZEQ 4'b1010
`define INST_ALU_CZNE 4'b1011
`define INST_ALU_AND 4'b1100
`define INST_ALU_OR 4'b1101
`define INST_ALU_XOR 4'b1110
`define INST_ALU_SLL 4'b1111
`define INST_ALU_OTHER 4'b0111
`define ALU_TYPE_BITS 2
`define ALU_TYPE_ARITH 0
`define ALU_TYPE_BRANCH 1
`define ALU_TYPE_MULDIV 2
`define ALU_TYPE_OTHER 3
`define INST_ALU_BITS 4
`define INST_ALU_CLASS(op) op[3:2]
`define INST_ALU_SIGNED(op) op[0]
`define INST_ALU_IS_SUB(op) op[1]
`define INST_ALU_IS_BR(mod) mod[0]
`define INST_ALU_IS_M(mod) mod[1]
`define INST_ALU_IS_W(mod) mod[2]
`define INST_ALU_IS_CZERO(op) (op[3:1] == 3'b101)
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LT 4'b0101
`define INST_BR_GE 4'b0111
`define INST_BR_JAL 4'b1000
@ -184,14 +206,14 @@
`define INST_FMT_HU 3'b101
`define INST_FMT_WU 3'b110
`define INST_LSU_LB 4'b0000
`define INST_LSU_LB 4'b0000
`define INST_LSU_LH 4'b0001
`define INST_LSU_LW 4'b0010
`define INST_LSU_LD 4'b0011 // new for RV64I LD
`define INST_LSU_LBU 4'b0100
`define INST_LSU_LHU 4'b0101
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
`define INST_LSU_SB 4'b1000
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_LSU_SW 4'b1010
`define INST_LSU_SD 4'b1011 // new for RV64I SD
@ -205,29 +227,28 @@
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
`define INST_FPU_CMP 4'b0101 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_MISC 4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_BITS 4
`define INST_FPU_IS_W(mod) (mod[4])
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
`define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3)
`define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4)
`define INST_SFU_TMC 4'h0
`define INST_SFU_WSPAWN 4'h1
`define INST_SFU_WSPAWN 4'h1
`define INST_SFU_SPLIT 4'h2
`define INST_SFU_JOIN 4'h3
`define INST_SFU_BAR 4'h4
@ -235,7 +256,6 @@
`define INST_SFU_CSRRW 4'h6
`define INST_SFU_CSRRS 4'h7
`define INST_SFU_CSRRC 4'h8
`define INST_SFU_CMOV 4'h9
`define INST_SFU_BITS 4
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
`define INST_SFU_IS_WCTL(op) (op <= 5)
@ -243,67 +263,52 @@
///////////////////////////////////////////////////////////////////////////////
// non-cacheable tag bits
`define NC_TAG_BITS 1
// cache address type bits
`ifdef SM_ENABLE
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`else
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2(`CDIV(I, O)) : 0)
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
(`CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
///////////////////////////////////////////////////////////////////////////////
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
///////////////////////////////////////////////////////////////////////////////
`ifdef L2_ENABLE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`ifdef ICACHE_ENABLE
`define L1_ENABLE
`endif
`ifdef L3_ENABLE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`ifdef DCACHE_ENABLE
`define L1_ENABLE
`endif
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define ADDR_TYPE_FLUSH 0
`define ADDR_TYPE_IO 1
`define ADDR_TYPE_LOCAL 2 // shoud be last since optional
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
@ -320,7 +325,7 @@
.DATAW ($bits(dst)), \
.RESETW ($bits(dst)), \
.DEPTH (latency) \
) __``dst ( \
) __``dst``__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -334,13 +339,18 @@
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out ( \
) __``out``__ ( \
.data_in (in), \
.data_out (out) \
)
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
`define ASSIGN_VX_IF(dst, src) \
assign dst.valid = src.valid; \
assign dst.data = src.data; \
assign src.ready = dst.ready
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
@ -354,6 +364,7 @@
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.atype = src.req_data.atype; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
@ -365,43 +376,51 @@
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \
always @(posedge clk) begin \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
end else begin \
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
`define ASSIGN_VX_LSU_MEM_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
for (genvar __d = 0; __d < dst_count; ++__d) begin \
localparam __count = ((src_count > dst_count) ? ((src_count + dst_count - 1) / dst_count) : 1); \
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
wire [width-1:0] __reduce_add_o_``dst``field; \
for (genvar __i = 0; __i < __count; ++__i) begin \
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
if (enable) begin \
reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \
always @(posedge clk) begin \
__dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \
__reduce_add_o_``dst``field \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \
end else begin \
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \
end
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
if (count > 1) begin \
wire [count-1:0][width-1:0] __reduce_add_i_field; \
wire [width-1:0] __reduce_add_o_field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \
if (reg_enable) begin \
reg [width-1:0] __reduce_add_r_``dst``field; \
reg [width-1:0] __reduce_add_r_field; \
always @(posedge clk) begin \
if (reset) begin \
__reduce_add_r_``dst``field <= '0; \
__reduce_add_r_field <= '0; \
end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
__reduce_add_r_field <= __reduce_add_o_field; \
end \
end \
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
assign dst.``field = __reduce_add_r_field; \
end else begin \
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
assign dst.``field = __reduce_add_o_field; \
end \
end else begin \
assign dst.``field = src[0].``field; \
end
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
@ -415,23 +434,4 @@
assign dst = src; \
end
`define TO_DISPATCH_DATA(data, tid) { \
data.uuid, \
data.wis, \
data.tmask, \
data.op_type, \
data.op_mod, \
data.wb, \
data.use_PC, \
data.use_imm, \
data.PC, \
data.imm, \
data.rd, \
tid, \
data.rs1_data, \
data.rs2_data, \
data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -26,7 +26,7 @@ package VX_gpu_pkg;
typedef struct packed {
logic valid;
logic [`NUM_WARPS-1:0] wmask;
logic [`XLEN-1:0] pc;
logic [`PC_BITS-1:0] pc;
} wspawn_t;
typedef struct packed {
@ -34,12 +34,12 @@ package VX_gpu_pkg;
logic is_dvg;
logic [`NUM_THREADS-1:0] then_tmask;
logic [`NUM_THREADS-1:0] else_tmask;
logic [`XLEN-1:0] next_pc;
logic [`PC_BITS-1:0] next_pc;
} split_t;
typedef struct packed {
logic valid;
logic is_dvg;
logic [`DV_STACK_SIZEW-1:0] stack_ptr;
} join_t;
typedef struct packed {
@ -51,13 +51,17 @@ package VX_gpu_pkg;
`else
logic [`NW_WIDTH-1:0] size_m1;
`endif
logic is_noop;
} barrier_t;
typedef struct packed {
logic [`XLEN-1:0] startup_addr;
logic [7:0] mpm_class;
logic [`XLEN-1:0] startup_addr;
logic [`XLEN-1:0] startup_arg;
logic [7:0] mpm_class;
} base_dcrs_t;
//////////////////////////// Perf counter types ///////////////////////////
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -75,7 +79,72 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] latency;
} mem_perf_t;
/* verilator lint_off UNUSED */
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] idles;
logic [`PERF_CTR_BITS-1:0] stalls;
} sched_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] ibf_stalls;
logic [`PERF_CTR_BITS-1:0] scb_stalls;
logic [`PERF_CTR_BITS-1:0] opd_stalls;
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
logic use_PC;
logic use_imm;
logic is_w;
logic [`ALU_TYPE_BITS-1:0] xtype;
logic [`IMM_BITS-1:0] imm;
} alu_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [`INST_FRM_BITS-1:0] frm;
logic [`INST_FMT_BITS-1:0] fmt;
} fpu_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic is_store;
logic is_float;
logic [`OFFSET_BITS-1:0] offset;
} lsu_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic use_imm;
logic [`VX_CSR_ADDR_BITS-1:0] addr;
logic [4:0] imm;
} csr_args_t;
typedef struct packed {
logic [($bits(alu_args_t)-1)-1:0] __padding;
logic is_neg;
} wctl_args_t;
typedef union packed {
alu_args_t alu;
fpu_args_t fpu;
lsu_args_t lsu;
csr_args_t csr;
wctl_args_t wctl;
} op_args_t;
`IGNORE_UNUSED_BEGIN
///////////////////////// LSU memory Parameters ///////////////////////////
localparam LSU_WORD_SIZE = `XLEN / 8;
localparam LSU_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(LSU_WORD_SIZE));
localparam LSU_MEM_BATCHES = 1;
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
////////////////////////// Icache Parameters //////////////////////////////
@ -86,7 +155,7 @@ package VX_gpu_pkg;
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
@ -96,54 +165,48 @@ package VX_gpu_pkg;
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
// Word size in bytes
localparam DCACHE_WORD_SIZE = (`XLEN / 8);
localparam DCACHE_WORD_SIZE = `LSU_LINE_SIZE;
localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));
// Block size in bytes
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size
localparam DCACHE_NUM_REQS = `MAX(`DCACHE_NUM_BANKS, `SMEM_NUM_BANKS);
// Memory request size
localparam LSU_MEM_REQS = `NUM_LSU_LANES;
// Batch select bits
localparam DCACHE_NUM_BATCHES = ((LSU_MEM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
localparam DCACHE_BATCH_SEL_BITS = `CLOG2(DCACHE_NUM_BATCHES);
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
// Core request tag Id bits
localparam LSUQ_TAG_BITS = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS);
localparam DCACHE_TAG_ID_BITS = (LSUQ_TAG_BITS + `CACHE_ADDR_TYPE_BITS);
localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE;
localparam DCACHE_MEM_BATCHES = `CDIV(DCACHE_MERGED_REQS, DCACHE_CHANNELS);
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
// Core request tag bits
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
// Memory request data bits
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
@ -162,11 +225,11 @@ package VX_gpu_pkg;
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
// Memory request tag bits
`ifdef L2_ENABLE
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
/////////////////////////////// L3 Parameters /////////////////////////////
@ -183,32 +246,29 @@ package VX_gpu_pkg;
localparam L3_MEM_DATA_WIDTH = (`L3_LINE_SIZE * 8);
// Memory request tag bits
`ifdef L3_ENABLE
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/* verilator lint_on UNUSED */
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/////////////////////////////// Issue parameters //////////////////////////
localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam PER_ISSUE_WARPS = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS = `CLOG2(PER_ISSUE_WARPS);
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_ISW_W-1:0] isw
);
if (ISSUE_WIS == 0) begin
wis_to_wid = `NW_WIDTH'(isw);
end else if (ISSUE_ISW == 0) begin
wis_to_wid = `NW_WIDTH'(wis);
end else begin
end else begin
wis_to_wid = `NW_WIDTH'({wis, isw});
end
endfunction
@ -216,7 +276,7 @@ package VX_gpu_pkg;
function logic [ISSUE_ISW_W-1:0] wid_to_isw(
input logic [`NW_WIDTH-1:0] wid
);
if (ISSUE_ISW != 0) begin
if (ISSUE_ISW != 0) begin
wid_to_isw = wid[ISSUE_ISW_W-1:0];
end else begin
wid_to_isw = 0;
@ -232,6 +292,20 @@ package VX_gpu_pkg;
wid_to_wis = 0;
end
endfunction
///////////////////////// Miscaellaneous functions ////////////////////////
function logic [`SFU_WIDTH-1:0] op_to_sfu_type(
input logic [`INST_OP_BITS-1:0] op_type
);
case (op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: op_to_sfu_type = `SFU_CSRS;
default: op_to_sfu_type = `SFU_WCTL;
endcase
endfunction
`IGNORE_UNUSED_END
endpackage

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`ifndef VX_PLATFORM_VH
`define VX_PLATFORM_VH
`ifndef SYNTHESIS
`ifdef SV_DPI
`include "util_dpi.vh"
`endif
@ -47,7 +47,7 @@
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define TRACE(level, args) $write args
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`else
`ifdef VERILATOR
`define TRACING_ON /* verilator tracing_on */
@ -77,7 +77,8 @@
/* verilator lint_off IMPLICIT */ \
/* verilator lint_off PINMISSING */ \
/* verilator lint_off IMPORTSTAR */ \
/* verilator lint_off UNSIGNED */
/* verilator lint_off UNSIGNED */ \
/* verilator lint_off SYMRSVDWORD */
`define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \
/* verilator lint_on PINCONNECTEMPTY */ \
@ -88,7 +89,8 @@
/* verilator lint_on IMPLICIT */ \
/* verilator lint_off PINMISSING */ \
/* verilator lint_on IMPORTSTAR */ \
/* verilator lint_on UNSIGNED */
/* verilator lint_on UNSIGNED */ \
/* verilator lint_on SYMRSVDWORD */
`define UNUSED_PARAM(x) /* verilator lint_off UNUSED */ \
localparam __``x = x; \
@ -110,8 +112,14 @@
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
x \
/* verilator lint_on UNUSED */
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`endif
`ifdef SV_DPI
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`else
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`endif
`endif
`ifdef SIMULATION
@ -140,21 +148,21 @@
///////////////////////////////////////////////////////////////////////////////
`ifdef QUARTUS
`define MAX_FANOUT 4
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
`define DISABLE_BRAM (* ramstyle = "logic" *)
`define PRESERVE_NET (* preserve *)
`elsif VIVADO
`define MAX_FANOUT 4
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) $bits(x.data)
`define USE_FAST_BRAM (* ram_style = "distributed" *)
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
`define DISABLE_BRAM (* ram_style = "registers" *)
`define PRESERVE_NET (* keep = "true" *)
`else
`define MAX_FANOUT 4
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) x.DATA_WIDTH
`define USE_FAST_BRAM
`define NO_RW_RAM_CHECK
@ -169,7 +177,8 @@
`define CLOG2(x) $clog2(x)
`define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0))
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
`define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
`define IS_POW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
`define IS_DIVISBLE(n, d) (((n) % (d)) == 0)
`define ABS(x) (((x) < 0) ? (-(x)) : (x));
@ -181,34 +190,35 @@
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
`endif
`ifndef CLAMP
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
`endif
`ifndef UP
`define UP(x) (((x) != 0) ? (x) : 1)
`endif
`define CDIV(n,d) ((n + d - 1) / (d))
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
`define LTRIM(x, s) x[s-1:0]
`define TRACE_ARRAY1D(lvl, arr, m) \
`define SEXT(len, x) {{(len-$bits(x)+1){x[$bits(x)-1]}}, x[$bits(x)-2:0]}
`define TRACE_ARRAY1D(lvl, fmt, arr, n) \
`TRACE(lvl, ("{")); \
for (integer __i = (m-1); __i >= 0; --__i) begin \
if (__i != (m-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("0x%0h", arr[__i])); \
for (integer __i = (n-1); __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, (fmt, arr[__i])); \
end \
`TRACE(lvl, ("}"));
`define TRACE_ARRAY2D(lvl, arr, m, n) \
`define TRACE_ARRAY2D(lvl, fmt, arr, m, n) \
`TRACE(lvl, ("{")); \
for (integer __i = n-1; __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("{")); \
for (integer __j = (m-1); __j >= 0; --__j) begin \
if (__j != (m-1)) `TRACE(lvl, (", "));\
`TRACE(lvl, ("0x%0h", arr[__i][__j])); \
`TRACE(lvl, (fmt, arr[__i][__j])); \
end \
`TRACE(lvl, ("}")); \
end \
@ -228,11 +238,11 @@
`define RESET_RELAY(dst, src) \
`RESET_RELAY_EX (dst, src, 1, 0)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2
`define OUT_REG_TO_EB_SIZE(out_reg) `MIN(out_reg, 2)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2, 5 -> 2
`define TO_OUT_BUF_SIZE(s) `MIN(s, 2)
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2
`define OUT_REG_TO_EB_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1))
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 > 3
`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2))
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,11 +13,12 @@
`include "VX_define.vh"
module VX_socket import VX_gpu_pkg::*; #(
parameter SOCKET_ID = 0
) (
module VX_socket import VX_gpu_pkg::*; #(
parameter SOCKET_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
// Clock
input wire clk,
input wire reset,
@ -36,15 +37,15 @@ module VX_socket import VX_gpu_pkg::*; #(
// Barrier
VX_gbar_bus_if.master gbar_bus_if,
`endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
`ifdef SCOPE
localparam scope_core = 0;
`SCOPE_IO_SWITCH (`SOCKET_SIZE);
`endif
`ifdef GBAR_ENABLE
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
@ -52,7 +53,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_gbar_arb #(
.NUM_REQS (`SOCKET_SIZE),
.OUT_REG ((`SOCKET_SIZE > 1) ? 2 : 0)
.OUT_BUF ((`SOCKET_SIZE > 1) ? 2 : 0)
) gbar_arb (
.clk (clk),
.reset (gbar_arb_reset),
@ -67,14 +68,14 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.smem = 'x;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
`endif
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_WORD_SIZE),
.DATA_SIZE (ICACHE_WORD_SIZE),
.TAG_WIDTH (ICACHE_TAG_WIDTH)
) per_core_icache_bus_if[`SOCKET_SIZE]();
@ -86,7 +87,7 @@ module VX_socket import VX_gpu_pkg::*; #(
`RESET_RELAY (icache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)),
.INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)),
.NUM_UNITS (`NUM_ICACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -103,8 +104,9 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (ICACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (0),
.CORE_OUT_REG (2),
.MEM_OUT_REG (2)
.NC_ENABLE (0),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)
) icache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.icache),
@ -119,9 +121,9 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
@ -130,10 +132,10 @@ module VX_socket import VX_gpu_pkg::*; #(
`RESET_RELAY (dcache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
.INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)),
.NUM_UNITS (`NUM_DCACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (1),
.TAG_SEL_IDX (0),
.CACHE_SIZE (`DCACHE_SIZE),
.LINE_SIZE (DCACHE_LINE_SIZE),
.NUM_BANKS (`DCACHE_NUM_BANKS),
@ -143,24 +145,26 @@ module VX_socket import VX_gpu_pkg::*; #(
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH),
.MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (1),
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_WRITEBACK),
.NC_ENABLE (1),
.CORE_OUT_REG (`SM_ENABLED ? 2 : 1),
.MEM_OUT_REG (2)
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)
) dcache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.dcache),
`endif
`endif
.clk (clk),
.reset (dcache_reset),
.reset (dcache_reset),
.core_bus_if (per_core_dcache_bus_if),
.mem_bus_if (dcache_mem_bus_if)
);
///////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
@ -175,19 +179,17 @@ module VX_socket import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.TAG_SEL_IDX (0),
.ARBITER ("R"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
@ -196,28 +198,21 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
wire [`SOCKET_SIZE-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_core_sim_wb_value;
assign sim_ebreak = per_core_sim_ebreak[0];
assign sim_wb_value = per_core_sim_wb_value[0];
`UNUSED_VAR (per_core_sim_ebreak)
`UNUSED_VAR (per_core_sim_wb_value)
wire [`SOCKET_SIZE-1:0] per_core_busy;
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
`SCOPE_IO_SWITCH (`SOCKET_SIZE)
// Generate all cores
for (genvar i = 0; i < `SOCKET_SIZE; ++i) begin
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : cores
`RESET_RELAY (core_reset, reset);
VX_core #(
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + i)
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id),
.INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id))
) core (
`SCOPE_IO_BIND (i)
`SCOPE_IO_BIND (scope_core + core_id)
.clk (clk),
.reset (core_reset),
@ -225,23 +220,21 @@ module VX_socket import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
`endif
.dcr_bus_if (core_dcr_bus_if),
.dcache_bus_if (per_core_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.dcache_bus_if (per_core_dcache_bus_if[core_id * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.icache_bus_if (per_core_icache_bus_if[i]),
.icache_bus_if (per_core_icache_bus_if[core_id]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_core_gbar_bus_if[i]),
.gbar_bus_if (per_core_gbar_bus_if[core_id]),
`endif
.sim_ebreak (per_core_sim_ebreak[i]),
.sim_wb_value (per_core_sim_wb_value[i]),
.busy (per_core_busy[i])
.busy (per_core_busy[core_id])
);
end
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`ifndef VX_TYPES_VH
`define VX_TYPES_VH
// Device configuration registers
// Device configuration registers /////////////////////////////////////////////
`define VX_CSR_ADDR_BITS 12
`define VX_DCR_ADDR_BITS 12
@ -22,24 +22,26 @@
`define VX_DCR_BASE_STATE_BEGIN 12'h001
`define VX_DCR_BASE_STARTUP_ADDR0 12'h001
`define VX_DCR_BASE_STARTUP_ADDR1 12'h002
`define VX_DCR_BASE_MPM_CLASS 12'h003
`define VX_DCR_BASE_STATE_END 12'h004
`define VX_DCR_BASE_STARTUP_ARG0 12'h003
`define VX_DCR_BASE_STARTUP_ARG1 12'h004
`define VX_DCR_BASE_MPM_CLASS 12'h005
`define VX_DCR_BASE_STATE_END 12'h006
`define VX_DCR_BASE_STATE(addr) ((addr) - `VX_DCR_BASE_STATE_BEGIN)
`define VX_DCR_BASE_STATE_COUNT (`VX_DCR_BASE_STATE_END-`VX_DCR_BASE_STATE_BEGIN)
// Machine Performance-monitoring counters classes
// Machine Performance-monitoring counters classes ////////////////////////////
`define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_NONE 0
`define VX_DCR_MPM_CLASS_CORE 1
`define VX_DCR_MPM_CLASS_MEM 2
// User Floating-Point CSRs
// User Floating-Point CSRs ///////////////////////////////////////////////////
`define VX_CSR_FFLAGS 12'h001
`define VX_CSR_FRM 12'h002
`define VX_CSR_FCSR 12'h003
`define VX_CSR_SATP 12'h180
`define VX_CSR_PMPCFG0 12'h3A0
@ -52,7 +54,9 @@
`define VX_CSR_MIE 12'h304
`define VX_CSR_MTVEC 12'h305
`define VX_CSR_MSCRATCH 12'h340
`define VX_CSR_MEPC 12'h341
`define VX_CSR_MCAUSE 12'h342
`define VX_CSR_MNSTATUS 12'h744
@ -61,14 +65,17 @@
`define VX_CSR_MPM_USER 12'hB03
`define VX_CSR_MPM_USER_H 12'hB83
// Machine Performance-monitoring core counters
// PERF: Standard
// Machine Performance-monitoring core counters (Standard) ////////////////////
`define VX_CSR_MCYCLE 12'hB00
`define VX_CSR_MCYCLE_H 12'hB80
`define VX_CSR_MPM_RESERVED 12'hB01
`define VX_CSR_MPM_RESERVED_H 12'hB81
`define VX_CSR_MINSTRET 12'hB02
`define VX_CSR_MINSTRET_H 12'hB82
// Machine Performance-monitoring core counters (class 1) /////////////////////
// PERF: pipeline
`define VX_CSR_MPM_SCHED_ID 12'hB03
`define VX_CSR_MPM_SCHED_ID_H 12'hB83
@ -78,32 +85,34 @@
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_SCRB_ALU 12'hB07
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
`define VX_CSR_MPM_SCRB_FPU 12'hB08
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
`define VX_CSR_MPM_SCRB_LSU 12'hB09
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
`define VX_CSR_MPM_OPDS_ST 12'hB07
`define VX_CSR_MPM_OPDS_ST_H 12'hB87
`define VX_CSR_MPM_SCRB_ALU 12'hB08
`define VX_CSR_MPM_SCRB_ALU_H 12'hB88
`define VX_CSR_MPM_SCRB_FPU 12'hB09
`define VX_CSR_MPM_SCRB_FPU_H 12'hB89
`define VX_CSR_MPM_SCRB_LSU 12'hB0A
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8A
`define VX_CSR_MPM_SCRB_SFU 12'hB0B
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8B
`define VX_CSR_MPM_SCRB_CSRS 12'hB0C
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0B
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
`define VX_CSR_MPM_LOADS 12'hB0C
`define VX_CSR_MPM_LOADS_H 12'hB8C
`define VX_CSR_MPM_STORES 12'hB0D
`define VX_CSR_MPM_STORES_H 12'hB8D
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
`define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
`define VX_CSR_MPM_LOADS 12'hB0F
`define VX_CSR_MPM_LOADS_H 12'hB8F
`define VX_CSR_MPM_STORES 12'hB10
`define VX_CSR_MPM_STORES_H 12'hB90
`define VX_CSR_MPM_IFETCH_LT 12'hB11
`define VX_CSR_MPM_IFETCH_LT_H 12'hB91
`define VX_CSR_MPM_LOAD_LT 12'hB12
`define VX_CSR_MPM_LOAD_LT_H 12'hB92
// Machine Performance-monitoring memory counters (class 2) ///////////////////
// Machine Performance-monitoring memory counters
// PERF: icache
`define VX_CSR_MPM_ICACHE_READS 12'hB03 // total reads
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
@ -157,15 +166,18 @@
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
// PERF: smem
`define VX_CSR_MPM_SMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_SMEM_READS_H 12'hB9B
`define VX_CSR_MPM_SMEM_WRITES 12'hB1C // memory writes
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB9D
// PERF: lmem
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B
`define VX_CSR_MPM_LMEM_WRITES 12'hB1C // memory writes
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
// Machine Information Registers
// Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..B1F, hB83..hB9F>
// Machine Information Registers //////////////////////////////////////////////
`define VX_CSR_MVENDORID 12'hF11
`define VX_CSR_MARCHID 12'hF12
@ -177,11 +189,12 @@
`define VX_CSR_THREAD_ID 12'hCC0
`define VX_CSR_WARP_ID 12'hCC1
`define VX_CSR_CORE_ID 12'hCC2
`define VX_CSR_WARP_MASK 12'hCC3
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM
`define VX_CSR_ACTIVE_WARPS 12'hCC3
`define VX_CSR_ACTIVE_THREADS 12'hCC4 // warning! this value is also used in LLVM
`define VX_CSR_NUM_THREADS 12'hFC0
`define VX_CSR_NUM_WARPS 12'hFC1
`define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
`endif // VX_TYPES_VH

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -29,8 +29,8 @@ module Vortex import VX_gpu_pkg::*; (
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
// Memory response
input wire mem_rsp_valid,
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
@ -44,11 +44,17 @@ module Vortex import VX_gpu_pkg::*; (
output wire busy
);
`ifdef SCOPE
localparam scope_cluster = 0;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS);
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.lmem = 'x;
`endif
VX_mem_bus_if #(
@ -74,12 +80,14 @@ module Vortex import VX_gpu_pkg::*; (
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
.MREQ_SIZE (`L3_MREQ_SIZE),
.MREQ_SIZE (`L3_WRITEBACK ? `L3_MSHR_SIZE : `L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_REG (2),
.MEM_OUT_REG (2),
.WRITEBACK (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.NC_ENABLE (1),
.PASSTHRU (!`L3_ENABLED)
) l3cache (
@ -101,6 +109,7 @@ module Vortex import VX_gpu_pkg::*; (
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
@ -112,15 +121,6 @@ module Vortex import VX_gpu_pkg::*; (
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
wire sim_ebreak /* verilator public */;
wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */;
wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak;
wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value;
assign sim_ebreak = per_cluster_sim_ebreak[0];
assign sim_wb_value = per_cluster_sim_wb_value[0];
`UNUSED_VAR (per_cluster_sim_ebreak)
`UNUSED_VAR (per_cluster_sim_wb_value)
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_wr_valid;
assign dcr_bus_if.write_addr = dcr_wr_addr;
@ -128,19 +128,19 @@ module Vortex import VX_gpu_pkg::*; (
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS)
// Generate all clusters
for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : clusters
`RESET_RELAY (cluster_reset, reset);
VX_dcr_bus_if cluster_dcr_bus_if();
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
VX_cluster #(
.CLUSTER_ID (i)
.CLUSTER_ID (cluster_id),
.INSTANCE_ID ($sformatf("cluster%0d", cluster_id))
) cluster (
`SCOPE_IO_BIND (i)
`SCOPE_IO_BIND (scope_cluster + cluster_id)
.clk (clk),
.reset (cluster_reset),
@ -148,15 +148,12 @@ module Vortex import VX_gpu_pkg::*; (
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
`endif
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[i]),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
.sim_ebreak (per_cluster_sim_ebreak[i]),
.sim_wb_value (per_cluster_sim_wb_value[i]),
.busy (per_cluster_busy[i])
.busy (per_cluster_busy[cluster_id])
);
end
@ -164,14 +161,14 @@ module Vortex import VX_gpu_pkg::*; (
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin
if (reset) begin
perf_mem_pending_reads <= '0;
end else begin
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
end
end
@ -180,7 +177,7 @@ module Vortex import VX_gpu_pkg::*; (
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin
if (reset) begin
if (reset) begin
mem_perf <= '0;
end else begin
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
@ -189,19 +186,19 @@ module Vortex import VX_gpu_pkg::*; (
end
end
assign mem_perf_if.mem = mem_perf;
`endif
`ifdef DBG_TRACE_CORE_MEM
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_req_rw)
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data));
`TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data));
end
end
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,8 +14,8 @@
`include "VX_define.vh"
module Vortex_axi import VX_gpu_pkg::*; #(
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = `XLEN,
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
parameter AXI_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
parameter AXI_NUM_BANKS = 1
)(
@ -25,7 +25,7 @@ module Vortex_axi import VX_gpu_pkg::*; #(
input wire clk,
input wire reset,
// AXI write request address channel
// AXI write request address channel
output wire m_axi_awvalid [AXI_NUM_BANKS],
input wire m_axi_awready [AXI_NUM_BANKS],
output wire [AXI_ADDR_WIDTH-1:0] m_axi_awaddr [AXI_NUM_BANKS],
@ -39,19 +39,19 @@ module Vortex_axi import VX_gpu_pkg::*; #(
output wire [3:0] m_axi_awqos [AXI_NUM_BANKS],
output wire [3:0] m_axi_awregion [AXI_NUM_BANKS],
// AXI write request data channel
output wire m_axi_wvalid [AXI_NUM_BANKS],
// AXI write request data channel
output wire m_axi_wvalid [AXI_NUM_BANKS],
input wire m_axi_wready [AXI_NUM_BANKS],
output wire [AXI_DATA_WIDTH-1:0] m_axi_wdata [AXI_NUM_BANKS],
output wire [AXI_DATA_WIDTH/8-1:0] m_axi_wstrb [AXI_NUM_BANKS],
output wire m_axi_wlast [AXI_NUM_BANKS],
output wire [AXI_DATA_WIDTH/8-1:0] m_axi_wstrb [AXI_NUM_BANKS],
output wire m_axi_wlast [AXI_NUM_BANKS],
// AXI write response channel
input wire m_axi_bvalid [AXI_NUM_BANKS],
output wire m_axi_bready [AXI_NUM_BANKS],
input wire [AXI_TID_WIDTH-1:0] m_axi_bid [AXI_NUM_BANKS],
input wire [1:0] m_axi_bresp [AXI_NUM_BANKS],
// AXI read request channel
output wire m_axi_arvalid [AXI_NUM_BANKS],
input wire m_axi_arready [AXI_NUM_BANKS],
@ -59,13 +59,13 @@ module Vortex_axi import VX_gpu_pkg::*; #(
output wire [AXI_TID_WIDTH-1:0] m_axi_arid [AXI_NUM_BANKS],
output wire [7:0] m_axi_arlen [AXI_NUM_BANKS],
output wire [2:0] m_axi_arsize [AXI_NUM_BANKS],
output wire [1:0] m_axi_arburst [AXI_NUM_BANKS],
output wire [1:0] m_axi_arlock [AXI_NUM_BANKS],
output wire [1:0] m_axi_arburst [AXI_NUM_BANKS],
output wire [1:0] m_axi_arlock [AXI_NUM_BANKS],
output wire [3:0] m_axi_arcache [AXI_NUM_BANKS],
output wire [2:0] m_axi_arprot [AXI_NUM_BANKS],
output wire [3:0] m_axi_arqos [AXI_NUM_BANKS],
output wire [2:0] m_axi_arprot [AXI_NUM_BANKS],
output wire [3:0] m_axi_arqos [AXI_NUM_BANKS],
output wire [3:0] m_axi_arregion [AXI_NUM_BANKS],
// AXI read response channel
input wire m_axi_rvalid [AXI_NUM_BANKS],
output wire m_axi_rready [AXI_NUM_BANKS],
@ -73,7 +73,7 @@ module Vortex_axi import VX_gpu_pkg::*; #(
input wire m_axi_rlast [AXI_NUM_BANKS],
input wire [AXI_TID_WIDTH-1:0] m_axi_rid [AXI_NUM_BANKS],
input wire [1:0] m_axi_rresp [AXI_NUM_BANKS],
// DCR write request
input wire dcr_wr_valid,
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
@ -83,35 +83,35 @@ module Vortex_axi import VX_gpu_pkg::*; #(
output wire busy
);
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `XLEN), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH))
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `MEM_ADDR_WIDTH), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH))
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
wire mem_req_valid;
wire mem_req_rw;
wire mem_req_rw;
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen;
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire mem_rsp_valid;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready;
wire [`XLEN-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
wire [`XLEN-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
assign m_axi_awaddr[i] = `XLEN'(m_axi_awaddr_unqual[i]);
assign m_axi_araddr[i] = `XLEN'(m_axi_araddr_unqual[i]);
assign m_axi_awaddr[i] = `MEM_ADDR_WIDTH'(m_axi_awaddr_unqual[i]);
assign m_axi_araddr[i] = `MEM_ADDR_WIDTH'(m_axi_araddr_unqual[i]);
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
@ -120,11 +120,11 @@ module Vortex_axi import VX_gpu_pkg::*; #(
end
VX_axi_adapter #(
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.ADDR_WIDTH (`XLEN),
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.ADDR_WIDTH (`MEM_ADDR_WIDTH),
.TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.NUM_BANKS (AXI_NUM_BANKS),
.OUT_REG_RSP((AXI_NUM_BANKS > 1) ? 2 : 0)
.RSP_OUT_BUF((AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
@ -141,18 +141,18 @@ module Vortex_axi import VX_gpu_pkg::*; #(
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr_unqual),
.m_axi_awid (m_axi_awid_unqual),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
@ -160,35 +160,35 @@ module Vortex_axi import VX_gpu_pkg::*; #(
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid_unqual),
.m_axi_bresp (m_axi_bresp),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr_unqual),
.m_axi_arid (m_axi_arid_unqual),
.m_axi_arid (m_axi_arid_unqual),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arregion (m_axi_arregion),
.m_axi_arregion (m_axi_arregion),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast) ,
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast) ,
.m_axi_rid (m_axi_rid_unqual),
.m_axi_rresp (m_axi_rresp)
);
`SCOPE_IO_SWITCH (1)
Vortex vortex (
`SCOPE_IO_BIND (0)

View file

@ -5,6 +5,7 @@
// To be done:
// Check how to run this with OPAE. Looks like setup issue
`ifndef NOPAE
`include "platform_if.vh"
@ -85,7 +86,7 @@ module ccip_std_afu #(
t_local_mem_data avs_writedata [NUM_LOCAL_MEM_BANKS];
t_local_mem_addr avs_address [NUM_LOCAL_MEM_BANKS];
logic avs_write [NUM_LOCAL_MEM_BANKS];
logic avs_read [NUM_LOCAL_MEM_BANKS];
logic avs_read [NUM_LOCAL_MEM_BANKS];
for (genvar b = 0; b < NUM_LOCAL_MEM_BANKS; b++) begin
assign local_mem[b].burstcount = avs_burstcount[b];
@ -94,7 +95,7 @@ module ccip_std_afu #(
assign local_mem[b].byteenable = avs_byteenable[b];
assign local_mem[b].write = avs_write[b];
assign local_mem[b].read = avs_read[b];
assign avs_waitrequest[b] = local_mem[b].waitrequest;
assign avs_readdata[b] = local_mem[b].readdata;
assign avs_readdatavalid[b] = local_mem[b].readdatavalid;
@ -107,7 +108,7 @@ module ccip_std_afu #(
.reset (reset_T1),
.cp2af_sRxPort (cp2af_sRx_T1),
.af2cp_sTxPort (af2cp_sTx_T0),
.af2cp_sTxPort (af2cp_sTx_T0),
.avs_writedata (avs_writedata),
.avs_readdata (avs_readdata),
@ -121,3 +122,5 @@ module ccip_std_afu #(
);
endmodule
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -62,7 +62,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
localparam CCI_RW_PENDING_SIZE= 256;
localparam AFU_ID_L = 16'h0002; // AFU ID Lower
localparam AFU_ID_H = 16'h0004; // AFU ID Higher
localparam AFU_ID_H = 16'h0004; // AFU ID Higher
localparam CMD_MEM_READ = `AFU_IMAGE_CMD_MEM_READ;
localparam CMD_MEM_WRITE = `AFU_IMAGE_CMD_MEM_WRITE;
@ -70,15 +70,15 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
localparam CMD_RUN = `AFU_IMAGE_CMD_RUN;
localparam CMD_TYPE_WIDTH = `CLOG2(`AFU_IMAGE_CMD_MAX_VALUE+1);
localparam MMIO_CMD_TYPE = `AFU_IMAGE_MMIO_CMD_TYPE;
localparam MMIO_CMD_TYPE = `AFU_IMAGE_MMIO_CMD_TYPE;
localparam MMIO_CMD_ARG0 = `AFU_IMAGE_MMIO_CMD_ARG0;
localparam MMIO_CMD_ARG1 = `AFU_IMAGE_MMIO_CMD_ARG1;
localparam MMIO_CMD_ARG2 = `AFU_IMAGE_MMIO_CMD_ARG2;
localparam MMIO_STATUS = `AFU_IMAGE_MMIO_STATUS;
localparam COUT_TID_WIDTH = `CLOG2(`VX_MEM_BYTEEN_WIDTH);
localparam COUT_TID_WIDTH = `CLOG2(`VX_MEM_BYTEEN_WIDTH);
localparam COUT_QUEUE_DATAW = COUT_TID_WIDTH + 8;
localparam COUT_QUEUE_SIZE = 64;
localparam COUT_QUEUE_SIZE = 64;
localparam MMIO_DEV_CAPS = `AFU_IMAGE_MMIO_DEV_CAPS;
localparam MMIO_ISA_CAPS = `AFU_IMAGE_MMIO_ISA_CAPS;
@ -97,14 +97,14 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [127:0] afu_id = `AFU_ACCEL_UUID;
wire [63:0] dev_caps = {16'b0,
8'(`SM_ENABLED ? `SMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
8'(`NUM_THREADS),
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
8'(`NUM_THREADS),
8'(`IMPLEMENTATION_ID)};
wire [63:0] isa_caps = {32'(`MISA_EXT),
2'(`CLOG2(`XLEN)-4),
wire [63:0] isa_caps = {32'(`MISA_EXT),
2'(`CLOG2(`XLEN)-4),
30'(`MISA_STD)};
reg [STATE_WIDTH-1:0] state;
@ -161,7 +161,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
reg scope_bus_in;
wire scope_bus_out;
reg [5:0] scope_bus_ctr;
wire scope_reset = reset;
@ -177,8 +177,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
scope_bus_ctr <= 63;
end
scope_bus_in <= 0;
if (cp2af_sRxPort.c0.mmioWrValid
&& (MMIO_SCOPE_WRITE == mmio_hdr.address)) begin
if (cp2af_sRxPort.c0.mmioWrValid
&& (MMIO_SCOPE_WRITE == mmio_hdr.address)) begin
cmd_scope_wdata <= 64'(cp2af_sRxPort.c0.data);
cmd_scope_writing <= 1;
scope_bus_ctr <= 63;
@ -191,7 +191,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
if (scope_bus_ctr == 0) begin
cmd_scope_writing <= 0;
end
end
end
if (cmd_scope_reading) begin
cmd_scope_rdata <= {cmd_scope_rdata[62:0], scope_bus_out};
scope_bus_ctr <= scope_bus_ctr - 1;
@ -211,7 +211,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
// disable assertions until full reset
reg [`CLOG2(`RESET_DELAY+1)-1:0] assert_delay_ctr;
initial begin
$assertoff;
$assertoff;
end
always @(posedge clk) begin
if (reset) begin
@ -231,22 +231,22 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
mmio_tx.mmioRdValid <= 0;
mmio_tx.hdr <= '0;
end else begin
mmio_tx.mmioRdValid <= cp2af_sRxPort.c0.mmioRdValid;
mmio_tx.hdr.tid <= mmio_hdr.tid;
mmio_tx.mmioRdValid <= cp2af_sRxPort.c0.mmioRdValid;
mmio_tx.hdr.tid <= mmio_hdr.tid;
end
// serve MMIO write request
if (cp2af_sRxPort.c0.mmioWrValid) begin
case (mmio_hdr.address)
MMIO_CMD_ARG0: begin
cmd_args[0] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG0: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
MMIO_CMD_ARG1: begin
cmd_args[1] <= 64'(cp2af_sRxPort.c0.data);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%0h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`TRACE(2, ("%d: MMIO_CMD_ARG1: data=0x%h\n", $time, 64'(cp2af_sRxPort.c0.data)));
`endif
end
MMIO_CMD_ARG2: begin
@ -263,13 +263,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
`ifdef SCOPE
MMIO_SCOPE_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%0h\n", $time, cmd_scope_wdata));
`TRACE(2, ("%d: MMIO_SCOPE_WRITE: data=0x%h\n", $time, cmd_scope_wdata));
`endif
end
`endif
default: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)));
`TRACE(2, ("%d: Unknown MMIO Wr: addr=0x%0h, data=0x%h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data)));
`endif
end
endcase
@ -284,7 +284,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
8'b0, // reserved
4'b0, // afu minor revision = 0
7'b0, // reserved
1'b1, // end of DFH list = 1
1'b1, // end of DFH list = 1
24'b0, // next DFH offset = 0
4'b0, // afu major revision = 0
12'b0 // feature ID = 0
@ -305,16 +305,16 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
MMIO_SCOPE_READ: begin
mmio_tx.data <= cmd_scope_rdata;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%0h\n", $time, cmd_scope_rdata));
`TRACE(2, ("%d: MMIO_SCOPE_READ: data=0x%h\n", $time, cmd_scope_rdata));
`endif
end
`endif
MMIO_DEV_CAPS: begin
mmio_tx.data <= dev_caps;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%0h\n", $time, dev_caps));
`TRACE(2, ("%d: MMIO_DEV_CAPS: data=0x%h\n", $time, dev_caps));
`endif
end
end
MMIO_ISA_CAPS: begin
mmio_tx.data <= isa_caps;
`ifdef DBG_TRACE_AFU
@ -352,41 +352,41 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
end
wire is_mmio_wr_cmd = cp2af_sRxPort.c0.mmioWrValid && (MMIO_CMD_TYPE == mmio_hdr.address);
wire [CMD_TYPE_WIDTH-1:0] cmd_type = is_mmio_wr_cmd ?
wire [CMD_TYPE_WIDTH-1:0] cmd_type = is_mmio_wr_cmd ?
CMD_TYPE_WIDTH'(cp2af_sRxPort.c0.data) : CMD_TYPE_WIDTH'(0);
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
vx_busy_wait <= 0;
vx_running <= 0;
vx_running <= 0;
end else begin
case (state)
STATE_IDLE: begin
STATE_IDLE: begin
case (cmd_type)
CMD_MEM_READ: begin
CMD_MEM_READ: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE MEM_READ: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size));
`endif
state <= STATE_MEM_READ;
end
CMD_MEM_WRITE: begin
state <= STATE_MEM_READ;
end
CMD_MEM_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE MEM_WRITE: ia=0x%0h addr=0x%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size));
`endif
state <= STATE_MEM_WRITE;
end
CMD_DCR_WRITE: begin
CMD_DCR_WRITE: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE DCR_WRITE: addr=0x%0h data=%0d\n", $time, cmd_dcr_addr, cmd_dcr_data));
`endif
state <= STATE_DCR_WRITE;
end
CMD_RUN: begin
CMD_RUN: begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE RUN\n", $time));
`endif
state <= STATE_RUN;
`endif
state <= STATE_RUN;
vx_running <= 0;
end
default: begin
@ -425,7 +425,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
end
end else begin
// wait until the gpu is not busy
if (~vx_busy) begin
if (~vx_busy) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: End execution\n", $time));
@ -441,8 +441,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
`endif
vx_running <= 1;
vx_busy_wait <= 1;
end
end
end
end
end
default:;
endcase
@ -462,7 +462,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_tag;
wire cci_mem_req_ready;
wire cci_mem_rsp_valid;
wire cci_mem_rsp_valid;
wire [CCI_DATA_WIDTH-1:0] cci_mem_rsp_data;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_rsp_tag;
wire cci_mem_rsp_ready;
@ -475,30 +475,32 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.TAG_WIDTH (AVS_REQ_TAGW)
) cci_vx_mem_bus_if[2]();
`RESET_RELAY (cci_adapter_reset, reset);
VX_mem_adapter #(
.SRC_DATA_WIDTH (CCI_DATA_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH),
.SRC_ADDR_WIDTH (CCI_ADDR_WIDTH),
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.SRC_DATA_WIDTH (CCI_DATA_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH),
.SRC_ADDR_WIDTH (CCI_ADDR_WIDTH),
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.SRC_TAG_WIDTH (CCI_ADDR_WIDTH),
.DST_TAG_WIDTH (AVS_REQ_TAGW),
.OUT_REG_REQ (0),
.OUT_REG_RSP (0)
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) cci_mem_adapter (
.clk (clk),
.reset (reset),
.reset (cci_adapter_reset),
.mem_req_valid_in (cci_mem_req_valid),
.mem_req_addr_in (cci_mem_req_addr),
.mem_req_rw_in (cci_mem_req_rw),
.mem_req_byteen_in ({CCI_DATA_SIZE{1'b1}}),
.mem_req_data_in (cci_mem_req_data),
.mem_req_tag_in (cci_mem_req_tag),
.mem_req_ready_in (cci_mem_req_ready),
.mem_req_tag_in (cci_mem_req_tag),
.mem_req_ready_in (cci_mem_req_ready),
.mem_rsp_valid_in (cci_mem_rsp_valid),
.mem_rsp_data_in (cci_mem_rsp_data),
.mem_rsp_tag_in (cci_mem_rsp_tag),
.mem_rsp_valid_in (cci_mem_rsp_valid),
.mem_rsp_data_in (cci_mem_rsp_data),
.mem_rsp_tag_in (cci_mem_rsp_tag),
.mem_rsp_ready_in (cci_mem_rsp_ready),
.mem_req_valid_out (cci_vx_mem_bus_if[1].req_valid),
@ -507,14 +509,17 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_req_byteen_out (cci_vx_mem_bus_if[1].req_data.byteen),
.mem_req_data_out (cci_vx_mem_bus_if[1].req_data.data),
.mem_req_tag_out (cci_vx_mem_bus_if[1].req_data.tag),
.mem_req_ready_out (cci_vx_mem_bus_if[1].req_ready),
.mem_req_ready_out (cci_vx_mem_bus_if[1].req_ready),
.mem_rsp_valid_out (cci_vx_mem_bus_if[1].rsp_valid),
.mem_rsp_data_out (cci_vx_mem_bus_if[1].rsp_data.data),
.mem_rsp_tag_out (cci_vx_mem_bus_if[1].rsp_data.tag),
.mem_rsp_valid_out (cci_vx_mem_bus_if[1].rsp_valid),
.mem_rsp_data_out (cci_vx_mem_bus_if[1].rsp_data.data),
.mem_rsp_tag_out (cci_vx_mem_bus_if[1].rsp_data.tag),
.mem_rsp_ready_out (cci_vx_mem_bus_if[1].rsp_ready)
);
assign cci_vx_mem_bus_if[1].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[1].req_data.atype)
//--
wire vx_mem_is_cout;
@ -523,30 +528,32 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
assign vx_mem_req_valid_qual = vx_mem_req_valid && ~vx_mem_is_cout;
`RESET_RELAY (vx_adapter_reset, reset);
VX_mem_adapter #(
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH),
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.DST_DATA_WIDTH (LMEM_DATA_WIDTH),
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.DST_ADDR_WIDTH (LMEM_ADDR_WIDTH),
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.DST_TAG_WIDTH (AVS_REQ_TAGW),
.OUT_REG_REQ (0),
.OUT_REG_RSP (2)
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (2)
) vx_mem_adapter (
.clk (clk),
.reset (reset),
.reset (vx_adapter_reset),
.mem_req_valid_in (vx_mem_req_valid_qual),
.mem_req_addr_in (vx_mem_req_addr),
.mem_req_rw_in (vx_mem_req_rw),
.mem_req_byteen_in (vx_mem_req_byteen),
.mem_req_data_in (vx_mem_req_data),
.mem_req_tag_in (vx_mem_req_tag),
.mem_req_ready_in (vx_mem_req_ready_qual),
.mem_req_tag_in (vx_mem_req_tag),
.mem_req_ready_in (vx_mem_req_ready_qual),
.mem_rsp_valid_in (vx_mem_rsp_valid),
.mem_rsp_data_in (vx_mem_rsp_data),
.mem_rsp_tag_in (vx_mem_rsp_tag),
.mem_rsp_valid_in (vx_mem_rsp_valid),
.mem_rsp_data_in (vx_mem_rsp_data),
.mem_rsp_tag_in (vx_mem_rsp_tag),
.mem_rsp_ready_in (vx_mem_rsp_ready),
.mem_req_valid_out (cci_vx_mem_bus_if[0].req_valid),
@ -555,14 +562,17 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_req_byteen_out (cci_vx_mem_bus_if[0].req_data.byteen),
.mem_req_data_out (cci_vx_mem_bus_if[0].req_data.data),
.mem_req_tag_out (cci_vx_mem_bus_if[0].req_data.tag),
.mem_req_ready_out (cci_vx_mem_bus_if[0].req_ready),
.mem_req_ready_out (cci_vx_mem_bus_if[0].req_ready),
.mem_rsp_valid_out (cci_vx_mem_bus_if[0].rsp_valid),
.mem_rsp_data_out (cci_vx_mem_bus_if[0].rsp_data.data),
.mem_rsp_tag_out (cci_vx_mem_bus_if[0].rsp_data.tag),
.mem_rsp_valid_out (cci_vx_mem_bus_if[0].rsp_valid),
.mem_rsp_data_out (cci_vx_mem_bus_if[0].rsp_data.data),
.mem_rsp_tag_out (cci_vx_mem_bus_if[0].rsp_data.tag),
.mem_rsp_ready_out (cci_vx_mem_bus_if[0].rsp_ready)
);
assign cci_vx_mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (cci_vx_mem_bus_if[0].req_data.atype)
//--
VX_mem_bus_if #(
.DATA_SIZE (LMEM_DATA_SIZE),
@ -570,19 +580,17 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.TAG_WIDTH (AVS_REQ_TAGW+1)
) mem_bus_if[1]();
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (LMEM_DATA_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.TAG_WIDTH (AVS_REQ_TAGW),
.ARBITER ("P"),
.OUT_REG_REQ (0),
.OUT_REG_RSP (0)
.ARBITER ("P"), // prioritize VX requests
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.bus_in_if (cci_vx_mem_bus_if),
.bus_out_if (mem_bus_if)
);
@ -592,19 +600,19 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
`RESET_RELAY (avs_adapter_reset, reset);
VX_avs_adapter #(
.DATA_WIDTH (LMEM_DATA_WIDTH),
.DATA_WIDTH (LMEM_DATA_WIDTH),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.BURST_WIDTH (LMEM_BURST_CTRW),
.NUM_BANKS (NUM_LOCAL_MEM_BANKS),
.TAG_WIDTH (AVS_REQ_TAGW + 1),
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE),
.OUT_REG_REQ (2),
.OUT_REG_RSP (0)
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (0)
) avs_adapter (
.clk (clk),
.reset (avs_adapter_reset),
// Memory request
// Memory request
.mem_req_valid (mem_bus_if[0].req_valid),
.mem_req_rw (mem_bus_if[0].req_data.rw),
.mem_req_byteen (mem_bus_if[0].req_data.byteen),
@ -613,7 +621,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_req_tag (mem_bus_if[0].req_data.tag),
.mem_req_ready (mem_bus_if[0].req_ready),
// Memory response
// Memory response
.mem_rsp_valid (mem_bus_if[0].rsp_valid),
.mem_rsp_data (mem_bus_if[0].rsp_data.data),
.mem_rsp_tag (mem_bus_if[0].rsp_data.tag),
@ -631,6 +639,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.avs_readdatavalid(avs_readdatavalid)
);
assign mem_bus_if[0].req_data.atype = '0;
`UNUSED_VAR (mem_bus_if[0].req_data.atype)
// CCI-P Read Request ///////////////////////////////////////////////////////////
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr;
@ -654,13 +665,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
always @(*) begin
af2cp_sTxPort.c0.valid = cci_rd_req_fire;
af2cp_sTxPort.c0.hdr = t_ccip_c0_ReqMemHdr'(0);
af2cp_sTxPort.c0.hdr.address = cci_rd_req_addr;
af2cp_sTxPort.c0.hdr.address = cci_rd_req_addr;
af2cp_sTxPort.c0.hdr.mdata = t_ccip_mdata'(cci_rd_req_tag);
end
wire cci_mem_wr_req_fire = cci_mem_wr_req_valid && cci_mem_req_ready;
wire cci_rd_rsp_fire = cp2af_sRxPort.c0.rspValid
wire cci_rd_rsp_fire = cp2af_sRxPort.c0.rspValid
&& (cp2af_sRxPort.c0.hdr.resp_type == eRSP_RDLINE);
assign cci_rd_req_tag = CCI_RD_QUEUE_TAGW'(cci_rd_req_ctr);
@ -672,16 +683,18 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire [`CLOG2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads;
wire cci_pending_reads_full;
VX_pending_size #(
VX_pending_size #(
.SIZE (CCI_RD_QUEUE_SIZE)
) cci_rd_pending_size (
.clk (clk),
.reset (reset),
.incr (cci_rd_req_fire),
.decr (cci_rdq_pop),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_reads_full),
.size (cci_pending_reads),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
.size (cci_pending_reads)
);
`UNUSED_VAR (cci_pending_reads)
@ -699,29 +712,29 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
if (reset) begin
cci_rd_req_valid <= 0;
cci_rd_req_wait <= 0;
end else begin
if ((STATE_IDLE == state)
end else begin
if ((STATE_IDLE == state)
&& (CMD_MEM_WRITE == cmd_type)) begin
cci_rd_req_valid <= (cmd_data_size != 0);
cci_rd_req_wait <= 0;
end
cci_rd_req_valid <= (STATE_MEM_WRITE == state)
cci_rd_req_valid <= (STATE_MEM_WRITE == state)
&& (cci_rd_req_ctr_next != cmd_data_size)
&& !cp2af_sRxPort.c0TxAlmFull;
&& !cp2af_sRxPort.c0TxAlmFull;
if (cci_rd_req_fire
if (cci_rd_req_fire
&& (cci_rd_req_tag == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
cci_rd_req_wait <= 1; // end current request batch
end
if (cci_rd_rsp_fire
if (cci_rd_rsp_fire
&& (cci_rd_rsp_ctr == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
cci_rd_req_wait <= 0; // begin new request batch
end
end
end
if ((STATE_IDLE == state)
if ((STATE_IDLE == state)
&& (CMD_MEM_WRITE == cmd_type)) begin
cci_rd_req_addr <= cmd_io_addr;
cci_rd_req_ctr <= '0;
@ -731,7 +744,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cmd_mem_wr_done <= 0;
end
if (cci_rd_req_fire) begin
if (cci_rd_req_fire) begin
cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + $bits(cci_rd_req_ctr)'(1);
`ifdef DBG_TRACE_AFU
@ -745,9 +758,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data));
`TRACE(2, ("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=0x%h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data));
`endif
end
end
if (cci_rdq_pop) begin
`ifdef DBG_TRACE_AFU
@ -755,7 +768,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
`endif
end
if (cci_mem_wr_req_fire) begin
if (cci_mem_wr_req_fire) begin
cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1);
if (cci_mem_wr_req_ctr == (cmd_data_size-1)) begin
cmd_mem_wr_done <= 1;
@ -763,14 +776,12 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
end
end
`RESET_RELAY (cci_rdq_reset, reset);
VX_fifo_queue #(
.DATAW (CCI_RD_QUEUE_DATAW),
.DEPTH (CCI_RD_QUEUE_SIZE)
) cci_rd_req_queue (
.clk (clk),
.reset (cci_rdq_reset),
.reset (reset),
.push (cci_rdq_push),
.pop (cci_rdq_pop),
.data_in (cci_rdq_din),
@ -788,13 +799,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
if (reset) begin
dbg_cci_rd_rsp_mask <= '0;
end else begin
if (cci_rd_rsp_fire) begin
if (cci_rd_rsp_fire) begin
if (cci_rd_rsp_ctr == 0) begin
dbg_cci_rd_rsp_mask <= (CCI_RD_WINDOW_SIZE'(1) << cci_rd_rsp_tag);
end else begin
dbg_cci_rd_rsp_mask <= (CCI_RD_WINDOW_SIZE'(1) << cci_rd_rsp_tag);
end else begin
assert(!dbg_cci_rd_rsp_mask[cci_rd_rsp_tag]);
dbg_cci_rd_rsp_mask[cci_rd_rsp_tag] <= 1;
end
end
end
end
end
@ -817,21 +828,21 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
af2cp_sTxPort.c1.hdr = t_ccip_c1_ReqMemHdr'(0);
af2cp_sTxPort.c1.hdr.sop = 1; // single line write mode
af2cp_sTxPort.c1.hdr.address = cci_wr_req_addr;
af2cp_sTxPort.c1.data = cci_wr_req_data;
end
af2cp_sTxPort.c1.data = cci_wr_req_data;
end
wire cci_mem_rd_req_fire = cci_mem_rd_req_valid && cci_mem_req_ready;
wire cci_mem_rd_rsp_fire = cci_mem_rsp_valid && cci_mem_rsp_ready;
wire cci_wr_rsp_fire = (STATE_MEM_READ == state)
&& cp2af_sRxPort.c1.rspValid
wire cci_wr_rsp_fire = (STATE_MEM_READ == state)
&& cp2af_sRxPort.c1.rspValid
&& (cp2af_sRxPort.c1.hdr.resp_type == eRSP_WRLINE);
wire [`CLOG2(CCI_RW_PENDING_SIZE+1)-1:0] cci_pending_writes;
wire cci_pending_writes_empty;
wire cci_pending_writes_full;
VX_pending_size #(
VX_pending_size #(
.SIZE (CCI_RW_PENDING_SIZE)
) cci_wr_pending_size (
.clk (clk),
@ -839,16 +850,18 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.incr (cci_mem_rd_rsp_fire),
.decr (cci_wr_rsp_fire),
.empty (cci_pending_writes_empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_writes_full),
`UNUSED_PIN (alm_full),
.size (cci_pending_writes)
);
`UNUSED_VAR (cci_pending_writes)
assign cci_mem_rd_req_valid = (STATE_MEM_READ == state)
assign cci_mem_rd_req_valid = (STATE_MEM_READ == state)
&& ~cci_mem_rd_req_done;
assign cci_mem_rsp_ready = ~cp2af_sRxPort.c1TxAlmFull
assign cci_mem_rsp_ready = ~cp2af_sRxPort.c1TxAlmFull
&& ~cci_pending_writes_full;
assign cmd_mem_rd_done = cci_wr_req_done
@ -861,42 +874,42 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
end else begin
cci_wr_req_fire <= cci_mem_rd_rsp_fire;
end
if ((STATE_IDLE == state)
if ((STATE_IDLE == state)
&& (CMD_MEM_READ == cmd_type)) begin
cci_mem_rd_req_ctr <= '0;
cci_mem_rd_req_addr <= cmd_mem_addr;
cci_mem_rd_req_done <= 0;
cci_wr_req_ctr <= cmd_data_size;
cci_wr_req_done <= 0;
end
end
if (cci_mem_rd_req_fire) begin
cci_mem_rd_req_addr <= cci_mem_rd_req_addr + CCI_ADDR_WIDTH'(1);
cci_mem_rd_req_addr <= cci_mem_rd_req_addr + CCI_ADDR_WIDTH'(1);
cci_mem_rd_req_ctr <= cci_mem_rd_req_ctr + CCI_ADDR_WIDTH'(1);
if (cci_mem_rd_req_ctr == (cmd_data_size-1)) begin
cci_mem_rd_req_done <= 1;
end
end
end
cci_wr_req_addr <= cmd_io_addr + t_ccip_clAddr'(cci_mem_rsp_tag);
cci_wr_req_data <= t_ccip_clData'(cci_mem_rsp_data);
cci_wr_req_data <= t_ccip_clData'(cci_mem_rsp_data);
if (cci_wr_req_fire) begin
`ASSERT(cci_wr_req_ctr != 0, ("runtime error"));
`ASSERT(cci_wr_req_ctr != 0, ("runtime error"));
cci_wr_req_ctr <= cci_wr_req_ctr - CCI_ADDR_WIDTH'(1);
if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin
cci_wr_req_done <= 1;
end
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data));
`TRACE(2, ("%d: CCI Wr Req: addr=0x%0h, rem=%0d, pending=%0d, data=0x%h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data));
`endif
end
if (cci_wr_rsp_fire) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes));
`endif
if (cci_wr_rsp_fire) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes));
`endif
end
end
@ -916,7 +929,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire vx_dcr_wr_valid = (STATE_DCR_WRITE == state);
wire [`VX_DCR_ADDR_WIDTH-1:0] vx_dcr_wr_addr = cmd_dcr_addr;
wire [`VX_DCR_DATA_WIDTH-1:0] vx_dcr_wr_data = cmd_dcr_data;
`SCOPE_IO_SWITCH (2)
Vortex vortex (
@ -925,7 +938,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.clk (clk),
.reset (reset || ~vx_running),
// Memory request
// Memory request
.mem_req_valid (vx_mem_req_valid),
.mem_req_rw (vx_mem_req_rw),
.mem_req_byteen (vx_mem_req_byteen),
@ -934,7 +947,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.mem_req_tag (vx_mem_req_tag),
.mem_req_ready (vx_mem_req_ready),
// Memory response
// Memory response
.mem_rsp_valid (vx_mem_rsp_valid),
.mem_rsp_data (vx_mem_rsp_data),
.mem_rsp_tag (vx_mem_rsp_tag),
@ -944,7 +957,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.dcr_wr_valid (vx_dcr_wr_valid),
.dcr_wr_addr (vx_dcr_wr_addr),
.dcr_wr_data (vx_dcr_wr_data),
// Status
.busy (vx_busy)
);
@ -973,7 +986,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
wire cout_q_push = vx_mem_req_valid && vx_mem_is_cout && ~cout_q_full;
wire cout_q_pop = cp2af_sRxPort.c0.mmioRdValid
wire cout_q_pop = cp2af_sRxPort.c0.mmioRdValid
&& (mmio_hdr.address == MMIO_STATUS)
&& ~cout_q_empty;
@ -997,7 +1010,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU
`ifdef SCOPE
wire mem_req_fire = mem_bus_if[0].req_valid && mem_bus_if[0].req_ready;
wire mem_rsp_fire = mem_bus_if[0].rsp_valid && mem_bus_if[0].rsp_ready;
wire avs_write_fire = avs_write[0] && ~avs_waitrequest[0];
@ -1022,26 +1034,26 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.triggers({
reset,
state_changed,
mem_req_fire,
mem_rsp_fire,
avs_write_fire,
avs_read_fire,
avs_waitrequest[0],
avs_readdatavalid[0],
cp2af_sRxPort.c0.mmioRdValid,
cp2af_sRxPort.c0.mmioWrValid,
cp2af_sRxPort.c0.rspValid,
cp2af_sRxPort.c1.rspValid,
af2cp_sTxPort.c0.valid,
af2cp_sTxPort.c1.valid,
cp2af_sRxPort.c0TxAlmFull,
cp2af_sRxPort.c1TxAlmFull,
af2cp_sTxPort.c2.mmioRdValid,
cci_wr_req_fire,
cci_wr_rsp_fire,
cci_rd_req_fire,
mem_req_fire,
mem_rsp_fire,
avs_write_fire,
avs_read_fire,
avs_waitrequest[0],
avs_readdatavalid[0],
cp2af_sRxPort.c0.mmioRdValid,
cp2af_sRxPort.c0.mmioWrValid,
cp2af_sRxPort.c0.rspValid,
cp2af_sRxPort.c1.rspValid,
af2cp_sTxPort.c0.valid,
af2cp_sTxPort.c1.valid,
cp2af_sRxPort.c0TxAlmFull,
cp2af_sRxPort.c1TxAlmFull,
af2cp_sTxPort.c2.mmioRdValid,
cci_wr_req_fire,
cci_wr_rsp_fire,
cci_rd_req_fire,
cci_rd_rsp_fire,
cci_pending_reads_full,
cci_pending_reads_full,
cci_pending_writes_empty,
cci_pending_writes_full
}),
@ -1067,7 +1079,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.bus_in(scope_bus_in_w[0]),
.bus_out(scope_bus_out_w[0])
);
`endif
`else
`SCOPE_IO_UNUSED_W(0)
`endif
@ -1078,13 +1089,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
always @(posedge clk) begin
for (integer i = 0; i < NUM_LOCAL_MEM_BANKS; ++i) begin
if (avs_write[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i]));
`TRACE(2, ("%d: AVS Wr Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h, data=0x%h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i], avs_writedata[i]));
end
if (avs_read[i] && ~avs_waitrequest[i]) begin
if (avs_read[i] && ~avs_waitrequest[i]) begin
`TRACE(2, ("%d: AVS Rd Req [%0d]: addr=0x%0h, byteen=0x%0h, burst=0x%0h\n", $time, i, `TO_FULL_ADDR(avs_address[i]), avs_byteenable[i], avs_burstcount[i]));
end
end
if (avs_readdatavalid[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h\n", $time, i, avs_readdata[i]));
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h\n", $time, i, avs_readdata[i]));
end
end
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -22,34 +22,34 @@ module VX_afu_ctrl #(
input wire clk,
input wire reset,
input wire clk_en,
input wire s_axi_awvalid,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
output wire s_axi_awready,
input wire s_axi_wvalid,
input wire [AXI_DATA_WIDTH-1:0] s_axi_wdata,
input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
output wire s_axi_wready,
output wire s_axi_bvalid,
output wire [1:0] s_axi_bresp,
output wire [1:0] s_axi_bresp,
input wire s_axi_bready,
input wire s_axi_arvalid,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
output wire s_axi_arready,
output wire s_axi_rvalid,
output wire [AXI_DATA_WIDTH-1:0] s_axi_rdata,
output wire [1:0] s_axi_rresp,
input wire s_axi_rready,
output wire [1:0] s_axi_rresp,
input wire s_axi_rready,
output wire ap_reset,
output wire ap_start,
input wire ap_done,
input wire ap_ready,
input wire ap_idle,
input wire ap_idle,
output wire interrupt,
`ifdef SCOPE
@ -101,7 +101,7 @@ module VX_afu_ctrl #(
// 0x48 : Control signal of MEM
// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
// Parameters
// Parameters
localparam
ADDR_AP_CTRL = 8'h00,
ADDR_GIE = 8'h04,
@ -110,45 +110,47 @@ module VX_afu_ctrl #(
ADDR_DEV_0 = 8'h10,
ADDR_DEV_1 = 8'h14,
ADDR_DEV_CTRL = 8'h18,
//ADDR_DEV_CTRL = 8'h18,
ADDR_ISA_0 = 8'h1C,
ADDR_ISA_1 = 8'h20,
ADDR_ISA_CTRL = 8'h24,
//ADDR_ISA_CTRL = 8'h24,
ADDR_DCR_0 = 8'h28,
ADDR_DCR_1 = 8'h2C,
ADDR_DCR_CTRL = 8'h30,
//ADDR_DCR_CTRL = 8'h30,
`ifdef SCOPE
ADDR_SCP_0 = 8'h34,
ADDR_SCP_1 = 8'h38,
ADDR_SCP_CTRL = 8'h3C,
//ADDR_SCP_CTRL = 8'h3C,
`endif
ADDR_MEM_0 = 8'h40,
ADDR_MEM_1 = 8'h44,
ADDR_MEM_CTRL = 8'h48,
//ADDR_MEM_CTRL = 8'h48,
ADDR_BITS = 8;
localparam
WSTATE_IDLE = 2'd0,
WSTATE_DATA = 2'd1,
WSTATE_RESP = 2'd2;
localparam
RSTATE_IDLE = 2'd0,
RSTATE_DATA = 2'd1;
// device caps
wire [63:0] dev_caps = {16'b0,
8'(`SM_ENABLED ? `SMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
8'(`NUM_THREADS),
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
8'(`NUM_THREADS),
8'(`IMPLEMENTATION_ID)};
wire [63:0] isa_caps = {32'(`MISA_EXT),
2'(`CLOG2(`XLEN)-4),
wire [63:0] isa_caps = {32'(`MISA_EXT),
2'(`CLOG2(`XLEN)-4),
30'(`MISA_STD)};
reg [1:0] wstate;
@ -156,7 +158,7 @@ module VX_afu_ctrl #(
wire [31:0] wmask;
wire s_axi_aw_fire;
wire s_axi_w_fire;
reg [1:0] rstate;
reg [31:0] rdata;
wire [ADDR_BITS-1:0] raddr;
@ -171,12 +173,12 @@ module VX_afu_ctrl #(
reg [63:0] mem_r [AXI_NUM_BANKS];
reg [31:0] dcra_r;
reg [31:0] dcrv_r;
reg dcr_wr_valid_r;
reg dcr_wr_valid_r;
`ifdef SCOPE
reg [63:0] scope_bus_wdata;
reg [63:0] scope_bus_rdata;
reg [63:0] scope_bus_rdata;
reg [5:0] scope_bus_ctr;
reg cmd_scope_reading;
@ -186,7 +188,7 @@ module VX_afu_ctrl #(
always @(posedge clk) begin
if (reset) begin
cmd_scope_reading <= 0;
cmd_scope_writing <= 0;
cmd_scope_writing <= 0;
scope_bus_ctr <= '0;
scope_bus_out_r <= 0;
end else if (clk_en) begin
@ -194,29 +196,29 @@ module VX_afu_ctrl #(
scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask);
end
if (s_axi_w_fire && waddr == ADDR_SCP_1) begin
scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
cmd_scope_writing <= 1;
scope_bus_out_r <= 1;
scope_bus_out_r <= 1;
scope_bus_ctr <= 63;
end
end
if (scope_bus_in) begin
cmd_scope_reading <= 1;
scope_bus_ctr <= 63;
end
end
if (cmd_scope_reading) begin
scope_bus_rdata <= {scope_bus_rdata[62:0], scope_bus_in};
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_reading <= 0;
end
end
end
if (cmd_scope_writing) begin
scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr);
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_writing <= 0;
end
end
end
end
end
@ -224,7 +226,7 @@ module VX_afu_ctrl #(
`endif
// AXI Write
// AXI Write
assign s_axi_awready = (wstate == WSTATE_IDLE);
assign s_axi_wready = (wstate == WSTATE_DATA);
@ -259,14 +261,14 @@ module VX_afu_ctrl #(
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
end
end
// wdata
always @(posedge clk) begin
if (reset) begin
ap_start_r <= 0;
ap_reset_r <= 0;
auto_restart_r <= 0;
gie_r <= 0;
ier_r <= '0;
isr_r <= '0;
@ -287,7 +289,7 @@ module VX_afu_ctrl #(
if (s_axi_w_fire) begin
case (waddr)
ADDR_AP_CTRL: begin
if (s_axi_wstrb[0]) begin
if (s_axi_wstrb[0]) begin
if (s_axi_wdata[0])
ap_start_r <= 1;
if (s_axi_wdata[4])
@ -317,16 +319,16 @@ module VX_afu_ctrl #(
end
default: begin
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
if (waddr == (ADDR_MEM_0 + i * 12)) begin
if (waddr == (ADDR_MEM_0 + 8'(i) * 8'd12)) begin
mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
end
if (waddr == (ADDR_MEM_1 + i * 12)) begin
if (waddr == (ADDR_MEM_1 + 8'(i) * 8'd12)) begin
mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
end
end
end
endcase
if (ier_r[0] & ap_done)
isr_r[0] <= 1'b1;
if (ier_r[1] & ap_ready)
@ -341,10 +343,10 @@ module VX_afu_ctrl #(
assign s_axi_rvalid = (rstate == RSTATE_DATA);
assign s_axi_rdata = rdata;
assign s_axi_rresp = 2'b00; // OKAY
assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready;
assign raddr = s_axi_araddr[ADDR_BITS-1:0];
// rstate
always @(posedge clk) begin
if (reset) begin
@ -414,6 +416,6 @@ module VX_afu_ctrl #(
assign dcr_wr_valid = dcr_wr_valid_r;
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);
assign dcr_wr_data = `VX_DCR_DATA_WIDTH'(dcrv_r);
assign dcr_wr_data = `VX_DCR_DATA_WIDTH'(dcrv_r);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,12 +13,12 @@
`include "vortex_afu.vh"
module VX_afu_wrap #(
module VX_afu_wrap #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = 16,
parameter C_M_AXI_MEM_ADDR_WIDTH = 32,
parameter C_M_AXI_MEM_DATA_WIDTH = 512
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
) (
// System signals
input wire ap_clk,
@ -45,8 +45,8 @@ module VX_afu_wrap #(
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
output wire interrupt
output wire interrupt
);
localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS;
@ -62,7 +62,7 @@ module VX_afu_wrap #(
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
@ -82,17 +82,16 @@ module VX_afu_wrap #(
// convert memory interface to array
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
wire clk = ap_clk;
wire reset = ~ap_rst_n;
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [15:0] vx_pending_writes;
reg vx_busy_wait;
reg vx_running;
wire vx_busy;
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
wire dcr_wr_valid;
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
@ -109,7 +108,7 @@ module VX_afu_wrap #(
`ifdef SCOPE
wire scope_bus_in;
wire scope_bus_out;
wire scope_reset = reset;
wire scope_reset = reset;
`endif
always @(posedge ap_clk) begin
@ -120,15 +119,15 @@ module VX_afu_wrap #(
end else begin
case (state)
STATE_IDLE: begin
if (ap_start) begin
if (ap_start) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE RUN\n", $time));
`endif
`endif
state <= STATE_RUN;
vx_running <= 0;
end
end
STATE_RUN: begin
STATE_RUN: begin
if (vx_running) begin
if (vx_busy_wait) begin
// wait until processor goes busy
@ -151,7 +150,7 @@ module VX_afu_wrap #(
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
`endif
vx_running <= 1;
vx_running <= 1;
vx_busy_wait <= 1;
end
end
@ -185,7 +184,7 @@ module VX_afu_wrap #(
always @(posedge ap_clk) begin
if (state == STATE_RUN) begin
vx_reset_ctr <= vx_reset_ctr + 1;
vx_reset_ctr <= vx_reset_ctr + 1;
end else begin
vx_reset_ctr <= '0;
end
@ -197,9 +196,9 @@ module VX_afu_wrap #(
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) afu_ctrl (
.clk (ap_clk),
.reset (reset || ap_reset),
.reset (reset || ap_reset),
.clk_en (1'b1),
.s_axi_awvalid (s_axi_ctrl_awvalid),
.s_axi_awready (s_axi_ctrl_awready),
.s_axi_awaddr (s_axi_ctrl_awaddr),
@ -226,9 +225,9 @@ module VX_afu_wrap #(
.interrupt (interrupt),
`ifdef SCOPE
.scope_bus_in (scope_bus_out),
.scope_bus_in (scope_bus_out),
.scope_bus_out (scope_bus_in),
`endif
`endif
.mem_base (mem_base),
@ -237,8 +236,8 @@ module VX_afu_wrap #(
.dcr_wr_data (dcr_wr_data)
);
wire [`XLEN-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`XLEN-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
@ -249,7 +248,7 @@ module VX_afu_wrap #(
Vortex_axi #(
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.AXI_ADDR_WIDTH (`XLEN),
.AXI_ADDR_WIDTH (`MEM_ADDR_WIDTH),
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) vortex_axi (
@ -257,7 +256,7 @@ module VX_afu_wrap #(
.clk (ap_clk),
.reset (reset || ap_reset || ~vx_running),
.m_axi_awvalid (m_axi_mem_awvalid_a),
.m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_w),
@ -268,7 +267,7 @@ module VX_afu_wrap #(
`UNUSED_PIN (m_axi_awlock),
`UNUSED_PIN (m_axi_awcache),
`UNUSED_PIN (m_axi_awprot),
`UNUSED_PIN (m_axi_awqos),
`UNUSED_PIN (m_axi_awqos),
`UNUSED_PIN (m_axi_awregion),
.m_axi_wvalid (m_axi_mem_wvalid_a),
@ -280,7 +279,7 @@ module VX_afu_wrap #(
.m_axi_bvalid (m_axi_mem_bvalid_a),
.m_axi_bready (m_axi_mem_bready_a),
.m_axi_bid (m_axi_mem_bid_a),
.m_axi_bresp (m_axi_mem_bresp_a),
.m_axi_bresp (m_axi_mem_bresp_a),
.m_axi_arvalid (m_axi_mem_arvalid_a),
.m_axi_arready (m_axi_mem_arready_a),
@ -292,7 +291,7 @@ module VX_afu_wrap #(
`UNUSED_PIN (m_axi_arlock),
`UNUSED_PIN (m_axi_arcache),
`UNUSED_PIN (m_axi_arprot),
`UNUSED_PIN (m_axi_arqos),
`UNUSED_PIN (m_axi_arqos),
`UNUSED_PIN (m_axi_arregion),
.m_axi_rvalid (m_axi_mem_rvalid_a),
@ -312,7 +311,6 @@ module VX_afu_wrap #(
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU
`ifdef SCOPE
`define TRIGGERS { \
reset, \
ap_start, \
@ -331,35 +329,17 @@ module VX_afu_wrap #(
VX_scope_tap #(
.SCOPE_ID (0),
.TRIGGERW ($bits(`TRIGGERS)),
.PROBEW ($bits(`PROBES))
.PROBEW ($bits(`PROBES))
) scope_tap (
.clk(clk),
.reset(scope_reset_w[0]),
.start(1'b0),
.stop(1'b0),
.triggers(`TRIGGERS),
.probes(`PROBES),
.bus_in(scope_bus_in_w[0]),
.bus_out(scope_bus_out_w[0])
.clk (clk),
.reset (scope_reset_w[0]),
.start (1'b0),
.stop (1'b0),
.triggers (`TRIGGERS),
.probes (`PROBES),
.bus_in (scope_bus_in_w[0]),
.bus_out (scope_bus_out_w[0])
);
`endif
`ifdef CHIPSCOPE
ila_afu ila_afu_inst (
.clk (ap_clk),
.probe0 ({
ap_start,
ap_done,
ap_idle,
interrupt
}),
.probe1 ({
vx_pending_writes,
vx_busy_wait,
vx_busy,
vx_running
})
);
`endif
`else
`SCOPE_IO_UNUSED_W(0)
`endif
@ -370,13 +350,13 @@ module VX_afu_wrap #(
reg [`CLOG2(`RESET_DELAY+1)-1:0] assert_delay_ctr;
reg assert_enabled;
initial begin
$assertoff(0, vortex_axi);
end
$assertoff(0, vortex_axi);
end
always @(posedge ap_clk) begin
if (reset) begin
assert_delay_ctr <= '0;
assert_enabled <= 0;
end else begin
end else begin
if (~assert_enabled) begin
if (assert_delay_ctr == (`RESET_DELAY-1)) begin
assert_enabled <= 1;
@ -394,19 +374,19 @@ module VX_afu_wrap #(
always @(posedge ap_clk) begin
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i]));
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]));
end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
end
end
end
`endif
endmodule

127
hw/rtl/cache/VX_bank_flush.sv vendored Normal file
View file

@ -0,0 +1,127 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_bank_flush #(
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Enable cache writeback
parameter WRITEBACK = 0
) (
input wire clk,
input wire reset,
input wire flush_begin,
output wire flush_end,
output wire flush_init,
output wire flush_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
output wire [NUM_WAYS-1:0] flush_way,
input wire flush_ready,
input wire mshr_empty,
input wire bank_empty
);
// ways interation is only needed when eviction is enabled
localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);
localparam STATE_IDLE = 0;
localparam STATE_INIT = 1;
localparam STATE_WAIT1 = 2;
localparam STATE_FLUSH = 3;
localparam STATE_WAIT2 = 4;
localparam STATE_DONE = 5;
reg [2:0] state_r, state_n;
reg [CTR_WIDTH-1:0] counter_r;
always @(*) begin
state_n = state_r;
case (state_r)
STATE_IDLE: begin
if (flush_begin) begin
state_n = STATE_WAIT1;
end
end
STATE_INIT: begin
if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
state_n = STATE_IDLE;
end
end
STATE_WAIT1: begin
// wait for pending requests to complete
if (mshr_empty) begin
state_n = STATE_FLUSH;
end
end
STATE_FLUSH: begin
if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
end
end
STATE_WAIT2: begin
// ensure the bank is empty before notifying the cache flush unit,
// because the flush request to lower caches only goes through bank0
// and it is important that request gets send out last.
if (bank_empty) begin
state_n = STATE_DONE;
end
end
STATE_DONE: begin
// generate a completion pulse
state_n = STATE_IDLE;
end
endcase
end
always @(posedge clk) begin
if (reset) begin
state_r <= STATE_INIT;
counter_r <= '0;
end else begin
state_r <= state_n;
if (state_r != STATE_IDLE) begin
if ((state_r == STATE_INIT)
|| ((state_r == STATE_FLUSH) && flush_ready)) begin
counter_r <= counter_r + CTR_WIDTH'(1);
end
end else begin
counter_r <= '0;
end
end
end
assign flush_end = (state_r == STATE_DONE);
assign flush_init = (state_r == STATE_INIT);
assign flush_valid = (state_r == STATE_FLUSH);
assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0];
if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
reg [NUM_WAYS-1:0] flush_way_r;
always @(*) begin
flush_way_r = '0;
flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
end
assign flush_way = flush_way_r;
end else begin
assign flush_way = {NUM_WAYS{1'b1}};
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,15 +14,15 @@
`include "VX_cache_define.vh"
module VX_cache import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter `STRING INSTANCE_ID = "",
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
@ -33,7 +33,7 @@ module VX_cache import VX_gpu_pkg::*; #(
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -42,6 +42,12 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -49,16 +55,16 @@ module VX_cache import VX_gpu_pkg::*; #(
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Core response output register
parameter CORE_OUT_REG = 0,
parameter CORE_OUT_BUF = 0,
// Memory request output register
parameter MEM_OUT_REG = 0
) (
parameter MEM_OUT_BUF = 0
) (
// PERF
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
input wire clk,
input wire reset,
@ -66,8 +72,13 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter"))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
// In writeback mode, memory fill response may issue a new memory request to handle evicted blocks.
// We need to ensure that the memory request queue never fills up to avoid deadlock.
`STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
@ -79,35 +90,43 @@ module VX_cache import VX_gpu_pkg::*; #(
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
`endif
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus2_if[NUM_REQS]();
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus_if[i].req_valid;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_rw[i] = core_bus_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
end
wire [NUM_BANKS-1:0] per_bank_flush_begin;
wire [NUM_BANKS-1:0] per_bank_flush_end;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_begin (per_bank_flush_begin),
.flush_end (per_bank_flush_end)
);
///////////////////////////////////////////////////////////////////////////
@ -117,23 +136,23 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_REQS; ++i) begin
`RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT);
`RESET_RELAY (core_rsp_reset, reset);
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_REQ_BUF_ENABLE ? `OUT_REG_TO_EB_SIZE(CORE_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(CORE_OUT_REG))
.SIZE (CORE_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset),
.reset (core_rsp_reset[i]),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready)
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
@ -146,25 +165,28 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_flush_s;
wire mem_req_ready_s;
`RESET_RELAY (mem_req_buf_reset, reset);
wire mem_bus_if_flush;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE (MEM_REQ_BUF_ENABLE ? `OUT_REG_TO_EB_SIZE(MEM_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(MEM_OUT_REG))
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (mem_req_buf_reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}),
.valid_out (mem_bus_if.req_valid),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
assign mem_bus_if.req_data.atype = mem_bus_if_flush ? `ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0;
///////////////////////////////////////////////////////////////////////////
// Memory response buffering
@ -173,44 +195,23 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
`RESET_RELAY (mem_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (mem_rsp_reset),
.reset (reset),
.valid_in (mem_bus_if.rsp_valid),
.ready_in (mem_bus_if.rsp_ready),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
.valid_out (mem_rsp_valid_s),
.ready_out (mem_rsp_ready_s)
);
///////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
wire [`CS_LINE_SEL_BITS-1:0] init_line_sel;
wire init_enable;
`RESET_RELAY (init_reset, reset);
VX_cache_init #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS)
) cache_init (
.clk (clk),
.reset (init_reset),
.addr_out (init_line_sel),
.valid_out (init_enable)
);
///////////////////////////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
wire [NUM_BANKS-1:0] per_bank_core_req_rw;
@ -219,25 +220,28 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_rsp_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_rsp_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_rsp_idx;
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_mem_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin
@ -246,12 +250,33 @@ module VX_cache import VX_gpu_pkg::*; #(
// Bank requests dispatch
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_flush;
wire [NUM_REQS-1:0] core_req_ready;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] core_req_bid;
wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0] core_req_wsel;
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus2_if[i].req_valid;
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
assign core_req_data[i] = core_bus2_if[i].req_data.data;
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
assign core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
assign core_bus2_if[i].req_ready = core_req_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
@ -274,9 +299,11 @@ module VX_cache import VX_gpu_pkg::*; #(
core_req_line_addr[i],
core_req_rw[i],
core_req_wsel[i],
core_req_byteen[i],
core_req_byteen[i],
core_req_data[i],
core_req_tag[i]};
core_req_tag[i],
core_req_flush[i]
};
end
`ifdef PERF_ENABLE
@ -285,11 +312,13 @@ module VX_cache import VX_gpu_pkg::*; #(
`RESET_RELAY (req_xbar_reset, reset);
VX_stream_xbar #(
VX_stream_xbar #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS)
.PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("F"),
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
.reset (req_xbar_reset),
@ -313,27 +342,29 @@ module VX_cache import VX_gpu_pkg::*; #(
per_bank_core_req_addr[i],
per_bank_core_req_rw[i],
per_bank_core_req_wsel[i],
per_bank_core_req_byteen[i],
per_bank_core_req_byteen[i],
per_bank_core_req_data[i],
per_bank_core_req_tag[i]} = core_req_data_out[i];
per_bank_core_req_tag[i],
per_bank_core_req_flush[i]
} = core_req_data_out[i];
end
// Banks access
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid;
if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
end else begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == i);
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == bank_id);
end
`RESET_RELAY (bank_reset, reset);
VX_cache_bank #(
.BANK_ID (i),
.INSTANCE_ID (INSTANCE_ID),
VX_cache_bank #(
.BANK_ID (bank_id),
.INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
@ -344,65 +375,67 @@ module VX_cache import VX_gpu_pkg::*; #(
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.DIRTY_BYTES (DIRTY_BYTES),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_REG (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_REG),
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_REG)
) bank (
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
) bank (
.clk (clk),
.reset (bank_reset),
`ifdef PERF_ENABLE
.perf_read_misses (perf_read_miss_per_bank[i]),
.perf_write_misses (perf_write_miss_per_bank[i]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[i]),
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
`endif
// Core request
.core_req_valid (per_bank_core_req_valid[i]),
.core_req_addr (per_bank_core_req_addr[i]),
.core_req_rw (per_bank_core_req_rw[i]),
.core_req_wsel (per_bank_core_req_wsel[i]),
.core_req_byteen (per_bank_core_req_byteen[i]),
.core_req_data (per_bank_core_req_data[i]),
.core_req_tag (per_bank_core_req_tag[i]),
.core_req_idx (per_bank_core_req_idx[i]),
.core_req_ready (per_bank_core_req_ready[i]),
// Core response
.core_rsp_valid (per_bank_core_rsp_valid[i]),
.core_rsp_data (per_bank_core_rsp_data[i]),
.core_rsp_tag (per_bank_core_rsp_tag[i]),
.core_rsp_idx (per_bank_core_rsp_idx[i]),
.core_rsp_ready (per_bank_core_rsp_ready[i]),
// Core request
.core_req_valid (per_bank_core_req_valid[bank_id]),
.core_req_addr (per_bank_core_req_addr[bank_id]),
.core_req_rw (per_bank_core_req_rw[bank_id]),
.core_req_wsel (per_bank_core_req_wsel[bank_id]),
.core_req_byteen (per_bank_core_req_byteen[bank_id]),
.core_req_data (per_bank_core_req_data[bank_id]),
.core_req_tag (per_bank_core_req_tag[bank_id]),
.core_req_idx (per_bank_core_req_idx[bank_id]),
.core_req_flush (per_bank_core_req_flush[bank_id]),
.core_req_ready (per_bank_core_req_ready[bank_id]),
// Core response
.core_rsp_valid (per_bank_core_rsp_valid[bank_id]),
.core_rsp_data (per_bank_core_rsp_data[bank_id]),
.core_rsp_tag (per_bank_core_rsp_tag[bank_id]),
.core_rsp_idx (per_bank_core_rsp_idx[bank_id]),
.core_rsp_ready (per_bank_core_rsp_ready[bank_id]),
// Memory request
.mem_req_valid (per_bank_mem_req_valid[i]),
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_rw (per_bank_mem_req_rw[i]),
.mem_req_wsel (per_bank_mem_req_wsel[i]),
.mem_req_byteen (per_bank_mem_req_byteen[i]),
.mem_req_data (per_bank_mem_req_data[i]),
.mem_req_id (per_bank_mem_req_id[i]),
.mem_req_ready (per_bank_mem_req_ready[i]),
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_data (per_bank_mem_req_data[bank_id]),
.mem_req_id (per_bank_mem_req_id[bank_id]),
.mem_req_flush (per_bank_mem_req_flush[bank_id]),
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s),
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[i]),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// initialization
.init_enable (init_enable),
.init_line_sel (init_line_sel)
.flush_begin (per_bank_flush_begin[bank_id]),
.flush_end (per_bank_flush_end[bank_id])
);
if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
end else begin
assign per_bank_mem_req_addr[i] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
end
end
end
// Bank responses gather
@ -418,7 +451,8 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW)
.DATAW (CORE_RSP_DATAW),
.ARBITER ("F")
) rsp_xbar (
.clk (clk),
.reset (rsp_xbar_reset),
@ -442,39 +476,39 @@ module VX_cache import VX_gpu_pkg::*; #(
wire mem_req_valid_p;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
wire mem_req_rw_p;
wire [WORD_SEL_WIDTH-1:0] mem_req_wsel_p;
wire [WORD_SIZE-1:0] mem_req_byteen_p;
wire [`CS_WORD_WIDTH-1:0] mem_req_data_p;
wire [LINE_SIZE-1:0] mem_req_byteen_p;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_p;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
wire mem_req_flush_p;
wire mem_req_ready_p;
// Memory request arbitration
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH)-1:0] data_in;
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_wsel[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i],
per_bank_mem_req_id[i]};
assign data_in[i] = {
per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i],
per_bank_mem_req_id[i],
per_bank_mem_req_flush[i]
};
end
`RESET_RELAY (mem_req_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + MSHR_ADDR_WIDTH),
.ARBITER ("R")
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
.ARBITER ("F")
) mem_req_arb (
.clk (clk),
.reset (mem_req_arb_reset),
.reset (reset),
.valid_in (per_bank_mem_req_valid),
.ready_in (per_bank_mem_req_ready),
.data_in (data_in),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_wsel_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p}),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}),
.valid_out (mem_req_valid_p),
.ready_out (mem_req_ready_p),
`UNUSED_PIN (sel_out)
@ -482,44 +516,28 @@ module VX_cache import VX_gpu_pkg::*; #(
if (NUM_BANKS > 1) begin
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr_p);
assign mem_req_tag_p = MEM_TAG_WIDTH'({mem_req_bank_id, mem_req_id_p});
assign mem_req_tag_p = MEM_TAG_WIDTH'({mem_req_bank_id, mem_req_id_p});
end else begin
assign mem_req_tag_p = MEM_TAG_WIDTH'(mem_req_id_p);
end
end
// Memory request multi-port handling
assign mem_req_valid_s = mem_req_valid_p;
assign mem_req_addr_s = mem_req_addr_p;
assign mem_req_tag_s = mem_req_tag_p;
assign mem_req_flush_s = mem_req_flush_p;
assign mem_req_ready_p = mem_req_ready_s;
if (WRITE_ENABLE != 0) begin
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] mem_req_byteen_r;
reg [`CS_LINE_WIDTH-1:0] mem_req_data_r;
always @(*) begin
mem_req_byteen_r = '0;
mem_req_data_r = 'x;
mem_req_byteen_r[mem_req_wsel_p * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p;
mem_req_data_r[mem_req_wsel_p * `CS_WORD_WIDTH +: `CS_WORD_WIDTH] = mem_req_data_p;
end
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_r;
assign mem_req_data_s = mem_req_data_r;
end else begin
`UNUSED_VAR (mem_req_wsel_p)
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end else begin
`UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_wsel_p)
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
assign mem_req_rw_s = 0;
assign mem_req_byteen_s = {LINE_SIZE{1'b1}};
assign mem_req_data_s = '0;
@ -529,10 +547,10 @@ module VX_cache import VX_gpu_pkg::*; #(
// per cycle: core_reads, core_writes
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
@ -541,16 +559,16 @@ module VX_cache import VX_gpu_pkg::*; #(
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
@ -563,7 +581,7 @@ module VX_cache import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -18,40 +18,46 @@ module VX_cache_bank #(
parameter BANK_ID = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 1,
parameter NUM_REQS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 1,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 1,
parameter MSHR_SIZE = 1,
// Memory Request Queue Size
parameter MREQ_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Core response output register
parameter CORE_OUT_REG = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
// Memory request output register
parameter MEM_OUT_REG = 0,
// Memory request output buffer
parameter MEM_OUT_BUF = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
@ -66,18 +72,19 @@ module VX_cache_bank #(
output wire perf_mshr_stalls,
`endif
// Core Request
// Core Request
input wire core_req_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
input wire core_req_rw,
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel,
input wire [WORD_SIZE-1:0] core_req_byteen,
input wire [`CS_WORD_WIDTH-1:0] core_req_data,
input wire [TAG_WIDTH-1:0] core_req_tag,
input wire [REQ_SEL_WIDTH-1:0] core_req_idx,
input wire core_req_rw, // write enable
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, // select the word in a cacheline, e.g. word size = 4 bytes, cacheline size = 64 bytes, it should have log(64/4)= 4 bits
input wire [WORD_SIZE-1:0] core_req_byteen,// which bytes in data to write
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
input wire core_req_flush, // flush enable
output wire core_req_ready,
// Core Response
// Core Response
output wire core_rsp_valid,
output wire [`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [TAG_WIDTH-1:0] core_rsp_tag,
@ -88,33 +95,36 @@ module VX_cache_bank #(
output wire mem_req_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire mem_req_rw,
output wire [WORD_SEL_WIDTH-1:0] mem_req_wsel,
output wire [WORD_SIZE-1:0] mem_req_byteen,
output wire [`CS_WORD_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr
output wire mem_req_flush,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
output wire mem_rsp_ready,
// initialization
input wire init_enable,
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel
// flush
input wire flush_begin,
output wire flush_end
);
localparam PIPELINE_STAGES = 2;
`IGNORE_UNUSED_BEGIN
wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1;
`IGNORE_UNUSED_END
wire crsq_stall;
wire crsp_queue_stall;
wire mshr_alm_full;
wire mreq_alm_full;
wire mreq_queue_empty;
wire mreq_queue_alm_full;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
wire replay_valid;
wire [`CS_LINE_ADDR_WIDTH-1:0] replay_addr;
wire replay_rw;
@ -125,103 +135,149 @@ module VX_cache_bank #(
wire [REQ_SEL_WIDTH-1:0] replay_idx;
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire is_init_st0, is_init_st1;
wire is_flush_st0, is_flush_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_st0, tag_st1;
wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1;
wire rw_sel, rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1;
wire creq_flush_sel, creq_flush_st0, creq_flush_st1;
wire evict_dirty_st0, evict_dirty_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_tail_st0, mshr_tail_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
wire mshr_pending_st0, mshr_pending_st1;
wire mshr_empty;
wire rdw_hazard_st0;
reg rdw_hazard_st1;
wire flush_valid;
wire init_valid;
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
wire [NUM_WAYS-1:0] flush_way;
wire flush_ready;
wire pipe_stall = crsq_stall || rdw_hazard_st1;
// ensure we have no pending memory request in the bank
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
// flush unit
VX_bank_flush #(
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WRITEBACK (WRITEBACK)
) flush_unit (
.clk (clk),
.reset (reset),
.flush_begin (flush_begin),
.flush_end (flush_end),
.flush_init (init_valid),
.flush_valid (flush_valid),
.flush_line (flush_sel),
.flush_way (flush_way),
.flush_ready (flush_ready),
.mshr_empty (mshr_empty),
.bank_empty (no_pending_req)
);
wire rdw_hazard1_sel;
wire rdw_hazard2_sel;
reg rdw_hazard3_st1;
wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1;
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss.
wire replay_grant = ~init_enable;
wire replay_enable = replay_grant && replay_valid;
// flush has precedence over core requests to ensure that the cache is in a consistent state.
wire replay_grant = ~init_valid;
wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~init_enable && ~replay_enable;
wire fill_grant = ~init_valid && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid;
wire creq_grant = ~init_enable && ~replay_enable && ~fill_enable;
wire flush_grant = ~init_valid && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && flush_valid;
wire creq_grant = ~init_valid && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
&& ~rdw_hazard_st0
&& ~pipe_stall;
&& ~rdw_hazard1_sel
&& ~pipe_stall;
assign mem_rsp_ready = fill_grant
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_alm_full
&& ~mshr_alm_full
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
wire init_fire = init_enable;
wire replay_fire = replay_valid && replay_ready;
assign flush_ready = flush_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = init_valid;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = flush_valid && flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
wire [TAG_WIDTH-1:0] mshr_creq_tag = replay_enable ? replay_tag : core_req_tag;
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
assign creq_flush_sel = core_req_valid && core_req_flush;
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
if (WRITE_ENABLE) begin
assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data);
end else begin
assign data_sel[`CS_WORD_WIDTH-1:0] = mem_rsp_data[`CS_WORD_WIDTH-1:0];
`UNUSED_VAR (core_req_data)
`UNUSED_VAR (replay_data)
end
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = mshr_creq_tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
end
`UNUSED_VAR (mshr_creq_tag)
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || core_req_fire;
assign addr_sel = init_enable ? `CS_LINE_ADDR_WIDTH'(init_line_sel) :
(replay_valid ? replay_addr :
(mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign data_sel[`CS_WORD_WIDTH-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : (replay_valid ? replay_data : core_req_data);
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i];
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({
valid_sel,
init_enable,
replay_enable,
fill_enable,
creq_enable,
addr_sel,
data_sel,
replay_valid ? replay_rw : core_req_rw,
replay_valid ? replay_byteen : core_req_byteen,
replay_valid ? replay_wsel : core_req_wsel,
replay_valid ? replay_idx : core_req_idx,
replay_valid ? replay_tag : core_req_tag,
replay_id
}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_creq_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
@ -230,59 +286,81 @@ module VX_cache_bank #(
assign req_uuid_st0 = 0;
end
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0);
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0;
wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0;
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
`RESET_RELAY (tag_reset, reset);
wire [NUM_WAYS-1:0] evict_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
VX_cache_tags #(
.INSTANCE_ID(INSTANCE_ID),
.BANK_ID (BANK_ID),
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WORD_SIZE (WORD_SIZE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH)
) cache_tags (
.clk (clk),
.reset (tag_reset),
.reset (reset),
.req_uuid (req_uuid_st0),
.stall (pipe_stall),
// read/Fill
// init/flush/fill/write/lookup
.init (do_init_st0),
.flush (do_flush_st0),
.fill (do_fill_st0),
.write (do_cache_wr_st0),
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.fill (do_fill_st0),
.init (do_init_st0),
.way_sel (way_sel_st0),
.tag_matches(tag_matches_st0)
.way_sel (flush_way_st0),
.tag_matches(tag_matches_st0),
// replacement
.evict_dirty(evict_dirty_st0),
.evict_way (evict_way_st0),
.evict_tag (evict_tag_st0)
);
wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0;
wire is_flush2_st0 = WRITEBACK && is_flush_st0;
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0;
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_replay_st0, is_fill_st0, is_creq_st0, rw_st0, addr_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_tail_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_replay_st1, is_fill_st1, is_creq_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_tail_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1})
.data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
);
// we have a tag hit
wire is_hit_st1 = (| tag_matches_st1);
wire is_hit_st1 = (| way_sel_st1);
if (UUID_WIDTH != 0) begin
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
@ -290,9 +368,15 @@ module VX_cache_bank #(
assign req_uuid_st1 = 0;
end
wire do_creq_rd_st1 = valid_st1 && is_creq_st1 && ~rw_st1;
wire do_creq_wr_st1 = valid_st1 && is_creq_st1 && rw_st1;
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_init_st1 = valid_st1 && is_init_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
@ -302,76 +386,114 @@ module VX_cache_bank #(
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0];
`UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always get a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay"));
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay"));
// detect BRAM's read-during-write hazard
assign rdw_hazard_st0 = do_fill_st0; // after a fill
// both tag and data stores use BRAM with no read-during-write protection.
// we ned to stall the pipeline to prevent read-after-write hazards.
assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill
assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceeding write
always @(posedge clk) begin
rdw_hazard_st1 <= (do_creq_rd_st0 && do_write_hit_st1 && (addr_st0 == addr_st1))
&& ~rdw_hazard_st1; // after a write to same address
// stall reads following writes to same line address
rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1)
&& ~rdw_hazard3_st1; // release pipeline stall
end
wire [`CS_WORD_WIDTH-1:0] write_data_st1 = data_st1[`CS_WORD_WIDTH-1:0];
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
wire [LINE_SIZE-1:0] write_byteen_st1;
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] write_byteen_r;
always @(*) begin
write_byteen_r = '0;
write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1;
end
assign write_byteen_st1 = write_byteen_r;
end else begin
assign write_byteen_st1 = byteen_st1;
end
`RESET_RELAY (data_reset, reset);
VX_cache_data #(
.INSTANCE_ID (INSTANCE_ID),
.BANK_ID (BANK_ID),
.INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH)
) cache_data (
.clk (clk),
.reset (data_reset),
.reset (reset),
.req_uuid (req_uuid_st1),
.stall (pipe_stall),
.read (do_read_hit_st1 || do_replay_rd_st1),
.fill (do_fill_st1),
.write (do_write_hit_st1 || do_replay_wr_st1),
.way_sel (way_sel_st1 | tag_matches_st1),
.init (do_init_st1),
.read (do_cache_rd_st1),
.fill (do_fill_st1),
.flush (do_flush_st1),
.write (do_cache_wr_st1),
.way_sel (way_sel_st1),
.line_addr (addr_st1),
.wsel (wsel_st1),
.byteen (byteen_st1),
.fill_data (fill_data_st1),
.fill_data (fill_data_st1),
.write_data (write_data_st1),
.read_data (read_data_st1)
.write_byteen(write_byteen_st1),
.read_data (read_data_st1),
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
);
wire [MSHR_SIZE-1:0] mshr_matches_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0;
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
wire mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
VX_pending_size #(
// release allocated mshr entry if we had a hit
wire mshr_release_st1;
if (WRITEBACK) begin
assign mshr_release_st1 = is_hit_st1;
end else begin
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
// this can happen when writes are sent late, when the fill was already in flight.
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
end
VX_pending_size #(
.SIZE (MSHR_SIZE)
) mshr_pending_size (
.clk (clk),
.reset (reset),
.incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
.empty (mshr_empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
`RESET_RELAY (mshr_reset, reset);
VX_cache_mshr #(
.INSTANCE_ID (INSTANCE_ID),
.BANK_ID (BANK_ID),
.INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.MSHR_SIZE (MSHR_SIZE),
@ -379,7 +501,7 @@ module VX_cache_bank #(
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
) cache_mshr (
.clk (clk),
.reset (mshr_reset),
.reset (reset),
.deq_req_uuid (req_uuid_sel),
.lkp_req_uuid (req_uuid_st0),
@ -393,7 +515,7 @@ module VX_cache_bank #(
// dequeue
.dequeue_valid (replay_valid),
.dequeue_addr (replay_addr),
.dequeue_rw (replay_rw),
.dequeue_rw (replay_rw),
.dequeue_data ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx}),
.dequeue_id (replay_id),
.dequeue_ready (replay_ready),
@ -404,104 +526,128 @@ module VX_cache_bank #(
.allocate_rw (rw_st0),
.allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}),
.allocate_id (mshr_alloc_id_st0),
.allocate_tail (mshr_tail_st0),
.allocate_prev (mshr_prev_st0),
`UNUSED_PIN (allocate_ready),
// lookup
.lookup_valid (mshr_lookup_st0),
.lookup_addr (addr_st0),
.lookup_matches (mshr_matches_st0),
.lookup_pending (mshr_lookup_pending_st0),
.lookup_rw (mshr_lookup_rw_st0),
// finalize
.finalize_valid (mshr_finalize_st1),
.finalize_release(mshr_release_st1),
.finalize_pending(mshr_pending_st1),
.finalize_id (mshr_id_st1),
.finalize_tail (mshr_tail_st1)
.finalize_prev (mshr_prev_st1)
);
// ignore allocated id from mshr matches
// check if there are pending requests to same line in the MSHR
wire [MSHR_SIZE-1:0] lookup_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = (i != mshr_alloc_id_st0) && mshr_matches_st0[i];
assign lookup_matches[i] = mshr_lookup_pending_st0[i]
&& (i != mshr_alloc_id_st0) // exclude current mshr id
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
end
assign mshr_pending_st0 = (| lookup_matches);
// schedule core response
wire crsq_valid, crsq_ready;
wire [`CS_WORD_WIDTH-1:0] crsq_data;
wire [REQ_SEL_WIDTH-1:0] crsq_idx;
wire [TAG_WIDTH-1:0] crsq_tag;
assign crsq_valid = do_read_hit_st1 || do_replay_rd_st1;
assign crsq_idx = req_idx_st1;
assign crsq_data = read_data_st1;
assign crsq_tag = tag_st1;
wire crsp_queue_valid, crsp_queue_ready;
wire [`CS_WORD_WIDTH-1:0] crsp_queue_data;
wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
wire [TAG_WIDTH-1:0] crsp_queue_tag;
`RESET_RELAY (crsp_reset, reset);
assign crsp_queue_valid = do_cache_rd_st1;
assign crsp_queue_idx = req_idx_st1;
assign crsp_queue_data = read_data_st1;
assign crsp_queue_tag = tag_st1;
VX_elastic_buffer #(
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
.SIZE (CRSQ_SIZE),
.OUT_REG (CORE_OUT_REG)
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_queue (
.clk (clk),
.reset (crsp_reset),
.valid_in (crsq_valid && ~rdw_hazard_st1),
.ready_in (crsq_ready),
.data_in ({crsq_tag, crsq_data, crsq_idx}),
.reset (reset),
.valid_in (crsp_queue_valid && ~rdw_hazard3_st1),
.ready_in (crsp_queue_ready),
.data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
.valid_out (core_rsp_valid),
.ready_out (core_rsp_ready)
);
assign crsq_stall = crsq_valid && ~crsq_ready;
assign crsp_queue_stall = crsp_queue_valid && ~crsp_queue_ready;
// schedule memory request
wire mreq_push, mreq_pop, mreq_empty;
wire [`CS_WORD_WIDTH-1:0] mreq_data;
wire [WORD_SIZE-1:0] mreq_byteen;
wire [WORD_SEL_WIDTH-1:0] mreq_wsel;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_id;
wire mreq_rw;
wire mreq_queue_push, mreq_queue_pop;
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
wire mreq_queue_rw;
wire mreq_queue_flush;
assign mreq_push = (do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1;
wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1;
wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1;
assign mreq_pop = mem_req_valid && mem_req_ready;
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| dirty_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID)));
end
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~rdw_hazard3_st1;
end else begin
`UNUSED_VAR (do_writeback_st1)
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1)
&& ~rdw_hazard3_st1;
end
assign mreq_rw = WRITE_ENABLE && rw_st1;
assign mreq_addr = addr_st1;
assign mreq_id = mshr_id_st1;
assign mreq_wsel = wsel_st1;
assign mreq_byteen = byteen_st1;
assign mreq_data = write_data_st1;
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_flush = creq_flush_st1;
`RESET_RELAY (mreq_reset, reset);
if (WRITE_ENABLE) begin
assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1;
assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1;
assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1;
end else begin
assign mreq_queue_rw = 0;
assign mreq_queue_data = 0;
assign mreq_queue_byteen = 0;
`UNUSED_VAR (dirty_data_st1)
`UNUSED_VAR (dirty_byteen_st1)
end
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH),
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
.DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2),
.OUT_REG (MEM_OUT_REG)
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_queue (
.clk (clk),
.reset (mreq_reset),
.push (mreq_push),
.pop (mreq_pop),
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}),
.empty (mreq_empty),
.alm_full (mreq_alm_full),
.reset (reset),
.push (mreq_queue_push),
.pop (mreq_queue_pop),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}),
.empty (mreq_queue_empty),
.alm_full (mreq_queue_alm_full),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign mem_req_valid = ~mreq_empty;
assign mem_req_valid = ~mreq_queue_empty;
///////////////////////////////////////////////////////////////////////////////
@ -511,39 +657,38 @@ module VX_cache_bank #(
assign perf_mshr_stalls = mshr_alm_full;
`endif
`ifdef DBG_TRACE_CACHE_BANK
wire crsq_fire = crsq_valid && crsq_ready;
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire);
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
always @(posedge clk) begin
if (pipeline_stall) begin
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full));
end
if (init_enable) begin
`TRACE(2, ("%d: %s-bank%0d init: addr=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
if (input_stall || pipe_stall) begin
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1));
end
if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s-bank%0d fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
end
if (replay_fire) begin
`TRACE(2, ("%d: %s-bank%0d mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
`TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
end
if (core_req_fire) begin
if (core_req_rw)
`TRACE(2, ("%d: %s-bank%0d core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
`TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else
`TRACE(2, ("%d: %s-bank%0d core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
`TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
end
if (crsq_fire) begin
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_idx, crsq_data, req_uuid_st1));
if (crsp_queue_fire) begin
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
end
if (mreq_push) begin
if (do_creq_wr_st1)
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_byteen, mreq_data, req_uuid_st1));
if (mreq_queue_push) begin
if (do_creq_wr_st1 && !WRITEBACK)
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else if (do_writeback_st1)
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
else
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_id, req_uuid_st1));
`TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
end
end
end
`endif
endmodule

View file

@ -1,221 +1,186 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`include "VX_cache_define.vh"
module VX_cache_bypass #(
parameter NUM_REQS = 1,
parameter NC_TAG_BIT = 0,
parameter TAG_SEL_IDX = 0,
parameter NC_ENABLE = 0,
parameter PASSTHRU = 0,
parameter NC_ENABLE = 0,
parameter WORD_SIZE = 1,
parameter LINE_SIZE = 1,
parameter CORE_ADDR_WIDTH = 1,
parameter CORE_DATA_SIZE = 1,
parameter CORE_TAG_IN_WIDTH = 1,
parameter CORE_TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_DATA_SIZE = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
parameter UUID_WIDTH = 0,
parameter CORE_DATA_WIDTH = CORE_DATA_SIZE * 8,
parameter MEM_DATA_WIDTH = MEM_DATA_SIZE * 8,
parameter CORE_TAG_OUT_WIDTH= CORE_TAG_IN_WIDTH - NC_ENABLE
) (
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0,
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
) (
input wire clk,
input wire reset,
// Core request in
input wire [NUM_REQS-1:0] core_req_valid_in,
input wire [NUM_REQS-1:0] core_req_rw_in,
input wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_in,
input wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_in,
input wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_req_tag_in,
output wire [NUM_REQS-1:0] core_req_ready_in,
// Core request in
VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
// Core request out
output wire [NUM_REQS-1:0] core_req_valid_out,
output wire [NUM_REQS-1:0] core_req_rw_out,
output wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_out,
output wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_out,
output wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_req_tag_out,
input wire [NUM_REQS-1:0] core_req_ready_out,
// Core response in
input wire [NUM_REQS-1:0] core_rsp_valid_in,
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_in,
input wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_rsp_tag_in,
output wire [NUM_REQS-1:0] core_rsp_ready_in,
// Core response out
output wire [NUM_REQS-1:0] core_rsp_valid_out,
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_out,
output wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out,
input wire [NUM_REQS-1:0] core_rsp_ready_out,
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
// Memory request in
input wire mem_req_valid_in,
input wire mem_req_rw_in,
input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in,
input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in,
input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in,
input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in,
output wire mem_req_ready_in,
VX_mem_bus_if.slave mem_bus_in_if,
// Memory request out
output wire mem_req_valid_out,
output wire mem_req_rw_out,
output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out,
output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out,
output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out,
output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out,
input wire mem_req_ready_out,
// Memory response in
input wire mem_rsp_valid_in,
input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_in,
input wire [MEM_TAG_OUT_WIDTH-1:0] mem_rsp_tag_in,
output wire mem_rsp_ready_in,
// Memory response out
output wire mem_rsp_valid_out,
output wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_out,
output wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_out,
input wire mem_rsp_ready_out
VX_mem_bus_if.master mem_bus_out_if
);
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam MUX_DATAW = CORE_TAG_IN_WIDTH + CORE_DATA_WIDTH + CORE_DATA_SIZE + CORE_ADDR_WIDTH + 1;
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam WORDS_PER_LINE = MEM_DATA_SIZE / CORE_DATA_SIZE;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam CORE_TAG_ID_BITS = CORE_TAG_IN_WIDTH - UUID_WIDTH;
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
localparam MEM_TAG_OUT_NC_WIDTH = MEM_TAG_OUT_WIDTH - 1 + NC_ENABLE;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
// core request handling
// handle core requests ///////////////////////////////////////////////////
wire [NUM_REQS-1:0] core_req_valid_in_nc;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_valids;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_valid;
wire core_req_nc_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU != 0) begin
assign core_req_nc_idxs[i] = 1'b1;
end else if (NC_ENABLE) begin
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO];
end else begin
assign core_req_nc_idxs[i] = core_req_tag_in[i][NC_TAG_BIT];
assign core_req_nc_idxs[i] = 1'b0;
end
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
end
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_idxs;
wire core_req_in_fire = | (core_req_valid_in & core_req_ready_in);
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P"),
.LOCK_ENABLE (1)
) req_arb (
.TYPE (PASSTHRU ? "R" : "P")
) core_req_nc_arb (
.clk (clk),
.reset (reset),
.unlock (core_req_in_fire),
.requests (core_req_valid_in_nc),
.requests (core_req_nc_valids),
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid)
.grant_valid (core_req_nc_valid),
.grant_ready (core_req_nc_ready)
);
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_idxs;
assign core_req_rw_out = core_req_rw_in;
assign core_req_addr_out = core_req_addr_in;
assign core_req_byteen_out = core_req_byteen_in;
assign core_req_data_out = core_req_data_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_bits_remove #(
.N (CORE_TAG_IN_WIDTH),
.S (NC_ENABLE),
.POS (NC_TAG_BIT)
) core_req_tag_nc_remove (
.data_in (core_req_tag_in[i]),
.data_out (core_req_tag_out[i])
);
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
: core_bus_out_if[i].req_ready;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i])
: core_req_ready_out[i];
end
// handle memory requests /////////////////////////////////////////////////
// memory request handling
wire mem_req_out_valid;
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid;
assign mem_req_ready_in = mem_req_ready_out;
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel;
wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel;
wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel;
wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel;
wire core_req_rw_in_sel;
wire core_req_nc_sel_rw;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]};
assign core_req_nc_mux_in[i] = {
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.atype,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.tag
};
end
assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = core_req_nc_mux_in[core_req_nc_idx];
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_tag_in_sel[CORE_TAG_ID_BITS-1:0];
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel;
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[WSEL_BITS +: MEM_ADDR_WIDTH];
assign {
core_req_nc_sel_rw,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_atype,
core_req_nc_sel_data,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype;
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
if (WORDS_PER_LINE > 1) begin
reg [WORDS_PER_LINE-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [WSEL_BITS-1:0] req_wsel = core_req_addr_in_sel[WSEL_BITS-1:0];
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
always @(*) begin
mem_req_byteen_in_r = '0;
mem_req_byteen_in_r[req_wsel] = core_req_byteen_in_sel;
mem_req_byteen_in_r[req_wsel] = core_req_nc_sel_byteen;
mem_req_data_in_r = 'x;
mem_req_data_in_r[req_wsel] = core_req_data_in_sel;
mem_req_data_in_r[req_wsel] = core_req_nc_sel_data;
end
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r;
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin
end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
end
end else begin
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel;
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel;
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin
@ -223,126 +188,164 @@ module VX_cache_bypass #(
end
end
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_req_tag_bypass;
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin
assign mem_req_tag_bypass = {core_req_tag_in_sel[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
end else begin
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_bypass_nc;
wire [(MEM_TAG_IN_WIDTH + 1)-1:0] mem_req_tag_in_nc;
if (PASSTHRU != 0) begin
assign mem_req_out_tag = mem_req_tag_bypass;
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
end else begin
if (NC_ENABLE) begin
VX_bits_insert #(
.N (MEM_TAG_OUT_WIDTH-1),
.S (1),
.POS (TAG_SEL_IDX)
) mem_req_tag_in_nc_insert (
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
.ins_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_out_tag)
);
end else begin
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
end
end
VX_bits_insert #(
.N (MEM_TAG_OUT_NC_WIDTH),
.S (NC_ENABLE ? 0 : 1),
.POS (NC_TAG_BIT)
) mem_req_tag_bypass_nc_insert (
.data_in (mem_req_tag_bypass),
.sel_in (1'b0),
.data_out (mem_req_tag_bypass_nc)
);
assign mem_bus_in_if.req_ready = mem_req_out_ready;
VX_bits_insert #(
.N (MEM_TAG_IN_WIDTH),
.POS (NC_TAG_BIT)
) mem_req_tag_in_nc_insert (
.data_in (mem_req_tag_in),
.sel_in (1'b0),
.data_out (mem_req_tag_in_nc)
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.ready_out (mem_bus_out_if.req_ready)
);
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : mem_req_tag_bypass_nc;
// handle core responses //////////////////////////////////////////////////
// core response handling
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_in_nc;
wire [NUM_REQS-1:0] core_rsp_in_valid;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
wire [NUM_REQS-1:0] core_rsp_in_ready;
wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin
assign is_mem_rsp_nc = mem_rsp_valid_in;
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
end else begin
assign is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT];
if (NC_ENABLE) begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin
assign is_mem_rsp_nc = 1'b0;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_bits_insert #(
.N (CORE_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (NC_TAG_BIT)
) core_rsp_tag_in_nc_insert (
.data_in (core_rsp_tag_in[i]),
.sel_in ('0),
.data_out (core_rsp_tag_in_nc[i])
);
end
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_rsp_tag_in_nc;
VX_bits_remove #(
VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE ? 0 : 1),
.POS (NC_TAG_BIT)
.S (NC_ENABLE),
.POS (TAG_SEL_IDX)
) mem_rsp_tag_in_nc_remove (
.data_in (mem_rsp_tag_in),
.data_out (mem_rsp_tag_in_nc)
.data_in (mem_bus_out_if.rsp_data.tag),
.data_out (mem_rsp_tag_id_nc)
);
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
if (NUM_REQS > 1) begin
assign rsp_idx = mem_rsp_tag_in_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin
assign rsp_idx = 1'b0;
end
reg [NUM_REQS-1:0] rsp_nc_valid_r;
always @(*) begin
rsp_nc_valid_r = '0;
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
end
assign core_rsp_valid_out = core_rsp_valid_in | rsp_nc_valid_r;
assign core_rsp_ready_in = core_rsp_ready_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || rsp_nc_valid_r[i];
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
end
if (WORDS_PER_LINE > 1) begin
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_in_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ?
core_rsp_data_in[i] : mem_rsp_data_in[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in;
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
end
end
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
if (UUID_WIDTH != 0) begin
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
end else begin
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU) begin
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
end else if (NC_ENABLE) begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
end else begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (UUID_WIDTH != 0) begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : {mem_rsp_tag_in_nc[MEM_TAG_OUT_NC_WIDTH-1 -: UUID_WIDTH], mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0]};
end else begin
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0];
end
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_in_valid[i]),
.ready_in (core_rsp_in_ready[i]),
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
.valid_out (core_bus_in_if[i].rsp_valid),
.ready_out (core_bus_in_if[i].rsp_ready)
);
end
// memory response handling
// handle memory responses ////////////////////////////////////////////////
if (PASSTHRU != 0) begin
assign mem_rsp_valid_out = 1'b0;
assign mem_bus_in_if.rsp_valid = 1'b0;
assign mem_bus_in_if.rsp_data.data = '0;
assign mem_bus_in_if.rsp_data.tag = '0;
end else if (NC_ENABLE) begin
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
end else begin
assign mem_rsp_valid_out = mem_rsp_valid_in && ~mem_rsp_tag_in[NC_TAG_BIT];
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
end
assign mem_rsp_data_out = mem_rsp_data_in;
wire [NUM_REQS-1:0] core_rsp_out_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
end
VX_bits_remove #(
.N (MEM_TAG_IN_WIDTH + 1),
.POS (NC_TAG_BIT)
) mem_rsp_tag_out_remove (
.data_in (mem_rsp_tag_in[(MEM_TAG_IN_WIDTH + 1)-1:0]),
.data_out (mem_rsp_tag_out)
);
assign mem_rsp_ready_in = is_mem_rsp_nc ? (~core_rsp_valid_in[rsp_idx] && core_rsp_ready_out[rsp_idx]) : mem_rsp_ready_out;
assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? (~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready;
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,20 +24,20 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -46,6 +46,12 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -55,12 +61,12 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0,
// Core response output register
parameter CORE_OUT_REG = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
// Memory request output register
parameter MEM_OUT_REG = 0
) (
// Memory request output buffer
parameter MEM_OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -74,18 +80,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
);
localparam NUM_CACHES = `UP(NUM_UNITS);
localparam PASSTHRU = (NUM_UNITS == 0);
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
`ifdef PERF_ENABLE
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
assign cache_perf = perf_cache_tmp[0];
cache_perf_t perf_cache_unit[NUM_CACHES];
`PERF_CACHE_ADD (cache_perf, perf_cache_unit, NUM_CACHES)
`endif
VX_mem_bus_if #(
@ -98,6 +102,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
`RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -113,8 +119,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
`RESET_RELAY (cache_arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_CACHES),
@ -122,11 +126,11 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.OUT_REG_REQ ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
.OUT_REG_RSP ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
.REQ_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb (
.clk (clk),
.reset (cache_arb_reset),
.reset (cache_arb_reset[i]),
.bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if)
);
@ -136,7 +140,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
end
end
for (genvar i = 0; i < NUM_CACHES; ++i) begin
for (genvar i = 0; i < NUM_CACHES; ++i) begin : caches
`RESET_RELAY (cache_reset, reset);
@ -153,10 +157,13 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.CORE_OUT_REG ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_REG),
.MEM_OUT_REG ((NUM_CACHES > 1) ? 2 : MEM_OUT_REG),
.TAG_SEL_IDX (TAG_SEL_IDX),
.CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
.NC_ENABLE (NC_ENABLE),
.PASSTHRU (PASSTHRU)
) cache_wrap (
@ -170,8 +177,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
);
end
`RESET_RELAY (mem_arb_reset, reset);
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
@ -181,13 +186,13 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.OUT_REG_REQ ((NUM_CACHES > 1) ? 2 : 0),
.OUT_REG_RSP ((NUM_CACHES > 1) ? 2 : 0)
.REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (mem_arb_reset),
.reset (reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,17 +17,21 @@ module VX_cache_data #(
parameter `STRING INSTANCE_ID= "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -40,60 +44,101 @@ module VX_cache_data #(
input wire stall,
input wire init,
input wire read,
input wire fill,
input wire fill,
input wire flush,
input wire write,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
input wire [WORD_SIZE-1:0] byteen,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
input wire [`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (init)
`UNUSED_VAR (read)
`UNUSED_VAR (flush)
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata;
wire [BYTEENW-1:0] wren;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
reg [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] wdata_r;
reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
always @(*) begin
wdata_r = {`CS_WORDS_PER_LINE{write_data}};
wren_r = '0;
wren_r[wsel] = byteen;
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata;
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata;
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]);
assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]);
end
VX_sp_ram #(
.DATAW (LINE_SIZE * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK)
) byteen_store (
.clk (clk),
.reset (reset),
.read (write || fill || flush),
.write (init || write || fill || flush),
.wren (1'b1),
.addr (line_sel),
.wdata (bs_wdata),
.rdata (bs_rdata)
);
assign dirty_byteen = bs_rdata[way_idx];
end else begin
assign dirty_byteen = {LINE_SIZE{1'b1}};
end
// order the data layout to perform ways multiplexing last
// this allows performing onehot encoding of the way index in parallel with BRAM read.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = fill ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{wdata_r[i]}};
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign wren_w[i][j] = (fill ? {WORD_SIZE{1'b1}} : wren_r[i])
& {WORD_SIZE{((NUM_WAYS == 1) || way_sel[j])}};
assign flipped_rdata[j][i] = line_rdata[i][j];
end
end
assign wren = wren_w;
assign dirty_data = flipped_rdata[way_idx];
end else begin
assign dirty_byteen = '0;
assign dirty_data = '0;
end
// order the data layout to perform ways multiplexing last.
// this allows converting way index to binary in parallel with BRAM readaccess and way selection.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [BYTEENW-1:0] line_wren;
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i];
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end
end
assign line_wren = wren_w;
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (byteen)
`UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data)
assign wdata = fill_data;
assign wren = fill;
assign line_wdata = fill_data;
assign line_wren = fill;
end
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
VX_onehot_encoder #(
.N (NUM_WAYS)
@ -103,50 +148,52 @@ module VX_cache_data #(
`UNUSED_PIN (valid_out)
);
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata;
wire line_read = (read && ~stall)
|| (WRITEBACK && (fill || flush));
wire line_write = write || fill;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK),
.WRENW (BYTEENW),
.NO_RWCHECK (1)
.NO_RWCHECK (1),
.RW_ASSERT (1)
) data_store (
.clk (clk),
.read (1'b1),
.write (write || fill),
.wren (wren),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),
.addr (line_sel),
.wdata (wdata),
.rdata (rdata)
.wdata (line_wdata),
.rdata (line_rdata)
);
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = rdata[wsel];
assign per_way_rdata = line_rdata[wsel];
end else begin
`UNUSED_VAR (wsel)
assign per_way_rdata = rdata;
end
assign per_way_rdata = line_rdata;
end
assign read_data = per_way_rdata[way_idx];
`UNUSED_VAR (stall)
`ifdef DBG_TRACE_CACHE_DATA
always @(posedge clk) begin
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data));
end
if (read && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-read: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, read_data, req_uuid));
end
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
end
if (write && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, byteen, write_data, req_uuid));
end
end
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`ifndef VX_CACHE_DEFINE_VH
`define VX_CACHE_DEFINE_VH
`include "VX_define.vh"
`include "VX_define.vh"
`define CS_REQ_SEL_BITS `CLOG2(NUM_REQS)
@ -50,7 +50,7 @@
`define CS_TAG_SEL_ADDR_START (1+`CS_LINE_SEL_ADDR_END)
`define CS_TAG_SEL_ADDR_END (`CS_WORD_ADDR_WIDTH-1)
`define CS_LINE_TAG_ADDR(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
`define CS_LINE_ADDR_TAG(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
///////////////////////////////////////////////////////////////////////////////
@ -64,14 +64,14 @@
///////////////////////////////////////////////////////////////////////////////
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (((scount + dcount - 1) / dcount) > 1))
`define PERF_CACHE_ADD(dst, src, count) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
`endif // VX_CACHE_DEFINE_VH

165
hw/rtl/cache/VX_cache_flush.sv vendored Normal file
View file

@ -0,0 +1,165 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_flush #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of banks
parameter NUM_BANKS = 1,
// Bank select latency
parameter BANK_SEL_LATENCY = 1
) (
input wire clk,
input wire reset,
VX_mem_bus_if.slave core_bus_in_if [NUM_REQS],
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
input wire [NUM_BANKS-1:0] bank_req_fire,
output wire [NUM_BANKS-1:0] flush_begin,
input wire [NUM_BANKS-1:0] flush_end
);
localparam STATE_IDLE = 0;
localparam STATE_WAIT1 = 1;
localparam STATE_FLUSH = 2;
localparam STATE_WAIT2 = 3;
localparam STATE_DONE = 4;
reg [2:0] state, state_n;
// track in-flight core requests
wire no_inflight_reqs;
if (BANK_SEL_LATENCY != 0) begin
localparam NUM_REQS_W = `CLOG2(NUM_REQS+1);
localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);
wire [NUM_REQS-1:0] core_bus_out_fire;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready;
end
wire [NUM_REQS_W-1:0] core_bus_out_cnt;
wire [NUM_BANKS_W-1:0] bank_req_cnt;
`POP_COUNT(core_bus_out_cnt, core_bus_out_fire);
`POP_COUNT(bank_req_cnt, bank_req_fire);
`UNUSED_VAR (core_bus_out_cnt)
VX_pending_size #(
.SIZE (BANK_SEL_LATENCY * NUM_BANKS),
.INCRW (NUM_BANKS_W),
.DECRW (NUM_BANKS_W)
) pending_size (
.clk (clk),
.reset (reset),
.incr (NUM_BANKS_W'(core_bus_out_cnt)),
.decr (bank_req_cnt),
.empty (no_inflight_reqs),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end else begin
assign no_inflight_reqs = 0;
`UNUSED_VAR (bank_req_fire)
end
reg [NUM_BANKS-1:0] flush_done, flush_done_n;
wire [NUM_REQS-1:0] flush_req_mask;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
end
wire flush_req_enable = (| flush_req_mask);
reg [NUM_REQS-1:0] lock_released, lock_released_n;
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire input_enable = ~flush_req_enable || lock_released[i];
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable;
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_bus_out_if[i].req_ready && input_enable;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_in_if[i].rsp_valid = core_bus_out_if[i].rsp_valid;
assign core_bus_in_if[i].rsp_data = core_bus_out_if[i].rsp_data;
assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready;
end
wire [NUM_REQS-1:0] core_bus_out_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready;
end
always @(*) begin
state_n = state;
flush_done_n = flush_done;
lock_released_n = lock_released;
case (state)
STATE_IDLE: begin
if (flush_req_enable) begin
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH;
end
end
STATE_WAIT1: begin
if (no_inflight_reqs) begin
state_n = STATE_FLUSH;
end
end
STATE_FLUSH: begin
// generate a flush request pulse
state_n = STATE_WAIT2;
end
STATE_WAIT2: begin
// wait for all banks to finish flushing
flush_done_n = flush_done | flush_end;
if (flush_done_n == {NUM_BANKS{1'b1}}) begin
state_n = STATE_DONE;
flush_done_n = '0;
// only release current flush requests
// and keep normal requests locked
lock_released_n = flush_req_mask;
end
end
STATE_DONE: begin
// wait until released flush requests are issued
// when returning to IDLE state other requests will unlock
lock_released_n = lock_released & ~core_bus_out_ready;
if (lock_released_n == 0) begin
state_n = STATE_IDLE;
end
end
endcase
end
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
flush_done <= '0;
lock_released <= '0;
end else begin
state <= state_n;
flush_done <= flush_done_n;
lock_released <= lock_released_n;
end
end
assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}};
endmodule

View file

@ -1,51 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_init #(
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1
) (
input wire clk,
input wire reset,
output wire [`CS_LINE_SEL_BITS-1:0] addr_out,
output wire valid_out
);
reg enabled;
reg [`CS_LINE_SEL_BITS-1:0] line_ctr;
always @(posedge clk) begin
if (reset) begin
enabled <= 1;
line_ctr <= '0;
end else begin
if (enabled) begin
if (line_ctr == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
enabled <= 0;
end
line_ctr <= line_ctr + `CS_LINE_SEL_BITS'(1);
end
end
end
assign addr_out = line_ctr;
assign valid_out = enabled;
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,25 +13,53 @@
`include "VX_cache_define.vh"
// this is an implementation of a pipelined multi-banked cache
// we allocate a free slot from the MSHR before processing a core request
// and release the slot when we get a cache hit.
// during a memory fill response we initiate the replay sequence
// and dequeue all associated pending entries.
// This is an implementation of a MSHR for pipelined multi-banked cache.
// We allocate a free slot from the MSHR before processing a core request
// and release the slot when we get a cache hit. This ensure that we do not
// enter the cache bank pipeline when the MSHR is full.
// During a memory fill response, we initiate the replay sequence
// and dequeue all pending entries for the given cache line.
//
// Pending core requests stored in the MSHR are sorted by the order of
// arrival and are dequeued in the same order.
// Each entry has a next pointer to the next entry pending for the same cache line.
//
// During the fill operation, the MSHR will release the MSHR entry at fill_id
// which represents the first request in the pending list that initiated the memory fill.
//
// The dequeue operation directly follows the fill operation and will release
// all the subsequent entries linked to fill_id (pending the same cache line).
//
// During the allocation operation, the MSHR will allocate the next free slot
// for the incoming core request. We return the allocated slot id as well as
// the slot id of the previous entry for the same cache line. This is used to
// link the new entry to the pending list during finalization.
//
// The lookup operation is used to find all pending entries for a given cache line.
// This is used to by the cache bank to determine if a cache miss is already pending
// and therefore avoid issuing a memory fill request.
//
// The finalize operation is used to release the allocated MSHR entry if we had a hit.
// If we had a miss and finalize_pending is true, we link the allocated entry to
// its corresponding pending list (via finalize_prev).
//
// Warning: This MSHR implementation is strongly coupled with the bank pipeline
// and as such changes to either module requires careful evaluation.
// This implementation makes the following assumptions:
// (1) two-cycle pipeline: st0 and st1.
// (2) core request flow: st0: allocate / lookup, st1: finalize.
// (3) the first dequeue after the fill should happen in st0, when the fill is in st1
// this is enforced inside the bank by "rdw_hazard_st0".
//
// This architecture implements three pipeline stages:
// - Arbitration: cache bank arbitration before entering pipeline.
// fill and dequeue operations are executed at this stage.
// - stage 0: cache bank tag access stage.
// allocate and lookup operations are executed at this stage.
// - stage 1: cache bank tdatag access stage.
// finalize operation is executed at this stage.
//
module VX_cache_mshr #(
parameter `STRING INSTANCE_ID= "",
parameter BANK_ID = 0,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
// Miss Reserv Queue Knob
@ -51,26 +79,12 @@ module VX_cache_mshr #(
input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid,
`IGNORE_UNUSED_END
// allocate
input wire allocate_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] allocate_addr,
input wire allocate_rw,
input wire [DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_tail,
output wire allocate_ready,
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_matches,
// memory fill
input wire fill_valid,
input wire [MSHR_ADDR_WIDTH-1:0] fill_id,
output wire [`CS_LINE_ADDR_WIDTH-1:0] fill_addr,
// dequeue
// dequeue
output wire dequeue_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] dequeue_addr,
output wire dequeue_rw,
@ -78,30 +92,45 @@ module VX_cache_mshr #(
output wire [MSHR_ADDR_WIDTH-1:0] dequeue_id,
input wire dequeue_ready,
// allocate
input wire allocate_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] allocate_addr,
input wire allocate_rw,
input wire [DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_prev,
output wire allocate_ready,
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_pending,
output wire [MSHR_SIZE-1:0] lookup_rw,
// finalize
input wire finalize_valid,
input wire finalize_release,
input wire finalize_pending,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_tail
input wire [MSHR_ADDR_WIDTH-1:0] finalize_prev
);
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (BANK_ID)
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0];
reg [MSHR_ADDR_WIDTH-1:0] next_index [MSHR_SIZE-1:0];
reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
reg [MSHR_SIZE-1:0] next_table, next_table_x, next_table_n;
reg [MSHR_SIZE-1:0] write_table;
reg allocate_rdy, allocate_rdy_n;
reg [MSHR_ADDR_WIDTH-1:0] allocate_id_r, allocate_id_n;
reg dequeue_val, dequeue_val_n;
reg [MSHR_ADDR_WIDTH-1:0] dequeue_id_r, dequeue_id_n;
wire [MSHR_ADDR_WIDTH-1:0] tail_idx;
wire [MSHR_ADDR_WIDTH-1:0] prev_idx;
wire allocate_fire = allocate_valid && allocate_ready;
wire dequeue_fire = dequeue_valid && dequeue_ready;
@ -121,9 +150,9 @@ module VX_cache_mshr #(
VX_onehot_encoder #(
.N (MSHR_SIZE)
) tail_sel (
) prev_sel (
.data_in (addr_matches & ~next_table_x),
.data_out (tail_idx),
.data_out (prev_idx),
`UNUSED_PIN (valid_out)
);
@ -152,7 +181,7 @@ module VX_cache_mshr #(
valid_table_n[finalize_id] = 0;
end
if (finalize_pending) begin
next_table_x[finalize_tail] = 1;
next_table_x[finalize_prev] = 1;
end
end
@ -180,21 +209,21 @@ module VX_cache_mshr #(
end
if (finalize_valid && finalize_pending) begin
next_index[finalize_tail] <= finalize_id;
next_index[finalize_prev] <= finalize_id;
end
dequeue_id_r <= dequeue_id_n;
allocate_id_r <= allocate_id_n;
next_table <= next_table_n;
end
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s-bank%0d inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s-bank%0d invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s-bank%0d invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
VX_dp_ram #(
@ -203,10 +232,11 @@ module VX_cache_mshr #(
.LUTRAM (1)
) entries (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (allocate_valid),
`UNUSED_PIN (wren),
.waddr (allocate_id_r),
.wren (1'b1),
.waddr (allocate_id_r),
.wdata (allocate_data),
.raddr (dequeue_id_r),
.rdata (dequeue_data)
@ -216,18 +246,20 @@ module VX_cache_mshr #(
assign allocate_ready = allocate_rdy;
assign allocate_id = allocate_id_r;
assign allocate_tail = tail_idx;
assign allocate_prev = prev_idx;
assign dequeue_valid = dequeue_val;
assign dequeue_addr = addr_table[dequeue_id_r];
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
assign lookup_matches = addr_matches & ~write_table;
// return pending entries for the given cache line
assign lookup_pending = addr_matches;
assign lookup_rw = write_table;
`UNUSED_VAR (lookup_valid)
`ifdef DBG_TRACE_CACHE_MSHR
`ifdef DBG_TRACE_CACHE
reg show_table;
always @(posedge clk) begin
if (reset) begin
@ -236,22 +268,22 @@ module VX_cache_mshr #(
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
end
if (allocate_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-allocate: addr=0x%0h, tail=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_tail, allocate_id, lkp_req_uuid));
`TRACE(3, ("%d: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid));
if (lookup_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_matches, lkp_req_uuid));
`TRACE(3, ("%d: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid));
if (finalize_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-finalize release=%b, pending=%b, tail=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
finalize_release, finalize_pending, finalize_tail, finalize_id, fin_req_uuid));
`TRACE(3, ("%d: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid));
if (fill_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, BANK_ID,
`TRACE(3, ("%d: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
if (dequeue_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`TRACE(3, ("%d: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
if (show_table) begin
`TRACE(3, ("%d: %s-bank%0d mshr-table", $time, INSTANCE_ID, BANK_ID));
`TRACE(3, ("%d: %s table", $time, INSTANCE_ID));
for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));
@ -264,7 +296,7 @@ module VX_cache_mshr #(
end
end
`TRACE(3, ("\n"));
end
end
end
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,15 +17,17 @@ module VX_cache_tags #(
parameter `STRING INSTANCE_ID = "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 1,
parameter WORD_SIZE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -38,79 +40,137 @@ module VX_cache_tags #(
input wire stall,
// read/fill
// init/fill/lookup
input wire init,
input wire flush,
input wire fill,
input wire write,
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire fill,
input wire init,
output wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches
input wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches,
// eviction
output wire evict_dirty,
output wire [NUM_WAYS-1:0] evict_way,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS;
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_TAG_ADDR(line_addr);
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
wire [NUM_WAYS-1:0] read_dirty;
if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] repl_way;
reg [NUM_WAYS-1:0] evict_way_r;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
repl_way <= 1;
end else if (~stall) begin // hold the value on stalls prevent filling different slots twice
repl_way <= {repl_way[NUM_WAYS-2:0], repl_way[NUM_WAYS-1]};
evict_way_r <= 1;
end else if (~stall) begin // holding the value on stalls prevents filling different slots twice
evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]};
end
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign way_sel[i] = fill && repl_way[i];
end
assign evict_way = fill ? evict_way_r : way_sel;
VX_onehot_mux #(
.DATAW (`CS_TAG_SEL_BITS),
.N (NUM_WAYS)
) evict_tag_sel (
.data_in (read_tag),
.sel_in (evict_way),
.data_out (evict_tag)
);
end else begin
`UNUSED_VAR (stall)
assign way_sel = fill;
assign evict_way = 1'b1;
assign evict_tag = read_tag;
end
// fill and flush need to also read in writeback mode
wire fill_s = fill && (!WRITEBACK || ~stall);
wire flush_s = flush && (!WRITEBACK || ~stall);
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [`CS_TAG_SEL_BITS-1:0] read_tag;
wire read_valid;
wire do_fill = fill_s && evict_way[i];
wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i];
wire line_read = (WRITEBACK && (fill_s || flush_s));
wire line_write = init || do_fill || do_flush || do_write;
wire line_valid = ~(init || flush);
wire [TAG_WIDTH-1:0] line_wdata;
wire [TAG_WIDTH-1:0] line_rdata;
if (WRITEBACK) begin
assign line_wdata = {line_valid, write, line_tag};
assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata;
end else begin
assign line_wdata = {line_valid, line_tag};
assign {read_valid[i], read_tag[i]} = line_rdata;
assign read_dirty[i] = 1'b0;
end
VX_sp_ram #(
.DATAW (TAG_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.NO_RWCHECK (1)
.NO_RWCHECK (1),
.RW_ASSERT (1)
) tag_store (
.clk (clk),
.read (1'b1),
.write (way_sel[i] || init),
`UNUSED_PIN (wren),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (1'b1),
.addr (line_sel),
.wdata ({~init, line_tag}),
.rdata ({read_valid, read_tag})
.wdata (line_wdata),
.rdata (line_rdata)
);
assign tag_matches[i] = read_valid && (line_tag == read_tag);
end
`ifdef DBG_TRACE_CACHE_TAG
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
assign evict_dirty = | (read_dirty & evict_way);
`ifdef DBG_TRACE_CACHE
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel};
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d tag-fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag));
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID)));
end
if (init) begin
`TRACE(3, ("%d: %s-bank%0d tag-init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
`TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty));
end
if (lookup && ~stall) begin
if (tag_matches != 0) begin
`TRACE(3, ("%d: %s-bank%0d tag-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag, req_uuid));
if (write)
`TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
end else begin
`TRACE(3, ("%d: %s-bank%0d tag-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
if (write)
`TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
end
end
end
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -20,20 +20,20 @@ module VX_cache_top import VX_gpu_pkg::*; #(
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -42,20 +42,26 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = 16,
// Core response output register
parameter CORE_OUT_REG = 2,
// Core response output buffer
parameter CORE_OUT_BUF = 2,
// Memory request output register
parameter MEM_OUT_REG = 2,
// Memory request output buffer
parameter MEM_OUT_BUF = 2,
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
) (
) (
input wire clk,
input wire reset,
@ -69,6 +75,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
input wire [NUM_REQS-1:0] core_req_rw,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype,
input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_REQS-1:0] core_req_ready,
@ -81,17 +88,17 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
VX_mem_bus_if #(
@ -110,6 +117,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
assign core_bus_if[i].req_data.rw = core_req_rw[i];
assign core_bus_if[i].req_data.byteen = core_req_byteen[i];
assign core_bus_if[i].req_data.addr = core_req_addr[i];
assign core_bus_if[i].req_data.atype = core_req_atype[i];
assign core_bus_if[i].req_data.data = core_req_data[i];
assign core_bus_if[i].req_data.tag = core_req_tag[i];
assign core_req_ready[i] = core_bus_if[i].req_ready;
@ -125,17 +133,18 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Memory request
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
VX_cache #(
@ -153,8 +162,10 @@ module VX_cache_top import VX_gpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
.WRITE_ENABLE (WRITE_ENABLE),
.CORE_OUT_REG (CORE_OUT_REG),
.MEM_OUT_REG (MEM_OUT_REG)
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.CORE_OUT_BUF (CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache (
`ifdef PERF_ENABLE
.cache_perf (cache_perf),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,24 +16,27 @@
module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter TAG_SEL_IDX = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
// Number of associative ways
parameter NUM_WAYS = 1,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 8,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
@ -42,6 +45,12 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -49,19 +58,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter TAG_WIDTH = UUID_WIDTH + 1,
// enable bypass for non-cacheable addresses
parameter NC_TAG_BIT = 0,
parameter NC_ENABLE = 0,
// Force bypass for all requests
parameter PASSTHRU = 0,
// Core response output register
parameter CORE_OUT_REG = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
// Memory request output register
parameter MEM_OUT_REG = 0
// Memory request output buffer
parameter MEM_OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -74,283 +82,91 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter: NUM_BANKS=%d, NUM_REQS=%d", NUM_BANKS, NUM_REQS))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CORE_TAG_X_WIDTH = TAG_WIDTH - NC_ENABLE;
localparam MEM_TAG_X_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam NC_BYPASS = (NC_ENABLE || PASSTHRU);
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus_if[i].req_valid;
assign core_req_rw[i] = core_bus_if[i].req_data.rw;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
end
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus_cache_if[NUM_REQS]();
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
) mem_bus_cache_if();
// Core response buffering
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
if (NC_OR_BYPASS) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
`RESET_RELAY (core_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE ((NC_BYPASS && !DIRECT_PASSTHRU) ? `OUT_REG_TO_EB_SIZE(CORE_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(CORE_OUT_REG))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready)
);
end
///////////////////////////////////////////////////////////////////////////
// Memory request buffering
wire mem_req_valid_s;
wire mem_req_rw_s;
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_ready_s;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE ((NC_BYPASS && !DIRECT_PASSTHRU) ? `OUT_REG_TO_EB_SIZE(MEM_OUT_REG) : 0),
.OUT_REG (`OUT_REG_TO_EB_REG(MEM_OUT_REG))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
///////////////////////////////////////////////////////////////////////////
// Core request
wire [NUM_REQS-1:0] core_req_valid_b;
wire [NUM_REQS-1:0] core_req_rw_b;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr_b;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_b;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data_b;
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_b;
wire [NUM_REQS-1:0] core_req_ready_b;
// Core response
wire [NUM_REQS-1:0] core_rsp_valid_b;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_b;
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_b;
wire [NUM_REQS-1:0] core_rsp_ready_b;
// Memory request
wire mem_req_valid_b;
wire mem_req_rw_b;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_b;
wire [LINE_SIZE-1:0] mem_req_byteen_b;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_b;
wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag_b;
wire mem_req_ready_b;
// Memory response
wire mem_rsp_valid_b;
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_b;
wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag_b;
wire mem_rsp_ready_b;
if (NC_BYPASS) begin
`RESET_RELAY (nc_bypass_reset, reset);
VX_cache_bypass #(
.NUM_REQS (NUM_REQS),
.NC_TAG_BIT (NC_TAG_BIT),
.TAG_SEL_IDX (TAG_SEL_IDX),
.NC_ENABLE (NC_ENABLE),
.PASSTHRU (PASSTHRU),
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (LINE_SIZE),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_DATA_SIZE (WORD_SIZE),
.CORE_TAG_IN_WIDTH (TAG_WIDTH),
.CORE_TAG_WIDTH (TAG_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_DATA_SIZE (LINE_SIZE),
.MEM_TAG_IN_WIDTH (MEM_TAG_X_WIDTH),
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH)
.UUID_WIDTH (UUID_WIDTH),
.CORE_OUT_BUF (CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache_bypass (
.clk (clk),
.reset (nc_bypass_reset),
.clk (clk),
.reset (nc_bypass_reset),
// Core request in
.core_req_valid_in (core_req_valid),
.core_req_rw_in (core_req_rw),
.core_req_byteen_in (core_req_byteen),
.core_req_addr_in (core_req_addr),
.core_req_data_in (core_req_data),
.core_req_tag_in (core_req_tag),
.core_req_ready_in (core_req_ready),
.core_bus_in_if (core_bus_if),
.core_bus_out_if(core_bus_cache_if),
// Core request out
.core_req_valid_out (core_req_valid_b),
.core_req_rw_out (core_req_rw_b),
.core_req_byteen_out(core_req_byteen_b),
.core_req_addr_out (core_req_addr_b),
.core_req_data_out (core_req_data_b),
.core_req_tag_out (core_req_tag_b),
.core_req_ready_out (core_req_ready_b),
// Core response in
.core_rsp_valid_in (core_rsp_valid_b),
.core_rsp_data_in (core_rsp_data_b),
.core_rsp_tag_in (core_rsp_tag_b),
.core_rsp_ready_in (core_rsp_ready_b),
// Core response out
.core_rsp_valid_out (core_rsp_valid_s),
.core_rsp_data_out (core_rsp_data_s),
.core_rsp_tag_out (core_rsp_tag_s),
.core_rsp_ready_out (core_rsp_ready_s),
// Memory request in
.mem_req_valid_in (mem_req_valid_b),
.mem_req_rw_in (mem_req_rw_b),
.mem_req_addr_in (mem_req_addr_b),
.mem_req_byteen_in (mem_req_byteen_b),
.mem_req_data_in (mem_req_data_b),
.mem_req_tag_in (mem_req_tag_b),
.mem_req_ready_in (mem_req_ready_b),
// Memory request out
.mem_req_valid_out (mem_req_valid_s),
.mem_req_addr_out (mem_req_addr_s),
.mem_req_rw_out (mem_req_rw_s),
.mem_req_byteen_out (mem_req_byteen_s),
.mem_req_data_out (mem_req_data_s),
.mem_req_tag_out (mem_req_tag_s),
.mem_req_ready_out (mem_req_ready_s),
// Memory response in
.mem_rsp_valid_in (mem_bus_if.rsp_valid),
.mem_rsp_data_in (mem_bus_if.rsp_data.data),
.mem_rsp_tag_in (mem_bus_if.rsp_data.tag),
.mem_rsp_ready_in (mem_bus_if.rsp_ready),
// Memory response out
.mem_rsp_valid_out (mem_rsp_valid_b),
.mem_rsp_data_out (mem_rsp_data_b),
.mem_rsp_tag_out (mem_rsp_tag_b),
.mem_rsp_ready_out (mem_rsp_ready_b)
);
end else begin
assign core_req_valid_b = core_req_valid;
assign core_req_rw_b = core_req_rw;
assign core_req_addr_b = core_req_addr;
assign core_req_byteen_b= core_req_byteen;
assign core_req_data_b = core_req_data;
assign core_req_tag_b = core_req_tag;
assign core_req_ready = core_req_ready_b;
assign core_rsp_valid_s = core_rsp_valid_b;
assign core_rsp_data_s = core_rsp_data_b;
assign core_rsp_tag_s = core_rsp_tag_b;
assign core_rsp_ready_b = core_rsp_ready_s;
assign mem_req_valid_s = mem_req_valid_b;
assign mem_req_addr_s = mem_req_addr_b;
assign mem_req_rw_s = mem_req_rw_b;
assign mem_req_byteen_s = mem_req_byteen_b;
assign mem_req_data_s = mem_req_data_b;
assign mem_req_ready_b = mem_req_ready_s;
// Add explicit NC=0 flag to the memory request tag
VX_bits_insert #(
.N (MEM_TAG_WIDTH-1),
.POS (NC_TAG_BIT)
) mem_req_tag_insert (
.data_in (mem_req_tag_b),
.sel_in (1'b0),
.data_out (mem_req_tag_s)
.mem_bus_in_if (mem_bus_cache_if),
.mem_bus_out_if (mem_bus_if)
);
assign mem_rsp_valid_b = mem_bus_if.rsp_valid;
assign mem_rsp_data_b = mem_bus_if.rsp_data.data;
assign mem_bus_if.rsp_ready = mem_rsp_ready_b;
end else begin
// Remove NC flag from the memory response tag
for (genvar i = 0; i < NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end
VX_bits_remove #(
.N (MEM_TAG_WIDTH),
.POS (NC_TAG_BIT)
) mem_rsp_tag_remove (
.data_in (mem_bus_if.rsp_data.tag),
.data_out (mem_rsp_tag_b)
);
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
end
if (PASSTHRU != 0) begin
`UNUSED_VAR (core_req_valid_b)
`UNUSED_VAR (core_req_rw_b)
`UNUSED_VAR (core_req_addr_b)
`UNUSED_VAR (core_req_byteen_b)
`UNUSED_VAR (core_req_data_b)
`UNUSED_VAR (core_req_tag_b)
assign core_req_ready_b = '0;
for (genvar i = 0; i < NUM_REQS; ++i) begin
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
`UNUSED_VAR (core_bus_cache_if[i].req_data)
assign core_bus_cache_if[i].req_ready = 0;
assign core_rsp_valid_b = '0;
assign core_rsp_data_b = '0;
assign core_rsp_tag_b = '0;
`UNUSED_VAR (core_rsp_ready_b)
assign core_bus_cache_if[i].rsp_valid = 0;
assign core_bus_cache_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
end
assign mem_req_valid_b = 0;
assign mem_req_addr_b = '0;
assign mem_req_rw_b = '0;
assign mem_req_byteen_b = '0;
assign mem_req_data_b = '0;
assign mem_req_tag_b = '0;
`UNUSED_VAR (mem_req_ready_b)
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
`UNUSED_VAR (mem_bus_cache_if.req_ready)
`UNUSED_VAR (mem_rsp_valid_b)
`UNUSED_VAR (mem_rsp_data_b)
`UNUSED_VAR (mem_rsp_tag_b)
assign mem_rsp_ready_b = 0;
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
assign mem_bus_cache_if.rsp_ready = 0;
`ifdef PERF_ENABLE
assign cache_perf = '0;
@ -358,46 +174,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
end else begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_X_WIDTH)
) core_bus_wrap_if[NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_X_WIDTH)
) mem_bus_wrap_if();
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_wrap_if[i].req_valid = core_req_valid_b[i];
assign core_bus_wrap_if[i].req_data.rw = core_req_rw_b[i];
assign core_bus_wrap_if[i].req_data.addr = core_req_addr_b[i];
assign core_bus_wrap_if[i].req_data.byteen = core_req_byteen_b[i];
assign core_bus_wrap_if[i].req_data.data = core_req_data_b[i];
assign core_bus_wrap_if[i].req_data.tag = core_req_tag_b[i];
assign core_req_ready_b[i] = core_bus_wrap_if[i].req_ready;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_valid_b[i] = core_bus_wrap_if[i].rsp_valid;
assign core_rsp_data_b[i] = core_bus_wrap_if[i].rsp_data.data;
assign core_rsp_tag_b[i] = core_bus_wrap_if[i].rsp_data.tag;
assign core_bus_wrap_if[i].rsp_ready = core_rsp_ready_b[i];
end
assign mem_req_valid_b = mem_bus_wrap_if.req_valid;
assign mem_req_addr_b = mem_bus_wrap_if.req_data.addr;
assign mem_req_rw_b = mem_bus_wrap_if.req_data.rw;
assign mem_req_byteen_b = mem_bus_wrap_if.req_data.byteen;
assign mem_req_data_b = mem_bus_wrap_if.req_data.data;
assign mem_req_tag_b = mem_bus_wrap_if.req_data.tag;
assign mem_bus_wrap_if.req_ready = mem_req_ready_b;
assign mem_bus_wrap_if.rsp_valid = mem_rsp_valid_b;
assign mem_bus_wrap_if.rsp_data.data = mem_rsp_data_b;
assign mem_bus_wrap_if.rsp_data.tag = mem_rsp_tag_b;
assign mem_rsp_ready_b = mem_bus_wrap_if.rsp_ready;
`RESET_RELAY (cache_reset, reset);
VX_cache #(
@ -413,25 +189,25 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (CORE_TAG_X_WIDTH),
.CORE_OUT_REG (NC_BYPASS ? 1 : CORE_OUT_REG),
.MEM_OUT_REG (NC_BYPASS ? 1 : MEM_OUT_REG)
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
) cache (
.clk (clk),
.reset (cache_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.core_bus_if (core_bus_wrap_if),
.mem_bus_if (mem_bus_wrap_if)
.core_bus_if (core_bus_cache_if),
.mem_bus_if (mem_bus_cache_if)
);
end
`ifdef DBG_TRACE_CACHE_BANK
`ifdef DBG_TRACE_CACHE
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
@ -451,20 +227,20 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (core_req_fire) begin
if (core_bus_if[i].req_data.rw)
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
else
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
end
if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
end
end
end
end
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_BYPASS != 0)) begin
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
@ -478,17 +254,17 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%0h (#%0d)\n",
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
end
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,23 +13,23 @@
`include "VX_define.vh"
module VX_int_unit #(
parameter CORE_ID = 0,
module VX_alu_int #(
parameter `STRING INSTANCE_ID = "",
parameter BLOCK_IDX = 0,
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
// Outputs
VX_commit_if.master commit_if,
VX_branch_ctl_if.master branch_ctl_if
);
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -40,9 +40,9 @@ module VX_int_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] add_result;
wire [NUM_LANES-1:0][`XLEN:0] sub_result; // +1 bit for branch compare
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result;
reg [NUM_LANES-1:0][`XLEN-1:0] shr_zic_result;
reg [NUM_LANES-1:0][`XLEN-1:0] msc_result;
wire [NUM_LANES-1:0][`XLEN-1:0] add_result_w;
wire [NUM_LANES-1:0][`XLEN-1:0] sub_result_w;
wire [NUM_LANES-1:0][`XLEN-1:0] shr_result_w;
@ -52,26 +52,24 @@ module VX_int_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] alu_result_r;
`ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod);
wire is_alu_w = execute_if.data.op_args.alu.is_w;
`else
wire is_alu_w = 0;
`endif
`UNUSED_VAR (execute_if.data.op_mod)
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(execute_if.data.op_type);
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(execute_if.data.op_type);
wire is_br_op = `INST_ALU_IS_BR(execute_if.data.op_mod);
wire is_br_op = (execute_if.data.op_args.alu.xtype == `ALU_TYPE_BRANCH);
wire is_sub_op = `INST_ALU_IS_SUB(alu_op);
wire is_signed = `INST_ALU_SIGNED(alu_op);
wire is_signed = `INST_ALU_SIGNED(alu_op);
wire [1:0] op_class = is_br_op ? `INST_BR_CLASS(alu_op) : `INST_ALU_CLASS(alu_op);
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.use_PC ? {NUM_LANES{execute_if.data.PC}} : alu_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.use_imm ? {NUM_LANES{execute_if.data.imm}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.use_imm && ~is_br_op) ? {NUM_LANES{execute_if.data.imm}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1_PC = execute_if.data.op_args.alu.use_PC ? {NUM_LANES{execute_if.data.PC, 1'd0}} : alu_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
@ -85,9 +83,20 @@ module VX_int_unit #(
assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0]));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
assign shr_result[i] = `XLEN'($signed(shr_in1) >>> alu_in2_imm[i][SHIFT_IMM_BITS-1:0]);
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
always @(*) begin
case (alu_op[1:0])
`ifdef EXT_ZICOND_ENABLE
2'b10, 2'b11: begin // CZERO
shr_zic_result[i] = alu_in1[i] & {`XLEN{alu_op[0] ^ (| alu_in2[i])}};
end
`endif
default: begin // SRL, SRA, SRLI, SRAI
shr_zic_result[i] = `XLEN'($signed(shr_in1) >>> alu_in2_imm[i][SHIFT_IMM_BITS-1:0]);
end
endcase
end
wire [32:0] shr_in1_w = {is_signed && alu_in1[i][31], alu_in1[i][31:0]};
wire [31:0] shr_res_w = 32'($signed(shr_in1_w) >>> alu_in2_imm[i][4:0]);
assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
@ -102,48 +111,51 @@ module VX_int_unit #(
2'b11: msc_result[i] = alu_in1[i] << alu_in2_imm[i][SHIFT_IMM_BITS-1:0]; // SLL
endcase
end
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0]));
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
always @(*) begin
case ({is_alu_w, op_class})
case ({is_alu_w, op_class})
3'b000: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
3'b001: alu_result[i] = sub_slt_br_result; // SUB, SLTU, SLTI, BR*
3'b010: alu_result[i] = shr_result[i]; // SRL, SRA, SRLI, SRAI
3'b010: alu_result[i] = shr_zic_result[i]; // SRL, SRA, SRLI, SRAI, CZERO*
3'b011: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL, SLLI
3'b100: alu_result[i] = add_result_w[i]; // ADDIW, ADDW
3'b101: alu_result[i] = sub_result_w[i]; // SUBW
3'b110: alu_result[i] = shr_result_w[i]; // SRLW, SRAW, SRLIW, SRAIW
3'b111: alu_result[i] = msc_result_w[i]; // SLLW
endcase
end
end
end
// branch
wire [`XLEN-1:0] PC_r, imm_r;
wire [`PC_BITS-1:0] PC_r;
wire [`INST_BR_BITS-1:0] br_op_r;
wire [`PC_BITS-1:0] cbr_dest, cbr_dest_r;
wire [LANE_WIDTH-1:0] tid, tid_r;
wire is_br_op_r;
assign cbr_dest = add_result[0][1 +: `PC_BITS];
if (LANE_BITS != 0) begin
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
assign tid = 0;
end
end
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `XLEN + `XLEN + 1 + `INST_BR_BITS + LANE_WIDTH)
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `PC_BITS + `PC_BITS + 1 + `INST_BR_BITS + LANE_WIDTH)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (execute_if.valid),
.ready_in (execute_if.ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, execute_if.data.imm, is_br_op, br_op, tid}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, imm_r, is_br_op_r, br_op_r, tid_r}),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, alu_result, execute_if.data.PC, cbr_dest, is_br_op, br_op, tid}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, alu_result_r, PC_r, cbr_dest_r, is_br_op_r, br_op_r, tid_r}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
@ -152,38 +164,38 @@ module VX_int_unit #(
wire is_br_neg = `INST_BR_IS_NEG(br_op_r);
wire is_br_less = `INST_BR_IS_LESS(br_op_r);
wire is_br_static = `INST_BR_IS_STATIC(br_op_r);
wire [`XLEN-1:0] br_result = alu_result_r[tid_r];
wire [`XLEN-1:0] br_result = alu_result_r[tid_r];
wire is_less = br_result[0];
wire is_equal = br_result[1];
wire br_enable = is_br_op_r && commit_if.valid && commit_if.ready && commit_if.data.eop;
wire br_taken = ((is_br_less ? is_less : is_equal) ^ is_br_neg) | is_br_static;
wire [`XLEN-1:0] br_dest = is_br_static ? br_result : (PC_r + imm_r);
wire [`PC_BITS-1:0] br_dest = is_br_static ? br_result[1 +: `PC_BITS] : cbr_dest_r;
wire [`NW_WIDTH-1:0] br_wid;
`ASSIGN_BLOCKED_WID (br_wid, commit_if.data.wid, BLOCK_IDX, `NUM_ALU_BLOCKS)
VX_pipe_register #(
.DATAW (1 + `NW_WIDTH + 1 + `XLEN)
.DATAW (1 + `NW_WIDTH + 1 + `PC_BITS)
) branch_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_in ({br_enable, br_wid, br_taken, br_dest}),
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? (PC_r + 4) : alu_result_r[i];
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i];
end
assign commit_if.data.PC = PC_r;
`ifdef DBG_TRACE_CORE_PIPELINE
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
`TRACE(1, ("%d: core%0d-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, CORE_ID, branch_ctl_if.wid, commit_if.data.PC, branch_ctl_if.taken, branch_ctl_if.dest, commit_if.data.uuid));
if (br_enable) begin
`TRACE(1, ("%d: %s-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid));
end
end
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,23 +13,23 @@
`include "VX_define.vh"
module VX_muldiv_unit #(
parameter CORE_ID = 0,
module VX_alu_muldiv #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1
) (
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAGW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
localparam TAG_WIDTH = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data)
@ -38,7 +38,7 @@ module VX_muldiv_unit #(
wire is_mulx_op = `INST_M_IS_MULX(muldiv_op);
wire is_signed_op = `INST_M_SIGNED(muldiv_op);
`ifdef XLEN_64
wire is_alu_w = `INST_ALU_IS_W(execute_if.data.op_mod);
wire is_alu_w = execute_if.data.op_args.alu.is_w;
`else
wire is_alu_w = 0;
`endif
@ -47,43 +47,43 @@ module VX_muldiv_unit #(
wire [`UUID_WIDTH-1:0] mul_uuid_out;
wire [`NW_WIDTH-1:0] mul_wid_out;
wire [NUM_LANES-1:0] mul_tmask_out;
wire [`XLEN-1:0] mul_PC_out;
wire [`PC_BITS-1:0] mul_PC_out;
wire [`NR_BITS-1:0] mul_rd_out;
wire mul_wb_out;
wire [PID_WIDTH-1:0] mul_pid_out;
wire mul_sop_out, mul_eop_out;
wire mul_valid_in = execute_if.valid && is_mulx_op;
wire mul_ready_in;
wire mul_valid_out;
wire mul_ready_out;
wire is_mulh_in = `INST_M_IS_MULH(muldiv_op);
wire is_signed_mul_a = `INST_M_SIGNED_A(muldiv_op);
wire is_signed_mul_b = is_signed_op;
`ifdef IMUL_DPI
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_tmp;
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_tmp;
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin
reg [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin
dpi_imul (mul_fire_in, is_signed_mul_a, is_signed_mul_b, mul_in1, mul_in2, mul_resultl, mul_resulth);
end
assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : (is_alu_w ? `XLEN'($signed(mul_resultl[31:0])) : mul_resultl);
end
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
.clk(clk),
.clk (clk),
.reset (reset),
.enable (mul_ready_in),
.data_in ({mul_valid_in, execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, mul_result_tmp}),
@ -92,8 +92,8 @@ module VX_muldiv_unit #(
assign mul_ready_in = mul_ready_out || ~mul_valid_out;
`else
`else
wire [NUM_LANES-1:0][2*(`XLEN+1)-1:0] mul_result_tmp;
wire is_mulh_out;
wire is_mul_w_out;
@ -106,7 +106,7 @@ module VX_muldiv_unit #(
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
end
end
wire mul_strode;
wire mul_busy;
@ -115,7 +115,7 @@ module VX_muldiv_unit #(
.clk (clk),
.reset (reset),
.valid_in (mul_valid_in),
.ready_in (mul_ready_in),
.ready_in (mul_ready_in),
.valid_out (mul_valid_out),
.ready_out (mul_ready_out),
.strobe (mul_strode),
@ -128,31 +128,31 @@ module VX_muldiv_unit #(
.SIGNED (1)
) serial_mul (
.clk (clk),
.reset (reset),
.reset (reset),
.strobe (mul_strode),
.busy (mul_busy),
.busy (mul_busy),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp)
);
reg [TAGW+2-1:0] mul_tag_r;
reg [TAG_WIDTH+2-1:0] mul_tag_r;
always @(posedge clk) begin
if (mul_valid_in && mul_ready_in) begin
mul_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_mulh_in, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
assign {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out, is_mul_w_out, mul_pid_out, mul_sop_out, mul_eop_out} = mul_tag_r;
`else
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
VX_multiplier #(
.A_WIDTH (`XLEN+1),
.B_WIDTH (`XLEN+1),
@ -165,11 +165,11 @@ module VX_muldiv_unit #(
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp[i])
);
);
end
VX_shift_register #(
.DATAW (1 + TAGW + 1 + 1),
.DATAW (1 + TAG_WIDTH + 1 + 1),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
@ -186,8 +186,8 @@ module VX_muldiv_unit #(
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
mul_result_tmp[i][`XLEN-1:0]);
`else
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] : mul_result_tmp[i][`XLEN-1:0];
@ -203,7 +203,7 @@ module VX_muldiv_unit #(
wire [`UUID_WIDTH-1:0] div_uuid_out;
wire [`NW_WIDTH-1:0] div_wid_out;
wire [NUM_LANES-1:0] div_tmask_out;
wire [`XLEN-1:0] div_PC_out;
wire [`PC_BITS-1:0] div_PC_out;
wire [`NR_BITS-1:0] div_rd_out;
wire div_wb_out;
wire [PID_WIDTH-1:0] div_pid_out;
@ -211,7 +211,7 @@ module VX_muldiv_unit #(
wire is_rem_op = `INST_M_IS_REM(muldiv_op);
wire div_valid_in = execute_if.valid && ~is_mulx_op;
wire div_valid_in = execute_if.valid && ~is_mulx_op;
wire div_ready_in;
wire div_valid_out;
wire div_ready_out;
@ -226,25 +226,25 @@ module VX_muldiv_unit #(
`else
assign div_in1[i] = execute_if.data.rs1_data[i];
assign div_in2[i] = execute_if.data.rs2_data[i];
`endif
`endif
end
`ifdef IDIV_DPI
`ifdef IDIV_DPI
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
reg [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
end
assign div_result_in[i] = is_rem_op ? (is_alu_w ? `XLEN'($signed(div_remainder[31:0])) : div_remainder) :
assign div_result_in[i] = is_rem_op ? (is_alu_w ? `XLEN'($signed(div_remainder[31:0])) : div_remainder) :
(is_alu_w ? `XLEN'($signed(div_quotient[31:0])) : div_quotient);
end
VX_shift_register #(
.DATAW (1 + TAGW + (NUM_LANES * `XLEN)),
.DATAW (1 + TAG_WIDTH + (NUM_LANES * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) div_shift_reg (
@ -260,7 +260,7 @@ module VX_muldiv_unit #(
`else
wire [NUM_LANES-1:0][`XLEN-1:0] div_quotient, div_remainder;
wire is_rem_op_out;
wire is_rem_op_out;
wire is_div_w_out;
wire div_strode;
wire div_busy;
@ -285,31 +285,31 @@ module VX_muldiv_unit #(
) serial_div (
.clk (clk),
.reset (reset),
.strobe (div_strode),
.busy (div_busy),
.is_signed (is_signed_op),
.is_signed (is_signed_op),
.numer (div_in1),
.denom (div_in2),
.quotient (div_quotient),
.remainder (div_remainder)
.remainder (div_remainder)
);
reg [TAGW+2-1:0] div_tag_r;
reg [TAG_WIDTH+2-1:0] div_tag_r;
always @(posedge clk) begin
if (div_valid_in && div_ready_in) begin
div_tag_r <= {execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, is_rem_op, is_alu_w, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop};
end
end
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
`else
assign div_result_out[i] = is_rem_op_out ? div_remainder[i] : div_quotient[i];
`UNUSED_VAR (is_div_w_out)
@ -323,8 +323,9 @@ module VX_muldiv_unit #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAGW + (NUM_LANES * `XLEN)),
.OUT_REG (1)
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.ARBITER ("F"),
.OUT_BUF (1)
) rsp_buf (
.clk (clk),
.reset (reset),
@ -337,5 +338,5 @@ module VX_muldiv_unit #(
.ready_out (commit_if.ready),
`UNUSED_PIN (sel_out)
);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,73 +14,71 @@
`include "VX_define.vh"
module VX_alu_unit #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS]
);
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
localparam NUM_LANES = `NUM_ALU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
) per_block_execute_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 1 : 0)
.OUT_BUF (PARTIAL_BW ? 1 : 0)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.reset (reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
.execute_if (per_block_execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE]();
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
wire is_muldiv_op;
`RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1));
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) int_execute_if();
assign int_execute_if.valid = execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = execute_if[block_idx].data;
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) int_commit_if();
`RESET_RELAY (int_reset, reset);
assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = per_block_execute_if[block_idx].data;
VX_int_unit #(
.CORE_ID (CORE_ID),
VX_alu_int #(
.INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
.BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES)
) int_unit (
) alu_int (
.clk (clk),
.reset (int_reset),
.reset (block_reset),
.execute_if (int_execute_if),
.branch_ctl_if (branch_ctl_if[block_idx]),
.commit_if (int_commit_if)
@ -88,84 +86,78 @@ module VX_alu_unit #(
`ifdef EXT_M_ENABLE
assign is_muldiv_op = `INST_ALU_IS_M(execute_if[block_idx].data.op_mod);
`RESET_RELAY (mdv_reset, reset);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) mdv_execute_if();
assign mdv_execute_if.valid = execute_if[block_idx].valid && is_muldiv_op;
assign mdv_execute_if.data = execute_if[block_idx].data;
) muldiv_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) mdv_commit_if();
) muldiv_commit_if();
VX_muldiv_unit #(
.CORE_ID (CORE_ID),
assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
assign muldiv_execute_if.data = per_block_execute_if[block_idx].data;
VX_alu_muldiv #(
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
.NUM_LANES (NUM_LANES)
) mdv_unit (
) muldiv_unit (
.clk (clk),
.reset (mdv_reset),
.execute_if (mdv_execute_if),
.commit_if (mdv_commit_if)
);
assign execute_if[block_idx].ready = is_muldiv_op ? mdv_execute_if.ready : int_execute_if.ready;
`else
assign is_muldiv_op = 0;
assign execute_if[block_idx].ready = int_execute_if.ready;
.reset (block_reset),
.execute_if (muldiv_execute_if),
.commit_if (muldiv_commit_if)
);
`endif
assign per_block_execute_if[block_idx].ready =
`ifdef EXT_M_ENABLE
is_muldiv_op ? muldiv_execute_if.ready :
`endif
int_execute_if.ready;
// send response
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_REG (PARTIAL_BW ? 1 : 3)
.OUT_BUF (PARTIAL_BW ? 1 : 3),
.ARBITER ("F")
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in ({
.reset (block_reset),
.valid_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.valid,
muldiv_commit_if.valid,
`endif
int_commit_if.valid
}),
.ready_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.ready,
muldiv_commit_if.ready,
`endif
int_commit_if.ready
}),
.data_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.data,
muldiv_commit_if.data,
`endif
int_commit_if.data
}),
.data_out (commit_block_if[block_idx].data),
.valid_out (commit_block_if[block_idx].valid),
.ready_out (commit_block_if[block_idx].ready),
.data_out (per_block_commit_if[block_idx].data),
.valid_out (per_block_commit_if[block_idx].valid),
.ready_out (per_block_commit_if[block_idx].ready),
`UNUSED_PIN (sel_out)
);
end
`RESET_RELAY (commit_reset, reset);
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 3 : 0)
.OUT_BUF (PARTIAL_BW ? 3 : 0)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.commit_in_if (commit_block_if),
.reset (reset),
.commit_in_if (per_block_commit_if),
.commit_out_if (commit_if)
);

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,101 +13,82 @@
`include "VX_define.vh"
module VX_commit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave alu_commit_if [`ISSUE_WIDTH],
VX_commit_if.slave lsu_commit_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_commit_if.slave fpu_commit_if [`ISSUE_WIDTH],
`endif
VX_commit_if.slave sfu_commit_if [`ISSUE_WIDTH],
VX_commit_if.slave commit_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
// outputs
VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
VX_commit_csr_if.master commit_csr_if,
VX_commit_sched_if.master commit_sched_if,
// simulation helper signals
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value
VX_commit_sched_if.master commit_sched_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
// commit arbitration
VX_commit_if commit_if[`ISSUE_WIDTH]();
VX_commit_if commit_arb_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] per_issue_commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [`NUM_EX_UNITS-1:0] valid_in;
wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
wire [`NUM_EX_UNITS-1:0] ready_in;
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid;
assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data;
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (`NUM_EX_UNITS),
.DATAW (DATAW),
.ARBITER ("R"),
.OUT_REG (1)
.OUT_BUF (1)
) commit_arb (
.clk (clk),
.reset (arb_reset),
.valid_in ({
sfu_commit_if[i].valid,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].valid,
`endif
alu_commit_if[i].valid,
lsu_commit_if[i].valid
}),
.ready_in ({
sfu_commit_if[i].ready,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].ready,
`endif
alu_commit_if[i].ready,
lsu_commit_if[i].ready
}),
.data_in ({
sfu_commit_if[i].data,
`ifdef EXT_F_ENABLE
fpu_commit_if[i].data,
`endif
alu_commit_if[i].data,
lsu_commit_if[i].data
}),
.data_out (commit_if[i].data),
.valid_out (commit_if[i].valid),
.ready_out (commit_if[i].ready),
.clk (clk),
.reset (arb_reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.data_out (commit_arb_if[i].data),
.valid_out (commit_arb_if[i].valid),
.ready_out (commit_arb_if[i].ready),
`UNUSED_PIN (sel_out)
);
assign commit_fire[i] = commit_if[i].valid && commit_if[i].ready;
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_if[i].data.tmask;
assign commit_wid[i] = commit_if[i].data.wid;
assign commit_eop[i] = commit_if[i].data.eop;
assign per_issue_commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign per_issue_commit_tmask[i]= {`NUM_THREADS{per_issue_commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign per_issue_commit_wid[i] = commit_arb_if[i].data.wid;
assign per_issue_commit_eop[i] = commit_arb_if[i].data.eop;
end
// CSRs update
wire [`ISSUE_WIDTH-1:0][COMMIT_SIZEW-1:0] commit_size, commit_size_r;
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire);
assign commit_fire_any = (| per_issue_commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, commit_tmask[i]);
`POP_COUNT(count, per_issue_commit_tmask[i]);
assign commit_size[i] = count;
end
@ -155,69 +136,56 @@ module VX_commit import VX_gpu_pkg::*; #(
end
assign commit_csr_if.instret = instret;
// Committed instructions
// Track committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
reg [`NUM_WARPS-1:0] committed_warps;
always @(*) begin
committed_warps = 0;
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
if (per_issue_commit_fire[i] && per_issue_commit_eop[i]) begin
committed_warps[per_issue_commit_wid[i]] = 1;
end
end
end
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH)
.DATAW (`NUM_WARPS),
.RESETW (`NUM_WARPS)
) committed_pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
.data_in (committed_warps),
.data_out ({commit_sched_if.committed_warps})
);
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = commit_if[i].valid && commit_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_if[i].data.wid);
assign writeback_if[i].data.PC = commit_if[i].data.PC;
assign writeback_if[i].data.tmask= commit_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_if[i].data.rd;
assign writeback_if[i].data.data = commit_if[i].data.data;
assign writeback_if[i].data.sop = commit_if[i].data.sop;
assign writeback_if[i].data.eop = commit_if[i].data.eop;
assign commit_if[i].ready = 1'b1; // writeback has no backpressure
assign writeback_if[i].valid = commit_arb_if[i].valid && commit_arb_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid);
assign writeback_if[i].data.PC = commit_arb_if[i].data.PC;
assign writeback_if[i].data.tmask= commit_arb_if[i].data.tmask;
assign writeback_if[i].data.rd = commit_arb_if[i].data.rd;
assign writeback_if[i].data.data = commit_arb_if[i].data.data;
assign writeback_if[i].data.sop = commit_arb_if[i].data.sop;
assign writeback_if[i].data.eop = commit_arb_if[i].data.eop;
assign commit_arb_if[i].ready = 1'b1; // writeback has no backpressure
end
// simulation helper signal to get RISC-V tests Pass/Fail status
reg [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value_r;
always @(posedge clk) begin
if (writeback_if[0].valid) begin
sim_wb_value_r[writeback_if[0].data.rd] <= writeback_if[0].data.data[0];
end
end
assign sim_wb_value = sim_wb_value_r;
`ifdef DBG_TRACE_CORE_PIPELINE
`ifdef DBG_TRACE_PIPELINE
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (alu_commit_if[i].valid && alu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, alu_commit_if[i].data.wid, alu_commit_if[i].data.PC, alu_commit_if[i].data.tmask, alu_commit_if[i].data.wb, alu_commit_if[i].data.rd, alu_commit_if[i].data.sop, alu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, alu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", alu_commit_if[i].data.uuid));
end
if (lsu_commit_if[i].valid && lsu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, lsu_commit_if[i].data.wid, lsu_commit_if[i].data.PC, lsu_commit_if[i].data.tmask, lsu_commit_if[i].data.wb, lsu_commit_if[i].data.rd, lsu_commit_if[i].data.sop, lsu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, lsu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", lsu_commit_if[i].data.uuid));
end
`ifdef EXT_F_ENABLE
if (fpu_commit_if[i].valid && fpu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, fpu_commit_if[i].data.wid, fpu_commit_if[i].data.PC, fpu_commit_if[i].data.tmask, fpu_commit_if[i].data.wb, fpu_commit_if[i].data.rd, fpu_commit_if[i].data.sop, fpu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, fpu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", fpu_commit_if[i].data.uuid));
end
`endif
if (sfu_commit_if[i].valid && sfu_commit_if[i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=SFU, tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", $time, CORE_ID, sfu_commit_if[i].data.wid, sfu_commit_if[i].data.PC, sfu_commit_if[i].data.tmask, sfu_commit_if[i].data.wb, sfu_commit_if[i].data.rd, sfu_commit_if[i].data.sop, sfu_commit_if[i].data.eop));
`TRACE_ARRAY1D(1, sfu_commit_if[i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", sfu_commit_if[i].data.uuid));
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
always @(posedge clk) begin
if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
trace_ex_type(1, j);
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop));
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid));
end
end
end
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,11 +17,12 @@
`include "VX_fpu_define.vh"
`endif
module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
// Clock
input wire clk,
input wire reset,
@ -40,10 +41,6 @@ module VX_core import VX_gpu_pkg::*; #(
VX_gbar_bus_if.master gbar_bus_if,
`endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
@ -55,30 +52,21 @@ module VX_core import VX_gpu_pkg::*; #(
VX_commit_sched_if commit_sched_if();
VX_commit_csr_if commit_csr_if();
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
VX_warp_ctl_if warp_ctl_if();
VX_dispatch_if alu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if alu_commit_if[`ISSUE_WIDTH]();
VX_warp_ctl_if warp_ctl_if();
VX_dispatch_if lsu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if lsu_commit_if[`ISSUE_WIDTH]();
`ifdef EXT_F_ENABLE
VX_dispatch_if fpu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if fpu_commit_if[`ISSUE_WIDTH]();
`endif
VX_dispatch_if sfu_dispatch_if[`ISSUE_WIDTH]();
VX_commit_if sfu_commit_if[`ISSUE_WIDTH]();
VX_dispatch_if dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
VX_commit_if commit_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
@ -107,19 +95,21 @@ module VX_core import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (3)
VX_schedule #(
.INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (schedule_reset),
`ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule),
`endif
.sched_perf (pipeline_perf_if.sched),
`endif
.base_dcrs (base_dcrs),
.base_dcrs (base_dcrs),
.warp_ctl_if (warp_ctl_if),
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.decode_sched_if(decode_sched_if),
.commit_sched_if(commit_sched_if),
@ -127,13 +117,13 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef GBAR_ENABLE
.gbar_bus_if (gbar_bus_if),
`endif
.sched_csr_if (sched_csr_if),
.sched_csr_if (sched_csr_if),
.busy (busy)
);
VX_fetch #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID))
) fetch (
`SCOPE_IO_BIND (0)
.clk (clk),
@ -144,7 +134,7 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_decode #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID))
) decode (
.clk (clk),
.reset (decode_reset),
@ -154,7 +144,7 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_issue #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID))
) issue (
`SCOPE_IO_BIND (1)
@ -162,110 +152,192 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (issue_reset),
`ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue),
.issue_perf (pipeline_perf_if.issue),
`endif
.decode_if (decode_if),
.writeback_if (writeback_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
`endif
.sfu_dispatch_if(sfu_dispatch_if)
.dispatch_if (dispatch_if)
);
VX_execute #(
.INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) execute (
`SCOPE_IO_BIND (2)
.clk (clk),
.reset (execute_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_tmp_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
.fpu_commit_if (fpu_commit_if),
`endif
.dispatch_if (dispatch_if),
.commit_if (commit_if),
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
.sfu_dispatch_if(sfu_dispatch_if),
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if),
.sfu_commit_if (sfu_commit_if),
.sim_ebreak (sim_ebreak)
);
.branch_ctl_if (branch_ctl_if)
);
VX_commit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID))
) commit (
.clk (clk),
.reset (commit_reset),
.alu_commit_if (alu_commit_if),
.lsu_commit_if (lsu_commit_if),
`ifdef EXT_F_ENABLE
.fpu_commit_if (fpu_commit_if),
`endif
.sfu_commit_if (sfu_commit_if),
.writeback_if (writeback_if),
.commit_csr_if (commit_csr_if),
.commit_sched_if(commit_sched_if),
.commit_if (commit_if),
.sim_wb_value (sim_wb_value)
.writeback_if (writeback_if),
.commit_csr_if (commit_csr_if),
.commit_sched_if(commit_sched_if)
);
`ifdef SM_ENABLE
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
VX_smem_unit #(
.CORE_ID (CORE_ID)
) smem_unit (
.clk (clk),
.reset (reset),
`ifdef LMEM_ENABLE
`RESET_RELAY (lmem_unit_reset, reset);
VX_lmem_unit #(
.INSTANCE_ID (INSTANCE_ID)
) lmem_unit (
.clk (clk),
.reset (lmem_unit_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.smem),
.cache_perf (mem_perf_tmp_if.lmem),
`endif
.dcache_bus_in_if (dcache_bus_tmp_if),
.dcache_bus_out_if (dcache_bus_if)
.lsu_mem_in_if (lsu_mem_if),
.lsu_mem_out_if (lsu_dcache_if)
);
`else
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if();
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
`RESET_RELAY (mem_coalescer_reset, reset);
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) mem_coalescer (
.clk (clk),
.reset (mem_coalescer_reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_atype (lsu_dcache_if[i].req_data.atype),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if.req_valid),
.out_req_mask (dcache_coalesced_if.req_data.mask),
.out_req_rw (dcache_coalesced_if.req_data.rw),
.out_req_byteen (dcache_coalesced_if.req_data.byteen),
.out_req_addr (dcache_coalesced_if.req_data.addr),
.out_req_atype (dcache_coalesced_if.req_data.atype),
.out_req_data (dcache_coalesced_if.req_data.data),
.out_req_tag (dcache_coalesced_if.req_data.tag),
.out_req_ready (dcache_coalesced_if.req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if.rsp_valid),
.out_rsp_mask (dcache_coalesced_if.rsp_data.mask),
.out_rsp_data (dcache_coalesced_if.rsp_data.data),
.out_rsp_tag (dcache_coalesced_if.rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if.rsp_ready)
);
end else begin
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if, lsu_dcache_if[i]);
end
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
`RESET_RELAY (lsu_adapter_reset, reset);
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (lsu_adapter_reset),
.lsu_mem_if (dcache_coalesced_if),
.mem_bus_if (dcache_bus_tmp_if)
);
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
`ifdef PERF_ENABLE
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire [1:0] perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
wire [`CLOG2(LSU_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
@ -277,14 +349,16 @@ module VX_core import VX_gpu_pkg::*; #(
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [DCACHE_NUM_REQS-1:0] perf_dcache_rsp_fire;
wire [LSU_NUM_REQS-1:0] perf_dcache_rd_req_fire, perf_dcache_rd_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign perf_dcache_rd_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && ~dcache_bus_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i] = dcache_bus_if[i].req_valid && dcache_bus_if[i].req_ready && dcache_bus_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i] = dcache_bus_if[i].rsp_valid && dcache_bus_if[i].rsp_ready;
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;
end
end
`BUFFER(perf_dcache_rd_req_fire_r, perf_dcache_rd_req_fire);
@ -293,7 +367,7 @@ module VX_core import VX_gpu_pkg::*; #(
`POP_COUNT(perf_dcache_rd_req_per_cycle, perf_dcache_rd_req_fire_r);
`POP_COUNT(perf_dcache_wr_req_per_cycle, perf_dcache_wr_req_fire_r);
`POP_COUNT(perf_dcache_rsp_per_cycle, perf_dcache_rsp_fire);
assign perf_icache_pending_read_cycle = perf_icache_req_fire - perf_icache_rsp_fire;
assign perf_dcache_pending_read_cycle = perf_dcache_rd_req_per_cycle - perf_dcache_rsp_per_cycle;
@ -306,7 +380,7 @@ module VX_core import VX_gpu_pkg::*; #(
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
end
end
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,9 +17,9 @@
`include "VX_fpu_define.vh"
`endif
module VX_core_top import VX_gpu_pkg::*; #(
module VX_core_top import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
) (
// Clock
input wire clk,
input wire reset,
@ -32,13 +32,14 @@ module VX_core_top import VX_gpu_pkg::*; #(
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_req_tag,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
input wire [DCACHE_NUM_REQS-1:0] dcache_rsp_valid,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_rsp_data,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_rsp_tag,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_rsp_tag,
output wire [DCACHE_NUM_REQS-1:0] dcache_rsp_ready,
output wire icache_req_valid,
@ -57,34 +58,29 @@ module VX_core_top import VX_gpu_pkg::*; #(
`ifdef GBAR_ENABLE
output wire gbar_req_valid,
output wire [`NB_WIDTH-1:0] gbar_req_id,
output wire [`NC_WIDTH-1:0] gbar_req_size_m1,
output wire [`NC_WIDTH-1:0] gbar_req_size_m1,
output wire [`NC_WIDTH-1:0] gbar_req_core_id,
input wire gbar_req_ready,
input wire gbar_rsp_valid,
input wire [`NB_WIDTH-1:0] gbar_rsp_id,
`endif
// simulation helper signals
output wire sim_ebreak,
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
// Status
output wire busy
);
`ifdef GBAR_ENABLE
VX_gbar_bus_if gbar_bus_if();
assign gbar_req_valid = gbar_bus_if.req_valid;
assign gbar_req_id = gbar_bus_if.req_id;
assign gbar_req_size_m1 = gbar_bus_if.req_size_m1;
assign gbar_req_size_m1 = gbar_bus_if.req_size_m1;
assign gbar_req_core_id = gbar_bus_if.req_core_id;
assign gbar_bus_if.req_ready = gbar_req_ready;
assign gbar_bus_if.rsp_valid = gbar_rsp_valid;
assign gbar_bus_if.rsp_id = gbar_rsp_id;
`endif
VX_dcr_bus_if dcr_bus_if();
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_write_valid;
assign dcr_bus_if.write_addr = dcr_write_addr;
@ -92,7 +88,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_if[DCACHE_NUM_REQS]();
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
@ -100,6 +96,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype;
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
@ -122,6 +119,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_req_data = icache_bus_if.req_data.data;
assign icache_req_tag = icache_bus_if.req_data.tag;
assign icache_bus_if.req_ready = icache_req_ready;
`UNUSED_VAR (icache_bus_if.req_data.atype)
assign icache_bus_if.rsp_valid = icache_rsp_valid;
assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
@ -129,33 +127,34 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.smem = '0;
assign mem_perf_if.lmem = '0;
assign mem_perf_if.mem = '0;
`endif
`ifdef SCOPE
wire [0:0] scope_reset_w = 1'b0;
wire [0:0] scope_bus_in_w = 1'b0;
wire [0:0] scope_reset_w = 1'b0;
wire [0:0] scope_bus_in_w = 1'b0;
wire [0:0] scope_bus_out_w;
`UNUSED_VAR (scope_bus_out_w)
`endif
VX_core #(
.INSTANCE_ID ($sformatf("core")),
.CORE_ID (CORE_ID)
) core (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
`endif
.dcr_bus_if (dcr_bus_if),
.dcache_bus_if (dcache_bus_if),
@ -166,8 +165,6 @@ module VX_core_top import VX_gpu_pkg::*; #(
.gbar_bus_if (gbar_bus_if),
`endif
.sim_ebreak (sim_ebreak),
.sim_wb_value (sim_wb_value),
.busy (busy)
);

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,12 +17,22 @@
`include "VX_fpu_define.vh"
`endif
`ifdef XLEN_64
`define CSR_READ_64(addr, dst, src) \
addr : dst = `XLEN'(src)
`else
`define CSR_READ_64(addr, dst, src) \
addr : dst = src[31:0]; \
addr+12'h80 : dst = 32'(src[$bits(src)-1:32])
`endif
module VX_csr_data
import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE
import VX_fpu_pkg::*;
`endif
#(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
input wire clk,
@ -38,7 +48,7 @@ import VX_fpu_pkg::*;
VX_commit_csr_if.slave commit_csr_if,
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS],
VX_fpu_csr_if.slave fpu_csr_if [`NUM_FPU_BLOCKS],
`endif
input wire [`PERF_CTR_BITS-1:0] cycles,
@ -49,14 +59,14 @@ import VX_fpu_pkg::*;
input wire [`UUID_WIDTH-1:0] read_uuid,
input wire [`NW_WIDTH-1:0] read_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] read_addr,
output wire [31:0] read_data_ro,
output wire [31:0] read_data_rw,
output wire [`XLEN-1:0] read_data_ro,
output wire [`XLEN-1:0] read_data_rw,
input wire write_enable,
input wire write_enable,
input wire [`UUID_WIDTH-1:0] write_uuid,
input wire [`NW_WIDTH-1:0] write_wid,
input wire [`VX_CSR_ADDR_BITS-1:0] write_addr,
input wire [31:0] write_data
input wire [`XLEN-1:0] write_data
);
`UNUSED_VAR (reset)
@ -65,16 +75,20 @@ import VX_fpu_pkg::*;
// CSRs Write /////////////////////////////////////////////////////////////
`ifdef EXT_F_ENABLE
reg [`XLEN-1:0] mscratch;
`ifdef EXT_F_ENABLE
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_write_enable[i] = fpu_to_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_to_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_to_csr_if[i].write_fflags;
assign fpu_write_enable[i] = fpu_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
end
always @(*) begin
fcsr_n = fcsr;
for (integer i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
@ -82,7 +96,7 @@ import VX_fpu_pkg::*;
fcsr_n[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0] = fcsr[fpu_write_wid[i]][`FP_FLAGS_BITS-1:0]
| fpu_write_fflags[i];
end
end
end
if (write_enable) begin
case (write_addr)
`VX_CSR_FFLAGS: fcsr_n[write_wid][`FP_FLAGS_BITS-1:0] = write_data[`FP_FLAGS_BITS-1:0];
@ -92,9 +106,9 @@ import VX_fpu_pkg::*;
endcase
end
end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
assign fpu_to_csr_if[i].read_frm = fcsr[fpu_to_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end
always @(posedge clk) begin
@ -107,6 +121,9 @@ import VX_fpu_pkg::*;
`endif
always @(posedge clk) begin
if (reset) begin
mscratch <= base_dcrs.startup_arg;
end
if (write_enable) begin
case (write_addr)
`ifdef EXT_F_ENABLE
@ -123,9 +140,14 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0: /* do nothing!*/;
`VX_CSR_PMPADDR0: begin
// do nothing!
end
`VX_CSR_MSCRATCH: begin
mscratch <= write_data;
end
default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
`ASSERT(0, ("%t: *** %s invalid CSR write address: %0h (#%0d)", $time, INSTANCE_ID, write_addr, write_uuid));
end
endcase
end
@ -133,38 +155,42 @@ import VX_fpu_pkg::*;
// CSRs read //////////////////////////////////////////////////////////////
reg [31:0] read_data_ro_r;
reg [31:0] read_data_rw_r;
reg [`XLEN-1:0] read_data_ro_r;
reg [`XLEN-1:0] read_data_rw_r;
reg read_addr_valid_r;
always @(*) begin
read_data_ro_r = '0;
read_data_rw_r = '0;
read_addr_valid_r = 1;
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = 32'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = 32'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = 32'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = (((`CLOG2(`XLEN)-4) << (`XLEN-2)) | `MISA_STD);
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_r = 32'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = 32'(fcsr[read_wid]);
`VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`endif
`VX_CSR_WARP_ID : read_data_ro_r = 32'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = 32'(CORE_ID);
`VX_CSR_THREAD_MASK: read_data_ro_r = 32'(thread_masks[read_wid]);
`VX_CSR_WARP_MASK : read_data_ro_r = 32'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = 32'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = 32'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = 32'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_MCYCLE : read_data_ro_r = 32'(cycles[31:0]);
`VX_CSR_MCYCLE_H : read_data_ro_r = 32'(cycles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MSCRATCH : read_data_rw_r = mscratch;
`VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
`VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`VX_CSR_MINSTRET : read_data_ro_r = 32'(commit_csr_if.instret[31:0]);
`VX_CSR_MINSTRET_H : read_data_ro_r = 32'(commit_csr_if.instret[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
`VX_CSR_MNSTATUS,
@ -174,7 +200,7 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_r = 32'(0);
`VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0);
default: begin
read_addr_valid_r = 0;
@ -186,107 +212,65 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`VX_CSR_MPM_SCHED_ID : read_data_ro_r = pipeline_perf_if.sched_idles[31:0];
`VX_CSR_MPM_SCHED_ID_H : read_data_ro_r = 32'(pipeline_perf_if.sched_idles[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCHED_ST : read_data_ro_r = pipeline_perf_if.sched_stalls[31:0];
`VX_CSR_MPM_SCHED_ST_H : read_data_ro_r = 32'(pipeline_perf_if.sched_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IBUF_ST : read_data_ro_r = pipeline_perf_if.ibf_stalls[31:0];
`VX_CSR_MPM_IBUF_ST_H : read_data_ro_r = 32'(pipeline_perf_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ST : read_data_ro_r = pipeline_perf_if.scb_stalls[31:0];
`VX_CSR_MPM_SCRB_ST_H : read_data_ro_r = 32'(pipeline_perf_if.scb_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_ALU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_ALU][31:0];
`VX_CSR_MPM_SCRB_ALU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_ALU][`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_FPU][31:0];
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_FPU][`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0));
`endif
`VX_CSR_MPM_SCRB_LSU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_LSU][31:0];
`VX_CSR_MPM_SCRB_LSU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_LSU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_SFU : read_data_ro_r = pipeline_perf_if.units_uses[`EX_SFU][31:0];
`VX_CSR_MPM_SCRB_SFU_H : read_data_ro_r = 32'(pipeline_perf_if.units_uses[`EX_SFU][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_CSRS : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_CSRS][31:0];
`VX_CSR_MPM_SCRB_CSRS_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_CSRS][`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SCRB_WCTL : read_data_ro_r = pipeline_perf_if.sfu_uses[`SFU_WCTL][31:0];
`VX_CSR_MPM_SCRB_WCTL_H : read_data_ro_r = 32'(pipeline_perf_if.sfu_uses[`SFU_WCTL][`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`VX_CSR_MPM_IFETCHES : read_data_ro_r = pipeline_perf_if.ifetches[31:0];
`VX_CSR_MPM_IFETCHES_H : read_data_ro_r = 32'(pipeline_perf_if.ifetches[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOADS : read_data_ro_r = pipeline_perf_if.loads[31:0];
`VX_CSR_MPM_LOADS_H : read_data_ro_r = 32'(pipeline_perf_if.loads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_STORES : read_data_ro_r = pipeline_perf_if.stores[31:0];
`VX_CSR_MPM_STORES_H : read_data_ro_r = 32'(pipeline_perf_if.stores[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_IFETCH_LT : read_data_ro_r = pipeline_perf_if.ifetch_latency[31:0];
`VX_CSR_MPM_IFETCH_LT_H : read_data_ro_r = 32'(pipeline_perf_if.ifetch_latency[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_LOAD_LT : read_data_ro_r = pipeline_perf_if.load_latency[31:0];
`VX_CSR_MPM_LOAD_LT_H : read_data_ro_r = 32'(pipeline_perf_if.load_latency[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`VX_CSR_MPM_ICACHE_READS : read_data_ro_r = mem_perf_if.icache.reads[31:0];
`VX_CSR_MPM_ICACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.icache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MISS_R : read_data_ro_r = mem_perf_if.icache.read_misses[31:0];
`VX_CSR_MPM_ICACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.icache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_ICACHE_MSHR_ST : read_data_ro_r = mem_perf_if.icache.mshr_stalls[31:0];
`VX_CSR_MPM_ICACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.icache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls);
// PERF: dcache
`VX_CSR_MPM_DCACHE_READS : read_data_ro_r = mem_perf_if.dcache.reads[31:0];
`VX_CSR_MPM_DCACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.dcache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_WRITES : read_data_ro_r = mem_perf_if.dcache.writes[31:0];
`VX_CSR_MPM_DCACHE_WRITES_H : read_data_ro_r = 32'(mem_perf_if.dcache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_R : read_data_ro_r = mem_perf_if.dcache.read_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_R_H : read_data_ro_r = 32'(mem_perf_if.dcache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MISS_W : read_data_ro_r = mem_perf_if.dcache.write_misses[31:0];
`VX_CSR_MPM_DCACHE_MISS_W_H : read_data_ro_r = 32'(mem_perf_if.dcache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_BANK_ST : read_data_ro_r = mem_perf_if.dcache.bank_stalls[31:0];
`VX_CSR_MPM_DCACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_DCACHE_MSHR_ST : read_data_ro_r = mem_perf_if.dcache.mshr_stalls[31:0];
`VX_CSR_MPM_DCACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.dcache.mshr_stalls[`PERF_CTR_BITS-1:32]);
// PERF: smem
`VX_CSR_MPM_SMEM_READS : read_data_ro_r = mem_perf_if.smem.reads[31:0];
`VX_CSR_MPM_SMEM_READS_H : read_data_ro_r = 32'(mem_perf_if.smem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_WRITES : read_data_ro_r = mem_perf_if.smem.writes[31:0];
`VX_CSR_MPM_SMEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.smem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_SMEM_BANK_ST : read_data_ro_r = mem_perf_if.smem.bank_stalls[31:0];
`VX_CSR_MPM_SMEM_BANK_ST_H : read_data_ro_r = 32'(mem_perf_if.smem.bank_stalls[`PERF_CTR_BITS-1:32]);
// PERF: l2cache
`VX_CSR_MPM_L2CACHE_READS : read_data_ro_r = mem_perf_if.l2cache.reads[31:0];
`VX_CSR_MPM_L2CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l2cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_WRITES : read_data_ro_r = mem_perf_if.l2cache.writes[31:0];
`VX_CSR_MPM_L2CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l2cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_R : read_data_ro_r = mem_perf_if.l2cache.read_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l2cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MISS_W : read_data_ro_r = mem_perf_if.l2cache.write_misses[31:0];
`VX_CSR_MPM_L2CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l2cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l2cache.bank_stalls[31:0];
`VX_CSR_MPM_L2CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L2CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l2cache.mshr_stalls[31:0];
`VX_CSR_MPM_L2CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l2cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
// PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
// PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
// PERF: l3cache
`VX_CSR_MPM_L3CACHE_READS : read_data_ro_r = mem_perf_if.l3cache.reads[31:0];
`VX_CSR_MPM_L3CACHE_READS_H : read_data_ro_r = 32'(mem_perf_if.l3cache.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_WRITES : read_data_ro_r = mem_perf_if.l3cache.writes[31:0];
`VX_CSR_MPM_L3CACHE_WRITES_H: read_data_ro_r = 32'(mem_perf_if.l3cache.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_R : read_data_ro_r = mem_perf_if.l3cache.read_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_R_H: read_data_ro_r = 32'(mem_perf_if.l3cache.read_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MISS_W : read_data_ro_r = mem_perf_if.l3cache.write_misses[31:0];
`VX_CSR_MPM_L3CACHE_MISS_W_H: read_data_ro_r = 32'(mem_perf_if.l3cache.write_misses[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_BANK_ST : read_data_ro_r = mem_perf_if.l3cache.bank_stalls[31:0];
`VX_CSR_MPM_L3CACHE_BANK_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.bank_stalls[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_L3CACHE_MSHR_ST : read_data_ro_r = mem_perf_if.l3cache.mshr_stalls[31:0];
`VX_CSR_MPM_L3CACHE_MSHR_ST_H: read_data_ro_r = 32'(mem_perf_if.l3cache.mshr_stalls[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls);
// PERF: memory
`VX_CSR_MPM_MEM_READS : read_data_ro_r = mem_perf_if.mem.reads[31:0];
`VX_CSR_MPM_MEM_READS_H : read_data_ro_r = 32'(mem_perf_if.mem.reads[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_WRITES : read_data_ro_r = mem_perf_if.mem.writes[31:0];
`VX_CSR_MPM_MEM_WRITES_H : read_data_ro_r = 32'(mem_perf_if.mem.writes[`PERF_CTR_BITS-1:32]);
`VX_CSR_MPM_MEM_LT : read_data_ro_r = mem_perf_if.mem.latency[31:0];
`VX_CSR_MPM_MEM_LT_H : read_data_ro_r = 32'(mem_perf_if.mem.latency[`PERF_CTR_BITS-1:32]);
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency);
default:;
endcase
end
@ -307,7 +291,7 @@ import VX_fpu_pkg::*;
`ifdef PERF_ENABLE
`UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.smem);
`UNUSED_VAR (mem_perf_if.lmem);
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,6 +14,7 @@
`include "VX_define.vh"
module VX_csr_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0,
parameter NUM_LANES = 1
) (
@ -26,9 +27,9 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS],
VX_fpu_csr_if.slave fpu_csr_if [`NUM_FPU_BLOCKS],
`endif
VX_commit_csr_if.slave commit_csr_if,
@ -36,41 +37,44 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
VX_execute_if.slave execute_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * 32 + PID_WIDTH + 1 + 1;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
`UNUSED_VAR (execute_if.data.rs3_data)
reg [NUM_LANES-1:0][31:0] csr_read_data;
reg [31:0] csr_write_data;
wire [31:0] csr_read_data_ro, csr_read_data_rw;
wire [31:0] csr_req_data;
reg csr_rd_enable;
wire csr_wr_enable;
wire csr_req_ready;
// wait for all pending instructions to complete
reg [NUM_LANES-1:0][`XLEN-1:0] csr_read_data;
reg [`XLEN-1:0] csr_write_data;
wire [`XLEN-1:0] csr_read_data_ro, csr_read_data_rw;
wire [`XLEN-1:0] csr_req_data;
reg csr_rd_enable;
wire csr_wr_enable;
wire csr_req_ready;
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.op_args.csr.addr;
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.op_args.csr.imm;
wire is_fpu_csr = (csr_addr <= `VX_CSR_FCSR);
// wait for all pending instructions for current warp to complete
assign sched_csr_if.alm_empty_wid = execute_if.data.wid;
wire no_pending_instr = sched_csr_if.alm_empty;
wire no_pending_instr = sched_csr_if.alm_empty || ~is_fpu_csr;
wire csr_req_valid = execute_if.valid && no_pending_instr;
assign execute_if.ready = csr_req_ready && no_pending_instr;
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.imm[`VX_CSR_ADDR_BITS-1:0];
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.imm[`VX_CSR_ADDR_BITS +: `NRI_BITS];
wire [NUM_LANES-1:0][31:0] rs1_data;
wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
`UNUSED_VAR (rs1_data)
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign rs1_data[i] = execute_if.data.rs1_data[i][31:0];
assign rs1_data[i] = execute_if.data.rs1_data[i];
end
wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW);
VX_csr_data #(
.CORE_ID (CORE_ID)
.INSTANCE_ID (INSTANCE_ID),
.CORE_ID (CORE_ID)
) csr_data (
.clk (clk),
.reset (reset),
@ -86,14 +90,14 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.cycles (sched_csr_if.cycles),
.active_warps (sched_csr_if.active_warps),
.thread_masks (sched_csr_if.thread_masks),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`endif
.fpu_csr_if (fpu_csr_if),
`endif
.read_enable (csr_req_valid && csr_rd_enable),
.read_uuid (execute_if.data.uuid),
.read_wid (execute_if.data.wid),
.read_wid (execute_if.data.wid),
.read_addr (csr_addr),
.read_data_ro (csr_read_data_ro),
.read_data_rw (csr_read_data_rw),
@ -107,16 +111,16 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
// CSR read
wire [NUM_LANES-1:0][31:0] wtid, gtid;
wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid;
for (genvar i = 0; i < NUM_LANES; ++i) begin
if (PID_BITS != 0) begin
assign wtid[i] = 32'(execute_if.data.pid * NUM_LANES + i);
assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i);
end else begin
assign wtid[i] = 32'(i);
assign wtid[i] = `XLEN'(i);
end
assign gtid[i] = (32'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (32'(execute_if.data.wid) << `NT_BITS) + wtid[i];
end
assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
end
always @(*) begin
csr_rd_enable = 0;
@ -132,8 +136,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
// CSR write
assign csr_req_data = execute_if.data.use_imm ? 32'(csr_imm) : rs1_data[0];
assign csr_req_data = execute_if.data.op_args.csr.use_imm ? `XLEN'(csr_imm) : rs1_data[0];
assign csr_wr_enable = (csr_write_enable || (| csr_req_data));
always @(*) begin
@ -152,12 +155,9 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
end
// unlock the warp
assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop;
assign sched_csr_if.unlock_warp = csr_req_valid && csr_req_ready && execute_if.data.eop && is_fpu_csr;
assign sched_csr_if.unlock_wid = execute_if.data.wid;
// send response
wire [NUM_LANES-1:0][31:0] csr_commit_data;
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2)
@ -166,14 +166,10 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (csr_req_valid),
.ready_in (csr_req_ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, csr_commit_data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, csr_read_data, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.data, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = `XLEN'(csr_commit_data[i]);
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -12,9 +12,8 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_dcr_data import VX_gpu_pkg::*; (
module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
input wire clk,
input wire reset,
@ -35,6 +34,10 @@ module VX_dcr_data import VX_gpu_pkg::*; (
`VX_DCR_BASE_STARTUP_ADDR0 : dcrs.startup_addr[31:0] <= dcr_bus_if.write_data;
`ifdef XLEN_64
`VX_DCR_BASE_STARTUP_ADDR1 : dcrs.startup_addr[63:32] <= dcr_bus_if.write_data;
`endif
`VX_DCR_BASE_STARTUP_ARG0 : dcrs.startup_arg[31:0] <= dcr_bus_if.write_data;
`ifdef XLEN_64
`VX_DCR_BASE_STARTUP_ARG1 : dcrs.startup_arg[63:32] <= dcr_bus_if.write_data;
`endif
`VX_DCR_BASE_MPM_CLASS : dcrs.mpm_class <= dcr_bus_if.write_data[7:0];
default:;
@ -44,12 +47,12 @@ module VX_dcr_data import VX_gpu_pkg::*; (
assign base_dcrs = dcrs;
`ifdef DBG_TRACE_CORE_PIPELINE
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (dcr_bus_if.write_valid) begin
`TRACE(1, ("%d: base-dcr: state=", $time));
trace_base_dcr(1, dcr_bus_if.write_addr);
`TRACE(1, (", data=0x%0h\n", dcr_bus_if.write_data));
`TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data));
end
end
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -12,7 +12,6 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
`ifdef EXT_F_ENABLE
`define USED_IREG(x) \
@ -28,8 +27,8 @@
use_``x = 1
`endif
module VX_decode #(
parameter CORE_ID = 0
module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -37,27 +36,26 @@ module VX_decode #(
// inputs
VX_fetch_if.slave fetch_if,
// outputs
// outputs
VX_decode_if.master decode_if,
VX_decode_sched_if.master decode_sched_if
);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
reg [`INST_MOD_BITS-1:0] op_mod;
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
op_args_t op_args;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`XLEN-1:0] imm;
reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm;
reg use_rd, use_rs1, use_rs2, use_rs3;
reg is_wstall;
wire [31:0] instr = fetch_if.data.instr;
wire [6:0] opcode = instr[6:0];
wire [6:0] opcode = instr[6:0];
wire [1:0] func2 = instr[26:25];
wire [2:0] func3 = instr[14:12];
wire [4:0] func5 = instr[31:27];
@ -78,6 +76,7 @@ module VX_decode #(
`UNUSED_VAR (use_rs3)
wire is_itype_sh = func3[0] && ~func3[1];
wire is_fpu_csr = (u_12 <= `VX_CSR_FCSR);
wire [19:0] ui_imm = instr[31:12];
`ifdef XLEN_64
@ -85,7 +84,7 @@ module VX_decode #(
wire [11:0] iw_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`else
wire [11:0] i_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`endif
`endif
wire [11:0] s_imm = {func7, rd};
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
@ -121,9 +120,9 @@ module VX_decode #(
always @(*) begin
case (u_12)
12'h000: s_type = `INST_OP_BITS'(`INST_BR_ECALL);
12'h001: s_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h002: s_type = `INST_OP_BITS'(`INST_BR_URET);
12'h102: s_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h001: s_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h002: s_type = `INST_OP_BITS'(`INST_BR_URET);
12'h102: s_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h302: s_type = `INST_OP_BITS'(`INST_BR_MRET);
default: s_type = 'x;
endcase
@ -145,171 +144,212 @@ module VX_decode #(
end
`endif
`STATIC_ASSERT($bits(alu_args_t) == $bits(op_args_t), ("alu_args_t size mismatch: current=%0d, expected=%0d", $bits(alu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(fpu_args_t) == $bits(op_args_t), ("fpu_args_t size mismatch: current=%0d, expected=%0d", $bits(fpu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(lsu_args_t) == $bits(op_args_t), ("lsu_args_t size mismatch: current=%0d, expected=%0d", $bits(lsu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(csr_args_t) == $bits(op_args_t), ("csr_args_t size mismatch: current=%0d, expected=%0d", $bits(csr_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(wctl_args_t) == $bits(op_args_t), ("wctl_args_t size mismatch: current=%0d, expected=%0d", $bits(wctl_args_t), $bits(op_args_t)));
always @(*) begin
ex_type = '0;
op_type = 'x;
op_mod = '0;
op_args = 'x;
rd_r = '0;
rs1_r = '0;
rs2_r = '0;
rs3_r = '0;
imm = 'x;
use_imm = 0;
use_PC = 0;
use_rd = 0;
use_rs1 = 0;
use_rs2 = 0;
use_rs3 = 0;
is_wstall = 0;
case (opcode)
case (opcode)
`INST_I: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type);
use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){i_imm[11]}}, i_imm};
op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, i_imm);
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_R: begin
`INST_R: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
if (func7[0]) begin
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
`endif
begin
op_type = `INST_OP_BITS'(r_type);
end
op_args.alu.is_w = 0;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 0;
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
case (func7)
`ifdef EXT_M_ENABLE
`INST_R_F7_MUL: begin
// MUL, MULH, MULHSU, MULHU
op_type = `INST_OP_BITS'(m_type);
op_args.alu.xtype = `ALU_TYPE_MULDIV;
end
`endif
`ifdef EXT_ZICOND_ENABLE
`INST_R_F7_ZICOND: begin
// CZERO-EQZ, CZERO-NEZ
op_type = func3[1] ? `INST_OP_BITS'(`INST_ALU_CZNE) : `INST_OP_BITS'(`INST_ALU_CZEQ);
op_args.alu.xtype = `ALU_TYPE_ARITH;
end
`endif
default: begin
op_type = `INST_OP_BITS'(r_type);
op_args.alu.xtype = `ALU_TYPE_ARITH;
end
endcase
end
`ifdef XLEN_64
`INST_I_W: begin
// ADDIW, SLLIW, SRLIW, SRAIW
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type);
op_mod[2] = 1;
op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 1;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, iw_imm);
use_rd = 1;
use_imm = 1;
imm = {{(`XLEN-12){iw_imm[11]}}, iw_imm};
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_R_W: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
if (func7[0]) begin
// MULW, DIVW, DIVUW, REMW, REMUW
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
`endif
begin
// ADDW, SUBW, SLLW, SRLW, SRAW
op_type = `INST_OP_BITS'(r_type);
end
op_mod[2] = 1;
op_args.alu.is_w = 1;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 0;
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
case (func7)
`ifdef EXT_M_ENABLE
`INST_R_F7_MUL: begin
// MULW, DIVW, DIVUW, REMW, REMUW
op_type = `INST_OP_BITS'(m_type);
op_args.alu.xtype = `ALU_TYPE_MULDIV;
end
`endif
default: begin
// ADDW, SUBW, SLLW, SRLW, SRAW
op_type = `INST_OP_BITS'(r_type);
op_args.alu.xtype = `ALU_TYPE_ARITH;
end
endcase
end
`endif
`INST_LUI: begin
`INST_LUI: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
use_rd = 1;
use_imm = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd);
end
`INST_AUIPC: begin
`INST_AUIPC: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
use_rd = 1;
use_imm = 1;
use_PC = 1;
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
op_args.alu.xtype = `ALU_TYPE_ARITH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 1;
op_args.alu.use_imm = 1;
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
use_rd = 1;
`USED_IREG (rd);
end
`INST_JAL: begin
`INST_JAL: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JAL);
op_mod[0] = 1;
op_args.alu.xtype = `ALU_TYPE_BRANCH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 1;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, jal_imm);
use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = {{(`XLEN-21){jal_imm[20]}}, jal_imm};
`USED_IREG (rd);
end
`INST_JALR: begin
`INST_JALR: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JALR);
op_mod[0] = 1;
op_args.alu.xtype = `ALU_TYPE_BRANCH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 0;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, u_12);
use_rd = 1;
use_imm = 1;
is_wstall = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_B: begin
`INST_B: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(b_type);
op_mod[0] = 1;
use_imm = 1;
use_PC = 1;
op_args.alu.xtype = `ALU_TYPE_BRANCH;
op_args.alu.is_w = 0;
op_args.alu.use_PC = 1;
op_args.alu.use_imm = 1;
op_args.alu.imm = `SEXT(`IMM_BITS, b_imm);
is_wstall = 1;
imm = {{(`XLEN-13){b_imm[12]}}, b_imm};
`USED_IREG (rs1);
`USED_IREG (rs2);
end
`INST_FENCE: begin
ex_type = `EX_LSU;
op_type = `INST_LSU_FENCE;
op_args.lsu.is_store = 0;
op_args.lsu.is_float = 0;
op_args.lsu.offset = 0;
end
`INST_SYS : begin
if (func3[1:0] != 0) begin
`INST_SYS : begin
if (func3[1:0] != 0) begin
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0]));
op_args.csr.addr = u_12;
op_args.csr.use_imm = func3[2];
use_rd = 1;
is_wstall = 1;
use_imm = func3[2];
imm[`VX_CSR_ADDR_BITS-1:0] = u_12; // addr
is_wstall = is_fpu_csr; // only stall for FPU CSRs
`USED_IREG (rd);
if (func3[2]) begin
imm[`VX_CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm
op_args.csr.imm = rs1;
end else begin
`USED_IREG (rs1);
end
end
end else begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(s_type);
op_mod[0] = 1;
op_args.alu.xtype = `ALU_TYPE_BRANCH;
op_args.alu.is_w = 0;
op_args.alu.use_imm = 1;
op_args.alu.use_PC = 1;
op_args.alu.imm = `IMM_BITS'd4;
use_rd = 1;
use_imm = 1;
use_PC = 1;
is_wstall = 1;
imm = `XLEN'd4;
`USED_IREG (rd);
end
end
`ifdef EXT_F_ENABLE
`INST_FL,
`INST_FL,
`endif
`INST_L: begin
`INST_L: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b0, func3});
op_args.lsu.is_store = 0;
op_args.lsu.is_float = opcode[2];
op_args.lsu.offset = u_12;
use_rd = 1;
imm = {{(`XLEN-12){u_12[11]}}, u_12};
use_imm = 1;
`ifdef EXT_F_ENABLE
if (opcode[2]) begin
`USED_FREG (rd);
@ -319,13 +359,14 @@ module VX_decode #(
`USED_IREG (rs1);
end
`ifdef EXT_F_ENABLE
`INST_FS,
`INST_FS,
`endif
`INST_S: begin
`INST_S: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b1, func3});
imm = {{(`XLEN-12){s_imm[11]}}, s_imm};
use_imm = 1;
op_args.lsu.is_store = 1;
op_args.lsu.is_float = opcode[2];
op_args.lsu.offset = s_imm;
`USED_IREG (rs1);
`ifdef EXT_F_ENABLE
if (opcode[2]) begin
@ -338,24 +379,23 @@ module VX_decode #(
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
`INST_FNMADD: begin
ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_mod = `INST_MOD_BITS'(func3);
imm[0] = func2[0]; // destination is double?
op_args.fpu.frm = func3;
op_args.fpu.fmt[0] = func2[0]; // float / double
use_rd = 1;
`USED_FREG (rd);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
`USED_FREG (rs3);
end
`INST_FCI: begin
`INST_FCI: begin
ex_type = `EX_FPU;
op_mod = `INST_MOD_BITS'(func3);
`ifdef FLEN_64
imm[0] = func2[0]; // destination is double?
`endif
use_rd = 1;
op_args.fpu.frm = func3;
op_args.fpu.fmt[0] = func2[0]; // float / double
op_args.fpu.fmt[1] = rs2[1]; // int32 / int64
use_rd = 1;
case (func5)
5'b00000, // FADD
5'b00001, // FSUB
@ -369,7 +409,7 @@ module VX_decode #(
5'b00100: begin
// NCP: FSGNJ=0, FSGNJN=1, FSGNJX=2
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = `INST_MOD_BITS'(func3[1:0]);
op_args.fpu.frm = `INST_FRM_BITS'(func3[1:0]);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
@ -377,67 +417,61 @@ module VX_decode #(
5'b00101: begin
// NCP: FMIN=6, FMAX=7
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = func3[0] ? 7 : 6;
op_args.fpu.frm = `INST_FRM_BITS'(func3[0] ? 7 : 6);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
end
`ifdef FLEN_64
5'b01000: begin
// CVT.S.D, CVT.D.S
5'b01000: begin
// FCVT.S.D, FCVT.D.S
op_type = `INST_OP_BITS'(`INST_FPU_F2F);
`USED_FREG (rd);
`USED_FREG (rs1);
end
`endif
5'b01011: begin
// SQRT
5'b01011: begin
// FSQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
`USED_FREG (rd);
`USED_FREG (rs1);
end
end
5'b10100: begin
// CMP
// FCMP
op_type = `INST_OP_BITS'(`INST_FPU_CMP);
`USED_IREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
end
5'b11000: begin
// CVT.W.X, CVT.WU.X
// FCVT.W.X, FCVT.WU.X
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_F2U) : `INST_OP_BITS'(`INST_FPU_F2I);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_IREG (rd);
`USED_FREG (rs1);
end
5'b11010: begin
// CVT.X.W, CVT.X.WU
// FCVT.X.W, FCVT.X.WU
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_U2F) : `INST_OP_BITS'(`INST_FPU_I2F);
`ifdef XLEN_64
imm[1] = rs2[1]; // is 64-bit integer
`endif
`USED_FREG (rd);
`USED_IREG (rs1);
end
5'b11100: begin
5'b11100: begin
if (func3[0]) begin
// NCP: FCLASS=3
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 3;
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_args.fpu.frm = `INST_FRM_BITS'(3);
end else begin
// NCP: FMV.X.W=4
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 4;
op_args.fpu.frm = `INST_FRM_BITS'(4);
end
`USED_IREG (rd);
`USED_FREG (rs1);
end
5'b11110: begin
`USED_FREG (rs1);
end
5'b11110: begin
// NCP: FMV.W.X=5
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 5;
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_args.fpu.frm = `INST_FRM_BITS'(5);
`USED_FREG (rd);
`USED_IREG (rs1);
end
@ -445,7 +479,7 @@ module VX_decode #(
endcase
end
`endif
`INST_EXT1: begin
`INST_EXT1: begin
case (func7)
7'h00: begin
ex_type = `EX_SFU;
@ -463,8 +497,9 @@ module VX_decode #(
3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_SFU_SPLIT);
use_rd = 1;
`USED_IREG (rs1);
`USED_IREG (rd);
op_args.wctl.is_neg = rs2[0];
`USED_IREG (rs1);
`USED_IREG (rd);
end
3'h3: begin // JOIN
op_type = `INST_OP_BITS'(`INST_SFU_JOIN);
@ -477,6 +512,7 @@ module VX_decode #(
end
3'h5: begin // PRED
op_type = `INST_OP_BITS'(`INST_SFU_PRED);
op_args.wctl.is_neg = rd[0];
`USED_IREG (rs1);
`USED_IREG (rs2);
end
@ -486,25 +522,6 @@ module VX_decode #(
default:;
endcase
end
`INST_EXT2: begin
case (func3)
3'h1: begin
case (func2)
2'h0: begin // CMOV
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CMOV);
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
`USED_IREG (rs2);
`USED_IREG (rs3);
end
default:;
endcase
end
default:;
endcase
end
default:;
endcase
end
@ -520,8 +537,8 @@ module VX_decode #(
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_mod, use_PC, imm, use_imm, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.use_PC, decode_if.data.imm, decode_if.data.use_imm, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
);
@ -533,18 +550,21 @@ module VX_decode #(
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif
`ifdef DBG_TRACE_CORE_PIPELINE
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%d: core%0d-decode: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, CORE_ID, decode_if.data.wid, decode_if.data.PC, instr));
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_mod, decode_if.data.rd, decode_if.data.rs2, decode_if.data.use_imm, decode_if.data.imm);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=0x%0h, opds=%b%b%b%b, use_pc=%b, use_imm=%b (#%0d)\n",
decode_if.data.op_mod, decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, decode_if.data.imm, use_rd, use_rs1, use_rs2, use_rs3, decode_if.data.use_PC, decode_if.data.use_imm, decode_if.data.uuid));
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b",
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3));
trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid));
end
end
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -23,213 +23,85 @@ module VX_dispatch import VX_gpu_pkg::*; #(
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
`endif
// inputs
VX_operands_if.slave operands_if [`ISSUE_WIDTH],
VX_operands_if.slave operands_if,
// outputs
VX_dispatch_if.master alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
wire [`ISSUE_WIDTH-1:0][`NT_WIDTH-1:0] last_active_tid;
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign tids[i] = `NT_WIDTH'(i);
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_find_first #(
.N (`NUM_THREADS),
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if[i].data.tmask),
.data_in (tids),
.data_out (last_active_tid[i]),
`UNUSED_PIN (valid_out)
);
end
// ALU dispatch
wire [`NT_WIDTH-1:0] last_active_tid;
VX_operands_if alu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign alu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_ALU);
assign alu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (alu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) alu_buffer (
.clk (clk),
.reset (alu_reset),
.valid_in (alu_operands_if[i].valid),
.ready_in (alu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(alu_operands_if[i].data, last_active_tid[i])),
.data_out (alu_dispatch_if[i].data),
.valid_out (alu_dispatch_if[i].valid),
.ready_out (alu_dispatch_if[i].ready)
);
end
// LSU dispatch
VX_operands_if lsu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign lsu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_LSU);
assign lsu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (lsu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) lsu_buffer (
.clk (clk),
.reset (lsu_reset),
.valid_in (lsu_operands_if[i].valid),
.ready_in (lsu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(lsu_operands_if[i].data, last_active_tid[i])),
.data_out (lsu_dispatch_if[i].data),
.valid_out (lsu_dispatch_if[i].valid),
.ready_out (lsu_dispatch_if[i].ready)
);
end
// FPU dispatch
`ifdef EXT_F_ENABLE
VX_operands_if fpu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign fpu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_FPU);
assign fpu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (fpu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) fpu_buffer (
.clk (clk),
.reset (fpu_reset),
.valid_in (fpu_operands_if[i].valid),
.ready_in (fpu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(fpu_operands_if[i].data, last_active_tid[i])),
.data_out (fpu_dispatch_if[i].data),
.valid_out (fpu_dispatch_if[i].valid),
.ready_out (fpu_dispatch_if[i].ready)
);
end
`endif
// SFU dispatch
VX_operands_if sfu_operands_if[`ISSUE_WIDTH]();
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign sfu_operands_if[i].valid = operands_if[i].valid && (operands_if[i].data.ex_type == `EX_SFU);
assign sfu_operands_if[i].data = operands_if[i].data;
`RESET_RELAY (sfu_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
) sfu_buffer (
.clk (clk),
.reset (sfu_reset),
.valid_in (sfu_operands_if[i].valid),
.ready_in (sfu_operands_if[i].ready),
.data_in (`TO_DISPATCH_DATA(sfu_operands_if[i].data, last_active_tid[i])),
.data_out (sfu_dispatch_if[i].data),
.valid_out (sfu_dispatch_if[i].valid),
.ready_out (sfu_dispatch_if[i].ready)
);
end
// can take next request?
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign operands_if[i].ready = (alu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_ALU))
|| (lsu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_LSU))
`ifdef EXT_F_ENABLE
|| (fpu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_FPU))
`endif
|| (sfu_operands_if[i].ready && (operands_if[i].data.ex_type == `EX_SFU));
end
`ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(*) begin
perf_issue_unit_stalls_per_cycle[i] = '0;
if (operands_if[i].valid && ~operands_if[i].ready) begin
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
end
end
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) reduce (
.data_in (perf_issue_unit_stalls_per_cycle),
.data_out (perf_unit_stalls_per_cycle)
VX_find_first #(
.N (`NUM_THREADS),
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if.data.tmask),
.data_in (tids),
.data_out (last_active_tid),
`UNUSED_PIN (valid_out)
);
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
wire [`NUM_EX_UNITS-1:0] operands_reset;
assign operands_if.ready = operands_reset[operands_if.data.ex_type];
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`RESET_RELAY (buffer_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2), // 2-cycle EB for area reduction
.LUTRAM (1)
) buffer (
.clk (clk),
.reset (buffer_reset),
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
.ready_in (operands_reset[i]),
.data_in ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.wb,
operands_if.data.rd,
last_active_tid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.data_out (dispatch_if[i].data),
.valid_out (dispatch_if[i].valid),
.ready_out (dispatch_if[i].ready)
);
end
`ifdef PERF_ENABLE
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;
end else begin
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(operands_if_stall && operands_if.data.ex_type == `EX_BITS'(i));
end
end
end
for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
assign perf_stalls[i] = perf_stalls_r[i];
end
`endif
`ifdef DBG_TRACE_CORE_PIPELINE
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), operands_if[i].data.PC));
trace_ex_type(1, operands_if[i].data.ex_type);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.op_mod, operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
`TRACE_ARRAY1D(1, operands_if[i].data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, operands_if[i].data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, operands_if[i].data.rs3_data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
end
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,23 +13,24 @@
`include "VX_define.vh"
module VX_dispatch_unit import VX_gpu_pkg::*; #(
module VX_dispatch_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1,
parameter OUT_REG = 0,
parameter OUT_BUF = 0,
parameter MAX_FANOUT = `MAX_FANOUT
) (
) (
input wire clk,
input wire reset,
// inputs
// inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// outputs
VX_execute_if.master execute_if [BLOCK_SIZE]
);
`STATIC_ASSERT ((`NUM_THREADS == NUM_LANES * (`NUM_THREADS / NUM_LANES)), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
localparam PID_BITS = `CLOG2(NUM_PACKETS);
@ -37,9 +38,9 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT/2));
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2));
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
localparam DATA_REGS_OFF = 0;
@ -53,7 +54,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
@ -64,7 +65,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [BLOCK_SIZE-1:0] block_done;
wire batch_done = (& block_done);
logic [BATCH_COUNT_W-1:0] batch_idx;
if (BATCH_COUNT != 1) begin
always @(posedge clk) begin
@ -78,12 +79,14 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin
@ -99,7 +102,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire fire_eop = fire_p && is_last_p;
always @(posedge clk) begin
if (reset) begin
if (block_reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
@ -114,7 +117,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
end
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
@ -134,7 +137,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [NUM_PACKETS-1:0] packet_valids;
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
assign packet_valids[i] = (| per_packet_tmask[i]);
assign packet_ids[i] = PID_WIDTH'(i);
end
@ -143,7 +146,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (0)
) find_first (
) find_first (
.valid_in (packet_valids & ~sent_mask_p),
.data_in (packet_ids),
.data_out (start_p_n),
@ -154,12 +157,12 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
.N (NUM_PACKETS),
.DATAW (PID_WIDTH),
.REVERSE (1)
) find_last (
) find_last (
.valid_in (packet_valids),
.data_in (packet_ids),
.data_out (end_p),
`UNUSED_PIN (valid_out)
);
);
VX_pipe_register #(
.DATAW (1 + PID_WIDTH),
@ -171,14 +174,14 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
.enable (1'b1),
.data_in ({dispatch_valid[issue_idx], start_p_n}),
.data_out ({dispatch_valid_r, start_p})
);
);
wire [NUM_LANES-1:0] tmask_p = per_packet_tmask[start_p];
wire [2:0][NUM_LANES-1:0][`XLEN-1:0] regs_p = per_packet_regs[start_p];
wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p));
assign valid_p = dispatch_valid_r && block_enable;
assign valid_p = dispatch_valid_r && block_enable;
assign block_tmask[block_idx] = tmask_p;
assign block_regs[block_idx] = regs_p;
assign block_pid[block_idx] = start_p;
@ -214,27 +217,25 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign isw = block_idx;
end
`RESET_RELAY(buf_out_reset, reset);
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)),
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out (
.clk (clk),
.reset (buf_out_reset),
.reset (block_reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
.data_in ({
dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
block_wid,
block_tmask[block_idx],
dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
block_regs[block_idx][0],
block_regs[block_idx][1],
block_regs[block_idx][2],
block_regs[block_idx][2],
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
@ -251,6 +252,6 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
end
end
assign dispatch_ready = ready_in;
assign dispatch_ready = ready_in;
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,124 +14,103 @@
`include "VX_define.vh"
module VX_execute import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
input base_dcrs_t base_dcrs,
// Dcache interface
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
// commit interface
VX_commit_csr_if.slave commit_csr_if,
// fetch interface
VX_sched_csr_if.slave sched_csr_if,
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
`endif
`ifdef EXT_F_ENABLE
VX_dispatch_if.slave fpu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master fpu_commit_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.slave alu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master alu_commit_if [`ISSUE_WIDTH],
input base_dcrs_t base_dcrs,
// Dcache interface
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS],
// dispatch interface
VX_dispatch_if.slave dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
// commit interface
VX_commit_if.master commit_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
// scheduler interfaces
VX_sched_csr_if.slave sched_csr_if,
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS],
VX_dispatch_if.slave lsu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master lsu_commit_if [`ISSUE_WIDTH],
VX_dispatch_if.slave sfu_dispatch_if [`ISSUE_WIDTH],
VX_commit_if.master sfu_commit_if [`ISSUE_WIDTH],
VX_warp_ctl_if.master warp_ctl_if,
// simulation helper signals
output wire sim_ebreak
// commit interface
VX_commit_csr_if.slave commit_csr_if
);
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if[`NUM_FPU_BLOCKS]();
VX_fpu_csr_if fpu_csr_if[`NUM_FPU_BLOCKS]();
`endif
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);
VX_alu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID))
) alu_unit (
.clk (clk),
.reset (alu_reset),
.dispatch_if (alu_dispatch_if),
.branch_ctl_if (branch_ctl_if),
.commit_if (alu_commit_if)
.dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.branch_ctl_if (branch_ctl_if)
);
`SCOPE_IO_SWITCH (1)
VX_lsu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID))
) lsu_unit (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (lsu_reset),
.cache_bus_if (dcache_bus_if),
.dispatch_if (lsu_dispatch_if),
.commit_if (lsu_commit_if)
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.lsu_mem_if (lsu_mem_if)
);
`ifdef EXT_F_ENABLE
`RESET_RELAY (fpu_reset, reset);
VX_fpu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID))
) fpu_unit (
.clk (clk),
.reset (fpu_reset),
.dispatch_if (fpu_dispatch_if),
.fpu_to_csr_if (fpu_to_csr_if),
.commit_if (fpu_commit_if)
.reset (fpu_reset),
.dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.fpu_csr_if (fpu_csr_if)
);
`endif
VX_sfu_unit #(
.INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) sfu_unit (
.clk (clk),
.reset (sfu_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if),
`endif
.base_dcrs (base_dcrs),
.dispatch_if (sfu_dispatch_if),
.base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
.fpu_csr_if (fpu_csr_if),
`endif
.commit_csr_if (commit_csr_if),
.sched_csr_if (sched_csr_if),
.warp_ctl_if (warp_ctl_if),
.commit_if (sfu_commit_if)
.warp_ctl_if (warp_ctl_if)
);
// simulation helper signal to get RISC-V tests Pass/Fail status
assign sim_ebreak = alu_dispatch_if[0].valid && alu_dispatch_if[0].ready
&& alu_dispatch_if[0].data.wis == 0
&& `INST_ALU_IS_BR(alu_dispatch_if[0].data.op_mod)
&& (`INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_EBREAK
|| `INST_BR_BITS'(alu_dispatch_if[0].data.op_type) == `INST_BR_ECALL);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_fetch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -23,16 +23,15 @@ module VX_fetch import VX_gpu_pkg::*; #(
// Icache interface
VX_mem_bus_if.master icache_bus_if,
// inputs
VX_schedule_if.slave schedule_if,
// outputs
VX_fetch_if.master fetch_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (reset)
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
wire icache_req_valid;
wire [ICACHE_ADDR_WIDTH-1:0] icache_req_addr;
@ -40,60 +39,65 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire icache_req_ready;
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
wire icache_req_fire = icache_req_valid && icache_req_ready;
wire [ISW_WIDTH-1:0] schedule_isw = wid_to_isw(schedule_if.data.wid);
assign req_tag = schedule_if.data.wid;
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
wire [`XLEN-1:0] rsp_PC;
wire [`PC_BITS-1:0] rsp_PC;
wire [`NUM_THREADS-1:0] rsp_tmask;
VX_dp_ram #(
.DATAW (`XLEN + `NUM_THREADS),
.DATAW (`PC_BITS + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.LUTRAM (1)
) tag_store (
.clk (clk),
.clk (clk),
.reset (reset),
.read (1'b1),
.write (icache_req_fire),
`UNUSED_PIN (wren),
.write (icache_req_fire),
.wren (1'b1),
.waddr (req_tag),
.wdata ({schedule_if.data.PC, schedule_if.data.tmask}),
.raddr (rsp_tag),
.rdata ({rsp_PC, rsp_tmask})
);
`ifndef L1_ENABLE
// Ensure that the ibuffer doesn't fill up.
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
// This issue is particularly prevalent when the icache and dcache is disabled and both requests share the same bus.
wire [`ISSUE_WIDTH-1:0] pending_ibuf_full;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_pending_size #(
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`NUM_WARPS-1:0] pending_ibuf_full;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #(
.SIZE (`IBUF_SIZE)
) pending_reads (
.clk (clk),
.reset (reset),
.incr (icache_req_fire && schedule_isw == i),
.incr (icache_req_fire && schedule_if.data.wid == i),
.decr (fetch_if.ibuf_pop[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
wire ibuf_ready = ~pending_ibuf_full[schedule_if.data.wid];
`else
wire ibuf_ready = 1'b1;
`endif
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** %s invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, INSTANCE_ID, {schedule_if.data.PC, 1'b0}, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
// Icache Request
wire ibuf_ready = ~pending_ibuf_full[schedule_isw];
assign icache_req_valid = schedule_if.valid && ibuf_ready;
assign icache_req_addr = schedule_if.data.PC[`MEM_ADDR_WIDTH-1:2];
assign icache_req_addr = schedule_if.data.PC[1 +: ICACHE_ADDR_WIDTH];
assign icache_req_tag = {schedule_if.data.uuid, req_tag};
assign schedule_if.ready = icache_req_ready && ibuf_ready;
@ -112,9 +116,10 @@ module VX_fetch import VX_gpu_pkg::*; #(
.ready_out (icache_bus_if.req_ready)
);
assign icache_bus_if.req_data.atype = '0;
assign icache_bus_if.req_data.rw = 0;
assign icache_bus_if.req_data.byteen = 4'b1111;
assign icache_bus_if.req_data.data = '0;
assign icache_bus_if.req_data.data = '0;
// Icache Response
@ -127,56 +132,46 @@ module VX_fetch import VX_gpu_pkg::*; #(
assign icache_bus_if.rsp_ready = fetch_if.ready;
`ifdef DBG_SCOPE_FETCH
if (CORE_ID == 0) begin
`ifdef SCOPE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
.TRIGGERW (4),
.PROBEW (3*`UUID_WIDTH + 108)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
schedule_fire,
icache_req_fire,
icache_rsp_fire
}),
.probes({
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
);
`endif
end
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
ICACHE_TAG_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
(ICACHE_WORD_SIZE*8) + ICACHE_TAG_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
schedule_fire,
icache_req_fire,
icache_rsp_fire
}),
.probes ({
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef DBG_TRACE_CORE_ICACHE
`ifdef DBG_TRACE_MEM
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready;
always @(posedge clk) begin
if (schedule_fire) begin
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, schedule_if.data.PC, schedule_if.data.tmask, schedule_if.data.uuid));
`TRACE(1, ("%d: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
end
if (fetch_fire) begin
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, fetch_if.data.PC, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
`TRACE(1, ("%d: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
end
end
`endif

View file

@ -1,31 +1,32 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
`include "VX_fpu_define.vh"
module VX_fpu_unit import VX_fpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
VX_fpu_to_csr_if.master fpu_to_csr_if[`NUM_FPU_BLOCKS],
VX_commit_if.master commit_if [`ISSUE_WIDTH]
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_fpu_csr_if.master fpu_csr_if[`NUM_FPU_BLOCKS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -35,34 +36,32 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
) per_block_execute_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 1 : 0)
.OUT_BUF (PARTIAL_BW ? 1 : 0)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.reset (reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
.execute_if (per_block_execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE]();
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`UNUSED_VAR (execute_if[block_idx].data.tid)
`UNUSED_VAR (execute_if[block_idx].data.wb)
`UNUSED_VAR (execute_if[block_idx].data.use_PC)
`UNUSED_VAR (execute_if[block_idx].data.use_imm)
`UNUSED_VAR (per_block_execute_if[block_idx].data.tid)
`UNUSED_VAR (per_block_execute_if[block_idx].data.wb)
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
// Store request info
wire fpu_req_valid, fpu_req_ready;
wire fpu_rsp_valid, fpu_rsp_ready;
wire fpu_rsp_valid, fpu_rsp_ready;
wire [NUM_LANES-1:0][`XLEN-1:0] fpu_rsp_result;
fflags_t fpu_rsp_fflags;
wire fpu_rsp_has_fflags;
@ -70,68 +69,66 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire [`UUID_WIDTH-1:0] fpu_rsp_uuid;
wire [`NW_WIDTH-1:0] fpu_rsp_wid;
wire [NUM_LANES-1:0] fpu_rsp_tmask;
wire [`XLEN-1:0] fpu_rsp_PC;
wire [`PC_BITS-1:0] fpu_rsp_PC;
wire [`NR_BITS-1:0] fpu_rsp_rd;
wire [PID_WIDTH-1:0] fpu_rsp_pid;
wire fpu_rsp_sop;
wire fpu_rsp_eop;
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
wire mdata_full;
wire [`INST_FMT_BITS-1:0] fpu_fmt = execute_if[block_idx].data.imm[`INST_FMT_BITS-1:0];
wire [`INST_FRM_BITS-1:0] fpu_frm = execute_if[block_idx].data.op_mod[`INST_FRM_BITS-1:0];
wire [`INST_FMT_BITS-1:0] fpu_fmt = per_block_execute_if[block_idx].data.op_args.fpu.fmt;
wire [`INST_FRM_BITS-1:0] fpu_frm = per_block_execute_if[block_idx].data.op_args.fpu.frm;
wire execute_fire = execute_if[block_idx].valid && execute_if[block_idx].ready;
wire execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready;
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + PID_WIDTH + 1 + 1),
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.rd, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}),
.reset (block_reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.read_addr (fpu_rsp_tag),
.release_en (fpu_rsp_fire),
.release_en (fpu_rsp_fire),
.full (mdata_full),
`UNUSED_PIN (empty)
);
// resolve dynamic FRM from CSR
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].read_wid, execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_req_frm = (execute_if[block_idx].data.op_type != `INST_FPU_MISC
&& fpu_frm == `INST_FRM_DYN) ? fpu_to_csr_if[block_idx].read_frm : fpu_frm;
// resolve dynamic FRM from CSR
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_req_frm = (per_block_execute_if[block_idx].data.op_type != `INST_FPU_MISC
&& fpu_frm == `INST_FRM_DYN) ? fpu_csr_if[block_idx].read_frm : fpu_frm;
// submit FPU request
assign fpu_req_valid = execute_if[block_idx].valid && ~mdata_full;
assign execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
`RESET_RELAY (fpu_reset, reset);
assign fpu_req_valid = per_block_execute_if[block_idx].valid && ~mdata_full;
assign per_block_execute_if[block_idx].ready = fpu_req_ready && ~mdata_full;
`ifdef FPU_DPI
VX_fpu_dpi #(
.NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3)
.TAG_WIDTH (TAG_WIDTH),
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dpi (
.clk (clk),
.reset (fpu_reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.op_type (execute_if[block_idx].data.op_type),
.lane_mask (execute_if[block_idx].data.tmask),
.mask_in (per_block_execute_if[block_idx].data.tmask),
.op_type (per_block_execute_if[block_idx].data.op_type),
.fmt (fpu_fmt),
.frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data),
.dataa (per_block_execute_if[block_idx].data.rs1_data),
.datab (per_block_execute_if[block_idx].data.rs2_data),
.datac (per_block_execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag),
.ready_in (fpu_req_ready),
@ -140,67 +137,67 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.has_fflags (fpu_rsp_has_fflags),
.fflags (fpu_rsp_fflags),
.tag_out (fpu_rsp_tag),
.ready_out (fpu_rsp_ready)
);
.ready_out (fpu_rsp_ready)
);
`elsif FPU_FPNEW
VX_fpu_fpnew #(
.NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3)
.TAG_WIDTH (TAG_WIDTH),
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_fpnew (
.clk (clk),
.reset (fpu_reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.op_type (execute_if[block_idx].data.op_type),
.lane_mask (execute_if[block_idx].data.tmask),
.mask_in (per_block_execute_if[block_idx].data.tmask),
.op_type (per_block_execute_if[block_idx].data.op_type),
.fmt (fpu_fmt),
.frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data),
.dataa (per_block_execute_if[block_idx].data.rs1_data),
.datab (per_block_execute_if[block_idx].data.rs2_data),
.datac (per_block_execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag),
.ready_in (fpu_req_ready),
.valid_out (fpu_rsp_valid),
.valid_out (fpu_rsp_valid),
.result (fpu_rsp_result),
.has_fflags (fpu_rsp_has_fflags),
.fflags (fpu_rsp_fflags),
.tag_out (fpu_rsp_tag),
.ready_out (fpu_rsp_ready)
.tag_out (fpu_rsp_tag),
.ready_out (fpu_rsp_ready)
);
`elsif FPU_DSP
VX_fpu_dsp #(
.NUM_LANES (NUM_LANES),
.TAGW (TAG_WIDTH),
.OUT_REG (PARTIAL_BW ? 1 : 3)
.TAG_WIDTH (TAG_WIDTH),
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dsp (
.clk (clk),
.reset (fpu_reset),
.reset (block_reset),
.valid_in (fpu_req_valid),
.lane_mask (execute_if[block_idx].data.tmask),
.op_type (execute_if[block_idx].data.op_type),
.mask_in (per_block_execute_if[block_idx].data.tmask),
.op_type (per_block_execute_if[block_idx].data.op_type),
.fmt (fpu_fmt),
.frm (fpu_req_frm),
.dataa (execute_if[block_idx].data.rs1_data),
.datab (execute_if[block_idx].data.rs2_data),
.datac (execute_if[block_idx].data.rs3_data),
.dataa (per_block_execute_if[block_idx].data.rs1_data),
.datab (per_block_execute_if[block_idx].data.rs2_data),
.datac (per_block_execute_if[block_idx].data.rs3_data),
.tag_in (fpu_req_tag),
.ready_in (fpu_req_ready),
.valid_out (fpu_rsp_valid),
.result (fpu_rsp_result),
.valid_out (fpu_rsp_valid),
.result (fpu_rsp_result),
.has_fflags (fpu_rsp_has_fflags),
.fflags (fpu_rsp_fflags),
.tag_out (fpu_rsp_tag),
.ready_out (fpu_rsp_ready)
);
`endif
// handle FPU response
@ -210,7 +207,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
if (PID_BITS != 0) begin
fflags_t fpu_rsp_fflags_r;
always @(posedge clk) begin
if (reset) begin
if (block_reset) begin
fpu_rsp_fflags_r <= '0;
end else if (fpu_rsp_fire) begin
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
@ -220,39 +217,37 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
end else begin
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
end
assign fpu_to_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_to_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_to_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
assign fpu_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
// send response
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (0)
) rsp_buf (
.clk (clk),
.reset (reset),
.reset (block_reset),
.valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.data_out ({commit_block_if[block_idx].data.uuid, commit_block_if[block_idx].data.wid, commit_block_if[block_idx].data.tmask, commit_block_if[block_idx].data.PC, commit_block_if[block_idx].data.rd, commit_block_if[block_idx].data.data, commit_block_if[block_idx].data.pid, commit_block_if[block_idx].data.sop, commit_block_if[block_idx].data.eop}),
.valid_out (commit_block_if[block_idx].valid),
.ready_out (commit_block_if[block_idx].ready)
.data_out ({per_block_commit_if[block_idx].data.uuid, per_block_commit_if[block_idx].data.wid, per_block_commit_if[block_idx].data.tmask, per_block_commit_if[block_idx].data.PC, per_block_commit_if[block_idx].data.rd, per_block_commit_if[block_idx].data.data, per_block_commit_if[block_idx].data.pid, per_block_commit_if[block_idx].data.sop, per_block_commit_if[block_idx].data.eop}),
.valid_out (per_block_commit_if[block_idx].valid),
.ready_out (per_block_commit_if[block_idx].ready)
);
assign commit_block_if[block_idx].data.wb = 1'b1;
assign per_block_commit_if[block_idx].data.wb = 1'b1;
end
`RESET_RELAY (commit_reset, reset);
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (PARTIAL_BW ? 3 : 0)
.OUT_BUF (PARTIAL_BW ? 3 : 0)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.commit_in_if (commit_block_if),
.reset (reset),
.commit_in_if (per_block_commit_if),
.commit_out_if (commit_if)
);

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,22 +16,24 @@
module VX_gather_unit import VX_gpu_pkg::*; #(
parameter BLOCK_SIZE = 1,
parameter NUM_LANES = 1,
parameter OUT_REG = 0
) (
parameter OUT_BUF = 0
) (
input wire clk,
input wire reset,
// inputs
VX_commit_if.slave commit_in_if [BLOCK_SIZE],
// outputs
VX_commit_if.master commit_out_if [`ISSUE_WIDTH]
);
`STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);
wire [BLOCK_SIZE-1:0] commit_in_valid;
@ -71,24 +73,22 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_tmp_if();
`RESET_RELAY(commit_out_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)),
.OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (commit_out_reset),
.reset (reset),
.valid_in (commit_out_valid[i]),
.ready_in (commit_out_ready[i]),
.data_in (commit_out_data[i]),
.data_in (commit_out_data[i]),
.data_out (commit_tmp_if.data),
.valid_out (commit_tmp_if.valid),
.ready_out (commit_tmp_if.ready)

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,60 +14,73 @@
`include "VX_define.vh"
module VX_ibuffer import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
// inputs
VX_decode_if.slave decode_if,
// outputs
VX_ibuffer_if.master ibuffer_if [`ISSUE_WIDTH]
VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
);
`UNUSED_PARAM (CORE_ID)
localparam ISW_WIDTH = `LOG2UP(`ISSUE_WIDTH);
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4);
wire [`ISSUE_WIDTH-1:0] ibuf_ready_in;
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
wire [ISW_WIDTH-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
assign decode_if.ready = ibuf_ready_in[decode_isw];
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (1)
.OUT_REG (2) // 2-cycle EB for area reduction
) instr_buf (
.clk (clk),
.reset (reset),
.valid_in (decode_if.valid && decode_isw == i),
.ready_in (ibuf_ready_in[i]),
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(w)),
.data_in ({
decode_if.data.uuid,
decode_wis,
decode_if.data.tmask,
decode_if.data.PC,
decode_if.data.ex_type,
decode_if.data.op_type,
decode_if.data.op_mod,
decode_if.data.op_args,
decode_if.data.wb,
decode_if.data.use_PC,
decode_if.data.use_imm,
decode_if.data.PC,
decode_if.data.imm,
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3}),
.data_out(ibuffer_if[i].data),
.valid_out (ibuffer_if[i].valid),
.ready_out(ibuffer_if[i].ready)
);
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3
}),
.ready_in (ibuf_ready_in[w]),
.valid_out(ibuffer_if[w].valid),
.data_out (ibuffer_if[w].data),
.ready_out(ibuffer_if[w].ready)
);
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[w] = ibuffer_if[w].valid && ibuffer_if[w].ready;
`endif
end
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_if_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_if_stall);
end
end
assign perf_stalls = perf_ibf_stalls;
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,14 +24,15 @@ module VX_ipdom_stack #(
input wire [WIDTH-1:0] q0,
input wire [WIDTH-1:0] q1,
output wire [WIDTH-1:0] d,
output wire d_set,
output wire d_set,
output wire [ADDRW-1:0] q_ptr,
input wire push,
input wire pop,
input wire pop,
output wire empty,
output wire full
);
reg slot_set [DEPTH-1:0];
reg [ADDRW-1:0] rd_ptr, wr_ptr;
reg empty_r, full_r;
@ -41,28 +42,28 @@ module VX_ipdom_stack #(
wire d_set_n = slot_set[rd_ptr];
always @(posedge clk) begin
if (reset) begin
if (reset) begin
rd_ptr <= '0;
wr_ptr <= '0;
empty_r <= 1;
full_r <= 0;
full_r <= 0;
end else begin
`ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
`ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
`ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!"));
if (push) begin
if (push) begin
rd_ptr <= wr_ptr;
wr_ptr <= wr_ptr + ADDRW'(1);
wr_ptr <= wr_ptr + ADDRW'(1);
empty_r <= 0;
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
end else if (pop) begin
end else if (pop) begin
wr_ptr <= wr_ptr - ADDRW'(d_set_n);
rd_ptr <= rd_ptr - ADDRW'(d_set_n);
empty_r <= (rd_ptr == 0) && (d_set_n == 1);
full_r <= 0;
end
end
end
end
VX_dp_ram #(
.DATAW (WIDTH * 2),
@ -71,24 +72,26 @@ module VX_ipdom_stack #(
.LUTRAM (OUT_REG ? 0 : 1)
) store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.write (push),
.wren (1'b1),
.waddr (wr_ptr),
.wdata ({q1, q0}),
.raddr (rd_ptr),
.rdata ({d1, d0})
);
always @(posedge clk) begin
if (push) begin
slot_set[wr_ptr] <= 0;
end else if (pop) begin
slot_set[wr_ptr] <= 0;
end else if (pop) begin
slot_set[rd_ptr] <= 1;
end
end
wire d_set_r;
VX_pipe_register #(
.DATAW (1),
.DEPTH (OUT_REG)
@ -102,6 +105,7 @@ module VX_ipdom_stack #(
assign d = d_set_r ? d0 : d1;
assign d_set = ~d_set_r;
assign q_ptr = wr_ptr;
assign empty = empty_r;
assign full = full_r;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -12,10 +12,9 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_issue #(
parameter CORE_ID = 0
module VX_issue import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -23,152 +22,81 @@ module VX_issue #(
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.issue perf_issue_if,
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_dispatch_if.master alu_dispatch_if [`ISSUE_WIDTH],
VX_dispatch_if.master lsu_dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_dispatch_if.master fpu_dispatch_if [`ISSUE_WIDTH],
`endif
VX_dispatch_if.master sfu_dispatch_if [`ISSUE_WIDTH]
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
);
VX_ibuffer_if ibuffer_if [`ISSUE_WIDTH]();
VX_ibuffer_if scoreboard_if [`ISSUE_WIDTH]();
VX_operands_if operands_if [`ISSUE_WIDTH]();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.CORE_ID (CORE_ID)
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.CORE_ID (CORE_ID)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
VX_operands #(
.CORE_ID (CORE_ID)
) operands (
.clk (clk),
.reset (operands_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
VX_dispatch #(
.CORE_ID (CORE_ID)
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.alu_dispatch_if(alu_dispatch_if),
.lsu_dispatch_if(lsu_dispatch_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
`endif
.sfu_dispatch_if(sfu_dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
if (CORE_ID == 0) begin
`ifdef SCOPE
wire operands_if_fire = operands_if[0].valid && operands_if[0].ready;
wire operands_if_not_ready = ~operands_if[0].ready;
wire writeback_if_valid = writeback_if[0].valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS +
1 + `NR_BITS + `XLEN + 1 + 1 + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes({
operands_if[0].data.uuid,
operands_if[0].data.tmask,
operands_if[0].data.ex_type,
operands_if[0].data.op_type,
operands_if[0].data.op_mod,
operands_if[0].data.wb,
operands_if[0].data.rd,
operands_if[0].data.imm,
operands_if[0].data.use_PC,
operands_if[0].data.use_imm,
operands_if[0].data.rs1_data,
operands_if[0].data.rs2_data,
operands_if[0].data.rs3_data,
writeback_if[0].data.uuid,
writeback_if[0].data.tmask,
writeback_if[0].data.rd,
writeback_if[0].data.data,
writeback_if[0].data.eop
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({operands_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, operands_if.PC, operands_if.tmask, operands_if.wid, operands_if.ex_type, operands_if.op_type, operands_if.ready, operands_if.valid}),
.probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
end
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
`endif
wire [ISSUE_ISW_W-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
assign decode_if.ready = decode_ready_in[decode_isw];
`SCOPE_IO_SWITCH (`ISSUE_WIDTH)
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : issue_slices
VX_decode_if #(
.NUM_WARPS (PER_ISSUE_WARPS)
) per_issue_decode_if();
VX_dispatch_if per_issue_dispatch_if[`NUM_EX_UNITS]();
assign per_issue_decode_if.valid = decode_if.valid && (decode_isw == ISSUE_ISW_W'(issue_id));
assign per_issue_decode_if.data.uuid = decode_if.data.uuid;
assign per_issue_decode_if.data.wid = decode_wis;
assign per_issue_decode_if.data.tmask = decode_if.data.tmask;
assign per_issue_decode_if.data.PC = decode_if.data.PC;
assign per_issue_decode_if.data.ex_type = decode_if.data.ex_type;
assign per_issue_decode_if.data.op_type = decode_if.data.op_type;
assign per_issue_decode_if.data.op_args = decode_if.data.op_args;
assign per_issue_decode_if.data.wb = decode_if.data.wb;
assign per_issue_decode_if.data.rd = decode_if.data.rd;
assign per_issue_decode_if.data.rs1 = decode_if.data.rs1;
assign per_issue_decode_if.data.rs2 = decode_if.data.rs2;
assign per_issue_decode_if.data.rs3 = decode_if.data.rs3;
assign decode_ready_in[issue_id] = per_issue_decode_if.ready;
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop;
`endif
`RESET_RELAY (slice_reset, reset);
VX_issue_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
.ISSUE_ID (issue_id)
) issue_slice (
`SCOPE_IO_BIND(issue_id)
.clk (clk),
.reset (slice_reset),
`ifdef PERF_ENABLE
.issue_perf (per_issue_perf[issue_id]),
`endif
.decode_if (per_issue_decode_if),
.writeback_if (writeback_if[issue_id]),
.dispatch_if (per_issue_dispatch_if)
);
// Assign transposed dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
end
end
endmodule

View file

@ -0,0 +1,159 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter ISSUE_ID = 0
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if,
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
`UNUSED_PARAM (ISSUE_ID)
VX_ibuffer_if ibuffer_if [PER_ISSUE_WARPS]();
VX_scoreboard_if scoreboard_if();
VX_operands_if operands_if();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.ibf_stalls),
`endif
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
.perf_sfu_uses (issue_perf.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
VX_operands #(
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
) operands (
.clk (clk),
.reset (operands_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.opd_stalls),
`endif
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
VX_dispatch #(
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
wire operands_if_fire = operands_if.valid && operands_if.ready;
wire operands_if_not_ready = ~operands_if.ready;
wire writeback_if_valid = writeback_if.valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes ({
operands_if.data.uuid,
operands_if.data.tmask,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.wb,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data,
writeback_if.data.uuid,
writeback_if.data.tmask,
writeback_if.data.rd,
writeback_if.data.data,
writeback_if.data.eop
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (operands_if.valid && operands_if.ready) begin
`TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}));
trace_ex_type(1, operands_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid));
end
end
`endif
endmodule

132
hw/rtl/core/VX_issue_top.sv Normal file
View file

@ -0,0 +1,132 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_issue_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "issue"
) (
// Clock
input wire clk,
input wire reset,
input wire decode_valid,
input wire [`UUID_WIDTH-1:0] decode_uuid,
input wire [`NW_WIDTH-1:0] decode_wid,
input wire [`NUM_THREADS-1:0] decode_tmask,
input wire [`PC_BITS-1:0] decode_PC,
input wire [`EX_BITS-1:0] decode_ex_type,
input wire [`INST_OP_BITS-1:0] decode_op_type,
input op_args_t decode_op_args,
input wire decode_wb,
input wire [`NR_BITS-1:0] decode_rd,
input wire [`NR_BITS-1:0] decode_rs1,
input wire [`NR_BITS-1:0] decode_rs2,
input wire [`NR_BITS-1:0] decode_rs3,
output wire decode_ready,
input wire writeback_valid[`ISSUE_WIDTH],
input wire [`UUID_WIDTH-1:0] writeback_uuid[`ISSUE_WIDTH],
input wire [ISSUE_WIS_W-1:0] writeback_wis[`ISSUE_WIDTH],
input wire [`NUM_THREADS-1:0] writeback_tmask[`ISSUE_WIDTH],
input wire [`PC_BITS-1:0] writeback_PC[`ISSUE_WIDTH],
input wire [`NR_BITS-1:0] writeback_rd[`ISSUE_WIDTH],
input wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data[`ISSUE_WIDTH],
input wire writeback_sop[`ISSUE_WIDTH],
input wire writeback_eop[`ISSUE_WIDTH],
output wire dispatch_valid[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`UUID_WIDTH-1:0] dispatch_uuid[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [ISSUE_WIS_W-1:0] dispatch_wis[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0] dispatch_tmask[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`PC_BITS-1:0] dispatch_PC[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`INST_ALU_BITS-1:0] dispatch_op_type[`NUM_EX_UNITS * `ISSUE_WIDTH],
output op_args_t dispatch_op_args[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire dispatch_wb[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NR_BITS-1:0] dispatch_rd[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NT_WIDTH-1:0] dispatch_tid[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
input wire dispatch_ready[`NUM_EX_UNITS * `ISSUE_WIDTH]
);
VX_decode_if decode_if();
VX_dispatch_if dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
assign decode_if.valid = decode_valid;
assign decode_if.data.uuid = decode_uuid;
assign decode_if.data.wid = decode_wid;
assign decode_if.data.tmask = decode_tmask;
assign decode_if.data.PC = decode_PC;
assign decode_if.data.ex_type = decode_ex_type;
assign decode_if.data.op_type = decode_op_type;
assign decode_if.data.op_args = decode_op_args;
assign decode_if.data.wb = decode_wb;
assign decode_if.data.rd = decode_rd;
assign decode_if.data.rs1 = decode_rs1;
assign decode_if.data.rs2 = decode_rs2;
assign decode_if.data.rs3 = decode_rs3;
assign decode_ready = decode_if.ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign writeback_if[i].valid = writeback_valid[i];
assign writeback_if[i].data.uuid = writeback_uuid[i];
assign writeback_if[i].data.wis = writeback_wis[i];
assign writeback_if[i].data.tmask = writeback_tmask[i];
assign writeback_if[i].data.PC = writeback_PC[i];
assign writeback_if[i].data.rd = writeback_rd[i];
assign writeback_if[i].data.data = writeback_data[i];
assign writeback_if[i].data.sop = writeback_sop[i];
assign writeback_if[i].data.eop = writeback_eop[i];
end
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_uuid[i] = dispatch_if[i].data.uuid;
assign dispatch_wis[i] = dispatch_if[i].data.wis;
assign dispatch_tmask[i] = dispatch_if[i].data.tmask;
assign dispatch_PC[i] = dispatch_if[i].data.PC;
assign dispatch_op_type[i] = dispatch_if[i].data.op_type;
assign dispatch_op_args[i] = dispatch_if[i].data.op_args;
assign dispatch_wb[i] = dispatch_if[i].data.wb;
assign dispatch_rd[i] = dispatch_if[i].data.rd;
assign dispatch_tid[i] = dispatch_if[i].data.tid;
assign dispatch_rs1_data[i] = dispatch_if[i].data.rs1_data;
assign dispatch_rs2_data[i] = dispatch_if[i].data.rs2_data;
assign dispatch_rs3_data[i] = dispatch_if[i].data.rs3_data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
`ifdef PERF_ENABLE
issue_perf_t issue_perf = '0;
`endif
VX_issue #(
.INSTANCE_ID (INSTANCE_ID)
) issue (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.issue_perf (issue_perf),
`endif
.decode_if (decode_if),
.writeback_if (writeback_if),
.dispatch_if (dispatch_if)
);
endmodule

201
hw/rtl/core/VX_lmem_unit.sv Normal file
View file

@ -0,0 +1,201 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_lmem_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS],
VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
);
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `ADDR_TYPE_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_switch_if[`NUM_LSU_BLOCKS]();
`RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
end
wire is_addr_global = | (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask);
wire is_addr_local = | (lsu_mem_in_if[i].req_data.mask & is_addr_local_mask);
wire req_global_ready;
wire req_local_ready;
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (2),
.OUT_REG (1)
) req_global_buf (
.clk (clk),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_global),
.data_in ({
lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_global_ready),
.valid_out (lsu_mem_out_if[i].req_valid),
.data_out ({
lsu_mem_out_if[i].req_data.mask,
lsu_mem_out_if[i].req_data.rw,
lsu_mem_out_if[i].req_data.byteen,
lsu_mem_out_if[i].req_data.addr,
lsu_mem_out_if[i].req_data.atype,
lsu_mem_out_if[i].req_data.data,
lsu_mem_out_if[i].req_data.tag
}),
.ready_out (lsu_mem_out_if[i].req_ready)
);
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (0),
.OUT_REG (0)
) req_local_buf (
.clk (clk),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_local),
.data_in ({
lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_local_ready),
.valid_out (lsu_switch_if[i].req_valid),
.data_out ({
lsu_switch_if[i].req_data.mask,
lsu_switch_if[i].req_data.rw,
lsu_switch_if[i].req_data.byteen,
lsu_switch_if[i].req_data.addr,
lsu_switch_if[i].req_data.atype,
lsu_switch_if[i].req_data.data,
lsu_switch_if[i].req_data.tag
}),
.ready_out (lsu_switch_if[i].req_ready)
);
assign lsu_mem_in_if[i].req_ready = (req_global_ready && is_addr_global)
|| (req_local_ready && is_addr_local);
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.OUT_BUF (1)
) rsp_arb (
.clk (clk),
.reset (block_reset[i]),
.valid_in ({
lsu_switch_if[i].rsp_valid,
lsu_mem_out_if[i].rsp_valid
}),
.ready_in ({
lsu_switch_if[i].rsp_ready,
lsu_mem_out_if[i].rsp_ready
}),
.data_in ({
lsu_switch_if[i].rsp_data,
lsu_mem_out_if[i].rsp_data
}),
.data_out (lsu_mem_in_if[i].rsp_data),
.valid_out (lsu_mem_in_if[i].rsp_valid),
.ready_out (lsu_mem_in_if[i].rsp_ready),
`UNUSED_PIN (sel_out)
);
end
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (block_reset[i]),
.lsu_mem_if (lsu_switch_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
`RESET_RELAY (lmem_reset, reset);
VX_local_mem #(
.INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
.reset (lmem_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.mem_bus_if (lmem_bus_if)
);
endmodule

View file

@ -0,0 +1,121 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_lsu_adapter import VX_gpu_pkg::*; #(
parameter NUM_LANES = 1,
parameter DATA_SIZE = 1,
parameter TAG_WIDTH = 1,
parameter TAG_SEL_BITS = 0,
parameter `STRING ARBITER = "P",
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0
) (
input wire clk,
input wire reset,
VX_lsu_mem_if.slave lsu_mem_if,
VX_mem_bus_if.master mem_bus_if [NUM_LANES]
);
localparam REQ_ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE);
localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `ADDR_TYPE_WIDTH + DATA_SIZE * 8;
localparam RSP_DATA_WIDTH = DATA_SIZE * 8;
// handle request unpacking
wire [NUM_LANES-1:0][REQ_DATA_WIDTH-1:0] req_data_in;
wire [NUM_LANES-1:0] req_valid_out;
wire [NUM_LANES-1:0][REQ_DATA_WIDTH-1:0] req_data_out;
wire [NUM_LANES-1:0][TAG_WIDTH-1:0] req_tag_out;
wire [NUM_LANES-1:0] req_ready_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_data_in[i] = {
lsu_mem_if.req_data.rw,
lsu_mem_if.req_data.byteen[i],
lsu_mem_if.req_data.addr[i],
lsu_mem_if.req_data.atype[i],
lsu_mem_if.req_data.data[i]
};
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign mem_bus_if[i].req_valid = req_valid_out[i];
assign {
mem_bus_if[i].req_data.rw,
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.addr,
mem_bus_if[i].req_data.atype,
mem_bus_if[i].req_data.data
} = req_data_out[i];
assign mem_bus_if[i].req_data.tag = req_tag_out[i];
assign req_ready_out[i] = mem_bus_if[i].req_ready;
end
VX_stream_unpack #(
.NUM_REQS (NUM_LANES),
.DATA_WIDTH (REQ_DATA_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.OUT_BUF (REQ_OUT_BUF)
) stream_unpack (
.clk (clk),
.reset (reset),
.valid_in (lsu_mem_if.req_valid),
.mask_in (lsu_mem_if.req_data.mask),
.data_in (req_data_in),
.tag_in (lsu_mem_if.req_data.tag),
.ready_in (lsu_mem_if.req_ready),
.valid_out (req_valid_out),
.data_out (req_data_out),
.tag_out (req_tag_out),
.ready_out (req_ready_out)
);
// handle response packing
wire [NUM_LANES-1:0] rsp_valid_out;
wire [NUM_LANES-1:0][RSP_DATA_WIDTH-1:0] rsp_data_out;
wire [NUM_LANES-1:0][TAG_WIDTH-1:0] rsp_tag_out;
wire [NUM_LANES-1:0] rsp_ready_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign rsp_valid_out[i] = mem_bus_if[i].rsp_valid;
assign rsp_data_out[i] = mem_bus_if[i].rsp_data.data;
assign rsp_tag_out[i] = mem_bus_if[i].rsp_data.tag;
assign mem_bus_if[i].rsp_ready = rsp_ready_out[i];
end
VX_stream_pack #(
.NUM_REQS (NUM_LANES),
.DATA_WIDTH (RSP_DATA_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.TAG_SEL_BITS (TAG_SEL_BITS),
.ARBITER (ARBITER),
.OUT_BUF (RSP_OUT_BUF)
) stream_pack (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid_out),
.data_in (rsp_data_out),
.tag_in (rsp_tag_out),
.ready_in (rsp_ready_out),
.valid_out (lsu_mem_if.rsp_valid),
.mask_out (lsu_mem_if.rsp_data.mask),
.data_out (lsu_mem_if.rsp_data.data),
.tag_out (lsu_mem_if.rsp_data.tag),
.ready_out (lsu_mem_if.rsp_ready)
);
endmodule

557
hw/rtl/core/VX_lsu_slice.sv Normal file
View file

@ -0,0 +1,557 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if,
VX_lsu_mem_if.master lsu_mem_if
);
localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
localparam REQ_ASHIFT = `CLOG2(LSU_WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
// tag_id = wid + PC + wb + rd + op_type + align + pid + pkt_addr + fence
localparam TAG_ID_WIDTH = `NW_WIDTH + `PC_BITS + 1 + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW + 1;
// tag = uuid + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_rsp_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_no_rsp_if();
`UNUSED_VAR (execute_if.data.rs3_data)
`UNUSED_VAR (execute_if.data.tid)
// full address calculation
wire req_is_fence, rsp_is_fence;
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset);
end
// address type calculation
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
`ifdef LMEM_ENABLE
// is local memory address
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
// schedule memory request
wire mem_req_valid;
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] mem_req_addr;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [NUM_LANES-1:0] mem_rsp_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_sop;
wire mem_rsp_eop;
wire mem_rsp_ready;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
wire no_rsp_buf_valid, no_rsp_buf_ready;
// fence handling
reg fence_lock;
assign req_is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
always @(posedge clk) begin
if (reset) begin
fence_lock <= 0;
end else begin
if (mem_req_fire && req_is_fence && execute_if.data.eop) begin
fence_lock <= 1;
end
if (mem_rsp_fire && rsp_is_fence && mem_rsp_eop_pkt) begin
fence_lock <= 0;
end
end
end
wire req_skip = req_is_fence && ~execute_if.data.eop;
wire no_rsp_buf_enable = (mem_req_rw && ~execute_if.data.wb) || req_skip;
assign mem_req_valid = execute_if.valid
&& ~req_skip
&& ~(no_rsp_buf_enable && ~no_rsp_buf_ready)
&& ~fence_lock;
assign no_rsp_buf_valid = execute_if.valid
&& no_rsp_buf_enable
&& (req_skip || mem_req_ready)
&& ~fence_lock;
assign execute_if.ready = (mem_req_ready || req_skip)
&& ~(no_rsp_buf_enable && ~no_rsp_buf_ready)
&& ~fence_lock;
assign mem_req_mask = execute_if.data.tmask;
assign mem_req_rw = execute_if.data.op_args.lsu.is_store;
// address formatting
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_r;
always @(*) begin
mem_req_byteen_r = '0;
case (`INST_LSU_WSIZE(execute_if.data.op_type))
0: begin // 8-bit
mem_req_byteen_r[req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
// 3: 64 bit
default : mem_req_byteen_r = {LSU_WORD_SIZE{1'b1}};
endcase
end
assign mem_req_byteen[i] = mem_req_byteen_r;
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire lsu_req_fire = execute_if.valid && execute_if.ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_data[i] = execute_if.data.rs2_data[i];
case (req_align[i])
1: mem_req_data[i][`XLEN-1:8] = execute_if.data.rs2_data[i][`XLEN-9:0];
2: mem_req_data[i][`XLEN-1:16] = execute_if.data.rs2_data[i][`XLEN-17:0];
3: mem_req_data[i][`XLEN-1:24] = execute_if.data.rs2_data[i][`XLEN-25:0];
`ifdef XLEN_64
4: mem_req_data[i][`XLEN-1:32] = execute_if.data.rs2_data[i][`XLEN-33:0];
5: mem_req_data[i][`XLEN-1:40] = execute_if.data.rs2_data[i][`XLEN-41:0];
6: mem_req_data[i][`XLEN-1:48] = execute_if.data.rs2_data[i][`XLEN-49:0];
7: mem_req_data[i][`XLEN-1:56] = execute_if.data.rs2_data[i][`XLEN-57:0];
`endif
default:;
endcase
end
end
// track SOP/EOP for out-of-order memory responses
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
if (PID_BITS != 0) begin
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if.data.sop;
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if.data.eop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire full;
VX_allocator #(
.SIZE (`LSUQ_IN_SIZE)
) pkt_allocator (
.clk (clk),
.reset (reset),
.acquire_en (mem_req_rd_eop_fire),
.acquire_addr(pkt_waddr),
.release_en (mem_rsp_eop_pkt),
.release_addr(pkt_raddr),
`UNUSED_PIN (empty),
.full (full)
);
wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
always @(posedge clk) begin
if (reset) begin
pkt_ctr <= '0;
pkt_sop <= '0;
pkt_eop <= '0;
end else begin
if (mem_req_rd_sop_fire) begin
pkt_sop[pkt_waddr] <= 1;
end
if (mem_req_rd_eop_fire) begin
pkt_eop[pkt_waddr] <= 1;
end
if (mem_rsp_fire) begin
pkt_sop[pkt_raddr] <= 0;
end
if (mem_rsp_eop_pkt) begin
pkt_eop[pkt_raddr] <= 0;
end
if (~rd_during_wr) begin
if (mem_req_rd_fire) begin
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
end
if (mem_rsp_eop_fire) begin
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
end
end
end
end
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`UNUSED_VAR (mem_rsp_sop)
end else begin
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
`UNUSED_VAR (pkt_raddr)
end
// pack memory request tag
assign mem_req_tag = {
execute_if.data.uuid,
execute_if.data.wid,
execute_if.data.PC,
execute_if.data.wb,
execute_if.data.rd,
execute_if.data.op_type,
req_align,
execute_if.data.pid,
pkt_waddr,
req_is_fence
};
wire lsu_mem_req_valid;
wire lsu_mem_req_rw;
wire [NUM_LANES-1:0] lsu_mem_req_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
wire lsu_mem_req_ready;
wire lsu_mem_rsp_valid;
wire [NUM_LANES-1:0] lsu_mem_rsp_mask;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_rsp_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
wire lsu_mem_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)),
.CORE_REQS (NUM_LANES),
.MEM_CHANNELS(NUM_LANES),
.WORD_SIZE (LSU_WORD_SIZE),
.LINE_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
.UUID_WIDTH (`UUID_WIDTH),
.RSP_PARTIAL (1),
.MEM_OUT_BUF (0),
.CORE_OUT_BUF(0)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
// Input request
.core_req_valid (mem_req_valid),
.core_req_rw (mem_req_rw),
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_atype (mem_req_atype),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
`UNUSED_PIN (core_req_empty),
`UNUSED_PIN (core_req_sent),
// Output response
.core_rsp_valid (mem_rsp_valid),
.core_rsp_mask (mem_rsp_mask),
.core_rsp_data (mem_rsp_data),
.core_rsp_tag (mem_rsp_tag),
.core_rsp_sop (mem_rsp_sop),
.core_rsp_eop (mem_rsp_eop),
.core_rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (lsu_mem_req_valid),
.mem_req_rw (lsu_mem_req_rw),
.mem_req_mask (lsu_mem_req_mask),
.mem_req_byteen (lsu_mem_req_byteen),
.mem_req_addr (lsu_mem_req_addr),
.mem_req_atype (lsu_mem_req_atype),
.mem_req_data (lsu_mem_req_data),
.mem_req_tag (lsu_mem_req_tag),
.mem_req_ready (lsu_mem_req_ready),
// Memory response
.mem_rsp_valid (lsu_mem_rsp_valid),
.mem_rsp_mask (lsu_mem_rsp_mask),
.mem_rsp_data (lsu_mem_rsp_data),
.mem_rsp_tag (lsu_mem_rsp_tag),
.mem_rsp_ready (lsu_mem_rsp_ready)
);
assign lsu_mem_if.req_valid = lsu_mem_req_valid;
assign lsu_mem_if.req_data.mask = lsu_mem_req_mask;
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
assign lsu_mem_rsp_valid = lsu_mem_if.rsp_valid;
assign lsu_mem_rsp_mask = lsu_mem_if.rsp_data.mask;
assign lsu_mem_rsp_data = lsu_mem_if.rsp_data.data;
assign lsu_mem_rsp_tag = lsu_mem_if.rsp_data.tag;
assign lsu_mem_if.rsp_ready = lsu_mem_rsp_ready;
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] rsp_wid;
wire [`PC_BITS-1:0] rsp_pc;
wire rsp_wb;
wire [`NR_BITS-1:0] rsp_rd;
wire [`INST_LSU_BITS-1:0] rsp_op_type;
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
wire [PID_WIDTH-1:0] rsp_pid;
`UNUSED_VAR (rsp_op_type)
// unpack memory response tag
assign {
rsp_uuid,
rsp_wid,
rsp_pc,
rsp_wb,
rsp_rd,
rsp_op_type,
rsp_align,
rsp_pid,
pkt_raddr,
rsp_is_fence
} = mem_rsp_tag;
// load response formatting
reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
`ifdef XLEN_64
`ifdef EXT_F_ENABLE
// apply nan-boxing to flw outputs
wire rsp_is_float = rsp_rd[5];
`else
wire rsp_is_float = 0;
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
`ifdef XLEN_64
wire [63:0] rsp_data64 = mem_rsp_data[i];
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
`else
wire [31:0] rsp_data32 = mem_rsp_data[i];
`endif
wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
wire [7:0] rsp_data8 = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`INST_LSU_FMT(rsp_op_type))
`INST_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
`ifdef XLEN_64
`INST_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
`INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
`INST_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
`else
`INST_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
`endif
default: rsp_data[i] = 'x;
endcase
end
end
// commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (2)
) rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_rsp_valid),
.ready_in (mem_rsp_ready),
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_wb, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_out ({commit_rsp_if.data.uuid, commit_rsp_if.data.wid, commit_rsp_if.data.tmask, commit_rsp_if.data.PC, commit_rsp_if.data.wb, commit_rsp_if.data.rd, commit_rsp_if.data.data, commit_rsp_if.data.pid, commit_rsp_if.data.sop, commit_rsp_if.data.eop}),
.valid_out (commit_rsp_if.valid),
.ready_out (commit_rsp_if.ready)
);
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + PID_WIDTH + 1 + 1),
.SIZE (2)
) no_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (no_rsp_buf_valid),
.ready_in (no_rsp_buf_ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_no_rsp_if.data.uuid, commit_no_rsp_if.data.wid, commit_no_rsp_if.data.tmask, commit_no_rsp_if.data.PC, commit_no_rsp_if.data.pid, commit_no_rsp_if.data.sop, commit_no_rsp_if.data.eop}),
.valid_out (commit_no_rsp_if.valid),
.ready_out (commit_no_rsp_if.ready)
);
assign commit_no_rsp_if.data.rd = '0;
assign commit_no_rsp_if.data.wb = 1'b0;
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("P"), // prioritize commit_rsp_if
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in ({commit_no_rsp_if.valid, commit_rsp_if.valid}),
.ready_in ({commit_no_rsp_if.ready, commit_rsp_if.ready}),
.data_in ({commit_no_rsp_if.data, commit_rsp_if.data}),
.data_out (commit_if.data),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready),
`UNUSED_PIN (sel_out)
);
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (execute_if.valid && fence_lock) begin
`TRACE(1, ("%d: *** %s fence wait\n", $time, INSTANCE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
end else begin
`TRACE(1, ("%d: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
end
end
`endif
`ifdef DBG_SCOPE_LSU
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (1 + NUM_LANES*(`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE*8) + `UUID_WIDTH + NUM_LANES*LSU_WORD_SIZE*8 + `UUID_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes ({mem_req_rw, full_addr, mem_req_byteen, mem_req_data, execute_if.data.uuid, rsp_data, rsp_uuid}),
.bus_in (scope_bus_in),
.bus_out(scope_bus_out)
);
`else
`SCOPE_IO_UNUSED()
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,634 +14,71 @@
`include "VX_define.vh"
module VX_lsu_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
input wire clk,
input wire clk,
input wire reset,
// Dcache interface
VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS],
// inputs
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
// outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH]
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS]
);
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
localparam REQ_ASHIFT = `CLOG2(DCACHE_WORD_SIZE);
localparam CACHE_TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + LSUQ_TAG_BITS;
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES;
`ifdef SCOPE
`SCOPE_IO_SWITCH (BLOCK_SIZE);
`endif
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
) per_block_execute_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (1)
.OUT_BUF (1)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.reset (reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
.execute_if (per_block_execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_st_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_ld_if();
`UNUSED_VAR (execute_if[0].data.op_mod)
`UNUSED_VAR (execute_if[0].data.use_PC)
`UNUSED_VAR (execute_if[0].data.use_imm)
`UNUSED_VAR (execute_if[0].data.rs3_data)
`UNUSED_VAR (execute_if[0].data.tid)
) per_block_commit_if[BLOCK_SIZE]();
`ifdef SM_ENABLE
`STATIC_ASSERT((1 << `SMEM_LOG_SIZE) == `MEM_BLOCK_SIZE * ((1 << `SMEM_LOG_SIZE) / `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter"))
localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT);
localparam SMEM_END_B = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
`endif
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : lsu_slices
// tag = uuid + addr_type + wid + PC + tmask + rd + op_type + align + is_dup + pid + pkt_addr
localparam TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + `NW_WIDTH + `XLEN + NUM_LANES + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + `LSU_DUP_ENABLED + PID_WIDTH + LSUQ_SIZEW;
`RESET_RELAY (slice_reset, reset);
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type;
// full address calculation
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign full_addr[i] = execute_if[0].data.rs1_data[i][`XLEN-1:0] + execute_if[0].data.imm;
end
// detect duplicate addresses
wire lsu_is_dup;
`ifdef LSU_DUP_ENABLE
if (NUM_LANES > 1) begin
wire [NUM_LANES-2:0] addr_matches;
for (genvar i = 0; i < (NUM_LANES-1); ++i) begin
assign addr_matches[i] = (execute_if[0].data.rs1_data[i+1] == execute_if[0].data.rs1_data[0]) || ~execute_if[0].data.tmask[i+1];
end
assign lsu_is_dup = execute_if[0].data.tmask[0] && (& addr_matches);
end else begin
assign lsu_is_dup = 0;
end
`else
assign lsu_is_dup = 0;
`endif
// detect address type
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] full_addr_b = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is non-cacheable I/O address
wire is_addr_io = (full_addr_b >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
`ifdef SM_ENABLE
// is shared memory address
wire is_addr_sm = (full_addr_b >= SMEM_START_B) && (full_addr_b < SMEM_END_B);
assign lsu_addr_type[i] = {is_addr_io, is_addr_sm};
`else
assign lsu_addr_type[i] = is_addr_io;
`endif
end
wire mem_req_empty;
wire st_rsp_ready;
wire lsu_valid, lsu_ready;
// fence: stall the pipeline until all pending requests are sent
wire is_fence = `INST_LSU_IS_FENCE(execute_if[0].data.op_type);
wire fence_wait = is_fence && ~mem_req_empty;
assign lsu_valid = execute_if[0].valid && ~fence_wait;
assign execute_if[0].ready = lsu_ready && ~fence_wait;
// schedule memory request
wire mem_req_valid;
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
reg [NUM_LANES-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [NUM_LANES-1:0] mem_rsp_mask;
wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_sop;
wire mem_rsp_eop;
wire mem_rsp_ready;
assign mem_req_valid = lsu_valid;
assign lsu_ready = mem_req_ready
&& (~mem_req_rw || st_rsp_ready); // writes commit directly
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign mem_req_mask[i] = execute_if[0].data.tmask[i] && (~lsu_is_dup || (i == 0));
end
assign mem_req_rw = ~execute_if[0].data.wb;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
// address formatting
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_byteen[i] = '0;
case (`INST_LSU_WSIZE(execute_if[0].data.op_type))
0: begin // 8-bit
mem_req_byteen[i][req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {DCACHE_WORD_SIZE{1'b1}};
endcase
end
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire lsu_req_fire = execute_if[0].valid && execute_if[0].ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if[0].data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if[0].data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if[0].data.wid, execute_if[0].data.PC, full_addr[i], `INST_LSU_WSIZE(execute_if[0].data.op_type), execute_if[0].data.uuid));
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_data[i] = execute_if[0].data.rs2_data[i];
case (req_align[i])
1: mem_req_data[i][`XLEN-1:8] = execute_if[0].data.rs2_data[i][`XLEN-9:0];
2: mem_req_data[i][`XLEN-1:16] = execute_if[0].data.rs2_data[i][`XLEN-17:0];
3: mem_req_data[i][`XLEN-1:24] = execute_if[0].data.rs2_data[i][`XLEN-25:0];
`ifdef XLEN_64
4: mem_req_data[i][`XLEN-1:32] = execute_if[0].data.rs2_data[i][`XLEN-33:0];
5: mem_req_data[i][`XLEN-1:40] = execute_if[0].data.rs2_data[i][`XLEN-41:0];
6: mem_req_data[i][`XLEN-1:48] = execute_if[0].data.rs2_data[i][`XLEN-49:0];
7: mem_req_data[i][`XLEN-1:56] = execute_if[0].data.rs2_data[i][`XLEN-57:0];
`endif
default:;
endcase
end
end
// track SOP/EOP for out-of-order memory responses
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
if (PID_BITS != 0) begin
reg [`LSUQ_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_SIZE-1:0] pkt_sop, pkt_eop;
wire mem_req_rd_fire = mem_req_fire && execute_if[0].data.wb;
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if[0].data.sop;
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if[0].data.eop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire full;
VX_allocator #(
.SIZE (`LSUQ_SIZE)
) pkt_allocator (
VX_lsu_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx))
) lsu_slice(
`SCOPE_IO_BIND (block_idx)
.clk (clk),
.reset (reset),
.acquire_en (mem_req_rd_eop_fire),
.acquire_addr(pkt_waddr),
.release_en (mem_rsp_eop_pkt),
.release_addr(pkt_raddr),
`UNUSED_PIN (empty),
.full (full)
.reset (slice_reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.lsu_mem_if (lsu_mem_if[block_idx])
);
wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
always @(posedge clk) begin
if (reset) begin
pkt_ctr <= '0;
pkt_sop <= '0;
pkt_eop <= '0;
end else begin
if (mem_req_rd_sop_fire) begin
pkt_sop[pkt_waddr] <= 1;
end
if (mem_req_rd_eop_fire) begin
pkt_eop[pkt_waddr] <= 1;
end
if (mem_rsp_fire) begin
pkt_sop[pkt_raddr] <= 0;
end
if (mem_rsp_eop_pkt) begin
pkt_eop[pkt_raddr] <= 0;
end
if (~rd_during_wr) begin
if (mem_req_rd_fire) begin
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
end
if (mem_rsp_eop_fire) begin
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
end
end
end
end
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`UNUSED_VAR (mem_rsp_sop)
end else begin
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
`UNUSED_VAR (pkt_raddr)
end
assign mem_req_tag = {
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
`ifdef LSU_DUP_ENABLE
, lsu_is_dup
`endif
};
wire [DCACHE_NUM_REQS-1:0] cache_req_valid;
wire [DCACHE_NUM_REQS-1:0] cache_req_rw;
wire [DCACHE_NUM_REQS-1:0][(`XLEN/8)-1:0] cache_req_byteen;
wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_req_data;
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_NUM_REQS-1:0] cache_req_ready;
wire [DCACHE_NUM_REQS-1:0] cache_rsp_valid;
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_rsp_data;
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_rsp_tag;
wire [DCACHE_NUM_REQS-1:0] cache_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched", CORE_ID)),
.NUM_REQS (LSU_MEM_REQS),
.NUM_BANKS (DCACHE_NUM_REQS),
.ADDR_WIDTH (DCACHE_ADDR_WIDTH),
.DATA_WIDTH (`XLEN),
.QUEUE_SIZE (`LSUQ_SIZE),
.TAG_WIDTH (TAG_WIDTH),
.MEM_TAG_ID (`UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS)),
.UUID_WIDTH (`UUID_WIDTH),
.RSP_PARTIAL (1),
.MEM_OUT_REG (2)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
// Input request
.req_valid (mem_req_valid),
.req_rw (mem_req_rw),
.req_mask (mem_req_mask),
.req_byteen (mem_req_byteen),
.req_addr (mem_req_addr),
.req_data (mem_req_data),
.req_tag (mem_req_tag),
.req_empty (mem_req_empty),
.req_ready (mem_req_ready),
`UNUSED_PIN (write_notify),
// Output response
.rsp_valid (mem_rsp_valid),
.rsp_mask (mem_rsp_mask),
.rsp_data (mem_rsp_data),
.rsp_tag (mem_rsp_tag),
.rsp_sop (mem_rsp_sop),
.rsp_eop (mem_rsp_eop),
.rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (cache_req_valid),
.mem_req_rw (cache_req_rw),
.mem_req_byteen (cache_req_byteen),
.mem_req_addr (cache_req_addr),
.mem_req_data (cache_req_data),
.mem_req_tag (cache_req_tag),
.mem_req_ready (cache_req_ready),
// Memory response
.mem_rsp_valid (cache_rsp_valid),
.mem_rsp_data (cache_rsp_data),
.mem_rsp_tag (cache_rsp_tag),
.mem_rsp_ready (cache_rsp_ready)
);
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign cache_bus_if[i].req_valid = cache_req_valid[i];
assign cache_bus_if[i].req_data.rw = cache_req_rw[i];
assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
assign cache_bus_if[i].req_data.data = cache_req_data[i];
assign cache_req_ready[i] = cache_bus_if[i].req_ready;
assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
end
// cache tag formatting: <uuid, tag, type>
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
wire [`UUID_WIDTH-1:0] cache_req_uuid, cache_rsp_uuid;
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type, cache_rsp_type;
wire [`CLOG2(`LSUQ_SIZE)-1:0] cache_req_tag_x, cache_rsp_tag_x;
if (DCACHE_NUM_BATCHES > 1) begin
wire [DCACHE_NUM_BATCHES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_b, cache_rsp_type_b;
wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_bi, cache_rsp_type_bi;
wire [DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
assign {cache_req_uuid, cache_req_type, cache_req_bid, cache_req_tag_x} = cache_req_tag[i];
assign cache_req_type_bi = cache_req_type_b[cache_req_bid];
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_bid, cache_req_tag_x, cache_req_type_bi};
assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag_x, cache_rsp_type_bi} = cache_bus_if[i].rsp_data.tag;
assign cache_rsp_type_b = {DCACHE_NUM_BATCHES{cache_rsp_type_bi}};
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_bid, cache_rsp_tag_x};
for (genvar j = 0; j < DCACHE_NUM_BATCHES; ++j) begin
localparam k = j * DCACHE_NUM_REQS + i;
if (k < NUM_LANES) begin
assign cache_req_type_b[j] = cache_req_type[k];
assign cache_rsp_type[k] = cache_rsp_type_b[j];
end else begin
assign cache_req_type_b[j] = '0;
`UNUSED_VAR (cache_rsp_type_b[j])
end
end
end else begin
assign {cache_req_uuid, cache_req_type, cache_req_tag_x} = cache_req_tag[i];
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_tag_x, cache_req_type[i]};
assign {cache_rsp_uuid, cache_rsp_tag_x, cache_rsp_type[i]} = cache_bus_if[i].rsp_data.tag;
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_tag_x};
for (genvar j = 0; j < DCACHE_NUM_REQS; ++j) begin
if (i != j) begin
`UNUSED_VAR (cache_req_type[j])
assign cache_rsp_type[j] = '0;
end
end
end
end
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] rsp_addr_type;
wire [`NW_WIDTH-1:0] rsp_wid;
wire [NUM_LANES-1:0] rsp_tmask_uq;
wire [`XLEN-1:0] rsp_pc;
wire [`NR_BITS-1:0] rsp_rd;
wire [`INST_LSU_BITS-1:0] rsp_op_type;
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
wire [PID_WIDTH-1:0] rsp_pid;
wire rsp_is_dup;
`ifndef LSU_DUP_ENABLE
assign rsp_is_dup = 0;
`endif
assign {
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
`ifdef LSU_DUP_ENABLE
, rsp_is_dup
`endif
} = mem_rsp_tag;
`UNUSED_VAR (rsp_addr_type)
`UNUSED_VAR (rsp_op_type)
// load response formatting
reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
wire [NUM_LANES-1:0] rsp_tmask;
`ifdef XLEN_64
`ifdef EXT_F_ENABLE
// apply nan-boxing to flw outputs
wire rsp_is_float = rsp_rd[5];
`else
wire rsp_is_float = 0;
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
`ifdef XLEN_64
wire [63:0] rsp_data64 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? (rsp_align[0][2] ? mem_rsp_data[0][63:32] : mem_rsp_data[0][31:0]) :
(rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
`else
wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? mem_rsp_data[0] : mem_rsp_data[i];
`endif
wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
wire [7:0] rsp_data8 = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`INST_LSU_FMT(rsp_op_type))
`INST_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
`ifdef XLEN_64
`INST_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
`INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
`INST_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
`else
`INST_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
`endif
default: rsp_data[i] = 'x;
endcase
end
end
assign rsp_tmask = rsp_is_dup ? rsp_tmask_uq : mem_rsp_mask;
// load commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (2)
) ld_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_rsp_valid),
.ready_in (mem_rsp_ready),
.data_in ({rsp_uuid, rsp_wid, rsp_tmask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
.valid_out (commit_ld_if.valid),
.ready_out (commit_ld_if.ready)
);
assign commit_ld_if.data.wb = 1'b1;
// store commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
.SIZE (2)
) st_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_fire && mem_req_rw),
.ready_in (st_rsp_ready),
.data_in ({execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.pid, execute_if[0].data.sop, execute_if[0].data.eop}),
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
.valid_out (commit_st_if.valid),
.ready_out (commit_st_if.ready)
);
assign commit_st_if.data.rd = '0;
assign commit_st_if.data.wb = 1'b0;
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
// lsu commit
`RESET_RELAY (commit_reset, reset);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_arb_if[1]();
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.OUT_REG (3)
) rsp_arb (
.clk (clk),
.reset (commit_reset),
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
.data_in ({commit_st_if.data, commit_ld_if.data}),
.data_out (commit_arb_if[0].data),
.valid_out (commit_arb_if[0].valid),
.ready_out (commit_arb_if[0].ready),
`UNUSED_PIN (sel_out)
);
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (3)
.OUT_BUF (3)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.commit_in_if (commit_arb_if),
.reset (reset),
.commit_in_if (per_block_commit_if),
.commit_out_if (commit_if)
);
`ifdef DBG_SCOPE_LSU
if (CORE_ID == 0) begin
`ifdef SCOPE
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (`UUID_WIDTH+NUM_LANES*(`XLEN+4+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes({execute_if[0].data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
wire [31:0] full_addr_0 = full_addr[0];
wire [31:0] mem_req_data_0 = mem_req_data[0];
wire [31:0] rsp_data_0 = rsp_data[0];
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({mem_req_data_0, execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, rsp_tmask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
.probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
.probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef DBG_TRACE_CORE_DCACHE
always @(posedge clk) begin
if (execute_if[0].valid && fence_wait) begin
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
`TRACE(1, (", data="));
`TRACE_ARRAY1D(1, mem_req_data, NUM_LANES);
`TRACE(1, (", is_dup=%b (#%0d)\n", lsu_is_dup, execute_if[0].data.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
`TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, tag=0x%0h, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, mem_rsp_tag, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, mem_rsp_data, NUM_LANES);
`TRACE(1, (", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid));
end
end
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -13,246 +13,242 @@
`include "VX_define.vh"
// reset all GPRs in debug mode
`ifdef SIMULATION
`ifndef NDEBUG
`define GPR_RESET
`endif
`endif
module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter CACHE_ENABLE = 0
parameter `STRING INSTANCE_ID = "",
parameter NUM_BANKS = 4,
parameter OUT_BUF = 4 // using 2-cycle EB for area reduction
) (
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_operands_if.master operands_if [`ISSUE_WIDTH]
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_operands_if.master operands_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + 1 + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + `NR_BITS;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * ISSUE_RATIO);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_REGS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
localparam BYTEENW = `NUM_THREADS * XLEN_SIZE;
localparam STATE_IDLE = 2'd0;
localparam STATE_FETCH1 = 2'd1;
localparam STATE_FETCH2 = 2'd2;
localparam STATE_FETCH3 = 2'd3;
localparam STATE_BITS = 2;
`UNUSED_VAR (writeback_if.data.sop)
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
wire [NUM_SRC_REGS-1:0] src_valid;
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
reg valid_out_r;
reg [DATAW-1:0] data_out_r;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
wire pipe_valid_st1, pipe_ready_st1;
wire pipe_valid_st2, pipe_ready_st2;
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [STATE_BITS-1:0] state, state_n;
reg [`NR_BITS-1:0] rs2, rs2_n;
reg [`NR_BITS-1:0] rs3, rs3_n;
reg rs2_ready, rs2_ready_n;
reg rs3_ready, rs3_ready_n;
reg data_ready, data_ready_n;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
wire ready_out = operands_if[i].ready;
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
reg [NUM_SRC_REGS-1:0] data_fetched_n;
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
always @(*) begin
state_n = state;
rs2_n = rs2;
rs3_n = rs3;
rs2_ready_n = rs2_ready;
rs3_ready_n = rs3_ready;
rs1_data_n = rs1_data;
rs2_data_n = rs2_data;
rs3_data_n = rs3_data;
cache_data_n = cache_data;
cache_reg_n = cache_reg;
cache_tmask_n= cache_tmask;
cache_eop_n = cache_eop;
gpr_rd_rid_n = gpr_rd_rid;
gpr_rd_wis_n = gpr_rd_wis;
data_ready_n = data_ready;
reg has_collision_n;
wire has_collision_st1;
case (state)
STATE_IDLE: begin
if (valid_out_r && ready_out) begin
data_ready_n = 0;
end
if (scoreboard_if[i].valid && data_ready_n == 0) begin
data_ready_n = 1;
if (is_rs3_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs3 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs3_data_n = (is_rs3_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs3_ready_n = 1;
end else begin
rs3_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs3;
data_ready_n = 0;
state_n = STATE_FETCH3;
end
if (is_rs2_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs2 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs2_data_n = (is_rs2_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
rs2_ready_n = 1;
end else begin
rs2_ready_n = 0;
gpr_rd_rid_n = scoreboard_if[i].data.rs2;
data_ready_n = 0;
state_n = STATE_FETCH2;
end
if (is_rs1_zero || (CACHE_ENABLE != 0 &&
scoreboard_if[i].data.rs1 == cache_reg[scoreboard_if[i].data.wis] &&
(scoreboard_if[i].data.tmask & cache_tmask[scoreboard_if[i].data.wis]) == scoreboard_if[i].data.tmask)) begin
rs1_data_n = (is_rs1_zero || CACHE_ENABLE == 0) ? '0 : cache_data[scoreboard_if[i].data.wis];
end else begin
gpr_rd_rid_n = scoreboard_if[i].data.rs1;
data_ready_n = 0;
state_n = STATE_FETCH1;
end
end
gpr_rd_wis_n = scoreboard_if[i].data.wis;
rs2_n = scoreboard_if[i].data.rs2;
rs3_n = scoreboard_if[i].data.rs3;
end
STATE_FETCH1: begin
rs1_data_n = gpr_rd_data;
if (~rs2_ready) begin
gpr_rd_rid_n = rs2;
state_n = STATE_FETCH2;
end else if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH2: begin
rs2_data_n = gpr_rd_data;
if (~rs3_ready) begin
gpr_rd_rid_n = rs3;
state_n = STATE_FETCH3;
end else begin
data_ready_n = 1;
state_n = STATE_IDLE;
end
end
STATE_FETCH3: begin
rs3_data_n = gpr_rd_data;
data_ready_n = 1;
state_n = STATE_IDLE;
end
endcase
if (CACHE_ENABLE != 0 && writeback_if[i].valid) begin
if ((cache_reg[writeback_if[i].data.wis] == writeback_if[i].data.rd)
|| (cache_eop[writeback_if[i].data.wis] && writeback_if[i].data.sop)) begin
for (integer j = 0; j < `NUM_THREADS; ++j) begin
if (writeback_if[i].data.tmask[j]) begin
cache_data_n[writeback_if[i].data.wis][j] = writeback_if[i].data.data[j];
end
end
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
end
end
end
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
scoreboard_if.data.rs1};
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
cache_eop <= {ISSUE_RATIO{1'b1}};
data_ready <= 0;
valid_out_r <= 0;
end else begin
state <= state_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
if (~valid_out_r) begin
valid_out_r <= scoreboard_if[i].valid && data_ready;
end else if (ready_out) begin
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= {scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd};
end
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
end
assign operands_if[i].valid = valid_out_r;
assign {operands_if[i].data.uuid,
operands_if[i].data.wis,
operands_if[i].data.tmask,
operands_if[i].data.PC,
operands_if[i].data.wb,
operands_if[i].data.ex_type,
operands_if[i].data.op_type,
operands_if[i].data.op_mod,
operands_if[i].data.use_PC,
operands_if[i].data.use_imm,
operands_if[i].data.imm,
operands_if[i].data.rd} = data_out_r;
assign operands_if[i].data.rs1_data = rs1_data;
assign operands_if[i].data.rs2_data = rs2_data;
assign operands_if[i].data.rs3_data = rs3_data;
assign scoreboard_if[i].ready = ~valid_out_r && data_ready;
// GPR banks
reg [RAM_ADDRW-1:0] gpr_rd_addr;
wire [RAM_ADDRW-1:0] gpr_wr_addr;
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if[i].data.wis, writeback_if[i].data.rd};
always @(posedge clk) begin
gpr_rd_addr <= {gpr_rd_wis_n, gpr_rd_rid_n};
end
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin
assign gpr_wr_addr = writeback_if[i].data.rd;
always @(posedge clk) begin
gpr_rd_addr <= gpr_rd_rid_n;
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
end
if (NUM_BANKS != 1) begin
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
end else begin
assign req_bank_idx[i] = '0;
end
end
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
end
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
.reset (reset),
`UNUSED_PIN(collisions),
.valid_in (req_in_valid),
.data_in (req_in_data),
.sel_in (req_bank_idx),
.ready_in (req_in_ready),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out (gpr_rd_ready)
);
wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1;
assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}};
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
always @(*) begin
has_collision_n = 0;
for (integer i = 0; i < NUM_SRC_REGS; ++i) begin
for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin
has_collision_n |= src_valid[i]
&& src_valid[j+i]
&& (req_bank_idx[i] == req_bank_idx[j+i]);
end
end
end
always @(*) begin
data_fetched_n = data_fetched_st1;
if (scoreboard_if.ready) begin
data_fetched_n = '0;
end else begin
data_fetched_n = data_fetched_st1 | req_in_ready;
end
end
assign pipe_data = {
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd,
scoreboard_if.data.uuid
};
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_REGS)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (pipe_in_ready),
.data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1})
);
assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2;
assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n;
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
) pipe_reg2 (
.clk (clk),
.reset (pipe2_reset),
.enable (pipe_ready_st1),
.data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
.data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2})
);
always @(*) begin
src_data_n = src_data_st2;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid_st2[b]) begin
src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
end
end
end
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (1)
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (pipe_valid_st2),
.ready_in (pipe_ready_st2),
.data_in ({
pipe_data_st2,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.data_out ({
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.wb,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.uuid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.valid_out (operands_if.valid),
.ready_out (operands_if.ready)
);
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
end else begin
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
end
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
if (NUM_BANKS != 1) begin
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
end else begin
assign gpr_wr_bank_idx = '0;
end
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
@ -260,32 +256,56 @@ module VX_operands import VX_gpu_pkg::*; #(
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
for (genvar j = 0; j < `NUM_THREADS; ++j) begin
VX_dp_ram #(
.DATAW (`XLEN),
.SIZE (`NUM_REGS * ISSUE_RATIO),
`ifdef GPR_RESET
.INIT_ENABLE (1),
.INIT_VALUE (0),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.read (1'b1),
`UNUSED_PIN (wren),
`ifdef GPR_RESET
.write (wr_enabled && writeback_if[i].valid && writeback_if[i].data.tmask[j]),
`else
.write (writeback_if[i].valid && writeback_if[i].data.tmask[j]),
`endif
.waddr (gpr_wr_addr),
.wdata (writeback_if[i].data.data[j]),
.raddr (gpr_rd_addr),
.rdata (gpr_rd_data[j])
);
for (genvar b = 0; b < NUM_BANKS; ++b) begin
wire gpr_wr_enabled;
if (BANK_SEL_BITS != 0) begin
assign gpr_wr_enabled = wr_enabled
&& writeback_if.valid
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
end else begin
assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
end
wire [BYTEENW-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
VX_dp_ram #(
.DATAW (REGS_DATAW),
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.WRENW (BYTEENW),
`ifdef GPR_RESET
.RESET_RAM (1),
`endif
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
.reset (reset),
.read (pipe_fire_st1),
.wren (wren),
.write (gpr_wr_enabled),
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr_st1[b]),
.rdata (gpr_rd_data_st1[b])
);
end
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] collisions_r;
always @(posedge clk) begin
if (reset) begin
collisions_r <= '0;
end else begin
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_in_ready && has_collision_n);
end
end
assign perf_stalls = collisions_r;
`endif
endmodule

View file

@ -1,79 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_pending_instr #(
parameter CTR_WIDTH = 12,
parameter ALM_EMPTY = 1,
parameter DECR_COUNT = 1
) (
input wire clk,
input wire reset,
input wire incr,
input wire [`NW_WIDTH-1:0] incr_wid,
input wire [DECR_COUNT-1:0] decr,
input wire [DECR_COUNT-1:0][`NW_WIDTH-1:0] decr_wid,
input wire [`NW_WIDTH-1:0] alm_empty_wid,
output wire empty,
output wire alm_empty
);
localparam COUNTW = `CLOG2(DECR_COUNT+1);
reg [`NUM_WARPS-1:0][CTR_WIDTH-1:0] pending_instrs;
reg [`NUM_WARPS-1:0][COUNTW-1:0] decr_cnt;
reg [`NUM_WARPS-1:0][DECR_COUNT-1:0] decr_mask;
reg [`NUM_WARPS-1:0] incr_cnt, incr_cnt_n;
reg [`NUM_WARPS-1:0] alm_empty_r, empty_r;
always @(*) begin
incr_cnt_n = 0;
decr_mask = 0;
if (incr) begin
incr_cnt_n[incr_wid] = 1;
end
for (integer i = 0; i < DECR_COUNT; ++i) begin
if (decr[i]) begin
decr_mask[decr_wid[i]][i] = 1;
end
end
end
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
wire [COUNTW-1:0] decr_cnt_n;
`POP_COUNT(decr_cnt_n, decr_mask[i]);
wire [CTR_WIDTH-1:0] pending_instrs_n = pending_instrs[i] + CTR_WIDTH'(incr_cnt[i]) - CTR_WIDTH'(decr_cnt[i]);
always @(posedge clk) begin
if (reset) begin
incr_cnt[i] <= '0;
decr_cnt[i] <= '0;
pending_instrs[i] <= '0;
alm_empty_r[i] <= 0;
empty_r[i] <= 1;
end else begin
incr_cnt[i] <= incr_cnt_n[i];
decr_cnt[i] <= decr_cnt_n;
pending_instrs[i] <= pending_instrs_n;
alm_empty_r[i] <= (pending_instrs_n == ALM_EMPTY);
empty_r[i] <= (pending_instrs_n == 0);
end
end
end
assign alm_empty = alm_empty_r[alm_empty_wid];
assign empty = (& empty_r);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,20 +14,21 @@
`include "VX_define.vh"
module VX_schedule import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.schedule perf_schedule_if,
output sched_perf_t sched_perf,
`endif
// configuration
input base_dcrs_t base_dcrs,
// inputsdecode_if
VX_warp_ctl_if.slave warp_ctl_if,
VX_warp_ctl_if.slave warp_ctl_if,
VX_branch_ctl_if.slave branch_ctl_if [`NUM_ALU_BLOCKS],
VX_decode_sched_if.slave decode_sched_if,
VX_commit_sched_if.slave commit_sched_if,
@ -42,17 +43,18 @@ module VX_schedule import VX_gpu_pkg::*; #(
// status
output wire busy
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (CORE_ID)
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
reg [`NUM_WARPS-1:0] stalled_warps, stalled_warps_n; // set when branch/gpgpu instructions are issued
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
reg [`NUM_WARPS-1:0][`XLEN-1:0] warp_pcs, warp_pcs_n;
reg [`NUM_WARPS-1:0][`PC_BITS-1:0] warp_pcs, warp_pcs_n;
wire [`NW_WIDTH-1:0] schedule_wid;
wire [`NUM_THREADS-1:0] schedule_tmask;
wire [`XLEN-1:0] schedule_pc;
wire [`PC_BITS-1:0] schedule_pc;
wire schedule_valid;
wire schedule_ready;
@ -60,9 +62,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire join_valid;
wire join_is_dvg;
wire join_is_else;
wire [`NW_WIDTH-1:0] join_wid;
wire [`NW_WIDTH-1:0] join_wid;
wire [`NUM_THREADS-1:0] join_tmask;
wire [`XLEN-1:0] join_pc;
wire [`PC_BITS-1:0] join_pc;
reg [`PERF_CTR_BITS-1:0] cycles;
@ -72,10 +74,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
// branch
wire [`NUM_ALU_BLOCKS-1:0] branch_valid;
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_valid;
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
wire [`NUM_ALU_BLOCKS-1:0][`XLEN-1:0] branch_dest;
wire [`NUM_ALU_BLOCKS-1:0][`PC_BITS-1:0] branch_dest;
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
assign branch_valid[i] = branch_ctl_if[i].valid;
assign branch_wid[i] = branch_ctl_if[i].wid;
@ -85,47 +87,51 @@ module VX_schedule import VX_gpu_pkg::*; #(
// barriers
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
reg [`NUM_BARRIERS-1:0][`NW_WIDTH-1:0] barrier_ctrs, barrier_ctrs_n;
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
wire [`NUM_WARPS-1:0] curr_barrier_mask;
reg [`NUM_WARPS-1:0] curr_barrier_mask_p1;
`ifdef GBAR_ENABLE
reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
reg gbar_req_valid;
reg [`NB_WIDTH-1:0] gbar_req_id;
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
`endif
assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
`POP_COUNT(active_barrier_count, curr_barrier_mask);
`UNUSED_VAR (active_barrier_count)
// wspawn
wspawn_t wspawn;
reg [`NW_WIDTH-1:0] wspawn_wid;
reg is_single_warp;
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_warps_cnt;
`POP_COUNT(active_warps_cnt, active_warps);
always @(*) begin
active_warps_n = active_warps;
stalled_warps_n = stalled_warps;
thread_masks_n = thread_masks;
barrier_masks_n = barrier_masks;
barrier_ctrs_n = barrier_ctrs;
barrier_stalls_n= barrier_stalls;
warp_pcs_n = warp_pcs;
// wspawn handling
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
active_warps_n |= warp_ctl_if.wspawn.wmask;
if (wspawn.valid && is_single_warp) begin
active_warps_n |= wspawn.wmask;
for (integer i = 0; i < `NUM_WARPS; ++i) begin
if (warp_ctl_if.wspawn.wmask[i]) begin
if (wspawn.wmask[i]) begin
thread_masks_n[i][0] = 1;
warp_pcs_n[i] = warp_ctl_if.wspawn.pc;
warp_pcs_n[i] = wspawn.pc;
end
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
stalled_warps_n[wspawn_wid] = 0; // unlock warp
end
// TMC handling
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
active_warps_n[warp_ctl_if.wid] = (warp_ctl_if.tmc.tmask != 0);
thread_masks_n[warp_ctl_if.wid] = warp_ctl_if.tmc.tmask;
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
// split handling
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
if (warp_ctl_if.split.is_dvg) begin
@ -145,26 +151,30 @@ module VX_schedule import VX_gpu_pkg::*; #(
stalled_warps_n[join_wid] = 0; // unlock warp
end
// barrier handling
`ifdef GBAR_ENABLE
curr_barrier_mask_n = curr_barrier_mask;
curr_barrier_mask_n[warp_ctl_if.wid] = 1;
`endif
// barrier handling
curr_barrier_mask_p1 = barrier_masks[warp_ctl_if.barrier.id];
curr_barrier_mask_p1[warp_ctl_if.wid] = 1;
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
if (~warp_ctl_if.barrier.is_global
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
barrier_masks_n[warp_ctl_if.barrier.id] = '0;
barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
if (~warp_ctl_if.barrier.is_noop) begin
if (~warp_ctl_if.barrier.is_global
&& (barrier_ctrs[warp_ctl_if.barrier.id] == `NW_WIDTH'(warp_ctl_if.barrier.size_m1))) begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
barrier_masks_n[warp_ctl_if.barrier.id] = '0; // reset barrier mask
stalled_warps_n &= ~barrier_masks[warp_ctl_if.barrier.id]; // unlock warps
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end else begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = barrier_ctrs[warp_ctl_if.barrier.id] + `NW_WIDTH'(1);
barrier_masks_n[warp_ctl_if.barrier.id] = curr_barrier_mask_p1;
end
end else begin
barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
barrier_stalls_n[warp_ctl_if.wid] = 1;
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
`ifdef GBAR_ENABLE
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
barrier_masks_n[gbar_bus_if.rsp_id] = '0;
barrier_stalls_n = '0; // unlock all warps
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
stalled_warps_n = '0; // unlock all warps
end
`endif
@ -195,7 +205,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
// advance PC
if (schedule_if_fire) begin
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + 4;
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + `PC_BITS'(2);
end
end
@ -204,6 +214,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (reset) begin
barrier_masks <= '0;
barrier_ctrs <= '0;
`ifdef GBAR_ENABLE
gbar_req_valid <= 0;
`endif
@ -214,27 +225,43 @@ module VX_schedule import VX_gpu_pkg::*; #(
barrier_stalls <= '0;
issued_instrs <= '0;
cycles <= '0;
wspawn.valid <= 0;
// activate first warp
warp_pcs[0] <= base_dcrs.startup_addr;
warp_pcs[0] <= base_dcrs.startup_addr[1 +: `PC_BITS];
active_warps[0] <= 1;
thread_masks[0][0] <= 1;
is_single_warp <= 1;
end else begin
active_warps <= active_warps_n;
stalled_warps <= stalled_warps_n;
thread_masks <= thread_masks_n;
warp_pcs <= warp_pcs_n;
barrier_masks <= barrier_masks_n;
barrier_ctrs <= barrier_ctrs_n;
barrier_stalls <= barrier_stalls_n;
is_single_warp <= (active_warps_cnt == $bits(active_warps_cnt)'(1));
// wspawn handling
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
wspawn.valid <= 1;
wspawn.wmask <= warp_ctl_if.wspawn.wmask;
wspawn.pc <= warp_ctl_if.wspawn.pc;
wspawn_wid <= warp_ctl_if.wid;
end
if (wspawn.valid && is_single_warp) begin
wspawn.valid <= 0;
end
// global barrier scheduling
`ifdef GBAR_ENABLE
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid
&& warp_ctl_if.barrier.is_global
&& (curr_barrier_mask_n == active_warps)) begin
&& warp_ctl_if.barrier.is_global
&& !warp_ctl_if.barrier.is_noop
&& (curr_barrier_mask_p1 == active_warps)) begin
gbar_req_valid <= 1;
gbar_req_id <= warp_ctl_if.barrier.id;
gbar_req_size_m1 <= warp_ctl_if.barrier.size_m1[`NC_WIDTH-1:0];
gbar_req_size_m1 <= `NC_WIDTH'(warp_ctl_if.barrier.size_m1);
end
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
gbar_req_valid <= 0;
@ -247,7 +274,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
if (busy) begin
cycles <= cycles + 1;
end
end
end
end
@ -265,7 +292,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
`RESET_RELAY (split_join_reset, reset);
VX_split_join #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID))
) split_join (
.clk (clk),
.reset (split_join_reset),
@ -274,19 +301,21 @@ module VX_schedule import VX_gpu_pkg::*; #(
.split (warp_ctl_if.split),
.sjoin (warp_ctl_if.sjoin),
.join_valid (join_valid),
.join_is_dvg (join_is_dvg),
.join_is_else (join_is_else),
.join_wid (join_wid),
.join_is_dvg(join_is_dvg),
.join_is_else(join_is_else),
.join_wid (join_wid),
.join_tmask (join_tmask),
.join_pc (join_pc)
.join_pc (join_pc),
.stack_wid (warp_ctl_if.dvstack_wid),
.stack_ptr (warp_ctl_if.dvstack_ptr)
);
// schedule the next ready warp
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~stalled_warps;
VX_lzc #(
.N (`NUM_WARPS),
.N (`NUM_WARPS),
.REVERSE (1)
) wid_select (
.data_in (ready_warps),
@ -294,33 +323,40 @@ module VX_schedule import VX_gpu_pkg::*; #(
.valid_out (schedule_valid)
);
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `XLEN)-1:0] schedule_data;
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `PC_BITS)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
end
assign {schedule_tmask, schedule_pc} = {
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-1:(`NUM_THREADS + `XLEN)-4],
schedule_data[schedule_wid][(`NUM_THREADS + `XLEN)-5:0]
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-1:(`NUM_THREADS + `PC_BITS)-4],
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-5:0]
};
`ifndef NDEBUG
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
`ifdef SV_DPI
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 0, 0));
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 32'd0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid), 64'(schedule_pc)));
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid)));
end
end
`else
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
always @(*) begin
instr_uuid = `UUID_WIDTH'(w_uuid);
end
`endif
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
`endif
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `XLEN + `NW_WIDTH)
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH)
) out_buf (
.clk (clk),
.reset (reset),
@ -334,24 +370,42 @@ module VX_schedule import VX_gpu_pkg::*; #(
assign schedule_if.data.uuid = instr_uuid;
`RESET_RELAY (pending_instr_reset, reset);
// Track pending instructions per warp
wire no_pending_instr;
VX_pending_instr #(
.CTR_WIDTH (12),
.DECR_COUNT (`ISSUE_WIDTH),
.ALM_EMPTY (1)
) pending_instr(
.clk (clk),
.reset (pending_instr_reset),
.incr (schedule_if_fire),
.incr_wid (schedule_if.data.wid),
.decr (commit_sched_if.committed),
.decr_wid (commit_sched_if.committed_wid),
.alm_empty_wid (sched_csr_if.alm_empty_wid),
.alm_empty (sched_csr_if.alm_empty),
.empty (no_pending_instr)
);
reg [`NUM_WARPS-1:0] per_warp_incr;
always @(*) begin
per_warp_incr = 0;
if (schedule_if_fire) begin
per_warp_incr[schedule_if.data.wid] = 1;
end
end
wire [`NUM_WARPS-1:0] pending_warp_empty;
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
`RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #(
.SIZE (4096),
.ALM_EMPTY (1)
) counter (
.clk (clk),
.reset (pending_instr_reset[i]),
.incr (per_warp_incr[i]),
.decr (commit_sched_if.committed_warps[i]),
.empty (pending_warp_empty[i]),
.alm_empty (pending_warp_alm_empty[i]),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
assign sched_csr_if.alm_empty = pending_warp_alm_empty[sched_csr_if.alm_empty_wid];
wire no_pending_instr = (& pending_warp_empty);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
@ -359,7 +413,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
assign sched_csr_if.cycles = cycles;
assign sched_csr_if.active_warps = active_warps;
assign sched_csr_if.thread_masks = thread_masks;
// timeout handling
reg [31:0] timeout_ctr;
reg timeout_enable;
@ -378,9 +432,9 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** %s timeout: stalled_warps=%b", $time, INSTANCE_ID, stalled_warps))
`ifdef PERF_ENABLE
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
@ -390,15 +444,15 @@ module VX_schedule import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (reset) begin
perf_sched_idles <= '0;
perf_sched_stalls <= '0;
perf_sched_stalls <= '0;
end else begin
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
end
end
assign perf_schedule_if.sched_idles = perf_sched_idles;
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
assign sched_perf.idles = perf_sched_idles;
assign sched_perf.stalls = perf_sched_stalls;
`endif
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,63 +14,67 @@
`include "VX_define.vh"
module VX_scoreboard import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`ISSUE_WIDTH],
VX_ibuffer_if.master scoreboard_if [`ISSUE_WIDTH]
VX_writeback_if.slave writeback_if,
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_PARAM (CORE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + `XLEN + (`NR_BITS * 4) + 1;
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
reg [PER_ISSUE_WARPS-1:0] operands_ready;
`ifdef PERF_ENABLE
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_units_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_SFU_UNITS-1:0] perf_issue_sfu_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`ISSUE_WIDTH-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`ISSUE_WIDTH+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_units_reduce (
.data_in (perf_issue_units_per_cycle),
.data_in (perf_inuse_units_per_cycle),
.data_out (perf_units_per_cycle)
);
);
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`ISSUE_WIDTH),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_issue_sfu_per_cycle),
.data_in (perf_inuse_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
);
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER(perf_units_per_cycle_r, perf_units_per_cycle);
`BUFFER(perf_sfu_per_cycle_r, perf_sfu_per_cycle);
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign stg_valid_in[w] = staging_if[w].valid;
end
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
perf_stalls <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
perf_stalls <= perf_stalls + `PERF_CTR_BITS'(perf_stall_per_cycle);
end
end
@ -83,7 +87,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
@ -95,138 +99,221 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
) stanging_buf (
.clk (clk),
.reset (reset),
.valid_in (ibuffer_if[w].valid),
.data_in (ibuffer_if[w].data),
.ready_in (ibuffer_if[w].ready),
.valid_out(staging_if[w].valid),
.data_out (staging_if[w].data),
.ready_out(staging_if[w].ready)
);
end
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
reg [`NUM_REGS-1:0] inuse_regs;
wire inuse_rd = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd];
wire inuse_rs1 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1];
wire inuse_rs2 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2];
wire inuse_rs3 = inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3];
reg [3:0] operands_busy, operands_busy_n;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
wire writeback_fire = writeback_if.valid
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
`ifdef PERF_ENABLE
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin
case (scoreboard_if[i].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
default: sfu_type = `SFU_WCTL;
endcase
end
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
always @(*) begin
perf_issue_units_per_cycle[i] = '0;
perf_issue_sfu_per_cycle[i] = '0;
if (ibuffer_if[i].valid) begin
if (inuse_rd) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd]] = 1;
perf_inuse_units_per_cycle[w] = '0;
perf_inuse_sfu_per_cycle[w] = '0;
if (staging_if[w].valid) begin
if (operands_busy[0]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
end
end
if (inuse_rs1) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs1]] = 1;
if (operands_busy[1]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
end
end
if (inuse_rs2) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs2]] = 1;
if (operands_busy[2]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
end
end
if (inuse_rs3) begin
perf_issue_units_per_cycle[i][inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (inuse_units[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3] == `EX_SFU) begin
perf_issue_sfu_per_cycle[i][inuse_sfu[ibuffer_if[i].data.wis][ibuffer_if[i].data.rs3]] = 1;
if (operands_busy[3]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
end
end
end
end
assign perf_issue_stalls_per_cycle[i] = ibuffer_if[i].valid && ~ibuffer_if[i].ready;
`endif
reg [DATAW-1:0] data_out_r;
reg valid_out_r;
wire ready_out;
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire deps_ready = (& ready_masks);
wire valid_in = ibuffer_if[i].valid && deps_ready;
wire ready_in = ~valid_out_r && deps_ready;
wire [DATAW-1:0] data_in = ibuffer_if[i].data;
assign ready_out = scoreboard_if[i].ready;
always @(*) begin
operands_busy_n = operands_busy;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[w].data.rs3],
inuse_regs[ibuffer_if[w].data.rs2],
inuse_regs[ibuffer_if[w].data.rs1],
inuse_regs[ibuffer_if[w].data.rd]
};
end
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if.data.rd == staging_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end
end
if (staging_fire && staging_if[w].data.wb) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 1;
end
end
end
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
inuse_regs <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
inuse_regs[writeback_if.data.rd] <= 0;
end
if (~valid_out_r) begin
valid_out_r <= valid_in;
end else if (ready_out) begin
if (scoreboard_if[i].data.wb) begin
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
`ifdef PERF_ENABLE
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
if (scoreboard_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= sfu_type;
end
`endif
end
valid_out_r <= 0;
if (staging_fire && staging_if[w].data.wb) begin
inuse_regs[staging_if[w].data.rd] <= 1;
end
end
if (~valid_out_r) begin
data_out_r <= data_in;
operands_busy <= operands_busy_n;
operands_ready[w] <= ~(| operands_busy_n);
`ifdef PERF_ENABLE
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
if (staging_if[w].data.ex_type == `EX_SFU) begin
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
end
end
`endif
end
assign ibuffer_if[i].ready = ready_in;
assign scoreboard_if[i].valid = valid_out_r;
assign scoreboard_if[i].data = data_out_r;
`ifdef SIMULATION
reg [31:0] timeout_ctr;
reg [31:0] timeout_ctr;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= '0;
end else begin
if (ibuffer_if[i].valid && ~ibuffer_if[i].ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
end else begin
if (staging_if[w].valid && ~staging_if[w].ready) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_if[i].valid && ibuffer_if[i].ready) begin
end else if (ibuffer_fire) begin
timeout_ctr <= '0;
end
end
end
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, wis_to_wid(ibuffer_if[i].data.wis, i), ibuffer_if[i].data.PC, ibuffer_if[i].data.tmask, timeout_ctr,
~ready_masks, ibuffer_if[i].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
`endif
end
wire [PER_ISSUE_WARPS-1:0] arb_valid_in;
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
assign arb_data_in[w] = staging_if[w].data;
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("F"),
.LUTRAM (1),
.OUT_BUF (4) // using 2-cycle EB for area reduction
) out_arb (
.clk (clk),
.reset (arb_reset),
.valid_in (arb_valid_in),
.ready_in (arb_ready_in),
.data_in (arb_data_in),
.data_out ({
scoreboard_if.data.uuid,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.wb,
scoreboard_if.data.rd,
scoreboard_if.data.rs1,
scoreboard_if.data.rs2,
scoreboard_if.data.rs3
}),
.valid_out (scoreboard_if.valid),
.ready_out (scoreboard_if.ready),
.sel_out (scoreboard_if.data.wis)
);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,8 +14,9 @@
`include "VX_define.vh"
module VX_sfu_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
) (
input wire clk,
input wire reset,
@ -28,71 +29,68 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if.slave fpu_to_csr_if [`NUM_FPU_BLOCKS],
VX_fpu_csr_if.slave fpu_csr_if [`NUM_FPU_BLOCKS],
`endif
VX_commit_csr_if.slave commit_csr_if,
VX_sched_csr_if.slave sched_csr_if,
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_commit_csr_if.slave commit_csr_if,
VX_sched_csr_if.slave sched_csr_if,
VX_warp_ctl_if.master warp_ctl_if
VX_warp_ctl_if.master warp_ctl_if
);
`UNUSED_PARAM (CORE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `PC_BITS + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSRS = 1;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
) per_block_execute_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (1)
.OUT_BUF (1)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.reset (reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
.execute_if (per_block_execute_if)
);
wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in;
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
// Warp control block
// Warp control block
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) wctl_execute_if();
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) wctl_commit_if();
assign wctl_execute_if.valid = execute_if[0].valid && `INST_SFU_IS_WCTL(execute_if[0].data.op_type);
assign wctl_execute_if.data = execute_if[0].data;
assign wctl_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_WCTL(per_block_execute_if[0].data.op_type);
assign wctl_execute_if.data = per_block_execute_if[0].data;
`RESET_RELAY (wctl_reset, reset);
VX_wctl_unit #(
.CORE_ID (CORE_ID),
.INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)),
.NUM_LANES (NUM_LANES)
) wctl_unit (
.clk (clk),
.reset (wctl_reset),
.execute_if (wctl_execute_if),
.warp_ctl_if(warp_ctl_if),
.execute_if (wctl_execute_if),
.warp_ctl_if(warp_ctl_if),
.commit_if (wctl_commit_if)
);
@ -108,12 +106,13 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) csr_commit_if();
assign csr_execute_if.valid = execute_if[0].valid && `INST_SFU_IS_CSR(execute_if[0].data.op_type);
assign csr_execute_if.data = execute_if[0].data;
assign csr_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type);
assign csr_execute_if.data = per_block_execute_if[0].data;
`RESET_RELAY (csr_reset, reset);
VX_csr_unit #(
.INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)),
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) csr_unit (
@ -122,20 +121,20 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs),
.execute_if (csr_execute_if),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`ifdef EXT_F_ENABLE
.fpu_csr_if (fpu_csr_if),
`endif
.sched_csr_if (sched_csr_if),
.commit_csr_if (commit_csr_if),
.commit_if (csr_commit_if)
);
);
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
@ -145,18 +144,16 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
reg sfu_req_ready;
always @(*) begin
case (execute_if[0].data.op_type)
case (per_block_execute_if[0].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready;
endcase
end
assign execute_if[0].ready = sfu_req_ready;
assign per_block_execute_if[0].ready = sfu_req_ready;
// response arbitration
`RESET_RELAY (commit_reset, reset);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
@ -166,10 +163,10 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("R"),
.OUT_REG (3)
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (commit_reset),
.reset (reset),
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),
@ -182,10 +179,10 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (1)
.OUT_BUF (3)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.reset (reset),
.commit_in_if (arb_commit_if),
.commit_out_if (commit_if)
);

View file

@ -1,124 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_smem_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master dcache_bus_out_if [DCACHE_NUM_REQS]
);
`UNUSED_PARAM (CORE_ID)
localparam SMEM_ADDR_WIDTH = `SMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
wire [DCACHE_NUM_REQS-1:0] smem_req_valid;
wire [DCACHE_NUM_REQS-1:0] smem_req_rw;
wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag;
wire [DCACHE_NUM_REQS-1:0] smem_req_ready;
wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag;
wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready;
`RESET_RELAY (smem_reset, reset);
VX_shared_mem #(
.INSTANCE_ID($sformatf("core%0d-smem", CORE_ID)),
.SIZE (1 << `SMEM_LOG_SIZE),
.NUM_REQS (DCACHE_NUM_REQS),
.NUM_BANKS (`SMEM_NUM_BANKS),
.WORD_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (SMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) shared_mem (
.clk (clk),
.reset (smem_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
// Core request
.req_valid (smem_req_valid),
.req_rw (smem_req_rw),
.req_byteen (smem_req_byteen),
.req_addr (smem_req_addr),
.req_data (smem_req_data),
.req_tag (smem_req_tag),
.req_ready (smem_req_ready),
// Core response
.rsp_valid (smem_rsp_valid),
.rsp_data (smem_rsp_data),
.rsp_tag (smem_rsp_tag),
.rsp_ready (smem_rsp_ready)
);
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
`RESET_RELAY (switch_reset, reset);
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid;
assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw;
assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen;
assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data;
assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag;
assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i];
assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i];
assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i];
assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i];
assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready;
assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0];
VX_smem_switch #(
.NUM_REQS (2),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("P"),
.OUT_REG_REQ (2),
.OUT_REG_RSP (2)
) smem_switch (
.clk (clk),
.reset (switch_reset),
.bus_in_if (dcache_bus_in_if[i]),
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
);
end
// this bus goes to the dcache
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2]);
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_split_join import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -27,50 +27,58 @@ module VX_split_join import VX_gpu_pkg::*; #(
output wire join_is_else,
output wire [`NW_WIDTH-1:0] join_wid,
output wire [`NUM_THREADS-1:0] join_tmask,
output wire [`XLEN-1:0] join_pc
output wire [`PC_BITS-1:0] join_pc,
input wire [`NW_WIDTH-1:0] stack_wid,
output wire [`DV_STACK_SIZEW-1:0] stack_ptr
);
`UNUSED_PARAM (CORE_ID)
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
`UNUSED_SPARAM (INSTANCE_ID)
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [`DV_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
wire ipdom_set [`NUM_WARPS-1:0];
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q0 = {split.then_tmask | split.else_tmask, `XLEN'(0)};
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q1 = {split.else_tmask, split.next_pc};
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_q0 = {split.then_tmask | split.else_tmask, `PC_BITS'(0)};
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_q1 = {split.else_tmask, split.next_pc};
wire sjoin_is_dvg = (sjoin.stack_ptr != ipdom_q_ptr[wid]);
wire ipdom_push = valid && split.valid && split.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin_is_dvg;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (ipdom_reset, reset);
VX_ipdom_stack #(
.WIDTH (`XLEN+`NUM_THREADS),
.DEPTH (`UP(`NUM_THREADS-1))
.WIDTH (`NUM_THREADS+`PC_BITS),
.DEPTH (`DV_STACK_SIZE)
) ipdom_stack (
.clk (clk),
.reset (ipdom_reset),
.push (ipdom_push && (i == wid)),
.pop (ipdom_pop && (i == wid)),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),
.d_set (ipdom_set[i]),
.q_ptr (ipdom_q_ptr[i]),
.push (ipdom_push && (i == wid)),
.pop (ipdom_pop && (i == wid)),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
VX_pipe_register #(
.DATAW (1 + 1 + `NW_WIDTH + 1 + `XLEN + `NUM_THREADS),
.DATAW (1 + 1 + 1 + `NW_WIDTH + `NUM_THREADS + `PC_BITS),
.DEPTH (1),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({valid && sjoin.valid, sjoin.is_dvg, ipdom_set[wid], wid, ipdom_data[wid]}),
.data_out ({join_valid, join_is_dvg, join_is_else, join_wid, join_tmask, join_pc})
.data_in ({valid && sjoin.valid, sjoin_is_dvg, ipdom_set[wid], wid, ipdom_data[wid]}),
.data_out ({join_valid, join_is_dvg, join_is_else, join_wid, {join_tmask, join_pc}})
);
assign stack_ptr = ipdom_q_ptr[stack_wid];
endmodule

View file

@ -1,379 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TRACE_VH
`define VX_TRACE_VH
`ifndef SYNTHESIS
`include "VX_define.vh"
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
case (ex_type)
`EX_ALU: `TRACE(level, ("ALU"));
`EX_LSU: `TRACE(level, ("LSU"));
`EX_FPU: `TRACE(level, ("FPU"));
`EX_SFU: `TRACE(level, ("SFU"));
default: `TRACE(level, ("?"));
endcase
endtask
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input [`INST_MOD_BITS-1:0] op_mod,
`UNUSED_ARG(input [`NR_BITS-1:0] rd),
`UNUSED_ARG(input [`NR_BITS-1:0] rs2),
input use_imm,
`UNUSED_ARG(input [`XLEN-1:0] imm)
);
`ifdef FLEN_64
logic fdst_d = imm[0];
`else
logic fdst_d = 0;
`endif
`ifdef XLEN_64
logic fcvt_l = imm[1];
`else
logic fcvt_l = 0;
`endif
`ifdef EXT_F_ENABLE
logic rd_float = 1'(rd >> 5) || 1'(rs2 >> 5);
`else
logic rd_float = 0;
`endif
case (ex_type)
`EX_ALU: begin
if (`INST_ALU_IS_BR(op_mod)) begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: `TRACE(level, ("BEQ"));
`INST_BR_NE: `TRACE(level, ("BNE"));
`INST_BR_LT: `TRACE(level, ("BLT"));
`INST_BR_GE: `TRACE(level, ("BGE"));
`INST_BR_LTU: `TRACE(level, ("BLTU"));
`INST_BR_GEU: `TRACE(level, ("BGEU"));
`INST_BR_JAL: `TRACE(level, ("JAL"));
`INST_BR_JALR: `TRACE(level, ("JALR"));
`INST_BR_ECALL: `TRACE(level, ("ECALL"));
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"));
`INST_BR_URET: `TRACE(level, ("URET"));
`INST_BR_SRET: `TRACE(level, ("SRET"));
`INST_BR_MRET: `TRACE(level, ("MRET"));
default: `TRACE(level, ("?"));
endcase
end else if (`INST_ALU_IS_M(op_mod)) begin
if (`INST_ALU_IS_W(op_mod)) begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MULW"));
`INST_M_DIV: `TRACE(level, ("DIVW"));
`INST_M_DIVU: `TRACE(level, ("DIVUW"));
`INST_M_REM: `TRACE(level, ("REMW"));
`INST_M_REMU: `TRACE(level, ("REMUW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MUL"));
`INST_M_MULH: `TRACE(level, ("MULH"));
`INST_M_MULHSU:`TRACE(level, ("MULHSU"));
`INST_M_MULHU: `TRACE(level, ("MULHU"));
`INST_M_DIV: `TRACE(level, ("DIV"));
`INST_M_DIVU: `TRACE(level, ("DIVU"));
`INST_M_REM: `TRACE(level, ("REM"));
`INST_M_REMU: `TRACE(level, ("REMU"));
default: `TRACE(level, ("?"));
endcase
end
end else begin
if (`INST_ALU_IS_W(op_mod)) begin
if (use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDIW"));
`INST_ALU_SLL: `TRACE(level, ("SLLIW"));
`INST_ALU_SRL: `TRACE(level, ("SRLIW"));
`INST_ALU_SRA: `TRACE(level, ("SRAIW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDW"));
`INST_ALU_SUB: `TRACE(level, ("SUBW"));
`INST_ALU_SLL: `TRACE(level, ("SLLW"));
`INST_ALU_SRL: `TRACE(level, ("SRLW"));
`INST_ALU_SRA: `TRACE(level, ("SRAW"));
default: `TRACE(level, ("?"));
endcase
end
end else begin
if (use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDI"));
`INST_ALU_SLL: `TRACE(level, ("SLLI"));
`INST_ALU_SRL: `TRACE(level, ("SRLI"));
`INST_ALU_SRA: `TRACE(level, ("SRAI"));
`INST_ALU_SLT: `TRACE(level, ("SLTI"));
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"));
`INST_ALU_XOR: `TRACE(level, ("XORI"));
`INST_ALU_OR: `TRACE(level, ("ORI"));
`INST_ALU_AND: `TRACE(level, ("ANDI"));
`INST_ALU_LUI: `TRACE(level, ("LUI"));
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADD"));
`INST_ALU_SUB: `TRACE(level, ("SUB"));
`INST_ALU_SLL: `TRACE(level, ("SLL"));
`INST_ALU_SRL: `TRACE(level, ("SRL"));
`INST_ALU_SRA: `TRACE(level, ("SRA"));
`INST_ALU_SLT: `TRACE(level, ("SLT"));
`INST_ALU_SLTU: `TRACE(level, ("SLTU"));
`INST_ALU_XOR: `TRACE(level, ("XOR"));
`INST_ALU_OR: `TRACE(level, ("OR"));
`INST_ALU_AND: `TRACE(level, ("AND"));
default: `TRACE(level, ("?"));
endcase
end
end
end
end
`EX_LSU: begin
if (rd_float) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LW: `TRACE(level, ("FLW"));
`INST_LSU_LD: `TRACE(level, ("FLD"));
`INST_LSU_SW: `TRACE(level, ("FSW"));
`INST_LSU_SD: `TRACE(level, ("FSD"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LB: `TRACE(level, ("LB"));
`INST_LSU_LH: `TRACE(level, ("LH"));
`INST_LSU_LW: `TRACE(level, ("LW"));
`INST_LSU_LD: `TRACE(level, ("LD"));
`INST_LSU_LBU:`TRACE(level, ("LBU"));
`INST_LSU_LHU:`TRACE(level, ("LHU"));
`INST_LSU_LWU:`TRACE(level, ("LWU"));
`INST_LSU_SB: `TRACE(level, ("SB"));
`INST_LSU_SH: `TRACE(level, ("SH"));
`INST_LSU_SW: `TRACE(level, ("SW"));
`INST_LSU_SD: `TRACE(level, ("SD"));
`INST_LSU_FENCE:`TRACE(level,("FENCE"));
default: `TRACE(level, ("?"));
endcase
end
end
`EX_FPU: begin
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: begin
if (fdst_d)
`TRACE(level, ("FADD.D"));
else
`TRACE(level, ("FADD.S"));
end
`INST_FPU_SUB: begin
if (fdst_d)
`TRACE(level, ("FSUB.D"));
else
`TRACE(level, ("FSUB.S"));
end
`INST_FPU_MUL: begin
if (fdst_d)
`TRACE(level, ("FMUL.D"));
else
`TRACE(level, ("FMUL.S"));
end
`INST_FPU_DIV: begin
if (fdst_d)
`TRACE(level, ("FDIV.D"));
else
`TRACE(level, ("FDIV.S"));
end
`INST_FPU_SQRT: begin
if (fdst_d)
`TRACE(level, ("FSQRT.D"));
else
`TRACE(level, ("FSQRT.S"));
end
`INST_FPU_MADD: begin
if (fdst_d)
`TRACE(level, ("FMADD.D"));
else
`TRACE(level, ("FMADD.S"));
end
`INST_FPU_MSUB: begin
if (fdst_d)
`TRACE(level, ("FMSUB.D"));
else
`TRACE(level, ("FMSUB.S"));
end
`INST_FPU_NMADD: begin
if (fdst_d)
`TRACE(level, ("FNMADD.D"));
else
`TRACE(level, ("FNMADD.S"));
end
`INST_FPU_NMSUB: begin
if (fdst_d)
`TRACE(level, ("FNMSUB.D"));
else
`TRACE(level, ("FNMSUB.S"));
end
`INST_FPU_CMP: begin
if (fdst_d) begin
case (op_mod[1:0])
0: `TRACE(level, ("FLE.D"));
1: `TRACE(level, ("FLT.D"));
2: `TRACE(level, ("FEQ.D"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (op_mod[1:0])
0: `TRACE(level, ("FLE.S"));
1: `TRACE(level, ("FLT.S"));
2: `TRACE(level, ("FEQ.S"));
default: `TRACE(level, ("?"));
endcase
end
end
`INST_FPU_F2F: begin
if (fdst_d) begin
`TRACE(level, ("FCVT.D.S"));
end else begin
`TRACE(level, ("FCVT.S.D"));
end
end
`INST_FPU_F2I: begin
if (fdst_d) begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.L.D"));
end else begin
`TRACE(level, ("FCVT.W.D"));
end
end else begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.L.S"));
end else begin
`TRACE(level, ("FCVT.W.S"));
end
end
end
`INST_FPU_F2U: begin
if (fdst_d) begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.LU.D"));
end else begin
`TRACE(level, ("FCVT.WU.D"));
end
end else begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.LU.S"));
end else begin
`TRACE(level, ("FCVT.WU.S"));
end
end
end
`INST_FPU_I2F: begin
if (fdst_d) begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.D.L"));
end else begin
`TRACE(level, ("FCVT.D.W"));
end
end else begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.S.L"));
end else begin
`TRACE(level, ("FCVT.S.W"));
end
end
end
`INST_FPU_U2F: begin
if (fdst_d) begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.D.LU"));
end else begin
`TRACE(level, ("FCVT.D.WU"));
end
end else begin
if (fcvt_l) begin
`TRACE(level, ("FCVT.S.LU"));
end else begin
`TRACE(level, ("FCVT.S.WU"));
end
end
end
`INST_FPU_MISC: begin
if (fdst_d) begin
case (op_mod)
0: `TRACE(level, ("FSGNJ.D"));
1: `TRACE(level, ("FSGNJN.D"));
2: `TRACE(level, ("FSGNJX.D"));
3: `TRACE(level, ("FCLASS.D"));
4: `TRACE(level, ("FMV.X.D"));
5: `TRACE(level, ("FMV.D.X"));
6: `TRACE(level, ("FMIN.D"));
7: `TRACE(level, ("FMAX.D"));
endcase
end else begin
case (op_mod)
0: `TRACE(level, ("FSGNJ.S"));
1: `TRACE(level, ("FSGNJN.S"));
2: `TRACE(level, ("FSGNJX.S"));
3: `TRACE(level, ("FCLASS.S"));
4: `TRACE(level, ("FMV.X.S"));
5: `TRACE(level, ("FMV.S.X"));
6: `TRACE(level, ("FMIN.S"));
7: `TRACE(level, ("FMAX.S"));
endcase
end
end
default: `TRACE(level, ("?"));
endcase
end
`EX_SFU: begin
case (`INST_SFU_BITS'(op_type))
`INST_SFU_TMC: `TRACE(level, ("TMC"));
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
`INST_SFU_SPLIT: `TRACE(level, ("SPLIT"));
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
`INST_SFU_BAR: `TRACE(level, ("BAR"));
`INST_SFU_PRED: `TRACE(level, ("PRED"));
`INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
`INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
`INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
default: `TRACE(level, ("?"));
endcase
end
default: `TRACE(level, ("?"));
endcase
endtask
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
case (addr)
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"));
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"));
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"));
default: `TRACE(level, ("?"));
endcase
endtask
`endif
`endif // VX_TRACE_VH

399
hw/rtl/core/VX_trace_pkg.sv Normal file
View file

@ -0,0 +1,399 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TRACE_PKG_VH
`define VX_TRACE_PKG_VH
`include "VX_define.vh"
package VX_trace_pkg;
`ifdef SIMULATION
`ifdef SV_DPI
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
`endif
import VX_gpu_pkg::*;
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
case (ex_type)
`EX_ALU: `TRACE(level, ("ALU"));
`EX_LSU: `TRACE(level, ("LSU"));
`EX_FPU: `TRACE(level, ("FPU"));
`EX_SFU: `TRACE(level, ("SFU"));
default: `TRACE(level, ("?"));
endcase
endtask
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
case (op_args.alu.xtype)
`ALU_TYPE_ARITH: begin
if (op_args.alu.is_w) begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDIW"));
`INST_ALU_SLL: `TRACE(level, ("SLLIW"));
`INST_ALU_SRL: `TRACE(level, ("SRLIW"));
`INST_ALU_SRA: `TRACE(level, ("SRAIW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDW"));
`INST_ALU_SUB: `TRACE(level, ("SUBW"));
`INST_ALU_SLL: `TRACE(level, ("SLLW"));
`INST_ALU_SRL: `TRACE(level, ("SRLW"));
`INST_ALU_SRA: `TRACE(level, ("SRAW"));
default: `TRACE(level, ("?"));
endcase
end
end else begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDI"));
`INST_ALU_SLL: `TRACE(level, ("SLLI"));
`INST_ALU_SRL: `TRACE(level, ("SRLI"));
`INST_ALU_SRA: `TRACE(level, ("SRAI"));
`INST_ALU_SLT: `TRACE(level, ("SLTI"));
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"));
`INST_ALU_XOR: `TRACE(level, ("XORI"));
`INST_ALU_OR: `TRACE(level, ("ORI"));
`INST_ALU_AND: `TRACE(level, ("ANDI"));
`INST_ALU_LUI: `TRACE(level, ("LUI"));
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADD"));
`INST_ALU_SUB: `TRACE(level, ("SUB"));
`INST_ALU_SLL: `TRACE(level, ("SLL"));
`INST_ALU_SRL: `TRACE(level, ("SRL"));
`INST_ALU_SRA: `TRACE(level, ("SRA"));
`INST_ALU_SLT: `TRACE(level, ("SLT"));
`INST_ALU_SLTU: `TRACE(level, ("SLTU"));
`INST_ALU_XOR: `TRACE(level, ("XOR"));
`INST_ALU_OR: `TRACE(level, ("OR"));
`INST_ALU_AND: `TRACE(level, ("AND"));
`INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"));
`INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"));
default: `TRACE(level, ("?"));
endcase
end
end
end
`ALU_TYPE_BRANCH: begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: `TRACE(level, ("BEQ"));
`INST_BR_NE: `TRACE(level, ("BNE"));
`INST_BR_LT: `TRACE(level, ("BLT"));
`INST_BR_GE: `TRACE(level, ("BGE"));
`INST_BR_LTU: `TRACE(level, ("BLTU"));
`INST_BR_GEU: `TRACE(level, ("BGEU"));
`INST_BR_JAL: `TRACE(level, ("JAL"));
`INST_BR_JALR: `TRACE(level, ("JALR"));
`INST_BR_ECALL: `TRACE(level, ("ECALL"));
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"));
`INST_BR_URET: `TRACE(level, ("URET"));
`INST_BR_SRET: `TRACE(level, ("SRET"));
`INST_BR_MRET: `TRACE(level, ("MRET"));
default: `TRACE(level, ("?"));
endcase
end
`ALU_TYPE_MULDIV: begin
if (op_args.alu.is_w) begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MULW"));
`INST_M_DIV: `TRACE(level, ("DIVW"));
`INST_M_DIVU: `TRACE(level, ("DIVUW"));
`INST_M_REM: `TRACE(level, ("REMW"));
`INST_M_REMU: `TRACE(level, ("REMUW"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MUL"));
`INST_M_MULH: `TRACE(level, ("MULH"));
`INST_M_MULHSU:`TRACE(level, ("MULHSU"));
`INST_M_MULHU: `TRACE(level, ("MULHU"));
`INST_M_DIV: `TRACE(level, ("DIV"));
`INST_M_DIVU: `TRACE(level, ("DIVU"));
`INST_M_REM: `TRACE(level, ("REM"));
`INST_M_REMU: `TRACE(level, ("REMU"));
default: `TRACE(level, ("?"));
endcase
end
end
default: `TRACE(level, ("?"));
endcase
end
`EX_LSU: begin
if (op_args.lsu.is_float) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LW: `TRACE(level, ("FLW"));
`INST_LSU_LD: `TRACE(level, ("FLD"));
`INST_LSU_SW: `TRACE(level, ("FSW"));
`INST_LSU_SD: `TRACE(level, ("FSD"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LB: `TRACE(level, ("LB"));
`INST_LSU_LH: `TRACE(level, ("LH"));
`INST_LSU_LW: `TRACE(level, ("LW"));
`INST_LSU_LD: `TRACE(level, ("LD"));
`INST_LSU_LBU:`TRACE(level, ("LBU"));
`INST_LSU_LHU:`TRACE(level, ("LHU"));
`INST_LSU_LWU:`TRACE(level, ("LWU"));
`INST_LSU_SB: `TRACE(level, ("SB"));
`INST_LSU_SH: `TRACE(level, ("SH"));
`INST_LSU_SW: `TRACE(level, ("SW"));
`INST_LSU_SD: `TRACE(level, ("SD"));
`INST_LSU_FENCE:`TRACE(level,("FENCE"));
default: `TRACE(level, ("?"));
endcase
end
end
`EX_FPU: begin
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FADD.D"));
else
`TRACE(level, ("FADD.S"));
end
`INST_FPU_SUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FSUB.D"));
else
`TRACE(level, ("FSUB.S"));
end
`INST_FPU_MUL: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMUL.D"));
else
`TRACE(level, ("FMUL.S"));
end
`INST_FPU_DIV: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FDIV.D"));
else
`TRACE(level, ("FDIV.S"));
end
`INST_FPU_SQRT: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FSQRT.D"));
else
`TRACE(level, ("FSQRT.S"));
end
`INST_FPU_MADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMADD.D"));
else
`TRACE(level, ("FMADD.S"));
end
`INST_FPU_MSUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FMSUB.D"));
else
`TRACE(level, ("FMSUB.S"));
end
`INST_FPU_NMADD: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FNMADD.D"));
else
`TRACE(level, ("FNMADD.S"));
end
`INST_FPU_NMSUB: begin
if (op_args.fpu.fmt[0])
`TRACE(level, ("FNMSUB.D"));
else
`TRACE(level, ("FNMSUB.S"));
end
`INST_FPU_CMP: begin
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.D"));
1: `TRACE(level, ("FLT.D"));
2: `TRACE(level, ("FEQ.D"));
default: `TRACE(level, ("?"));
endcase
end else begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.S"));
1: `TRACE(level, ("FLT.S"));
2: `TRACE(level, ("FEQ.S"));
default: `TRACE(level, ("?"));
endcase
end
end
`INST_FPU_F2F: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FCVT.D.S"));
end else begin
`TRACE(level, ("FCVT.S.D"));
end
end
`INST_FPU_F2I: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.D"));
end else begin
`TRACE(level, ("FCVT.W.D"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.S"));
end else begin
`TRACE(level, ("FCVT.W.S"));
end
end
end
`INST_FPU_F2U: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.D"));
end else begin
`TRACE(level, ("FCVT.WU.D"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.S"));
end else begin
`TRACE(level, ("FCVT.WU.S"));
end
end
end
`INST_FPU_I2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.L"));
end else begin
`TRACE(level, ("FCVT.D.W"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.L"));
end else begin
`TRACE(level, ("FCVT.S.W"));
end
end
end
`INST_FPU_U2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.LU"));
end else begin
`TRACE(level, ("FCVT.D.WU"));
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.LU"));
end else begin
`TRACE(level, ("FCVT.S.WU"));
end
end
end
`INST_FPU_MISC: begin
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.D"));
1: `TRACE(level, ("FSGNJN.D"));
2: `TRACE(level, ("FSGNJX.D"));
3: `TRACE(level, ("FCLASS.D"));
4: `TRACE(level, ("FMV.X.D"));
5: `TRACE(level, ("FMV.D.X"));
6: `TRACE(level, ("FMIN.D"));
7: `TRACE(level, ("FMAX.D"));
endcase
end else begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.S"));
1: `TRACE(level, ("FSGNJN.S"));
2: `TRACE(level, ("FSGNJX.S"));
3: `TRACE(level, ("FCLASS.S"));
4: `TRACE(level, ("FMV.X.S"));
5: `TRACE(level, ("FMV.S.X"));
6: `TRACE(level, ("FMIN.S"));
7: `TRACE(level, ("FMAX.S"));
endcase
end
end
default: `TRACE(level, ("?"));
endcase
end
`EX_SFU: begin
case (`INST_SFU_BITS'(op_type))
`INST_SFU_TMC: `TRACE(level, ("TMC"));
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
`INST_SFU_SPLIT: begin if (op_args.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
`INST_SFU_BAR: `TRACE(level, ("BAR"));
`INST_SFU_PRED: begin if (op_args.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
`INST_SFU_CSRRW: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
`INST_SFU_CSRRS: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
`INST_SFU_CSRRC: begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
default: `TRACE(level, ("?"));
endcase
end
default: `TRACE(level, ("?"));
endcase
endtask
task trace_op_args(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm));
end
`EX_LSU: begin
`TRACE(level, (", offset=0x%0h", op_args.lsu.offset));
end
`EX_FPU: begin
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm));
end
`EX_SFU: begin
if (`INST_SFU_IS_CSR(op_type)) begin
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm));
end
end
default:;
endcase
endtask
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
case (addr)
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"));
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"));
`VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0"));
`VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1"));
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"));
default: `TRACE(level, ("?"));
endcase
endtask
`endif
endpackage
`endif // VX_TRACE_PKG_VH

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_wctl_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1
) (
input wire clk,
@ -22,22 +22,22 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1 + `DV_STACK_SIZEW;
`UNUSED_VAR (execute_if.data.rs3_data)
tmc_t tmc, tmc_r;
wspawn_t wspawn, wspawn_r;
wspawn_t wspawn, wspawn_r;
split_t split, split_r;
join_t sjoin, sjoin_r;
barrier_t barrier, barrier_r;
@ -55,14 +55,16 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
end else begin
assign tid = 0;
end
wire [`XLEN-1:0] rs1_data = execute_if.data.rs1_data[tid];
wire [`XLEN-1:0] rs2_data = execute_if.data.rs2_data[tid];
`UNUSED_VAR (rs1_data)
wire not_pred = execute_if.data.op_args.wctl.is_neg;
wire [NUM_LANES-1:0] taken;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign taken[i] = execute_if.data.rs1_data[i][0];
assign taken[i] = (execute_if.data.rs1_data[i][0] ^ not_pred);
end
reg [`NUM_THREADS-1:0] then_tmask_r, then_tmask_n;
@ -93,17 +95,27 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// split
wire [`CLOG2(`NUM_THREADS+1)-1:0] then_tmask_cnt, else_tmask_cnt;
`POP_COUNT(then_tmask_cnt, then_tmask_n);
`POP_COUNT(else_tmask_cnt, else_tmask_n);
wire then_first = (then_tmask_cnt >= else_tmask_cnt);
wire [`NUM_THREADS-1:0] taken_tmask = then_first ? then_tmask_n : else_tmask_n;
wire [`NUM_THREADS-1:0] ntaken_tmask = then_first ? else_tmask_n : then_tmask_n;
assign split.valid = is_split;
assign split.is_dvg = has_then && has_else;
assign split.then_tmask = then_tmask_n;
assign split.else_tmask = else_tmask_n;
assign split.next_pc = execute_if.data.PC + 4;
assign split.then_tmask = taken_tmask;
assign split.else_tmask = ntaken_tmask;
assign split.next_pc = execute_if.data.PC + `PC_BITS'(2);
assign warp_ctl_if.dvstack_wid = execute_if.data.wid;
wire [`DV_STACK_SIZEW-1:0] dvstack_ptr;
// join
assign sjoin.valid = is_join;
assign sjoin.is_dvg = rs1_data[0];
assign sjoin.valid = is_join;
assign sjoin.stack_ptr = rs1_data[`DV_STACK_SIZEW-1:0];
// barrier
assign barrier.valid = is_bar;
@ -114,6 +126,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign barrier.is_global = 1'b0;
`endif
assign barrier.size_m1 = rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
assign barrier.is_noop = (rs2_data[$bits(barrier.size_m1)-1:0] == $bits(barrier.size_m1)'(1));
// wspawn
@ -123,10 +136,10 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = rs2_data;
assign wspawn.pc = rs2_data[1 +: `PC_BITS];
// response
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2)
@ -135,8 +148,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (execute_if.valid),
.ready_in (execute_if.ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}, warp_ctl_if.dvstack_ptr}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}, dvstack_ptr}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
@ -148,9 +161,9 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign warp_ctl_if.split = split_r;
assign warp_ctl_if.sjoin = sjoin_r;
assign warp_ctl_if.barrier = barrier_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = `XLEN'(split_r.is_dvg);
assign commit_if.data.data[i] = `XLEN'(dvstack_ptr);
end
endmodule

Some files were not shown because too many files have changed in this diff Show more