This commit is contained in:
Hanran Wu 2024-08-23 17:44:24 -04:00
parent e7660b6ffe
commit ea9560b33b
186 changed files with 36003 additions and 4008 deletions

270
.github/workflows/ci.yml vendored Normal file
View file

@ -0,0 +1,270 @@
# Copyright © 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Continuous-integration workflow.
#
# Job graph: setup -> build_vm (xlen 32/64) -> test_vm (xlen 32/64) -> complete.
# `setup` fills the `tools` and `third_party` caches; the later jobs restore
# them using the same cache keys.
#
# The checkout/cache/artifact actions are pinned to v4: the v2 releases of
# actions/cache and actions/upload-artifact / download-artifact have been
# retired by GitHub and fail the job at step start.
#
# NOTE(review): the ubuntu-20.04 hosted runner image has also been retired by
# GitHub. It is left unchanged here because ci/system_updates.sh installs
# packages by name and must be re-checked against a newer image first.
name: CI

on: [push, pull_request]

jobs:
  # Builds the toolchain and third-party libraries once and stores them in the
  # cache. Every build step is skipped when the matching cache entry was hit.
  setup:
    runs-on: ubuntu-20.04
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive

      - name: Cache Toolchain Directory
        id: cache-toolchain
        uses: actions/cache@v4
        with:
          path: tools
          key: ${{ runner.os }}-toolchain-v0.1
          restore-keys: |
            ${{ runner.os }}-toolchain-

      - name: Cache Third Party Directory
        id: cache-thirdparty
        uses: actions/cache@v4
        with:
          path: third_party
          key: ${{ runner.os }}-thirdparty-v0.1
          restore-keys: |
            ${{ runner.os }}-thirdparty-

      # System packages are only needed when something has to be rebuilt.
      - name: Install Dependencies
        if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true'
        run: |
          sudo bash ./ci/system_updates.sh

      - name: Setup Toolchain
        if: steps.cache-toolchain.outputs.cache-hit != 'true'
        run: |
          TOOLDIR=$PWD/tools
          mkdir -p build
          cd build
          ../configure --tooldir=$TOOLDIR
          ci/toolchain_install.sh --all

      - name: Setup Third Party
        if: steps.cache-thirdparty.outputs.cache-hit != 'true'
        run: |
          make -C third_party > /dev/null

  # Disabled jobs, kept verbatim for reference. Before re-enabling them, apply
  # the same action version bumps as in the live jobs.
  # build:
  #   runs-on: ubuntu-20.04
  #   needs: setup
  #   strategy:
  #     matrix:
  #       xlen: [32, 64]
  #   steps:
  #     - name: Checkout code
  #       uses: actions/checkout@v2
  #     - name: Install Dependencies
  #       run: |
  #         sudo bash ./ci/system_updates.sh
  #     - name: Cache Toolchain Directory
  #       id: cache-toolchain
  #       uses: actions/cache@v2
  #       with:
  #         path: tools
  #         key: ${{ runner.os }}-toolchain-v0.1
  #         restore-keys: |
  #           ${{ runner.os }}-toolchain-
  #     - name: Cache Third Party Directory
  #       id: cache-thirdparty
  #       uses: actions/cache@v2
  #       with:
  #         path: third_party
  #         key: ${{ runner.os }}-thirdparty-v0.1
  #         restore-keys: |
  #           ${{ runner.os }}-thirdparty-
  #     - name: Run Build
  #       run: |
  #         TOOLDIR=$PWD/tools
  #         mkdir -p build${{ matrix.xlen }}
  #         cd build${{ matrix.xlen }}
  #         ../configure --tooldir=$TOOLDIR --xlen=${{ matrix.xlen }}
  #         source ci/toolchain_env.sh
  #         make software -s > /dev/null
  #         make tests -s > /dev/null
  #     - name: Upload Build Artifact
  #       uses: actions/upload-artifact@v2
  #       with:
  #         name: build-${{ matrix.xlen }}
  #         path: build${{ matrix.xlen }}
  # tests:
  #   runs-on: ubuntu-20.04
  #   needs: build
  #   strategy:
  #     matrix:
  #       name: [regression, opencl, config1, config2, debug, stress]
  #       xlen: [32, 64]
  #   steps:
  #     - name: Checkout code
  #       uses: actions/checkout@v2
  #     - name: Install Dependencies
  #       run: |
  #         sudo bash ./ci/system_updates.sh
  #     - name: Cache Toolchain Directory
  #       id: cache-toolchain
  #       uses: actions/cache@v2
  #       with:
  #         path: tools
  #         key: ${{ runner.os }}-toolchain-v0.1
  #         restore-keys: |
  #           ${{ runner.os }}-toolchain-
  #     - name: Cache Third Party Directory
  #       id: cache-thirdparty
  #       uses: actions/cache@v2
  #       with:
  #         path: third_party
  #         key: ${{ runner.os }}-thirdparty-v0.1
  #         restore-keys: |
  #           ${{ runner.os }}-thirdparty-
  #     - name: Download Build Artifact
  #       uses: actions/download-artifact@v2
  #       with:
  #         name: build-${{ matrix.xlen }}
  #         path: build${{ matrix.xlen }}
  #     - name: Run tests
  #       run: |
  #         cd build${{ matrix.xlen }}
  #         source ci/toolchain_env.sh
  #         chmod -R +x . # Ensure all files have executable permissions
  #         if [ "${{ matrix.name }}" == "regression" ]; then
  #           ./ci/regression.sh --unittest
  #           ./ci/regression.sh --isa
  #           ./ci/regression.sh --kernel
  #           ./ci/regression.sh --synthesis
  #           ./ci/regression.sh --regression
  #         else
  #           ./ci/regression.sh --${{ matrix.name }}
  #         fi

  # Configures with --vm_enable=1 for each XLEN, builds software and tests,
  # and publishes the build directory as an artifact for test_vm.
  build_vm:
    runs-on: ubuntu-20.04
    needs: setup
    strategy:
      matrix:
        xlen: [32, 64]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Dependencies
        run: |
          sudo bash ./ci/system_updates.sh

      - name: Cache Toolchain Directory
        id: cache-toolchain
        uses: actions/cache@v4
        with:
          path: tools
          key: ${{ runner.os }}-toolchain-v0.1
          restore-keys: |
            ${{ runner.os }}-toolchain-

      - name: Cache Third Party Directory
        id: cache-thirdparty
        uses: actions/cache@v4
        with:
          path: third_party
          key: ${{ runner.os }}-thirdparty-v0.1
          restore-keys: |
            ${{ runner.os }}-thirdparty-

      - name: Run Build
        run: |
          TOOLDIR=$PWD/tools
          mkdir -p build${{ matrix.xlen }}-vm
          cd build${{ matrix.xlen }}-vm
          ../configure --tooldir=$TOOLDIR --xlen=${{ matrix.xlen }} --vm_enable=1
          source ci/toolchain_env.sh
          make software -s > /dev/null
          make tests -s > /dev/null

      - name: Upload Build Artifact
        uses: actions/upload-artifact@v4
        with:
          # Artifact names must be unique within a run; the xlen suffix
          # keeps the two matrix legs apart.
          name: build-${{ matrix.xlen }}-vm
          path: build${{ matrix.xlen }}-vm
          # v4 skips dot-files by default; keep them so the uploaded tree
          # matches what the v2 action used to publish.
          include-hidden-files: true

  # Downloads the matching build artifact and runs the `--vm` regression set.
  test_vm:
    runs-on: ubuntu-20.04
    needs: build_vm
    strategy:
      matrix:
        xlen: [32, 64]
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Dependencies
        run: |
          sudo bash ./ci/system_updates.sh

      - name: Cache Toolchain Directory
        id: cache-toolchain
        uses: actions/cache@v4
        with:
          path: tools
          key: ${{ runner.os }}-toolchain-v0.1
          restore-keys: |
            ${{ runner.os }}-toolchain-

      - name: Cache Third Party Directory
        id: cache-thirdparty
        uses: actions/cache@v4
        with:
          path: third_party
          key: ${{ runner.os }}-thirdparty-v0.1
          restore-keys: |
            ${{ runner.os }}-thirdparty-

      - name: Download Build Artifact
        uses: actions/download-artifact@v4
        with:
          name: build-${{ matrix.xlen }}-vm
          path: build${{ matrix.xlen }}-vm

      - name: Run tests
        run: |
          cd build${{ matrix.xlen }}-vm
          source ci/toolchain_env.sh
          chmod -R +x . # Ensure all files have executable permissions
          ./ci/regression.sh --vm

  # Single terminal job: it only runs once every test_vm matrix leg passed.
  complete:
    runs-on: ubuntu-20.04
    needs: test_vm
    steps:
      - name: Check Completion
        run: echo "All matrix jobs passed"

1
.gitignore vendored
View file

@ -1,3 +1,4 @@
/build*
/.vscode
*.cache
*.code-workspace

3
.gitmodules vendored
View file

@ -6,5 +6,4 @@
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
[submodule "third_party/ramulator"]
path = third_party/ramulator
url = https://github.com/CMU-SAFARI/ramulator.git
ignore = dirty
url = https://github.com/CMU-SAFARI/ramulator2.git

View file

@ -1,118 +0,0 @@
# Travis CI configuration: builds the toolchain, third-party libraries and a
# per-XLEN build tree (each cached under $HOME), then runs the regression
# script in several test jobs.
language: cpp
dist: focal
os: linux
compiler: gcc

addons:
  apt:
    packages:
      - build-essential
      - valgrind
      - libstdc++6
      - binutils
      - python
      - uuid-dev

env:
  global:
    - TOOLDIR=$HOME/tools

cache:
  directories:
    - $TOOLDIR
    - $HOME/third_party
    - $HOME/build32
    - $HOME/build64

before_install:
  # Toolchain: reinstall unless the cached directory carries the expected
  # version stamp. The stamp written after installation must equal the value
  # tested here, otherwise the cache never validates and the toolchain is
  # reinstalled on every run.
  - if [ ! -d "$TOOLDIR" ] || [ -z "$(ls -A $TOOLDIR)" ] || [ "$(cat "$TOOLDIR/version.txt")" != "v0.4" ]; then
      rm -rf $TOOLDIR;
      mkdir -p $TRAVIS_BUILD_DIR/build && cd $TRAVIS_BUILD_DIR/build;
      ../configure --tooldir=$TOOLDIR;
      ci/toolchain_install.sh --all;
      echo "v0.4" > "$TOOLDIR/version.txt";
    else
      echo "using existing tooldir build";
    fi
  # Third-party libraries: same stamp scheme, copied to/from $HOME/third_party.
  - if [ ! -d "$HOME/third_party" ] || [ -z "$(ls -A $HOME/third_party)" ] || [ "$(cat "$HOME/third_party/version.txt")" != "v0.2" ]; then
      cd $TRAVIS_BUILD_DIR;
      make -C third_party > /dev/null;
      echo "v0.2" > "third_party/version.txt";
      cp -rf third_party $HOME;
    else
      echo "using existing third_party build";
      cp -rf $HOME/third_party $TRAVIS_BUILD_DIR;
    fi

install:
  # Build tree: reused only when it was produced from the current commit.
  - if [ ! -d "$HOME/build$XLEN" ] || [ -z "$(ls -A $HOME/build$XLEN)" ] || [ "$(cat "$HOME/build$XLEN/version.txt")" != "$TRAVIS_COMMIT" ]; then
      mkdir -p $TRAVIS_BUILD_DIR/build$XLEN && cd $TRAVIS_BUILD_DIR/build$XLEN;
      ../configure --tooldir=$TOOLDIR --xlen=$XLEN;
      source ci/toolchain_env.sh;
      make build -s > /dev/null;
      echo "$TRAVIS_COMMIT" > version.txt;
      cp -rf $TRAVIS_BUILD_DIR/build$XLEN $HOME;
    else
      echo "using existing build for commit $TRAVIS_COMMIT";
      cp -rf $HOME/build$XLEN $TRAVIS_BUILD_DIR;
    fi

before_script:
  - cd $TRAVIS_BUILD_DIR/build$XLEN
  - source ci/toolchain_env.sh

stages:
  - test

jobs:
  include:
    - stage: test
      name: regression32
      env: XLEN=32
      script:
        - ./ci/travis_run.py ./ci/regression.sh --unittest
        - ./ci/travis_run.py ./ci/regression.sh --isa
        - ./ci/travis_run.py ./ci/regression.sh --kernel
        - ./ci/travis_run.py ./ci/regression.sh --synthesis
        - ./ci/travis_run.py ./ci/regression.sh --regression
        - ./ci/travis_run.py ./ci/regression.sh --opencl
    - stage: test
      name: regression64
      env: XLEN=64
      script:
        - ./ci/travis_run.py ./ci/regression.sh --isa
        - ./ci/travis_run.py ./ci/regression.sh --kernel
        - ./ci/travis_run.py ./ci/regression.sh --synthesis
        - ./ci/travis_run.py ./ci/regression.sh --regression
        - ./ci/travis_run.py ./ci/regression.sh --opencl
    - stage: test
      name: config
      env: XLEN=32
      script:
        - ./ci/travis_run.py ./ci/regression.sh --cluster
        - ./ci/travis_run.py ./ci/regression.sh --config
    - stage: test
      name: debug
      env: XLEN=32
      script:
        - ./ci/travis_run.py ./ci/regression.sh --debug
        - ./ci/travis_run.py ./ci/regression.sh --stress
    # Both variables go on one `env:` line. A mapping may not repeat a key,
    # and parsers that tolerate the repeat typically keep only the last one,
    # which would leave XLEN unset for the install and before_script phases.
    - stage: test
      name: virtual_memory
      env: XLEN=32 VM_DISABLE=1
      script:
        - ./ci/travis_run.py ./ci/regression.sh --regression
        - ./ci/travis_run.py ./ci/regression.sh --opencl
    - stage: test
      name: virtual_memory
      env: XLEN=64 VM_DISABLE=1
      script:
        - ./ci/travis_run.py ./ci/regression.sh --regression
        - ./ci/travis_run.py ./ci/regression.sh --opencl

View file

@ -1,5 +1,15 @@
include config.mk
.PHONY: build software tests
vm:
$(MAKE) -C $(VORTEX_HOME)/third_party
$(MAKE) -C hw
$(MAKE) -C sim simx
$(MAKE) -C kernel
$(MAKE) -C runtime vm
$(MAKE) -C tests
all:
$(MAKE) -C $(VORTEX_HOME)/third_party
$(MAKE) -C hw
@ -15,13 +25,24 @@ build:
$(MAKE) -C runtime
$(MAKE) -C tests
clean:
software:
$(MAKE) -C hw
$(MAKE) -C kernel
$(MAKE) -C runtime/stub
tests:
$(MAKE) -C tests
clean-build:
$(MAKE) -C hw clean
$(MAKE) -C sim clean
$(MAKE) -C kernel clean
$(MAKE) -C runtime clean
$(MAKE) -C tests clean
clean: clean-build
$(MAKE) -C $(VORTEX_HOME)/third_party clean
# Install setup
KERNEL_INC_DST = $(PREFIX)/kernel/include
KERNEL_LIB_DST = $(PREFIX)/kernel/lib$(XLEN)

View file

@ -56,7 +56,7 @@ More detailed build instructions can be found [here](docs/install_vortex.md).
```
### Install Vortex codebase
```
git clone --depth=1 --recursive git@github.com:vortexgpgpu/vortex.git -b vortex_vm
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git -b vortex_vm
cd vortex
```
@ -68,18 +68,18 @@ More detailed build instructions can be found [here](docs/install_vortex.md).
mkdir out
export OUT_DIR=`pwd`/out
cd build
../configure --xlen=32 --tooldir=/software/vortex-toolchain-2024-6-14 --prefix=$OUT_DIR
# Run the following to disable the virtual memory feature in compilation
../configure --xlen=32 --tooldir=/software/vortex-toolchain-2024-2024-08-09 --prefix=$OUT_DIR
# Run the following instead to enable virtual memory feature in compilation
../configure --xlen=32 --tooldir=/software/vortex-toolchain-2024-2024-08-09 --prefix=$OUT_DIR --vm_enable=1
### Install prebuilt toolchain
# We will use the precompiled tools in the volvo toolchain directory
### set environment variables
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
### Building Vortex
make -s
### Quick demo running vecadd OpenCL kernel on 2 cores
$ ./ci/blackbox.sh --cores=2 --app=vecadd

View file

@ -25,37 +25,6 @@ XLEN=${XLEN:=@XLEN@}
echo "Vortex Regression Test: XLEN=$XLEN"
# Split a log file into numbered part files.
#
# Usage: split_file <filename> <start_with>
#
# Every line of <filename> that begins with the literal text <start_with>
# opens a new output file named "<filename>.partN" (N = 1, 2, ...). That line
# and all following lines up to the next such line are written to it. Lines
# that appear before the first match are discarded.
#
# Returns 1 on a wrong argument count or an unreadable input file. Prints a
# notice (and still returns 0) when no line matched.
split_file() {
    if [[ $# -ne 2 ]]; then
        echo "Usage: $0 <filename> <start_with>"
        return 1
    fi

    # Keep the working variables out of the caller's scope.
    local input_file="$1"
    local start_with="$2"
    local count=0
    local output_file=""
    local line

    if [[ ! -r "$input_file" ]]; then
        echo "Error: File '$input_file' is not readable or does not exist."
        return 1
    fi

    # `|| [[ -n "$line" ]]` also processes a final line that has no trailing
    # newline; `read` reports failure for it but still fills in $line.
    while IFS= read -r line || [[ -n "$line" ]]; do
        # Quote the prefix so characters such as [ ] * ? are matched
        # literally rather than interpreted as a glob pattern.
        if [[ $line == "$start_with"* ]]; then
            count=$((count + 1))
            output_file="$input_file.part$count"
            : > "$output_file" # ensure empty
        fi
        if [[ -n "$output_file" ]]; then
            # printf instead of echo: a line such as "-n" or "-e" must be
            # written verbatim, not consumed as an echo option.
            printf '%s\n' "$line" >> "$output_file"
        fi
    done < "$input_file"

    if [[ $count -eq 0 ]]; then
        echo "No lines starting with '$start_with' were found in '$input_file'."
    fi
}
###############################################################################
unittest()
{
make -C tests/unittest run
@ -66,6 +35,9 @@ isa()
{
echo "begin isa tests..."
make -C sim/simx
make -C sim/rtlsim
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
@ -96,8 +68,8 @@ isa()
make -C tests/riscv/isa run-rtlsim-64fx
fi
# restore default prebuilt configuration
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
# clean build
make -C sim/rtlsim clean
echo "isa tests done!"
}
@ -106,6 +78,9 @@ kernel()
{
echo "begin kernel tests..."
make -C sim/simx
make -C sim/rtlsim
make -C tests/kernel run-simx
make -C tests/kernel run-rtlsim
@ -116,6 +91,9 @@ regression()
{
echo "begin regression tests..."
make -C runtime/simx
make -C runtime/rtlsim
make -C tests/regression run-simx
make -C tests/regression run-rtlsim
@ -134,6 +112,9 @@ opencl()
{
echo "begin opencl tests..."
make -C runtime/simx
make -C runtime/rtlsim
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
@ -143,24 +124,28 @@ opencl()
echo "opencl tests done!"
}
cluster()
{
echo "begin clustering tests..."
vm(){
echo "begin vm tests..."
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
make -C sim/simx
make -C runtime/simx
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
make -C tests/kernel run-simx
echo "clustering tests done!"
# Regression tests
make -C tests/regression run-simx
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
# OpenCL tests
make -C tests/opencl run-simx
./ci/blackbox.sh --driver=simx --app=lbm --warps=8
echo "vm tests done!"
}
test_csv_trace()
@ -170,29 +155,20 @@ test_csv_trace()
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
split_file run_simx.log "Running "
split_file run_rtlsim.log "Running "
for file in ./run_simx.log.part*; do
if [[ -f "$file" ]]; then
file2="${file//simx/rtlsim}"
if [[ -f "$file2" ]]; then
./ci/trace_csv.py -tsimx $file -otrace_simx.csv
./ci/trace_csv.py -trtlsim $file2 -otrace_rtlsim.csv
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
diff trace_rtlsim.csv trace_simx.csv
else
echo "File $file2 not found."
fi
fi
done
# restore default prebuilt configuration
make -C sim/simx clean && make -C sim/simx > /dev/null
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
# clean build
make -C sim/simx clean
make -C sim/rtlsim clean
}
debug()
{
echo "begin debugging tests..."
test_csv_trace
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
@ -200,21 +176,23 @@ debug()
echo "debugging tests done!"
}
config()
config1()
{
echo "begin configuration tests..."
echo "begin configuration-1 tests..."
# warp/threads configurations
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
# warp/threads
./ci/blackbox.sh --driver=rtlsim --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --warps=8 --threads=16 --app=diverge
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
# issue width
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
@ -240,6 +218,31 @@ config()
CONFIGS="-DISSUE_WIDTH=2 -DNUM_LSU_BLOCK=1 -DNUM_LSU_LANES=2" ./ci/blackbox.sh --driver=simx --app=vecaddx
CONFIGS="-DISSUE_WIDTH=4 -DNUM_LSU_BLOCK=4 -DNUM_LSU_LANES=4" ./ci/blackbox.sh --driver=simx --app=vecaddx
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
# multiple L1 caches per socket
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=8 --warps=1 --threads=2
echo "configuration-1 tests done!"
}
config2()
{
echo "begin configuration-2 tests..."
# test opaesim
./ci/blackbox.sh --driver=opae --app=printf
./ci/blackbox.sh --driver=opae --app=diverge
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# custom program startup address
make -C tests/regression/dogfood clean-kernel
if [ "$XLEN" == "64" ]; then
@ -249,55 +252,57 @@ config()
fi
./ci/blackbox.sh --driver=simx --app=dogfood
./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-kernel
# disabling M & F extensions
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
make -C sim/rtlsim clean
# disabling ZICOND extension
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
# disable local memory
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=demo --perf=1
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
# multiple L1 caches per socket
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=8 --warps=1 --threads=2
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo --perf=1
CONFIGS="-DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=demo --perf=1
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=demo
# disable L1 cache
CONFIGS="-DL1_DISABLE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=4" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=4" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=4 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=4 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XLEN/8 -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=demo
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=demo
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=demo
echo "configuration tests done!"
echo "configuration-2 tests done!"
}
stress()
@ -306,9 +311,7 @@ stress()
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
echo "stress tests done!"
}
@ -318,7 +321,7 @@ synthesis()
echo "begin synthesis tests..."
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis
echo "synthesis tests done!"
}
@ -326,7 +329,7 @@ synthesis()
show_usage()
{
echo "Vortex Regression Test"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cluster] [--debug] [--config] [--stress] [--synthesis] [--all] [--h|--help]"
echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--config1] [--config2] [--debug] [--stress] [--synthesis] [--all] [--h|--help]"
}
start=$SECONDS
@ -336,6 +339,9 @@ clean=0
while [ "$1" != "" ]; do
case $1 in
--vm )
tests+=("vm")
;;
--clean )
clean=1
;;
@ -354,15 +360,15 @@ while [ "$1" != "" ]; do
--opencl )
tests+=("opencl")
;;
--cluster )
tests+=("cluster")
--config1 )
tests+=("config1")
;;
--config2 )
tests+=("config2")
;;
--debug )
tests+=("debug")
;;
--config )
tests+=("config")
;;
--stress )
tests+=("stress")
;;
@ -376,9 +382,9 @@ while [ "$1" != "" ]; do
tests+=("kernel")
tests+=("regression")
tests+=("opencl")
tests+=("cluster")
tests+=("config1")
tests+=("config2")
tests+=("debug")
tests+=("config")
tests+=("stress")
tests+=("synthesis")
;;

27
ci/system_updates.sh Executable file
View file

@ -0,0 +1,27 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Stop at the first command that fails.
set -e

# Refresh the package index, register the toolchain PPA, then refresh again
# so that the packages it provides become visible.
apt-get update -y
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update

# Install GCC/G++ 11 and register each one as an alternative for the
# unversioned command name, with priority 100.
apt-get install -y g++-11 gcc-11
for compiler in g++ gcc; do
    update-alternatives --install "/usr/bin/${compiler}" "${compiler}" "/usr/bin/${compiler}-11" 100
done

# Remaining packages.
apt-get install -y \
    build-essential \
    valgrind \
    libstdc++6 \
    binutils \
    python \
    uuid-dev \
    ccache

View file

@ -16,8 +16,8 @@
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
export VERILATOR_ROOT=$TOOLDIR/verilator
export PATH=$VERILATOR_ROOT/bin:$PATH
# export VERILATOR_ROOT=$TOOLDIR/verilator
# export PATH=$VERILATOR_ROOT/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v
export PATH=$SV2V_PATH/bin:$PATH

View file

@ -26,7 +26,7 @@ def parse_args():
parser.add_argument('log', help='Input log file')
return parser.parse_args()
def parse_simx(log_filename):
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"Instr (0x[0-9a-fA-F]+):"
opcode_pattern = r"Instr 0x[0-9a-fA-F]+: ([0-9a-zA-Z_\.]+)"
@ -37,9 +37,8 @@ def parse_simx(log_filename):
destination_pattern = r"Dest Reg: (.+)"
uuid_pattern = r"#(\d+)"
entries = []
with open(log_filename, 'r') as log_file:
instr_data = None
for lineno, line in enumerate(log_file, start=1):
for lineno, line in enumerate(log_lines, start=1):
try:
if line.startswith("DEBUG Fetch:"):
if instr_data:
@ -95,8 +94,9 @@ def append_value(text, reg, value, tmask_arr, sep):
text += "}"
return text, sep
def parse_rtlsim(log_filename):
line_pattern = r"\d+: core(\d+)-(decode|issue|commit)"
def parse_rtlsim(log_lines):
config_pattern = r"CONFIGS: num_threads=(\d+), num_warps=(\d+), num_cores=(\d+), num_clusters=(\d+), socket_size=(\d+), local_mem_base=(\d+), num_barriers=(\d+)"
line_pattern = r"\d+: cluster(\d+)-socket(\d+)-core(\d+)-(decode|issue|commit)"
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
instr_pattern = r"instr=(0x[0-9a-fA-F]+)"
ex_pattern = r"ex=([a-zA-Z]+)"
@ -116,23 +116,43 @@ def parse_rtlsim(log_filename):
eop_pattern = r"eop=(\d)"
uuid_pattern = r"#(\d+)"
entries = []
with open(log_filename, 'r') as log_file:
instr_data = {}
for lineno, line in enumerate(log_file, start=1):
num_threads = 0
num_warps = 0
num_cores = 0
num_clusters = 0
socket_size = 0
local_mem_base = 0
num_barriers = 0
num_sockets = 0
for lineno, line in enumerate(log_lines, start=1):
try:
config_match = re.search(config_pattern, line)
if config_match:
num_threads = int(config_match.group(1))
num_warps = int(config_match.group(2))
num_cores = int(config_match.group(3))
num_clusters = int(config_match.group(4))
socket_size = int(config_match.group(5))
local_mem_base = int(config_match.group(6))
num_barriers = int(config_match.group(7))
num_sockets = (num_cores + socket_size - 1) // socket_size
continue
line_match = re.search(line_pattern, line)
if line_match:
PC = re.search(pc_pattern, line).group(1)
warp_id = re.search(warp_id_pattern, line).group(1)
tmask = re.search(tmask_pattern, line).group(1)
uuid = re.search(uuid_pattern, line).group(1)
core_id = line_match.group(1)
stage = line_match.group(2)
cluster_id = line_match.group(1)
socket_id = line_match.group(2)
core_id = line_match.group(3)
stage = line_match.group(4)
if stage == "decode":
trace = {}
trace["uuid"] = uuid
trace["PC"] = PC
trace["core_id"] = core_id
trace["core_id"] = ((((cluster_id * num_sockets) + socket_id) * socket_size) + core_id)
trace["warp_id"] = warp_id
trace["tmask"] = reverse_binary(tmask)
trace["instr"] = re.search(instr_pattern, line).group(1)
@ -206,14 +226,20 @@ def parse_rtlsim(log_filename):
print("Error at line {}: {}".format(lineno, e))
return entries
def write_csv(log_filename, csv_filename, log_type):
def write_csv(sublogs, csv_filename, log_type):
with open(csv_filename, 'w', newline='') as csv_file:
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "destination", "operands"]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for sublog in sublogs:
entries = None
# parse log file
# parse sublog
if log_type == "rtlsim":
entries = parse_rtlsim(log_filename)
entries = parse_rtlsim(sublog)
elif log_type == "simx":
entries = parse_simx(log_filename)
entries = parse_simx(sublog)
else:
print('Error: invalid log type')
sys.exit()
@ -223,17 +249,33 @@ def write_csv(log_filename, csv_filename, log_type):
for entry in entries:
del entry['lineno']
# write to CSV
with open(csv_filename, 'w', newline='') as csv_file:
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "destination", "operands"]
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
writer.writeheader()
for entry in entries:
writer.writerow(entry)
def split_log_file(log_filename):
    """Split a log file into sub-logs, one per "[VXDRV] START" marker.

    Each sub-log is a list of raw lines (line endings kept). It begins with a
    line that starts with "[VXDRV] START" and runs up to, but not including,
    the next such line. Lines before the first marker are dropped, so a file
    without any marker yields an empty list.

    Args:
        log_filename: path of the log file to read.

    Returns:
        A list of sub-logs, each a list of strings, in file order.
    """
    marker = "[VXDRV] START"
    groups = []
    with open(log_filename, 'r') as handle:
        for text in handle:
            if text.startswith(marker):
                # A marker line opens a new group.
                groups.append([text])
            elif groups:
                # Any other line belongs to the most recently opened group.
                groups[-1].append(text)
    return groups
def main():
args = parse_args()
write_csv(args.log, args.csv, args.type)
sublogs = split_log_file(args.log)
write_csv(sublogs, args.csv, args.type)
if __name__ == "__main__":
main()

View file

@ -33,3 +33,7 @@ RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party
VM_ENABLE ?= @VM_ENABLE@

15
configure vendored
View file

@ -63,7 +63,7 @@ copy_files() {
filename_no_ext="${filename%.in}"
dest_file="$dest_dir/$filename_no_ext"
mkdir -p "$dest_dir"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@PREFIX@|$PREFIX|g" "$file" > "$dest_file"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@PREFIX@|$PREFIX|g; s|@VM_ENABLE@|$VM_ENABLE|g" "$file" > "$dest_file"
# apply permissions to bash scripts
read -r firstline < "$dest_file"
if [[ "$firstline" =~ ^#!.*bash ]]; then
@ -111,9 +111,10 @@ copy_files() {
# default configuration parameters
default_xlen=32
default_tooldir=/opt
default_tooldir=$HOME/tools
default_osversion=$(detect_osversion)
default_prefix=$CURRENT_DIR
default_vm=0
# load default configuration parameters from existing config.mk
if [ -f "config.mk" ]; then
@ -126,6 +127,7 @@ if [ -f "config.mk" ]; then
TOOLDIR\ ?*) default_tooldir=${value//\?=/} ;;
OSVERSION\ ?*) default_osversion=${value//\?=/} ;;
PREFIX\ ?*) default_prefix=${value//\?=/} ;;
VM_ENABLE\ ?*) default_vm=${value//\?=/} ;;
esac
done < config.mk
fi
@ -135,14 +137,16 @@ XLEN=${XLEN:=$default_xlen}
TOOLDIR=${TOOLDIR:=$default_tooldir}
OSVERSION=${OSVERSION:=$default_osversion}
PREFIX=${PREFIX:=$default_prefix}
VM_ENABLE=${VM_ENABLE:=$default_vm}
# parse command line arguments
# Print the supported command-line options, then exit with status 1.
# The synopsis lists every option accepted by the argument parser, and each
# option is described exactly once.
usage() {
    echo "Usage: $0 [--xlen=<value>] [--tooldir=<path>] [--osversion=<version>] [--prefix=<path>] [--vm_enable=<value>]"
    echo " --xlen=<value> Set the XLEN value (default: 32)"
    echo " --tooldir=<path> Set the TOOLDIR path (default: $HOME/tools)"
    # detect_osversion is the helper used for default_osversion; the stale
    # help line that called detect_os has been dropped.
    echo " --osversion=<version> Set the OS Version (default: $(detect_osversion))"
    echo " --prefix=<path> Set installation directory"
    echo " --vm_enable=<value> Enable Virtual Memory support (default: 0)"
    exit 1
}
while [[ "$#" -gt 0 ]]; do
@ -151,6 +155,7 @@ while [[ "$#" -gt 0 ]]; do
--tooldir=*) TOOLDIR="${1#*=}" ;;
--osversion=*) OSVERSION="${1#*=}" ;;
--prefix=*) PREFIX="${1#*=}" ;;
--vm_enable=*) VM_ENABLE="${1#*=}" ;;
-h|--help) usage ;;
*) echo "Unknown parameter passed: $1"; usage ;;
esac
@ -172,3 +177,5 @@ SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
THIRD_PARTY_DIR=$SCRIPT_DIR/third_party
# Instantiate every *.in template from the source tree into the build directory.
copy_files "$SCRIPT_DIR" "$CURRENT_DIR"
# Keep the expansion inside the quotes so the value is printed verbatim
# (no word splitting or glob expansion of user-supplied input).
echo "VM Enable: $VM_ENABLE"

79
docs/altera_fpga_guide.md Normal file
View file

@ -0,0 +1,79 @@
# FPGA Startup and Configuration Guide
OPAE Environment Setup
----------------------
$ source /opt/inteldevstack/init_env_user.sh
$ export OPAE_HOME=/opt/opae/1.1.2
$ export PATH=$OPAE_HOME/bin:$PATH
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
OPAE Build
------------------
The FPGA build has the following configuration options:
- DEVICE_FAMILY=arria10 | stratix10
- NUM_CORES=#n
Command line:
$ cd hw/syn/altera/opae
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
Setting TARGET=ase will build the project for simulation using Intel ASE.
OPAE Build Configuration
------------------------
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per core
- `NUM_THREADS`: Number of threads per warp
- `PERF_ENABLE`: enable the use of all profile counters
You can configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
OPAE Build Progress
-------------------
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 <build_dir>/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean
The bitstream file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
Signing the bitstream and Programming the FPGA
----------------------------------------------
$ cd <build_dir>
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
Sample FPGA Run Test
--------------------
Ensure you have the correct opae runtime for the FPGA target
$ make -C runtime/opae clean
$ TARGET=FPGA make -C runtime/opae
Run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"

View file

@ -7,7 +7,8 @@
- [Cache Subsystem](cache_subsystem.md)
- [Software](software.md)
- [Simulation](simulation.md)
- [FPGA Setup Guide](fpga_setup.md)
- [Altera FPGA Setup Guide](altera_fpga_guide.md)
- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md)
- [Debugging](debugging.md)
- [Useful Links](references.md)

36
docs/xilinx_fpga_guide.md Normal file
View file

@ -0,0 +1,36 @@
# FPGA Startup and Configuration Guide
XRT Environment Setup
----------------------
$ source /opt/xilinx/Vitis/2023.1/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
Check Installed FPGA Platforms
------------------------------
$ platforminfo -l
Build FPGA image
----------------
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make
This runs the synthesis under a new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
Sample FPGA Run Test
--------------------
Ensure you have the correct XRT runtime for the FPGA target
$ make -C runtime/xrt clean
$ TARGET=hw make -C runtime/xrt
Run the following from your Vortex build directory
$ TARGET=hw FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128"

View file

@ -16,6 +16,7 @@ VX_types.h: $(RTL_DIR)/VX_types.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
clean:
$(MAKE) -C unittest clean
rm -f VX_config.h VX_types.h
.PHONY: VX_config.h VX_types.h

View file

@ -14,8 +14,6 @@
`ifndef FLOAT_DPI_VH
`define FLOAT_DPI_VH
`include "VX_config.vh"
import "DPI-C" function void dpi_fadd(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fsub(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
import "DPI-C" function void dpi_fmul(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);

View file

@ -14,8 +14,6 @@
`ifndef UTIL_DPI_VH
`define UTIL_DPI_VH
`include "VX_config.vh"
`ifdef XLEN_64
`define INT_TYPE longint
`else

View file

@ -14,7 +14,8 @@
`include "VX_define.vh"
module VX_cluster import VX_gpu_pkg::*; #(
parameter CLUSTER_ID = 0
parameter CLUSTER_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -85,7 +86,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
`RESET_RELAY (l2_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID ("l2cache"),
.INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)),
.CACHE_SIZE (`L2_CACHE_SIZE),
.LINE_SIZE (`L2_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS),
@ -98,6 +99,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
.MREQ_SIZE (`L2_MREQ_SIZE),
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
@ -122,17 +124,19 @@ module VX_cluster import VX_gpu_pkg::*; #(
wire [`NUM_SOCKETS-1:0] per_socket_busy;
VX_dcr_bus_if socket_dcr_bus_if();
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
// Generate all sockets
for (genvar i = 0; i < `NUM_SOCKETS; ++i) begin
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : sockets
`RESET_RELAY (socket_reset, reset);
VX_socket #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
.INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id))
) socket (
`SCOPE_IO_BIND (scope_socket+i)
`SCOPE_IO_BIND (scope_socket+socket_id)
.clk (clk),
.reset (socket_reset),
@ -143,13 +147,13 @@ module VX_cluster import VX_gpu_pkg::*; #(
.dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[i]),
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[i]),
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
`endif
.busy (per_socket_busy[i])
.busy (per_socket_busy[socket_id])
);
end

View file

@ -33,10 +33,6 @@
`endif
///////////////////////////////////////////////////////////////////////////////
`ifndef VM_DISABLE
`define VM_ENABLE
`endif
`ifndef EXT_M_DISABLE
`define EXT_M_ENABLE
`endif
@ -114,7 +110,6 @@
`ifndef SOCKET_SIZE
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
`ifdef L2_ENABLE
`define L2_ENABLED 1
@ -357,7 +352,7 @@
// Number of SFU units
`ifndef NUM_SFU_LANES
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
`define NUM_SFU_LANES `NUM_THREADS
`endif
`ifndef NUM_SFU_BLOCKS
`define NUM_SFU_BLOCKS 1
@ -481,22 +476,27 @@
`define LATENCY_FCVT 5
`endif
// FMA Bandwidth ratio
`ifndef FMA_PE_RATIO
`define FMA_PE_RATIO 1
`endif
// FDIV Bandwidth ratio
`ifndef FDIV_PE_RATIO
`define FDIV_PE_RATIO 8
`endif
// FSQRT Bandwidth ratio
`ifndef FSQRT_PE_RATIO
`define FSQRT_PE_RATIO 8
`endif
// FCVT Bandwidth ratio
`ifndef FCVT_PE_RATIO
`define FCVT_PE_RATIO 8
`endif
// FNCP Bandwidth ratio
`ifndef FNCP_PE_RATIO
`define FNCP_PE_RATIO 2
`endif
@ -603,7 +603,12 @@
`define DCACHE_NUM_WAYS 1
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
// Enable Cache Writeback
`ifndef DCACHE_WRITEBACK
`define DCACHE_WRITEBACK 0
`endif
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
`define LMEM_ENABLE
@ -662,6 +667,11 @@
`define L2_NUM_WAYS 2
`endif
// Enable Cache Writeback
`ifndef L2_WRITEBACK
`define L2_WRITEBACK 0
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
@ -703,6 +713,11 @@
`define L3_NUM_WAYS 4
`endif
// Enable Cache Writeback
`ifndef L3_WRITEBACK
`define L3_WRITEBACK 0
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE

View file

@ -59,6 +59,8 @@
`define OFFSET_BITS 12
`define IMM_BITS `XLEN
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
///////////////////////////////////////////////////////////////////////////////
`define EX_ALU 0
@ -296,6 +298,7 @@
`ifdef ICACHE_ENABLE
`define L1_ENABLE
`endif
`ifdef DCACHE_ENABLE
`define L1_ENABLE
`endif
@ -322,7 +325,7 @@
.DATAW ($bits(dst)), \
.RESETW ($bits(dst)), \
.DEPTH (latency) \
) __``dst ( \
) __``dst``__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -336,13 +339,18 @@
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out ( \
) __``out``__ ( \
.data_in (in), \
.data_out (out) \
)
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
`define ASSIGN_VX_IF(dst, src) \
assign dst.valid = src.valid; \
assign dst.data = src.data; \
assign src.ready = dst.ready
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
@ -377,42 +385,42 @@
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
if (enable) begin \
reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \
always @(posedge clk) begin \
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
__dst <= {src.write_valid, src.write_addr, src.write_data}; \
end \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \
end else begin \
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
end \
VX_dcr_bus_if dst(); \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \
end
`define PERF_COUNTER_ADD(dst, src, field, width, dst_count, src_count, reg_enable) \
for (genvar __d = 0; __d < dst_count; ++__d) begin \
localparam __count = ((src_count > dst_count) ? `CDIV(src_count, dst_count) : 1); \
wire [__count-1:0][width-1:0] __reduce_add_i_``src``field; \
wire [width-1:0] __reduce_add_o_``dst``field; \
for (genvar __i = 0; __i < __count; ++__i) begin \
assign __reduce_add_i_``src``field[__i] = ``src[__d * __count + __i].``field; \
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
if (count > 1) begin \
wire [count-1:0][width-1:0] __reduce_add_i_field; \
wire [width-1:0] __reduce_add_o_field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(__count), .OP("+")) __reduce_add_``dst``field ( \
__reduce_add_i_``src``field, \
__reduce_add_o_``dst``field \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \
if (reg_enable) begin \
reg [width-1:0] __reduce_add_r_``dst``field; \
reg [width-1:0] __reduce_add_r_field; \
always @(posedge clk) begin \
if (reset) begin \
__reduce_add_r_``dst``field <= '0; \
__reduce_add_r_field <= '0; \
end else begin \
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
__reduce_add_r_field <= __reduce_add_o_field; \
end \
end \
assign ``dst[__d].``field = __reduce_add_r_``dst``field; \
assign dst.``field = __reduce_add_r_field; \
end else begin \
assign ``dst[__d].``field = __reduce_add_o_``dst``field; \
assign dst.``field = __reduce_add_o_field; \
end \
end else begin \
assign dst.``field = src[0].``field; \
end
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
@ -426,20 +434,4 @@
assign dst = src; \
end
`define TO_DISPATCH_DATA(data, tid) { \
data.uuid, \
data.wis, \
data.tmask, \
data.PC, \
data.op_type, \
data.op_args, \
data.wb, \
data.rd, \
tid, \
data.rs1_data, \
data.rs2_data, \
data.rs3_data}
///////////////////////////////////////////////////////////////////////////////
`endif // VX_DEFINE_VH

View file

@ -60,6 +60,8 @@ package VX_gpu_pkg;
logic [7:0] mpm_class;
} base_dcrs_t;
//////////////////////////// Perf counter types ///////////////////////////
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -77,48 +79,63 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] latency;
} mem_perf_t;
// Scheduler performance counters; each field is `PERF_CTR_BITS wide.
// NOTE(review): presumably idle-cycle and stall-cycle counts — confirm against the scheduler.
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] idles;
logic [`PERF_CTR_BITS-1:0] stalls;
} sched_perf_t;
// Issue-stage performance counters; each counter is `PERF_CTR_BITS wide.
// NOTE(review): ibf/scb/opd presumably stand for instruction buffer, scoreboard
// and operand stage — confirm against the issue logic.
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] ibf_stalls;
logic [`PERF_CTR_BITS-1:0] scb_stalls;
logic [`PERF_CTR_BITS-1:0] opd_stalls;
// One counter per execution unit (`NUM_EX_UNITS entries).
logic [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] units_uses;
// One counter per SFU sub-unit (`NUM_SFU_UNITS entries).
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
logic use_PC;
logic use_imm;
logic is_w;
logic [`ALU_TYPE_BITS-1:0] xtype;
logic [`IMM_BITS-1:0] imm;
} alu_mod_t;
} alu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [($bits(alu_args_t)-`INST_FRM_BITS-`INST_FMT_BITS)-1:0] __padding;
logic [`INST_FRM_BITS-1:0] frm;
logic [`INST_FMT_BITS-1:0] fmt;
} fpu_mod_t;
} fpu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic [($bits(alu_args_t)-1-1-`OFFSET_BITS)-1:0] __padding;
logic is_store;
logic is_float;
logic [`OFFSET_BITS-1:0] offset;
} lsu_mod_t;
} lsu_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic [($bits(alu_args_t)-1-`VX_CSR_ADDR_BITS-5)-1:0] __padding;
logic use_imm;
logic [`VX_CSR_ADDR_BITS-1:0] addr;
logic [4:0] imm;
} csr_mod_t;
} csr_args_t;
typedef struct packed {
logic [($bits(alu_mod_t)-1)-1:0] __padding;
logic [($bits(alu_args_t)-1)-1:0] __padding;
logic is_neg;
} wctl_mod_t;
} wctl_args_t;
typedef union packed {
alu_mod_t alu;
fpu_mod_t fpu;
lsu_mod_t lsu;
csr_mod_t csr;
wctl_mod_t wctl;
alu_args_t alu;
fpu_args_t fpu;
lsu_args_t lsu;
csr_args_t csr;
wctl_args_t wctl;
} op_args_t;
/* verilator lint_off UNUSED */
`IGNORE_UNUSED_BEGIN
///////////////////////// LSU memory Parameters ///////////////////////////
@ -129,6 +146,31 @@ package VX_gpu_pkg;
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
////////////////////////// Icache Parameters //////////////////////////////
// Word size in bytes
localparam ICACHE_WORD_SIZE = 4;
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
// Memory request data bits
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
// Word size in bytes
@ -160,31 +202,6 @@ package VX_gpu_pkg;
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
////////////////////////// Icache Parameters //////////////////////////////
// Word size in bytes
localparam ICACHE_WORD_SIZE = 4;
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
// Memory request data bits
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
@ -235,17 +252,14 @@ package VX_gpu_pkg;
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/* verilator lint_on UNUSED */
/////////////////////////////// Issue parameters //////////////////////////
localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
localparam PER_ISSUE_WARPS = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS = `CLOG2(PER_ISSUE_WARPS);
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_ISW_W-1:0] isw
@ -278,6 +292,20 @@ package VX_gpu_pkg;
wid_to_wis = 0;
end
endfunction
///////////////////////// Miscellaneous functions /////////////////////////
// Select the SFU sub-unit for an SFU opcode: the three CSR access opcodes
// (CSRRW/CSRRS/CSRRC) map to `SFU_CSRS, every other opcode maps to `SFU_WCTL.
function logic [`SFU_WIDTH-1:0] op_to_sfu_type(
    input logic [`INST_OP_BITS-1:0] op_type
);
    if ((op_type == `INST_SFU_CSRRW)
     || (op_type == `INST_SFU_CSRRS)
     || (op_type == `INST_SFU_CSRRC)) begin
        op_to_sfu_type = `SFU_CSRS;
    end else begin
        op_to_sfu_type = `SFU_WCTL;
    end
endfunction
`IGNORE_UNUSED_END
endpackage

View file

@ -47,7 +47,7 @@
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define TRACE(level, args) $write args
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`else
`ifdef VERILATOR
`define TRACING_ON /* verilator tracing_on */
@ -112,8 +112,14 @@
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
x \
/* verilator lint_on UNUSED */
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`endif
`ifdef SV_DPI
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`else
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`endif
`endif
`ifdef SIMULATION

View file

@ -14,7 +14,8 @@
`include "VX_define.vh"
module VX_socket import VX_gpu_pkg::*; #(
parameter SOCKET_ID = 0
parameter SOCKET_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -40,6 +41,11 @@ module VX_socket import VX_gpu_pkg::*; #(
output wire busy
);
`ifdef SCOPE
localparam scope_core = 0;
`SCOPE_IO_SWITCH (`SOCKET_SIZE);
`endif
`ifdef GBAR_ENABLE
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
@ -81,7 +87,7 @@ module VX_socket import VX_gpu_pkg::*; #(
`RESET_RELAY (icache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-icache", SOCKET_ID)),
.INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)),
.NUM_UNITS (`NUM_ICACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -126,7 +132,7 @@ module VX_socket import VX_gpu_pkg::*; #(
`RESET_RELAY (dcache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
.INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)),
.NUM_UNITS (`NUM_DCACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -143,8 +149,9 @@ module VX_socket import VX_gpu_pkg::*; #(
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.NC_ENABLE (1),
.CORE_OUT_BUF (`LMEM_ENABLED ? 2 : 1),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2)
) dcache (
`ifdef PERF_ENABLE
@ -194,19 +201,19 @@ module VX_socket import VX_gpu_pkg::*; #(
wire [`SOCKET_SIZE-1:0] per_core_busy;
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
`SCOPE_IO_SWITCH (`SOCKET_SIZE)
// Generate all cores
for (genvar i = 0; i < `SOCKET_SIZE; ++i) begin
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : cores
`RESET_RELAY (core_reset, reset);
VX_core #(
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + i)
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id),
.INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id))
) core (
`SCOPE_IO_BIND (i)
`SCOPE_IO_BIND (scope_core + core_id)
.clk (clk),
.reset (core_reset),
@ -217,15 +224,15 @@ module VX_socket import VX_gpu_pkg::*; #(
.dcr_bus_if (core_dcr_bus_if),
.dcache_bus_if (per_core_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.dcache_bus_if (per_core_dcache_bus_if[core_id * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
.icache_bus_if (per_core_icache_bus_if[i]),
.icache_bus_if (per_core_icache_bus_if[core_id]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_core_gbar_bus_if[i]),
.gbar_bus_if (per_core_gbar_bus_if[core_id]),
`endif
.busy (per_core_busy[i])
.busy (per_core_busy[core_id])
);
end

View file

@ -85,30 +85,31 @@
`define VX_CSR_MPM_IBUF_ST_H 12'hB85
`define VX_CSR_MPM_SCRB_ST 12'hB06
`define VX_CSR_MPM_SCRB_ST_H 12'hB86
`define VX_CSR_MPM_SCRB_ALU 12'hB07
`define VX_CSR_MPM_SCRB_ALU_H 12'hB87
`define VX_CSR_MPM_SCRB_FPU 12'hB08
`define VX_CSR_MPM_SCRB_FPU_H 12'hB88
`define VX_CSR_MPM_SCRB_LSU 12'hB09
`define VX_CSR_MPM_SCRB_LSU_H 12'hB89
`define VX_CSR_MPM_SCRB_SFU 12'hB0A
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8A
`define VX_CSR_MPM_OPDS_ST 12'hB07
`define VX_CSR_MPM_OPDS_ST_H 12'hB87
`define VX_CSR_MPM_SCRB_ALU 12'hB08
`define VX_CSR_MPM_SCRB_ALU_H 12'hB88
`define VX_CSR_MPM_SCRB_FPU 12'hB09
`define VX_CSR_MPM_SCRB_FPU_H 12'hB89
`define VX_CSR_MPM_SCRB_LSU 12'hB0A
`define VX_CSR_MPM_SCRB_LSU_H 12'hB8A
`define VX_CSR_MPM_SCRB_SFU 12'hB0B
`define VX_CSR_MPM_SCRB_SFU_H 12'hB8B
`define VX_CSR_MPM_SCRB_CSRS 12'hB0C
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB8C
`define VX_CSR_MPM_SCRB_WCTL 12'hB0D
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB8D
// PERF: memory
`define VX_CSR_MPM_IFETCHES 12'hB0B
`define VX_CSR_MPM_IFETCHES_H 12'hB8B
`define VX_CSR_MPM_LOADS 12'hB0C
`define VX_CSR_MPM_LOADS_H 12'hB8C
`define VX_CSR_MPM_STORES 12'hB0D
`define VX_CSR_MPM_STORES_H 12'hB8D
`define VX_CSR_MPM_IFETCH_LT 12'hB0E
`define VX_CSR_MPM_IFETCH_LT_H 12'hB8E
`define VX_CSR_MPM_LOAD_LT 12'hB0F
`define VX_CSR_MPM_LOAD_LT_H 12'hB8F
// SFU: scoreboard
`define VX_CSR_MPM_SCRB_WCTL 12'hB10
`define VX_CSR_MPM_SCRB_WCTL_H 12'hB90
`define VX_CSR_MPM_SCRB_CSRS 12'hB11
`define VX_CSR_MPM_SCRB_CSRS_H 12'hB91
`define VX_CSR_MPM_IFETCHES 12'hB0E
`define VX_CSR_MPM_IFETCHES_H 12'hB8E
`define VX_CSR_MPM_LOADS 12'hB0F
`define VX_CSR_MPM_LOADS_H 12'hB8F
`define VX_CSR_MPM_STORES 12'hB10
`define VX_CSR_MPM_STORES_H 12'hB90
`define VX_CSR_MPM_IFETCH_LT 12'hB11
`define VX_CSR_MPM_IFETCH_LT_H 12'hB91
`define VX_CSR_MPM_LOAD_LT 12'hB12
`define VX_CSR_MPM_LOAD_LT_H 12'hB92
// Machine Performance-monitoring memory counters (class 2) ///////////////////

View file

@ -44,6 +44,11 @@ module Vortex import VX_gpu_pkg::*; (
output wire busy
);
`ifdef SCOPE
localparam scope_cluster = 0;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS);
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
@ -78,6 +83,7 @@ module Vortex import VX_gpu_pkg::*; (
.MREQ_SIZE (`L3_MREQ_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
@ -121,19 +127,19 @@ module Vortex import VX_gpu_pkg::*; (
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
`SCOPE_IO_SWITCH (`NUM_CLUSTERS)
// Generate all clusters
for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : clusters
`RESET_RELAY (cluster_reset, reset);
VX_dcr_bus_if cluster_dcr_bus_if();
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
VX_cluster #(
.CLUSTER_ID (i)
.CLUSTER_ID (cluster_id),
.INSTANCE_ID ($sformatf("cluster%0d", cluster_id))
) cluster (
`SCOPE_IO_BIND (i)
`SCOPE_IO_BIND (scope_cluster + cluster_id)
.clk (clk),
.reset (cluster_reset),
@ -144,9 +150,9 @@ module Vortex import VX_gpu_pkg::*; (
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[i]),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
.busy (per_cluster_busy[i])
.busy (per_cluster_busy[cluster_id])
);
end

View file

@ -5,6 +5,7 @@
// To be done:
// Check how to run this with OPAE. Looks like setup issue
`ifndef NOPAE
`include "platform_if.vh"
@ -121,3 +122,5 @@ module ccip_std_afu #(
);
endmodule
`endif

View file

@ -587,7 +587,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.DATA_SIZE (LMEM_DATA_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.TAG_WIDTH (AVS_REQ_TAGW),
.ARBITER ("P"),
.ARBITER ("P"), // prioritize VX requests
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) mem_arb (
@ -692,9 +692,11 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.reset (reset),
.incr (cci_rd_req_fire),
.decr (cci_rdq_pop),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_reads_full),
.size (cci_pending_reads),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
.size (cci_pending_reads)
);
`UNUSED_VAR (cci_pending_reads)
@ -852,7 +854,9 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.incr (cci_mem_rd_rsp_fire),
.decr (cci_wr_rsp_fire),
.empty (cci_pending_writes_empty),
`UNUSED_PIN (alm_empty),
.full (cci_pending_writes_full),
`UNUSED_PIN (alm_full),
.size (cci_pending_writes)
);
@ -1010,7 +1014,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU
`ifdef SCOPE
wire mem_req_fire = mem_bus_if[0].req_valid && mem_bus_if[0].req_ready;
wire mem_rsp_fire = mem_bus_if[0].rsp_valid && mem_bus_if[0].rsp_ready;
wire avs_write_fire = avs_write[0] && ~avs_waitrequest[0];
@ -1080,7 +1083,6 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
.bus_in(scope_bus_in_w[0]),
.bus_out(scope_bus_out_w[0])
);
`endif
`else
`SCOPE_IO_UNUSED_W(0)
`endif

View file

@ -311,7 +311,6 @@ module VX_afu_wrap #(
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef DBG_SCOPE_AFU
`ifdef SCOPE
`define TRIGGERS { \
reset, \
ap_start, \
@ -341,24 +340,6 @@ module VX_afu_wrap #(
.bus_in (scope_bus_in_w[0]),
.bus_out (scope_bus_out_w[0])
);
`endif
`ifdef CHIPSCOPE
ila_afu ila_afu_inst (
.clk (ap_clk),
.probe0 ({
ap_start,
ap_done,
ap_idle,
interrupt
}),
.probe1 ({
vx_pending_writes,
vx_busy_wait,
vx_busy,
vx_running
})
);
`endif
`else
`SCOPE_IO_UNUSED_W(0)
`endif

109
hw/rtl/cache/VX_bank_flush.sv vendored Normal file
View file

@ -0,0 +1,109 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Per-bank flush sequencer.
//
// After reset the module enters STATE_INIT and steps a counter through every
// line index with flush_out_init asserted, advancing one line per cycle.
// In STATE_IDLE it waits for a flush request (flush_in_valid) while the MSHR is
// empty, then enters STATE_FLUSH, where it presents one flush_out request per
// counter value and advances only when flush_out_ready is high. With WRITEBACK
// enabled the counter also covers the way index, and flush_out_way is a one-hot
// way select; otherwise all ways are selected at once.
// flush_in_ready is a registered one-cycle pulse raised once the last
// flush_out request has been accepted.
module VX_bank_flush #(
    // Size of cache in bytes
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE = 64,
    // Number of banks
    parameter NUM_BANKS = 1,
    // Number of associative ways
    parameter NUM_WAYS = 1,
    // Enable cache writeback
    parameter WRITEBACK = 0
) (
    input wire clk,
    input wire reset,
    input wire flush_in_valid,
    output wire flush_in_ready,
    output wire flush_out_init,
    output wire flush_out_valid,
    output wire [`CS_LINE_SEL_BITS-1:0] flush_out_line,
    output wire [NUM_WAYS-1:0] flush_out_way,
    input wire flush_out_ready,
    input wire mshr_empty
);
    // Internal constants: not meant to be overridden by the instantiating
    // module. (A body-level `parameter` in a module that has a parameter port
    // list is already treated as a localparam, so this only makes it explicit.)
    localparam CTR_WIDTH = `CS_LINE_SEL_BITS + (WRITEBACK ? `CS_WAY_SEL_BITS : 0);

    localparam STATE_IDLE  = 2'd0;
    localparam STATE_INIT  = 2'd1;
    localparam STATE_FLUSH = 2'd2;

    reg [CTR_WIDTH-1:0] counter_r;
    reg [1:0] state_r, state_n;
    reg flush_in_ready_r, flush_in_ready_n;

    // Next-state logic
    always @(*) begin
        state_n = state_r;
        flush_in_ready_n = 0;
        case (state_r)
            // STATE_IDLE (also covers the unused encoding 2'd3)
            // NOTE(review): flush_in_ready is registered, so during the cycle
            // it is high the FSM is already back here; if the requester still
            // drives flush_in_valid in that cycle a second flush would start.
            // Confirm against the requester's handshake.
            default: begin
                if (flush_in_valid && mshr_empty) begin
                    state_n = STATE_FLUSH;
                end
            end
            STATE_INIT: begin
                // The counter advances every cycle in this state, so leaving
                // on the last line index visits every line exactly once.
                if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
                    state_n = STATE_IDLE;
                end
            end
            STATE_FLUSH: begin
                // Leave only once the final request has actually been accepted;
                // exiting on the counter value alone would drop the last
                // line/way whenever flush_out_ready is low in that cycle.
                if ((counter_r == ((2 ** CTR_WIDTH)-1)) && flush_out_ready) begin
                    state_n = STATE_IDLE;
                    flush_in_ready_n = 1;
                end
            end
        endcase
    end

    // State, counter and acknowledge registers
    always @(posedge clk) begin
        if (reset) begin
            state_r          <= STATE_INIT;
            counter_r        <= '0;
            flush_in_ready_r <= '0;
        end else begin
            state_r          <= state_n;
            flush_in_ready_r <= flush_in_ready_n;
            if (state_r != STATE_IDLE) begin
                // INIT advances unconditionally; FLUSH advances on acceptance.
                if ((state_r == STATE_INIT) || flush_out_ready) begin
                    counter_r <= counter_r + CTR_WIDTH'(1);
                end
            end else begin
                // Hold the counter at zero while idle so each walk starts at 0.
                counter_r <= '0;
            end
        end
    end

    assign flush_in_ready  = flush_in_ready_r;

    assign flush_out_init  = (state_r == STATE_INIT);
    assign flush_out_valid = (state_r == STATE_FLUSH);
    assign flush_out_line  = counter_r[`CS_LINE_SEL_BITS-1:0];

    if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
        // One-hot decode of the way index held in the upper counter bits.
        reg [NUM_WAYS-1:0] flush_out_way_r;
        always @(*) begin
            flush_out_way_r = '0;
            flush_out_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
        end
        assign flush_out_way = flush_out_way_r;
    end else begin
        // No per-way iteration: select every way of the line at once.
        assign flush_out_way = {NUM_WAYS{1'b1}};
    end

endmodule

View file

@ -42,6 +42,9 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -67,6 +70,7 @@ module VX_cache import VX_gpu_pkg::*; #(
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
@ -78,36 +82,46 @@ module VX_cache import VX_gpu_pkg::*; #(
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
`endif
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus2_if[NUM_REQS]();
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus_if[i].req_valid;
assign core_req_rw[i] = core_bus_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus_if[i].req_data.addr;
assign core_req_data[i] = core_bus_if[i].req_data.data;
assign core_req_tag[i] = core_bus_if[i].req_data.tag;
assign core_bus_if[i].req_ready = core_req_ready[i];
`UNUSED_VAR (core_bus_if[i].req_data.atype)
end
wire [NUM_BANKS-1:0] per_bank_flush_valid;
wire [NUM_BANKS-1:0] per_bank_flush_ready;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
// this reset relay is required to sync with bank initialization
`RESET_RELAY (flush_reset, reset);
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
) flush_unit (
.clk (clk),
.reset (flush_reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_valid (per_bank_flush_valid),
.flush_ready (per_bank_flush_ready)
);
///////////////////////////////////////////////////////////////////////////
@ -117,10 +131,10 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
`RESET_RELAY (core_rsp_reset, reset);
for (genvar i = 0; i < NUM_REQS; ++i) begin
`RESET_RELAY (core_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
@ -131,9 +145,9 @@ module VX_cache import VX_gpu_pkg::*; #(
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag}),
.valid_out (core_bus_if[i].rsp_valid),
.ready_out (core_bus_if[i].rsp_ready)
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
@ -146,24 +160,29 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_flush_s;
wire mem_req_ready_s;
wire mem_bus_if_flush;
`RESET_RELAY (mem_req_reset, reset);
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH),
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.reset (mem_req_reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag}),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
);
assign mem_bus_if.req_data.atype = '0;
assign mem_bus_if.req_data.atype = mem_bus_if_flush ? `ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0;
///////////////////////////////////////////////////////////////////////////
@ -173,13 +192,15 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
`RESET_RELAY (mem_rsp_reset, reset);
VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.reset (mem_rsp_reset),
.valid_in (mem_bus_if.rsp_valid),
.ready_in (mem_bus_if.rsp_ready),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
@ -188,27 +209,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.ready_out (mem_rsp_ready_s)
);
///////////////////////////////////////////////////////////////////////
wire [`CS_LINE_SEL_BITS-1:0] init_line_sel;
wire init_enable;
// this reset relay is required to sync with bank initialization
`RESET_RELAY (init_reset, reset);
VX_cache_init #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS)
) cache_init (
.clk (clk),
.reset (init_reset),
.addr_out (init_line_sel),
.valid_out (init_enable)
);
///////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
@ -218,6 +219,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
@ -229,14 +231,16 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][WORD_SEL_WIDTH-1:0] per_bank_mem_req_wsel;
wire [NUM_BANKS-1:0][WORD_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin
@ -245,12 +249,33 @@ module VX_cache import VX_gpu_pkg::*; #(
// Bank requests dispatch
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_flush;
wire [NUM_REQS-1:0] core_req_ready;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] core_req_bid;
wire [NUM_REQS-1:0][WORD_SEL_WIDTH-1:0] core_req_wsel;
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_valid[i] = core_bus2_if[i].req_valid;
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
assign core_req_data[i] = core_bus2_if[i].req_data.data;
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
assign core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
assign core_bus2_if[i].req_ready = core_req_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
@ -275,7 +300,9 @@ module VX_cache import VX_gpu_pkg::*; #(
core_req_wsel[i],
core_req_byteen[i],
core_req_data[i],
core_req_tag[i]};
core_req_tag[i],
core_req_flush[i]
};
end
`ifdef PERF_ENABLE
@ -289,7 +316,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.OUT_BUF ((NUM_REQS > 4) ? 2 : 0)
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
.reset (req_xbar_reset),
@ -315,25 +342,27 @@ module VX_cache import VX_gpu_pkg::*; #(
per_bank_core_req_wsel[i],
per_bank_core_req_byteen[i],
per_bank_core_req_data[i],
per_bank_core_req_tag[i]} = core_req_data_out[i];
per_bank_core_req_tag[i],
per_bank_core_req_flush[i]
} = core_req_data_out[i];
end
// Banks access
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid;
if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
end else begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == i);
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == bank_id);
end
`RESET_RELAY (bank_reset, reset);
VX_cache_bank #(
.BANK_ID (i),
.INSTANCE_ID (INSTANCE_ID),
.BANK_ID (bank_id),
.INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
@ -344,6 +373,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
@ -353,54 +383,54 @@ module VX_cache import VX_gpu_pkg::*; #(
.reset (bank_reset),
`ifdef PERF_ENABLE
.perf_read_misses (perf_read_miss_per_bank[i]),
.perf_write_misses (perf_write_miss_per_bank[i]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[i]),
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
`endif
// Core request
.core_req_valid (per_bank_core_req_valid[i]),
.core_req_addr (per_bank_core_req_addr[i]),
.core_req_rw (per_bank_core_req_rw[i]),
.core_req_wsel (per_bank_core_req_wsel[i]),
.core_req_byteen (per_bank_core_req_byteen[i]),
.core_req_data (per_bank_core_req_data[i]),
.core_req_tag (per_bank_core_req_tag[i]),
.core_req_idx (per_bank_core_req_idx[i]),
.core_req_ready (per_bank_core_req_ready[i]),
.core_req_valid (per_bank_core_req_valid[bank_id]),
.core_req_addr (per_bank_core_req_addr[bank_id]),
.core_req_rw (per_bank_core_req_rw[bank_id]),
.core_req_wsel (per_bank_core_req_wsel[bank_id]),
.core_req_byteen (per_bank_core_req_byteen[bank_id]),
.core_req_data (per_bank_core_req_data[bank_id]),
.core_req_tag (per_bank_core_req_tag[bank_id]),
.core_req_idx (per_bank_core_req_idx[bank_id]),
.core_req_flush (per_bank_core_req_flush[bank_id]),
.core_req_ready (per_bank_core_req_ready[bank_id]),
// Core response
.core_rsp_valid (per_bank_core_rsp_valid[i]),
.core_rsp_data (per_bank_core_rsp_data[i]),
.core_rsp_tag (per_bank_core_rsp_tag[i]),
.core_rsp_idx (per_bank_core_rsp_idx[i]),
.core_rsp_ready (per_bank_core_rsp_ready[i]),
.core_rsp_valid (per_bank_core_rsp_valid[bank_id]),
.core_rsp_data (per_bank_core_rsp_data[bank_id]),
.core_rsp_tag (per_bank_core_rsp_tag[bank_id]),
.core_rsp_idx (per_bank_core_rsp_idx[bank_id]),
.core_rsp_ready (per_bank_core_rsp_ready[bank_id]),
// Memory request
.mem_req_valid (per_bank_mem_req_valid[i]),
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_rw (per_bank_mem_req_rw[i]),
.mem_req_wsel (per_bank_mem_req_wsel[i]),
.mem_req_byteen (per_bank_mem_req_byteen[i]),
.mem_req_data (per_bank_mem_req_data[i]),
.mem_req_id (per_bank_mem_req_id[i]),
.mem_req_ready (per_bank_mem_req_ready[i]),
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_data (per_bank_mem_req_data[bank_id]),
.mem_req_id (per_bank_mem_req_id[bank_id]),
.mem_req_flush (per_bank_mem_req_flush[bank_id]),
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s),
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_ready (per_bank_mem_rsp_ready[i]),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// initialization
.init_enable (init_enable),
.init_line_sel (init_line_sel)
.flush_valid (per_bank_flush_valid[bank_id]),
.flush_ready (per_bank_flush_ready[bank_id])
);
if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[i] = curr_bank_mem_req_addr;
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
end else begin
assign per_bank_mem_req_addr[i] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, i);
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
end
end
@ -442,37 +472,41 @@ module VX_cache import VX_gpu_pkg::*; #(
wire mem_req_valid_p;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
wire mem_req_rw_p;
wire [WORD_SEL_WIDTH-1:0] mem_req_wsel_p;
wire [WORD_SIZE-1:0] mem_req_byteen_p;
wire [`CS_WORD_WIDTH-1:0] mem_req_data_p;
wire [LINE_SIZE-1:0] mem_req_byteen_p;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_p;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
wire mem_req_flush_p;
wire mem_req_ready_p;
// Memory request arbitration
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH)-1:0] data_in;
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {per_bank_mem_req_addr[i],
assign data_in[i] = {
per_bank_mem_req_addr[i],
per_bank_mem_req_rw[i],
per_bank_mem_req_wsel[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_data[i],
per_bank_mem_req_id[i]};
per_bank_mem_req_id[i],
per_bank_mem_req_flush[i]
};
end
`RESET_RELAY (mem_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + MSHR_ADDR_WIDTH),
.ARBITER ("R")
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
.ARBITER ("F")
) mem_req_arb (
.clk (clk),
.reset (reset),
.reset (mem_arb_reset),
.valid_in (per_bank_mem_req_valid),
.ready_in (per_bank_mem_req_ready),
.data_in (data_in),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_wsel_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p}),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}),
.valid_out (mem_req_valid_p),
.ready_out (mem_req_ready_p),
`UNUSED_PIN (sel_out)
@ -490,31 +524,15 @@ module VX_cache import VX_gpu_pkg::*; #(
assign mem_req_valid_s = mem_req_valid_p;
assign mem_req_addr_s = mem_req_addr_p;
assign mem_req_tag_s = mem_req_tag_p;
assign mem_req_flush_s = mem_req_flush_p;
assign mem_req_ready_p = mem_req_ready_s;
if (WRITE_ENABLE != 0) begin
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] mem_req_byteen_r;
reg [`CS_LINE_WIDTH-1:0] mem_req_data_r;
always @(*) begin
mem_req_byteen_r = '0;
mem_req_data_r = 'x;
mem_req_byteen_r[mem_req_wsel_p * WORD_SIZE +: WORD_SIZE] = mem_req_byteen_p;
mem_req_data_r[mem_req_wsel_p * `CS_WORD_WIDTH +: `CS_WORD_WIDTH] = mem_req_data_p;
end
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_r;
assign mem_req_data_s = mem_req_data_r;
end else begin
`UNUSED_VAR (mem_req_wsel_p)
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end
end else begin
`UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_wsel_p)
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
@ -548,7 +566,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);

View file

@ -41,6 +41,9 @@ module VX_cache_bank #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -69,12 +72,13 @@ module VX_cache_bank #(
// Core Request
input wire core_req_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
input wire core_req_rw,
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel,
input wire [WORD_SIZE-1:0] core_req_byteen,
input wire [`CS_WORD_WIDTH-1:0] core_req_data,
input wire [TAG_WIDTH-1:0] core_req_tag,
input wire [REQ_SEL_WIDTH-1:0] core_req_idx,
input wire core_req_rw, // write enable
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel, // select the word in a cacheline, e.g. word size = 4 bytes, cacheline size = 64 bytes, it should have log(64/4)= 4 bits
input wire [WORD_SIZE-1:0] core_req_byteen,// which bytes in data to write
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
input wire core_req_flush, // flush enable
output wire core_req_ready,
// Core Response
@ -88,10 +92,10 @@ module VX_cache_bank #(
output wire mem_req_valid,
output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr,
output wire mem_req_rw,
output wire [WORD_SEL_WIDTH-1:0] mem_req_wsel,
output wire [WORD_SIZE-1:0] mem_req_byteen,
output wire [`CS_WORD_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr
output wire mem_req_flush,
input wire mem_req_ready,
// Memory response
@ -100,9 +104,9 @@ module VX_cache_bank #(
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
output wire mem_rsp_ready,
// initialization
input wire init_enable,
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel
// flush
input wire flush_valid,
output wire flush_ready
);
localparam PIPELINE_STAGES = 2;
@ -128,23 +132,56 @@ module VX_cache_bank #(
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire is_init_st0;
wire is_flush_st0, is_flush_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_st0, tag_st1;
wire rw_sel, rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_replay_st0, is_replay_st1;
wire creq_flush_st0, creq_flush_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
wire mshr_pending_st0, mshr_pending_st1;
wire mshr_empty;
wire line_flush_valid;
wire line_flush_init;
wire [`CS_LINE_SEL_BITS-1:0] line_flush_sel;
wire [NUM_WAYS-1:0] line_flush_way;
wire line_flush_ready;
// flush unit
VX_bank_flush #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WRITEBACK (WRITEBACK)
) flush_unit (
.clk (clk),
.reset (reset),
.flush_in_valid (flush_valid),
.flush_in_ready (flush_ready),
.flush_out_init (line_flush_init),
.flush_out_valid (line_flush_valid),
.flush_out_line (line_flush_sel),
.flush_out_way (line_flush_way),
.flush_out_ready (line_flush_ready),
.mshr_empty (mshr_empty)
);
wire rdw_hazard_st0;
reg rdw_hazard_st1;
@ -154,13 +191,17 @@ module VX_cache_bank #(
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
// handle memory responses next to prevent deadlock with potential memory request from a miss.
wire replay_grant = ~init_enable;
// flush has precedence over core requests to ensure that the cache is in a consistent state.
wire replay_grant = ~line_flush_init;
wire replay_enable = replay_grant && replay_valid;
wire fill_grant = ~init_enable && ~replay_enable;
wire fill_grant = ~line_flush_init && ~replay_enable;
wire fill_enable = fill_grant && mem_rsp_valid;
wire creq_grant = ~init_enable && ~replay_enable && ~fill_enable;
wire flush_grant = ~line_flush_init && ~replay_enable && ~fill_enable;
wire flush_enable = flush_grant && line_flush_valid;
wire creq_grant = ~line_flush_init && ~replay_enable && ~fill_enable && ~flush_enable;
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
@ -170,60 +211,57 @@ module VX_cache_bank #(
assign mem_rsp_ready = fill_grant
&& ~pipe_stall;
assign line_flush_ready = flush_grant
&& ~mreq_queue_alm_full
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~pipe_stall;
wire init_fire = init_enable;
wire init_fire = line_flush_init;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = line_flush_valid && line_flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
wire [TAG_WIDTH-1:0] mshr_creq_tag = replay_enable ? replay_tag : core_req_tag;
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
assign addr_sel = (line_flush_init | line_flush_valid) ? `CS_LINE_ADDR_WIDTH'(line_flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
if (WRITE_ENABLE) begin
assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data);
end else begin
assign data_sel[`CS_WORD_WIDTH-1:0] = mem_rsp_data[`CS_WORD_WIDTH-1:0];
`UNUSED_VAR (core_req_data)
`UNUSED_VAR (replay_data)
end
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = mshr_creq_tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
end
`UNUSED_VAR (mshr_creq_tag)
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || core_req_fire;
assign addr_sel = init_enable ? `CS_LINE_ADDR_WIDTH'(init_line_sel) :
(replay_valid ? replay_addr :
(mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign data_sel[`CS_WORD_WIDTH-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : (replay_valid ? replay_data : core_req_data);
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i];
end
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({
valid_sel,
init_enable,
replay_enable,
fill_enable,
creq_enable,
addr_sel,
data_sel,
replay_valid ? replay_rw : core_req_rw,
replay_valid ? replay_byteen : core_req_byteen,
replay_valid ? replay_wsel : core_req_wsel,
replay_valid ? replay_idx : core_req_idx,
replay_valid ? replay_tag : core_req_tag,
replay_id
}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_creq_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, line_flush_init, replay_enable, fill_enable, flush_enable, creq_enable, core_req_flush, line_flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
@ -232,20 +270,24 @@ module VX_cache_bank #(
assign req_uuid_st0 = 0;
end
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0);
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] repl_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] repl_tag_st0;
`RESET_RELAY (tag_reset, reset);
VX_cache_tags #(
.INSTANCE_ID(INSTANCE_ID),
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -261,30 +303,37 @@ module VX_cache_bank #(
.stall (pipe_stall),
// read/Fill
// init/fill/lookup/flush
.init (do_init_st0 || do_flush_st0),
.fill (do_fill_st0),
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.fill (do_fill_st0),
.init (do_init_st0),
.way_sel (way_sel_st0),
.tag_matches(tag_matches_st0)
.tag_matches(tag_matches_st0),
// replacement
.repl_way (repl_way_st0),
.repl_tag (repl_tag_st0)
);
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = is_fill_st0 ? repl_way_st0 : (is_flush_st0 ? flush_way_st0 : tag_matches_st0);
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_r_st0 = (is_fill_st0 || is_flush_st0) ? {repl_tag_st0, addr_st0[`CS_LINE_SEL_BITS-1:0]} : addr_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_replay_st0, is_fill_st0, is_creq_st0, rw_st0, addr_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_replay_st1, is_fill_st1, is_creq_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1})
.data_in ({valid_st0, is_flush_st0, is_replay_st0, is_fill_st0, is_creq_st0, creq_flush_st0, rw_st0, addr_r_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_flush_st1, is_replay_st1, is_fill_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, mshr_pending_st1})
);
// we have a tag hit
wire is_hit_st1 = (| tag_matches_st1);
wire is_hit_st1 = (| way_sel_st1);
if (UUID_WIDTH != 0) begin
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
@ -292,37 +341,62 @@ module VX_cache_bank #(
assign req_uuid_st1 = 0;
end
wire do_creq_rd_st1 = valid_st1 && is_creq_st1 && ~rw_st1;
wire do_creq_wr_st1 = valid_st1 && is_creq_st1 && rw_st1;
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1;
wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1;
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
`UNUSED_VAR (do_write_miss_st1)
// ensure mshr replay always get a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay"));
// detect BRAM's read-during-write hazard
assign rdw_hazard_st0 = do_fill_st0; // after a fill
always @(posedge clk) begin
rdw_hazard_st1 <= (do_creq_rd_st0 && do_write_hit_st1 && (addr_st0 == addr_st1))
&& ~rdw_hazard_st1; // after a write to same address
assign rdw_hazard_st0 = do_fill_st0; // stall cycle after a fill
wire rdw_case1 = do_cache_rd_st0 && do_cache_wr_st1 && (addr_st0 == addr_st1); // standard cache access
wire rdw_case2 = WRITEBACK && (do_flush_st0 || do_fill_st0) && do_cache_wr_st1; // a writeback can evict preceding write
always @(posedge clk) begin // after a write to same address
rdw_hazard_st1 <= (rdw_case1 || rdw_case2)
&& ~rdw_hazard_st1; // invalidate if pipeline stalled to avoid repeats
end
wire [`CS_WORD_WIDTH-1:0] write_data_st1 = data_st1[`CS_WORD_WIDTH-1:0];
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
wire [LINE_SIZE-1:0] write_byteen_st1;
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
wire dirty_valid_st1;
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] write_byteen_r;
always @(*) begin
write_byteen_r = '0;
write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1;
end
assign write_byteen_st1 = write_byteen_r;
end else begin
assign write_byteen_st1 = byteen_st1;
end
`RESET_RELAY (data_reset, reset);
VX_cache_data #(
.INSTANCE_ID (INSTANCE_ID),
.INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -330,6 +404,7 @@ module VX_cache_bank #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH)
) cache_data (
.clk (clk),
@ -339,23 +414,38 @@ module VX_cache_bank #(
.stall (pipe_stall),
.read (do_read_hit_st1 || do_replay_rd_st1),
.fill (do_fill_st1),
.write (do_write_hit_st1 || do_replay_wr_st1),
.way_sel (way_sel_st1 | tag_matches_st1),
.read (do_cache_rd_st1),
.fill (do_fill_st1 && ~rdw_hazard_st1),
.flush (do_flush_st1),
.write (do_cache_wr_st1),
.way_sel (way_sel_st1),
.line_addr (addr_st1),
.wsel (wsel_st1),
.byteen (byteen_st1),
.fill_data (fill_data_st1),
.write_data (write_data_st1),
.read_data (read_data_st1)
.write_byteen(write_byteen_st1),
.read_data (read_data_st1),
.dirty_valid(dirty_valid_st1),
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
);
wire [MSHR_SIZE-1:0] mshr_matches_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0;
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
wire mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
// release allocated mshr entry if we had a hit
wire mshr_release_st1;
if (WRITEBACK) begin
assign mshr_release_st1 = is_hit_st1;
end else begin
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
// this can happen when writes are sent late, when the fill was already in flight.
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
end
VX_pending_size #(
.SIZE (MSHR_SIZE)
@ -364,15 +454,17 @@ module VX_cache_bank #(
.reset (reset),
.incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
.empty (mshr_empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
`RESET_RELAY (mshr_reset, reset);
VX_cache_mshr #(
.INSTANCE_ID (INSTANCE_ID),
.INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
@ -412,7 +504,8 @@ module VX_cache_bank #(
// lookup
.lookup_valid (mshr_lookup_st0),
.lookup_addr (addr_st0),
.lookup_matches (mshr_matches_st0),
.lookup_pending (mshr_lookup_pending_st0),
.lookup_rw (mshr_lookup_rw_st0),
// finalize
.finalize_valid (mshr_finalize_st1),
@ -422,10 +515,12 @@ module VX_cache_bank #(
.finalize_prev (mshr_prev_st1)
);
// ignore allocated id from mshr matches
// check if there are pending requests to same line in the MSHR
wire [MSHR_SIZE-1:0] lookup_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = (i != mshr_alloc_id_st0) && mshr_matches_st0[i];
assign lookup_matches[i] = mshr_lookup_pending_st0[i]
&& (i != mshr_alloc_id_st0) // exclude current mshr id
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
end
assign mshr_pending_st0 = (| lookup_matches);
@ -436,7 +531,7 @@ module VX_cache_bank #(
wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
wire [TAG_WIDTH-1:0] crsp_queue_tag;
assign crsp_queue_valid = do_read_hit_st1 || do_replay_rd_st1;
assign crsp_queue_valid = do_cache_rd_st1;
assign crsp_queue_idx = req_idx_st1;
assign crsp_queue_data = read_data_st1;
assign crsp_queue_tag = tag_st1;
@ -463,29 +558,40 @@ module VX_cache_bank #(
// schedule memory request
wire mreq_queue_push, mreq_queue_pop, mreq_queue_empty;
wire [`CS_WORD_WIDTH-1:0] mreq_queue_data;
wire [WORD_SIZE-1:0] mreq_queue_byteen;
wire [WORD_SEL_WIDTH-1:0] mreq_queue_wsel;
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
wire mreq_queue_rw;
wire mreq_queue_flush;
assign mreq_queue_push = (do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1;
wire is_evict_st1 = (is_fill_st1 || is_flush_st1) && dirty_valid_st1;
wire do_writeback_st1 = valid_st1 && is_evict_st1;
`UNUSED_VAR (do_writeback_st1)
if (WRITEBACK) begin
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~rdw_hazard_st1;
end else begin
`UNUSED_VAR (dirty_valid_st1)
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1)
&& ~rdw_hazard_st1;
end
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_rw = WRITE_ENABLE && rw_st1;
assign mreq_queue_rw = WRITE_ENABLE && (WRITEBACK ? is_evict_st1 : rw_st1);
assign mreq_queue_addr = addr_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_wsel = wsel_st1;
assign mreq_queue_byteen = byteen_st1;
assign mreq_queue_data = write_data_st1;
assign mreq_queue_data = is_write_st1 ? write_data_st1 : dirty_data_st1;
assign mreq_queue_byteen = is_write_st1 ? write_byteen_st1 : dirty_byteen_st1;
assign mreq_queue_flush = creq_flush_st1;
`RESET_RELAY (mreq_queue_reset, reset);
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH),
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
.DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
@ -494,8 +600,8 @@ module VX_cache_bank #(
.reset (mreq_queue_reset),
.push (mreq_queue_push),
.pop (mreq_queue_pop),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_wsel, mreq_queue_data}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}),
.empty (mreq_queue_empty),
.alm_full (mreq_queue_alm_full),
`UNUSED_PIN (full),
@ -515,35 +621,34 @@ module VX_cache_bank #(
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire);
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid || line_flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || line_flush_valid);
always @(posedge clk) begin
if (pipeline_stall) begin
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full));
end
if (init_enable) begin
`TRACE(2, ("%d: %s-bank%0d init: addr=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw_st0=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard_st0));
end
if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s-bank%0d fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
end
if (replay_fire) begin
`TRACE(2, ("%d: %s-bank%0d mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
`TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
end
if (core_req_fire) begin
if (core_req_rw)
`TRACE(2, ("%d: %s-bank%0d core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
`TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else
`TRACE(2, ("%d: %s-bank%0d core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
`TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
end
if (crsp_queue_fire) begin
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
end
if (mreq_queue_push) begin
if (do_creq_wr_st1)
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
if (do_creq_wr_st1 && !WRITEBACK)
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else if (do_writeback_st1)
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%b, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
else
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
`TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
end
end
`endif

View file

@ -89,8 +89,7 @@ module VX_cache_bypass #(
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P"),
.LOCK_ENABLE (1)
.TYPE (PASSTHRU ? "R" : "P")
) core_req_nc_arb (
.clk (clk),
.reset (reset),
@ -98,7 +97,7 @@ module VX_cache_bypass #(
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid),
.grant_unlock (core_req_nc_ready)
.grant_ready (core_req_nc_ready)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin

View file

@ -46,6 +46,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -82,9 +85,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
`ifdef PERF_ENABLE
cache_perf_t perf_cache_tmp[1], perf_cache_unit[NUM_CACHES];
`PERF_CACHE_ADD (perf_cache_tmp, perf_cache_unit, 1, NUM_CACHES)
assign cache_perf = perf_cache_tmp[0];
cache_perf_t perf_cache_unit[NUM_CACHES];
`PERF_CACHE_ADD (cache_perf, perf_cache_unit, NUM_CACHES)
`endif
VX_mem_bus_if #(
@ -97,8 +99,6 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
`RESET_RELAY (arb_reset, reset);
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -114,6 +114,8 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
`RESET_RELAY (arb_reset, reset);
VX_mem_arb #(
.NUM_INPUTS (NUM_INPUTS),
.NUM_OUTPUTS (NUM_CACHES),
@ -135,9 +137,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
end
end
`RESET_RELAY (cache_reset, reset);
for (genvar i = 0; i < NUM_CACHES; ++i) begin : caches
for (genvar i = 0; i < NUM_CACHES; ++i) begin
`RESET_RELAY (cache_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)),
@ -152,6 +154,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),

View file

@ -28,6 +28,8 @@ module VX_cache_data #(
parameter WORD_SIZE = 1,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
) (
@ -42,59 +44,100 @@ module VX_cache_data #(
input wire read,
input wire fill,
input wire flush,
input wire write,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
input wire [WORD_SIZE-1:0] byteen,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
input wire [`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire dirty_valid,
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (reset)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (read)
`UNUSED_VAR (flush)
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
if (WRITEBACK) begin
reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0][LINE_SIZE-1:0] dirty_bytes_r;
reg [`CS_LINES_PER_BANK * NUM_WAYS-1:0] dirty_blocks_r;
wire [`CLOG2(`CS_LINES_PER_BANK * NUM_WAYS)-1:0] way_addr;
if (NUM_WAYS > 1) begin
assign way_addr = {line_sel, way_idx};
end else begin
assign way_addr = line_sel;
end
always @(posedge clk) begin
if (fill) begin
dirty_bytes_r[way_addr] <= '0;
end else if (write) begin
dirty_bytes_r[way_addr] <= dirty_bytes_r[way_addr] | write_byteen;
end
end
always @(posedge clk) begin
if (reset) begin
for (integer i = 0; i < `CS_LINES_PER_BANK * NUM_WAYS; ++i) begin
dirty_blocks_r[i] <= 0;
end
end else begin
if (fill) begin
dirty_blocks_r[way_addr] <= 0;
end else if (write) begin
dirty_blocks_r[way_addr] <= 1;
end
end
end
assign dirty_byteen = dirty_bytes_r[way_addr];
assign dirty_valid = dirty_blocks_r[way_addr];
end else begin
assign dirty_byteen = '0;
assign dirty_valid = 0;
end
// order the data layout to perform ways multiplexing last.
// this allows converting way index to binary in parallel with BRAM read.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata;
wire [BYTEENW-1:0] wren;
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
reg [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] wdata_r;
reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
always @(*) begin
wdata_r = {`CS_WORDS_PER_LINE{write_data}};
wren_r = '0;
wren_r[wsel] = byteen;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = (fill || !WRITE_ENABLE) ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{write_data[i]}};
end
// order the data layout to perform ways multiplexing last
// this allows performing onehot encoding of the way index in parallel with BRAM read.
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
assign wdata[i] = fill ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{wdata_r[i]}};
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign wren_w[i][j] = (fill ? {WORD_SIZE{1'b1}} : wren_r[i])
& {WORD_SIZE{((NUM_WAYS == 1) || way_sel[j])}};
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end
end
assign wren = wren_w;
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (byteen)
`UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data)
assign wdata = fill_data;
assign wren = fill;
end
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
VX_onehot_encoder #(
.N (NUM_WAYS)
) way_enc (
@ -105,8 +148,6 @@ module VX_cache_data #(
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK),
@ -123,28 +164,35 @@ module VX_cache_data #(
);
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = rdata[wsel];
end else begin
`UNUSED_VAR (wsel)
assign per_way_rdata = rdata;
end
assign read_data = per_way_rdata[way_idx];
`UNUSED_VAR (stall)
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] dirty_data_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign dirty_data_w[j][i] = rdata[i][j];
end
end
assign dirty_data = dirty_data_w[way_idx];
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b, byteen=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_valid, dirty_byteen));
end
if (read && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-read: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, read_data, req_uuid));
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
end
if (write && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d data-write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, byteen, write_data, req_uuid));
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
end
end
`endif

View file

@ -50,7 +50,7 @@
`define CS_TAG_SEL_ADDR_START (1+`CS_LINE_SEL_ADDR_END)
`define CS_TAG_SEL_ADDR_END (`CS_WORD_ADDR_WIDTH-1)
`define CS_LINE_TAG_ADDR(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
`define CS_LINE_ADDR_TAG(x) x[`CS_LINE_ADDR_WIDTH-1 : `CS_LINE_SEL_BITS]
///////////////////////////////////////////////////////////////////////////////
@ -64,14 +64,14 @@
///////////////////////////////////////////////////////////////////////////////
`define PERF_CACHE_ADD(dst, src, dcount, scount) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, dcount, scount, (`CDIV(scount, dcount) > 1))
`define PERF_CACHE_ADD(dst, src, count) \
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
`endif // VX_CACHE_DEFINE_VH

154
hw/rtl/cache/VX_cache_flush.sv vendored Normal file
View file

@ -0,0 +1,154 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Cache flush controller.
//
// Sits between the core request ports and the cache banks. When any input
// request carries the `ADDR_TYPE_FLUSH` address-type bit, the controller:
//   1. blocks all input requests (IDLE -> WAIT/FLUSH),
//   2. waits until no request is in flight towards the banks (WAIT, only
//      when BANK_SEL_LATENCY != 0),
//   3. asserts flush_valid to every bank and collects flush_ready until
//      every bank has acknowledged (FLUSH),
//   4. lets through only the requests that triggered the flush, and returns
//      to IDLE once each of them has been accepted downstream (DONE).
// Responses are forwarded unmodified.
module VX_cache_flush #(
    // Number of Word requests per cycle
    parameter NUM_REQS = 4,
    // Number of banks
    parameter NUM_BANKS = 1,
    // Bank select latency
    parameter BANK_SEL_LATENCY = 1
) (
    input wire clk,
    input wire reset,

    // core-side request/response ports
    VX_mem_bus_if.slave  core_bus_in_if [NUM_REQS],
    // cache-side request/response ports
    VX_mem_bus_if.master core_bus_out_if [NUM_REQS],

    // per-bank request accepted strobe, used to retire in-flight requests
    input wire [NUM_BANKS-1:0]  bank_req_fire,

    // per-bank flush handshake
    output wire [NUM_BANKS-1:0] flush_valid,
    input wire [NUM_BANKS-1:0]  flush_ready
);
    localparam STATE_IDLE  = 0; // no flush in progress
    localparam STATE_WAIT  = 1; // draining in-flight requests
    localparam STATE_FLUSH = 2; // banks are flushing
    localparam STATE_DONE  = 3; // releasing the flush requests

    // track in-flight core requests
    wire no_inflight_reqs;

    if (BANK_SEL_LATENCY != 0) begin
        localparam NUM_REQS_W  = `CLOG2(NUM_REQS+1);
        localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);

        // a request enters flight when it is accepted on the output port
        wire [NUM_REQS-1:0] core_bus_out_fire;
        for (genvar i = 0; i < NUM_REQS; ++i) begin
            assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready;
        end

        wire [NUM_REQS_W-1:0]  core_bus_out_cnt;
        wire [NUM_BANKS_W-1:0] bank_req_cnt;

        `POP_COUNT(core_bus_out_cnt, core_bus_out_fire);
        `POP_COUNT(bank_req_cnt, bank_req_fire);
        `UNUSED_VAR (core_bus_out_cnt)

        // NOTE(review): the cast below truncates core_bus_out_cnt when
        // NUM_REQS_W > NUM_BANKS_W; presumably at most NUM_BANKS requests
        // can fire per cycle - confirm against the bank-select crossbar.
        VX_pending_size #(
            .SIZE  (BANK_SEL_LATENCY * NUM_BANKS),
            .INCRW (NUM_BANKS_W),
            .DECRW (NUM_BANKS_W)
        ) pending_size (
            .clk   (clk),
            .reset (reset),
            .incr  (NUM_BANKS_W'(core_bus_out_cnt)),
            .decr  (bank_req_cnt),
            .empty (no_inflight_reqs),
            `UNUSED_PIN (alm_empty),
            `UNUSED_PIN (full),
            `UNUSED_PIN (alm_full),
            `UNUSED_PIN (size)
        );
    end else begin
        // STATE_WAIT is never entered in this configuration (IDLE jumps
        // straight to FLUSH), so this value is not consumed by the FSM.
        assign no_inflight_reqs = 0;
        `UNUSED_VAR (bank_req_fire)
    end

    reg [1:0] state, state_n;
    // per-bank flush acknowledgements collected during STATE_FLUSH
    reg [NUM_BANKS-1:0] flush_done, flush_done_n;

    // input ports currently presenting a flush request
    wire [NUM_REQS-1:0] flush_req_mask;
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
    end
    wire flush_req_enable = (| flush_req_mask);

    // ports allowed to pass while a flush request is still pending
    reg [NUM_REQS-1:0] lock_released, lock_released_n;

    // request path: gate valid/ready while a flush request is pending,
    // except for ports whose lock has been released
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        wire input_enable = ~flush_req_enable || lock_released[i];
        assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable;
        assign core_bus_out_if[i].req_data  = core_bus_in_if[i].req_data;
        assign core_bus_in_if[i].req_ready  = core_bus_out_if[i].req_ready && input_enable;
    end

    // response path: pass-through
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign core_bus_in_if[i].rsp_valid  = core_bus_out_if[i].rsp_valid;
        assign core_bus_in_if[i].rsp_data   = core_bus_out_if[i].rsp_data;
        assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready;
    end

    wire [NUM_REQS-1:0] core_bus_out_ready;
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready;
    end

    always @(*) begin
        state_n         = state;
        flush_done_n    = flush_done;
        lock_released_n = lock_released;
        case (state)
            STATE_IDLE: begin
                if (flush_req_enable) begin
                    // skip the drain step when there is no bank select latency
                    state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT : STATE_FLUSH;
                end
            end
            STATE_WAIT: begin
                // wait for all in-flight requests to reach the banks
                if (no_inflight_reqs) begin
                    state_n = STATE_FLUSH;
                end
            end
            STATE_FLUSH: begin
                // accumulate acknowledgements; the flush completes only once
                // every bank has reported ready (all bits set)
                flush_done_n = flush_done | flush_ready;
                if (flush_done_n == {NUM_BANKS{1'b1}}) begin
                    state_n = STATE_DONE;
                    // clear the accumulator so the next flush starts fresh
                    flush_done_n = '0;
                    // only release the ports that issued a flush request;
                    // other requests stay locked
                    lock_released_n = flush_req_mask;
                end
            end
            STATE_DONE: begin
                // drop each released port once the downstream side is ready
                lock_released_n = lock_released & ~core_bus_out_ready;
                if (lock_released_n == 0) begin
                    state_n = STATE_IDLE;
                end
            end
        endcase
    end

    always @(posedge clk) begin
        if (reset) begin
            state         <= STATE_IDLE;
            flush_done    <= '0;
            lock_released <= '0;
        end else begin
            state         <= state_n;
            flush_done    <= flush_done_n;
            lock_released <= lock_released_n;
        end
    end

    // NOTE(review): flush_valid stays asserted to every bank for the whole
    // FLUSH state, including banks that already acknowledged - confirm the
    // bank-side flush unit tolerates this.
    assign flush_valid = {NUM_BANKS{state == STATE_FLUSH}};

endmodule

View file

@ -13,6 +13,7 @@
`include "VX_cache_define.vh"
// cache flush unit
module VX_cache_init #(
// Size of cache in bytes
parameter CACHE_SIZE = 1024,

View file

@ -104,7 +104,8 @@ module VX_cache_mshr #(
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_matches,
output wire [MSHR_SIZE-1:0] lookup_pending,
output wire [MSHR_SIZE-1:0] lookup_rw,
// finalize
input wire finalize_valid,
@ -216,13 +217,13 @@ module VX_cache_mshr #(
next_table <= next_table_n;
end
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s-bank%0d inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s-bank%0d invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s-bank%0d invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID, BANK_ID,
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
VX_dp_ram #(
@ -251,7 +252,9 @@ module VX_cache_mshr #(
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
assign lookup_matches = addr_matches & ~write_table;
// return pending entries for the given cache line
assign lookup_pending = addr_matches;
assign lookup_rw = write_table;
`UNUSED_VAR (lookup_valid)
@ -264,22 +267,22 @@ module VX_cache_mshr #(
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
end
if (allocate_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`TRACE(3, ("%d: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid));
if (lookup_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_matches, lkp_req_uuid));
`TRACE(3, ("%d: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid));
if (finalize_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`TRACE(3, ("%d: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid));
if (fill_valid)
`TRACE(3, ("%d: %s-bank%0d mshr-fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID, BANK_ID,
`TRACE(3, ("%d: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
if (dequeue_fire)
`TRACE(3, ("%d: %s-bank%0d mshr-dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID,
`TRACE(3, ("%d: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
if (show_table) begin
`TRACE(3, ("%d: %s-bank%0d mshr-table", $time, INSTANCE_ID, BANK_ID));
`TRACE(3, ("%d: %s table", $time, INSTANCE_ID));
for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));

View file

@ -38,45 +38,63 @@ module VX_cache_tags #(
input wire stall,
// read/fill
// init/fill/lookup
input wire init,
input wire fill,
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire fill,
input wire init,
output wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches
output wire [NUM_WAYS-1:0] tag_matches,
// replacement
output wire [NUM_WAYS-1:0] repl_way,
output wire [`CS_TAG_SEL_BITS-1:0] repl_tag
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (reset)
`UNUSED_VAR (lookup)
// valid, tag
localparam TAG_WIDTH = 1 + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_TAG_ADDR(line_addr);
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] repl_way;
reg [NUM_WAYS-1:0] repl_way_r;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
repl_way <= 1;
repl_way_r <= 1;
end else if (~stall) begin // hold the value on stalls to prevent filling different slots twice
repl_way <= {repl_way[NUM_WAYS-2:0], repl_way[NUM_WAYS-1]};
repl_way_r <= {repl_way_r[NUM_WAYS-2:0], repl_way_r[NUM_WAYS-1]};
end
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign way_sel[i] = fill && repl_way[i];
end
assign repl_way = repl_way_r;
VX_onehot_mux #(
.DATAW (`CS_TAG_SEL_BITS),
.N (NUM_WAYS)
) repl_tag_sel (
.data_in (read_tag),
.sel_in (repl_way_r),
.data_out (repl_tag)
);
end else begin
`UNUSED_VAR (stall)
assign way_sel = fill;
assign repl_way = 1'b1;
assign repl_tag = read_tag;
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [`CS_TAG_SEL_BITS-1:0] read_tag;
wire read_valid;
wire do_fill = fill && repl_way[i];
wire do_write = init || do_fill;
wire line_valid = ~init;
VX_sp_ram #(
.DATAW (TAG_WIDTH),
@ -85,29 +103,31 @@ module VX_cache_tags #(
) tag_store (
.clk (clk),
.read (1'b1),
.write (way_sel[i] || init),
.write (do_write),
`UNUSED_PIN (wren),
.addr (line_sel),
.wdata ({~init, line_tag}),
.rdata ({read_valid, read_tag})
.wdata ({line_valid, line_tag}),
.rdata ({read_valid[i], read_tag[i]})
);
end
assign tag_matches[i] = read_valid && (line_tag == read_tag);
for (genvar i = 0; i < NUM_WAYS; ++i) begin
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s-bank%0d tag-fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag));
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), repl_way, line_sel, line_tag));
end
if (init) begin
`TRACE(3, ("%d: %s-bank%0d tag-init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
`TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
end
if (lookup && ~stall) begin
if (tag_matches != 0) begin
`TRACE(3, ("%d: %s-bank%0d tag-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, line_tag, req_uuid));
`TRACE(3, ("%d: %s hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
end else begin
`TRACE(3, ("%d: %s-bank%0d tag-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
`TRACE(3, ("%d: %s miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
end
end
end

View file

@ -45,6 +45,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0,
@ -183,6 +186,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_alu_int #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = "",
parameter BLOCK_IDX = 0,
parameter NUM_LANES = 1
) (
@ -29,7 +29,7 @@ module VX_alu_int #(
VX_branch_ctl_if.master branch_ctl_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam LANE_WIDTH = `UP(LANE_BITS);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -193,9 +193,9 @@ module VX_alu_int #(
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (branch_ctl_if.valid) begin
`TRACE(1, ("%d: core%0d-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, CORE_ID, branch_ctl_if.wid, {commit_if.data.PC, 1'b0}, branch_ctl_if.taken, {branch_ctl_if.dest, 1'b0}, commit_if.data.uuid));
if (br_enable) begin
`TRACE(1, ("%d: %s-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid));
end
end
`endif

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_alu_muldiv #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1
) (
input wire clk,
@ -26,7 +26,7 @@ module VX_alu_muldiv #(
// Outputs
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam TAG_WIDTH = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
@ -69,7 +69,7 @@ module VX_alu_muldiv #(
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] mul_resultl, mul_resulth;
reg [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
always @(*) begin
@ -235,7 +235,7 @@ module VX_alu_muldiv #(
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [`XLEN-1:0] div_quotient, div_remainder;
reg [`XLEN-1:0] div_quotient, div_remainder;
// Combinational call to dpi_idiv for lane i: passes the fire strobe, the
// signedness flag and this lane's two operands, with div_quotient and
// div_remainder as the trailing arguments.
// NOTE(review): presumably dpi_idiv is a DPI-C divide model that writes the
// last two arguments as outputs -- confirm against its import declaration.
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
end

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_alu_unit #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -27,7 +27,7 @@ module VX_alu_unit #(
VX_branch_ctl_if.master branch_ctl_if [`NUM_ALU_BLOCKS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
localparam NUM_LANES = `NUM_ALU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -75,7 +75,7 @@ module VX_alu_unit #(
`RESET_RELAY (int_reset, block_reset);
VX_alu_int #(
.CORE_ID (CORE_ID),
.INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
.BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES)
) alu_int (
@ -90,59 +90,61 @@ module VX_alu_unit #(
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) mdv_execute_if();
) muldiv_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) mdv_commit_if();
) muldiv_commit_if();
assign mdv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
assign mdv_execute_if.data = per_block_execute_if[block_idx].data;
assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
assign muldiv_execute_if.data = per_block_execute_if[block_idx].data;
`RESET_RELAY (mdv_reset, block_reset);
`RESET_RELAY (muldiv_reset, block_reset);
VX_alu_muldiv #(
.CORE_ID (CORE_ID),
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
.NUM_LANES (NUM_LANES)
) mdv_unit (
) muldiv_unit (
.clk (clk),
.reset (mdv_reset),
.execute_if (mdv_execute_if),
.commit_if (mdv_commit_if)
.reset (muldiv_reset),
.execute_if (muldiv_execute_if),
.commit_if (muldiv_commit_if)
);
`endif
assign per_block_execute_if[block_idx].ready =
`ifdef EXT_M_ENABLE
is_muldiv_op ? mdv_execute_if.ready :
is_muldiv_op ? muldiv_execute_if.ready :
`endif
int_execute_if.ready;
// send response
`RESET_RELAY (arb_reset, block_reset);
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) rsp_arb (
.clk (clk),
.reset (block_reset),
.reset (arb_reset),
.valid_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.valid,
muldiv_commit_if.valid,
`endif
int_commit_if.valid
}),
.ready_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.ready,
muldiv_commit_if.ready,
`endif
int_commit_if.ready
}),
.data_in ({
`ifdef EXT_M_ENABLE
mdv_commit_if.data,
muldiv_commit_if.data,
`endif
int_commit_if.data
}),

View file

@ -13,8 +13,8 @@
`include "VX_define.vh"
module VX_commit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -27,7 +27,7 @@ module VX_commit import VX_gpu_pkg::*; #(
VX_commit_csr_if.master commit_csr_if,
VX_commit_sched_if.master commit_sched_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
@ -36,12 +36,10 @@ module VX_commit import VX_gpu_pkg::*; #(
VX_commit_if commit_arb_if[`ISSUE_WIDTH]();
wire [`ISSUE_WIDTH-1:0] commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] commit_tmask;
wire [`ISSUE_WIDTH-1:0] commit_eop;
`RESET_RELAY (arb_reset, reset);
wire [`ISSUE_WIDTH-1:0] per_issue_commit_fire;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] per_issue_commit_wid;
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
@ -55,6 +53,8 @@ module VX_commit import VX_gpu_pkg::*; #(
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (`NUM_EX_UNITS),
.DATAW (DATAW),
@ -72,10 +72,10 @@ module VX_commit import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
assign commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign commit_tmask[i]= {`NUM_THREADS{commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign commit_wid[i] = commit_arb_if[i].data.wid;
assign commit_eop[i] = commit_arb_if[i].data.eop;
assign per_issue_commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
assign per_issue_commit_tmask[i]= {`NUM_THREADS{per_issue_commit_fire[i]}} & commit_arb_if[i].data.tmask;
assign per_issue_commit_wid[i] = commit_arb_if[i].data.wid;
assign per_issue_commit_eop[i] = commit_arb_if[i].data.eop;
end
// CSRs update
@ -84,11 +84,11 @@ module VX_commit import VX_gpu_pkg::*; #(
wire [COMMIT_ALL_SIZEW-1:0] commit_size_all_r, commit_size_all_rr;
wire commit_fire_any, commit_fire_any_r, commit_fire_any_rr;
assign commit_fire_any = (| commit_fire);
assign commit_fire_any = (| per_issue_commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, commit_tmask[i]);
`POP_COUNT(count, per_issue_commit_tmask[i]);
assign commit_size[i] = count;
end
@ -136,19 +136,28 @@ module VX_commit import VX_gpu_pkg::*; #(
end
assign commit_csr_if.instret = instret;
// Committed instructions
// Track committed instructions
wire [`ISSUE_WIDTH-1:0] committed = commit_fire & commit_eop;
// Per-warp commit bitmask for the current cycle, rebuilt combinationally:
// bit w is set when at least one issue slot fires a commit with eop set
// for warp w.
reg [`NUM_WARPS-1:0] committed_warps;
always @(*) begin
// default: no warp flagged
committed_warps = 0;
for (integer i = 0; i < `ISSUE_WIDTH; ++i) begin
// a slot contributes only when its commit fires and its eop flag is set
if (per_issue_commit_fire[i] && per_issue_commit_eop[i]) begin
// index by the slot's warp id; slots naming the same warp set the same bit
committed_warps[per_issue_commit_wid[i]] = 1;
end
end
end
VX_pipe_register #(
.DATAW (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
.RESETW (`ISSUE_WIDTH)
.DATAW (`NUM_WARPS),
.RESETW (`NUM_WARPS)
) committed_pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({committed, commit_wid}),
.data_out ({commit_sched_if.committed, commit_sched_if.committed_wid})
.data_in (committed_warps),
.data_out ({commit_sched_if.committed_warps})
);
// Writeback
@ -171,7 +180,7 @@ module VX_commit import VX_gpu_pkg::*; #(
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
always @(posedge clk) begin
if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
`TRACE(1, ("%d: core%0d-commit: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
trace_ex_type(1, j);
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop));
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS);

View file

@ -18,7 +18,8 @@
`endif
module VX_core import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -94,13 +95,14 @@ module VX_core import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (3)
VX_schedule #(
.INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (schedule_reset),
`ifdef PERF_ENABLE
.perf_schedule_if (pipeline_perf_if.schedule),
.sched_perf (pipeline_perf_if.sched),
`endif
.base_dcrs (base_dcrs),
@ -121,7 +123,7 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_fetch #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID))
) fetch (
`SCOPE_IO_BIND (0)
.clk (clk),
@ -132,7 +134,7 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_decode #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID))
) decode (
.clk (clk),
.reset (decode_reset),
@ -142,7 +144,7 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_issue #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID))
) issue (
`SCOPE_IO_BIND (1)
@ -150,7 +152,7 @@ module VX_core import VX_gpu_pkg::*; #(
.reset (issue_reset),
`ifdef PERF_ENABLE
.perf_issue_if (pipeline_perf_if.issue),
.issue_perf (pipeline_perf_if.issue),
`endif
.decode_if (decode_if),
@ -159,6 +161,7 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_execute #(
.INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) execute (
`SCOPE_IO_BIND (2)
@ -186,7 +189,7 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_commit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID))
) commit (
.clk (clk),
.reset (commit_reset),
@ -210,7 +213,7 @@ module VX_core import VX_gpu_pkg::*; #(
`RESET_RELAY (lmem_unit_reset, reset);
VX_lmem_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID (INSTANCE_ID)
) lmem_unit (
.clk (clk),
.reset (lmem_unit_reset),
@ -229,20 +232,20 @@ module VX_core import VX_gpu_pkg::*; #(
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
) dcache_coalesced_if();
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
`RESET_RELAY (coalescer_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`RESET_RELAY (mem_coalescer_reset, reset);
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("core%0d-coalescer", CORE_ID)),
.INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
@ -251,9 +254,9 @@ module VX_core import VX_gpu_pkg::*; #(
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) coalescer (
) mem_coalescer (
.clk (clk),
.reset (coalescer_reset),
.reset (mem_coalescer_reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
@ -274,42 +277,37 @@ module VX_core import VX_gpu_pkg::*; #(
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if[i].req_valid),
.out_req_mask (dcache_coalesced_if[i].req_data.mask),
.out_req_rw (dcache_coalesced_if[i].req_data.rw),
.out_req_byteen (dcache_coalesced_if[i].req_data.byteen),
.out_req_addr (dcache_coalesced_if[i].req_data.addr),
.out_req_atype (dcache_coalesced_if[i].req_data.atype),
.out_req_data (dcache_coalesced_if[i].req_data.data),
.out_req_tag (dcache_coalesced_if[i].req_data.tag),
.out_req_ready (dcache_coalesced_if[i].req_ready),
.out_req_valid (dcache_coalesced_if.req_valid),
.out_req_mask (dcache_coalesced_if.req_data.mask),
.out_req_rw (dcache_coalesced_if.req_data.rw),
.out_req_byteen (dcache_coalesced_if.req_data.byteen),
.out_req_addr (dcache_coalesced_if.req_data.addr),
.out_req_atype (dcache_coalesced_if.req_data.atype),
.out_req_data (dcache_coalesced_if.req_data.data),
.out_req_tag (dcache_coalesced_if.req_data.tag),
.out_req_ready (dcache_coalesced_if.req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if[i].rsp_valid),
.out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask),
.out_rsp_data (dcache_coalesced_if[i].rsp_data.data),
.out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if[i].rsp_ready)
.out_rsp_valid (dcache_coalesced_if.rsp_valid),
.out_rsp_mask (dcache_coalesced_if.rsp_data.mask),
.out_rsp_data (dcache_coalesced_if.rsp_data.data),
.out_rsp_tag (dcache_coalesced_if.rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if.rsp_ready)
);
end
end else begin
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
end
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if, lsu_dcache_if[i]);
end
`RESET_RELAY (lsu_adapter_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
`RESET_RELAY (lsu_adapter_reset, reset);
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
@ -320,15 +318,17 @@ module VX_core import VX_gpu_pkg::*; #(
) lsu_adapter (
.clk (clk),
.reset (lsu_adapter_reset),
.lsu_mem_if (dcache_coalesced_if[i]),
.lsu_mem_if (dcache_coalesced_if),
.mem_bus_if (dcache_bus_tmp_if)
);
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
`ifdef PERF_ENABLE
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;

View file

@ -144,6 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
`endif
VX_core #(
.INSTANCE_ID ($sformatf("core")),
.CORE_ID (CORE_ID)
) core (
`SCOPE_IO_BIND (0)

View file

@ -26,13 +26,13 @@
addr+12'h80 : dst = 32'(src[$bits(src)-1:32])
`endif
module VX_csr_data
import VX_gpu_pkg::*;
`ifdef EXT_F_ENABLE
import VX_fpu_pkg::*;
`endif
#(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
input wire clk,
@ -147,7 +147,7 @@ import VX_fpu_pkg::*;
mscratch <= write_data;
end
default: begin
`ASSERT(0, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
`ASSERT(0, ("%t: *** %s invalid CSR write address: %0h (#%0d)", $time, INSTANCE_ID, write_addr, write_uuid));
end
endcase
end
@ -212,21 +212,21 @@ import VX_fpu_pkg::*;
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched_idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched_stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_ALU]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_FPU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`else
`VX_CSR_MPM_SCRB_FPU : read_data_ro_r = '0;
`VX_CSR_MPM_SCRB_FPU_H : read_data_ro_r = '0;
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0));
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);

View file

@ -14,6 +14,7 @@
`include "VX_define.vh"
module VX_csr_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0,
parameter NUM_LANES = 1
) (
@ -36,7 +37,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
VX_execute_if.slave execute_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
@ -72,6 +73,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW);
VX_csr_data #(
.INSTANCE_ID (INSTANCE_ID),
.CORE_ID (CORE_ID)
) csr_data (
.clk (clk),

View file

@ -12,9 +12,8 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_dcr_data import VX_gpu_pkg::*; (
module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
input wire clk,
input wire reset,

View file

@ -12,7 +12,6 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
`ifdef EXT_F_ENABLE
`define USED_IREG(x) \
@ -28,8 +27,8 @@
use_``x = 1
`endif
module VX_decode import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -44,7 +43,7 @@ module VX_decode import VX_gpu_pkg::*; #(
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
@ -145,6 +144,12 @@ module VX_decode import VX_gpu_pkg::*; #(
end
`endif
// Size checks: each per-unit argument type (alu, fpu, lsu, csr, wctl) must have
// exactly the same bit width as op_args_t; on mismatch the message reports
// both widths.
// NOTE(review): presumably these types are alternate views of the shared
// op_args field, which is why the widths must agree -- confirm in VX_gpu_pkg.
`STATIC_ASSERT($bits(alu_args_t) == $bits(op_args_t), ("alu_args_t size mismatch: current=%0d, expected=%0d", $bits(alu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(fpu_args_t) == $bits(op_args_t), ("fpu_args_t size mismatch: current=%0d, expected=%0d", $bits(fpu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(lsu_args_t) == $bits(op_args_t), ("lsu_args_t size mismatch: current=%0d, expected=%0d", $bits(lsu_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(csr_args_t) == $bits(op_args_t), ("csr_args_t size mismatch: current=%0d, expected=%0d", $bits(csr_args_t), $bits(op_args_t)));
`STATIC_ASSERT($bits(wctl_args_t) == $bits(op_args_t), ("wctl_args_t size mismatch: current=%0d, expected=%0d", $bits(wctl_args_t), $bits(op_args_t)));
always @(*) begin
ex_type = '0;
@ -552,7 +557,7 @@ module VX_decode import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%d: core%0d-decode: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, CORE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);

View file

@ -12,10 +12,9 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_dispatch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -24,12 +23,12 @@ module VX_dispatch import VX_gpu_pkg::*; #(
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
`endif
// inputs
VX_operands_if.slave operands_if [`ISSUE_WIDTH],
VX_operands_if.slave operands_if,
// outputs
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
@ -38,8 +37,6 @@ module VX_dispatch import VX_gpu_pkg::*; #(
assign tids[i] = `NT_WIDTH'(i);
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [`NT_WIDTH-1:0] last_active_tid;
VX_find_first #(
@ -47,95 +44,64 @@ module VX_dispatch import VX_gpu_pkg::*; #(
.DATAW (`NT_WIDTH),
.REVERSE (1)
) last_tid_select (
.valid_in (operands_if[i].data.tmask),
.valid_in (operands_if.data.tmask),
.data_in (tids),
.data_out (last_active_tid),
`UNUSED_PIN (valid_out)
);
// One bit per execution unit. Despite the "_reset" name, each bit is driven by
// the ready_in output of that unit's elastic buffer, so it carries
// back-pressure rather than a reset.
wire [`NUM_EX_UNITS-1:0] operands_reset;
// The operand stream is ready when the buffer selected by the instruction's
// ex_type can accept data.
assign operands_if.ready = operands_reset[operands_if.data.ex_type];
`RESET_RELAY (buf_reset, reset);
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`RESET_RELAY (buffer_reset, reset);
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2)
.OUT_REG (2), // 2-cycle EB for area reduction
.LUTRAM (1)
) buffer (
.clk (clk),
.reset (buf_reset),
.valid_in (operands_if[i].valid && (operands_if[i].data.ex_type == j)),
.ready_in (operands_reset[j]),
.data_in (`TO_DISPATCH_DATA(operands_if[i].data, last_active_tid)),
.data_out (dispatch_if[j * `ISSUE_WIDTH + i].data),
.valid_out (dispatch_if[j * `ISSUE_WIDTH + i].valid),
.ready_out (dispatch_if[j * `ISSUE_WIDTH + i].ready)
.reset (buffer_reset),
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
.ready_in (operands_reset[i]),
.data_in ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.wb,
operands_if.data.rd,
last_active_tid,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.data_out (dispatch_if[i].data),
.valid_out (dispatch_if[i].valid),
.ready_out (dispatch_if[i].ready)
);
end
assign operands_if[i].ready = operands_reset[operands_if[i].data.ex_type];
end
`ifdef PERF_ENABLE
wire [`NUM_EX_UNITS-1:0] perf_unit_stalls_per_cycle, perf_unit_stalls_per_cycle_r;
reg [`ISSUE_WIDTH-1:0][`NUM_EX_UNITS-1:0] perf_issue_unit_stalls_per_cycle;
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(*) begin
perf_issue_unit_stalls_per_cycle[i] = '0;
if (operands_if[i].valid && ~operands_if[i].ready) begin
perf_issue_unit_stalls_per_cycle[i][operands_if[i].data.ex_type] = 1;
end
end
end
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`ISSUE_WIDTH),
.OP ("|")
) reduce (
.data_in (perf_issue_unit_stalls_per_cycle),
.data_out (perf_unit_stalls_per_cycle)
);
`BUFFER(perf_unit_stalls_per_cycle_r, perf_unit_stalls_per_cycle);
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;
end else begin
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(perf_unit_stalls_per_cycle_r[i]);
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(operands_if_stall && operands_if.data.ex_type == `EX_BITS'(i));
end
end
end
for (genvar i=0; i < `NUM_EX_UNITS; ++i) begin
assign perf_stalls[i] = perf_stalls_r[i];
end
`endif
`ifdef DBG_TRACE_PIPELINE
for (genvar i=0; i < `ISSUE_WIDTH; ++i) begin
always @(posedge clk) begin
if (operands_if[i].valid && operands_if[i].ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, wis_to_wid(operands_if[i].data.wis, i), {operands_if[i].data.PC, 1'b0}));
trace_ex_type(1, operands_if[i].data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if[i].data.tmask, operands_if[i].data.wb, operands_if[i].data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if[i].data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if[i].data.ex_type, operands_if[i].data.op_type, operands_if[i].data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if[i].data.uuid));
end
end
end
`endif
endmodule

View file

@ -14,6 +14,7 @@
`include "VX_define.vh"
module VX_execute import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
`SCOPE_IO_DECL
@ -55,7 +56,7 @@ module VX_execute import VX_gpu_pkg::*; #(
`RESET_RELAY (sfu_reset, reset);
VX_alu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID))
) alu_unit (
.clk (clk),
.reset (alu_reset),
@ -67,7 +68,7 @@ module VX_execute import VX_gpu_pkg::*; #(
`SCOPE_IO_SWITCH (1)
VX_lsu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID))
) lsu_unit (
`SCOPE_IO_BIND (0)
.clk (clk),
@ -81,7 +82,7 @@ module VX_execute import VX_gpu_pkg::*; #(
`RESET_RELAY (fpu_reset, reset);
VX_fpu_unit #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID))
) fpu_unit (
.clk (clk),
.reset (fpu_reset),
@ -92,6 +93,7 @@ module VX_execute import VX_gpu_pkg::*; #(
`endif
VX_sfu_unit #(
.INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)),
.CORE_ID (CORE_ID)
) sfu_unit (
.clk (clk),

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_fetch import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -30,7 +30,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
// outputs
VX_fetch_if.master fetch_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_VAR (reset)
wire icache_req_valid;
@ -78,9 +78,11 @@ module VX_fetch import VX_gpu_pkg::*; #(
.reset (reset),
.incr (icache_req_fire && schedule_if.data.wid == i),
.decr (fetch_if.ibuf_pop[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
wire ibuf_ready = ~pending_ibuf_full[schedule_if.data.wid];
@ -89,7 +91,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
`endif
`RUNTIME_ASSERT((!schedule_if.valid || schedule_if.data.PC != 0),
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, {schedule_if.data.PC, 1'b0}, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
("%t: *** %s invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, INSTANCE_ID, {schedule_if.data.PC, 1'b0}, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.uuid))
// Icache Request
@ -129,8 +131,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
assign icache_bus_if.rsp_ready = fetch_if.ready;
`ifdef DBG_SCOPE_FETCH
if (CORE_ID == 0) begin
`ifdef SCOPE
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
@ -158,16 +158,6 @@ module VX_fetch import VX_gpu_pkg::*; #(
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({reset, schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC, schedule_if.ready, schedule_if.valid}),
.probe1 ({icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr, icache_bus_if.req_ready, icache_bus_if.req_valid}),
.probe2 ({icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag, icache_bus_if.rsp_ready, icache_bus_if.rsp_valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
@ -177,10 +167,10 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire fetch_fire = fetch_if.valid && fetch_if.ready;
always @(posedge clk) begin
if (schedule_fire) begin
`TRACE(1, ("%d: I$%0d req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, CORE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
`TRACE(1, ("%d: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
end
if (fetch_fire) begin
`TRACE(1, ("%d: I$%0d rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, CORE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
`TRACE(1, ("%d: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
end
end
`endif

View file

@ -14,7 +14,7 @@
`include "VX_fpu_define.vh"
module VX_fpu_unit import VX_fpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -26,7 +26,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_fpu_csr_if.master fpu_csr_if[`NUM_FPU_BLOCKS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
localparam NUM_LANES = `NUM_FPU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -84,12 +84,14 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready;
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
`RESET_RELAY (ibuf_reset, block_reset);
VX_index_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + PID_WIDTH + 1 + 1),
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (block_reset),
.reset (ibuf_reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
@ -226,12 +228,14 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
// send response
`RESET_RELAY (rsp_reset, block_reset);
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (0)
) rsp_buf (
.clk (clk),
.reset (block_reset),
.reset (rsp_reset),
.valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),

View file

@ -14,33 +14,36 @@
`include "VX_define.vh"
module VX_ibuffer import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
// inputs
VX_decode_if.slave decode_if,
// outputs
VX_ibuffer_if.master ibuffer_if [`NUM_WARPS]
VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4);
wire [`NUM_WARPS-1:0] ibuf_ready_in;
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (2) // use a 2-cycle FIFO
.OUT_REG (2) // 2-cycle EB for area reduction
) instr_buf (
.clk (clk),
.reset (reset),
.valid_in (decode_if.valid && decode_if.data.wid == i),
.valid_in (decode_if.valid && decode_if.data.wid == ISSUE_WIS_W'(w)),
.data_in ({
decode_if.data.uuid,
decode_if.data.tmask,
@ -52,15 +55,32 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3}),
.ready_in (ibuf_ready_in[i]),
.valid_out(ibuffer_if[i].valid),
.data_out (ibuffer_if[i].data),
.ready_out(ibuffer_if[i].ready)
decode_if.data.rs3
}),
.ready_in (ibuf_ready_in[w]),
.valid_out(ibuffer_if[w].valid),
.data_out (ibuffer_if[w].data),
.ready_out(ibuffer_if[w].ready)
);
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[i] = ibuffer_if[i].valid && ibuffer_if[i].ready;
assign decode_if.ibuf_pop[w] = ibuffer_if[w].valid && ibuffer_if[w].ready;
`endif
end
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_if_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_if_stall);
end
end
assign perf_stalls = perf_ibf_stalls;
`endif
endmodule

View file

@ -12,10 +12,9 @@
// limitations under the License.
`include "VX_define.vh"
`include "VX_trace.vh"
module VX_issue #(
parameter CORE_ID = 0
module VX_issue import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -23,137 +22,81 @@ module VX_issue #(
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.issue perf_issue_if,
output issue_perf_t issue_perf,
`endif
VX_decode_if.slave decode_if,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
);
VX_ibuffer_if ibuffer_if [`NUM_WARPS]();
VX_scoreboard_if scoreboard_if [`ISSUE_WIDTH]();
VX_operands_if operands_if [`ISSUE_WIDTH]();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.CORE_ID (CORE_ID)
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.CORE_ID (CORE_ID)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_scb_stalls(perf_issue_if.scb_stalls),
.perf_units_uses(perf_issue_if.units_uses),
.perf_sfu_uses (perf_issue_if.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
VX_operands #(
.CORE_ID (CORE_ID)
) operands (
.clk (clk),
.reset (operands_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
VX_dispatch #(
.CORE_ID (CORE_ID)
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
if (CORE_ID == 0) begin
`ifdef SCOPE
wire operands_if_fire = operands_if[0].valid && operands_if[0].ready;
wire operands_if_not_ready = ~operands_if[0].ready;
wire writeback_if_valid = writeback_if[0].valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes({
operands_if[0].data.uuid,
operands_if[0].data.tmask,
operands_if[0].data.ex_type,
operands_if[0].data.op_type,
operands_if[0].data.wb,
operands_if[0].data.rd,
operands_if[0].data.rs1_data,
operands_if[0].data.rs2_data,
operands_if[0].data.rs3_data,
writeback_if[0].data.uuid,
writeback_if[0].data.tmask,
writeback_if[0].data.rd,
writeback_if[0].data.data,
writeback_if[0].data.eop
}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({operands_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, operands_if.PC, operands_if.tmask, operands_if.wid, operands_if.ex_type, operands_if.op_type, operands_if.ready, operands_if.valid}),
.probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
wire decode_stall = decode_if.valid && ~decode_if.ready;
always @(posedge clk) begin
if (reset) begin
perf_ibf_stalls <= '0;
end else begin
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_stall);
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
`endif
wire [ISSUE_ISW_W-1:0] decode_isw = wid_to_isw(decode_if.data.wid);
wire [ISSUE_WIS_W-1:0] decode_wis = wid_to_wis(decode_if.data.wid);
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
assign decode_if.ready = decode_ready_in[decode_isw];
`SCOPE_IO_SWITCH (`ISSUE_WIDTH)
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : issue_slices
VX_decode_if #(
.NUM_WARPS (PER_ISSUE_WARPS)
) per_issue_decode_if();
VX_dispatch_if per_issue_dispatch_if[`NUM_EX_UNITS]();
assign per_issue_decode_if.valid = decode_if.valid && (decode_isw == ISSUE_ISW_W'(issue_id));
assign per_issue_decode_if.data.uuid = decode_if.data.uuid;
assign per_issue_decode_if.data.wid = decode_wis;
assign per_issue_decode_if.data.tmask = decode_if.data.tmask;
assign per_issue_decode_if.data.PC = decode_if.data.PC;
assign per_issue_decode_if.data.ex_type = decode_if.data.ex_type;
assign per_issue_decode_if.data.op_type = decode_if.data.op_type;
assign per_issue_decode_if.data.op_args = decode_if.data.op_args;
assign per_issue_decode_if.data.wb = decode_if.data.wb;
assign per_issue_decode_if.data.rd = decode_if.data.rd;
assign per_issue_decode_if.data.rs1 = decode_if.data.rs1;
assign per_issue_decode_if.data.rs2 = decode_if.data.rs2;
assign per_issue_decode_if.data.rs3 = decode_if.data.rs3;
assign decode_ready_in[issue_id] = per_issue_decode_if.ready;
`ifndef L1_ENABLE
assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop;
`endif
`RESET_RELAY (slice_reset, reset);
VX_issue_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
.ISSUE_ID (issue_id)
) issue_slice (
`SCOPE_IO_BIND(issue_id)
.clk (clk),
.reset (slice_reset),
`ifdef PERF_ENABLE
.issue_perf (per_issue_perf[issue_id]),
`endif
.decode_if (per_issue_decode_if),
.writeback_if (writeback_if[issue_id]),
.dispatch_if (per_issue_dispatch_if)
);
// Assign transposed dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
end
end
endmodule

View file

@ -0,0 +1,159 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_issue_slice: one issue slice of the core's issue stage.
//
// Chains four sub-blocks through local interfaces:
//   decode_if -> VX_ibuffer -> ibuffer_if[] -> VX_scoreboard -> scoreboard_if
//             -> VX_operands -> operands_if -> VX_dispatch -> dispatch_if[]
// writeback_if is fed to both the scoreboard and the operands block.
//
// Parameters:
//   INSTANCE_ID - name prefix; each sub-block receives "<INSTANCE_ID>-<block>"
//                 and the pipeline trace prints it.
//   ISSUE_ID    - index of this slice; only read by the pipeline trace below
//                 (hence the UNUSED_PARAM marker for builds without tracing).
module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter ISSUE_ID = 0
) (
// scope debug ports are declared by this macro (expansion not visible here)
`SCOPE_IO_DECL
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
// performance counters; fields are driven by the ibuffer, scoreboard and operands instances
output issue_perf_t issue_perf,
`endif
// decoded instructions entering the slice
VX_decode_if.slave decode_if,
// register writeback for this slice
VX_writeback_if.slave writeback_if,
// one dispatch port per execution unit
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
);
`UNUSED_PARAM (ISSUE_ID)
// local interfaces linking the sub-blocks
VX_ibuffer_if ibuffer_if [PER_ISSUE_WARPS]();
VX_scoreboard_if scoreboard_if();
VX_operands_if operands_if();
// each sub-block gets its own reset signal derived from `reset`
// NOTE(review): presumably RESET_RELAY registers/replicates the reset to ease fanout - confirm in the macro definition
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
// instruction buffer: decode_if in, one ibuffer_if per warp of this slice out
VX_ibuffer #(
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.ibf_stalls),
`endif
.decode_if (decode_if),
.ibuffer_if (ibuffer_if)
);
// scoreboard: consumes ibuffer_if and writeback_if, produces scoreboard_if
VX_scoreboard #(
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
.perf_sfu_uses (issue_perf.sfu_uses),
`endif
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.scoreboard_if (scoreboard_if)
);
// operands: consumes scoreboard_if and writeback_if, produces operands_if
VX_operands #(
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
) operands (
.clk (clk),
.reset (operands_reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.opd_stalls),
`endif
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.operands_if (operands_if)
);
// dispatch: forwards operands_if onto the per-execution-unit dispatch_if ports
VX_dispatch #(
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
) dispatch (
.clk (clk),
.reset (dispatch_reset),
`ifdef PERF_ENABLE
// the dispatch stall counter is not collected by this slice
`UNUSED_PIN (perf_stalls),
`endif
.operands_if (operands_if),
.dispatch_if (dispatch_if)
);
`ifdef DBG_SCOPE_ISSUE
// scope tap on the operands output and the writeback input
wire operands_if_fire = operands_if.valid && operands_if.ready;
wire operands_if_not_ready = ~operands_if.ready;
wire writeback_if_valid = writeback_if.valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
// PROBEW terms are listed in the same order as the fields of the probes concatenation below
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk (clk),
// scope_reset, scope_bus_in and scope_bus_out are not declared in this module body;
// presumably they come from SCOPE_IO_DECL - confirm
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes ({
operands_if.data.uuid,
operands_if.data.tmask,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.wb,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data,
writeback_if.data.uuid,
writeback_if.data.tmask,
writeback_if.data.rd,
writeback_if.data.data,
writeback_if.data.eop
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
);
`else
`SCOPE_IO_UNUSED()
`endif
`ifdef DBG_TRACE_PIPELINE
// print one trace record for every operands_if handshake (valid && ready)
always @(posedge clk) begin
if (operands_if.valid && operands_if.ready) begin
// NOTE(review): wis_to_wid presumably maps the slice-local warp index plus ISSUE_ID to the core-wide warp id - confirm in VX_gpu_pkg
`TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}));
trace_ex_type(1, operands_if.data.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS);
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid));
end
end
`endif
endmodule

132
hw/rtl/core/VX_issue_top.sv Normal file
View file

@ -0,0 +1,132 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_issue_top: flat-port wrapper around VX_issue.
//
// Exposes the decode, writeback and dispatch interfaces of VX_issue as plain
// wire ports. Each group of ports is copied field by field to/from a local
// interface instance, which is then bound to the VX_issue instance.
//
// Parameters:
//   INSTANCE_ID - instance name forwarded unchanged to VX_issue.
//
// Port groups:
//   decode_*    - single decode channel; decode_ready is the back-pressure output.
//   writeback_* - one entry per issue slice (`ISSUE_WIDTH).
//   dispatch_*  - one entry per (execution unit, issue slice) pair;
//                 dispatch_ready is the back-pressure input.
module VX_issue_top import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = "issue"
) (
    // Clock and reset
    input wire                              clk,
    input wire                              reset,

    // Decode input channel
    input wire                              decode_valid,
    input wire [`UUID_WIDTH-1:0]            decode_uuid,
    input wire [`NW_WIDTH-1:0]              decode_wid,
    input wire [`NUM_THREADS-1:0]           decode_tmask,
    input wire [`PC_BITS-1:0]               decode_PC,
    input wire [`EX_BITS-1:0]               decode_ex_type,
    input wire [`INST_OP_BITS-1:0]          decode_op_type,
    input op_args_t                         decode_op_args,
    input wire                              decode_wb,
    input wire [`NR_BITS-1:0]               decode_rd,
    input wire [`NR_BITS-1:0]               decode_rs1,
    input wire [`NR_BITS-1:0]               decode_rs2,
    input wire [`NR_BITS-1:0]               decode_rs3,
    output wire                             decode_ready,

    // Writeback input channels, one per issue slice
    input wire                              writeback_valid[`ISSUE_WIDTH],
    input wire [`UUID_WIDTH-1:0]            writeback_uuid[`ISSUE_WIDTH],
    input wire [ISSUE_WIS_W-1:0]            writeback_wis[`ISSUE_WIDTH],
    input wire [`NUM_THREADS-1:0]           writeback_tmask[`ISSUE_WIDTH],
    input wire [`PC_BITS-1:0]               writeback_PC[`ISSUE_WIDTH],
    input wire [`NR_BITS-1:0]               writeback_rd[`ISSUE_WIDTH],
    input wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data[`ISSUE_WIDTH],
    input wire                              writeback_sop[`ISSUE_WIDTH],
    input wire                              writeback_eop[`ISSUE_WIDTH],

    // Dispatch output channels, one per (execution unit, issue slice) pair
    output wire                             dispatch_valid[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`UUID_WIDTH-1:0]           dispatch_uuid[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [ISSUE_WIS_W-1:0]           dispatch_wis[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0]          dispatch_tmask[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`PC_BITS-1:0]              dispatch_PC[`NUM_EX_UNITS * `ISSUE_WIDTH],
    // Sized with INST_OP_BITS like decode_op_type: the dispatch ports carry
    // operations for every execution unit, not only the ALU.
    output wire [`INST_OP_BITS-1:0]         dispatch_op_type[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output op_args_t                        dispatch_op_args[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire                             dispatch_wb[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NR_BITS-1:0]              dispatch_rd[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NT_WIDTH-1:0]             dispatch_tid[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
    output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
    input wire                              dispatch_ready[`NUM_EX_UNITS * `ISSUE_WIDTH]
);
    // Local interface instances bound to VX_issue
    VX_decode_if    decode_if();
    VX_dispatch_if  dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
    VX_writeback_if writeback_if[`ISSUE_WIDTH]();

    // Decode: flat ports -> interface
    assign decode_if.valid        = decode_valid;
    assign decode_if.data.uuid    = decode_uuid;
    assign decode_if.data.wid     = decode_wid;
    assign decode_if.data.tmask   = decode_tmask;
    assign decode_if.data.PC      = decode_PC;
    assign decode_if.data.ex_type = decode_ex_type;
    assign decode_if.data.op_type = decode_op_type;
    assign decode_if.data.op_args = decode_op_args;
    assign decode_if.data.wb      = decode_wb;
    assign decode_if.data.rd      = decode_rd;
    assign decode_if.data.rs1     = decode_rs1;
    assign decode_if.data.rs2     = decode_rs2;
    assign decode_if.data.rs3     = decode_rs3;
    assign decode_ready           = decode_if.ready;

    // Writeback: flat ports -> interfaces
    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
        assign writeback_if[i].valid      = writeback_valid[i];
        assign writeback_if[i].data.uuid  = writeback_uuid[i];
        assign writeback_if[i].data.wis   = writeback_wis[i];
        assign writeback_if[i].data.tmask = writeback_tmask[i];
        assign writeback_if[i].data.PC    = writeback_PC[i];
        assign writeback_if[i].data.rd    = writeback_rd[i];
        assign writeback_if[i].data.data  = writeback_data[i];
        assign writeback_if[i].data.sop   = writeback_sop[i];
        assign writeback_if[i].data.eop   = writeback_eop[i];
    end

    // Dispatch: interfaces -> flat ports (ready flows the other way)
    for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin
        assign dispatch_valid[i]    = dispatch_if[i].valid;
        assign dispatch_uuid[i]     = dispatch_if[i].data.uuid;
        assign dispatch_wis[i]      = dispatch_if[i].data.wis;
        assign dispatch_tmask[i]    = dispatch_if[i].data.tmask;
        assign dispatch_PC[i]       = dispatch_if[i].data.PC;
        assign dispatch_op_type[i]  = dispatch_if[i].data.op_type;
        assign dispatch_op_args[i]  = dispatch_if[i].data.op_args;
        assign dispatch_wb[i]       = dispatch_if[i].data.wb;
        assign dispatch_rd[i]       = dispatch_if[i].data.rd;
        assign dispatch_tid[i]      = dispatch_if[i].data.tid;
        assign dispatch_rs1_data[i] = dispatch_if[i].data.rs1_data;
        assign dispatch_rs2_data[i] = dispatch_if[i].data.rs2_data;
        assign dispatch_rs3_data[i] = dispatch_if[i].data.rs3_data;
        assign dispatch_if[i].ready = dispatch_ready[i];
    end

`ifdef PERF_ENABLE
    // Sink for the perf counters produced by VX_issue; not exported by this wrapper.
    // Declared without an initializer: the variable is driven by a module output
    // port, and a variable driven that way must not also have a declaration
    // initializer (IEEE 1800, continuous assignment rules).
    issue_perf_t issue_perf;
`endif

    VX_issue #(
        .INSTANCE_ID (INSTANCE_ID)
    ) issue (
        `SCOPE_IO_BIND (0)
        .clk          (clk),
        .reset        (reset),
    `ifdef PERF_ENABLE
        .issue_perf   (issue_perf),
    `endif
        .decode_if    (decode_if),
        .writeback_if (writeback_if),
        .dispatch_if  (dispatch_if)
    );
endmodule

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_lmem_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -37,9 +37,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_lsu_if[`NUM_LSU_BLOCKS]();
`RESET_RELAY (req_reset, reset);
) lsu_switch_if[`NUM_LSU_BLOCKS]();
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
@ -54,13 +52,15 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
wire req_global_ready;
wire req_local_ready;
`RESET_RELAY (switch_reset, reset);
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (2),
.OUT_REG (1)
) req_global_buf (
.clk (clk),
.reset (req_reset),
.reset (switch_reset),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_global),
.data_in ({
lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
@ -91,7 +91,7 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
.OUT_REG (0)
) req_local_buf (
.clk (clk),
.reset (req_reset),
.reset (switch_reset),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_local),
.data_in ({
lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
@ -103,73 +103,47 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_local_ready),
.valid_out (lmem_lsu_if[i].req_valid),
.valid_out (lsu_switch_if[i].req_valid),
.data_out ({
lmem_lsu_if[i].req_data.mask,
lmem_lsu_if[i].req_data.rw,
lmem_lsu_if[i].req_data.byteen,
lmem_lsu_if[i].req_data.addr,
lmem_lsu_if[i].req_data.atype,
lmem_lsu_if[i].req_data.data,
lmem_lsu_if[i].req_data.tag
lsu_switch_if[i].req_data.mask,
lsu_switch_if[i].req_data.rw,
lsu_switch_if[i].req_data.byteen,
lsu_switch_if[i].req_data.addr,
lsu_switch_if[i].req_data.atype,
lsu_switch_if[i].req_data.data,
lsu_switch_if[i].req_data.tag
}),
.ready_out (lmem_lsu_if[i].req_ready)
.ready_out (lsu_switch_if[i].req_ready)
);
assign lsu_mem_in_if[i].req_ready = (req_global_ready && is_addr_global)
|| (req_local_ready && is_addr_local);
end
`RESET_RELAY (rsp_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
wire rsp_arb_valid;
wire rsp_arb_index;
wire rsp_arb_ready;
VX_generic_arbiter #(
.NUM_REQS (2),
.LOCK_ENABLE (1),
.TYPE ("R")
) arbiter (
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.OUT_BUF (1)
) rsp_arb (
.clk (clk),
.reset (rsp_reset),
.requests ({
lmem_lsu_if[i].rsp_valid,
.reset (switch_reset),
.valid_in ({
lsu_switch_if[i].rsp_valid,
lsu_mem_out_if[i].rsp_valid
}),
.grant_valid (rsp_arb_valid),
.grant_index (rsp_arb_index),
`UNUSED_PIN (grant_onehot),
.grant_unlock(rsp_arb_ready)
);
VX_elastic_buffer #(
.DATAW (RSP_DATAW),
.SIZE (2),
.OUT_REG (0)
) rsp_buf (
.clk (clk),
.reset (rsp_reset),
.valid_in (rsp_arb_valid),
.ready_in ({
lsu_switch_if[i].rsp_ready,
lsu_mem_out_if[i].rsp_ready
}),
.data_in ({
rsp_arb_index ? lmem_lsu_if[i].rsp_data.mask : lsu_mem_out_if[i].rsp_data.mask,
rsp_arb_index ? lmem_lsu_if[i].rsp_data.data : lsu_mem_out_if[i].rsp_data.data,
rsp_arb_index ? lmem_lsu_if[i].rsp_data.tag : lsu_mem_out_if[i].rsp_data.tag
lsu_switch_if[i].rsp_data,
lsu_mem_out_if[i].rsp_data
}),
.ready_in (rsp_arb_ready),
.data_out (lsu_mem_in_if[i].rsp_data),
.valid_out (lsu_mem_in_if[i].rsp_valid),
.data_out ({
lsu_mem_in_if[i].rsp_data.mask,
lsu_mem_in_if[i].rsp_data.data,
lsu_mem_in_if[i].rsp_data.tag
}),
.ready_out (lsu_mem_in_if[i].rsp_ready)
.ready_out (lsu_mem_in_if[i].rsp_ready),
`UNUSED_PIN (sel_out)
);
assign lsu_mem_out_if[i].rsp_ready = rsp_arb_ready && ~rsp_arb_index;
assign lmem_lsu_if[i].rsp_ready = rsp_arb_ready && rsp_arb_index;
end
VX_mem_bus_if #(
@ -177,25 +151,25 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
`RESET_RELAY (adapter_reset, reset);
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
`RESET_RELAY (adapter_reset, reset);
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (1)
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (adapter_reset),
.lsu_mem_if (lmem_lsu_if[i]),
.lsu_mem_if (lsu_switch_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
@ -207,14 +181,15 @@ module VX_lmem_unit import VX_gpu_pkg::*; #(
`RESET_RELAY (lmem_reset, reset);
VX_local_mem #(
.INSTANCE_ID($sformatf("core%0d-lmem", CORE_ID)),
.INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH)
.TAG_WIDTH (LSU_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
.reset (lmem_reset),

View file

@ -13,9 +13,8 @@
`include "VX_define.vh"
module VX_lsu_slice import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter BLOCK_ID = 0
module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -88,7 +87,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] mem_req_addr;
reg [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][LSU_WORD_SIZE*8-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
@ -159,27 +158,30 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_r;
always @(*) begin
mem_req_byteen[i] = '0;
mem_req_byteen_r = '0;
case (`INST_LSU_WSIZE(execute_if.data.op_type))
0: begin // 8-bit
mem_req_byteen[i][req_align[i]] = 1'b1;
mem_req_byteen_r[req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {LSU_WORD_SIZE{1'b1}};
// 3: 64 bit
default : mem_req_byteen_r = {LSU_WORD_SIZE{1'b1}};
endcase
end
assign mem_req_byteen[i] = mem_req_byteen_r;
end
// memory misalignment not supported!
@ -312,7 +314,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched%0d", CORE_ID, BLOCK_ID)),
.INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)),
.CORE_REQS (NUM_LANES),
.MEM_CHANNELS(NUM_LANES),
.WORD_SIZE (LSU_WORD_SIZE),
@ -504,11 +506,11 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (execute_if.valid && fence_lock) begin
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
`TRACE(1, ("%d: *** %s fence wait\n", $time, INSTANCE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE(1, ("%d: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
@ -516,7 +518,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE(1, ("%d: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
@ -524,8 +526,8 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE(1, ("%d: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
end
@ -533,36 +535,20 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
`endif
`ifdef DBG_SCOPE_LSU
if (CORE_ID == 0 && BLOCK_ID == 0) begin
`ifdef SCOPE
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (`UUID_WIDTH+NUM_LANES*(`XLEN+4+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
.PROBEW (1 + NUM_LANES*(`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE*8) + `UUID_WIDTH + NUM_LANES*LSU_WORD_SIZE*8 + `UUID_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes({execute_if.data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
.probes ({mem_req_rw, full_addr, mem_req_byteen, mem_req_data, execute_if.data.uuid, rsp_data, rsp_uuid}),
.bus_in (scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
wire [31:0] full_addr_0 = full_addr[0];
wire [31:0] mem_req_data_0 = mem_req_data[0];
wire [31:0] rsp_data_0 = rsp_data[0];
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({mem_req_data_0, execute_if.data.uuid, execute_if.data.wid, execute_if.data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, mem_rsp_mask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
.probe2 ({lsu_mem_if.req_data.data, lsu_mem_if.req_data.tag, lsu_mem_if.req_data.byteen, lsu_mem_if.req_data.addr, lsu_mem_if.req_data.rw, lsu_mem_if.req_ready, lsu_mem_if.req_valid}),
.probe3 ({lsu_mem_if.rsp_data.data, lsu_mem_if.rsp_data.tag, lsu_mem_if.rsp_ready, lsu_mem_if.rsp_valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_lsu_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -32,7 +32,6 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
localparam NUM_LANES = `NUM_LSU_LANES;
`ifdef SCOPE
localparam scope_lsu = 0;
`SCOPE_IO_SWITCH (BLOCK_SIZE);
`endif
@ -55,17 +54,16 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : lsu_slices
`RESET_RELAY (block_reset, reset);
`RESET_RELAY (slice_reset, reset);
VX_lsu_slice #(
.CORE_ID (CORE_ID),
.BLOCK_ID (block_idx)
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx))
) lsu_slice(
`SCOPE_IO_BIND (scope_lsu+block_idx)
`SCOPE_IO_BIND (block_idx)
.clk (clk),
.reset (block_reset),
.reset (slice_reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.lsu_mem_if (lsu_mem_if[block_idx])

View file

@ -14,29 +14,288 @@
`include "VX_define.vh"
module VX_operands import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = "",
parameter NUM_BANKS = 4,
parameter OUT_BUF = 4 // using 2-cycle EB for area reduction
) (
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_scoreboard_if.slave scoreboard_if [`ISSUE_WIDTH],
VX_operands_if.master operands_if [`ISSUE_WIDTH]
`ifdef PERF_ENABLE
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
`endif
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_operands_if.master operands_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_REGS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam METADATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS;
localparam DATAW = `UUID_WIDTH + METADATAW + 3 * `NUM_THREADS * `XLEN;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
localparam BYTEENW = `NUM_THREADS * XLEN_SIZE;
`UNUSED_VAR (writeback_if.data.sop)
wire [NUM_SRC_REGS-1:0] src_valid;
wire [NUM_SRC_REGS-1:0] req_in_valid;
wire [NUM_SRC_REGS-1:0] req_in_ready;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid_n, gpr_rd_ready;
reg [NUM_BANKS-1:0] gpr_rd_valid;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr_n;
reg [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx_n;
reg [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx;
wire pipe_in_ready;
reg pipe_out_valid;
wire pipe_out_ready;
reg [`UUID_WIDTH-1:0] pipe_out_uuid;
reg [METADATAW-1:0] pipe_out_data;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data, src_data_n;
reg [NUM_SRC_REGS-1:0] data_fetched;
reg has_collision, has_collision_n;
wire stg_in_valid, stg_in_ready;
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
scoreboard_if.data.rs1};
// Per source operand: split the register index into a bank selector and an
// in-bank read address.
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
// In-bank address: upper register-index bits, with the warp-in-slice index
// (wis) appended as the low bits when ISSUE_WIS is non-zero.
if (ISSUE_WIS != 0) begin
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
end
// Bank selector: low BANK_SEL_BITS of the register index; tied to zero when
// there is a single bank.
if (NUM_BANKS != 1) begin
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
end else begin
assign req_bank_idx[i] = '0;
end
end
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched[i];
end
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
.reset (reset),
`UNUSED_PIN(collisions),
.valid_in (req_in_valid),
.data_in (req_in_data),
.sel_in (req_bank_idx),
.ready_in (req_in_ready),
.valid_out (gpr_rd_valid_n),
.data_out (gpr_rd_addr_n),
.sel_out (gpr_rd_req_idx_n),
.ready_out (gpr_rd_ready)
);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
assign gpr_rd_ready = {NUM_BANKS{stg_in_ready}};
`RESET_RELAY (slice_reset, reset);
// Bank-conflict detection: has_collision_n is raised when any two distinct
// source operands that are still pending (src_valid) select the same bank.
// Every unordered pair (lo, hi) with lo < hi is examined exactly once.
always @(*) begin
    has_collision_n = 0;
    for (integer lo = 0; lo < NUM_SRC_REGS; ++lo) begin
        for (integer hi = lo + 1; hi < NUM_SRC_REGS; ++hi) begin
            has_collision_n = has_collision_n
                            | (src_valid[lo] && src_valid[hi] && (req_bank_idx[lo] == req_bank_idx[hi]));
        end
    end
end
VX_gpr_slice #(
.CORE_ID (CORE_ID)
) gpr_slice (
// Merge the data returned by the register banks into the per-operand slots.
// Slots not addressed by any valid bank response keep their value from src_data.
always @(*) begin
src_data_n = src_data;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid[b]) begin
// gpr_rd_req_idx[b] names the source-operand slot this bank's read belongs to
src_data_n[gpr_rd_req_idx[b]] = gpr_rd_data[b];
end
end
end
wire pipe_stall = pipe_out_valid && ~pipe_out_ready;
assign pipe_in_ready = ~pipe_stall;
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire stg_in_fire = stg_in_valid && stg_in_ready;
// Pipeline registers between the scoreboard input and the output buffer.
// Control/state registers (first section) are cleared by reset; the metadata
// and read-address registers (second section) have no reset value.
// Every register in this block holds its value while pipe_stall is asserted.
always @(posedge clk) begin
if (reset) begin
pipe_out_valid <= 0;
gpr_rd_valid <= '0;
data_fetched <= '0;
src_data <= '0;
end else begin
if (~pipe_stall) begin
pipe_out_valid <= scoreboard_if.valid;
gpr_rd_valid <= gpr_rd_valid_n;
// data_fetched accumulates the operands whose read request was accepted
// (req_in_ready) and is cleared once the scoreboard input is accepted.
if (scoreboard_if.ready) begin
data_fetched <= '0;
end else begin
data_fetched <= data_fetched | req_in_ready;
end
// Operand data is cleared when the staged entry is handed to the output
// buffer; otherwise it captures the merged bank read data.
if (stg_in_fire) begin
src_data <= '0;
end else begin
src_data <= src_data_n;
end
end
end
if (~pipe_stall) begin
pipe_out_uuid <= scoreboard_if.data.uuid;
// Field order here must match the unpacking order on out_buffer's data_out.
pipe_out_data <= {
scoreboard_if.data.wis,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.wb,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.rd
};
has_collision <= has_collision_n;
gpr_rd_addr <= gpr_rd_addr_n;
gpr_rd_req_idx <= gpr_rd_req_idx_n;
end
end
assign pipe_out_ready = stg_in_ready;
assign stg_in_valid = pipe_out_valid && ~has_collision;
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (1)
) out_buffer (
.clk (clk),
.reset (slice_reset),
.writeback_if (writeback_if[i]),
.scoreboard_if(scoreboard_if[i]),
.operands_if (operands_if[i])
.reset (reset),
.valid_in (stg_in_valid),
.ready_in (stg_in_ready),
.data_in ({
pipe_out_uuid,
pipe_out_data,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.data_out ({
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.wb,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
}),
.valid_out (operands_if.valid),
.ready_out (operands_if.ready)
);
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
end else begin
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
end
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
if (NUM_BANKS != 1) begin
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
end else begin
assign gpr_wr_bank_idx = '0;
end
// Register-file write gate.
// With GPR_RESET defined, wr_enabled is initialised to 0 and is set (and never
// cleared) the first time reset is sampled high on a clock edge.
// Without GPR_RESET, the gate is a constant 1.
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
for (genvar b = 0; b < NUM_BANKS; ++b) begin
wire gpr_wr_enabled;
if (BANK_SEL_BITS != 0) begin
assign gpr_wr_enabled = wr_enabled
&& writeback_if.valid
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
end else begin
assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
end
wire [BYTEENW-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
`ifdef GPR_RESET
VX_dp_ram_rst #(
`else
VX_dp_ram #(
`endif
.DATAW (`XLEN * `NUM_THREADS),
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
.WRENW (BYTEENW),
.NO_RWCHECK (1)
) gpr_ram (
.clk (clk),
`ifdef GPR_RESET
.reset (reset),
`endif
.read (1'b1),
.wren (wren),
.write (gpr_wr_enabled),
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr[b]),
.rdata (gpr_rd_data[b])
);
end
// Performance counter (PERF_ENABLE builds only): counts the cycles in which a
// valid scoreboard request arrives while the pipeline can accept input
// (pipe_in_ready) but a bank collision is detected (has_collision_n).
// The running count is exported on perf_stalls.
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] collisions_r;
always @(posedge clk) begin
if (reset) begin
collisions_r <= '0;
end else begin
// the 1-bit condition is cast to counter width, adding 0 or 1 per cycle
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_in_ready && has_collision_n);
end
end
assign perf_stalls = collisions_r;
`endif
endmodule

View file

@ -14,13 +14,14 @@
`include "VX_define.vh"
module VX_schedule import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
VX_pipeline_perf_if.schedule perf_schedule_if,
output sched_perf_t sched_perf,
`endif
// configuration
@ -42,6 +43,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
// status
output wire busy
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (CORE_ID)
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // updated when a warp is activated or disabled
@ -290,7 +292,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
`RESET_RELAY (split_join_reset, reset);
VX_split_join #(
.CORE_ID (CORE_ID)
.INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID))
) split_join (
.clk (clk),
.reset (split_join_reset),
@ -368,24 +370,42 @@ module VX_schedule import VX_gpu_pkg::*; #(
assign schedule_if.data.uuid = instr_uuid;
// Track pending instructions per warp
// Per-warp increment strobes for the pending-instruction counters:
// all zeros by default; when schedule_if_fire is asserted, only the bit
// indexed by schedule_if.data.wid is set.
reg [`NUM_WARPS-1:0] per_warp_incr;
always @(*) begin
per_warp_incr = 0;
if (schedule_if_fire) begin
per_warp_incr[schedule_if.data.wid] = 1;
end
end
wire [`NUM_WARPS-1:0] pending_warp_empty;
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (pending_instr_reset, reset);
wire no_pending_instr;
VX_pending_instr #(
.CTR_WIDTH (12),
.DECR_COUNT (`ISSUE_WIDTH),
VX_pending_size #(
.SIZE (4096),
.ALM_EMPTY (1)
) pending_instr(
) counter (
.clk (clk),
.reset (pending_instr_reset),
.incr (schedule_if_fire),
.incr_wid (schedule_if.data.wid),
.decr (commit_sched_if.committed),
.decr_wid (commit_sched_if.committed_wid),
.alm_empty_wid (sched_csr_if.alm_empty_wid),
.alm_empty (sched_csr_if.alm_empty),
.empty (no_pending_instr)
.incr (per_warp_incr[i]),
.decr (commit_sched_if.committed_warps[i]),
.empty (pending_warp_empty[i]),
.alm_empty (pending_warp_alm_empty[i]),
`UNUSED_PIN (full),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (size)
);
end
assign sched_csr_if.alm_empty = pending_warp_alm_empty[sched_csr_if.alm_empty_wid];
wire no_pending_instr = (& pending_warp_empty);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
@ -412,7 +432,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
end
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps))
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** %s timeout: stalled_warps=%b", $time, INSTANCE_ID, stalled_warps))
`ifdef PERF_ENABLE
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
@ -431,8 +451,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
assign perf_schedule_if.sched_idles = perf_sched_idles;
assign perf_schedule_if.sched_stalls = perf_sched_stalls;
assign sched_perf.idles = perf_sched_idles;
assign sched_perf.stalls = perf_sched_stalls;
`endif
endmodule

View file

@ -14,39 +14,37 @@
`include "VX_define.vh"
module VX_scoreboard import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output reg [`PERF_CTR_BITS-1:0] perf_scb_stalls,
output reg [`PERF_CTR_BITS-1:0] perf_units_uses [`NUM_EX_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_sfu_uses [`NUM_SFU_UNITS],
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
`endif
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_ibuffer_if.slave ibuffer_if [`NUM_WARPS],
VX_scoreboard_if.master scoreboard_if [`ISSUE_WIDTH]
VX_writeback_if.slave writeback_if,
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
reg [PER_ISSUE_WARPS-1:0] operands_ready;
`ifdef PERF_ENABLE
reg [`NUM_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
reg [`NUM_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
wire [`NUM_WARPS-1:0] perf_issue_stalls_per_cycle;
wire [`CLOG2(`NUM_WARPS+1)-1:0] perf_stalls_per_cycle, perf_stalls_per_cycle_r;
`POP_COUNT(perf_stalls_per_cycle, perf_issue_stalls_per_cycle);
VX_reduce #(
.DATAW_IN (`NUM_EX_UNITS),
.N (`NUM_WARPS),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_units_reduce (
.data_in (perf_inuse_units_per_cycle),
@ -55,22 +53,28 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_reduce #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (`NUM_WARPS),
.N (PER_ISSUE_WARPS),
.OP ("|")
) perf_sfu_reduce (
.data_in (perf_inuse_sfu_per_cycle),
.data_out (perf_sfu_per_cycle)
);
`BUFFER(perf_stalls_per_cycle_r, perf_stalls_per_cycle);
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(`NUM_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(`NUM_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign stg_valid_in[w] = staging_if[w].valid;
end
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
always @(posedge clk) begin
if (reset) begin
perf_scb_stalls <= '0;
perf_stalls <= '0;
end else begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(perf_stalls_per_cycle_r);
perf_stalls <= perf_stalls + `PERF_CTR_BITS'(perf_stall_per_cycle);
end
end
@ -95,138 +99,121 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
VX_ibuffer_if staging_if [`NUM_WARPS]();
wire [`NUM_WARPS-1:0][3:0] staging_opds_busy;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
) stanging_buf (
.clk (clk),
.reset (reset),
.valid_in (ibuffer_if[i].valid),
.data_in (ibuffer_if[i].data),
.ready_in (ibuffer_if[i].ready),
.valid_out(staging_if[i].valid),
.data_out (staging_if[i].data),
.ready_out(staging_if[i].ready)
.valid_in (ibuffer_if[w].valid),
.data_in (ibuffer_if[w].data),
.ready_in (ibuffer_if[w].ready),
.valid_out(staging_if[w].valid),
.data_out (staging_if[w].data),
.ready_out(staging_if[w].ready)
);
end
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
reg [`NUM_REGS-1:0] inuse_regs;
reg [3:0] operands_busy_r, operands_busy_n;
reg [3:0] operands_busy, operands_busy_n;
localparam iw = i % `ISSUE_WIDTH;
localparam wis = i / `ISSUE_WIDTH;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
wire ibuffer_fire = ibuffer_if[i].valid && ibuffer_if[i].ready;
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
wire staging_fire = staging_if[i].valid && staging_if[i].ready;
wire writeback_fire = writeback_if[iw].valid
&& (writeback_if[iw].data.wis == ISSUE_WIS_W'(wis))
&& writeback_if[iw].data.eop;
wire writeback_fire = writeback_if.valid
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
reg [`SFU_WIDTH-1:0] sfu_type;
always @(*) begin
case (staging_if[i].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_type = `SFU_CSRS;
default: sfu_type = `SFU_WCTL;
endcase
end
always @(*) begin
perf_inuse_units_per_cycle[i] = '0;
perf_inuse_sfu_per_cycle[i] = '0;
if (staging_if[i].valid) begin
if (operands_busy_r[0]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rd]] = 1;
if (inuse_units[staging_if[i].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rd]] = 1;
perf_inuse_units_per_cycle[w] = '0;
perf_inuse_sfu_per_cycle[w] = '0;
if (staging_if[w].valid) begin
if (operands_busy[0]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
end
end
if (operands_busy_r[1]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs1]] = 1;
if (inuse_units[staging_if[i].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs1]] = 1;
if (operands_busy[1]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
end
end
if (operands_busy_r[2]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs2]] = 1;
if (inuse_units[staging_if[i].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs2]] = 1;
if (operands_busy[2]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
end
end
if (operands_busy_r[3]) begin
perf_inuse_units_per_cycle[i][inuse_units[staging_if[i].data.rs3]] = 1;
if (inuse_units[staging_if[i].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[i][inuse_sfu[staging_if[i].data.rs3]] = 1;
if (operands_busy[3]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
end
end
end
end
assign perf_issue_stalls_per_cycle[i] = staging_if[i].valid && ~staging_if[i].ready;
`endif
always @(*) begin
operands_busy_n = operands_busy_r;
operands_busy_n = operands_busy;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[i].data.rs3],
inuse_regs[ibuffer_if[i].data.rs2],
inuse_regs[ibuffer_if[i].data.rs1],
inuse_regs[ibuffer_if[i].data.rd]
inuse_regs[ibuffer_if[w].data.rs3],
inuse_regs[ibuffer_if[w].data.rs2],
inuse_regs[ibuffer_if[w].data.rs1],
inuse_regs[ibuffer_if[w].data.rd]
};
end
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rd) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs1) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs2) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if[iw].data.rd == ibuffer_if[i].data.rs3) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if[iw].data.rd == staging_if[i].data.rd) begin
if (writeback_if.data.rd == staging_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs1) begin
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs2) begin
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if[iw].data.rd == staging_if[i].data.rs3) begin
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end
end
if (staging_fire && staging_if[i].data.wb) begin
if (staging_if[i].data.rd == ibuffer_if[i].data.rd) begin
if (staging_fire && staging_if[w].data.wb) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs1) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs2) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 1;
end
if (staging_if[i].data.rd == ibuffer_if[i].data.rs3) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 1;
end
end
@ -237,25 +224,24 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs <= '0;
end else begin
if (writeback_fire) begin
inuse_regs[writeback_if[iw].data.rd] <= 0;
inuse_regs[writeback_if.data.rd] <= 0;
end
if (staging_fire && staging_if[i].data.wb) begin
inuse_regs[staging_if[i].data.rd] <= 1;
if (staging_fire && staging_if[w].data.wb) begin
inuse_regs[staging_if[w].data.rd] <= 1;
end
end
operands_busy_r <= operands_busy_n;
operands_busy <= operands_busy_n;
operands_ready[w] <= ~(| operands_busy_n);
`ifdef PERF_ENABLE
if (staging_fire && staging_if[i].data.wb) begin
inuse_units[staging_if[i].data.rd] <= staging_if[i].data.ex_type;
if (staging_if[i].data.ex_type == `EX_SFU) begin
inuse_sfu[staging_if[i].data.rd] <= sfu_type;
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
if (staging_if[w].data.ex_type == `EX_SFU) begin
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
end
end
`endif
end
assign staging_opds_busy[i] = operands_busy_r;
`ifdef SIMULATION
reg [31:0] timeout_ctr;
@ -263,11 +249,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
if (reset) begin
timeout_ctr <= '0;
end else begin
if (staging_if[i].valid && ~staging_if[i].ready) begin
if (staging_if[w].valid && ~staging_if[w].ready) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(3, ("%d: *** core%0d-scoreboard-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, CORE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy_r, staging_if[i].data.uuid));
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_fire) begin
@ -277,59 +263,57 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** core%0d-scoreboard-timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, CORE_ID, i, {staging_if[i].data.PC, 1'b0}, staging_if[i].data.tmask, timeout_ctr,
operands_busy_r, staging_if[i].data.uuid));
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[iw].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, i, {writeback_if[iw].data.PC, 1'b0}, writeback_if[iw].data.tmask, writeback_if[iw].data.rd, writeback_if[iw].data.uuid));
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
`endif
end
`RESET_RELAY (arb_reset, reset);
wire [PER_ISSUE_WARPS-1:0] arb_valid_in;
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
wire [ISSUE_RATIO-1:0] valid_in;
wire [ISSUE_RATIO-1:0][DATAW-1:0] data_in;
wire [ISSUE_RATIO-1:0] ready_in;
for (genvar j = 0; j < ISSUE_RATIO; ++j) begin
wire operands_ready = ~(| staging_opds_busy[j * `ISSUE_WIDTH + i]);
assign valid_in[j] = staging_if[j * `ISSUE_WIDTH + i].valid && operands_ready;
assign data_in[j] = staging_if[j * `ISSUE_WIDTH + i].data;
assign staging_if[j * `ISSUE_WIDTH + i].ready = ready_in[j] && operands_ready;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
assign arb_data_in[w] = staging_if[w].data;
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (ISSUE_RATIO),
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("R"),
.OUT_BUF (2)
.ARBITER ("F"),
.LUTRAM (1),
.OUT_BUF (4) // using 2-cycle EB for area reduction
) out_arb (
.clk (clk),
.reset (arb_reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
.valid_in (arb_valid_in),
.ready_in (arb_ready_in),
.data_in (arb_data_in),
.data_out ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_args,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.rd,
scoreboard_if[i].data.rs1,
scoreboard_if[i].data.rs2,
scoreboard_if[i].data.rs3
scoreboard_if.data.uuid,
scoreboard_if.data.tmask,
scoreboard_if.data.PC,
scoreboard_if.data.ex_type,
scoreboard_if.data.op_type,
scoreboard_if.data.op_args,
scoreboard_if.data.wb,
scoreboard_if.data.rd,
scoreboard_if.data.rs1,
scoreboard_if.data.rs2,
scoreboard_if.data.rs3
}),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready),
.sel_out (scoreboard_if[i].data.wis)
.valid_out (scoreboard_if.valid),
.ready_out (scoreboard_if.ready),
.sel_out (scoreboard_if.data.wis)
);
end
endmodule

View file

@ -14,6 +14,7 @@
`include "VX_define.vh"
module VX_sfu_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_ID = 0
) (
input wire clk,
@ -39,7 +40,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_warp_ctl_if.master warp_ctl_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
@ -83,7 +84,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`RESET_RELAY (wctl_reset, reset);
VX_wctl_unit #(
.CORE_ID (CORE_ID),
.INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)),
.NUM_LANES (NUM_LANES)
) wctl_unit (
.clk (clk),
@ -111,6 +112,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
`RESET_RELAY (csr_reset, reset);
VX_csr_unit #(
.INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)),
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) csr_unit (

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_split_join import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
@ -31,7 +31,7 @@ module VX_split_join import VX_gpu_pkg::*; #(
input wire [`NW_WIDTH-1:0] stack_wid,
output wire [`DV_STACK_SIZEW-1:0] stack_ptr
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [`DV_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];

399
hw/rtl/core/VX_trace_pkg.sv Normal file
View file

@ -0,0 +1,399 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TRACE_PKG_VH
`define VX_TRACE_PKG_VH
`include "VX_define.vh"
package VX_trace_pkg;
`ifdef SIMULATION
`ifdef SV_DPI
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
`endif
import VX_gpu_pkg::*;
// Trace the short name of an execution unit.
//   level   - passed through unchanged to `TRACE
//   ex_type - execution-unit selector, matched against the `EX_* constants
// Any value other than the four listed constants is traced as "?".
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
case (ex_type)
`EX_ALU: `TRACE(level, ("ALU"));
`EX_LSU: `TRACE(level, ("LSU"));
`EX_FPU: `TRACE(level, ("FPU"));
`EX_SFU: `TRACE(level, ("SFU"));
// unrecognized selector
default: `TRACE(level, ("?"));
endcase
endtask
// Trace the assembly mnemonic of a decoded instruction.
//
//   level   - passed through unchanged to every `TRACE call
//   ex_type - execution-unit selector (`EX_ALU / `EX_LSU / `EX_FPU / `EX_SFU)
//   op_type - unit-specific opcode; it is size-cast to the selected unit's
//             opcode width before being matched
//   op_args - decoded instruction modifiers; only the field group that
//             corresponds to ex_type (alu / lsu / fpu / wctl / csr) is read
//
// Every combination that is not explicitly listed is traced as "?", so a
// trace line always contains a mnemonic field.
task trace_ex_op(input int level,
                 input [`EX_BITS-1:0] ex_type,
                 input [`INST_OP_BITS-1:0] op_type,
                 input VX_gpu_pkg::op_args_t op_args
);
    case (ex_type)
    `EX_ALU: begin
        // alu.xtype picks the ALU sub-unit; is_w selects the *W mnemonics and
        // use_imm selects the immediate mnemonics.
        case (op_args.alu.xtype)
        `ALU_TYPE_ARITH: begin
            if (op_args.alu.is_w) begin
                if (op_args.alu.use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADDIW"));
                    `INST_ALU_SLL: `TRACE(level, ("SLLIW"));
                    `INST_ALU_SRL: `TRACE(level, ("SRLIW"));
                    `INST_ALU_SRA: `TRACE(level, ("SRAIW"));
                    default:       `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADDW"));
                    `INST_ALU_SUB: `TRACE(level, ("SUBW"));
                    `INST_ALU_SLL: `TRACE(level, ("SLLW"));
                    `INST_ALU_SRL: `TRACE(level, ("SRLW"));
                    `INST_ALU_SRA: `TRACE(level, ("SRAW"));
                    default:       `TRACE(level, ("?"));
                    endcase
                end
            end else begin
                if (op_args.alu.use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD:   `TRACE(level, ("ADDI"));
                    `INST_ALU_SLL:   `TRACE(level, ("SLLI"));
                    `INST_ALU_SRL:   `TRACE(level, ("SRLI"));
                    `INST_ALU_SRA:   `TRACE(level, ("SRAI"));
                    `INST_ALU_SLT:   `TRACE(level, ("SLTI"));
                    `INST_ALU_SLTU:  `TRACE(level, ("SLTIU"));
                    `INST_ALU_XOR:   `TRACE(level, ("XORI"));
                    `INST_ALU_OR:    `TRACE(level, ("ORI"));
                    `INST_ALU_AND:   `TRACE(level, ("ANDI"));
                    `INST_ALU_LUI:   `TRACE(level, ("LUI"));
                    `INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
                    default:         `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD:  `TRACE(level, ("ADD"));
                    `INST_ALU_SUB:  `TRACE(level, ("SUB"));
                    `INST_ALU_SLL:  `TRACE(level, ("SLL"));
                    `INST_ALU_SRL:  `TRACE(level, ("SRL"));
                    `INST_ALU_SRA:  `TRACE(level, ("SRA"));
                    `INST_ALU_SLT:  `TRACE(level, ("SLT"));
                    `INST_ALU_SLTU: `TRACE(level, ("SLTU"));
                    `INST_ALU_XOR:  `TRACE(level, ("XOR"));
                    `INST_ALU_OR:   `TRACE(level, ("OR"));
                    `INST_ALU_AND:  `TRACE(level, ("AND"));
                    `INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"));
                    `INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"));
                    default:        `TRACE(level, ("?"));
                    endcase
                end
            end
        end
        `ALU_TYPE_BRANCH: begin
            case (`INST_BR_BITS'(op_type))
            `INST_BR_EQ:     `TRACE(level, ("BEQ"));
            `INST_BR_NE:     `TRACE(level, ("BNE"));
            `INST_BR_LT:     `TRACE(level, ("BLT"));
            `INST_BR_GE:     `TRACE(level, ("BGE"));
            `INST_BR_LTU:    `TRACE(level, ("BLTU"));
            `INST_BR_GEU:    `TRACE(level, ("BGEU"));
            `INST_BR_JAL:    `TRACE(level, ("JAL"));
            `INST_BR_JALR:   `TRACE(level, ("JALR"));
            `INST_BR_ECALL:  `TRACE(level, ("ECALL"));
            `INST_BR_EBREAK: `TRACE(level, ("EBREAK"));
            `INST_BR_URET:   `TRACE(level, ("URET"));
            `INST_BR_SRET:   `TRACE(level, ("SRET"));
            `INST_BR_MRET:   `TRACE(level, ("MRET"));
            default:         `TRACE(level, ("?"));
            endcase
        end
        `ALU_TYPE_MULDIV: begin
            if (op_args.alu.is_w) begin
                case (`INST_M_BITS'(op_type))
                `INST_M_MUL:  `TRACE(level, ("MULW"));
                `INST_M_DIV:  `TRACE(level, ("DIVW"));
                `INST_M_DIVU: `TRACE(level, ("DIVUW"));
                `INST_M_REM:  `TRACE(level, ("REMW"));
                `INST_M_REMU: `TRACE(level, ("REMUW"));
                default:      `TRACE(level, ("?"));
                endcase
            end else begin
                case (`INST_M_BITS'(op_type))
                `INST_M_MUL:    `TRACE(level, ("MUL"));
                `INST_M_MULH:   `TRACE(level, ("MULH"));
                `INST_M_MULHSU: `TRACE(level, ("MULHSU"));
                `INST_M_MULHU:  `TRACE(level, ("MULHU"));
                `INST_M_DIV:    `TRACE(level, ("DIV"));
                `INST_M_DIVU:   `TRACE(level, ("DIVU"));
                `INST_M_REM:    `TRACE(level, ("REM"));
                `INST_M_REMU:   `TRACE(level, ("REMU"));
                default:        `TRACE(level, ("?"));
                endcase
            end
        end
        default: `TRACE(level, ("?"));
        endcase
    end
    `EX_LSU: begin
        // lsu.is_float selects the F-prefixed load/store mnemonics.
        if (op_args.lsu.is_float) begin
            case (`INST_LSU_BITS'(op_type))
            `INST_LSU_LW: `TRACE(level, ("FLW"));
            `INST_LSU_LD: `TRACE(level, ("FLD"));
            `INST_LSU_SW: `TRACE(level, ("FSW"));
            `INST_LSU_SD: `TRACE(level, ("FSD"));
            default:      `TRACE(level, ("?"));
            endcase
        end else begin
            case (`INST_LSU_BITS'(op_type))
            `INST_LSU_LB:    `TRACE(level, ("LB"));
            `INST_LSU_LH:    `TRACE(level, ("LH"));
            `INST_LSU_LW:    `TRACE(level, ("LW"));
            `INST_LSU_LD:    `TRACE(level, ("LD"));
            `INST_LSU_LBU:   `TRACE(level, ("LBU"));
            `INST_LSU_LHU:   `TRACE(level, ("LHU"));
            `INST_LSU_LWU:   `TRACE(level, ("LWU"));
            `INST_LSU_SB:    `TRACE(level, ("SB"));
            `INST_LSU_SH:    `TRACE(level, ("SH"));
            `INST_LSU_SW:    `TRACE(level, ("SW"));
            `INST_LSU_SD:    `TRACE(level, ("SD"));
            `INST_LSU_FENCE: `TRACE(level, ("FENCE"));
            default:         `TRACE(level, ("?"));
            endcase
        end
    end
    `EX_FPU: begin
        // fpu.fmt[0] selects the ".D" spelling over ".S"; for the integer
        // conversions fpu.fmt[1] selects the "L"/"LU" spelling over "W"/"WU".
        case (`INST_FPU_BITS'(op_type))
        `INST_FPU_ADD: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FADD.D"));
            else
                `TRACE(level, ("FADD.S"));
        end
        `INST_FPU_SUB: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FSUB.D"));
            else
                `TRACE(level, ("FSUB.S"));
        end
        `INST_FPU_MUL: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FMUL.D"));
            else
                `TRACE(level, ("FMUL.S"));
        end
        `INST_FPU_DIV: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FDIV.D"));
            else
                `TRACE(level, ("FDIV.S"));
        end
        `INST_FPU_SQRT: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FSQRT.D"));
            else
                `TRACE(level, ("FSQRT.S"));
        end
        `INST_FPU_MADD: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FMADD.D"));
            else
                `TRACE(level, ("FMADD.S"));
        end
        `INST_FPU_MSUB: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FMSUB.D"));
            else
                `TRACE(level, ("FMSUB.S"));
        end
        `INST_FPU_NMADD: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FNMADD.D"));
            else
                `TRACE(level, ("FNMADD.S"));
        end
        `INST_FPU_NMSUB: begin
            if (op_args.fpu.fmt[0])
                `TRACE(level, ("FNMSUB.D"));
            else
                `TRACE(level, ("FNMSUB.S"));
        end
        `INST_FPU_CMP: begin
            // the comparison kind is carried in the low two bits of frm
            if (op_args.fpu.fmt[0]) begin
                case (op_args.fpu.frm[1:0])
                0:       `TRACE(level, ("FLE.D"));
                1:       `TRACE(level, ("FLT.D"));
                2:       `TRACE(level, ("FEQ.D"));
                default: `TRACE(level, ("?"));
                endcase
            end else begin
                case (op_args.fpu.frm[1:0])
                0:       `TRACE(level, ("FLE.S"));
                1:       `TRACE(level, ("FLT.S"));
                2:       `TRACE(level, ("FEQ.S"));
                default: `TRACE(level, ("?"));
                endcase
            end
        end
        `INST_FPU_F2F: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FCVT.D.S"));
            end else begin
                `TRACE(level, ("FCVT.S.D"));
            end
        end
        `INST_FPU_F2I: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.L.D"));
                end else begin
                    `TRACE(level, ("FCVT.W.D"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.L.S"));
                end else begin
                    `TRACE(level, ("FCVT.W.S"));
                end
            end
        end
        `INST_FPU_F2U: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.LU.D"));
                end else begin
                    `TRACE(level, ("FCVT.WU.D"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.LU.S"));
                end else begin
                    `TRACE(level, ("FCVT.WU.S"));
                end
            end
        end
        `INST_FPU_I2F: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.D.L"));
                end else begin
                    `TRACE(level, ("FCVT.D.W"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.S.L"));
                end else begin
                    `TRACE(level, ("FCVT.S.W"));
                end
            end
        end
        `INST_FPU_U2F: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.D.LU"));
                end else begin
                    `TRACE(level, ("FCVT.D.WU"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.S.LU"));
                end else begin
                    `TRACE(level, ("FCVT.S.WU"));
                end
            end
        end
        `INST_FPU_MISC: begin
            // the miscellaneous operation is selected by the full frm field
            if (op_args.fpu.fmt[0]) begin
                case (op_args.fpu.frm)
                0:       `TRACE(level, ("FSGNJ.D"));
                1:       `TRACE(level, ("FSGNJN.D"));
                2:       `TRACE(level, ("FSGNJX.D"));
                3:       `TRACE(level, ("FCLASS.D"));
                4:       `TRACE(level, ("FMV.X.D"));
                5:       `TRACE(level, ("FMV.D.X"));
                6:       `TRACE(level, ("FMIN.D"));
                7:       `TRACE(level, ("FMAX.D"));
                // reached only for values outside 0..7 (e.g. X/Z in simulation);
                // matches the "?" fallback used by every other decode path
                default: `TRACE(level, ("?"));
                endcase
            end else begin
                case (op_args.fpu.frm)
                0:       `TRACE(level, ("FSGNJ.S"));
                1:       `TRACE(level, ("FSGNJN.S"));
                2:       `TRACE(level, ("FSGNJX.S"));
                3:       `TRACE(level, ("FCLASS.S"));
                4:       `TRACE(level, ("FMV.X.S"));
                5:       `TRACE(level, ("FMV.S.X"));
                6:       `TRACE(level, ("FMIN.S"));
                7:       `TRACE(level, ("FMAX.S"));
                // same fallback as the ".D" table above
                default: `TRACE(level, ("?"));
                endcase
            end
        end
        default: `TRACE(level, ("?"));
        endcase
    end
    `EX_SFU: begin
        // wctl.is_neg selects the ".N" spelling; csr.use_imm selects the
        // immediate ("...I") CSR spelling.
        case (`INST_SFU_BITS'(op_type))
        `INST_SFU_TMC:    `TRACE(level, ("TMC"));
        `INST_SFU_WSPAWN: `TRACE(level, ("WSPAWN"));
        `INST_SFU_SPLIT:  begin if (op_args.wctl.is_neg) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
        `INST_SFU_JOIN:   `TRACE(level, ("JOIN"));
        `INST_SFU_BAR:    `TRACE(level, ("BAR"));
        `INST_SFU_PRED:   begin if (op_args.wctl.is_neg) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
        `INST_SFU_CSRRW:  begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
        `INST_SFU_CSRRS:  begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
        `INST_SFU_CSRRC:  begin if (op_args.csr.use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end
        default:          `TRACE(level, ("?"));
        endcase
    end
    default: `TRACE(level, ("?"));
    endcase
endtask
// Trace the decoded operand modifiers of an instruction as a
// ", name=value" list appended to the current trace line.
//   level   - passed through unchanged to `TRACE
//   ex_type - execution-unit selector; decides which op_args fields are printed
//   op_type - opcode; only consulted for `EX_SFU, to detect CSR operations
//   op_args - decoded instruction modifiers
// Nothing is printed for an unrecognized ex_type or for a non-CSR SFU opcode.
task trace_op_args(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
`TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm));
end
`EX_LSU: begin
`TRACE(level, (", offset=0x%0h", op_args.lsu.offset));
end
`EX_FPU: begin
`TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm));
end
`EX_SFU: begin
// only CSR operations have arguments printed here
if (`INST_SFU_IS_CSR(op_type)) begin
`TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm));
end
end
// unrecognized unit: print nothing
default:;
endcase
endtask
// Trace the symbolic name of a base DCR address.
//   level - passed through unchanged to `TRACE
//   addr  - address matched against the `VX_DCR_BASE_* constants
// Any address other than the five listed constants is traced as "?".
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
case (addr)
`VX_DCR_BASE_STARTUP_ADDR0: `TRACE(level, ("STARTUP_ADDR0"));
`VX_DCR_BASE_STARTUP_ADDR1: `TRACE(level, ("STARTUP_ADDR1"));
`VX_DCR_BASE_STARTUP_ARG0: `TRACE(level, ("STARTUP_ARG0"));
`VX_DCR_BASE_STARTUP_ARG1: `TRACE(level, ("STARTUP_ARG1"));
`VX_DCR_BASE_MPM_CLASS: `TRACE(level, ("MPM_CLASS"));
// unrecognized address
default: `TRACE(level, ("?"));
endcase
endtask
`endif
endpackage
`endif // VX_TRACE_PKG_VH

View file

@ -14,7 +14,7 @@
`include "VX_define.vh"
module VX_wctl_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter `STRING INSTANCE_ID = "",
parameter NUM_LANES = 1
) (
input wire clk,
@ -27,7 +27,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master commit_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_SPARAM (INSTANCE_ID)
localparam LANE_BITS = `CLOG2(NUM_LANES);
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);

View file

@ -141,13 +141,13 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
begin : fma
reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
wire [NUM_LANES-1:0][63:0] result_fadd;
wire [NUM_LANES-1:0][63:0] result_fsub;
wire [NUM_LANES-1:0][63:0] result_fmul;
wire [NUM_LANES-1:0][63:0] result_fmadd;
wire [NUM_LANES-1:0][63:0] result_fmsub;
wire [NUM_LANES-1:0][63:0] result_fnmadd;
wire [NUM_LANES-1:0][63:0] result_fnmsub;
reg [NUM_LANES-1:0][63:0] result_fadd;
reg [NUM_LANES-1:0][63:0] result_fsub;
reg [NUM_LANES-1:0][63:0] result_fmul;
reg [NUM_LANES-1:0][63:0] result_fmadd;
reg [NUM_LANES-1:0][63:0] result_fmsub;
reg [NUM_LANES-1:0][63:0] result_fnmadd;
reg [NUM_LANES-1:0][63:0] result_fnmsub;
fflags_t [NUM_LANES-1:0] fflags_fma;
fflags_t [NUM_LANES-1:0] fflags_fadd;
@ -217,7 +217,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
begin : fdiv
reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
wire [NUM_LANES-1:0][63:0] result_fdiv;
reg [NUM_LANES-1:0][63:0] result_fdiv;
fflags_t [NUM_LANES-1:0] fflags_fdiv;
wire fdiv_valid = (valid_in && core_select == FPU_DIVSQRT) && is_div;
@ -256,7 +256,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
begin : fsqrt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
wire [NUM_LANES-1:0][63:0] result_fsqrt;
reg [NUM_LANES-1:0][63:0] result_fsqrt;
fflags_t [NUM_LANES-1:0] fflags_fsqrt;
wire fsqrt_valid = (valid_in && core_select == FPU_DIVSQRT) && ~is_div;
@ -295,11 +295,11 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
begin : fcvt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
wire [NUM_LANES-1:0][63:0] result_itof;
wire [NUM_LANES-1:0][63:0] result_utof;
wire [NUM_LANES-1:0][63:0] result_ftoi;
wire [NUM_LANES-1:0][63:0] result_ftou;
wire [NUM_LANES-1:0][63:0] result_f2f;
reg [NUM_LANES-1:0][63:0] result_itof;
reg [NUM_LANES-1:0][63:0] result_utof;
reg [NUM_LANES-1:0][63:0] result_ftoi;
reg [NUM_LANES-1:0][63:0] result_ftou;
reg [NUM_LANES-1:0][63:0] result_f2f;
fflags_t [NUM_LANES-1:0] fflags_fcvt;
fflags_t [NUM_LANES-1:0] fflags_itof;
@ -359,15 +359,15 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
begin : fncp
reg [NUM_LANES-1:0][`XLEN-1:0] result_fncp;
wire [NUM_LANES-1:0][63:0] result_fclss;
wire [NUM_LANES-1:0][63:0] result_flt;
wire [NUM_LANES-1:0][63:0] result_fle;
wire [NUM_LANES-1:0][63:0] result_feq;
wire [NUM_LANES-1:0][63:0] result_fmin;
wire [NUM_LANES-1:0][63:0] result_fmax;
wire [NUM_LANES-1:0][63:0] result_fsgnj;
wire [NUM_LANES-1:0][63:0] result_fsgnjn;
wire [NUM_LANES-1:0][63:0] result_fsgnjx;
reg [NUM_LANES-1:0][63:0] result_fclss;
reg [NUM_LANES-1:0][63:0] result_flt;
reg [NUM_LANES-1:0][63:0] result_fle;
reg [NUM_LANES-1:0][63:0] result_feq;
reg [NUM_LANES-1:0][63:0] result_fmin;
reg [NUM_LANES-1:0][63:0] result_fmax;
reg [NUM_LANES-1:0][63:0] result_fsgnj;
reg [NUM_LANES-1:0][63:0] result_fsgnjn;
reg [NUM_LANES-1:0][63:0] result_fsgnjx;
reg [NUM_LANES-1:0][63:0] result_fmvx;
reg [NUM_LANES-1:0][63:0] result_fmvf;
@ -470,7 +470,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (NUM_FPC),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (OUT_BUF)
) rsp_arb (
.clk (clk),

View file

@ -296,7 +296,7 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (NUM_FPC),
.DATAW (RSP_DATAW + 2),
.ARBITER ("R"),
.ARBITER ("F"),
.OUT_BUF (OUT_BUF)
) rsp_arb (
.clk (clk),

View file

@ -98,7 +98,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
.DATA_IN_WIDTH(3*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (1)
.PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0)
) pe_serializer (
.clk (clk),
.reset (reset),

View file

@ -105,7 +105,7 @@ module VX_fpu_fpnew
`UNUSED_VAR (fmt)
always @(*) begin
fpu_op = 'x;
fpu_op = fpnew_pkg::operation_e'('x);
fpu_rnd = frm;
fpu_op_mod = 0;
fpu_has_fflags = 1;

View file

@ -15,17 +15,14 @@
interface VX_commit_sched_if ();
wire [`ISSUE_WIDTH-1:0] committed;
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] committed_wid;
wire [`NUM_WARPS-1:0] committed_warps;
modport master (
output committed,
output committed_wid
output committed_warps
);
modport slave (
input committed,
input committed_wid
input committed_warps
);
endinterface

View file

@ -13,11 +13,14 @@
`include "VX_define.vh"
interface VX_decode_if import VX_gpu_pkg::*; ();
interface VX_decode_if import VX_gpu_pkg::*; #(
parameter NUM_WARPS = `NUM_WARPS,
parameter NW_WIDTH = `LOG2UP(NUM_WARPS)
);
typedef struct packed {
logic [`UUID_WIDTH-1:0] uuid;
logic [`NW_WIDTH-1:0] wid;
logic [NW_WIDTH-1:0] wid;
logic [`NUM_THREADS-1:0] tmask;
logic [`PC_BITS-1:0] PC;
logic [`EX_BITS-1:0] ex_type;
@ -34,7 +37,7 @@ interface VX_decode_if import VX_gpu_pkg::*; ();
data_t data;
logic ready;
`ifndef L1_ENABLE
wire [`NUM_WARPS-1:0] ibuf_pop;
wire [NUM_WARPS-1:0] ibuf_pop;
`endif
modport master (

View file

@ -13,13 +13,9 @@
`include "VX_define.vh"
interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] sched_idles;
wire [`PERF_CTR_BITS-1:0] sched_stalls;
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
wire [`PERF_CTR_BITS-1:0] scb_stalls;
wire [`PERF_CTR_BITS-1:0] units_uses [`NUM_EX_UNITS];
wire [`PERF_CTR_BITS-1:0] sfu_uses [`NUM_SFU_UNITS];
interface VX_pipeline_perf_if import VX_gpu_pkg::*; ();
sched_perf_t sched;
issue_perf_t issue;
wire [`PERF_CTR_BITS-1:0] ifetches;
wire [`PERF_CTR_BITS-1:0] loads;
@ -27,25 +23,19 @@ interface VX_pipeline_perf_if ();
wire [`PERF_CTR_BITS-1:0] ifetch_latency;
wire [`PERF_CTR_BITS-1:0] load_latency;
modport schedule (
output sched_idles,
output sched_stalls
);
modport issue (
output ibf_stalls,
output scb_stalls,
output units_uses,
output sfu_uses
modport master (
output sched,
output issue,
output ifetches,
output loads,
output stores,
output ifetch_latency,
output load_latency
);
modport slave (
input sched_idles,
input sched_stalls,
input ibf_stalls,
input scb_stalls,
input units_uses,
input sfu_uses,
input sched,
input issue,
input ifetches,
input loads,
input stores,

View file

@ -89,9 +89,11 @@ module VX_avs_adapter #(
.reset (reset),
.incr (req_queue_push[i]),
.decr (req_queue_pop[i]),
`UNUSED_PIN (empty),
`UNUSED_PIN (alm_empty),
.full (req_queue_going_full[i]),
.size (req_queue_size[i]),
`UNUSED_PIN (empty)
`UNUSED_PIN (alm_full),
.size (req_queue_size[i])
);
`UNUSED_VAR (req_queue_size)

View file

@ -16,7 +16,6 @@
`TRACING_OFF
module VX_cyclic_arbiter #(
parameter NUM_REQS = 1,
parameter LOCK_ENABLE = 0,
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
input wire clk,
@ -25,7 +24,7 @@ module VX_cyclic_arbiter #(
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid,
input wire grant_unlock
input wire grant_ready
);
if (NUM_REQS == 1) begin
@ -48,7 +47,7 @@ module VX_cyclic_arbiter #(
end else begin
if (!IS_POW2 && grant_index_r == LOG_NUM_REQS'(NUM_REQS-1)) begin
grant_index_r <= '0;
end else if (!LOCK_ENABLE || ~grant_valid || grant_unlock) begin
end else if (~grant_valid || grant_ready) begin
grant_index_r <= grant_index_r + LOG_NUM_REQS'(1);
end
end

View file

@ -17,6 +17,7 @@
module VX_dp_ram #(
parameter DATAW = 1,
parameter SIZE = 1,
parameter ADDR_MIN = 0,
parameter WRENW = 1,
parameter OUT_REG = 0,
parameter NO_RWCHECK = 0,
@ -57,7 +58,7 @@ module VX_dp_ram #(
if (LUTRAM != 0) begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [SIZE-1:0];
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -72,7 +73,7 @@ module VX_dp_ram #(
end
assign rdata = rdata_r;
end else begin
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [SIZE-1:0];
`USE_FAST_BRAM reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -87,7 +88,7 @@ module VX_dp_ram #(
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
reg [WRENW-1:0][WSELW-1:0] ram [SIZE-1:0];
reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -103,7 +104,7 @@ module VX_dp_ram #(
assign rdata = rdata_r;
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [SIZE-1:0];
`NO_RW_RAM_CHECK reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -115,7 +116,7 @@ module VX_dp_ram #(
end
assign rdata = ram[raddr];
end else begin
reg [WRENW-1:0][WSELW-1:0] ram [SIZE-1:0];
reg [WRENW-1:0][WSELW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -132,7 +133,7 @@ module VX_dp_ram #(
`else
// default synthesis
if (LUTRAM != 0) begin
`USE_FAST_BRAM reg [DATAW-1:0] ram [SIZE-1:0];
`USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
@ -161,7 +162,7 @@ module VX_dp_ram #(
end
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] ram [SIZE-1:0];
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
reg [DATAW-1:0] rdata_r;
`RAM_INITIALIZATION
always @(posedge clk) begin
@ -178,7 +179,7 @@ module VX_dp_ram #(
assign rdata = rdata_r;
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [SIZE-1:0];
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -190,7 +191,7 @@ module VX_dp_ram #(
end
assign rdata = ram[raddr];
end else begin
reg [DATAW-1:0] ram [SIZE-1:0];
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -208,7 +209,7 @@ module VX_dp_ram #(
end else begin
// (WRENW == 1)
if (LUTRAM != 0) begin
`USE_FAST_BRAM reg [DATAW-1:0] ram [SIZE-1:0];
`USE_FAST_BRAM reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
if (OUT_REG != 0) begin
reg [DATAW-1:0] rdata_r;
@ -231,7 +232,7 @@ module VX_dp_ram #(
end
end else begin
if (OUT_REG != 0) begin
reg [DATAW-1:0] ram [SIZE-1:0];
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
reg [DATAW-1:0] rdata_r;
`RAM_INITIALIZATION
always @(posedge clk) begin
@ -245,7 +246,7 @@ module VX_dp_ram #(
assign rdata = rdata_r;
end else begin
if (NO_RWCHECK != 0) begin
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [SIZE-1:0];
`NO_RW_RAM_CHECK reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -254,7 +255,7 @@ module VX_dp_ram #(
end
assign rdata = ram[raddr];
end else begin
reg [DATAW-1:0] ram [SIZE-1:0];
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
always @(posedge clk) begin
if (write) begin
@ -268,7 +269,7 @@ module VX_dp_ram #(
end
`else
// RAM emulation
reg [DATAW-1:0] ram [SIZE-1:0];
reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];
`RAM_INITIALIZATION
wire [DATAW-1:0] ram_n;

View file

@ -0,0 +1,115 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_platform.vh"
`TRACING_OFF
// Behavioral dual-port RAM (one write port, one read port) whose whole
// contents are rewritten with INIT_VALUE on every clock edge where `reset`
// is asserted.
//
// Parameters:
//   DATAW       - width of a stored word in bits
//   SIZE        - exclusive upper bound of the address range
//   ADDR_MIN    - lowest valid address; storage is declared ram[ADDR_MIN:SIZE-1]
//   WRENW       - number of write-enable lanes; DATAW must be a multiple of it
//   OUT_REG     - non-zero: read data is registered and updated only when
//                 `read` is asserted; zero: read data is combinational
//   NO_RWCHECK  - combinational read only: when non-zero (and LUTRAM is zero),
//                 a read of the address written in the previous cycle returns
//                 the word that was stored there before that write
//   LUTRAM      - combinational read only: when non-zero the read is always
//                 served straight from the array
//   INIT_ENABLE - non-zero: also initialize the array in an initial block
//   INIT_FILE   - if non-empty, hex file loaded with $readmemh at time zero
//   INIT_VALUE  - fill value used at time zero (when no file is given) and on reset
//   ADDRW       - address port width
//
// Ports:
//   clk, reset  - clock and synchronous reset
//   read        - read enable (only used by the registered read path)
//   write       - write enable
//   wren        - per-lane write enables; ignored when WRENW == 1
//   waddr/wdata - write address and data
//   raddr/rdata - read address and data
module VX_dp_ram_rst #(
    parameter DATAW       = 1,
    parameter SIZE        = 1,
    parameter ADDR_MIN    = 0,
    parameter WRENW       = 1,
    parameter OUT_REG     = 0,
    parameter NO_RWCHECK  = 0,
    parameter LUTRAM      = 0,
    parameter INIT_ENABLE = 0,
    parameter INIT_FILE   = "",
    parameter [DATAW-1:0] INIT_VALUE = 0,
    parameter ADDRW       = `LOG2UP(SIZE)
) (
    input wire               clk,
    input wire               reset,
    input wire               read,
    input wire               write,
    input wire [WRENW-1:0]   wren,
    input wire [ADDRW-1:0]   waddr,
    input wire [DATAW-1:0]   wdata,
    input wire [ADDRW-1:0]   raddr,
    output wire [DATAW-1:0]  rdata
);
    // width of the slice controlled by one write-enable lane
    localparam WSELW = DATAW / WRENW;
    `STATIC_ASSERT((WRENW * WSELW == DATAW), ("invalid parameter"))

    `UNUSED_VAR (read)

    // RAM emulation
    reg [DATAW-1:0] ram [ADDR_MIN:SIZE-1];

    // Optional time-zero initialization. This is written out inline (instead
    // of through a file-local RAM_INITIALIZATION macro) so that the loop can
    // honor ADDR_MIN without redefining a macro name that sibling RAM modules
    // may define with a different body.
    if (INIT_ENABLE != 0) begin
        if (INIT_FILE != "") begin
            initial $readmemh(INIT_FILE, ram);
        end else begin
            initial
                // start at ADDR_MIN: indices below it are outside the array
                for (integer i = ADDR_MIN; i < SIZE; ++i)
                    ram[i] = INIT_VALUE;
        end
    end

    // Next value of the addressed word: each lane takes wdata when enabled
    // (or unconditionally when there is a single lane), else keeps its content.
    wire [DATAW-1:0] ram_n;
    for (genvar i = 0; i < WRENW; ++i) begin
        assign ram_n[i * WSELW +: WSELW] = ((WRENW == 1) | wren[i]) ? wdata[i * WSELW +: WSELW] : ram[waddr][i * WSELW +: WSELW];
    end

    if (OUT_REG != 0) begin
        // registered read
        reg [DATAW-1:0] rdata_r;
        always @(posedge clk) begin
            if (reset) begin
                // start at ADDR_MIN: indices below it are outside the array
                for (integer i = ADDR_MIN; i < SIZE; ++i) begin
                    ram[i] <= DATAW'(INIT_VALUE);
                end
                rdata_r <= '0;
            end else begin
                if (write) begin
                    ram[waddr] <= ram_n;
                end
                if (read) begin
                    rdata_r <= ram[raddr];
                end
            end
        end
        assign rdata = rdata_r;
    end else begin
        // combinational read; prev_* remember the previous cycle's write
        // address and the word stored there before that write
        reg [DATAW-1:0] prev_data;
        reg [ADDRW-1:0] prev_waddr;
        reg prev_write;
        always @(posedge clk) begin
            if (reset) begin
                // start at ADDR_MIN: indices below it are outside the array
                for (integer i = ADDR_MIN; i < SIZE; ++i) begin
                    ram[i] <= DATAW'(INIT_VALUE);
                end
                prev_write <= 0;
                prev_data  <= '0;
                prev_waddr <= '0;
            end else begin
                if (write) begin
                    ram[waddr] <= ram_n;
                end
                // NOTE(review): derived from wren only, not gated by `write`;
                // presumably callers drive wren low when not writing - confirm.
                prev_write <= (| wren);
                prev_data  <= ram[waddr];
                prev_waddr <= waddr;
            end
        end
        if (LUTRAM || !NO_RWCHECK) begin
            `UNUSED_VAR (prev_write)
            `UNUSED_VAR (prev_data)
            `UNUSED_VAR (prev_waddr)
            assign rdata = ram[raddr];
        end else begin
            assign rdata = (prev_write && (prev_waddr == raddr)) ? prev_data : ram[raddr];
        end
    end

endmodule
`TRACING_ON

View file

@ -55,7 +55,7 @@ module VX_elastic_buffer #(
.ready_out (ready_out)
);
end else if (SIZE == 2) begin
end else if (SIZE == 2 && LUTRAM == 0) begin
VX_skid_buffer #(
.DATAW (DATAW),
@ -105,7 +105,7 @@ module VX_elastic_buffer #(
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (OUT_REG == 2)
.SIZE ((OUT_REG == 2) ? 1 : 0)
) out_buf (
.clk (clk),
.reset (reset),

View file

@ -16,7 +16,6 @@
`TRACING_OFF
module VX_fair_arbiter #(
parameter NUM_REQS = 1,
parameter LOCK_ENABLE = 0,
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
input wire clk,
@ -25,13 +24,13 @@ module VX_fair_arbiter #(
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid,
input wire grant_unlock
input wire grant_ready
);
if (NUM_REQS == 1) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (grant_unlock)
`UNUSED_VAR (grant_ready)
assign grant_index = '0;
assign grant_onehot = requests;
@ -39,17 +38,17 @@ module VX_fair_arbiter #(
end else begin
reg [NUM_REQS-1:0] buffer;
reg [NUM_REQS-1:0] grant_mask;
wire [NUM_REQS-1:0] buffer_qual = buffer & requests;
wire [NUM_REQS-1:0] requests_qual = (| buffer) ? buffer_qual : requests;
wire [NUM_REQS-1:0] buffer_n = requests_qual & ~grant_onehot;
wire [NUM_REQS-1:0] requests_rem = requests & ~grant_mask;
wire rem_valid = (| requests_rem);
wire [NUM_REQS-1:0] requests_qual = rem_valid ? requests_rem : requests;
always @(posedge clk) begin
if (reset) begin
buffer <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
buffer <= buffer_n;
grant_mask <= '0;
end else if (grant_ready) begin
grant_mask <= rem_valid ? (grant_mask | grant_onehot) : grant_onehot;
end
end

View file

@ -116,7 +116,7 @@ module VX_fifo_queue #(
end
end
if (DEPTH == 2) begin
if (DEPTH == 2 && LUTRAM == 0) begin
assign used_n = used_r ^ (push ^ pop);

View file

@ -16,7 +16,6 @@
`TRACING_OFF
module VX_generic_arbiter #(
parameter NUM_REQS = 1,
parameter LOCK_ENABLE = 0,
parameter `STRING TYPE = "P",
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
@ -26,14 +25,13 @@ module VX_generic_arbiter #(
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid,
input wire grant_unlock
input wire grant_ready
);
if (TYPE == "P") begin
`UNUSED_PARAM (LOCK_ENABLE)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (grant_unlock)
`UNUSED_VAR (grant_ready)
VX_priority_arbiter #(
.NUM_REQS (NUM_REQS)
@ -47,8 +45,7 @@ module VX_generic_arbiter #(
end else if (TYPE == "R") begin
VX_rr_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (LOCK_ENABLE)
.NUM_REQS (NUM_REQS)
) rr_arbiter (
.clk (clk),
.reset (reset),
@ -56,14 +53,13 @@ module VX_generic_arbiter #(
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
.grant_ready (grant_ready)
);
end else if (TYPE == "F") begin
VX_fair_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (LOCK_ENABLE)
.NUM_REQS (NUM_REQS)
) fair_arbiter (
.clk (clk),
.reset (reset),
@ -71,14 +67,13 @@ module VX_generic_arbiter #(
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
.grant_ready (grant_ready)
);
end else if (TYPE == "M") begin
VX_matrix_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (LOCK_ENABLE)
.NUM_REQS (NUM_REQS)
) matrix_arbiter (
.clk (clk),
.reset (reset),
@ -86,14 +81,13 @@ module VX_generic_arbiter #(
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
.grant_ready (grant_ready)
);
end else if (TYPE == "C") begin
VX_cyclic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (LOCK_ENABLE)
.NUM_REQS (NUM_REQS)
) cyclic_arbiter (
.clk (clk),
.reset (reset),
@ -101,7 +95,7 @@ module VX_generic_arbiter #(
.grant_valid (grant_valid),
.grant_index (grant_index),
.grant_onehot (grant_onehot),
.grant_unlock (grant_unlock)
.grant_ready (grant_ready)
);
end else begin

View file

@ -16,7 +16,6 @@
`TRACING_OFF
module VX_matrix_arbiter #(
parameter NUM_REQS = 1,
parameter LOCK_ENABLE = 0,
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
input wire clk,
@ -25,13 +24,13 @@ module VX_matrix_arbiter #(
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid,
input wire grant_unlock
input wire grant_ready
);
if (NUM_REQS == 1) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (grant_unlock)
`UNUSED_VAR (grant_ready)
assign grant_index = '0;
assign grant_onehot = requests;
@ -70,20 +69,15 @@ module VX_matrix_arbiter #(
end
end
if (LOCK_ENABLE == 0) begin
`UNUSED_VAR (grant_unlock)
assign grant_onehot = grant_unqual;
end else begin
reg [NUM_REQS-1:0] grant_unqual_prev;
always @(posedge clk) begin
if (reset) begin
grant_unqual_prev <= '0;
end else if (grant_unlock) begin
end else if (grant_ready) begin
grant_unqual_prev <= grant_unqual;
end
end
assign grant_onehot = grant_unlock ? grant_unqual : grant_unqual_prev;
end
assign grant_onehot = grant_ready ? grant_unqual : grant_unqual_prev;
VX_onehot_encoder #(
.N (NUM_REQS)

View file

@ -27,10 +27,10 @@ module VX_mem_coalescer #(
parameter DATA_IN_WIDTH = DATA_IN_SIZE * 8,
parameter DATA_OUT_WIDTH= DATA_OUT_SIZE * 8,
parameter OUT_REQS = (NUM_REQS * DATA_IN_WIDTH) / DATA_OUT_WIDTH,
parameter BATCH_SIZE = DATA_OUT_SIZE / DATA_IN_SIZE,
parameter BATCH_SIZE_W = `LOG2UP(BATCH_SIZE),
parameter OUT_ADDR_WIDTH= ADDR_WIDTH - BATCH_SIZE_W,
parameter DATA_RATIO = DATA_OUT_SIZE / DATA_IN_SIZE,
parameter DATA_RATIO_W = `LOG2UP(DATA_RATIO),
parameter OUT_REQS = NUM_REQS / DATA_RATIO,
parameter OUT_ADDR_WIDTH= ADDR_WIDTH - DATA_RATIO_W,
parameter QUEUE_ADDRW = `CLOG2(QUEUE_SIZE),
parameter OUT_TAG_WIDTH = UUID_WIDTH + QUEUE_ADDRW
) (
@ -82,23 +82,23 @@ module VX_mem_coalescer #(
localparam TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH;
localparam NUM_REQS_W = `LOG2UP(NUM_REQS);
// tag + mask + offset
localparam IBUF_DATA_WIDTH = TAG_ID_WIDTH + NUM_REQS + (NUM_REQS * BATCH_SIZE_W);
localparam IBUF_DATA_WIDTH = TAG_ID_WIDTH + NUM_REQS + (NUM_REQS * DATA_RATIO_W);
localparam STATE_SETUP = 0;
localparam STATE_SEND = 1;
logic state_r, state_n;
reg state_r, state_n;
logic out_req_valid_r, out_req_valid_n;
logic out_req_rw_r, out_req_rw_n;
logic [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
logic [OUT_REQS-1:0][DATA_OUT_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n;
logic [OUT_REQS-1:0][DATA_OUT_WIDTH-1:0] out_req_data_r, out_req_data_n;
logic [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;
reg out_req_valid_r, out_req_valid_n;
reg out_req_rw_r, out_req_rw_n;
reg [OUT_REQS-1:0] out_req_mask_r, out_req_mask_n;
reg [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] out_req_addr_r, out_req_addr_n;
reg [OUT_REQS-1:0][ATYPE_WIDTH-1:0] out_req_atype_r, out_req_atype_n;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] out_req_byteen_r, out_req_byteen_n;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] out_req_data_r, out_req_data_n;
reg [OUT_TAG_WIDTH-1:0] out_req_tag_r, out_req_tag_n;
logic in_req_ready_n;
reg in_req_ready_n;
wire ibuf_push;
wire ibuf_pop;
@ -112,29 +112,41 @@ module VX_mem_coalescer #(
logic [OUT_REQS-1:0] batch_valid_r, batch_valid_n;
logic [OUT_REQS-1:0][OUT_ADDR_WIDTH-1:0] seed_addr_r, seed_addr_n;
logic [OUT_REQS-1:0][ATYPE_WIDTH-1:0] seed_atype_r, seed_atype_n;
logic [NUM_REQS-1:0] addr_matches_r, addr_matches_n;
logic [NUM_REQS-1:0] processed_mask_r, processed_mask_n;
wire [OUT_REQS-1:0][NUM_REQS_W-1:0] seed_idx;
wire [NUM_REQS-1:0][OUT_ADDR_WIDTH-1:0] in_addr_base;
wire [NUM_REQS-1:0][BATCH_SIZE_W-1:0] in_addr_offset;
wire [NUM_REQS-1:0][DATA_RATIO_W-1:0] in_addr_offset;
for (genvar i = 0; i < NUM_REQS; i++) begin
assign in_addr_base[i] = in_req_addr[i][ADDR_WIDTH-1:BATCH_SIZE_W];
assign in_addr_offset[i] = in_req_addr[i][BATCH_SIZE_W-1:0];
assign in_addr_base[i] = in_req_addr[i][ADDR_WIDTH-1:DATA_RATIO_W];
assign in_addr_offset[i] = in_req_addr[i][DATA_RATIO_W-1:0];
end
for (genvar i = 0; i < OUT_REQS; ++i) begin
wire [BATCH_SIZE-1:0] batch_mask = in_req_mask[BATCH_SIZE * i +: BATCH_SIZE] & ~processed_mask_r[BATCH_SIZE * i +: BATCH_SIZE];
wire [BATCH_SIZE_W-1:0] batch_idx;
wire [DATA_RATIO-1:0] batch_mask = in_req_mask[i * DATA_RATIO +: DATA_RATIO] & ~processed_mask_r[i * DATA_RATIO +: DATA_RATIO];
wire [DATA_RATIO_W-1:0] batch_idx;
VX_priority_encoder #(
.N (BATCH_SIZE)
.N (DATA_RATIO)
) priority_encoder (
.data_in (batch_mask),
.index (batch_idx),
`UNUSED_PIN (onehot),
.valid_out (batch_valid_n[i])
);
assign seed_idx[i] = NUM_REQS_W'(BATCH_SIZE * i) + NUM_REQS_W'(batch_idx);
assign seed_idx[i] = NUM_REQS_W'(i * DATA_RATIO) + NUM_REQS_W'(batch_idx);
end
for (genvar i = 0; i < OUT_REQS; ++i) begin
assign seed_addr_n[i] = in_addr_base[seed_idx[i]];
assign seed_atype_n[i] = in_req_atype[seed_idx[i]];
end
// Per-lane address match: input lane (i * DATA_RATIO + j) belongs to output
// group i and matches when its base address equals that group's current seed
// address (seed_addr_n[i]).
for (genvar i = 0; i < OUT_REQS; ++i) begin
for (genvar j = 0; j < DATA_RATIO; ++j) begin
assign addr_matches_n[i * DATA_RATIO + j] = (in_addr_base[i * DATA_RATIO + j] == seed_addr_n[i]);
end
end
always @(posedge clk) begin
@ -144,12 +156,13 @@ module VX_mem_coalescer #(
out_req_valid_r <= 0;
end else begin
state_r <= state_n;
out_req_valid_r <= out_req_valid_n;
batch_valid_r <= batch_valid_n;
seed_addr_r <= seed_addr_n;
seed_atype_r <= seed_atype_n;
out_req_rw_r <= out_req_rw_n;
addr_matches_r <= addr_matches_n;
out_req_valid_r <= out_req_valid_n;
out_req_mask_r <= out_req_mask_n;
out_req_rw_r <= out_req_rw_n;
out_req_addr_r <= out_req_addr_n;
out_req_atype_r <= out_req_atype_n;
out_req_byteen_r <= out_req_byteen_n;
@ -159,43 +172,51 @@ module VX_mem_coalescer #(
end
end
logic [NUM_REQS-1:0] addr_matches;
wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches_r;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_SIZE-1:0] req_byteen_merged;
reg [OUT_REQS-1:0][DATA_RATIO-1:0][DATA_IN_WIDTH-1:0] req_data_merged;
always @(*) begin
addr_matches = '0;
req_byteen_merged = '0;
req_data_merged = 'x;
for (integer i = 0; i < OUT_REQS; ++i) begin
for (integer j = 0; j < BATCH_SIZE; j++) begin
if (in_addr_base[BATCH_SIZE * i + j] == seed_addr_r[i]) begin
addr_matches[BATCH_SIZE * i + j] = 1;
for (integer j = 0; j < DATA_RATIO; ++j) begin
if (current_pmask[i * DATA_RATIO + j]) begin
for (integer k = 0; k < DATA_IN_SIZE; ++k) begin
if (in_req_byteen[DATA_RATIO * i + j][k]) begin
req_byteen_merged[i][in_addr_offset[DATA_RATIO * i + j]][k] = 1'b1;
req_data_merged[i][in_addr_offset[DATA_RATIO * i + j]][k * 8 +: 8] = in_req_data[DATA_RATIO * i + j][k * 8 +: 8];
end
end
end
end
end
end
wire [NUM_REQS-1:0] current_pmask = in_req_mask & addr_matches;
// A lane is still pending when it is active in the input request mask, did not
// match its group's seed address (addr_matches_r), and is not already recorded
// in processed_mask_r.
wire [OUT_REQS * DATA_RATIO - 1:0] pending_mask;
for (genvar i = 0; i < OUT_REQS * DATA_RATIO; ++i) begin
assign pending_mask[i] = in_req_mask[i] && ~addr_matches_r[i] && ~processed_mask_r[i];
end
// The batch is complete once no lane remains pending.
wire batch_completed = ~(| pending_mask);
always @(*) begin
state_n = state_r;
out_req_valid_n = out_req_valid_r;
seed_addr_n = seed_addr_r;
seed_atype_n = seed_atype_r;
out_req_rw_n = out_req_rw_r;
out_req_mask_n = out_req_mask_r;
out_req_rw_n = out_req_rw_r;
out_req_addr_n = out_req_addr_r;
out_req_atype_n = out_req_atype_r;
out_req_byteen_n = out_req_byteen_r;
out_req_data_n = out_req_data_r;
out_req_tag_n = out_req_tag_r;
processed_mask_n = processed_mask_r;
in_req_ready_n = 0;
case (state_r)
STATE_SETUP: begin
// find the next seed address
for (integer i = 0; i < OUT_REQS; ++i) begin
seed_addr_n[i] = in_addr_base[seed_idx[i]];
seed_atype_n[i] = in_req_atype[seed_idx[i]];
end
// wait for pending outgoing request to submit
if (out_req_valid && out_req_ready) begin
out_req_valid_n = 0;
@ -206,37 +227,22 @@ module VX_mem_coalescer #(
end
default/*STATE_SEND*/: begin
out_req_valid_n = 1;
out_req_mask_n = batch_valid_r;
out_req_rw_n = in_req_rw;
out_req_addr_n = seed_addr_r;
out_req_atype_n = seed_atype_r;
out_req_byteen_n= req_byteen_merged;
out_req_data_n = req_data_merged;
out_req_tag_n = {in_req_tag[TAG_WIDTH-1 -: UUID_WIDTH], ibuf_waddr};
in_req_ready_n = 1;
out_req_byteen_n = '0;
out_req_data_n = 'x;
for (integer i = 0; i < OUT_REQS; ++i) begin
for (integer j = 0; j < BATCH_SIZE; j++) begin
if (in_req_mask[BATCH_SIZE * i + j]) begin
if (addr_matches[BATCH_SIZE * i + j]) begin
for (integer k = 0; k < DATA_IN_SIZE; ++k) begin
if (in_req_byteen[BATCH_SIZE * i + j][k]) begin
out_req_byteen_n[i][in_addr_offset[BATCH_SIZE * i + j] * DATA_IN_SIZE + k +: 1] = 1'b1;
out_req_data_n[i][in_addr_offset[BATCH_SIZE * i + j] * DATA_IN_WIDTH + k * 8 +: 8] = in_req_data[BATCH_SIZE * i + j][k * 8 +: 8];
end
end
end else begin
if (!processed_mask_r[BATCH_SIZE * i + j]) begin
in_req_ready_n = 0;
end
end
end
end
out_req_mask_n[i] = batch_valid_r[i];
out_req_addr_n[i] = seed_addr_r[i];
out_req_atype_n[i]= seed_atype_r[i];
end
if (in_req_ready_n) begin
in_req_ready_n = batch_completed;
if (batch_completed) begin
processed_mask_n = '0;
end else begin
processed_mask_n = processed_mask_r | current_pmask;
end
state_n = STATE_SETUP;
end
endcase
@ -246,12 +252,14 @@ module VX_mem_coalescer #(
wire out_rsp_eop;
assign ibuf_push = (state_r == STATE_SEND) && ~in_req_rw;
wire req_sent = (state_r == STATE_SEND);
assign ibuf_push = req_sent && ~in_req_rw;
assign ibuf_pop = out_rsp_fire && out_rsp_eop;
assign ibuf_raddr = out_rsp_tag[QUEUE_ADDRW-1:0];
wire [TAG_ID_WIDTH-1:0] ibuf_din_tag = in_req_tag[TAG_ID_WIDTH-1:0];
wire [NUM_REQS-1:0][BATCH_SIZE_W-1:0] ibuf_din_offset = in_addr_offset;
wire [NUM_REQS-1:0][DATA_RATIO_W-1:0] ibuf_din_offset = in_addr_offset;
wire [NUM_REQS-1:0] ibuf_din_pmask = current_pmask;
assign ibuf_din = {ibuf_din_tag, ibuf_din_pmask, ibuf_din_offset};
@ -299,21 +307,19 @@ module VX_mem_coalescer #(
end
end
wire [NUM_REQS-1:0][BATCH_SIZE_W-1:0] ibuf_dout_offset;
reg [NUM_REQS-1:0] ibuf_dout_pmask;
wire [NUM_REQS-1:0][DATA_RATIO_W-1:0] ibuf_dout_offset;
wire [NUM_REQS-1:0] ibuf_dout_pmask;
wire [TAG_ID_WIDTH-1:0] ibuf_dout_tag;
assign {ibuf_dout_tag, ibuf_dout_pmask, ibuf_dout_offset} = ibuf_dout;
logic [NUM_REQS-1:0][DATA_IN_WIDTH-1:0] in_rsp_data_n;
logic [NUM_REQS-1:0] in_rsp_mask_n;
wire [NUM_REQS-1:0][DATA_IN_WIDTH-1:0] in_rsp_data_n;
wire [NUM_REQS-1:0] in_rsp_mask_n;
always @(*) begin
for (integer i = 0; i < OUT_REQS; ++i) begin
for (integer j = 0; j < BATCH_SIZE; j++) begin
in_rsp_mask_n[BATCH_SIZE * i + j] = out_rsp_mask[i] && ibuf_dout_pmask[BATCH_SIZE * i + j];
in_rsp_data_n[BATCH_SIZE * i + j] = out_rsp_data[i][ibuf_dout_offset[BATCH_SIZE * i + j] * DATA_IN_WIDTH +: DATA_IN_WIDTH];
end
for (genvar i = 0; i < OUT_REQS; ++i) begin
for (genvar j = 0; j < DATA_RATIO; ++j) begin
assign in_rsp_mask_n[i * DATA_RATIO + j] = out_rsp_mask[i] && ibuf_dout_pmask[i * DATA_RATIO + j];
assign in_rsp_data_n[i * DATA_RATIO + j] = out_rsp_data[i][ibuf_dout_offset[i * DATA_RATIO + j] * DATA_IN_WIDTH +: DATA_IN_WIDTH];
end
end
@ -335,11 +341,11 @@ module VX_mem_coalescer #(
assign out_rsp_uuid = '0;
end
reg [NUM_REQS-1:0][BATCH_SIZE_W-1:0] out_req_offset;
reg [NUM_REQS-1:0][DATA_RATIO_W-1:0] out_req_offset;
reg [NUM_REQS-1:0] out_req_pmask;
always @(posedge clk) begin
if (ibuf_push) begin
if (req_sent) begin
out_req_offset <= ibuf_din_offset;
out_req_pmask <= ibuf_din_pmask;
end
@ -368,7 +374,7 @@ module VX_mem_coalescer #(
`TRACE_ARRAY1D(1, "%0d", out_req_offset, NUM_REQS);
`TRACE(1, (", pmask=%b, tag=0x%0h (#%0d)\n", out_req_pmask, out_req_tag, out_req_uuid));
if ($countones(out_req_pmask) > 1) begin
`TRACE(1, ("%t: *** %s: coalescing=%b (#%0d)\n", $time, INSTANCE_ID, out_req_pmask, out_req_uuid));
`TRACE(1, ("%t: *** %s: coalesced=%d (#%0d)\n", $time, INSTANCE_ID, $countones(out_req_pmask), out_req_uuid));
end
end
if (out_rsp_fire) begin

View file

@ -139,9 +139,9 @@ module VX_mem_scheduler #(
wire mem_req_ready_s;
wire mem_rsp_valid_s;
wire [CORE_REQS-1:0] mem_rsp_mask_s;
wire [CORE_REQS-1:0][WORD_WIDTH-1:0] mem_rsp_data_s;
wire [REQQ_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire [CORE_CHANNELS-1:0] mem_rsp_mask_s;
wire [CORE_CHANNELS-1:0][WORD_WIDTH-1:0] mem_rsp_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
wire crsp_valid;
@ -334,11 +334,16 @@ module VX_mem_scheduler #(
if (MEM_BATCHES != 1) begin
reg [MEM_BATCH_BITS-1:0] req_batch_idx_r;
// A batch is degenerate when no bit of its request mask is set.
wire is_degenerate_batch = ~(| mem_req_mask_s);
// Qualified valid: a degenerate batch is never presented as valid.
wire mem_req_valid_b = reqq_valid_s && ~is_degenerate_batch;
// Qualified ready: a degenerate batch counts as accepted without waiting on
// mem_req_ready_s.
wire mem_req_ready_b = mem_req_ready_s || is_degenerate_batch;
always @(posedge clk) begin
if (reset) begin
req_batch_idx_r <= '0;
end else begin
if (reqq_valid_s && mem_req_ready_s) begin
if (reqq_valid_s && mem_req_ready_b) begin
if (req_sent_all) begin
req_batch_idx_r <= '0;
end else begin
@ -368,19 +373,20 @@ module VX_mem_scheduler #(
`UNUSED_PIN (valid_out)
);
assign mem_req_valid_s = mem_req_valid_b;
assign req_batch_idx = req_batch_idx_r;
assign req_sent_all = mem_req_ready_s && (req_batch_idx_r == req_batch_idx_last);
assign req_sent_all = mem_req_ready_b && (req_batch_idx_r == req_batch_idx_last);
assign mem_req_tag_s = {reqq_tag_s, req_batch_idx};
end else begin
assign mem_req_valid_s = reqq_valid_s;
assign req_batch_idx = '0;
assign req_sent_all = mem_req_ready_s;
assign mem_req_tag_s = reqq_tag_s;
end
assign mem_req_valid_s = reqq_valid_s;
assign reqq_ready_s = req_sent_all;
VX_elastic_buffer #(
@ -490,10 +496,11 @@ module VX_mem_scheduler #(
end
assign mem_rsp_ready_s = crsp_ready || ~rsp_complete;
end
if (UUID_WIDTH != 0) begin
assign crsp_tag = {mem_rsp_tag_s[REQQ_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout};
assign crsp_tag = {mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH], ibuf_dout};
end else begin
assign crsp_tag = ibuf_dout;
end
@ -563,8 +570,8 @@ module VX_mem_scheduler #(
wire [`UP(UUID_WIDTH)-1:0] rsp_dbg_uuid;
if (UUID_WIDTH != 0) begin
assign mem_req_dbg_uuid = mem_req_tag_s[REQQ_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_dbg_uuid = mem_rsp_tag_s[REQQ_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_req_dbg_uuid = mem_req_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_dbg_uuid = mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign rsp_dbg_uuid = core_rsp_tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign mem_req_dbg_uuid = '0;
@ -572,6 +579,8 @@ module VX_mem_scheduler #(
assign rsp_dbg_uuid = '0;
end
wire [CORE_QUEUE_ADDRW-1:0] ibuf_waddr_s = mem_req_tag_s[MEM_BATCH_BITS +: CORE_QUEUE_ADDRW];
wire mem_req_fire_s = mem_req_valid_s && mem_req_ready_s;
always @(posedge clk) begin
@ -590,7 +599,7 @@ module VX_mem_scheduler #(
`TRACE(1, (", tag=0x%0h (#%0d)\n", core_req_tag, req_dbg_uuid));
end
if (core_rsp_valid && core_rsp_ready) begin
`TRACE(1, ("%d: %s-rsp: valid=%b, sop=%b, eop=%b, data=", $time, INSTANCE_ID, core_rsp_mask, core_rsp_sop, core_rsp_eop));
`TRACE(1, ("%d: %s-core-rsp: valid=%b, sop=%b, eop=%b, data=", $time, INSTANCE_ID, core_rsp_mask, core_rsp_sop, core_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", core_rsp_data, CORE_REQS);
`TRACE(1, (", tag=0x%0h (#%0d)\n", core_rsp_tag, rsp_dbg_uuid));
end
@ -606,7 +615,7 @@ module VX_mem_scheduler #(
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_mask_s));
`TRACE_ARRAY1D(1, "0x%h", mem_req_addr_s, CORE_CHANNELS);
end
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr, req_batch_idx, mem_req_dbg_uuid));
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr_s, req_batch_idx, mem_req_dbg_uuid));
end
if (mem_rsp_fire_s) begin
`TRACE(1, ("%d: %s-mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s));

View file

@ -26,112 +26,7 @@ module VX_onehot_mux #(
if (N == 1) begin
`UNUSED_VAR (sel_in)
assign data_out = data_in;
end else if (N == 2) begin
`UNUSED_VAR (sel_in)
assign data_out = sel_in[0] ? data_in[0] : data_in[1];
end else if (N == 3) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
3'b001: data_out_r = data_in[0];
3'b010: data_out_r = data_in[1];
3'b100: data_out_r = data_in[2];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (N == 4) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
4'b0001: data_out_r = data_in[0];
4'b0010: data_out_r = data_in[1];
4'b0100: data_out_r = data_in[2];
4'b1000: data_out_r = data_in[3];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (N == 5) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
5'b00001: data_out_r = data_in[0];
5'b00010: data_out_r = data_in[1];
5'b00100: data_out_r = data_in[2];
5'b01000: data_out_r = data_in[3];
5'b10000: data_out_r = data_in[4];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (N == 6) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
6'b000001: data_out_r = data_in[0];
6'b000010: data_out_r = data_in[1];
6'b000100: data_out_r = data_in[2];
6'b001000: data_out_r = data_in[3];
6'b010000: data_out_r = data_in[4];
6'b100000: data_out_r = data_in[5];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (N == 7) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
7'b0000001: data_out_r = data_in[0];
7'b0000010: data_out_r = data_in[1];
7'b0000100: data_out_r = data_in[2];
7'b0001000: data_out_r = data_in[3];
7'b0010000: data_out_r = data_in[4];
7'b0100000: data_out_r = data_in[5];
7'b1000000: data_out_r = data_in[6];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else if (N == 8) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
case (sel_in)
8'b00000001: data_out_r = data_in[0];
8'b00000010: data_out_r = data_in[1];
8'b00000100: data_out_r = data_in[2];
8'b00001000: data_out_r = data_in[3];
8'b00010000: data_out_r = data_in[4];
8'b00100000: data_out_r = data_in[5];
8'b01000000: data_out_r = data_in[6];
8'b10000000: data_out_r = data_in[7];
default: data_out_r = 'x;
endcase
end
assign data_out = data_out_r;
end else begin
if (MODEL == 1) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
data_out_r = 'x;
for (integer i = 0; i < N; ++i) begin
if (sel_in[i]) begin
data_out_r = data_in[i];
end
end
end
assign data_out = data_out_r;
end else if (MODEL == 2) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
data_out_r = '0;
for (integer i = 0; i < N; ++i) begin
data_out_r |= {DATAW{sel_in[i]}} & data_in[i];
end
end
assign data_out = data_out_r;
end else if (MODEL == 3) begin
end else if (MODEL == 1) begin
wire [N-1:0][DATAW-1:0] mask;
for (genvar i = 0; i < N; ++i) begin
assign mask[i] = {DATAW{sel_in[i]}} & data_in[i];
@ -143,8 +38,18 @@ module VX_onehot_mux #(
end
assign data_out[i] = (| gather);
end
end else if (MODEL == 2) begin
reg [DATAW-1:0] data_out_r;
always @(*) begin
data_out_r = 'x;
for (integer i = 0; i < N; ++i) begin
if (sel_in[i]) begin
data_out_r = data_in[i];
end
end
end
assign data_out = data_out_r;
end
endmodule
`TRACING_ON

View file

@ -43,6 +43,7 @@ module VX_pe_serializer #(
output wire [TAG_WIDTH-1:0] tag_out,
input wire ready_out
);
wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in_s;
wire valid_out_s;
wire [TAG_WIDTH-1:0] tag_out_s;
wire enable;
@ -59,6 +60,17 @@ module VX_pe_serializer #(
.data_out ({valid_out_s, tag_out_s})
);
VX_pipe_register #(
.DATAW (NUM_PES * DATA_IN_WIDTH),
.DEPTH (PE_REG)
) pe_reg (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in (pe_data_in_s),
.data_out (pe_data_in)
);
if (NUM_LANES != NUM_PES) begin
localparam BATCH_SIZE = NUM_LANES / NUM_PES;
@ -67,6 +79,10 @@ module VX_pe_serializer #(
reg [BATCH_SIZEW-1:0] batch_in_idx;
reg [BATCH_SIZEW-1:0] batch_out_idx;
for (genvar i = 0; i < NUM_PES; ++i) begin
assign pe_data_in_s[i] = data_in[batch_in_idx * NUM_PES + i];
end
always @(posedge clk) begin
if (reset) begin
batch_in_idx <= '0;
@ -84,42 +100,26 @@ module VX_pe_serializer #(
wire batch_in_done = (batch_in_idx == BATCH_SIZEW'(BATCH_SIZE-1));
wire batch_out_done = (batch_out_idx == BATCH_SIZEW'(BATCH_SIZE-1));
wire [NUM_PES-1:0][DATA_IN_WIDTH-1:0] pe_data_in_s;
for (genvar i = 0; i < NUM_PES; ++i) begin
assign pe_data_in_s[i] = data_in[batch_in_idx * NUM_PES + i];
end
VX_pipe_register #(
.DATAW (NUM_PES * DATA_IN_WIDTH),
.DEPTH (PE_REG)
) pe_reg (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in (pe_data_in_s),
.data_out (pe_data_in)
);
reg valid_out_r;
reg [BATCH_SIZE-1:0][NUM_PES-1:0][DATA_OUT_WIDTH-1:0] data_out_r;
reg [TAG_WIDTH-1:0] tag_out_r;
wire valid_out_b = valid_out_s && batch_out_done;
wire enable_r = ready_out || ~valid_out;
wire ready_out_b = ready_out || ~valid_out;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 1'b0;
end else if (enable_r) begin
end else if (ready_out_b) begin
valid_out_r <= valid_out_b;
end
if (enable_r) begin
if (ready_out_b) begin
data_out_r[batch_out_idx] <= pe_data_out;
tag_out_r <= tag_out_s;
end
end
assign enable = (enable_r || ~valid_out_b);
assign enable = ready_out_b || ~valid_out_b;
assign ready_in = enable && batch_in_done;
assign pe_enable = enable;
@ -130,11 +130,12 @@ module VX_pe_serializer #(
end else begin
assign pe_data_in_s = data_in;
assign enable = ready_out || ~valid_out;
assign ready_in = enable;
assign pe_enable = enable;
assign pe_data_in= data_in;
assign valid_out = valid_out_s;
assign data_out = pe_data_out;

View file

@ -13,11 +13,13 @@
`include "VX_platform.vh"
`TRACING_OFF
//`TRACING_OFF
module VX_pending_size #(
parameter SIZE = 1,
parameter INCRW = 1,
parameter DECRW = 1,
parameter ALM_FULL = (SIZE - 1),
parameter ALM_EMPTY = 1,
parameter SIZEW = `CLOG2(SIZE+1)
) (
input wire clk,
@ -25,32 +27,39 @@ module VX_pending_size #(
input wire [INCRW-1:0] incr,
input wire [DECRW-1:0] decr,
output wire empty,
output wire alm_empty,
output wire full,
output wire alm_full,
output wire [SIZEW-1:0] size
);
`STATIC_ASSERT(INCRW <= SIZEW, ("invalid parameter"))
`STATIC_ASSERT(DECRW <= SIZEW, ("invalid parameter"))
`STATIC_ASSERT(INCRW <= SIZEW, ("invalid parameter: %d vs %d", INCRW, SIZEW))
`STATIC_ASSERT(DECRW <= SIZEW, ("invalid parameter: %d vs %d", DECRW, SIZEW))
localparam ADDRW = `LOG2UP(SIZE);
reg empty_r;
reg full_r;
reg empty_r, alm_empty_r;
reg full_r, alm_full_r;
if (INCRW != 1 || DECRW != 1) begin
reg [SIZEW-1:0] size_r;
wire [SIZEW-1:0] size_n;
assign size_n = size_r + SIZEW'(incr) - SIZEW'(decr);
wire [SIZEW-1:0] size_n = size_r + SIZEW'(incr) - SIZEW'(decr);
always @(posedge clk) begin
if (reset) begin
size_r <= '0;
empty_r <= 1;
alm_empty_r <= 1;
alm_full_r <= 0;
full_r <= 0;
size_r <= '0;
end else begin
`ASSERT((SIZEW'(incr) >= SIZEW'(decr)) || (size_n >= size_r), ("runtime error: counter overflow"));
`ASSERT((SIZEW'(incr) <= SIZEW'(decr)) || (size_n <= size_r), ("runtime error: counter underflow"));
size_r <= size_n;
empty_r <= (size_n == SIZEW'(0));
alm_empty_r <= (size_n == SIZEW'(ALM_EMPTY));
full_r <= (size_n == SIZEW'(SIZE));
alm_full_r <= (size_n == SIZEW'(ALM_FULL));
end
end
@ -59,30 +68,47 @@ module VX_pending_size #(
end else begin
reg [ADDRW-1:0] used_r;
wire [ADDRW-1:0] used_n;
always @(posedge clk) begin
if (reset) begin
used_r <= '0;
empty_r <= 1;
alm_empty_r <= 1;
full_r <= 0;
alm_full_r <= 0;
used_r <= '0;
end else begin
`ASSERT(~(incr && ~decr) || ~full, ("runtime error: incrementing full counter"));
`ASSERT(~(decr && ~incr) || ~empty, ("runtime error: decrementing empty counter"));
`ASSERT(~(incr && ~decr) || ~full, ("runtime error: counter overflow"));
`ASSERT(~(decr && ~incr) || ~empty, ("runtime error: counter underflow"));
if (incr) begin
if (~decr) begin
empty_r <= 0;
if (used_r == ADDRW'(ALM_EMPTY))
alm_empty_r <= 0;
if (used_r == ADDRW'(SIZE-1))
full_r <= 1;
if (used_r == ADDRW'(ALM_FULL-1))
alm_full_r <= 1;
end
end else if (decr) begin
full_r <= 0;
if (used_r == ADDRW'(1))
empty_r <= 1;
if (used_r == ADDRW'(ALM_EMPTY+1))
alm_empty_r <= 1;
full_r <= 0;
if (used_r == ADDRW'(ALM_FULL))
alm_full_r <= 0;
end
used_r <= $signed(used_r) + ADDRW'($signed(2'(incr) - 2'(decr)));
used_r <= used_n;
end
end
// Next value of the usage counter.
if (SIZE == 2) begin
// XOR form: used_r flips when exactly one of incr/decr is asserted.
// NOTE(review): presumably ADDRW is 1 bit when SIZE == 2, making this a
// single-bit toggle - confirm against the LOG2UP definition.
assign used_n = used_r ^ (incr ^ decr);
end else begin
// General form: add the signed 2-bit difference (incr - decr) to used_r.
assign used_n = $signed(used_r) + ADDRW'($signed(2'(incr) - 2'(decr)));
end
if (SIZE > 1) begin
if (SIZEW > ADDRW) begin
assign size = {full_r, used_r};
@ -96,7 +122,9 @@ module VX_pending_size #(
end
assign empty = empty_r;
assign alm_empty = alm_empty_r;
assign alm_full = alm_full_r;
assign full = full_r;
endmodule
`TRACING_ON
//`TRACING_ON

View file

@ -16,7 +16,6 @@
`TRACING_OFF
module VX_rr_arbiter #(
parameter NUM_REQS = 1,
parameter LOCK_ENABLE = 0,
parameter MODEL = 1,
parameter LOG_NUM_REQS = `LOG2UP(NUM_REQS)
) (
@ -26,13 +25,13 @@ module VX_rr_arbiter #(
output wire [LOG_NUM_REQS-1:0] grant_index,
output wire [NUM_REQS-1:0] grant_onehot,
output wire grant_valid,
input wire grant_unlock
input wire grant_ready
);
if (NUM_REQS == 1) begin
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
`UNUSED_VAR (grant_unlock)
`UNUSED_VAR (grant_ready)
assign grant_index = '0;
assign grant_onehot = requests;
@ -55,7 +54,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end
@ -85,7 +84,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end
@ -121,7 +120,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end
@ -165,7 +164,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end
@ -219,7 +218,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end
@ -285,7 +284,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end
@ -365,7 +364,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end
@ -385,12 +384,18 @@ module VX_rr_arbiter #(
wire [NUM_REQS-1:0] req_masked = requests & pointer_reg;
assign mask_higher_pri_regs[NUM_REQS-1:1] = mask_higher_pri_regs[NUM_REQS-2:0] | req_masked[NUM_REQS-2:0];
assign mask_higher_pri_regs[0] = 1'b0;
for (genvar i = 1; i < NUM_REQS; ++i) begin
assign mask_higher_pri_regs[i] = mask_higher_pri_regs[i-1] | req_masked[i-1];
end
assign grant_masked[NUM_REQS-1:0] = req_masked[NUM_REQS-1:0] & ~mask_higher_pri_regs[NUM_REQS-1:0];
assign unmask_higher_pri_regs[NUM_REQS-1:1] = unmask_higher_pri_regs[NUM_REQS-2:0] | requests[NUM_REQS-2:0];
assign unmask_higher_pri_regs[0] = 1'b0;
for (genvar i = 1; i < NUM_REQS; ++i) begin
assign unmask_higher_pri_regs[i] = unmask_higher_pri_regs[i-1] | requests[i-1];
end
assign grant_unmasked[NUM_REQS-1:0] = requests[NUM_REQS-1:0] & ~unmask_higher_pri_regs[NUM_REQS-1:0];
wire no_req_masked = ~(|req_masked);
@ -399,7 +404,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
pointer_reg <= {NUM_REQS{1'b1}};
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
if (|req_masked) begin
pointer_reg <= mask_higher_pri_regs;
end else if (|requests) begin
@ -443,7 +448,7 @@ module VX_rr_arbiter #(
always @(posedge clk) begin
if (reset) begin
state <= '0;
end else if (!LOCK_ENABLE || grant_unlock) begin
end else if (grant_ready) begin
state <= grant_index_r;
end
end

View file

@ -17,6 +17,7 @@
module VX_sp_ram #(
parameter DATAW = 1,
parameter SIZE = 1,
parameter ADDR_MIN = 0,
parameter WRENW = 1,
parameter OUT_REG = 0,
parameter NO_RWCHECK = 0,
@ -37,6 +38,7 @@ module VX_sp_ram #(
VX_dp_ram #(
.DATAW (DATAW),
.SIZE (SIZE),
.ADDR_MIN (ADDR_MIN),
.WRENW (WRENW),
.OUT_REG (OUT_REG),
.NO_RWCHECK (NO_RWCHECK),

View file

@ -21,6 +21,7 @@ module VX_stream_arb #(
parameter `STRING ARBITER = "P",
parameter MAX_FANOUT = `MAX_FANOUT,
parameter OUT_BUF = 0,
parameter LUTRAM = 0,
parameter NUM_REQS = `CDIV(NUM_INPUTS, NUM_OUTPUTS),
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
parameter NUM_REQS_W = `UP(LOG_NUM_REQS)
@ -57,7 +58,8 @@ module VX_stream_arb #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (OUT_BUF),
.LUTRAM (LUTRAM)
) arb_slice (
.clk (clk),
.reset (slice_reset),
@ -101,7 +103,8 @@ module VX_stream_arb #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (3), // registered output
.LUTRAM (LUTRAM)
) fanout_slice_arb (
.clk (clk),
.reset (slice_reset),
@ -127,7 +130,8 @@ module VX_stream_arb #(
.DATAW (DATAW + LOG_NUM_REQS2),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (OUT_BUF),
.LUTRAM (LUTRAM)
) fanout_join_arb (
.clk (clk),
.reset (reset),
@ -158,7 +162,6 @@ module VX_stream_arb #(
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
@ -167,21 +170,30 @@ module VX_stream_arb #(
.grant_valid (arb_valid),
.grant_index (arb_index),
.grant_onehot (arb_onehot),
.grant_unlock (arb_ready)
.grant_ready (arb_ready)
);
assign valid_in_r = arb_valid;
assign data_in_r = data_in[arb_index];
assign arb_ready = ready_in_r;
VX_onehot_mux #(
.DATAW (DATAW),
.N (NUM_REQS)
) onehot_mux (
.data_in (data_in),
.sel_in (arb_onehot),
.data_out (data_in_r)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign ready_in[i] = ready_in_r & arb_onehot[i];
assign ready_in[i] = ready_in_r && arb_onehot[i];
end
VX_elastic_buffer #(
.DATAW (LOG_NUM_REQS + DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (LUTRAM)
) out_buf (
.clk (clk),
.reset (reset),
@ -214,7 +226,8 @@ module VX_stream_arb #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (OUT_BUF),
.LUTRAM (LUTRAM)
) arb_slice (
.clk (clk),
.reset (slice_reset),
@ -248,7 +261,8 @@ module VX_stream_arb #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (3), // registered output
.LUTRAM (LUTRAM)
) fanout_fork_arb (
.clk (clk),
.reset (reset),
@ -275,7 +289,8 @@ module VX_stream_arb #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (OUT_BUF),
.LUTRAM (LUTRAM)
) fanout_slice_arb (
.clk (clk),
.reset (slice_reset),
@ -302,7 +317,6 @@ module VX_stream_arb #(
VX_generic_arbiter #(
.NUM_REQS (NUM_OUTPUTS),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
@ -311,7 +325,7 @@ module VX_stream_arb #(
.grant_valid (arb_valid),
`UNUSED_PIN (grant_index),
.grant_onehot (arb_onehot),
.grant_unlock (arb_ready)
.grant_ready (arb_ready)
);
assign arb_requests = ready_in_r;
@ -322,7 +336,8 @@ module VX_stream_arb #(
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (LUTRAM)
) out_buf (
.clk (clk),
.reset (reset),
@ -349,7 +364,8 @@ module VX_stream_arb #(
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (LUTRAM)
) out_buf (
.clk (clk),
.reset (out_buf_reset),

View file

@ -38,46 +38,47 @@ module VX_stream_pack #(
output wire [TAG_WIDTH-1:0] tag_out,
input wire ready_out
);
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS);
if (NUM_REQS > 1) begin
wire [LOG_NUM_REQS-1:0] grant_index;
wire [NUM_REQS-1:0] grant_onehot;
wire grant_valid;
wire grant_ready;
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.LOCK_ENABLE (1),
.TYPE (ARBITER)
) arbiter (
.clk (clk),
.reset (reset),
.requests (valid_in),
.grant_valid (grant_valid),
.grant_index (grant_index),
`UNUSED_PIN (grant_onehot),
.grant_unlock(grant_ready)
`UNUSED_PIN (grant_index),
.grant_onehot(grant_onehot),
.grant_ready (grant_ready)
);
reg [NUM_REQS-1:0] valid_sel;
reg [NUM_REQS-1:0] ready_sel;
wire ready_unqual;
wire [TAG_WIDTH-1:0] tag_sel;
wire [TAG_WIDTH-1:0] tag_sel = tag_in[grant_index];
VX_onehot_mux #(
.DATAW (TAG_WIDTH),
.N (NUM_REQS)
) onehot_mux (
.data_in (tag_in),
.sel_in (grant_onehot),
.data_out (tag_sel)
);
always @(*) begin
valid_sel = '0;
ready_sel = '0;
for (integer i = 0; i < NUM_REQS; ++i) begin
if (tag_in[i][TAG_SEL_BITS-1:0] == tag_sel[TAG_SEL_BITS-1:0]) begin
valid_sel[i] = valid_in[i];
ready_sel[i] = ready_unqual;
end
end
wire [NUM_REQS-1:0] tag_matches;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign tag_matches[i] = (tag_in[i][TAG_SEL_BITS-1:0] == tag_sel[TAG_SEL_BITS-1:0]);
end
assign grant_ready = ready_unqual;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign ready_in[i] = grant_ready & tag_matches[i];
end
wire [NUM_REQS-1:0] mask_sel = valid_in & tag_matches;
VX_elastic_buffer #(
.DATAW (NUM_REQS + TAG_WIDTH + (NUM_REQS * DATA_WIDTH)),
@ -87,15 +88,13 @@ module VX_stream_pack #(
.clk (clk),
.reset (reset),
.valid_in (grant_valid),
.data_in ({valid_sel, tag_sel, data_in}),
.ready_in (ready_unqual),
.data_in ({mask_sel, tag_sel, data_in}),
.ready_in (grant_ready),
.valid_out (valid_out),
.data_out ({mask_out, tag_out, data_out}),
.ready_out (ready_out)
);
assign ready_in = ready_sel;
end else begin
`UNUSED_VAR (clk)

View file

@ -22,6 +22,7 @@ module VX_stream_xbar #(
parameter OUT_WIDTH = `LOG2UP(NUM_OUTPUTS),
parameter ARBITER = "P",
parameter OUT_BUF = 0,
parameter LUTRAM = 0,
parameter MAX_FANOUT = `MAX_FANOUT,
parameter PERF_CTR_BITS = `CLOG2(NUM_INPUTS+1)
) (
@ -66,7 +67,8 @@ module VX_stream_xbar #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (OUT_BUF),
.LUTRAM (LUTRAM)
) xbar_arb (
.clk (clk),
.reset (slice_reset),
@ -94,7 +96,8 @@ module VX_stream_xbar #(
.DATAW (DATAW),
.ARBITER (ARBITER),
.MAX_FANOUT (MAX_FANOUT),
.OUT_BUF (OUT_BUF)
.OUT_BUF (OUT_BUF),
.LUTRAM (LUTRAM)
) xbar_arb (
.clk (clk),
.reset (reset),
@ -130,7 +133,8 @@ module VX_stream_xbar #(
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (LUTRAM)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
@ -152,7 +156,8 @@ module VX_stream_xbar #(
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (LUTRAM)
) out_buf (
.clk (clk),
.reset (reset),

View file

@ -33,7 +33,10 @@ module VX_local_mem import VX_gpu_pkg::*; #(
parameter UUID_WIDTH = 0,
// Request tag size
parameter TAG_WIDTH = 16
parameter TAG_WIDTH = 16,
// Response buffer
parameter OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -206,7 +209,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (RSP_DATAW),
.OUT_BUF (2)
.OUT_BUF (OUT_BUF)
) rsp_xbar (
.clk (clk),
.reset (reset),

View file

@ -17,7 +17,7 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
parameter NUM_REQS = 1,
parameter DATA_SIZE = 1,
parameter TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter ADDR_WIDTH = 1,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter `STRING ARBITER = "R",
@ -30,7 +30,6 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
VX_mem_bus_if.slave bus_in_if,
VX_mem_bus_if.master bus_out_if [NUM_REQS]
);
localparam ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE));
localparam DATA_WIDTH = (8 * DATA_SIZE);
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `ADDR_TYPE_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;

View file

@ -12,7 +12,6 @@ SRC_DIR := $(VORTEX_HOME)/hw/syn/altera/opae
RTL_DIR := $(VORTEX_HOME)/hw/rtl
DPI_DIR := $(VORTEX_HOME)/hw/dpi
AFU_DIR := $(RTL_DIR)/afu/opae
THIRD_PARTY_DIR := $(VORTEX_HOME)/third_party
SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
IP_CACHE_DIR := $(ROOT_DIR)/hw/syn/altera/ip_cache/$(DEVICE_FAMILY)
@ -76,9 +75,8 @@ endif
# Debugging
ifdef DEBUG
ifeq ($(TARGET), fpga)
CFLAGS += -DNDEBUG -DSCOPE $(DBG_SCOPE_FLAGS)
SCOPE_JSON += $(BUILD_DIR)/scope.json
ifneq ($(TARGET), fpga)
CFLAGS += -DNDEBUG
else
CFLAGS += $(DBG_TRACE_FLAGS)
endif
@ -88,7 +86,8 @@ endif
# Enable scope analyzer
ifdef SCOPE
CFLAGS += -DSCOPE
CFLAGS += -DSCOPE $(DBG_SCOPE_FLAGS)
SCOPE_JSON += $(BUILD_DIR)/scope.json
endif
# Enable perf counters

View file

@ -20,7 +20,7 @@ BUILD_DIR=$(realpath $1)
PROGRAM=$(basename "$2")
PROGRAM_DIR=`dirname $2`
POCL_RT_PATH=$TOOLDIR/pocl/runtime
POCL_PATH=$TOOLDIR/pocl
VORTEX_RT_PATH=$SCRIPT_DIR/../../../../runtime
# Export ASE_WORKDIR variable
@ -49,7 +49,7 @@ done
pushd $PROGRAM_DIR
shift 2
echo " [DBG] running ./$PROGRAM $*"
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_RT_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $*
ASE_LOG=0 LD_LIBRARY_PATH=$POCL_PATH/lib:$VORTEX_RT_PATH/opae:$LD_LIBRARY_PATH ./$PROGRAM $*
popd
# stop the simulator (kill process group)

View file

@ -9,7 +9,7 @@ SCRIPT_DIR := $(VORTEX_HOME)/hw/scripts
IP_CACHE_DIR := $(ROOT_DIR)/hw/syn/altera/ip_cache/$(DEVICE_FAMILY)
.PHONY: dogfood unittest pipeline lmem cache fpu core vortex top test
.PHONY: dogfood unittest pipeline lmem cache fpu core issue vortex top test
ip-gen: $(IP_CACHE_DIR)/ip_gen.log
$(IP_CACHE_DIR)/ip_gen.log:
@ -50,6 +50,11 @@ core:
cp core/Makefile core/$(BUILD_DIR)
$(MAKE) -C core/$(BUILD_DIR) clean && $(MAKE) -C core/$(BUILD_DIR) > core/$(BUILD_DIR)/build.log 2>&1 &
# issue: create issue/$(BUILD_DIR), copy issue/Makefile into it, then run
# `clean` followed by the default goal in that directory. The trailing `&`
# runs the clean-and-build command list in the background; stdout and stderr
# of the second make invocation are redirected to issue/$(BUILD_DIR)/build.log.
issue:
mkdir -p issue/$(BUILD_DIR)
cp issue/Makefile issue/$(BUILD_DIR)
$(MAKE) -C issue/$(BUILD_DIR) clean && $(MAKE) -C issue/$(BUILD_DIR) > issue/$(BUILD_DIR)/build.log 2>&1 &
vortex: ip-gen
mkdir -p vortex/$(BUILD_DIR)
cp vortex/Makefile vortex/$(BUILD_DIR)

Some files were not shown because too many files have changed in this diff Show more