Merge branch 'master' into assignment5
17
.travis.yml
|
@ -30,25 +30,28 @@ jobs:
|
|||
include:
|
||||
- stage: test
|
||||
name: coverage
|
||||
script: cp -r $PWD ../build1 && cd ../build1 && ./ci/travis_run.py ./ci/regression.sh -coverage
|
||||
script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
|
||||
- stage: test
|
||||
name: tex
|
||||
script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
|
||||
- stage: test
|
||||
name: cluster
|
||||
script: cp -r $PWD ../build2 && cd ../build2 && ./ci/travis_run.py ./ci/regression.sh -cluster
|
||||
script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
|
||||
- stage: test
|
||||
name: debug
|
||||
script: cp -r $PWD ../build3 && cd ../build3 && ./ci/travis_run.py ./ci/regression.sh -debug
|
||||
script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
|
||||
- stage: test
|
||||
name: config
|
||||
script: cp -r $PWD ../build4 && cd ../build4 && ./ci/travis_run.py ./ci/regression.sh -config
|
||||
script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config
|
||||
- stage: test
|
||||
name: stress0
|
||||
script: cp -r $PWD ../build5 && cd ../build5 && ./ci/travis_run.py ./ci/regression.sh -stress0
|
||||
script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0
|
||||
- stage: test
|
||||
name: stress1
|
||||
script: cp -r $PWD ../build6 && cd ../build6 && ./ci/travis_run.py ./ci/regression.sh -stress1
|
||||
script: cp -r $PWD ../build_stress1 && cd ../build_stress1 && ./ci/travis_run.py ./ci/regression.sh -stress1
|
||||
- stage: test
|
||||
name: compiler
|
||||
script: cp -r $PWD ../build7 && cd ../build7 && ./ci/travis_run.py ./ci/test_compiler.sh
|
||||
script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
|
||||
|
||||
after_success:
|
||||
# Gather code coverage
|
||||
|
|
59
README.md
|
@ -21,59 +21,32 @@ Vortex is a full-system RISCV-based GPGPU processor.
|
|||
## Directory structure
|
||||
|
||||
- `doc`: [Documentation](doc/Vortex.md).
|
||||
|
||||
- `hw`: Hardware sources.
|
||||
|
||||
- `driver`: Host drivers repository.
|
||||
|
||||
- `runtime`: Kernel Runtime software.
|
||||
|
||||
- `sim`: Simulators repository.
|
||||
|
||||
- `tests`: Tests repository.
|
||||
|
||||
- `ci`: Continuous integration scripts.
|
||||
|
||||
- `miscs`: Miscellaneous resources.
|
||||
|
||||
## Basic Installation
|
||||
|
||||
## Build Instructions
|
||||
### Supported OS Platforms
|
||||
- Ubuntu 18.04
|
||||
- Centos 7
|
||||
### Toolchain Dependencies
|
||||
- [POCL](http://portablecl.org/)
|
||||
- [LLVM](https://llvm.org/)
|
||||
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
|
||||
- [Verilator](https://www.veripool.org/verilator)
|
||||
### Install development tools
|
||||
|
||||
$ sudo apt-get install build-essential
|
||||
$ sudo apt-get install git
|
||||
|
||||
### Install gnu-riscv-tools
|
||||
|
||||
$ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
|
||||
|
||||
$ sudo apt-get -y install \
|
||||
binutils build-essential libtool texinfo \
|
||||
gzip zip unzip patchutils curl git \
|
||||
make cmake ninja-build automake bison flex gperf \
|
||||
grep sed gawk python bc \
|
||||
zlib1g-dev libexpat1-dev libmpc-dev \
|
||||
libglib2.0-dev libfdt-dev libpixman-1-dev
|
||||
$ git clone https://github.com/riscv/riscv-gnu-toolchain
|
||||
$ cd riscv-gnu-toolchain
|
||||
$ git submodule update --init --recursive
|
||||
$ mkdir build
|
||||
$ cd build
|
||||
$ ../configure --prefix=$RISCV_TOOLCHAIN_PATH --with-arch=rv32im --with-abi=ilp32
|
||||
$ make -j`nproc`
|
||||
$ make -j`nproc` build-qemu
|
||||
|
||||
### Install Verilator
|
||||
|
||||
You need to build the latest version using the instructions on their website
|
||||
$ https://www.veripool.org/projects/verilator/wiki/Installing
|
||||
|
||||
### Install Vortex
|
||||
|
||||
### Install Vortex codebase
|
||||
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git
|
||||
$ cd Vortex
|
||||
$ make
|
||||
|
||||
### Quick Test running OpenCL vecadd sample on 2 cores
|
||||
|
||||
$ ./ci/blackbox.sh --cores=2 --app=vecadd
|
||||
### Install prebuilt toolchain
|
||||
$ ./ci/toolchain_install.sh -all
|
||||
### Build Vortex sources
|
||||
$ make -s
|
||||
### Quick demo running vecadd OpenCL kernel on 2 cores
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --cores=2 --app=vecadd
|
||||
|
|
|
@ -12,7 +12,7 @@ VORTEX_HOME=$SCRIPT_DIR/..
|
|||
DRIVER=vlsim
|
||||
APP=sgemm
|
||||
CLUSTERS=1
|
||||
CORES=2
|
||||
CORES=1
|
||||
WARPS=4
|
||||
THREADS=4
|
||||
L2=0
|
||||
|
@ -132,9 +132,9 @@ if [ $DEBUG -eq 1 ]
|
|||
then
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
|
||||
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
else
|
||||
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
|
||||
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
fi
|
||||
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
|
@ -153,9 +153,9 @@ then
|
|||
else
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
SCOPE=1 CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
|
||||
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
else
|
||||
CONFIGS="$CONFIGS" make -s -C $DRIVER_PATH
|
||||
CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
fi
|
||||
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
|
|
|
@ -22,6 +22,17 @@ make -C tests/opencl run-simx
|
|||
echo "coverage tests done!"
|
||||
}
|
||||
|
||||
# Runs the texture-unit regression tests.
# Each run invokes ./ci/blackbox.sh on the `tex` app with the texture
# extension enabled through CONFIGS, once on vlsim and twice on rtlsim,
# using different input images and -g values.
tex()
{
    echo "begin texture tests..."

    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0"
    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1"
    CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-irainbow.png -orainbow_result.png -g1"

    # Completion message follows the "<suite> tests done!" form used by the
    # other suites in this script (was a copy-paste of the coverage message).
    echo "texture tests done!"
}
|
||||
|
||||
cluster()
|
||||
{
|
||||
echo "begin clustering tests..."
|
||||
|
@ -134,13 +145,15 @@ echo "stress1 tests done!"
|
|||
|
||||
usage()
|
||||
{
|
||||
echo "usage: regression [-coverage] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]"
|
||||
echo "usage: regression [-coverage] [-tex] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]"
|
||||
}
|
||||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
-coverage ) coverage
|
||||
;;
|
||||
-tex ) tex
|
||||
;;
|
||||
-cluster ) cluster
|
||||
;;
|
||||
-debug ) debug
|
||||
|
@ -155,6 +168,7 @@ while [ "$1" != "" ]; do
|
|||
stress1
|
||||
;;
|
||||
-all ) coverage
|
||||
tex
|
||||
cluster
|
||||
debug
|
||||
config
|
||||
|
|
Before Width: | Height: | Size: 60 KiB After Width: | Height: | Size: 60 KiB |
Before Width: | Height: | Size: 77 KiB After Width: | Height: | Size: 77 KiB |
Before Width: | Height: | Size: 67 KiB After Width: | Height: | Size: 67 KiB |
Before Width: | Height: | Size: 517 KiB After Width: | Height: | Size: 517 KiB |
|
@ -8,7 +8,7 @@ The Vortex Cache Sub-system has the following main properties:
|
|||
|
||||
### Cache Hierarchy
|
||||
|
||||

|
||||

|
||||
|
||||
- Cache can be configured to be any level in the hierarchy
|
||||
- Caches communicate via snooping
|
||||
|
@ -18,7 +18,7 @@ The Vortex Cache Sub-system has the following main properties:
|
|||
|
||||
VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/cache` directory.
|
||||
|
||||

|
||||

|
||||
|
||||
- Configurable (Cache size, number of banks, bank line size, etc.)
|
||||
- I/O signals
|
||||
|
@ -44,7 +44,7 @@ VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/c
|
|||
|
||||
VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
|
||||
|
||||

|
||||

|
||||
|
||||
- Allows for high throughput
|
||||
- Each bank contains queues to hold requests to the cache
|
128
docs/execute_opencl_on_vortex.md
Normal file
|
@ -0,0 +1,128 @@
|
|||
# Execute OpenCL on Vortex backend
|
||||
|
||||
## Requirements
|
||||
- [Vortex](https://github.com/vortexgpgpu/vortex)
|
||||
- [POCL for Vortex](https://github.com/vortexgpgpu/pocl)
|
||||
- [riscv-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain)
|
||||
- [llvm-riscv](https://github.com/llvm-mirror/llvm)
|
||||
|
||||
For installation, please see [Build Instructions](../README.md) for more details.
|
||||
|
||||
**Ubuntu 18.04 users can directly download the pre-built toolchains with the [toolchain_install.sh](https://github.com/vortexgpgpu/vortex/blob/master/ci/toolchain_install.sh) script.**
|
||||
```bash
|
||||
# please modify the DESTDIR variable in the script before execution
|
||||
bash toolchain_install.sh -all
|
||||
```
|
||||
Assuming we have installed all dependencies in `/opt` path, we can get the following environment:
|
||||
```bash
|
||||
tree -L 2 /opt
|
||||
'''
|
||||
/opt/
|
||||
├── llvm-riscv
|
||||
│ ├── bin
|
||||
│ ├── include
|
||||
│ ├── lib
|
||||
│ ├── libexec
|
||||
│ └── share
|
||||
├── pocl
|
||||
│ ├── compiler
|
||||
│ └── runtime
|
||||
├── riscv-gnu-toolchain
|
||||
│ ├── bin
|
||||
│ ├── drops
|
||||
│ ├── include
|
||||
│ ├── lib
|
||||
│ ├── libexec
|
||||
│ ├── riscv32-unknown-elf
|
||||
│ ├── share
|
||||
│ └── var
|
||||
└── verilator
|
||||
├── bin
|
||||
├── examples
|
||||
├── include
|
||||
├── verilator-config.cmake
|
||||
└── verilator-config-version.cmake
|
||||
'''
|
||||
```
|
||||
## Execute OpenCL on Vortex
|
||||
In this tutorial, we show an example of executing the vecadd program on the SIMX backend.
|
||||
To execute an OpenCL program on Vortex, follow these steps:
|
||||
- Compile the [OpenCL kernels](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/kernel.cl) into a RISC-V binary with the POCL compiler.
|
||||
- Compile the [OpenCL host](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/main.cc) and link it with the Vortex driver (`-lvortex`).
|
||||
- Execute the compiled host programs on a backend.
|
||||
|
||||
Thus, we can write a Makefile as follows:
|
||||
```Makefile
|
||||
# Makefile for the vecadd OpenCL sample: builds the host program, compiles
# the kernel with POCL, and runs the result on a chosen Vortex driver backend.

# Remove a target whose recipe failed, so a truncated file never looks up to date.
.DELETE_ON_ERROR:

# Toolchain locations; each may be overridden from the environment or command line.
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime

# Arguments passed to the host program by the run-* targets.
OPTS ?= -n64

# Please edit these two variables to match your environment.
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# Kernel build flags handed to poclcc. The double quotes are part of each value
# so that every flag set reaches poclcc as a single shell argument.
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS   += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS  += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"

# Host program build flags.
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
CXXFLAGS += -I$(POCL_RT_PATH)/include

LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex

PROJECT = vecadd

SRCS = main.cc

# One run target per driver backend; the part after "run-" is the name of the
# driver sub-directory that is prepended to LD_LIBRARY_PATH.
RUN_TARGETS := run-fpga run-asesim run-vlsim run-simx run-rtlsim

# These names are commands, not files; declare them so a stray file with the
# same name cannot silently disable them.
.PHONY: all clean clean-all $(RUN_TARGETS)

all: $(PROJECT) kernel.pocl

# Compile the OpenCL kernel into a Vortex binary with the POCL offline compiler.
kernel.pocl: kernel.cl
	LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o $@ $<

# Build the host program.
$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# Run the host program against the selected backend, e.g. `make run-simx`.
$(RUN_TARGETS): $(PROJECT) kernel.pocl
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/$(patsubst run-%,%,$@):$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

# Compiler-generated header dependencies for the host sources.
# NOTE(review): -MM emits rules for object files, which no rule in this
# Makefile builds; confirm whether .depend is still needed.
.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > $@

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.pocl *.dump

# Skip the dependency file when a cleaning goal is requested; otherwise make
# would regenerate .depend only for the clean recipe to delete it again.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
||||
```
|
||||
|
||||
First, build the host program.
|
||||
```bash
|
||||
make all
|
||||
```
|
||||
If we want to execute on SIMX, we can execute the command below.
|
||||
```bash
|
||||
make run-simx
|
||||
```
|
|
@ -13,17 +13,6 @@ OPAE Environment Setup
|
|||
$ export PATH=:/opt/verilator/bin:$PATH
|
||||
$ export VERILATOR_ROOT=/opt/verilator
|
||||
|
||||
OPAE Build Configuration
|
||||
------------------------
|
||||
|
||||
Within the `/hw/syn/opae` directory, there are source text files for each core-option for the fpga build (the 32 and 64 core options are not currently implemented) which have the following parameters that can be configured:
|
||||
- NUM_CORES: the number of cores per cluster
|
||||
- NUM_CLUSTERS: the number of clusters allotted to the processor
|
||||
- L3_ENABLE: enable the use of the L3 cache
|
||||
- PERF_ENABLE: enable the use of all profile counters
|
||||
|
||||
To enable L3 cache and profile counters for a build, simply uncomment the definition within the respective source file.
|
||||
|
||||
OPAE Build
|
||||
------------------
|
||||
|
||||
|
@ -33,41 +22,58 @@ The FPGA has to following configuration options:
|
|||
- 4 cores fpga (fpga-4c)
|
||||
- 8 cores fpga (fpga-8c)
|
||||
- 16 cores fpga (fpga-16c)
|
||||
- 32 cores fpga (fpga-32c)
|
||||
- 64 cores fpga (fpga-64c)
|
||||
|
||||
Command line:
|
||||
|
||||
$ cd hw/syn/opae
|
||||
$ make fpga- *# of cores* c
|
||||
$ make fpga-<num-of-cores>c
|
||||
|
||||
Example: `make fpga-4c`
|
||||
|
||||
A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-45 min to complete.
|
||||
A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-480 min to complete.
|
||||
|
||||
|
||||
OPAE Build Configuration
|
||||
------------------------
|
||||
|
||||
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
|
||||
- `NUM_WARPS`: Number of warps per core
|
||||
- `NUM_THREADS`: Number of threads per warp
|
||||
- `PERF_ENABLE`: enable the use of all profile counters
|
||||
|
||||
You can configure the synthesis build from the command line:
|
||||
|
||||
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make fpga-4c
|
||||
|
||||
OPAE Build Progress
|
||||
-------------------
|
||||
|
||||
You could check the last 10 lines in the build log for possible errors until build completion.
|
||||
|
||||
$ tail -n 10 ./build_fpga_4c/build.log
|
||||
$ tail -n 10 ./build_fpga_<num-of-cores>c/build.log
|
||||
|
||||
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
|
||||
|
||||
$ ps -u *username*
|
||||
$ ps -u <username>
|
||||
|
||||
|
||||
If the build fails and you need to restart it, clean up the build folder using the following command:
|
||||
|
||||
$ make clean-fpga- *# of cores* c
|
||||
$ make clean-fpga-<num-of-cores>c
|
||||
|
||||
Example: `make clean-fpga-4c`
|
||||
|
||||
The file `vortex_afu.gbs` should exist when the build is done:
|
||||
|
||||
$ ls -lsa ./build_fpga_ *# of cores* c/vortex_afu.gbs
|
||||
$ ls -lsa ./build_fpga_<num-of-cores>c/vortex_afu.gbs
|
||||
|
||||
|
||||
Signing the bitstream and Programming the FPGA
|
||||
----------------------------------------------
|
||||
|
||||
$ cd ./build_fpga_`# of cores`c/
|
||||
$ cd ./build_fpga_<num-of-cores>c
|
||||
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
|
||||
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
|
||||
|
|
@ -14,17 +14,17 @@
|
|||
|
||||
## Installation
|
||||
|
||||
- Refer to the install instructions in [README](../README.md).
|
||||
- Refer to the build instructions in [README](../README.md).
|
||||
|
||||
## Quick Start Scenarios
|
||||
|
||||
Running Vortex simulators with different configurations:
|
||||
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
|
||||
|
||||
$ ./ci/blackbox.sh --clusters=2 --cores=2 --warps=2 --threads=4 --driver=rtlsim --app=basic
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
|
||||
- Run demo driver test with vlsim driver and Vortex config of 1 cluster, 4 cores, 4 warps, 2 threads
|
||||
|
||||
$ ./ci/blackbox.sh --clusters=1 --cores=4 --warps=4 --threads=2 --driver=vlsim --app=demo
|
||||
$ ./ci/blackbox.sh --driver=vlsim --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
|
||||
- Run dogfood driver test with simx driver and Vortex config of 4 clusters, 4 cores, 8 warps, 6 threads
|
||||
|
||||
$ ./ci/blackbox.sh --clusters=4 --cores=4 --warps=8 --threads=6 --driver=simx --app=dogfood
|
||||
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
|
|
@ -32,7 +32,7 @@ Vortex uses the SIMT (Single Instruction, Multiple Threads) execution model with
|
|||
|
||||
### Vortex Pipeline/Datapath
|
||||
|
||||

|
||||

|
||||
|
||||
Vortex has a 5-stage pipeline: FI | ID | Issue | EX | WB.
|
||||
|
|
@ -63,12 +63,5 @@ scope: scope-defs.h
|
|||
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
|
||||
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend scope-defs.h
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
rm -rf $(PROJECT) *.o scope-defs.h
|
|
@ -65,12 +65,5 @@ scope: scope-defs.h
|
|||
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
|
||||
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend scope-defs.h
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
rm -rf $(PROJECT) *.o scope-defs.h
|
|
@ -35,4 +35,4 @@ $(PROJECT): $(SRCS)
|
|||
|
||||
clean:
|
||||
$(MAKE) -C $(RTLSIM_DIR) clean-static
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
rm -rf $(PROJECT) *.o
|
|
@ -21,9 +21,6 @@ $(PROJECT): $(SRCS)
|
|||
$(MAKE) -C $(SIMX_DIR) static
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
$(MAKE) -C $(SIMX_DIR) clean-static
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
rm -rf $(PROJECT) *.o
|
|
@ -50,13 +50,6 @@ $(PROJECT): $(SRCS) $(SCOPE_H)
|
|||
$(SCOPE_ENABLE) $(PERF_ENABLE) $(MAKE) -C $(VLSIM_DIR) static
|
||||
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $(SRCS) > .depend;
|
||||
|
||||
clean:
|
||||
$(MAKE) -C $(VLSIM_DIR) clean-static
|
||||
rm -rf $(PROJECT) *.o .depend scope-defs.h
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
rm -rf $(PROJECT) *.o scope-defs.h
|
|
@ -217,7 +217,7 @@ module VX_alu_unit #(
|
|||
// can accept new request?
|
||||
assign alu_req_if.ready = ready_in;
|
||||
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (branch_ctl_if.valid) begin
|
||||
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n",
|
||||
|
|
159
hw/rtl/VX_cache_arb.sv
Normal file
|
@ -0,0 +1,159 @@
|
|||
`include "VX_define.vh"

// VX_cache_arb
//
// Merges NUM_REQS multi-lane cache request ports into one request port and
// steers the single response port back to one of NUM_REQS response ports.
//
// Request path : each input tag is widened by LOG_NUM_REQS bits (the index of
//                the input port, inserted at bit TAG_SEL_IDX) before the
//                requests go through VX_stream_arbiter.
// Response path: the same bit field of the returning tag selects the output of
//                VX_stream_demux, and VX_bits_remove narrows the tag back to
//                TAG_IN_WIDTH.
//
// When NUM_REQS is not greater than 1 the module is pure wiring.
module VX_cache_arb #(
    parameter NUM_REQS      = 1,    // number of request/response port pairs on the input side
    parameter LANES         = 1,    // lanes per request port
    parameter DATA_SIZE     = 1,    // data size in bytes (DATA_WIDTH = 8 * DATA_SIZE)
    parameter TAG_IN_WIDTH  = 1,    // tag width on the input side
    parameter TAG_SEL_IDX   = 0,    // bit position where the port index is inserted in the tag
    parameter BUFFERED_REQ  = 0,    // forwarded to the request arbiter's BUFFERED parameter
    parameter BUFFERED_RSP  = 0,    // forwarded to the response demux's BUFFERED parameter
    parameter TYPE          = "R",  // forwarded to the request arbiter's TYPE parameter

    localparam ADDR_WIDTH    = (32-`CLOG2(DATA_SIZE)),
    localparam DATA_WIDTH    = (8 * DATA_SIZE),
    localparam LOG_NUM_REQS  = `CLOG2(NUM_REQS),
    localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
) (
    input wire clk,
    input wire reset,

    // input requests
    input wire [NUM_REQS-1:0][LANES-1:0]                    req_valid_in,
    input wire [NUM_REQS-1:0][LANES-1:0]                    req_rw_in,
    input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0]     req_byteen_in,
    input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0]    req_addr_in,
    input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0]    req_data_in,
    input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0]  req_tag_in,
    output wire [NUM_REQS-1:0][LANES-1:0]                   req_ready_in,

    // output request
    output wire [LANES-1:0]                     req_valid_out,
    output wire [LANES-1:0]                     req_rw_out,
    output wire [LANES-1:0][DATA_SIZE-1:0]      req_byteen_out,
    output wire [LANES-1:0][ADDR_WIDTH-1:0]     req_addr_out,
    output wire [LANES-1:0][DATA_WIDTH-1:0]     req_data_out,
    output wire [LANES-1:0][TAG_OUT_WIDTH-1:0]  req_tag_out,
    input wire [LANES-1:0]                      req_ready_out,

    // input response
    input wire                              rsp_valid_in,
    input wire [LANES-1:0]                  rsp_tmask_in,
    input wire [LANES-1:0][DATA_WIDTH-1:0]  rsp_data_in,
    input wire [TAG_OUT_WIDTH-1:0]          rsp_tag_in,
    output wire                             rsp_ready_in,

    // output responses
    output wire [NUM_REQS-1:0]                              rsp_valid_out,
    output wire [NUM_REQS-1:0][LANES-1:0]                   rsp_tmask_out,
    output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0]   rsp_data_out,
    output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0]            rsp_tag_out,
    input wire [NUM_REQS-1:0]                               rsp_ready_out
);
    // Bits in one lane's packed request: {tag, addr, rw, byteen, data}.
    localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;

    // Bits in one packed response: {tmask, tag, data}, tmask/data covering all lanes.
    localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;

    if (NUM_REQS <= 1) begin

        // Nothing to arbitrate: connect both sides directly.
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)

        // request side
        assign req_valid_out  = req_valid_in;
        assign req_tag_out    = req_tag_in;
        assign req_addr_out   = req_addr_in;
        assign req_rw_out     = req_rw_in;
        assign req_byteen_out = req_byteen_in;
        assign req_data_out   = req_data_in;
        assign req_ready_in   = req_ready_out;

        // response side
        assign rsp_valid_out  = rsp_valid_in;
        assign rsp_tmask_out  = rsp_tmask_in;
        assign rsp_tag_out    = rsp_tag_in;
        assign rsp_data_out   = rsp_data_in;
        assign rsp_ready_in   = rsp_ready_out;

    end else begin

        // ---------------------------------------------------------------
        // Request path
        // ---------------------------------------------------------------

        wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_pkt_in;
        wire [LANES-1:0][REQ_DATAW-1:0]               req_pkt_out;

        // Pack every lane of every input port, widening the tag with the
        // index of the port it came from.
        for (genvar r = 0; r < NUM_REQS; ++r) begin
            for (genvar l = 0; l < LANES; ++l) begin
                wire [TAG_OUT_WIDTH-1:0] tag_ext;

                VX_bits_insert #(
                    .N   (TAG_IN_WIDTH),
                    .S   (LOG_NUM_REQS),
                    .POS (TAG_SEL_IDX)
                ) bits_insert (
                    .data_in  (req_tag_in[r][l]),
                    .sel_in   (LOG_NUM_REQS'(r)),
                    .data_out (tag_ext)
                );

                assign req_pkt_in[r][l] = {
                    tag_ext,
                    req_addr_in[r][l],
                    req_rw_in[r][l],
                    req_byteen_in[r][l],
                    req_data_in[r][l]
                };
            end
        end

        VX_stream_arbiter #(
            .NUM_REQS (NUM_REQS),
            .LANES    (LANES),
            .DATAW    (REQ_DATAW),
            .BUFFERED (BUFFERED_REQ),
            .TYPE     (TYPE)
        ) req_arb (
            .clk       (clk),
            .reset     (reset),
            .valid_in  (req_valid_in),
            .data_in   (req_pkt_in),
            .ready_in  (req_ready_in),
            .valid_out (req_valid_out),
            .data_out  (req_pkt_out),
            .ready_out (req_ready_out)
        );

        // Unpack the arbitrated request, lane by lane, in packing order.
        for (genvar l = 0; l < LANES; ++l) begin
            assign {
                req_tag_out[l],
                req_addr_out[l],
                req_rw_out[l],
                req_byteen_out[l],
                req_data_out[l]
            } = req_pkt_out[l];
        end

        // ---------------------------------------------------------------
        // Response path
        // ---------------------------------------------------------------

        wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_pkt_out;

        // The tag bits added on the request path name the destination port.
        wire [LOG_NUM_REQS-1:0] rsp_dst;
        assign rsp_dst = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];

        // Response tag narrowed back to the input-side width.
        wire [TAG_IN_WIDTH-1:0] rsp_tag_narrow;

        VX_bits_remove #(
            .N   (TAG_OUT_WIDTH),
            .S   (LOG_NUM_REQS),
            .POS (TAG_SEL_IDX)
        ) bits_remove (
            .data_in  (rsp_tag_in),
            .data_out (rsp_tag_narrow)
        );

        // The whole response (all lanes) travels as one packed word, hence LANES = 1.
        VX_stream_demux #(
            .NUM_REQS (NUM_REQS),
            .LANES    (1),
            .DATAW    (RSP_DATAW),
            .BUFFERED (BUFFERED_RSP)
        ) rsp_demux (
            .clk       (clk),
            .reset     (reset),
            .sel_in    (rsp_dst),
            .valid_in  (rsp_valid_in),
            .data_in   ({rsp_tmask_in, rsp_tag_narrow, rsp_data_in}),
            .ready_in  (rsp_ready_in),
            .valid_out (rsp_valid_out),
            .data_out  (rsp_pkt_out),
            .ready_out (rsp_ready_out)
        );

        // Unpack the response for each output port.
        for (genvar r = 0; r < NUM_REQS; ++r) begin
            assign {rsp_tmask_out[r], rsp_tag_out[r], rsp_data_out[r]} = rsp_pkt_out[r];
        end

    end

endmodule
|
|
@ -78,14 +78,14 @@ module VX_commit #(
|
|||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.gpu_commit_if (gpu_commit_if),
|
||||
.writeback_if (writeback_if)
|
||||
);
|
||||
|
||||
// store and gpu commits don't writeback
|
||||
assign st_commit_if.ready = 1'b1;
|
||||
assign gpu_commit_if.ready = 1'b1;
|
||||
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (alu_commit_if.valid && alu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);
|
||||
|
|
|
@ -230,6 +230,21 @@
|
|||
`define CSR_NW 12'hFC1
|
||||
`define CSR_NC 12'hFC2
|
||||
|
||||
////////// Texture Units //////////////////////////////////////////////////////
|
||||
|
||||
`define NUM_TEX_UNITS 2
|
||||
|
||||
`define CSR_TEX_STATES 7
|
||||
`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES)
|
||||
|
||||
`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00)
|
||||
`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01)
|
||||
`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02)
|
||||
`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03)
|
||||
`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04)
|
||||
`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05)
|
||||
`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06)
|
||||
|
||||
// Pipeline Queues ////////////////////////////////////////////////////////////
|
||||
|
||||
// Size of Instruction Buffer
|
||||
|
|
|
@ -17,6 +17,9 @@ module VX_csr_data #(
|
|||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_to_csr_if.slave fpu_to_csr_if,
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_tex_csr_if.master tex_csr_if,
|
||||
`endif
|
||||
|
||||
input wire read_enable,
|
||||
input wire[`CSR_ADDR_BITS-1:0] read_addr,
|
||||
|
@ -26,7 +29,7 @@ module VX_csr_data #(
|
|||
input wire write_enable,
|
||||
input wire[`CSR_ADDR_BITS-1:0] write_addr,
|
||||
input wire[`NW_BITS-1:0] write_wid,
|
||||
input wire[`CSR_WIDTH-1:0] write_data,
|
||||
input wire[31:0] write_data,
|
||||
|
||||
input wire busy
|
||||
);
|
||||
|
@ -46,13 +49,13 @@ module VX_csr_data #(
|
|||
|
||||
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
always @(posedge clk) begin
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (reset) begin
|
||||
fcsr <= '0;
|
||||
end
|
||||
end
|
||||
if (fpu_to_csr_if.write_enable) begin
|
||||
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
|
||||
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
|
||||
| fpu_to_csr_if.write_fflags;
|
||||
end
|
||||
`endif
|
||||
|
@ -61,27 +64,33 @@ module VX_csr_data #(
|
|||
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
|
||||
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
|
||||
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
|
||||
|
||||
`CSR_SATP: csr_satp <= write_data;
|
||||
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data;
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data;
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data;
|
||||
`CSR_MIE: csr_mie <= write_data;
|
||||
`CSR_MTVEC: csr_mtvec <= write_data;
|
||||
|
||||
`CSR_MEPC: csr_mepc <= write_data;
|
||||
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data;
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data;
|
||||
|
||||
default: begin
|
||||
`ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr));
|
||||
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
default: begin
|
||||
`ASSERT(write_addr >= `CSR_TEX_BEGIN(0)
|
||||
&& write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES),
|
||||
("%t: invalid CSR write address: %0h", $time, write_addr));
|
||||
end
|
||||
endcase
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
`UNUSED_VAR (write_data)
|
||||
|
||||
// TEX CSRs
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
assign tex_csr_if.write_enable = write_enable;
|
||||
assign tex_csr_if.write_addr = write_addr;
|
||||
assign tex_csr_if.write_data = write_data;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
csr_cycle <= 0;
|
||||
|
@ -209,7 +218,8 @@ module VX_csr_data #(
|
|||
|
||||
default: begin
|
||||
if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
|
||||
| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)))) begin
|
||||
|| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)
|
||||
|| (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin
|
||||
read_addr_valid_r = 0;
|
||||
end
|
||||
end
|
||||
|
|
|
@ -20,6 +20,9 @@ module VX_csr_unit #(
|
|||
VX_fpu_to_csr_if.slave fpu_to_csr_if,
|
||||
input wire[`NUM_WARPS-1:0] fpu_pending,
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_tex_csr_if.master tex_csr_if,
|
||||
`endif
|
||||
|
||||
output wire[`NUM_WARPS-1:0] pending,
|
||||
input wire busy
|
||||
|
@ -46,6 +49,9 @@ module VX_csr_unit #(
|
|||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.read_enable (csr_req_if.valid),
|
||||
.read_addr (csr_req_if.addr),
|
||||
|
@ -54,7 +60,7 @@ module VX_csr_unit #(
|
|||
.write_enable (write_enable),
|
||||
.write_addr (csr_addr_s1),
|
||||
.write_wid (csr_commit_if.wid),
|
||||
.write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]),
|
||||
.write_data (csr_updated_data_s1),
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
`include "VX_define.vh"
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
`include "VX_print_instr.vh"
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
`include "VX_trace_instr.vh"
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -42,6 +42,7 @@ module VX_decode #(
|
|||
|
||||
wire [31:0] instr = ifetch_rsp_if.data;
|
||||
wire [6:0] opcode = instr[6:0];
|
||||
wire [1:0] func2 = instr[26:25];
|
||||
wire [2:0] func3 = instr[14:12];
|
||||
wire [6:0] func7 = instr[31:25];
|
||||
wire [11:0] u_12 = instr[31:20];
|
||||
|
@ -193,7 +194,6 @@ module VX_decode #(
|
|||
end
|
||||
`INST_F: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(func3[0]);
|
||||
op_mod = `INST_MOD_BITS'(1);
|
||||
end
|
||||
`INST_SYS : begin
|
||||
|
@ -375,11 +375,21 @@ module VX_decode #(
|
|||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
3'h5: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_TEX);
|
||||
op_mod = `INST_MOD_BITS'(func2);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`USED_IREG (rs3);
|
||||
end
|
||||
`endif
|
||||
3'h6: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_PRED);
|
||||
imm = {{20{u_12[11]}}, u_12};
|
||||
use_rd = 0;
|
||||
op_type = `INST_OP_BITS'(`INST_LSU_LW);
|
||||
op_mod = `INST_MOD_BITS'(2);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
default:;
|
||||
|
@ -389,6 +399,8 @@ module VX_decode #(
|
|||
endcase
|
||||
end
|
||||
|
||||
`UNUSED_VAR (func2)
|
||||
|
||||
// disable write to integer register r0
|
||||
wire wb = use_rd && (| rd_r);
|
||||
|
||||
|
@ -421,13 +433,13 @@ module VX_decode #(
|
|||
|
||||
assign ifetch_rsp_if.ready = decode_if.ready;
|
||||
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC);
|
||||
print_ex_type(decode_if.ex_type);
|
||||
trace_ex_type(decode_if.ex_type);
|
||||
dpi_trace(", op=");
|
||||
print_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
|
||||
trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
|
||||
end
|
||||
end
|
||||
|
|
|
@ -18,6 +18,8 @@
|
|||
|
||||
`define NRI_BITS `LOG2UP(`NUM_IREGS)
|
||||
|
||||
`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS (2 * `NUM_IREGS)
|
||||
`else
|
||||
|
@ -66,6 +68,8 @@
|
|||
|
||||
`define INST_GPU 7'b1101011
|
||||
|
||||
`define INST_TEX 7'b0101011
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_FRM_RNE 3'b000 // round to nearest even
|
||||
|
@ -150,8 +154,8 @@
|
|||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(x) x[2:0]
|
||||
`define INST_LSU_WSIZE(x) x[1:0]
|
||||
`define INST_LSU_IS_FENCE(x) x[0]
|
||||
`define INST_LSU_IS_PREF(x) (x==3'b111)
|
||||
`define INST_LSU_IS_FENCE(x) (3'h1 == x)
|
||||
`define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
|
||||
|
||||
`define INST_FENCE_BITS 1
|
||||
`define INST_FENCE_D 1'h0
|
||||
|
@ -187,6 +191,7 @@
|
|||
`define INST_GPU_JOIN 3'h3
|
||||
`define INST_GPU_BAR 3'h4
|
||||
`define INST_GPU_PRED 3'h5
|
||||
`define INST_GPU_TEX 3'h6
|
||||
`define INST_GPU_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -238,8 +243,11 @@
|
|||
`define DBG_CACHE_REQ_MDATAW 0
|
||||
`endif
|
||||
|
||||
// non-cacheable address bit
|
||||
`define NC_FLAG_BITS 1
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BIT 1
|
||||
|
||||
// texture tag bits
|
||||
`define TEX_TAG_BIT 1
|
||||
|
||||
////////////////////////// Icache Configurable Knobs //////////////////////////
|
||||
|
||||
|
@ -278,12 +286,20 @@
|
|||
// Block size in bytes
|
||||
`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE
|
||||
|
||||
// TAG sharing enable
|
||||
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_FLAG_BITS + `SM_ENABLE)
|
||||
|
||||
// Input request tag bits
|
||||
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS)
|
||||
// Core request tag bits
|
||||
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
|
||||
`define TEX_TAG_ID_BITS (2)
|
||||
`define LSU_TEX_TAG_ID_BITS `MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS)
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TEX_TAG_ID_BITS + `TEX_TAG_BIT)
|
||||
`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS)
|
||||
`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS)
|
||||
`define LSU_TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TEX_TAG_ID_BITS)
|
||||
`else
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_TAG_BIT + `SM_ENABLE)
|
||||
`endif
|
||||
`define DCACHE_CORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCACHE_CORE_TAG_ID_BITS)
|
||||
|
||||
// Memory request data bits
|
||||
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
|
||||
|
@ -300,7 +316,7 @@
|
|||
// Memory request tag bits
|
||||
`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE)
|
||||
`define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH)
|
||||
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_FLAG_BITS), `_DNC_MEM_TAG_WIDTH)
|
||||
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_TAG_BIT), `_DNC_MEM_TAG_WIDTH)
|
||||
|
||||
// Merged D-cache/I-cache memory tag
|
||||
`define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2))
|
||||
|
@ -348,7 +364,7 @@
|
|||
// Memory request tag bits
|
||||
`define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE)
|
||||
`define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH)
|
||||
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_FLAG_BITS), `_L2_NC_MEM_TAG_WIDTH)
|
||||
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_TAG_BIT), `_L2_NC_MEM_TAG_WIDTH)
|
||||
`define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS)))
|
||||
|
||||
////////////////////////// L3cache Configurable Knobs /////////////////////////
|
||||
|
@ -380,7 +396,7 @@
|
|||
// Memory request tag bits
|
||||
`define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE)
|
||||
`define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH)
|
||||
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_FLAG_BITS), `_L3_NC_MEM_TAG_WIDTH)
|
||||
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_TAG_BIT), `_L3_NC_MEM_TAG_WIDTH)
|
||||
`define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? `_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS)))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_instr_demux (
|
||||
module VX_dispatch (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
|
@ -60,7 +60,7 @@ module VX_instr_demux (
|
|||
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
|
||||
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type);
|
||||
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
|
||||
wire lsu_is_prefetch = (~ibuffer_if.wb) && ~(ibuffer_if.op_type[`INST_OP_BITS-1]);
|
||||
wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
|
||||
|
@ -125,18 +125,17 @@ module VX_instr_demux (
|
|||
|
||||
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
|
||||
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
|
||||
wire [31:0] gpu_rs2_data = gpr_rsp_if.rs2_data[tid];
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)),
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) gpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (gpu_req_valid),
|
||||
.ready_in (gpu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpu_rs2_data}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
|
||||
.valid_out (gpu_req_if.valid),
|
||||
.ready_out (gpu_req_if.ready)
|
||||
);
|
|
@ -45,12 +45,108 @@ module VX_execute #(
|
|||
VX_commit_if.master gpu_commit_if,
|
||||
|
||||
input wire busy
|
||||
);
|
||||
);
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_rsp_if();
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_rsp_if();
|
||||
|
||||
VX_tex_csr_if tex_csr_if();
|
||||
|
||||
wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in, lsu_tag_in;
|
||||
wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out, lsu_tag_out;
|
||||
|
||||
`UNUSED_VAR (tex_tag_out)
|
||||
`UNUSED_VAR (lsu_tag_out)
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign tex_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(tex_dcache_req_if.tag[i][`TEX_TAG_ID_BITS-1:0]);
|
||||
assign lsu_tag_in[i][`LSU_TEX_TAG_ID_BITS-1:0] = `LSU_TEX_TAG_ID_BITS'(lsu_dcache_req_if.tag[i][`LSU_TAG_ID_BITS-1:0]);
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign tex_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = tex_dcache_req_if.tag[i][`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS];
|
||||
assign lsu_tag_in[i][`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS] = lsu_dcache_req_if.tag[i][`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS];
|
||||
`endif
|
||||
end
|
||||
|
||||
assign tex_dcache_rsp_if.tag[`TEX_TAG_ID_BITS-1:0] = tex_tag_out[`TEX_TAG_ID_BITS-1:0];
|
||||
assign lsu_dcache_rsp_if.tag[`LSU_TAG_ID_BITS-1:0] = lsu_tag_out[`LSU_TAG_ID_BITS-1:0];
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign tex_dcache_rsp_if.tag[`TEX_DCACHE_TAG_BITS-1:`TEX_TAG_ID_BITS] = tex_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
|
||||
assign lsu_dcache_rsp_if.tag[`LSU_DCACHE_TAG_BITS-1:`LSU_TAG_ID_BITS] = lsu_tag_out[`LSU_TEX_DCACHE_TAG_BITS-1:`LSU_TEX_TAG_ID_BITS];
|
||||
`endif
|
||||
|
||||
VX_cache_arb #(
|
||||
.NUM_REQS (2),
|
||||
.LANES (`NUM_THREADS),
|
||||
.DATA_SIZE (4),
|
||||
.TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS),
|
||||
.TAG_SEL_IDX (`NC_TAG_BIT + `SM_ENABLE)
|
||||
) tex_lsu_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Tex/LSU request
|
||||
.req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}),
|
||||
.req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}),
|
||||
.req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
|
||||
.req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}),
|
||||
.req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}),
|
||||
.req_tag_in ({tex_tag_in, lsu_tag_in}),
|
||||
.req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}),
|
||||
|
||||
// Dcache request
|
||||
.req_valid_out (dcache_req_if.valid),
|
||||
.req_rw_out (dcache_req_if.rw),
|
||||
.req_byteen_out (dcache_req_if.byteen),
|
||||
.req_addr_out (dcache_req_if.addr),
|
||||
.req_data_out (dcache_req_if.data),
|
||||
.req_tag_out (dcache_req_if.tag),
|
||||
.req_ready_out (dcache_req_if.ready),
|
||||
|
||||
// Dcache response
|
||||
.rsp_valid_in (dcache_rsp_if.valid),
|
||||
.rsp_tmask_in (dcache_rsp_if.tmask),
|
||||
.rsp_tag_in (dcache_rsp_if.tag),
|
||||
.rsp_data_in (dcache_rsp_if.data),
|
||||
.rsp_ready_in (dcache_rsp_if.ready),
|
||||
|
||||
// Tex/LSU response
|
||||
.rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
|
||||
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
|
||||
.rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}),
|
||||
.rsp_tag_out ({tex_tag_out, lsu_tag_out}),
|
||||
.rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_to_csr_if fpu_to_csr_if();
|
||||
wire[`NUM_WARPS-1:0] fpu_pending;
|
||||
wire[`NUM_WARPS-1:0] csr_pending;
|
||||
`endif
|
||||
wire [`NUM_WARPS-1:0] csr_pending;
|
||||
wire [`NUM_WARPS-1:0] fpu_pending;
|
||||
VX_fpu_to_csr_if fpu_to_csr_if();
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (alu_reset);
|
||||
`RESET_RELAY (lsu_reset);
|
||||
|
@ -58,7 +154,7 @@ module VX_execute #(
|
|||
`RESET_RELAY (gpu_reset);
|
||||
|
||||
VX_alu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) alu_unit (
|
||||
.clk (clk),
|
||||
.reset (alu_reset),
|
||||
|
@ -68,20 +164,25 @@ module VX_execute #(
|
|||
);
|
||||
|
||||
VX_lsu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) lsu_unit (
|
||||
`SCOPE_BIND_VX_execute_lsu_unit
|
||||
.clk (clk),
|
||||
.reset (lsu_reset),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.dcache_req_if (lsu_dcache_req_if),
|
||||
.dcache_rsp_if (lsu_dcache_rsp_if),
|
||||
`else
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if),
|
||||
`endif
|
||||
.lsu_req_if (lsu_req_if),
|
||||
.ld_commit_if (ld_commit_if),
|
||||
.st_commit_if (st_commit_if)
|
||||
);
|
||||
|
||||
VX_csr_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) csr_unit (
|
||||
.clk (clk),
|
||||
.reset (csr_reset),
|
||||
|
@ -89,7 +190,7 @@ module VX_execute #(
|
|||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
.csr_req_if (csr_req_if),
|
||||
.csr_commit_if (csr_commit_if),
|
||||
|
@ -100,6 +201,9 @@ module VX_execute #(
|
|||
`else
|
||||
`UNUSED_PIN (pending),
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
|
@ -107,7 +211,7 @@ module VX_execute #(
|
|||
`RESET_RELAY (fpu_reset);
|
||||
|
||||
VX_fpu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) fpu_unit (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
@ -120,12 +224,17 @@ module VX_execute #(
|
|||
`endif
|
||||
|
||||
VX_gpu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) gpu_unit (
|
||||
`SCOPE_BIND_VX_execute_gpu_unit
|
||||
.clk (clk),
|
||||
.reset (gpu_reset),
|
||||
.gpu_req_if (gpu_req_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.dcache_req_if (tex_dcache_req_if),
|
||||
.dcache_rsp_if (tex_dcache_rsp_if),
|
||||
`endif
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.gpu_commit_if (gpu_commit_if)
|
||||
);
|
||||
|
@ -137,4 +246,4 @@ module VX_execute #(
|
|||
&& (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK
|
||||
|| `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL);
|
||||
|
||||
endmodule
|
||||
endmodule
|
|
@ -11,6 +11,12 @@ module VX_gpu_unit #(
|
|||
// Inputs
|
||||
VX_gpu_req_if.slave gpu_req_if,
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
VX_tex_csr_if.slave tex_csr_if,
|
||||
`endif
|
||||
|
||||
// Outputs
|
||||
VX_warp_ctl_if.master warp_ctl_if,
|
||||
VX_commit_if.master gpu_commit_if
|
||||
|
@ -18,14 +24,29 @@ module VX_gpu_unit #(
|
|||
import gpu_types::*;
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
|
||||
localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW);
|
||||
|
||||
wire rsp_valid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
|
||||
wire [RSP_DATAW-1:0] rsp_data, rsp_data_r;
|
||||
|
||||
gpu_tmc_t tmc;
|
||||
gpu_wspawn_t wspawn;
|
||||
gpu_barrier_t barrier;
|
||||
gpu_split_t split;
|
||||
|
||||
wire [WCTL_DATAW-1:0] warp_ctl_data;
|
||||
wire is_warp_ctl;
|
||||
|
||||
wire stall_in, stall_out;
|
||||
|
||||
wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN);
|
||||
wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC);
|
||||
wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT);
|
||||
|
@ -33,7 +54,8 @@ module VX_gpu_unit #(
|
|||
wire is_pred = (gpu_req_if.op_type == `INST_GPU_PRED);
|
||||
|
||||
wire [31:0] rs1_data = gpu_req_if.rs1_data[gpu_req_if.tid];
|
||||
|
||||
wire [31:0] rs2_data = gpu_req_if.rs2_data[gpu_req_if.tid];
|
||||
|
||||
wire [`NUM_THREADS-1:0] taken_tmask;
|
||||
wire [`NUM_THREADS-1:0] not_taken_tmask;
|
||||
|
||||
|
@ -52,7 +74,7 @@ module VX_gpu_unit #(
|
|||
|
||||
// wspawn
|
||||
|
||||
wire [31:0] wspawn_pc = gpu_req_if.rs2_data;
|
||||
wire [31:0] wspawn_pc = rs2_data;
|
||||
wire [`NUM_WARPS-1:0] wspawn_wmask;
|
||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||
assign wspawn_wmask[i] = (i < rs1_data);
|
||||
|
@ -73,30 +95,109 @@ module VX_gpu_unit #(
|
|||
|
||||
assign barrier.valid = is_bar;
|
||||
assign barrier.id = rs1_data[`NB_BITS-1:0];
|
||||
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1);
|
||||
assign barrier.size_m1 = (`NW_BITS)'(rs2_data - 1);
|
||||
|
||||
// pack warp ctl result
|
||||
assign warp_ctl_data = {tmc, wspawn, split, barrier};
|
||||
|
||||
// texture
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
|
||||
`UNUSED_VAR (gpu_req_if.op_mod)
|
||||
|
||||
VX_tex_req_if tex_req_if();
|
||||
VX_tex_rsp_if tex_rsp_if();
|
||||
|
||||
wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX);
|
||||
|
||||
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
|
||||
assign tex_req_if.wid = gpu_req_if.wid;
|
||||
assign tex_req_if.tmask = gpu_req_if.tmask;
|
||||
assign tex_req_if.PC = gpu_req_if.PC;
|
||||
assign tex_req_if.rd = gpu_req_if.rd;
|
||||
assign tex_req_if.wb = gpu_req_if.wb;
|
||||
|
||||
assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0];
|
||||
assign tex_req_if.coords[0] = gpu_req_if.rs1_data;
|
||||
assign tex_req_if.coords[1] = gpu_req_if.rs2_data;
|
||||
assign tex_req_if.lod = gpu_req_if.rs3_data;
|
||||
|
||||
VX_tex_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) tex_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.tex_req_if (tex_req_if),
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.tex_rsp_if (tex_rsp_if),
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if)
|
||||
);
|
||||
|
||||
assign tex_rsp_if.ready = !stall_out;
|
||||
|
||||
assign stall_in = (is_tex && ~tex_req_if.ready)
|
||||
|| (~is_tex && (tex_rsp_if.valid || stall_out));
|
||||
|
||||
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
|
||||
|
||||
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
|
||||
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
|
||||
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
|
||||
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
|
||||
assign rsp_rd = tex_rsp_if.rd;
|
||||
assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb;
|
||||
assign rsp_data = tex_rsp_if.valid ? RSP_DATAW'(tex_rsp_if.data) : RSP_DATAW'(warp_ctl_data);
|
||||
|
||||
`else
|
||||
|
||||
`UNUSED_VAR (gpu_req_if.op_mod)
|
||||
`UNUSED_VAR (gpu_req_if.rs3_data)
|
||||
`UNUSED_VAR (gpu_req_if.wb)
|
||||
`UNUSED_VAR (gpu_req_if.rd)
|
||||
|
||||
assign stall_in = stall_out;
|
||||
assign is_warp_ctl = 1;
|
||||
|
||||
assign rsp_valid = gpu_req_if.valid;
|
||||
assign rsp_wid = gpu_req_if.wid;
|
||||
assign rsp_tmask = gpu_req_if.tmask;
|
||||
assign rsp_PC = gpu_req_if.PC;
|
||||
assign rsp_rd = 0;
|
||||
assign rsp_wb = 0;
|
||||
assign rsp_data = RSP_DATAW'(warp_ctl_data);
|
||||
|
||||
`endif
|
||||
|
||||
wire is_warp_ctl_r;
|
||||
|
||||
// output
|
||||
|
||||
wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS),
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall),
|
||||
.data_in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier})
|
||||
);
|
||||
.enable (!stall_out),
|
||||
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
|
||||
);
|
||||
|
||||
assign gpu_commit_if.eop = 1'b1;
|
||||
assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0];
|
||||
assign gpu_commit_if.eop = 1'b1;
|
||||
|
||||
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
// warp control response
|
||||
|
||||
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data_r[WCTL_DATAW-1:0];
|
||||
|
||||
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready && is_warp_ctl_r;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
|
||||
// can accept new request?
|
||||
assign gpu_req_if.ready = ~stall;
|
||||
assign gpu_req_if.ready = ~stall_in;
|
||||
|
||||
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid);
|
||||
|
|
|
@ -88,7 +88,7 @@ module VX_icache_stage #(
|
|||
`SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data);
|
||||
`SCOPE_ASSIGN (icache_rsp_tag, rsp_tag);
|
||||
|
||||
`ifdef DBG_PRINT_CORE_ICACHE
|
||||
`ifdef DBG_TRACE_CORE_ICACHE
|
||||
always @(posedge clk) begin
|
||||
if (icache_req_if.valid && icache_req_if.ready) begin
|
||||
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC);
|
||||
|
|
|
@ -23,56 +23,60 @@ module VX_issue #(
|
|||
`endif
|
||||
VX_gpu_req_if.master gpu_req_if
|
||||
);
|
||||
VX_ibuffer_if ibuffer_if();
|
||||
VX_gpr_rsp_if gpr_rsp_if();
|
||||
|
||||
VX_gpr_req_if gpr_req_if();
|
||||
assign gpr_req_if.wid = ibuffer_if.wid;
|
||||
assign gpr_req_if.rs1 = ibuffer_if.rs1;
|
||||
assign gpr_req_if.rs2 = ibuffer_if.rs2;
|
||||
assign gpr_req_if.rs3 = ibuffer_if.rs3;
|
||||
|
||||
VX_ibuffer_if ibuffer_if();
|
||||
VX_gpr_req_if gpr_req_if();
|
||||
VX_gpr_rsp_if gpr_rsp_if();
|
||||
VX_writeback_if sboard_wb_if();
|
||||
assign sboard_wb_if.valid = writeback_if.valid;
|
||||
assign sboard_wb_if.wid = writeback_if.wid;
|
||||
assign sboard_wb_if.PC = writeback_if.PC;
|
||||
assign sboard_wb_if.rd = writeback_if.rd;
|
||||
assign sboard_wb_if.eop = writeback_if.eop;
|
||||
assign sboard_wb_if.ready = writeback_if.ready;
|
||||
|
||||
VX_ibuffer_if sboard_ib_if();
|
||||
assign sboard_ib_if.valid = ibuffer_if.valid && idmux_ib_if.ready;
|
||||
assign sboard_ib_if.wid = ibuffer_if.wid;
|
||||
assign sboard_ib_if.PC = ibuffer_if.PC;
|
||||
assign sboard_ib_if.wb = ibuffer_if.wb;
|
||||
assign sboard_ib_if.rd = ibuffer_if.rd;
|
||||
assign sboard_ib_if.rd_n = ibuffer_if.rd_n;
|
||||
assign sboard_ib_if.rs1_n = ibuffer_if.rs1_n;
|
||||
assign sboard_ib_if.rs2_n = ibuffer_if.rs2_n;
|
||||
assign sboard_ib_if.rs3_n = ibuffer_if.rs3_n;
|
||||
assign sboard_ib_if.wid_n = ibuffer_if.wid_n;
|
||||
VX_ibuffer_if scoreboard_if();
|
||||
VX_ibuffer_if dispatch_if();
|
||||
|
||||
VX_ibuffer_if idmux_ib_if();
|
||||
assign idmux_ib_if.valid = ibuffer_if.valid && sboard_ib_if.ready;
|
||||
assign idmux_ib_if.wid = ibuffer_if.wid;
|
||||
assign idmux_ib_if.tmask = ibuffer_if.tmask;
|
||||
assign idmux_ib_if.PC = ibuffer_if.PC;
|
||||
assign idmux_ib_if.ex_type = ibuffer_if.ex_type;
|
||||
assign idmux_ib_if.op_type = ibuffer_if.op_type;
|
||||
assign idmux_ib_if.op_mod = ibuffer_if.op_mod;
|
||||
assign idmux_ib_if.wb = ibuffer_if.wb;
|
||||
assign idmux_ib_if.rd = ibuffer_if.rd;
|
||||
assign idmux_ib_if.rs1 = ibuffer_if.rs1;
|
||||
assign idmux_ib_if.imm = ibuffer_if.imm;
|
||||
assign idmux_ib_if.use_PC = ibuffer_if.use_PC;
|
||||
assign idmux_ib_if.use_imm = ibuffer_if.use_imm;
|
||||
// GPR request interface
|
||||
assign gpr_req_if.wid = ibuffer_if.wid;
|
||||
assign gpr_req_if.rs1 = ibuffer_if.rs1;
|
||||
assign gpr_req_if.rs2 = ibuffer_if.rs2;
|
||||
assign gpr_req_if.rs3 = ibuffer_if.rs3;
|
||||
|
||||
// scoreboard writeback interface
|
||||
assign sboard_wb_if.valid = writeback_if.valid;
|
||||
assign sboard_wb_if.wid = writeback_if.wid;
|
||||
assign sboard_wb_if.PC = writeback_if.PC;
|
||||
assign sboard_wb_if.rd = writeback_if.rd;
|
||||
assign sboard_wb_if.eop = writeback_if.eop;
|
||||
|
||||
// scoreboard interface
|
||||
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
|
||||
assign scoreboard_if.wid = ibuffer_if.wid;
|
||||
assign scoreboard_if.PC = ibuffer_if.PC;
|
||||
assign scoreboard_if.wb = ibuffer_if.wb;
|
||||
assign scoreboard_if.rd = ibuffer_if.rd;
|
||||
assign scoreboard_if.rd_n = ibuffer_if.rd_n;
|
||||
assign scoreboard_if.rs1_n = ibuffer_if.rs1_n;
|
||||
assign scoreboard_if.rs2_n = ibuffer_if.rs2_n;
|
||||
assign scoreboard_if.rs3_n = ibuffer_if.rs3_n;
|
||||
assign scoreboard_if.wid_n = ibuffer_if.wid_n;
|
||||
|
||||
// dispatch interface
|
||||
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
|
||||
assign dispatch_if.wid = ibuffer_if.wid;
|
||||
assign dispatch_if.tmask = ibuffer_if.tmask;
|
||||
assign dispatch_if.PC = ibuffer_if.PC;
|
||||
assign dispatch_if.ex_type = ibuffer_if.ex_type;
|
||||
assign dispatch_if.op_type = ibuffer_if.op_type;
|
||||
assign dispatch_if.op_mod = ibuffer_if.op_mod;
|
||||
assign dispatch_if.wb = ibuffer_if.wb;
|
||||
assign dispatch_if.rd = ibuffer_if.rd;
|
||||
assign dispatch_if.rs1 = ibuffer_if.rs1;
|
||||
assign dispatch_if.imm = ibuffer_if.imm;
|
||||
assign dispatch_if.use_PC = ibuffer_if.use_PC;
|
||||
assign dispatch_if.use_imm = ibuffer_if.use_imm;
|
||||
|
||||
// issue the instruction
|
||||
assign ibuffer_if.ready = sboard_ib_if.ready && idmux_ib_if.ready;
|
||||
assign ibuffer_if.ready = scoreboard_if.ready && dispatch_if.ready;
|
||||
|
||||
`RESET_RELAY (ibuf_reset);
|
||||
`RESET_RELAY (scoreboard_reset);
|
||||
`RESET_RELAY (gpr_reset);
|
||||
`RESET_RELAY (demux_reset);
|
||||
`RESET_RELAY (dispatch_reset);
|
||||
|
||||
VX_ibuffer #(
|
||||
.CORE_ID(CORE_ID)
|
||||
|
@ -87,9 +91,9 @@ module VX_issue #(
|
|||
.CORE_ID(CORE_ID)
|
||||
) scoreboard (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.ibuffer_if (sboard_ib_if),
|
||||
.writeback_if(sboard_wb_if)
|
||||
.reset (scoreboard_reset),
|
||||
.writeback_if(sboard_wb_if),
|
||||
.ibuffer_if (scoreboard_if)
|
||||
);
|
||||
|
||||
VX_gpr_stage #(
|
||||
|
@ -102,10 +106,10 @@ module VX_issue #(
|
|||
.gpr_rsp_if (gpr_rsp_if)
|
||||
);
|
||||
|
||||
VX_instr_demux instr_demux (
|
||||
VX_dispatch dispatch (
|
||||
.clk (clk),
|
||||
.reset (demux_reset),
|
||||
.ibuffer_if (idmux_ib_if),
|
||||
.reset (dispatch_reset),
|
||||
.ibuffer_if (dispatch_if),
|
||||
.gpr_rsp_if (gpr_rsp_if),
|
||||
.alu_req_if (alu_req_if),
|
||||
.lsu_req_if (lsu_req_if),
|
||||
|
@ -131,11 +135,11 @@ module VX_issue #(
|
|||
`SCOPE_ASSIGN (issue_imm, ibuffer_if.imm);
|
||||
`SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC);
|
||||
`SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm);
|
||||
`SCOPE_ASSIGN (scoreboard_delay, !sboard_wb_if.ready);
|
||||
`SCOPE_ASSIGN (execute_delay, !idmux_ib_if.ready);
|
||||
`SCOPE_ASSIGN (gpr_rsp_a, gpr_rsp_if.rs1_data);
|
||||
`SCOPE_ASSIGN (gpr_rsp_b, gpr_rsp_if.rs2_data);
|
||||
`SCOPE_ASSIGN (gpr_rsp_c, gpr_rsp_if.rs3_data);
|
||||
`SCOPE_ASSIGN (scoreboard_delay, !scoreboard_if.ready);
|
||||
`SCOPE_ASSIGN (dispatch_delay, !dispatch_if.ready);
|
||||
`SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data);
|
||||
`SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data);
|
||||
`SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data);
|
||||
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
|
||||
`SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask);
|
||||
`SCOPE_ASSIGN (writeback_wid, writeback_if.wid);
|
||||
|
@ -170,7 +174,7 @@ module VX_issue #(
|
|||
if (decode_if.valid & !decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (ibuffer_if.valid & !sboard_wb_if.ready) begin
|
||||
if (scoreboard_if.valid & !scoreboard_if.ready) begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (alu_req_if.valid & !alu_req_if.ready) begin
|
||||
|
@ -204,7 +208,7 @@ module VX_issue #(
|
|||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (alu_req_if.valid && alu_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=",
|
||||
|
@ -246,6 +250,8 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace(", rs2_data=");
|
||||
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(", rs3_data=");
|
||||
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
end
|
||||
end
|
||||
|
|
|
@ -24,7 +24,7 @@ module VX_lsu_unit #(
|
|||
|
||||
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
|
||||
|
||||
localparam ADDR_TYPEW = `NC_FLAG_BITS + `SM_ENABLE;
|
||||
localparam ADDR_TYPEW = `NC_TAG_BIT + `SM_ENABLE;
|
||||
|
||||
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
|
||||
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
|
||||
|
@ -80,6 +80,8 @@ module VX_lsu_unit #(
|
|||
|
||||
wire lsu_valid = lsu_req_if.valid && ~fence_wait;
|
||||
|
||||
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
|
@ -87,8 +89,8 @@ module VX_lsu_unit #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_in),
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb | lsu_req_if.is_prefetch, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
);
|
||||
|
||||
// Can accept new request?
|
||||
|
@ -103,6 +105,7 @@ module VX_lsu_unit #(
|
|||
wire rsp_is_prefetch;
|
||||
|
||||
`UNUSED_VAR (rsp_type)
|
||||
`UNUSED_VAR (rsp_is_prefetch)
|
||||
|
||||
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
|
||||
wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
|
||||
|
@ -132,7 +135,11 @@ module VX_lsu_unit #(
|
|||
|
||||
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
|
||||
|
||||
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS];
|
||||
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS];
|
||||
`UNUSED_VAR (dcache_rsp_if.tag)
|
||||
|
||||
// do not writeback from software prefetch
|
||||
wire req_wb2 = req_wb && ~req_is_prefetch;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
|
||||
|
@ -143,8 +150,8 @@ module VX_lsu_unit #(
|
|||
.write_addr (mbuf_waddr),
|
||||
.acquire_slot (mbuf_push),
|
||||
.read_addr (mbuf_raddr),
|
||||
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.release_addr (mbuf_raddr),
|
||||
.release_slot (mbuf_pop),
|
||||
.full (mbuf_full),
|
||||
|
@ -276,8 +283,6 @@ module VX_lsu_unit #(
|
|||
|
||||
// send load commit
|
||||
|
||||
// ignore responce from software prefetch
|
||||
wire rsp_valid = (rsp_is_prefetch)? 0:(| dcache_rsp_if.valid);
|
||||
wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
|
@ -287,12 +292,12 @@ module VX_lsu_unit #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!load_rsp_stall),
|
||||
.data_in ({rsp_valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
|
||||
.data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
|
||||
.data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
|
||||
);
|
||||
|
||||
// Can accept new cache response?
|
||||
assign dcache_rsp_if.ready = rsp_is_prefetch ? 1 : ~load_rsp_stall;
|
||||
assign dcache_rsp_if.ready = ~load_rsp_stall;
|
||||
|
||||
// scope registration
|
||||
`SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire);
|
||||
|
@ -333,7 +338,7 @@ module VX_lsu_unit #(
|
|||
end
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_CORE_DCACHE
|
||||
`ifdef DBG_TRACE_CORE_DCACHE
|
||||
wire dcache_req_fire_any = (| dcache_req_fire);
|
||||
always @(posedge clk) begin
|
||||
if (lsu_req_if.valid && fence_wait) begin
|
||||
|
@ -349,7 +354,7 @@ module VX_lsu_unit #(
|
|||
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
end else begin
|
||||
dpi_trace("%d: D$%0d Rd Req: req_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
|
||||
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
|
||||
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
|
||||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
|
@ -357,7 +362,7 @@ module VX_lsu_unit #(
|
|||
end
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
dpi_trace("%d: D$%0d Rsp: rsp_is_prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
|
||||
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
|
||||
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
|
||||
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
|
||||
dpi_trace(", is_dup=%b\n", rsp_is_dup);
|
||||
|
|
|
@ -206,6 +206,7 @@ module VX_mem_unit # (
|
|||
.LANES (`NUM_THREADS),
|
||||
.DATA_SIZE (4),
|
||||
.TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (0), // SM flag
|
||||
.TYPE ("P"),
|
||||
.BUFFERED_REQ (2),
|
||||
.BUFFERED_RSP (1)
|
||||
|
|
|
@ -119,9 +119,9 @@
|
|||
|
||||
`define UP(x) (((x) > 0) ? (x) : 1)
|
||||
|
||||
`define RTRIM(x,s) x[$bits(x)-1:($bits(x)-s)]
|
||||
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
|
||||
|
||||
`define LTRIM(x,s) x[s-1:0]
|
||||
`define LTRIM(x, s) x[s-1:0]
|
||||
|
||||
`define TRACE_ARRAY1D(a, m) \
|
||||
dpi_trace("{"); \
|
||||
|
|
|
@ -6,8 +6,8 @@ module VX_scoreboard #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_ibuffer_if.scoreboard ibuffer_if,
|
||||
VX_writeback_if.scoreboard writeback_if
|
||||
VX_ibuffer_if.slave ibuffer_if,
|
||||
VX_writeback_if.slave writeback_if
|
||||
);
|
||||
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
|
||||
|
@ -53,11 +53,12 @@ module VX_scoreboard #(
|
|||
|
||||
reg [31:0] deadlock_ctr;
|
||||
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
deadlock_ctr <= 0;
|
||||
end else begin
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
|
|
|
@ -1,9 +1,9 @@
|
|||
`ifndef VX_PRINT_INSTR
|
||||
`define VX_PRINT_INSTR
|
||||
`ifndef VX_TRACE_INSTR
|
||||
`define VX_TRACE_INSTR
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
task print_ex_type (
|
||||
task trace_ex_type (
|
||||
input [`EX_BITS-1:0] ex_type
|
||||
);
|
||||
case (ex_type)
|
||||
|
@ -16,7 +16,7 @@ task print_ex_type (
|
|||
endcase
|
||||
endtask
|
||||
|
||||
task print_ex_op (
|
||||
task trace_ex_op (
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input [`INST_MOD_BITS-1:0] op_mod
|
||||
|
@ -137,6 +137,7 @@ task print_ex_op (
|
|||
`INST_GPU_JOIN: dpi_trace("JOIN");
|
||||
`INST_GPU_BAR: dpi_trace("BAR");
|
||||
`INST_GPU_PRED: dpi_trace("PRED");
|
||||
`INST_GPU_TEX: dpi_trace("TEX");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end
|
|
@ -71,8 +71,8 @@ module VX_warp_sched #(
|
|||
|
||||
// activate first warp
|
||||
warp_pcs[0] <= `STARTUP_ADDR;
|
||||
active_warps[0] <= '1;
|
||||
thread_masks[0] <= '1;
|
||||
active_warps[0] <= 1;
|
||||
thread_masks[0] <= 1;
|
||||
end else begin
|
||||
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
|
||||
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
|
||||
|
|
|
@ -12,7 +12,8 @@ module VX_writeback #(
|
|||
VX_commit_if.slave csr_commit_if,
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_commit_if.slave fpu_commit_if,
|
||||
`endif
|
||||
`endif
|
||||
VX_commit_if.slave gpu_commit_if,
|
||||
|
||||
// outputs
|
||||
VX_writeback_if.master writeback_if
|
||||
|
@ -22,9 +23,17 @@ module VX_writeback #(
|
|||
|
||||
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
localparam NUM_RSPS = 5;
|
||||
`else
|
||||
localparam NUM_RSPS = 4;
|
||||
`endif
|
||||
`else
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
localparam NUM_RSPS = 4;
|
||||
`else
|
||||
localparam NUM_RSPS = 3;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
wire wb_valid;
|
||||
|
@ -40,22 +49,27 @@ module VX_writeback #(
|
|||
wire [NUM_RSPS-1:0] rsp_ready;
|
||||
wire stall;
|
||||
|
||||
assign rsp_valid = {
|
||||
assign rsp_valid = {
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
gpu_commit_if.valid && gpu_commit_if.wb,
|
||||
`endif
|
||||
csr_commit_if.valid && csr_commit_if.wb,
|
||||
alu_commit_if.valid && alu_commit_if.wb,
|
||||
|
||||
alu_commit_if.valid && alu_commit_if.wb,
|
||||
`ifdef EXT_F_ENABLE
|
||||
fpu_commit_if.valid && fpu_commit_if.wb,
|
||||
`endif
|
||||
ld_commit_if.valid && ld_commit_if.wb
|
||||
};
|
||||
|
||||
assign rsp_data = {
|
||||
assign rsp_data = {
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
|
||||
`endif
|
||||
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
|
||||
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
|
||||
`ifdef EXT_F_ENABLE
|
||||
{fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop},
|
||||
`endif
|
||||
`endif
|
||||
{ ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop}
|
||||
};
|
||||
|
||||
|
@ -82,8 +96,20 @@ module VX_writeback #(
|
|||
`else
|
||||
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
|
||||
`else
|
||||
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
|
||||
`endif
|
||||
`else
|
||||
assign gpu_commit_if.ready = 1;
|
||||
`endif
|
||||
|
||||
|
||||
assign stall = ~writeback_if.ready && writeback_if.valid;
|
||||
|
||||
|
|
|
@ -201,7 +201,7 @@ module Vortex (
|
|||
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
|
||||
`SCOPE_ASSIGN (busy, busy);
|
||||
|
||||
`ifdef DBG_PRINT_MEM
|
||||
`ifdef DBG_TRACE_MEM
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_valid && mem_req_ready) begin
|
||||
if (mem_req_rw)
|
||||
|
|
|
@ -158,7 +158,7 @@ module VX_avs_wrapper #(
|
|||
.ready_out (mem_rsp_ready)
|
||||
);
|
||||
|
||||
`ifdef DBG_PRINT_AVS
|
||||
`ifdef DBG_TRACE_AVS
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_valid && mem_req_ready) begin
|
||||
if (mem_req_rw) begin
|
||||
|
|
|
@ -45,12 +45,14 @@ localparam CCI_DATA_WIDTH = $bits(t_ccip_clData);
|
|||
localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
|
||||
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE);
|
||||
|
||||
|
||||
localparam AVS_RD_QUEUE_SIZE = 4;
|
||||
localparam AVS_REQ_TAGW_VX_ = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
|
||||
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, AVS_REQ_TAGW_VX_);
|
||||
localparam AVS_REQ_TAGW_CCI_ = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH);
|
||||
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, AVS_REQ_TAGW_CCI_);
|
||||
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI);
|
||||
localparam _VX_MEM_TAG_WIDTH = `VX_MEM_TAG_WIDTH;
|
||||
localparam _AVS_REQ_TAGW_VX = _VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
|
||||
localparam _AVS_REQ_TAGW_VX2 = `MAX(_VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX);
|
||||
localparam _AVS_REQ_TAGW_CCI = CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH);
|
||||
localparam _AVS_REQ_TAGW_CCI2 = `MAX(CCI_ADDR_WIDTH, _AVS_REQ_TAGW_CCI);
|
||||
localparam AVS_REQ_TAGW = `MAX(_AVS_REQ_TAGW_VX2, _AVS_REQ_TAGW_CCI2);
|
||||
|
||||
localparam CCI_RD_WINDOW_SIZE = 8;
|
||||
localparam CCI_RW_PENDING_SIZE= 256;
|
||||
|
@ -185,36 +187,36 @@ always @(posedge clk) begin
|
|||
case (mmio_hdr.address)
|
||||
MMIO_IO_ADDR: begin
|
||||
cmd_io_addr <= t_ccip_clAddr'(cp2af_sRxPort.c0.data);
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: MMIO_IO_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, t_ccip_clAddr'(cp2af_sRxPort.c0.data));
|
||||
`endif
|
||||
end
|
||||
MMIO_MEM_ADDR: begin
|
||||
cmd_mem_addr <= $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data);
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: MMIO_MEM_ADDR: addr=%0h, data=0x%0h\n", $time, mmio_hdr.address, $bits(cmd_mem_addr)'(cp2af_sRxPort.c0.data));
|
||||
`endif
|
||||
end
|
||||
MMIO_DATA_SIZE: begin
|
||||
cmd_data_size <= $bits(cmd_data_size)'(cp2af_sRxPort.c0.data);
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: MMIO_DATA_SIZE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
|
||||
`endif
|
||||
end
|
||||
MMIO_CMD_TYPE: begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: MMIO_CMD_TYPE: addr=%0h, data=%0d\n", $time, mmio_hdr.address, $bits(cmd_type)'(cp2af_sRxPort.c0.data));
|
||||
`endif
|
||||
end
|
||||
`ifdef SCOPE
|
||||
MMIO_SCOPE_WRITE: begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: MMIO_SCOPE_WRITE: addr=%0h, data=%0h\n", $time, mmio_hdr.address, 64'(cp2af_sRxPort.c0.data));
|
||||
`endif
|
||||
end
|
||||
`endif
|
||||
default: begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: Unknown MMIO Wr: addr=%0h, data=%0h\n", $time, mmio_hdr.address, $bits(cmd_data_size)'(cp2af_sRxPort.c0.data));
|
||||
`endif
|
||||
end
|
||||
|
@ -241,7 +243,7 @@ always @(posedge clk) begin
|
|||
16'h0008: mmio_tx.data <= 64'h0; // reserved
|
||||
MMIO_STATUS: begin
|
||||
mmio_tx.data <= 64'({cout_q_dout, !cout_q_empty, 8'(state)});
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
if (state != STATE_WIDTH'(mmio_tx.data)) begin
|
||||
dpi_trace("%d: MMIO_STATUS: addr=%0h, state=%0d\n", $time, mmio_hdr.address, state);
|
||||
end
|
||||
|
@ -250,20 +252,20 @@ always @(posedge clk) begin
|
|||
`ifdef SCOPE
|
||||
MMIO_SCOPE_READ: begin
|
||||
mmio_tx.data <= cmd_scope_rdata;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: MMIO_SCOPE_READ: addr=%0h, data=%0h\n", $time, mmio_hdr.address, cmd_scope_rdata);
|
||||
`endif
|
||||
end
|
||||
`endif
|
||||
MMIO_DEV_CAPS: begin
|
||||
mmio_tx.data <= dev_caps;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: MMIO_DEV_CAPS: addr=%0h, data=%0h\n", $time, mmio_hdr.address, dev_caps);
|
||||
`endif
|
||||
end
|
||||
default: begin
|
||||
mmio_tx.data <= 64'h0;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: Unknown MMIO Rd: addr=%0h\n", $time, mmio_hdr.address);
|
||||
`endif
|
||||
end
|
||||
|
@ -297,19 +299,19 @@ always @(posedge clk) begin
|
|||
STATE_IDLE: begin
|
||||
case (cmd_type)
|
||||
CMD_MEM_READ: begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: STATE READ: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
|
||||
`endif
|
||||
state <= STATE_READ;
|
||||
end
|
||||
CMD_MEM_WRITE: begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: STATE WRITE: ia=%0h addr=%0h size=%0d\n", $time, cmd_io_addr, cmd_mem_addr, cmd_data_size);
|
||||
`endif
|
||||
state <= STATE_WRITE;
|
||||
end
|
||||
CMD_RUN: begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: STATE START\n", $time);
|
||||
`endif
|
||||
vx_reset <= 1;
|
||||
|
@ -324,7 +326,7 @@ always @(posedge clk) begin
|
|||
STATE_READ: begin
|
||||
if (cmd_read_done) begin
|
||||
state <= STATE_IDLE;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: STATE IDLE\n", $time);
|
||||
`endif
|
||||
end
|
||||
|
@ -333,7 +335,7 @@ always @(posedge clk) begin
|
|||
STATE_WRITE: begin
|
||||
if (cmd_write_done) begin
|
||||
state <= STATE_IDLE;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: STATE IDLE\n", $time);
|
||||
`endif
|
||||
end
|
||||
|
@ -345,7 +347,7 @@ always @(posedge clk) begin
|
|||
if (cmd_run_done) begin
|
||||
vx_started <= 0;
|
||||
state <= STATE_IDLE;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: STATE IDLE\n", $time);
|
||||
`endif
|
||||
end
|
||||
|
@ -699,7 +701,7 @@ always @(posedge clk) begin
|
|||
if (cci_rd_req_fire) begin
|
||||
cci_rd_req_addr <= cci_rd_req_addr + 1;
|
||||
cci_rd_req_ctr <= cci_rd_req_ctr + 1;
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: CCI Rd Req: addr=%0h, tag=%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads);
|
||||
`endif
|
||||
end
|
||||
|
@ -709,13 +711,13 @@ always @(posedge clk) begin
|
|||
if (CCI_RD_QUEUE_TAGW'(cci_rd_rsp_ctr) == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
|
||||
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
|
||||
end
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h\n", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (cci_rdq_pop) begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: CCI Rd Queue Pop: pending=%0d\n", $time, cci_pending_reads);
|
||||
`endif
|
||||
end
|
||||
|
@ -856,13 +858,13 @@ begin
|
|||
if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin
|
||||
cci_wr_req_done <= 1;
|
||||
end
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h\n", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
|
||||
`endif
|
||||
end
|
||||
|
||||
if (cci_wr_rsp_fire) begin
|
||||
`ifdef DBG_PRINT_OPAE
|
||||
`ifdef DBG_TRACE_OPAE
|
||||
dpi_trace("%d: CCI Wr Rsp: pending=%0d\n", $time, cci_pending_writes);
|
||||
`endif
|
||||
end
|
||||
|
|
2
hw/rtl/cache/VX_bank.sv
vendored
|
@ -509,7 +509,7 @@ module VX_bank #(
|
|||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_BANK
|
||||
`ifdef DBG_TRACE_CACHE_BANK
|
||||
wire crsq_fire = crsq_valid && crsq_ready;
|
||||
wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid)
|
||||
&& ~(mshr_fire || mem_rsp_fire || creq_fire);
|
||||
|
|
2
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -53,7 +53,7 @@
|
|||
|
||||
`define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS]
|
||||
|
||||
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1:(CORE_TAG_WIDTH-`NW_BITS-32)
|
||||
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1 : (CORE_TAG_WIDTH-`DBG_CACHE_REQ_MDATAW)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
2
hw/rtl/cache/VX_data_access.sv
vendored
|
@ -119,7 +119,7 @@ module VX_data_access #(
|
|||
|
||||
`UNUSED_VAR (stall)
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_DATA
|
||||
`ifdef DBG_TRACE_CACHE_DATA
|
||||
always @(posedge clk) begin
|
||||
if (fill && ~stall) begin
|
||||
dpi_trace("%d: cache%0d:%0d data-fill: addr=%0h, blk_addr=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, fill_data);
|
||||
|
|
2
hw/rtl/cache/VX_miss_resrv.sv
vendored
|
@ -202,7 +202,7 @@ module VX_miss_resrv #(
|
|||
|
||||
`UNUSED_VAR (lookup_valid)
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_MSHR
|
||||
`ifdef DBG_TRACE_CACHE_MSHR
|
||||
always @(posedge clk) begin
|
||||
if (allocate_fire || fill_valid || dequeue_fire || lookup_replay || lookup_valid || release_valid) begin
|
||||
if (allocate_fire)
|
||||
|
|
4
hw/rtl/cache/VX_shared_mem.sv
vendored
|
@ -229,7 +229,7 @@ module VX_shared_mem #(
|
|||
core_rsp_data_in = 'x;
|
||||
bank_rsp_sel_n = bank_rsp_sel_r;
|
||||
for (integer i = 0; i < NUM_BANKS; i++) begin
|
||||
if (per_bank_core_req_valid[i]
|
||||
if (core_req_read_mask[i]
|
||||
&& (core_rsp_tag_in[CORE_TAG_ID_BITS-1:0] == per_bank_core_req_tag[i][CORE_TAG_ID_BITS-1:0])) begin
|
||||
core_rsp_valids_in[per_bank_core_req_tid[i]] = 1;
|
||||
core_rsp_data_in[per_bank_core_req_tid[i]] = per_bank_core_rsp_data[i];
|
||||
|
@ -271,7 +271,7 @@ module VX_shared_mem #(
|
|||
end
|
||||
`endif
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_BANK
|
||||
`ifdef DBG_TRACE_CACHE_BANK
|
||||
|
||||
reg is_multi_tag_req;
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
|
|
2
hw/rtl/cache/VX_tag_access.sv
vendored
|
@ -61,7 +61,7 @@ module VX_tag_access #(
|
|||
|
||||
`UNUSED_VAR (stall)
|
||||
|
||||
`ifdef DBG_PRINT_CACHE_TAG
|
||||
`ifdef DBG_TRACE_CACHE_TAG
|
||||
always @(posedge clk) begin
|
||||
if (fill && ~stall) begin
|
||||
dpi_trace("%d: cache%0d:%0d tag-fill: addr=%0h, blk_addr=%0d, tag_id=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr, BANK_ID), line_addr, line_tag);
|
||||
|
|
|
@ -3,8 +3,7 @@
|
|||
`include "defs_div_sqrt_mvp.sv"
|
||||
|
||||
`TRACING_OFF
|
||||
module VX_fpu_fpnew
|
||||
#(
|
||||
module VX_fpu_fpnew #(
|
||||
parameter TAGW = 1,
|
||||
parameter FMULADD = 1,
|
||||
parameter FDIVSQRT = 1,
|
||||
|
|
|
@ -12,9 +12,11 @@ interface VX_gpu_req_if();
|
|||
wire [31:0] PC;
|
||||
wire [31:0] next_PC;
|
||||
wire [`INST_GPU_BITS-1:0] op_type;
|
||||
wire [`INST_MOD_BITS-1:0] op_mod;
|
||||
wire [`NT_BITS-1:0] tid;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs1_data;
|
||||
wire [31:0] rs2_data;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs2_data;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs3_data;
|
||||
wire [`NR_BITS-1:0] rd;
|
||||
wire wb;
|
||||
|
||||
|
@ -27,9 +29,11 @@ interface VX_gpu_req_if();
|
|||
output PC,
|
||||
output next_PC,
|
||||
output op_type,
|
||||
output op_mod,
|
||||
output tid,
|
||||
output rs1_data,
|
||||
output rs2_data,
|
||||
output rs3_data,
|
||||
output rd,
|
||||
output wb,
|
||||
input ready
|
||||
|
@ -42,9 +46,11 @@ interface VX_gpu_req_if();
|
|||
input PC,
|
||||
input next_PC,
|
||||
input op_type,
|
||||
input op_mod,
|
||||
input tid,
|
||||
input rs1_data,
|
||||
input rs2_data,
|
||||
input rs3_data,
|
||||
input rd,
|
||||
input wb,
|
||||
output ready
|
||||
|
|
|
@ -76,20 +76,6 @@ interface VX_ibuffer_if ();
|
|||
input wid_n,
|
||||
output ready
|
||||
);
|
||||
|
||||
modport scoreboard (
|
||||
input valid,
|
||||
input wid,
|
||||
input PC,
|
||||
input wb,
|
||||
input rd,
|
||||
input rd_n,
|
||||
input rs1_n,
|
||||
input rs2_n,
|
||||
input rs3_n,
|
||||
input wid_n,
|
||||
output ready
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
||||
|
|
26
hw/rtl/interfaces/VX_tex_csr_if.sv
Normal file
|
@ -0,0 +1,26 @@
|
|||
`ifndef VX_TEX_CSR_IF
|
||||
`define VX_TEX_CSR_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Texture CSR write interface.
// Bundles a write strobe with an address and a 32-bit data word. The
// `master` modport drives all three signals and the `slave` modport only
// observes them; there is no ready/back-pressure signal in this interface.
interface VX_tex_csr_if ();
|
||||
|
||||
// Write strobe (single bit), driven by the master.
wire write_enable;
|
||||
// Write address, `CSR_ADDR_BITS wide.
wire [`CSR_ADDR_BITS-1:0] write_addr;
|
||||
// 32-bit write data.
wire [31:0] write_data;
|
||||
|
||||
// Producer side: drives every signal of the interface.
modport master (
|
||||
output write_enable,
|
||||
output write_addr,
|
||||
output write_data
|
||||
);
|
||||
|
||||
// Consumer side: every signal is an input.
modport slave (
|
||||
input write_enable,
|
||||
input write_addr,
|
||||
input write_data
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
51
hw/rtl/interfaces/VX_tex_req_if.sv
Normal file
|
@ -0,0 +1,51 @@
|
|||
`ifndef VX_TEX_REQ_IF
|
||||
`define VX_TEX_REQ_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Texture request interface.
// Carries one request per transfer: instruction bookkeeping fields
// (wid, tmask, PC, rd, wb) plus the texture operands (unit, coords, lod).
// The `master` modport drives everything except `ready`, which is driven
// by the `slave` modport.
// NOTE(review): valid/ready presumably form the usual handshake where a
// transfer happens when both are high -- confirm against the users of
// this interface.
interface VX_tex_req_if ();
|
||||
|
||||
// Request qualifier, driven by the master.
wire valid;
|
||||
// Warp identifier, `NW_BITS wide.
wire [`NW_BITS-1:0] wid;
|
||||
// One bit per thread (`NUM_THREADS bits); presumably the active-thread mask.
wire [`NUM_THREADS-1:0] tmask;
|
||||
// 32-bit program counter of the requesting instruction.
wire [31:0] PC;
|
||||
// Register index, `NR_BITS wide; presumably the destination register.
wire [`NR_BITS-1:0] rd;
|
||||
// Single-bit flag; presumably "write the result back to rd".
wire wb;
|
||||
|
||||
// Texture unit selector, `NTEX_BITS wide.
wire [`NTEX_BITS-1:0] unit;
|
||||
// Two coordinate arrays, each holding one 32-bit value per thread.
// NOTE(review): presumably index 0 is u and index 1 is v -- confirm.
wire [1:0][`NUM_THREADS-1:0][31:0] coords;
|
||||
// One 32-bit level-of-detail value per thread.
wire [`NUM_THREADS-1:0][31:0] lod;
|
||||
|
||||
// Back-pressure signal, driven by the slave.
wire ready;
|
||||
|
||||
// Requester side: drives the request fields, samples `ready`.
modport master (
|
||||
output valid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
output rd,
|
||||
output wb,
|
||||
output unit,
|
||||
output coords,
|
||||
output lod,
|
||||
input ready
|
||||
);
|
||||
|
||||
// Receiver side: samples the request fields, drives `ready`.
modport slave (
|
||||
input valid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
input rd,
|
||||
input wb,
|
||||
input unit,
|
||||
input coords,
|
||||
input lod,
|
||||
output ready
|
||||
);
|
||||
|
||||
endinterface
|
||||
`endif
|
||||
|
||||
|
||||
|
43
hw/rtl/interfaces/VX_tex_rsp_if.sv
Normal file
|
@ -0,0 +1,43 @@
|
|||
`ifndef VX_TEX_RSP_IF
|
||||
`define VX_TEX_RSP_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Texture response interface.
// Returns one 32-bit data word per thread together with the bookkeeping
// fields (wid, tmask, PC, rd, wb). The `master` modport drives everything
// except `ready`, which is driven by the `slave` modport.
// NOTE(review): valid/ready presumably form the usual handshake where a
// transfer happens when both are high -- confirm against the users of
// this interface.
interface VX_tex_rsp_if ();
|
||||
|
||||
// Response qualifier, driven by the master.
wire valid;
|
||||
// Warp identifier, `NW_BITS wide.
wire [`NW_BITS-1:0] wid;
|
||||
// One bit per thread (`NUM_THREADS bits); presumably the active-thread mask.
wire [`NUM_THREADS-1:0] tmask;
|
||||
// 32-bit program counter of the originating instruction.
wire [31:0] PC;
|
||||
// Register index, `NR_BITS wide; presumably the destination register.
wire [`NR_BITS-1:0] rd;
|
||||
// Single-bit flag; presumably "write the result back to rd".
wire wb;
|
||||
// Response payload: one 32-bit word per thread.
wire [`NUM_THREADS-1:0][31:0] data;
|
||||
// Back-pressure signal, driven by the slave.
wire ready;
|
||||
|
||||
// Producer side: drives the response fields, samples `ready`.
modport master (
|
||||
output valid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
output rd,
|
||||
output wb,
|
||||
output data,
|
||||
input ready
|
||||
);
|
||||
|
||||
// Consumer side: samples the response fields, drives `ready`.
modport slave (
|
||||
input valid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
input rd,
|
||||
input wb,
|
||||
input data,
|
||||
output ready
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
||||
|
||||
|
|
@ -36,15 +36,6 @@ interface VX_writeback_if ();
|
|||
output ready
|
||||
);
|
||||
|
||||
modport scoreboard (
|
||||
input valid,
|
||||
input wid,
|
||||
input PC,
|
||||
input rd,
|
||||
input eop,
|
||||
output ready
|
||||
);
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
||||
|
|
|
@ -93,13 +93,13 @@ module VX_scope #(
|
|||
CMD_SET_START: begin
|
||||
delay_val <= $bits(delay_val)'(cmd_data);
|
||||
cmd_start <= 1;
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
`ifdef DBG_TRACE_SCOPE
|
||||
dpi_trace("%d: *** scope: CMD_SET_START: delay_val=%0d\n", $time, $bits(delay_val)'(cmd_data));
|
||||
`endif
|
||||
end
|
||||
CMD_SET_STOP: begin
|
||||
waddr_end <= $bits(waddr)'(cmd_data);
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
`ifdef DBG_TRACE_SCOPE
|
||||
dpi_trace("%d: *** scope: CMD_SET_STOP: waddr_end=%0d\n", $time, $bits(waddr)'(cmd_data));
|
||||
`endif
|
||||
end
|
||||
|
@ -116,7 +116,7 @@ module VX_scope #(
|
|||
delta <= 0;
|
||||
delay_cntr <= 0;
|
||||
start_time <= timestamp;
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
`ifdef DBG_TRACE_SCOPE
|
||||
dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp);
|
||||
`endif
|
||||
end else begin
|
||||
|
@ -132,7 +132,7 @@ module VX_scope #(
|
|||
recording <= 1;
|
||||
delta <= 0;
|
||||
start_time <= timestamp;
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
`ifdef DBG_TRACE_SCOPE
|
||||
dpi_trace("%d: *** scope: recording start - start_time=%0d\n", $time, timestamp);
|
||||
`endif
|
||||
end
|
||||
|
@ -161,7 +161,7 @@ module VX_scope #(
|
|||
|
||||
if (stop
|
||||
|| (waddr >= waddr_end)) begin
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
`ifdef DBG_TRACE_SCOPE
|
||||
dpi_trace("%d: *** scope: recording stop - waddr=(%0d, %0d)\n", $time, waddr, waddr_end);
|
||||
`endif
|
||||
waddr <= waddr; // keep last address
|
||||
|
@ -229,7 +229,7 @@ module VX_scope #(
|
|||
|
||||
assign bus_out = bus_out_r;
|
||||
|
||||
`ifdef DBG_PRINT_SCOPE
|
||||
`ifdef DBG_TRACE_SCOPE
|
||||
always @(posedge clk) begin
|
||||
if (bus_read) begin
|
||||
dpi_trace("%d: scope-read: cmd=%0d, addr=%0d, value=%0h\n", $time, get_cmd, raddr, bus_out);
|
||||
|
|
178
hw/rtl/tex_unit/VX_tex_addr.sv
Normal file
|
@ -0,0 +1,178 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
module VX_tex_addr #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter REQ_INFOW = 1,
|
||||
parameter NUM_REQS = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// inputs
|
||||
|
||||
input wire req_valid,
|
||||
input wire [NUM_REQS-1:0] req_tmask,
|
||||
input wire [1:0][NUM_REQS-1:0][31:0] req_coords,
|
||||
input wire [`TEX_FORMAT_BITS-1:0] req_format,
|
||||
input wire [`TEX_FILTER_BITS-1:0] req_filter,
|
||||
input wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps,
|
||||
input wire [`TEX_ADDR_BITS-1:0] req_baseaddr,
|
||||
input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0] req_mipoff,
|
||||
input wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] req_logdims,
|
||||
input wire [REQ_INFOW-1:0] req_info,
|
||||
output wire req_ready,
|
||||
|
||||
// outputs
|
||||
|
||||
output wire rsp_valid,
|
||||
output wire [NUM_REQS-1:0] rsp_tmask,
|
||||
output wire [`TEX_FILTER_BITS-1:0] rsp_filter,
|
||||
output wire [`TEX_STRIDE_BITS-1:0] rsp_stride,
|
||||
output wire [NUM_REQS-1:0][3:0][31:0] rsp_addr,
|
||||
output wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends,
|
||||
output wire [REQ_INFOW-1:0] rsp_info,
|
||||
input wire rsp_ready
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam PITCH_BITS = `MAX(`TEX_DIM_BITS, `TEX_STRIDE_BITS) + 1;
|
||||
localparam SCALED_U_W = `FIXED_INT + `TEX_STRIDE_BITS;
|
||||
localparam SCALED_X_W = (2 * `FIXED_INT);
|
||||
localparam SCALED_V_W = SCALED_X_W + `TEX_STRIDE_BITS;
|
||||
|
||||
wire valid_s0;
|
||||
wire [NUM_REQS-1:0] tmask_s0;
|
||||
wire [`TEX_FILTER_BITS-1:0] filter_s0;
|
||||
wire [REQ_INFOW-1:0] req_info_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0;
|
||||
wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0;
|
||||
wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0;
|
||||
wire [NUM_REQS-1:0][1:0][`TEX_DIM_BITS-1:0] log_dims_s0;
|
||||
wire [NUM_REQS-1:0][PITCH_BITS-1:0] log_pitch, log_pitch_s0;
|
||||
|
||||
wire stall_out;
|
||||
|
||||
// stride
|
||||
|
||||
VX_tex_stride #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tex_stride (
|
||||
.format (req_format),
|
||||
.log_stride (log_stride)
|
||||
);
|
||||
|
||||
// addressing mode
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 2; ++j) begin
|
||||
wire [`FIXED_FRAC-1:0] delta = (`FIXED_HALF >> req_logdims[i][j]);
|
||||
wire [31:0] coord_lo = req_filter ? (req_coords[j][i] - 32'(delta)) : req_coords[j][i];
|
||||
wire [31:0] coord_hi = req_filter ? (req_coords[j][i] + 32'(delta)) : req_coords[j][i];
|
||||
|
||||
VX_tex_wrap #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tex_wrap_lo (
|
||||
.wrap_i (req_wraps[j]),
|
||||
.coord_i (coord_lo),
|
||||
.coord_o (clamped_lo[i][j])
|
||||
);
|
||||
|
||||
VX_tex_wrap #(
|
||||
.CORE_ID (CORE_ID)
|
||||
) tex_wrap_hi (
|
||||
.wrap_i (req_wraps[j]),
|
||||
.coord_i (coord_hi),
|
||||
.coord_o (clamped_hi[i][j])
|
||||
);
|
||||
end
|
||||
assign log_pitch[i] = PITCH_BITS'(req_logdims[i][0]) + PITCH_BITS'(log_stride);
|
||||
assign mip_addr[i] = req_baseaddr + 32'(req_mipoff[i]);
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFOW + NUM_REQS * (PITCH_BITS + 2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, log_pitch, req_logdims, mip_addr, clamped_lo, clamped_hi}),
|
||||
.data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_pitch_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
|
||||
);
|
||||
|
||||
// addresses generation
|
||||
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_lo;
|
||||
wire [NUM_REQS-1:0][1:0][`FIXED_INT-1:0] scaled_hi;
|
||||
wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] blends;
|
||||
wire [NUM_REQS-1:0][3:0][31:0] addr;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 2; ++j) begin
|
||||
assign scaled_lo[i][j] = scale_to_dim(clamped_lo_s0[i][j], log_dims_s0[i][j]);
|
||||
assign scaled_hi[i][j] = scale_to_dim(clamped_hi_s0[i][j], log_dims_s0[i][j]);
|
||||
assign blends[i][j] = filter_s0 ? clamped_lo_s0[i][j][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0);
|
||||
end
|
||||
end
|
||||
|
||||
`UNUSED_VAR (log_pitch_s0)
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
wire [SCALED_U_W-1:0] offset_u_lo = SCALED_U_W'(scaled_lo[i][0]) << log_stride_s0;
|
||||
wire [SCALED_U_W-1:0] offset_u_hi = SCALED_U_W'(scaled_hi[i][0]) << log_stride_s0;
|
||||
|
||||
wire [SCALED_V_W-1:0] offset_v_lo = SCALED_V_W'(scaled_lo[i][1]) << log_pitch_s0[i];
|
||||
wire [SCALED_V_W-1:0] offset_v_hi = SCALED_V_W'(scaled_hi[i][1]) << log_pitch_s0[i];
|
||||
|
||||
wire [31:0] base_addr_lo = mip_addr_s0[i] + 32'(offset_v_lo);
|
||||
wire [31:0] base_addr_hi = mip_addr_s0[i] + 32'(offset_v_hi);
|
||||
|
||||
assign addr[i][0] = base_addr_lo + 32'(offset_u_lo);
|
||||
assign addr[i][1] = base_addr_lo + 32'(offset_u_hi);
|
||||
assign addr[i][2] = base_addr_hi + 32'(offset_u_lo);
|
||||
assign addr[i][3] = base_addr_hi + 32'(offset_u_hi);
|
||||
end
|
||||
|
||||
assign stall_out = rsp_valid && ~rsp_ready;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFOW),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({valid_s0, tmask_s0, filter_s0, log_stride_s0, addr, blends, req_info_s0}),
|
||||
.data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride, rsp_addr, rsp_blends, rsp_info})
|
||||
);
|
||||
|
||||
assign req_ready = ~stall_out;
|
||||
|
||||
`ifdef DBG_TRACE_TEX
    // Debug-only trace of every address-generation result that is accepted
    // downstream (rsp_valid && rsp_ready).
    // The low `NW_BITS+32 bits of rsp_info are unpacked as {warp id, PC}.
    wire [`NW_BITS-1:0] rsp_wid;
    wire [31:0]         rsp_PC;

    assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];

    always @(posedge clk) begin
        if (rsp_valid && rsp_ready) begin
            // label fixed: "tride=" -> "stride=" (prints rsp_stride)
            dpi_trace("%d: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, stride=%0d, addr=",
                      $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride);
            `TRACE_ARRAY2D(rsp_addr, 4, NUM_REQS);
            dpi_trace("\n");
        end
    end
`endif
|
||||
|
||||
function logic [`FIXED_INT-1:0] scale_to_dim (input logic [`FIXED_FRAC-1:0] src,
|
||||
input logic [`TEX_DIM_BITS-1:0] dim);
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
logic [`FIXED_BITS-1:0] out;
|
||||
`IGNORE_WARNINGS_END
|
||||
out = `FIXED_BITS'(src) << dim;
|
||||
return out[`FIXED_FRAC +: `FIXED_INT];
|
||||
endfunction
|
||||
|
||||
endmodule
|
39
hw/rtl/tex_unit/VX_tex_define.vh
Normal file
|
@ -0,0 +1,39 @@
|
|||
`ifndef VX_TEX_DEFINE
`define VX_TEX_DEFINE

`include "VX_define.vh"

// ---------------------------------------------------------------------------
// Fixed-point layout: FIXED_BITS total bits, FIXED_FRAC of them fractional.
// ---------------------------------------------------------------------------
`define FIXED_BITS          32
`define FIXED_FRAC          20
`define FIXED_INT           (`FIXED_BITS - `FIXED_FRAC)
`define FIXED_ONE           (2 ** `FIXED_FRAC)
`define FIXED_HALF          (`FIXED_ONE >> 1)
`define FIXED_MASK          (`FIXED_ONE - 1)

// ---------------------------------------------------------------------------
// Field widths of the texture state / request attributes.
// ---------------------------------------------------------------------------
`define TEX_ADDR_BITS       32
`define TEX_FORMAT_BITS     3
`define TEX_WRAP_BITS       2
`define TEX_DIM_BITS        4
`define TEX_FILTER_BITS     1

`define TEX_MIPOFF_BITS     (2*12+1)
`define TEX_STRIDE_BITS     2

`define TEX_LOD_BITS        4
`define TEX_MIP_BITS        (`NTEX_BITS + `TEX_LOD_BITS)

// ---------------------------------------------------------------------------
// Wrap (addressing) mode encodings.
// ---------------------------------------------------------------------------
`define TEX_WRAP_CLAMP      0
`define TEX_WRAP_REPEAT     1
`define TEX_WRAP_MIRROR     2

// ---------------------------------------------------------------------------
// Blend factor precision used by the bilinear interpolators.
// ---------------------------------------------------------------------------
`define BLEND_FRAC          8
`define BLEND_ONE           (2 ** `BLEND_FRAC)

// ---------------------------------------------------------------------------
// Texel format encodings (TEX_FORMAT_BITS wide).
// ---------------------------------------------------------------------------
`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0)
`define TEX_FORMAT_R5G6B5   `TEX_FORMAT_BITS'(1)
`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2)
`define TEX_FORMAT_L8A8     `TEX_FORMAT_BITS'(3)
`define TEX_FORMAT_L8       `TEX_FORMAT_BITS'(4)
`define TEX_FORMAT_A8       `TEX_FORMAT_BITS'(5)

`endif
|
58
hw/rtl/tex_unit/VX_tex_format.sv
Normal file
|
@ -0,0 +1,58 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Expands a raw texel of the given format into four 8-bit channels packed
// into a 32-bit word (byte 0 = [7:0] ... byte 3 = [31:24]).
//   format    : texel format selector (`TEX_FORMAT_*)
//   texel_in  : raw texel, right-aligned in 32 bits
//   texel_out : expanded 4x8-bit texel
// Purely combinational.
module VX_tex_format #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0] format,
    input wire [31:0]                 texel_in,
    output wire [31:0]                texel_out
);
    `UNUSED_PARAM (CORE_ID)

    reg [31:0] texel_out_r;

    always @(*) begin
        case (format)
        `TEX_FORMAT_R8G8B8A8: begin
            // already 8 bits per channel: pass through
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[15:8];
            texel_out_r[23:16] = texel_in[23:16];
            texel_out_r[31:24] = texel_in[31:24];
        end
        `TEX_FORMAT_R5G6B5: begin
            // widen 5/6/5-bit fields to 8 bits by appending their own MSBs;
            // byte 3 is forced to 0xff
            texel_out_r[07:00] = {texel_in[15:11], texel_in[15:13]};
            texel_out_r[15:08] = {texel_in[10:5], texel_in[10:9]};
            texel_out_r[23:16] = {texel_in[4:0], texel_in[4:2]};
            texel_out_r[31:24] = 8'hff;
        end
        `TEX_FORMAT_R4G4B4A4: begin
            // widen each 4-bit field to 8 bits by duplicating the nibble.
            // Byte 0 previously mixed nibble [11:8] with nibble [15:12]
            // (the one expanded into byte 3); it now replicates [11:8] like
            // the other three channels.
            texel_out_r[07:00] = {2{texel_in[11:8]}};
            texel_out_r[15:08] = {2{texel_in[7:4]}};
            texel_out_r[23:16] = {2{texel_in[3:0]}};
            texel_out_r[31:24] = {2{texel_in[15:12]}};
        end
        `TEX_FORMAT_L8A8: begin
            // low byte is broadcast to bytes 0..2, high byte goes to byte 3
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[7:0];
            texel_out_r[23:16] = texel_in[7:0];
            texel_out_r[31:24] = texel_in[15:8];
        end
        `TEX_FORMAT_L8: begin
            // single byte broadcast to bytes 0..2, byte 3 forced to 0xff
            texel_out_r[07:00] = texel_in[7:0];
            texel_out_r[15:08] = texel_in[7:0];
            texel_out_r[23:16] = texel_in[7:0];
            texel_out_r[31:24] = 8'hff;
        end
        //`TEX_FORMAT_A8
        default: begin
            // single byte goes to byte 3, bytes 0..2 are zero
            texel_out_r[07:00] = 0;
            texel_out_r[15:08] = 0;
            texel_out_r[23:16] = 0;
            texel_out_r[31:24] = texel_in[7:0];
        end
        endcase
    end

    assign texel_out = texel_out_r;

endmodule
|
16
hw/rtl/tex_unit/VX_tex_lerp.sv
Normal file
|
@ -0,0 +1,16 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Per-channel weighted sum of two 4x8-bit texels:
//   out[c] = (in1[c] * alpha + in2[c] * beta) >> 8   for c = 0..3
// alpha is 9 bits wide and beta 8 bits wide; the 17-bit sum keeps bits
// [15:8] as the result.
module VX_tex_lerp (
    input wire [3:0][7:0]  in1,
    input wire [3:0][7:0]  in2,
    input wire [8:0]       alpha,
    input wire [7:0]       beta,
    output wire [3:0][7:0] out
);
    for (genvar ch = 0; ch < 4; ch++) begin
        wire [16:0] sum;
        assign sum = (in1[ch] * alpha) + (in2[ch] * beta);
        // bits [7:0] and [16] of the sum are intentionally dropped
        `UNUSED_VAR (sum)
        assign out[ch] = sum[15:8];
    end

endmodule
|
295
hw/rtl/tex_unit/VX_tex_mem.sv
Normal file
|
@ -0,0 +1,295 @@
|
|||
`include "VX_tex_define.vh"
|
||||
// Texel fetch stage: queues incoming address bundles, issues up to four
// data-cache read rounds per request (one per texel index), gathers the
// returned words and hands the assembled texels downstream.
//   req_*  : incoming request (4 byte addresses per lane) with ready/valid
//   rsp_*  : outgoing texel data (4 words per lane) with ready/valid
//   req_info is carried through unchanged to rsp_info.
module VX_tex_mem #(
    parameter CORE_ID   = 0,
    parameter REQ_INFOW = 1,
    parameter NUM_REQS  = 1
) (
    input wire clk,
    input wire reset,

    // memory interface
    VX_dcache_req_if.master dcache_req_if,
    VX_dcache_rsp_if.slave  dcache_rsp_if,

    // inputs
    input wire                           req_valid,
    input wire [NUM_REQS-1:0]            req_tmask,
    input wire [`TEX_FILTER_BITS-1:0]    req_filter,
    input wire [`TEX_STRIDE_BITS-1:0]    req_stride,
    input wire [NUM_REQS-1:0][3:0][31:0] req_addr,
    input wire [REQ_INFOW-1:0]           req_info,
    output wire                          req_ready,

    // outputs
    output wire                           rsp_valid,
    output wire [NUM_REQS-1:0]            rsp_tmask,
    output wire [NUM_REQS-1:0][3:0][31:0] rsp_data,
    output wire [REQ_INFOW-1:0]           rsp_info,
    input wire                            rsp_ready
);

    `UNUSED_PARAM (CORE_ID)

    // wide enough to count NUM_REQS * 4 outstanding lane responses
    localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1);

    wire [3:0]                     dup_reqs;
    wire [3:0][NUM_REQS-1:0][29:0] req_addr_w;
    wire [3:0][NUM_REQS-1:0][1:0]  align_offs;

    // Transpose [lane][texel] byte addresses into [texel][lane] word
    // addresses (bits 31:2) plus the byte offset within the word (bits 1:0).
    for (genvar r = 0; r < NUM_REQS; r++) begin
        for (genvar q = 0; q < 4; q++) begin
            assign req_addr_w[q][r] = req_addr[r][q][31:2];
            assign align_offs[q][r] = req_addr[r][q][1:0];
        end
    end

    // dup_reqs[q] is set when lane 0 is active and every active lane asks
    // for the same word as lane 0 for texel q (inactive lanes always match).
    for (genvar q = 0; q < 4; q++) begin
        wire [NUM_REQS-1:0] addr_matches;
        for (genvar r = 0; r < NUM_REQS; r++) begin
            assign addr_matches[r] = (req_addr_w[q][0] == req_addr_w[q][r]) || ~req_tmask[r];
        end
        assign dup_reqs[q] = req_tmask[0] && (& addr_matches);
    end

    // ---------------------------------------------------------------------
    // Request queue: holds the request being served (and those behind it)
    // ---------------------------------------------------------------------

    wire reqq_push, reqq_pop, reqq_empty, reqq_full;

    wire [3:0][NUM_REQS-1:0][29:0] q_req_addr;
    wire [NUM_REQS-1:0]            q_req_tmask;
    wire [`TEX_FILTER_BITS-1:0]    q_req_filter;
    wire [REQ_INFOW-1:0]           q_req_info;
    wire [`TEX_STRIDE_BITS-1:0]    q_req_stride;
    wire [3:0][NUM_REQS-1:0][1:0]  q_align_offs;
    wire [3:0]                     q_dup_reqs;

    assign reqq_push = req_valid && req_ready;

    VX_fifo_queue #(
        .DATAW   ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFOW + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4),
        .SIZE    (`LSUQ_SIZE),
        .OUT_REG (1)
    ) req_queue (
        .clk      (clk),
        .reset    (reset),
        .push     (reqq_push),
        .pop      (reqq_pop),
        .data_in  ({req_addr_w,   req_tmask,   req_info,   req_filter,   req_stride,   align_offs,   dup_reqs}),
        .data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}),
        .empty    (reqq_empty),
        .full     (reqq_full),
        `UNUSED_PIN (alm_full),
        `UNUSED_PIN (alm_empty),
        `UNUSED_PIN (size)
    );

    // a new request is accepted whenever the queue has room
    assign req_ready = ~reqq_full;

    ///////////////////////////////////////////////////////////////////////////

    wire req_texel_valid;
    wire sent_all_ready, last_texel_sent;
    wire req_texel_dup;
    wire [NUM_REQS-1:0][29:0] req_texel_addr;
    reg [1:0] req_texel_idx;
    reg req_texels_done;

    // index of the texel round currently being issued; advances once all
    // lanes of the round were accepted, wraps to 0 after the last round
    always @(posedge clk) begin
        if (reset || last_texel_sent) begin
            req_texel_idx <= 0;
        end else if (req_texel_valid && sent_all_ready) begin
            req_texel_idx <= req_texel_idx + 1;
        end
    end

    // set after the last round was issued, cleared when the entry is popped
    always @(posedge clk) begin
        if (reset || reqq_pop) begin
            req_texels_done <= 0;
        end else if (last_texel_sent) begin
            req_texels_done <= 1;
        end
    end

    assign req_texel_valid = ~reqq_empty && ~req_texels_done;
    assign req_texel_addr  = q_req_addr[req_texel_idx];
    assign req_texel_dup   = q_dup_reqs[req_texel_idx];

    // four rounds when filtering is enabled, a single round otherwise
    wire is_last_texel = (req_texel_idx == (q_req_filter ? 3 : 0));
    assign last_texel_sent = req_texel_valid && sent_all_ready && is_last_texel;

    // ---------------------------------------------------------------------
    // DCache request
    // ---------------------------------------------------------------------

    // lanes of the current round that were already accepted by the cache
    reg [NUM_REQS-1:0] texel_sent_mask;

    wire [NUM_REQS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;

    wire dcache_req_fire_any = (| dcache_req_fire);

    // round complete: every lane is ready / already sent / masked off,
    // or the round is a duplicate and lane 0 is ready
    assign sent_all_ready = (&(dcache_req_if.ready | texel_sent_mask | ~q_req_tmask))
                         || (req_texel_dup & dcache_req_if.ready[0]);

    always @(posedge clk) begin
        if (reset || sent_all_ready) begin
            texel_sent_mask <= 0;
        end else begin
            texel_sent_mask <= texel_sent_mask | dcache_req_fire;
        end
    end

    // for a duplicate round only lane 0 issues a request
    wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1};

    assign dcache_req_if.valid  = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask;
    assign dcache_req_if.rw     = {NUM_REQS{1'b0}};     // reads only
    assign dcache_req_if.addr   = req_texel_addr;
    assign dcache_req_if.byteen = {NUM_REQS{4'b1111}};
    assign dcache_req_if.data   = 'x;

    // the texel index travels in the low bits of the tag
`ifdef DBG_CACHE_REQ_INFO
    assign dcache_req_if.tag = {NUM_REQS{q_req_info[`DBG_CACHE_REQ_MDATAW-1:0], req_texel_idx}};
`else
    assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}};
`endif

    // ---------------------------------------------------------------------
    // DCache response
    // ---------------------------------------------------------------------

    reg [3:0][NUM_REQS-1:0][31:0]  rsp_texels, rsp_texels_n;
    wire [NUM_REQS-1:0][3:0][31:0] rsp_texels_qual;
    reg [NUM_REQS-1:0][31:0]       rsp_data_qual;
    reg [RSP_CTR_W-1:0]            rsp_rem_ctr, rsp_rem_ctr_init;
    wire [RSP_CTR_W-1:0]           rsp_rem_ctr_n;
    wire                           dcache_rsp_fire;
    wire [1:0]                     rsp_texel_idx;
    wire                           rsp_texel_dup;

    assign rsp_texel_idx = dcache_rsp_if.tag[1:0];
    `UNUSED_VAR (dcache_rsp_if.tag)

    assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx];

    assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;

    for (genvar r = 0; r < NUM_REQS; r++) begin
        // NOTE(review): for a duplicate round only lane 0 issues a request,
        // yet lanes > 0 are still masked with dcache_rsp_if.tmask[r] here;
        // confirm the response tmask covers those lanes, otherwise their
        // texel reads back as zero.
        wire [31:0] src_mask = {32{dcache_rsp_if.tmask[r]}};
        wire [31:0] src_data = ((r == 0 || rsp_texel_dup) ? dcache_rsp_if.data[0] : dcache_rsp_if.data[r]) & src_mask;

        // Move the addressed byte/half-word down to bit 0.
        // Statement order matters: the last line reads bits written by the
        // one before it.
        reg [31:0] rsp_data_shifted;
        always @(*) begin
            rsp_data_shifted[31:16] = src_data[31:16];
            rsp_data_shifted[15:0]  = q_align_offs[rsp_texel_idx][r][1] ? src_data[31:16] : src_data[15:0];
            rsp_data_shifted[7:0]   = q_align_offs[rsp_texel_idx][r][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0];
        end

        // keep 1, 2 or 4 bytes depending on the texel stride
        always @(*) begin
            case (q_req_stride)
            0:       rsp_data_qual[r] = 32'(rsp_data_shifted[7:0]);
            1:       rsp_data_qual[r] = 32'(rsp_data_shifted[15:0]);
            default: rsp_data_qual[r] = rsp_data_shifted;
            endcase
        end
    end

    // merge the incoming words into the slot of the texel they belong to
    always @(*) begin
        rsp_texels_n = rsp_texels;
        rsp_texels_n[rsp_texel_idx] |= rsp_data_qual;
    end

    always @(posedge clk) begin
        if (reset || reqq_pop) begin
            rsp_texels <= '0;
        end else if (dcache_rsp_fire) begin
            rsp_texels <= rsp_texels_n;
        end
    end

    // number of lane responses expected for the queued request:
    // one per duplicate round, popcount(tmask) per regular round
    always @(*) begin
        rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask));
        if (q_req_filter) begin
            for (integer i = 1; i < 4; ++i) begin
                rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask));
            end
        end
    end

    assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask));

    // loaded on the first cache request while idle (counter == 0),
    // decremented on every accepted response
    always @(posedge clk) begin
        if (reset) begin
            rsp_rem_ctr <= 0;
        end else begin
            if (dcache_req_fire_any && 0 == rsp_rem_ctr) begin
                rsp_rem_ctr <= rsp_rem_ctr_init;
            end else if (dcache_rsp_fire) begin
                rsp_rem_ctr <= rsp_rem_ctr_n;
            end
        end
    end

    // transpose back to [lane][texel] for the output
    for (genvar r = 0; r < NUM_REQS; r++) begin
        for (genvar q = 0; q < 4; q++) begin
            assign rsp_texels_qual[r][q] = rsp_texels_n[q][r];
        end
    end

    wire stall_out = rsp_valid && ~rsp_ready;

    wire is_last_rsp = (0 == rsp_rem_ctr_n);

    wire rsp_texels_done = dcache_rsp_fire && is_last_rsp;

    // retire the queue entry together with its final response
    assign reqq_pop = rsp_texels_done && ~stall_out;

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + REQ_INFOW + (4 * NUM_REQS * 32)),
        .RESETW (1)
    ) rsp_pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({rsp_texels_done, q_req_tmask, q_req_info, rsp_texels_qual}),
        .data_out ({rsp_valid,       rsp_tmask,   rsp_info,   rsp_data})
    );

    // hold off only the final cache response while the output is stalled
    assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out);

`ifdef DBG_TRACE_TEX
    wire [`NW_BITS-1:0] q_req_wid, req_wid, rsp_wid;
    wire [31:0]         q_req_PC, req_PC, rsp_PC;
    assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0];
    assign {req_wid, req_PC}     = req_info[`NW_BITS+32-1:0];
    assign {rsp_wid, rsp_PC}     = rsp_info[`NW_BITS+32-1:0];

    always @(posedge clk) begin
        if (dcache_req_fire_any) begin
            dpi_trace("%d: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=",
                      $time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, req_texel_idx);
            `TRACE_ARRAY1D(req_texel_addr, NUM_REQS);
            dpi_trace(", is_dup=%b\n", req_texel_dup);
        end
        if (dcache_rsp_fire) begin
            dpi_trace("%d: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=",
                      $time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.tmask, rsp_texel_idx);
            `TRACE_ARRAY1D(dcache_rsp_if.data, NUM_REQS);
            dpi_trace("\n");
        end
        if (req_valid && req_ready) begin
            dpi_trace("%d: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=",
                      $time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride);
            `TRACE_ARRAY2D(req_addr, 4, NUM_REQS);
            dpi_trace("\n");
        end
        if (rsp_valid && rsp_ready) begin
            dpi_trace("%d: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
                      $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
            `TRACE_ARRAY2D(rsp_data, 4, NUM_REQS);
            dpi_trace("\n");
        end
    end
`endif

endmodule
|
146
hw/rtl/tex_unit/VX_tex_sampler.sv
Normal file
|
@ -0,0 +1,146 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Two-stage sampler: converts the four fetched texels of every lane to
// 4x8-bit form, blends them pairwise with blend factor [0] (stage 0), then
// blends the two intermediate results with blend factor [1] (stage 1).
//   req_* : texels + blend factors in, ready/valid handshake
//   rsp_* : one filtered 32-bit texel per lane out, ready/valid handshake
//   req_info is carried through both stages to rsp_info.
module VX_tex_sampler #(
    parameter CORE_ID   = 0,
    parameter REQ_INFOW = 1,
    parameter NUM_REQS  = 1
) (
    input wire clk,
    input wire reset,

    // inputs
    input wire                                      req_valid,
    input wire [NUM_REQS-1:0]                       req_tmask,
    input wire [`TEX_FORMAT_BITS-1:0]               req_format,
    input wire [NUM_REQS-1:0][1:0][`BLEND_FRAC-1:0] req_blends,
    input wire [NUM_REQS-1:0][3:0][31:0]            req_data,
    input wire [REQ_INFOW-1:0]                      req_info,
    output wire                                     req_ready,

    // outputs
    output wire                      rsp_valid,
    output wire [NUM_REQS-1:0]       rsp_tmask,
    output wire [NUM_REQS-1:0][31:0] rsp_data,
    output wire [REQ_INFOW-1:0]      rsp_info,
    input wire                       rsp_ready
);

    `UNUSED_PARAM (CORE_ID)

    wire                                 valid_s0;
    wire [NUM_REQS-1:0]                  tmask_s0;
    wire [REQ_INFOW-1:0]                 req_info_s0;
    wire [NUM_REQS-1:0][31:0]            texel_ul, texel_uh;
    wire [NUM_REQS-1:0][31:0]            texel_ul_s0, texel_uh_s0;
    wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] blend_v, blend_v_s0;
    wire [NUM_REQS-1:0][31:0]            texel_v;

    wire stall_out;

    // stage 0: format conversion + first-axis interpolation
    for (genvar i = 0; i < NUM_REQS; ++i) begin

        wire [3:0][31:0] fmt_texels;

        for (genvar j = 0; j < 4; ++j) begin
            VX_tex_format #(
                .CORE_ID (CORE_ID)
            ) tex_format (
                .format    (req_format),
                .texel_in  (req_data[i][j]),
                .texel_out (fmt_texels[j])
            );
        end

        // weights: beta applies to the second texel, alpha = BLEND_ONE - beta
        // to the first
        wire [7:0] beta  = req_blends[i][0];
        wire [8:0] alpha = `BLEND_ONE - beta;

        VX_tex_lerp #(
        ) tex_lerp_ul (
            .in1   (fmt_texels[0]),
            .in2   (fmt_texels[1]),
            .alpha (alpha),
            .beta  (beta),
            .out   (texel_ul[i])
        );

        VX_tex_lerp #(
        ) tex_lerp_uh (
            .in1   (fmt_texels[2]),
            .in2   (fmt_texels[3]),
            .alpha (alpha),
            .beta  (beta),
            .out   (texel_uh[i])
        );

        // second-axis blend factor is consumed in the next stage
        assign blend_v[i] = req_blends[i][1];
    end

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)),
        .RESETW (1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({req_valid, req_tmask, req_info,    blend_v,    texel_ul,    texel_uh}),
        .data_out ({valid_s0,  tmask_s0,  req_info_s0, blend_v_s0, texel_ul_s0, texel_uh_s0})
    );

    // stage 1: second-axis interpolation
    for (genvar i = 0; i < NUM_REQS; i++) begin
        wire [7:0] beta  = blend_v_s0[i];
        wire [8:0] alpha = `BLEND_ONE - beta;

        VX_tex_lerp #(
        ) tex_lerp_v (
            .in1   (texel_ul_s0[i]),
            .in2   (texel_uh_s0[i]),
            .alpha (alpha),
            .beta  (beta),
            .out   (texel_v[i])
        );
    end

    // both pipeline registers freeze while the output is not accepted
    assign stall_out = rsp_valid && ~rsp_ready;

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + REQ_INFOW + (NUM_REQS * 32)),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({valid_s0,  tmask_s0,  req_info_s0, texel_v}),
        .data_out ({rsp_valid, rsp_tmask, rsp_info,    rsp_data})
    );

    // can accept new request?
    assign req_ready = ~stall_out;

`ifdef DBG_TRACE_TEX
    wire [`NW_BITS-1:0] req_wid, rsp_wid;
    wire [31:0]         req_PC, rsp_PC;

    assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
    assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];

    // req_blends is indexed [lane][axis]; the trace macro is given an array
    // with NUM_REQS entries, so build per-axis vectors instead of passing
    // req_blends[0] / req_blends[1] (which are lane 0 / lane 1 pairs).
    wire [NUM_REQS-1:0][`BLEND_FRAC-1:0] dbg_blend_u;
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign dbg_blend_u[i] = req_blends[i][0];
    end

    always @(posedge clk) begin
        if (req_valid && req_ready) begin
            dpi_trace("%d: core%0d-tex-sampler-req: wid=%0d, PC=%0h, tmask=%b, format=%0d, data=",
                      $time, CORE_ID, req_wid, req_PC, req_tmask, req_format);
            `TRACE_ARRAY2D(req_data, 4, NUM_REQS);
            dpi_trace(", u0=");
            `TRACE_ARRAY1D(dbg_blend_u, NUM_REQS);
            dpi_trace(", v0=");
            `TRACE_ARRAY1D(blend_v, NUM_REQS);
            dpi_trace("\n");
        end
        if (rsp_valid && rsp_ready) begin
            dpi_trace("%d: core%0d-tex-sampler-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
                      $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
            `TRACE_ARRAY1D(rsp_data, NUM_REQS);
            dpi_trace("\n");
        end
    end
`endif

endmodule
|
21
hw/rtl/tex_unit/VX_tex_sat.sv
Normal file
|
@ -0,0 +1,21 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
// Saturates a signed IN_W-bit value to an unsigned OUT_W-bit range:
//   negative input (MSB set)           -> 0
//   any bit set in [IN_W-2:OUT_W]      -> all ones
//   otherwise                          -> low OUT_W bits of the input
// MODEL selects between a mask-based (1) and a compare/mux-based (other)
// formulation of the same function.
module VX_tex_sat #(
    parameter IN_W  = 1,
    parameter OUT_W = 1,
    parameter MODEL = 1
) (
    input wire [IN_W-1:0]   data_in,
    output wire [OUT_W-1:0] data_out
);
    `STATIC_ASSERT(((OUT_W+1) < IN_W), ("invalid parameter"))

    if (MODEL == 1) begin
        // all-zero when the input is negative, all-one otherwise
        wire [OUT_W-1:0] keep_if_positive = {OUT_W{~data_in[IN_W-1]}};
        // all-one when any magnitude bit above the output range is set
        wire [OUT_W-1:0] force_max        = {OUT_W{(| data_in[IN_W-2:OUT_W])}};
        assign data_out = keep_if_positive & (force_max | data_in[OUT_W-1:0]);
    end else begin
        wire [OUT_W-1:0] max_value = {OUT_W{1'b1}};
        assign data_out = data_in[IN_W-1] ? OUT_W'(0)
                        : ((data_in > max_value) ? max_value : OUT_W'(data_in));
    end

endmodule
|
27
hw/rtl/tex_unit/VX_tex_stride.sv
Normal file
|
@ -0,0 +1,27 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Maps a texel format to log_stride:
//   A8, L8                     -> 0
//   L8A8, R5G6B5, R4G4B4A4     -> 1
//   anything else (R8G8B8A8)   -> 2
// Purely combinational.
module VX_tex_stride #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0]  format,
    output wire [`TEX_STRIDE_BITS-1:0] log_stride
);
    `UNUSED_PARAM (CORE_ID)

    reg [`TEX_STRIDE_BITS-1:0] stride_sel;

    always @(*) begin
        case (format)
            `TEX_FORMAT_A8,
            `TEX_FORMAT_L8:
                stride_sel = 0;
            `TEX_FORMAT_L8A8,
            `TEX_FORMAT_R5G6B5,
            `TEX_FORMAT_R4G4B4A4:
                stride_sel = 1;
            //`TEX_FORMAT_R8G8B8A8
            default:
                stride_sel = 2;
        endcase
    end

    assign log_stride = stride_sel;

endmodule
|
234
hw/rtl/tex_unit/VX_tex_unit.sv
Normal file
|
@ -0,0 +1,234 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Texture unit top level: holds the per-unit texture state written through
// the CSR interface and chains three stages for every texture request:
//   VX_tex_addr    -> texel address generation
//   VX_tex_mem     -> texel fetch through the data cache
//   VX_tex_sampler -> format conversion and filtering
// Request side-band data travels through the stages in the *_info buses.
module VX_tex_unit #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,

    // Texture unit <-> Memory Unit
    VX_dcache_req_if.master dcache_req_if,
    VX_dcache_rsp_if.slave  dcache_rsp_if,

    // Inputs
    VX_tex_req_if.slave     tex_req_if,
    VX_tex_csr_if.slave     tex_csr_if,

    // Outputs
    VX_tex_rsp_if.master    tex_rsp_if
);

    // info bus widths: S = sampler, A = address stage (S + format),
    // M = memory stage (A + per-lane blend factors)
    localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32;
    localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S;
    localparam REQ_INFOW_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFOW_A;

    // texture state, one entry per texture unit
    // (mip offset and dimensions additionally per mip level)
    reg [`TEX_MIPOFF_BITS-1:0]    tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
    reg [1:0][`TEX_DIM_BITS-1:0]  tex_dims   [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
    reg [`TEX_ADDR_BITS-1:0]      tex_baddr  [`NUM_TEX_UNITS-1:0];
    reg [`TEX_FORMAT_BITS-1:0]    tex_format [`NUM_TEX_UNITS-1:0];
    reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps  [`NUM_TEX_UNITS-1:0];
    reg [`TEX_FILTER_BITS-1:0]    tex_filter [`NUM_TEX_UNITS-1:0];

    // CSRs programming

    // set by any CSR write to a unit, cleared on reset or when a texture
    // request is accepted; only read by the debug trace below
    reg [`NUM_TEX_UNITS-1:0] csrs_dirty;
    `UNUSED_VAR (csrs_dirty)

    for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
        // the mip level addressed by a write sits in write_data[31:28]
        wire [`TEX_LOD_BITS-1:0] csr_mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS];

        always @(posedge clk) begin
            if (tex_csr_if.write_enable) begin
                case (tex_csr_if.write_addr)
                    `CSR_TEX_ADDR(i) : begin
                        tex_baddr[i]  <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
                        csrs_dirty[i] <= 1;
                    end
                    `CSR_TEX_FORMAT(i) : begin
                        tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
                        csrs_dirty[i] <= 1;
                    end
                    `CSR_TEX_WRAP(i) : begin
                        tex_wraps[i][0] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS];
                        tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS];
                        csrs_dirty[i]   <= 1;
                    end
                    `CSR_TEX_FILTER(i) : begin
                        tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
                        csrs_dirty[i] <= 1;
                    end
                    `CSR_TEX_MIPOFF(i) : begin
                        tex_mipoff[i][csr_mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
                        csrs_dirty[i] <= 1;
                    end
                    `CSR_TEX_WIDTH(i) : begin
                        tex_dims[i][csr_mip_level][0] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
                        csrs_dirty[i] <= 1;
                    end
                    `CSR_TEX_HEIGHT(i) : begin
                        tex_dims[i][csr_mip_level][1] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
                        csrs_dirty[i] <= 1;
                    end
                endcase
            end
            // placed last so the clear wins over a set in the same cycle
            if (reset || (tex_req_if.valid && tex_req_if.ready)) begin
                csrs_dirty[i] <= '0;
            end
        end
    end

    // mipmap attributes

    wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0]   sel_mipoff;
    wire [`NUM_THREADS-1:0][1:0][`TEX_DIM_BITS-1:0] sel_dims;

    // per thread: look up mip offset and dimensions of the requested
    // unit at the level taken from lod[t][23:20]
    for (genvar t = 0; t < `NUM_THREADS; t++) begin
        wire [`NTEX_BITS-1:0]    sel_unit  = tex_req_if.unit[`NTEX_BITS-1:0];
        wire [`TEX_LOD_BITS-1:0] sel_level = tex_req_if.lod[t][20+:`TEX_LOD_BITS];
        assign sel_mipoff[t] = tex_mipoff[sel_unit][sel_level];
        assign sel_dims[t]   = tex_dims[sel_unit][sel_level];
    end

    // address generation

    wire                                          mem_req_valid;
    wire [`NUM_THREADS-1:0]                       mem_req_tmask;
    wire [`TEX_FILTER_BITS-1:0]                   mem_req_filter;
    wire [`TEX_STRIDE_BITS-1:0]                   mem_req_stride;
    wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] mem_req_blends;
    wire [`NUM_THREADS-1:0][3:0][31:0]            mem_req_addr;
    wire [REQ_INFOW_A-1:0]                        mem_req_info;
    wire                                          mem_req_ready;

    VX_tex_addr #(
        .CORE_ID   (CORE_ID),
        .REQ_INFOW (REQ_INFOW_A),
        .NUM_REQS  (`NUM_THREADS)
    ) tex_addr (
        .clk          (clk),
        .reset        (reset),

        .req_valid    (tex_req_if.valid),
        .req_tmask    (tex_req_if.tmask),
        .req_coords   (tex_req_if.coords),
        .req_format   (tex_format[tex_req_if.unit]),
        .req_filter   (tex_filter[tex_req_if.unit]),
        .req_wraps    (tex_wraps[tex_req_if.unit]),
        .req_baseaddr (tex_baddr[tex_req_if.unit]),
        .req_mipoff   (sel_mipoff),
        .req_logdims  (sel_dims),
        .req_info     ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
        .req_ready    (tex_req_if.ready),

        .rsp_valid    (mem_req_valid),
        .rsp_tmask    (mem_req_tmask),
        .rsp_filter   (mem_req_filter),
        .rsp_stride   (mem_req_stride),
        .rsp_addr     (mem_req_addr),
        .rsp_blends   (mem_req_blends),
        .rsp_info     (mem_req_info),
        .rsp_ready    (mem_req_ready)
    );

    // retrieve texel values from memory

    wire                               mem_rsp_valid;
    wire [`NUM_THREADS-1:0]            mem_rsp_tmask;
    wire [`NUM_THREADS-1:0][3:0][31:0] mem_rsp_data;
    wire [REQ_INFOW_M-1:0]             mem_rsp_info;
    wire                               mem_rsp_ready;

    VX_tex_mem #(
        .CORE_ID   (CORE_ID),
        .REQ_INFOW (REQ_INFOW_M),
        .NUM_REQS  (`NUM_THREADS)
    ) tex_mem (
        .clk        (clk),
        .reset      (reset),

        // memory interface
        .dcache_req_if (dcache_req_if),
        .dcache_rsp_if (dcache_rsp_if),

        // inputs
        .req_valid  (mem_req_valid),
        .req_tmask  (mem_req_tmask),
        .req_filter (mem_req_filter),
        .req_stride (mem_req_stride),
        .req_addr   (mem_req_addr),
        // blend factors ride along in the info bus
        .req_info   ({mem_req_blends, mem_req_info}),
        .req_ready  (mem_req_ready),

        // outputs
        .rsp_valid  (mem_rsp_valid),
        .rsp_tmask  (mem_rsp_tmask),
        .rsp_data   (mem_rsp_data),
        .rsp_info   (mem_rsp_info),
        .rsp_ready  (mem_rsp_ready)
    );

    // apply sampler

    wire [`NUM_THREADS-1:0][1:0][`BLEND_FRAC-1:0] rsp_blends;
    wire [`TEX_FORMAT_BITS-1:0]                   rsp_format;
    wire [REQ_INFOW_S-1:0]                        rsp_info;

    // unpack in the same order the fields were packed above
    assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info;

    VX_tex_sampler #(
        .CORE_ID   (CORE_ID),
        .REQ_INFOW (REQ_INFOW_S),
        .NUM_REQS  (`NUM_THREADS)
    ) tex_sampler (
        .clk        (clk),
        .reset      (reset),

        // inputs
        .req_valid  (mem_rsp_valid),
        .req_tmask  (mem_rsp_tmask),
        .req_data   (mem_rsp_data),
        .req_format (rsp_format),
        .req_blends (rsp_blends),
        .req_info   (rsp_info),
        .req_ready  (mem_rsp_ready),

        // outputs
        .rsp_valid  (tex_rsp_if.valid),
        .rsp_tmask  (tex_rsp_if.tmask),
        .rsp_data   (tex_rsp_if.data),
        .rsp_info   ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}),
        .rsp_ready  (tex_rsp_if.ready)
    );

`ifdef DBG_TRACE_TEX
    always @(posedge clk) begin
        if (tex_req_if.valid && tex_req_if.ready) begin
            // dump the state of every unit written since the last request
            for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin
                if (csrs_dirty[i]) begin
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_addr=%0h\n", $time, CORE_ID, i, tex_baddr[i]);
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_format=%0h\n", $time, CORE_ID, i, tex_format[i]);
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_u=%0h\n", $time, CORE_ID, i, tex_wraps[i][0]);
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_wrap_v=%0h\n", $time, CORE_ID, i, tex_wraps[i][1]);
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_filter=%0h\n", $time, CORE_ID, i, tex_filter[i]);
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_mipoff[0]=%0h\n", $time, CORE_ID, i, tex_mipoff[i][0]);
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_width[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][0]);
                    dpi_trace("%d: core%0d-tex-csr: tex%0d_height[0]=%0h\n", $time, CORE_ID, i, tex_dims[i][0][1]);
                end
            end

            dpi_trace("%d: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=",
                      $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod);
            `TRACE_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS);
            dpi_trace(", v=");
            `TRACE_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS);
            dpi_trace("\n");
        end
        if (tex_rsp_if.valid && tex_rsp_if.ready) begin
            dpi_trace("%d: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
                      $time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask);
            `TRACE_ARRAY1D(tex_rsp_if.data, `NUM_THREADS);
            dpi_trace("\n");
        end
    end
`endif

endmodule
|
38
hw/rtl/tex_unit/VX_tex_wrap.sv
Normal file
|
@ -0,0 +1,38 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Applies a texture wrap mode to a single coordinate.
//
// Parameters:
//   CORE_ID - not used by this module (consumed by UNUSED_PARAM below).
// Ports:
//   wrap_i  - wrap mode selector (TEX_WRAP_CLAMP, TEX_WRAP_MIRROR, anything
//             else is handled as repeat).
//   coord_i - 32-bit input coordinate.
//   coord_o - FIXED_FRAC-bit wrapped coordinate.
module VX_tex_wrap #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_WRAP_BITS-1:0] wrap_i,
    input wire [31:0]               coord_i,
    output wire [`FIXED_FRAC-1:0]   coord_o
);

    `UNUSED_PARAM (CORE_ID)

    // Candidate for clamp mode: coord_i passed through VX_tex_sat
    // (32 bits in, FIXED_FRAC bits out).
    wire [`FIXED_FRAC-1:0] clamped;

    VX_tex_sat #(
        .IN_W  (32),
        .OUT_W (`FIXED_FRAC)
    ) sat_fx (
        .data_in  (coord_i),
        .data_out (clamped)
    );

    // Candidate for repeat mode: the low FIXED_FRAC bits, unchanged.
    wire [`FIXED_FRAC-1:0] frac_bits = coord_i[`FIXED_FRAC-1:0];

    // Candidate for mirror mode: the low bits are inverted whenever the bit
    // just above them (bit FIXED_FRAC) is set.
    wire [`FIXED_FRAC-1:0] flip_mask = {`FIXED_FRAC{coord_i[`FIXED_FRAC]}};
    wire [`FIXED_FRAC-1:0] mirrored  = frac_bits ^ flip_mask;

    // Mode selection
    reg [`FIXED_FRAC-1:0] wrapped;

    always @(*) begin
        case (wrap_i)
            `TEX_WRAP_CLAMP:  wrapped = clamped;
            `TEX_WRAP_MIRROR: wrapped = mirrored;
            default:          wrapped = frac_bits; // TEX_WRAP_REPEAT
        endcase
    end

    assign coord_o = wrapped;

endmodule
|
|
@ -194,9 +194,9 @@
|
|||
"issue_imm": 32,
|
||||
"issue_use_pc": 1,
|
||||
"issue_use_imm": 1,
|
||||
"gpr_rsp_a":"`NUM_THREADS * 32",
|
||||
"gpr_rsp_b":"`NUM_THREADS * 32",
|
||||
"gpr_rsp_c":"`NUM_THREADS * 32",
|
||||
"gpr_rs1":"`NUM_THREADS * 32",
|
||||
"gpr_rs2":"`NUM_THREADS * 32",
|
||||
"gpr_rs3":"`NUM_THREADS * 32",
|
||||
"?writeback_valid": 1,
|
||||
"writeback_wid":"`NW_BITS",
|
||||
"writeback_pc": 32,
|
||||
|
@ -205,7 +205,7 @@
|
|||
"writeback_data":"`NUM_THREADS * 32",
|
||||
"writeback_eop": 1,
|
||||
"!scoreboard_delay": 1,
|
||||
"!execute_delay": 1
|
||||
"!dispatch_delay": 1
|
||||
},
|
||||
"afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": {
|
||||
"?valid_st0": 1,
|
||||
|
|
|
@ -262,7 +262,7 @@ def expand_text(text, params):
|
|||
has_func = do_repl.has_func
|
||||
if not (params_updated or do_repl.expanded):
|
||||
break
|
||||
text = new_text
|
||||
text = new_text
|
||||
changed = True
|
||||
if not has_func:
|
||||
break
|
||||
|
|
|
@ -8,20 +8,21 @@ else
|
|||
RUN_SYNTH=qsub-synth
|
||||
endif
|
||||
|
||||
# control RTL debug print states
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
# control RTL debug tracing states
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
|
||||
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += $(DBG_TRACE_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
||||
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
|
@ -33,7 +34,8 @@ CONFIG32 := -DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_
|
|||
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3_CACHE_SIZE=524288 $(CONFIGS)
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)
|
||||
RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu
|
||||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE) $(TEX_INCLUDE)
|
||||
|
||||
CFLAGS += $(RTL_INCLUDE)
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,12 +12,12 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Part, Family
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Executable Configuration
|
||||
SYN_ARGS = --parallel --read_settings_files=on
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG
|
|||
FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Executable Configuration
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
40
hw/unit_tests/cache/Makefile
vendored
|
@ -1,46 +1,42 @@
|
|||
PARAM += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4
|
||||
|
||||
PARAMS += -DCACHE_SIZE=4096 -DCACHE_WORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DCACHE_NUM_BANKS=4 -DCACHE_CREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4
|
||||
|
||||
# control RTL debug tracing states
|
||||
DBG_TRACE_FLAGS = -DDBG_TRACE_CORE_ICACHE \
|
||||
-DDBG_TRACE_CORE_DCACHE \
|
||||
-DDBG_TRACE_CACHE_BANK \
|
||||
-DDBG_TRACE_CACHE_SNP \
|
||||
-DDBG_TRACE_CACHE_MSHR \
|
||||
-DDBG_TRACE_CACHE_TAG \
|
||||
-DDBG_TRACE_CACHE_DATA \
|
||||
-DDBG_TRACE_MEM \
|
||||
-DDBG_TRACE_OPAE \
|
||||
-DDBG_TRACE_AVS
|
||||
|
||||
# control RTL debug print states
|
||||
DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \
|
||||
-DDBG_PRINT_CORE_DCACHE \
|
||||
-DDBG_PRINT_CACHE_BANK \
|
||||
-DDBG_PRINT_CACHE_SNP \
|
||||
-DDBG_PRINT_CACHE_MSHR \
|
||||
-DDBG_PRINT_CACHE_TAG \
|
||||
-DDBG_PRINT_CACHE_DATA \
|
||||
-DDBG_PRINT_MEM \
|
||||
-DDBG_PRINT_OPAE \
|
||||
-DDBG_PRINT_AVS
|
||||
|
||||
#DBG_PRINT=$(DBG_PRINT_FLAGS)
|
||||
#DBG_PRINT=$(DBG_TRACE_FLAGS)
|
||||
|
||||
INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs
|
||||
|
||||
|
||||
SRCS = cachesim.cpp testbench.cpp
|
||||
|
||||
all: build
|
||||
|
||||
CF += -std=c++11 -fms-extensions -I../..
|
||||
CF += $(PARAMS)
|
||||
|
||||
VF += --language 1800-2009 --assert -Wall --trace #-Wpedantic
|
||||
VF += -Wno-DECLFILENAME
|
||||
VF += --x-initial unique
|
||||
VF += -exe $(SRCS) $(INCLUDE)
|
||||
|
||||
DBG += -DVCD_OUTPUT $(DBG_PRINT)
|
||||
|
||||
VF += $(PARAMS)
|
||||
|
||||
gen:
|
||||
verilator $(VF) -DNDEBUG -cc VX_cache.v $(PARAM) -CFLAGS '$(CF) -DNDEBUG $(PARAM)' --exe $(SRCS)
|
||||
verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
|
||||
|
||||
build: gen
|
||||
(cd obj_dir && make -j -f VVX_cache.mk)
|
||||
(cd obj_dir && make -j -f V$(TOP).mk)
|
||||
|
||||
run: build
|
||||
(cd obj_dir && ./VVX_cache)
|
||||
(cd obj_dir && ./V$(TOP))
|
||||
|
||||
clean:
|
||||
rm -rf obj_dir
|
||||
|
|
10
hw/unit_tests/cache/cachesim.cpp
vendored
|
@ -173,10 +173,10 @@ void CacheSim::stall_mem(){
|
|||
}
|
||||
|
||||
void CacheSim::send_snoop_req(){
|
||||
cache_->snp_req_valid = 1;
|
||||
/*cache_->snp_req_valid = 1;
|
||||
cache_->snp_req_addr = 0x12222222;
|
||||
cache_->snp_req_invalidate = 1;
|
||||
cache_->snp_req_tag = 0xff;
|
||||
cache_->snp_req_tag = 0xff; */
|
||||
}
|
||||
|
||||
void CacheSim::eval_mem_bus() {
|
||||
|
@ -274,9 +274,9 @@ bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){
|
|||
//DEBUG
|
||||
|
||||
void CacheSim::display_miss(){
|
||||
int i = (unsigned int)cache_->miss_vec;
|
||||
std::bitset<8> x(i);
|
||||
if (i) std::cout << "Miss Vec " << x << std::endl;
|
||||
//int i = (unsigned int)cache_->miss_vec;
|
||||
//std::bitset<8> x(i);
|
||||
//if (i) std::cout << "Miss Vec " << x << std::endl;
|
||||
//std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,11 +1,30 @@
|
|||
all: testbench.iv
|
||||
TOP = VX_fifo_queue
|
||||
|
||||
testbench.iv: testbench.v
|
||||
iverilog testbench.v -o testbench.iv -I ../../rtl/
|
||||
PARAMS ?=
|
||||
|
||||
run: testbench.iv
|
||||
! vvp testbench.iv | grep 'ERROR' || false
|
||||
INCLUDE = -I../../rtl/ -I../../rtl/libs
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
all: build
|
||||
|
||||
CF += -std=c++11 -fms-extensions -I../..
|
||||
VF += $(PARAMS)
|
||||
|
||||
VF += --language 1800-2009 --assert -Wall --trace
|
||||
VF += -Wno-DECLFILENAME
|
||||
VF += --x-initial unique
|
||||
VF += -exe $(SRCS) $(INCLUDE)
|
||||
VF += $(PARAMS)
|
||||
|
||||
gen:
|
||||
verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
|
||||
|
||||
build: gen
|
||||
(cd obj_dir && make -j -f V$(TOP).mk)
|
||||
|
||||
run: build
|
||||
(cd obj_dir && ./V$(TOP))
|
||||
|
||||
clean:
|
||||
rm testbench.iv
|
||||
|
||||
rm -rf obj_dir
|
||||
|
|
93
hw/unit_tests/generic_queue/main.cpp
Normal file
|
@ -0,0 +1,93 @@
|
|||
#include "vl_simulator.h"
|
||||
#include "VVX_fifo_queue.h"
|
||||
#include <iostream>
|
||||
|
||||
#define MAX_TICKS 20
|
||||
|
||||
// Evaluates `x` once; if it is false, prints "FAILED: <expression text>" to
// stdout and aborts the process. Wrapped in do/while(false) so it behaves as
// a single statement.
#define CHECK(x)                                       \
  do {                                                 \
    if (!(x)) {                                        \
      std::cout << "FAILED: " << #x << std::endl;      \
      std::abort();                                    \
    }                                                  \
  } while (false)
|
||||
|
||||
uint64_t ticks = 0;
|
||||
|
||||
double sc_time_stamp() {
|
||||
return ticks;
|
||||
}
|
||||
|
||||
using Device = VVX_fifo_queue;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// Initialize Verilators variables
|
||||
Verilated::commandArgs(argc, argv);
|
||||
|
||||
vl_simulator<Device> sim;
|
||||
|
||||
// run test
|
||||
ticks = sim.reset(0);
|
||||
while (ticks < MAX_TICKS) {
|
||||
switch (ticks) {
|
||||
case 0:
|
||||
// initial values
|
||||
sim->pop = 0;
|
||||
sim->push = 0;
|
||||
ticks = sim.step(ticks, 2);
|
||||
break;
|
||||
case 2:
|
||||
// Verify outputs
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x1);
|
||||
// push 0xa
|
||||
sim->pop = 0;
|
||||
sim->push = 1;
|
||||
sim->data_in = 0xa;
|
||||
break;
|
||||
case 4:
|
||||
// verify outputs
|
||||
CHECK(sim->data_out == 0xa);
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x0);
|
||||
// push 0xb
|
||||
sim->pop = 0;
|
||||
sim->push = 1;
|
||||
sim->data_in = 0xb;
|
||||
break;
|
||||
case 6:
|
||||
// verify outputs
|
||||
CHECK(sim->data_out == 0xa);
|
||||
CHECK(sim->full == 0x1);
|
||||
CHECK(sim->empty == 0x0);
|
||||
// pop
|
||||
sim->pop = 1;
|
||||
sim->push = 0;
|
||||
break;
|
||||
case 8:
|
||||
// verify outputs
|
||||
CHECK(sim->data_out == 0xb);
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x0);
|
||||
// pop
|
||||
sim->pop = 1;
|
||||
sim->push = 0;
|
||||
break;
|
||||
case 10:
|
||||
// verify outputs
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x1);
|
||||
sim->pop = 0;
|
||||
sim->push = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
// advance clock
|
||||
ticks = sim.step(ticks, 2);
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
81
hw/unit_tests/generic_queue/vl_simulator.h
Normal file
|
@ -0,0 +1,81 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include "verilated.h"
|
||||
|
||||
#ifdef VM_TRACE
|
||||
#include <verilated_vcd_c.h> // Trace file format header
|
||||
#endif
|
||||
|
||||
// Thin wrapper that owns a model of type T and drives its `clk` / `reset`
// members. T must expose `clk`, `reset`, `eval()` and `final()`; when
// VM_TRACE is defined it must also expose `trace()`, and every evaluated tick
// is dumped to "trace.vcd".
template <typename T>
class vl_simulator {
public:

  // Starts with clk and reset both low; opens the VCD trace if enabled.
  vl_simulator() {
    top_.clk   = 0;
    top_.reset = 0;
  #ifdef VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
  #endif
  }

  // Closes the trace (if any) and then finalises the model.
  ~vl_simulator() {
  #ifdef VM_TRACE
    tfp_.close();
  #endif
    top_.final();
  }

  // Holds reset high for two ticks, releases it, and returns the new tick
  // count (ticks + 2).
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    const uint64_t after_reset = this->step(ticks, 2);
    top_.reset = 0;
    return after_reset;
  }

  // Runs `count` ticks. Each tick evaluates the model, optionally dumps the
  // trace at the current tick, then toggles clk. Returns ticks + count.
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    for (uint32_t done = 0; done != count; ++done) {
      top_.eval();
  #ifdef VM_TRACE
      tfp_.dump(ticks);
  #endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the wrapped model's ports.
  T* operator->() {
    return &top_;
  }

private:

  T top_;
#ifdef VM_TRACE
  VerilatedVcdC tfp_;
#endif
};
|
||||
|
||||
// Writes each argument, converted to uint32_t, into consecutive words of
// `sig`: the first argument goes to sig[0], the second to sig[1], and so on.
// Words beyond the number of arguments are left untouched.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> words{static_cast<uint32_t>(std::forward<Args>(args))...};
  uint32_t* dst = sig;
  for (const uint32_t word : words) {
    *dst = word;
    ++dst;
  }
}
|
||||
|
||||
// Compares consecutive words of `sig` against the arguments (each converted
// to uint32_t), starting at sig[0]. Returns -1 or 1 at the first differing
// word (sig word smaller / larger respectively), or 0 if every compared word
// matches. Only as many words as there are arguments are examined.
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> expected{static_cast<uint32_t>(std::forward<Args>(args))...};
  const uint32_t* cur = sig;
  for (const uint32_t want : expected) {
    const uint32_t got = *cur;
    ++cur;
    if (got != want)
      return (got < want) ? -1 : 1;
  }
  return 0;
}
|
30
hw/unit_tests/tex_unit/tex_sampler/Makefile
Normal file
|
@ -0,0 +1,30 @@
|
|||
# Verilator unit-test build for the texture sampler.
#
#   make / make build  - generate the C++ model and compile it
#   make run           - build, then execute the test binary
#   make clean         - remove the generated obj_dir
#
# PARAMS may be set on the command line to pass extra flags to Verilator.

# Top-level RTL module under test; also names the generated V$(TOP) binary.
TOP := VX_tex_sampler

PARAMS ?=

INCLUDE := -I../../../rtl/ -I../../../rtl/libs -I../../../rtl/tex_unit

SRCS := main.cpp

# None of these targets produce a file of the same name.
.PHONY: all gen build run clean

all: build

CF += -std=c++11 -fms-extensions -I../..

VF += --language 1800-2009 --assert -Wall --trace
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += -exe $(SRCS) $(INCLUDE)
# PARAMS was appended to VF twice in the original; once is sufficient.
VF += $(PARAMS)

gen:
	verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)

# Use $(MAKE) so flags such as -n and the jobserver reach the sub-make.
build: gen
	(cd obj_dir && $(MAKE) -j -f V$(TOP).mk)

run: build
	(cd obj_dir && ./V$(TOP))

clean:
	rm -rf obj_dir
|
215
hw/unit_tests/tex_unit/tex_sampler/main.cpp
Normal file
|
@ -0,0 +1,215 @@
|
|||
#include "vl_simulator.h"
|
||||
#include "VVX_tex_sampler.h"
|
||||
#include <iostream>
|
||||
#include <map>
|
||||
|
||||
#define MAX_TICKS 20
|
||||
#define MAX_UNIT_CYCLES 5
|
||||
#define NUM_THREADS
|
||||
|
||||
// Evaluates `x` once; if it is false, prints "FAILED: <expression text>" to
// stdout and aborts the process. Wrapped in do/while(false) so it behaves as
// a single statement.
#define CHECK(x)                                       \
  do {                                                 \
    if (!(x)) {                                        \
      std::cout << "FAILED: " << #x << std::endl;      \
      std::abort();                                    \
    }                                                  \
  } while (false)
|
||||
|
||||
uint64_t ticks = 0;
|
||||
|
||||
// using Device = VVX_tex_sampler;
|
||||
|
||||
// Cycle-indexed test harness around vl_simulator<T>.
//
// generate_test_vectors() fills two maps keyed by cycle number (stimulus in
// input_map, expected values in output_map); run() then steps the simulator
// and, on each cycle, applies the stimulus and checks the expectations found
// for that cycle.
//
// NOTE(review): this class does not compile as written. Individual problems
// are flagged next to the affected lines below; the code itself is unchanged.
template <typename T>
|
||||
class testbench
|
||||
{
|
||||
private:
|
||||
vl_simulator<T> sim;
|
||||
// NOTE(review): Input and Output are only defined further down in this class,
// so these elaborated names do not refer to testbench::Input / ::Output here.
// The struct definitions probably need to move above these members.
std::map<int, struct Input> input_map;
|
||||
std::map<int, struct Output> output_map;
|
||||
|
||||
public:
|
||||
|
||||
// One test case: per-cycle stimulus and expected outputs plus control flags.
// NOTE(review): use_reset, num_output_check (apart from sizing an allocation
// in unittest_Cmodel) and check_output_cycle are not consumed by any code
// visible in this class.
struct UnitTest {
|
||||
bool use_reset;
|
||||
unsigned int num_cycles;
|
||||
bool use_cmodel;
|
||||
struct Output outputs[MAX_UNIT_CYCLES];
|
||||
struct Input inputs[MAX_UNIT_CYCLES];
|
||||
unsigned int num_output_check;
|
||||
unsigned int check_output_cycle[MAX_UNIT_CYCLES];
|
||||
// NOTE(review): missing ';' after the closing brace of this struct.
}
|
||||
|
||||
// Stimulus applied to the device request/response-ready ports on one cycle.
// NOTE(review): NUM_THREADS is #defined with no value earlier in this file,
// so the array bounds below expand to nothing.
// NOTE(review): generate_test_vectors() reads an `input_cycle` member that is
// not declared here.
struct Input {
|
||||
bool req_valid;
|
||||
unsigned int req_wid;
|
||||
unsigned int req_tmask;
|
||||
unsigned int req_PC;
|
||||
unsigned int req_rd;
|
||||
unsigned int req_wb;
|
||||
unsigned int req_filter;
|
||||
unsigned int req_format;
|
||||
unsigned int req_u[NUM_THREADS];
|
||||
unsigned int req_v[NUM_THREADS];
|
||||
unsigned int req_texels[NUM_THREADS][4];
|
||||
bool rsp_ready;
|
||||
// NOTE(review): missing ';' after the closing brace of this struct.
}
|
||||
|
||||
// Expected device outputs, tagged with the test-relative cycle
// (output_cycle) on which they are to be checked.
struct Output {
|
||||
int output_cycle;
|
||||
// outputs
|
||||
bool req_ready;
|
||||
bool rsp_valid;
|
||||
unsigned int rsp_wid;
|
||||
unsigned int rsp_tmask;
|
||||
unsigned int rsp_PC;
|
||||
unsigned int rsp_rd;
|
||||
bool rsp_wb;
|
||||
unsigned int rsp_data[NUM_THREADS];
|
||||
// NOTE(review): missing ';' after the closing brace of this struct.
}
|
||||
|
||||
// Empty constructor; members are default-constructed.
testbench(/* args */){
|
||||
|
||||
}
|
||||
|
||||
// Empty destructor.
~testbench(){
|
||||
}
|
||||
|
||||
// Software reference model: derives the expected outputs of `test` from its
// inputs. Only the req_filter == 0 path is implemented; the other path is
// commented out.
// NOTE(review): `cycles` is unused.
// NOTE(review): `outputs[0]` and `test->inputs[0]` are objects, not pointers,
// so the `->` accesses below are ill-formed ('.' is presumably intended).
// NOTE(review): the loop writes rsp_data[0] on every iteration; rsp_data[i]
// looks like the intent - confirm.
// NOTE(review): `test->outputs` is an array member, so assigning `&outputs`
// to it is ill-formed, and the `new[]` allocation is never released.
void unittest_Cmodel(struct UnitTest * test){
|
||||
int cycles = test->num_cycles;
|
||||
int num_outputs = test->num_output_check;
|
||||
|
||||
// struct Input* inputs = new (struct Input)[cycles];
|
||||
struct Output* outputs = new (struct Output)[num_outputs];
|
||||
|
||||
// implement c model and assign outputs to struct
|
||||
|
||||
if (test->inputs[0]->req_filter == 0){
|
||||
for (int i = 0; i < NUM_THREADS; i++)
|
||||
outputs[0]->rsp_data[0] = test->inputs->req_texels[i][0];
|
||||
} else {
|
||||
// for (int i = 0; i < NUM_THREADS; i++){
|
||||
// uint32_t low[4], high[4];
|
||||
// for (int j = 0; j < 4; j++){
|
||||
// low[j] = test->inputs->req_texels[i][j] & 0x00ff00ff;
|
||||
// high[j] = (test->inputs->req_texels[i][j] >> 8) & 0x00ff00ff;
|
||||
// }
|
||||
|
||||
// }
|
||||
}
|
||||
outputs[0]->output_cycle = 1;
|
||||
test->num_cycles = 1;
|
||||
test->outputs = &outputs;
|
||||
|
||||
}
|
||||
|
||||
// Flattens `num_tests` unit tests into input_map / output_map, offsetting
// each test's cycles by prev_test_cycle. With is_pipe false the next test
// starts at the previous test's final cycle; with is_pipe true it starts one
// cycle after the previous test's last inserted input.
// NOTE(review): `curr_test` is an object (a copy of tests[i]), so every
// `curr_test->` below is ill-formed; the `->` on array elements likewise.
// NOTE(review): the input lookup tests inputs[ip_counter] but inserts
// inputs[j] - confirm which index is intended.
void generate_test_vectors(struct UnitTest * tests, int num_tests, bool is_pipe){
|
||||
// for all unit tests create output test vectors (w w/o c-model)
|
||||
int prev_test_cycle = 0;
|
||||
|
||||
for (int i = 0; i < num_tests; i++)
|
||||
{
|
||||
int op_counter = 0;
|
||||
int ip_counter = 0;
|
||||
|
||||
int test_cycle = 0;
|
||||
int last_ip_cycle = 0;
|
||||
|
||||
struct UnitTest curr_test = tests[i];
|
||||
|
||||
if (curr_test->use_cmodel){
|
||||
unittest_Cmodel(&curr_test);
|
||||
}
|
||||
|
||||
for (int j = 0; j < curr_test->num_cycles; j++)
|
||||
{
|
||||
if (curr_test->inputs[ip_counter]->input_cycle == test_cycle){
|
||||
input_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->inputs[j]));
|
||||
last_ip_cycle = prev_test_cycle + test_cycle;
|
||||
ip_counter++;
|
||||
}
|
||||
|
||||
if (curr_test->outputs[op_counter]->output_cycle == test_cycle){
|
||||
output_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test->outputs[op_counter]));
|
||||
op_counter++;
|
||||
}
|
||||
|
||||
test_cycle++;
|
||||
}
|
||||
|
||||
if(!is_pipe){
|
||||
prev_test_cycle += (test_cycle - 1);
|
||||
}
|
||||
else{
|
||||
prev_test_cycle = last_ip_cycle + 1;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Resets the simulator, then until MAX_TICKS is reached: looks up the current
// cycle in both maps, drives the device inputs if stimulus exists (otherwise
// prints a warning), CHECKs the device outputs if expectations exist, and
// advances the clock by two ticks.
// NOTE(review): `input` / `output` are std::map iterators, so the fields
// would have to be reached through `->second`; `input->req_valid` etc. are
// ill-formed.
// NOTE(review): the vl_setw(...) statement has no terminating ';', and
// vl_setw / vl_cmpw static_cast each extra argument to uint32_t, which does
// not accept an array argument.
// NOTE(review): vl_cmpw returns 0 when the words match, so
// CHECK(vl_cmpw(...)) would abort on a match; `== 0` is presumably intended.
void run(){
|
||||
|
||||
ticks = sim.reset(0);
|
||||
int cycle = 0;
|
||||
|
||||
while (ticks < MAX_TICKS) {
|
||||
|
||||
auto input = input_map.find(cycle);
|
||||
auto output = output_map.find(cycle);
|
||||
|
||||
if (input != input_map.end()){
|
||||
sim->req_valid = input->req_valid;
|
||||
sim->req_wid = input->req_wid;
|
||||
sim->req_tmask = input->req_tmask;
|
||||
sim->req_PC = input->req_PC;
|
||||
sim->req_rd = input->req_rd;
|
||||
sim->req_wb = input->req_wb;
|
||||
sim->req_filter = input->req_filter;
|
||||
sim->req_format = input->req_format;
|
||||
// sim->req_u = input->req_u[NUM_THREADS];
|
||||
// sim->req_v = input->req_v[NUM_THREADS];
|
||||
vl_setw(sim->req_texels, input->req_texels)
|
||||
// sim->req_texels = input->req_texels[NUM_THREADS][4];
|
||||
sim->rsp_ready = input->rsp_ready;
|
||||
} else{
|
||||
std::cout << "Warning! No Input on Cycle " << cycle << std::endl;
|
||||
}
|
||||
|
||||
if(output != output_map.end()){
|
||||
CHECK(sim->req_ready == output->req_ready);
|
||||
CHECK(sim->rsp_valid == output->rsp_valid);
|
||||
CHECK(sim->rsp_wid == output->rsp_wid);
|
||||
CHECK(sim->rsp_tmask == output->rsp_tmask);
|
||||
CHECK(sim->rsp_PC == output->rsp_PC);
|
||||
CHECK(sim->rsp_rd == output->rsp_rd);
|
||||
CHECK(sim->rsp_wb == output->rsp_wb);
|
||||
CHECK(vl_cmpw(sim->rsp_data, output->rsp_data));
|
||||
}
|
||||
|
||||
cycle++;
|
||||
ticks = sim.step(ticks,2);
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): the two statements below sit at class scope, after run() has
// already been closed, which is not valid C++; they presumably belong at the
// end of run().
std::cout << "PASSED!" << std::endl;
|
||||
std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;
|
||||
|
||||
};
|
||||
|
||||
|
||||
double sc_time_stamp() {
|
||||
return ticks;
|
||||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// Initialize Verilators variables
|
||||
Verilated::commandArgs(argc, argv);
|
||||
|
||||
testbench<VVX_tex_sampler> sampler_testbench;
|
||||
|
||||
sampler_testbench.generate_test_vectors(tests, 1, 0);
|
||||
sampler_test_bench.run();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
81
hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h
Normal file
|
@ -0,0 +1,81 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include "verilated.h"
|
||||
|
||||
#ifdef VM_TRACE
|
||||
#include <verilated_vcd_c.h> // Trace file format header
|
||||
#endif
|
||||
|
||||
// Thin wrapper that owns a model of type T and drives its `clk` / `reset`
// members. T must expose `clk`, `reset`, `eval()` and `final()`; when
// VM_TRACE is defined it must also expose `trace()`, and every evaluated tick
// is dumped to "trace.vcd".
template <typename T>
class vl_simulator {
public:

  // Starts with clk and reset both low; opens the VCD trace if enabled.
  vl_simulator() {
    top_.clk   = 0;
    top_.reset = 0;
  #ifdef VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
  #endif
  }

  // Closes the trace (if any) and then finalises the model.
  ~vl_simulator() {
  #ifdef VM_TRACE
    tfp_.close();
  #endif
    top_.final();
  }

  // Holds reset high for two ticks, releases it, and returns the new tick
  // count (ticks + 2).
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    const uint64_t after_reset = this->step(ticks, 2);
    top_.reset = 0;
    return after_reset;
  }

  // Runs `count` ticks. Each tick evaluates the model, optionally dumps the
  // trace at the current tick, then toggles clk. Returns ticks + count.
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    for (uint32_t done = 0; done != count; ++done) {
      top_.eval();
  #ifdef VM_TRACE
      tfp_.dump(ticks);
  #endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the wrapped model's ports.
  T* operator->() {
    return &top_;
  }

private:

  T top_;
#ifdef VM_TRACE
  VerilatedVcdC tfp_;
#endif
};
|
||||
|
||||
// Writes each argument, converted to uint32_t, into consecutive words of
// `sig`: the first argument goes to sig[0], the second to sig[1], and so on.
// Words beyond the number of arguments are left untouched.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> words{static_cast<uint32_t>(std::forward<Args>(args))...};
  uint32_t* dst = sig;
  for (const uint32_t word : words) {
    *dst = word;
    ++dst;
  }
}
|
||||
|
||||
// Compares consecutive words of `sig` against the arguments (each converted
// to uint32_t), starting at sig[0]. Returns -1 or 1 at the first differing
// word (sig word smaller / larger respectively), or 0 if every compared word
// matches. Only as many words as there are arguments are examined.
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof... (Args)> expected{static_cast<uint32_t>(std::forward<Args>(args))...};
  const uint32_t* cur = sig;
  for (const uint32_t want : expected) {
    const uint32_t got = *cur;
    ++cur;
    if (got != want)
      return (got < want) ? -1 : 1;
  }
  return 0;
}
|
|
@ -5,7 +5,62 @@
|
|||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
||||
#endif
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
#else
|
||||
#define __ASM_STR(x) #x
|
||||
#endif
|
||||
|
||||
#define vx_csr_swap(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_read(csr) ({ \
|
||||
register unsigned __v; \
|
||||
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_write(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
#define vx_csr_read_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
#define vx_csr_read_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
// Texture load
|
||||
#define vx_tex(unit, u, v, l) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __u = u; \
|
||||
unsigned __v = v; \
|
||||
unsigned __l = l; \
|
||||
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
|
@ -52,6 +107,16 @@ extern "C" {
|
|||
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
// Texture load
// Emits a custom R4-type instruction (opcode 0x6b, funct3 5) whose funct2
// field is `unit`; `u`, `v` and `l` are passed in the three source registers
// and the macro evaluates to the value returned in the destination register.
// NOTE(review): `l` is presumably the level of detail - confirm against the
// texture unit documentation.
#define vx_tex(unit, u, v, l) ({                                        \
    unsigned __tex_u = (u);                                             \
    unsigned __tex_v = (v);                                             \
    unsigned __tex_l = (l);                                             \
    unsigned __tex_out;                                                 \
    __asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__tex_out) : "r"(__tex_u), "r"(__tex_v), "r"(__tex_l)); \
    __tex_out;                                                          \
})
|
||||
|
||||
// Set thread mask
|
||||
inline void vx_tmc(unsigned thread_mask) {
|
||||
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(thread_mask));
|
||||
|
@ -86,7 +151,7 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
|||
|
||||
// Prefetch
|
||||
inline void vx_prefetch(unsigned addr) {
|
||||
asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
|
||||
asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) );
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
|
@ -170,6 +235,8 @@ inline void vx_fence() {
|
|||
|
||||
#define __endif vx_join();
|
||||
|
||||
#define __DIVERGENT__ __attribute__((annotate("divergent")))
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -34,7 +34,7 @@ int vx_vprintf(const char* format, va_list va) {
|
|||
printf_arg_t arg;
|
||||
arg.format = format;
|
||||
arg.va = &va;
|
||||
vx_serial(__printf_cb, &arg);
|
||||
vx_serial((vx_serial_cb)__printf_cb, &arg);
|
||||
return arg.ret;
|
||||
}
|
||||
|
||||
|
@ -63,7 +63,7 @@ void vx_putint(int value, int base) {
|
|||
putint_arg_t arg;
|
||||
arg.value = value;
|
||||
arg.base = base;
|
||||
vx_serial(__putint_cb, &arg);
|
||||
vx_serial((vx_serial_cb)__putint_cb, &arg);
|
||||
}
|
||||
|
||||
static void __putfloat_cb(const putfloat_arg_t* arg) {
|
||||
|
@ -83,7 +83,7 @@ void vx_putfloat(float value, int precision) {
|
|||
putfloat_arg_t arg;
|
||||
arg.value = value;
|
||||
arg.precision = precision;
|
||||
vx_serial(__putfloat_cb, &arg);
|
||||
vx_serial((vx_serial_cb)__putfloat_cb, &arg);
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
|
|
@ -1,32 +1,34 @@
|
|||
RTL_DIR=../../hw/rtl
DPI_DIR=../../hw/dpi

# host compiler flags (language level and warnings are appended exactly once)
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I../../../hw -I../../common
CXXFLAGS += -I../../common/softfloat/source/include

# prebuilt softfloat archive linked into the simulator
LDFLAGS += ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a

# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE

# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX

# aggregate debug switches: all print and trace defines plus the extras below
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
DBG_FLAGS += -DVCD_OUTPUT

# RTL include search paths; RTL_INCLUDE is assigned once and already carries
# the FPU and texture-unit directories
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE)

# host-side sources compiled alongside the RTL model
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
|
|
@ -182,7 +182,7 @@ static const char* op_string(const Instr &instr) {
|
|||
case 2: return "SPLIT";
|
||||
case 3: return "JOIN";
|
||||
case 4: return "BAR";
|
||||
case 5: return "PREFETCH";
|
||||
case 6: return "PREFETCH";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
|
@ -712,7 +712,7 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) {
|
|||
pipeline->stall_warp = true;
|
||||
runOnce = true;
|
||||
} break;
|
||||
case 5: {
|
||||
case 6: {
|
||||
// PREFETCH
|
||||
int addr = rsdata[0];
|
||||
printf("*** PREFETCHED %d ***\n", addr);
|
||||
|
|
|
@ -2,27 +2,28 @@ RTL_DIR = ../../hw/rtl
|
|||
DPI_DIR = ../../hw/dpi
SCRIPT_DIR=../../hw/scripts

# host compiler flags (language level and warnings are appended exactly once)
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors -Wno-array-bounds
CXXFLAGS += -fPIC -Wno-maybe-uninitialized
CXXFLAGS += -I.. -I../../../hw -I../../common
CXXFLAGS += -I../../common/softfloat/source/include

# link as a shared object together with the prebuilt softfloat archive
LDFLAGS += -shared ../../common/softfloat/build/Linux-x86_64-GCC/softfloat.a

# control RTL debug print states
DBG_PRINT_FLAGS += -DDBG_PRINT_PIPELINE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_ICACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CORE_DCACHE
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_BANK
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_MSHR
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_TAG
DBG_PRINT_FLAGS += -DDBG_PRINT_CACHE_DATA
DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE

# control RTL debug tracing states
DBG_TRACE_FLAGS += -DDBG_TRACE_PIPELINE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_ICACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CORE_DCACHE
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_BANK
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_MSHR
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_TAG
DBG_TRACE_FLAGS += -DDBG_TRACE_CACHE_DATA
DBG_TRACE_FLAGS += -DDBG_TRACE_MEM
DBG_TRACE_FLAGS += -DDBG_TRACE_OPAE
DBG_TRACE_FLAGS += -DDBG_TRACE_AVS
DBG_TRACE_FLAGS += -DDBG_TRACE_SCOPE
DBG_TRACE_FLAGS += -DDBG_TRACE_TEX

# aggregate debug switches: all print and trace defines plus request info
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += $(DBG_TRACE_FLAGS)
DBG_FLAGS += -DDBG_CACHE_REQ_INFO

# host-side sources shared with the other simulators
SRCS = ../common/util.cpp ../common/mem.cpp ../common/rvfloats.cpp
|
||||
|
@ -30,7 +31,8 @@ SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
|||
SRCS += fpga.cpp opae_sim.cpp

# RTL include search paths; RTL_INCLUDE is assigned once (already carrying the
# FPU and texture-unit directories) and then extended with the AFU directories
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
RTL_INCLUDE = -I$(RTL_DIR) -I$(DPI_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip

# name assigned to TOP for this build
TOP = vortex_afu_shim
|
||||
|
@ -84,12 +86,12 @@ VL_FLAGS += -D$(FPU_CORE)
|
|||
|
||||
PROJECT = libopae-c-vlsim
|
||||
|
||||
all: shared
|
||||
all: $(PROJECT).so
|
||||
|
||||
vortex_afu.h : $(RTL_DIR)/afu/vortex_afu.vh
|
||||
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/afu/vortex_afu.vh -o vortex_afu.h
|
||||
|
||||
shared: $(SRCS) vortex_afu.h
|
||||
$(PROJECT).so: $(SRCS) vortex_afu.h
|
||||
verilator --build $(VL_FLAGS) $(SRCS) -CFLAGS '$(CXXFLAGS)' -LDFLAGS '$(LDFLAGS)' -o ../$(PROJECT).so
|
||||
|
||||
static: $(SRCS) vortex_afu.h
|
||||
|
|
|
@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
#else
|
||||
|
||||
commandQueue = clCreateCommandQueue(context,
|
||||
devices[device_touse], NULL, &status);
|
||||
devices[device_touse], 0, &status);
|
||||
|
||||
#endif // PROFILING
|
||||
|
||||
|
@ -451,8 +451,8 @@ void cl_cleanup()
|
|||
printf("clReleaseContext()\n");
|
||||
}
|
||||
|
||||
for (int p = 0; p < numPlatforms; ++p) {
|
||||
for (int d = 0; d < numDevices[p]; ++d) {
|
||||
for (cl_uint p = 0; p < numPlatforms; ++p) {
|
||||
for (cl_uint d = 0; d < numDevices[p]; ++d) {
|
||||
status = clReleaseDevice(devices[d]);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseDevice()\n");
|
||||
|
|
|
@ -421,7 +421,7 @@ cl_context cl_init_context(int platform, int dev,int quiet) {
|
|||
#else
|
||||
|
||||
commandQueue = clCreateCommandQueue(context,
|
||||
devices[device_touse], NULL, &status);
|
||||
devices[device_touse], 0, &status);
|
||||
|
||||
#endif // PROFILING
|
||||
|
||||
|
@ -451,8 +451,8 @@ void cl_cleanup()
|
|||
printf("clReleaseContext()\n");
|
||||
}
|
||||
|
||||
for (int p = 0; p < numPlatforms; ++p) {
|
||||
for (int d = 0; d < numDevices[p]; ++d) {
|
||||
for (cl_uint p = 0; p < numPlatforms; ++p) {
|
||||
for (cl_uint d = 0; d < numDevices[p]; ++d) {
|
||||
status = clReleaseDevice(devices[d]);
|
||||
cl_errChk(status, "Oops!", true);
|
||||
printf("clReleaseDevice()\n");
|
||||
|
|