mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes
This commit is contained in:
parent
d69a64c32c
commit
d47cccc157
1300 changed files with 247321 additions and 311189 deletions
3
.gitmodules
vendored
3
.gitmodules
vendored
|
@ -4,9 +4,6 @@
|
|||
[submodule "third_party/softfloat"]
|
||||
path = third_party/softfloat
|
||||
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
|
||||
[submodule "third_party/cocogfx"]
|
||||
path = third_party/cocogfx
|
||||
url = https://github.com/gtcasl/cocogfx.git
|
||||
[submodule "third_party/ramulator"]
|
||||
path = third_party/ramulator
|
||||
url = https://github.com/CMU-SAFARI/ramulator.git
|
||||
|
|
87
.travis.yml
87
.travis.yml
|
@ -9,21 +9,31 @@ addons:
|
|||
packages:
|
||||
- build-essential
|
||||
- valgrind
|
||||
- verilator
|
||||
- yosys
|
||||
- libpng-dev
|
||||
- libboost-serialization-dev
|
||||
- libstdc++6
|
||||
- hwloc
|
||||
|
||||
install:
|
||||
# Set environments
|
||||
- export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
|
||||
- export VERILATOR_ROOT=/opt/verilator
|
||||
- export PATH=$VERILATOR_ROOT/bin:$PATH
|
||||
install:
|
||||
# Install toolchain
|
||||
- ci/toolchain_install.sh -all
|
||||
- export TOOLDIR=$HOME/tools
|
||||
- mkdir -p $TOOLDIR
|
||||
- DESTDIR=$TOOLDIR ./ci/toolchain_install.sh --all
|
||||
# Set environments
|
||||
- export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
|
||||
- export LLVM_POCL=$TOOLDIR/llvm-pocl
|
||||
- export LLVM_VORTEX=$TOOLDIR/llvm-vortex
|
||||
- export VERILATOR_ROOT=$TOOLDIR/verilator
|
||||
- export PATH=$VERILATOR_ROOT/bin:$PATH
|
||||
- export SV2V_PATH=$TOOLDIR/sv2v
|
||||
- export PATH=$SV2V_PATH/bin:$PATH
|
||||
- export YOSYS_PATH=$TOOLDIR/yosys
|
||||
- export PATH=$YOSYS_PATH/bin:$PATH
|
||||
- export POCL_CC_PATH=$TOOLDIR/pocl/compiler
|
||||
- export POCL_RT_PATH=$TOOLDIR/pocl/runtime
|
||||
# build project
|
||||
- make -s
|
||||
- cp -r $PWD ../build32 && cd ../build32 && make clean-all && make
|
||||
- cp -r $PWD ../build64 && cd ../build64 && make clean-all && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make
|
||||
|
||||
# stages ordering
|
||||
stages:
|
||||
|
@ -32,45 +42,54 @@ stages:
|
|||
jobs:
|
||||
include:
|
||||
- stage: test
|
||||
name: coverage
|
||||
script: cp -r $PWD ../build_coverage && cd ../build_coverage && ./ci/travis_run.py ./ci/regression.sh -coverage
|
||||
name: unittest
|
||||
script: cp -r ../build32 ../build32_unittest && cd ../build32_unittest && ./ci/travis_run.py ./ci/regression.sh --unittest
|
||||
- stage: test
|
||||
name: coverage64
|
||||
script: cp -r $PWD ../build_coverage64 && cd ../build_coverage64 && ./ci/travis_run.py ./ci/regression64.sh -coverage
|
||||
name: isa
|
||||
script: cp -r ../build32 ../build32_isa && cd ../build32_isa && ./ci/travis_run.py ./ci/regression.sh --isa
|
||||
- stage: test
|
||||
name: tex
|
||||
script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
|
||||
name: isa64
|
||||
script: cp -r ../build64 ../build64_isa && cd ../build64_isa && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa
|
||||
- stage: test
|
||||
name: regression
|
||||
script: cp -r ../build32 ../build32_regression && cd ../build32_regression && ./ci/travis_run.py ./ci/regression.sh --regression
|
||||
- stage: test
|
||||
name: regression64
|
||||
script: cp -r ../build64 ../build64_regression && cd ../build64_regression && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression
|
||||
- stage: test
|
||||
name: opencl
|
||||
script: cp -r ../build32 ../build32_opencl && cd ../build32_opencl && ./ci/travis_run.py ./ci/regression.sh --opencl
|
||||
- stage: test
|
||||
name: cluster
|
||||
script: cp -r $PWD ../build_cluster && cd ../build_cluster && ./ci/travis_run.py ./ci/regression.sh -cluster
|
||||
script: cp -r ../build32 ../build32_cluster && cd ../build32_cluster && ./ci/travis_run.py ./ci/regression.sh --cluster
|
||||
- stage: test
|
||||
name: config
|
||||
script: cp -r $PWD ../build_config && cd ../build_config && ./ci/travis_run.py ./ci/regression.sh -config
|
||||
script: cp -r ../build32 ../build32_config && cd ../build32_config && ./ci/travis_run.py ./ci/regression.sh --config
|
||||
- stage: test
|
||||
name: debug
|
||||
script: cp -r $PWD ../build_debug && cd ../build_debug && ./ci/travis_run.py ./ci/regression.sh -debug
|
||||
script: cp -r ../build32 ../build32_debug && cd ../build32_debug && ./ci/travis_run.py ./ci/regression.sh --debug
|
||||
- stage: test
|
||||
name: stress0
|
||||
script: cp -r $PWD ../build_stress0 && cd ../build_stress0 && ./ci/travis_run.py ./ci/regression.sh -stress0
|
||||
script: cp -r ../build32 ../build32_stress0 && cd ../build32_stress0 && ./ci/travis_run.py ./ci/regression.sh --stress0
|
||||
- stage: test
|
||||
name: stress1
|
||||
script: cp -r $PWD ../build_stress1 && cd ../build_stress1 && ./ci/travis_run.py ./ci/regression.sh -stress1
|
||||
script: cp -r ../build32 ../build32_stress1 && cd ../build32_stress1 && ./ci/travis_run.py ./ci/regression.sh --stress1
|
||||
- stage: test
|
||||
name: synthesis
|
||||
script: cp -r ../build32 ../build32_isa && cd ../build32_isa && ./ci/travis_run.py ./ci/regression.sh --synthesis
|
||||
- stage: test
|
||||
name: synthesis64
|
||||
script: cp -r ../build64 ../build64_isa && cd ../build64_isa && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --synthesis
|
||||
- stage: test
|
||||
name: compiler
|
||||
script: cp -r $PWD ../build_compiler && cd ../build_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
|
||||
- stage: test
|
||||
name: tex
|
||||
script: cp -r $PWD ../build_tex && cd ../build_tex && ./ci/travis_run.py ./ci/regression.sh -tex
|
||||
- stage: test
|
||||
name: unittest
|
||||
script: cp -r $PWD ../build_unittest && cd ../build_unittest && ./ci/travis_run.py ./ci/regression.sh -unittest
|
||||
|
||||
script: cp -r ../build32 ../build32_compiler && cd ../build32_compiler && ./ci/travis_run.py ./ci/test_compiler.sh
|
||||
|
||||
after_success:
|
||||
# Gather code coverage
|
||||
- lcov --directory driver --capture --output-file driver.cov # capture trace
|
||||
- lcov --directory simx --capture --output-file simx.cov # capture trace
|
||||
- lcov --list driver.cov # output coverage data for debugging
|
||||
- lcov --list simx.cov # output coverage data for debugging
|
||||
- lcov --directory runtime --capture --output-file runtime.cov # capture trace
|
||||
- lcov --directory sim --capture --output-file sim.cov # capture trace
|
||||
- lcov --list runtime.cov # output coverage data for debugging
|
||||
- lcov --list sim.cov # output coverage data for debugging
|
||||
# Upload coverage report
|
||||
- bash <(curl -s https://codecov.io/bash) -f driver.cov
|
||||
- bash <(curl -s https://codecov.io/bash) -f simx.cov
|
||||
- bash <(curl -s https://codecov.io/bash) -f runtime.cov
|
||||
- bash <(curl -s https://codecov.io/bash) -f sim.cov
|
||||
|
|
221
LICENSE
221
LICENSE
|
@ -1,24 +1,201 @@
|
|||
Copyright (c) <2020>, <Georgia Institute of Technology>
|
||||
All rights reserved.
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the Georgia Institute of Technology nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDER BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
|
|
20
Makefile
20
Makefile
|
@ -2,13 +2,27 @@ all:
|
|||
$(MAKE) -C third_party
|
||||
$(MAKE) -C hw
|
||||
$(MAKE) -C sim
|
||||
$(MAKE) -C driver
|
||||
$(MAKE) -C kernel
|
||||
$(MAKE) -C runtime
|
||||
$(MAKE) -C tests
|
||||
|
||||
clean:
|
||||
$(MAKE) -C hw clean
|
||||
$(MAKE) -C sim clean
|
||||
$(MAKE) -C driver clean
|
||||
$(MAKE) -C kernel clean
|
||||
$(MAKE) -C runtime clean
|
||||
$(MAKE) -C tests clean
|
||||
$(MAKE) -C tests clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C third_party clean
|
||||
$(MAKE) -C hw clean
|
||||
$(MAKE) -C sim clean
|
||||
$(MAKE) -C kernel clean
|
||||
$(MAKE) -C runtime clean
|
||||
$(MAKE) -C tests clean-all
|
||||
|
||||
crtlsim:
|
||||
$(MAKE) -C sim clean
|
||||
|
||||
brtlsim:
|
||||
$(MAKE) -C sim
|
||||
|
|
45
README.md
45
README.md
|
@ -1,22 +1,25 @@
|
|||
[](https://travis-ci.com/vortexgpgpu/vortex)
|
||||
[](https://codecov.io/gh/vortexgpgpu/vortex)
|
||||
|
||||
# Vortex OpenGPU
|
||||
# Vortex GPGPU
|
||||
|
||||
Vortex is a full-system RISCV-based GPGPU processor.
|
||||
Vortex is a full-stack open-source RISC-V GPGPU.
|
||||
|
||||
## Specifications
|
||||
|
||||
- Support RISC-V RV32IMF ISA
|
||||
- Performance:
|
||||
- 1024 total threads running at 250 MHz
|
||||
- 128 Gflops of compute bandwidth
|
||||
- 16 GB/s of memory bandwidth
|
||||
- Scalability: up to 64 cores with optional L2 and L3 caches
|
||||
- Software: OpenCL 1.2 Support
|
||||
- Support RISC-V RV32IMAF and RV64IMAFD
|
||||
- Microarchitecture:
|
||||
- configurable number of cores, warps, and threads.
|
||||
- configurable number of ALU, FPU, LSU, and SFU units per core.
|
||||
- configurable pipeline issue width.
|
||||
- optional shared memory, L1, L2, and L3 caches.
|
||||
- Software:
|
||||
- OpenCL 1.2 Support.
|
||||
- Supported FPGAs:
|
||||
- Intel Arria 10
|
||||
- Intel Stratix 10
|
||||
- Altera Arria 10
|
||||
- Altera Stratix 10
|
||||
- Xilinx Alveo U50, U250, U280
|
||||
- Xilinx Versal VCK5000
|
||||
|
||||
## Directory structure
|
||||
|
||||
|
@ -38,6 +41,11 @@ Vortex is a full-system RISCV-based GPGPU processor.
|
|||
- [LLVM](https://llvm.org/)
|
||||
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
|
||||
- [Verilator](https://www.veripool.org/verilator)
|
||||
- [FpNew](https://github.com/pulp-platform/fpnew.git)
|
||||
- [SoftFloat](https://github.com/ucb-bar/berkeley-softfloat-3.git)
|
||||
- [Ramulator](https://github.com/CMU-SAFARI/ramulator.git)
|
||||
- [Yosys](https://github.com/YosysHQ/yosys)
|
||||
- [Sv2v](https://github.com/zachjs/sv2v)
|
||||
### Install development tools
|
||||
$ sudo apt-get install build-essential
|
||||
$ sudo apt-get install git
|
||||
|
@ -45,8 +53,19 @@ Vortex is a full-system RISCV-based GPGPU processor.
|
|||
$ git clone --recursive https://github.com/vortexgpgpu/vortex.git
|
||||
$ cd Vortex
|
||||
### Install prebuilt toolchain
|
||||
$ ./ci/toolchain_install.sh -all
|
||||
$ ./ci/toolchain_install.sh --all
|
||||
|
||||
By default, the toolchain will install to /opt folder.
|
||||
You can install the toolchain to a different directory by overiding DESTDIR.
|
||||
|
||||
$ DESTDIR=$TOOLDIR ./ci/toolchain_install.sh --all
|
||||
$ export VORTEX_HOME=$TOOLDIR/vortex
|
||||
$ export LLVM_VORTEX=$TOOLDIR/llvm-vortex
|
||||
$ export LLVM_POCL=$TOOLDIR/llvm-pocl
|
||||
$ export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
|
||||
$ export VERILATOR_ROOT=$TOOLDIR/verilator
|
||||
$ export PATH=$VERILATOR_ROOT/bin:$PATH
|
||||
### Build Vortex sources
|
||||
$ make -s
|
||||
### Quick demo running vecadd OpenCL kernel on 2 cores
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --cores=2 --app=vecadd
|
||||
$ ./ci/blackbox.sh --cores=2 --app=vecadd
|
||||
|
|
136
ci/blackbox.sh
136
ci/blackbox.sh
|
@ -1,26 +1,42 @@
|
|||
#!/bin/sh
|
||||
|
||||
# Copyright © 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
show_usage()
|
||||
{
|
||||
echo "Vortex BlackBox Test Driver v1.0"
|
||||
echo "Usage: [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=rtlsim|vlsim|simx] [--debug] [--scope] [--perf] [--app=vecadd|sgemm|basic|demo|dogfood] [--args=<args>] [--help]]"
|
||||
echo "Usage: $0 [[--clusters=#n] [--cores=#n] [--warps=#n] [--threads=#n] [--l2cache] [--l3cache] [[--driver=#name] [--app=#app] [--args=#args] [--debug=#level] [--scope] [--perf=#class] [--rebuild=0|1] [--log=logfile] [--help]]"
|
||||
}
|
||||
|
||||
SCRIPT_DIR=$(dirname "$0")
|
||||
VORTEX_HOME=$SCRIPT_DIR/..
|
||||
|
||||
DRIVER=vlsim
|
||||
DRIVER=simx
|
||||
APP=sgemm
|
||||
CLUSTERS=1
|
||||
CORES=1
|
||||
WARPS=4
|
||||
THREADS=4
|
||||
L2=0
|
||||
L3=0
|
||||
L2=
|
||||
L3=
|
||||
DEBUG=0
|
||||
DEBUG_LEVEL=0
|
||||
SCOPE=0
|
||||
HAS_ARGS=0
|
||||
DEBUG_LEVEL=1
|
||||
PERF_CLASS=0
|
||||
REBUILD=2
|
||||
LOGFILE=run.log
|
||||
|
||||
for i in "$@"
|
||||
do
|
||||
|
@ -50,14 +66,15 @@ case $i in
|
|||
shift
|
||||
;;
|
||||
--l2cache)
|
||||
L2=1
|
||||
L2=-DL2_ENABLE
|
||||
shift
|
||||
;;
|
||||
--l3cache)
|
||||
L3=1
|
||||
L3=-DL3_ENABLE
|
||||
shift
|
||||
;;
|
||||
--debug)
|
||||
--debug=*)
|
||||
DEBUG_LEVEL=${i#*=}
|
||||
DEBUG=1
|
||||
shift
|
||||
;;
|
||||
|
@ -66,8 +83,9 @@ case $i in
|
|||
CORES=1
|
||||
shift
|
||||
;;
|
||||
--perf)
|
||||
--perf=*)
|
||||
PERF_FLAG=-DPERF_ENABLE
|
||||
PERF_CLASS=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--args=*)
|
||||
|
@ -75,33 +93,37 @@ case $i in
|
|||
HAS_ARGS=1
|
||||
shift
|
||||
;;
|
||||
--rebuild=*)
|
||||
REBUILD=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--log=*)
|
||||
LOGFILE=${i#*=}
|
||||
shift
|
||||
;;
|
||||
--help)
|
||||
show_usage
|
||||
exit 0
|
||||
;;
|
||||
*)
|
||||
show_usage
|
||||
exit -1
|
||||
;;
|
||||
show_usage
|
||||
exit -1
|
||||
;;
|
||||
esac
|
||||
done
|
||||
|
||||
case $DRIVER in
|
||||
rtlsim)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/rtlsim
|
||||
;;
|
||||
vlsim)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/vlsim
|
||||
;;
|
||||
asesim)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/asesim
|
||||
;;
|
||||
fpga)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/fpga
|
||||
;;
|
||||
simx)
|
||||
DRIVER_PATH=$VORTEX_HOME/driver/simx
|
||||
DEBUG_LEVEL=3
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/simx
|
||||
;;
|
||||
rtlsim)
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/rtlsim
|
||||
;;
|
||||
opae)
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/opae
|
||||
;;
|
||||
xrt)
|
||||
DRIVER_PATH=$VORTEX_HOME/runtime/xrt
|
||||
;;
|
||||
*)
|
||||
echo "invalid driver: $DRIVER"
|
||||
|
@ -116,49 +138,61 @@ elif [ -d "$VORTEX_HOME/tests/regression/$APP" ];
|
|||
then
|
||||
APP_PATH=$VORTEX_HOME/tests/regression/$APP
|
||||
else
|
||||
echo "Application folder found: $APP"
|
||||
echo "Application folder not found: $APP"
|
||||
exit -1
|
||||
fi
|
||||
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS -DL2_ENABLE=$L2 -DL3_ENABLE=$L3 $PERF_FLAG $CONFIGS"
|
||||
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
|
||||
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
|
||||
BLACKBOX_CACHE=blackbox.$DRIVER.cache
|
||||
|
||||
if [ -f "$BLACKBOX_CACHE" ]
|
||||
then
|
||||
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
|
||||
fi
|
||||
|
||||
if [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
|
||||
if [ $REBUILD -ne 0 ]
|
||||
then
|
||||
make -C $DRIVER_PATH clean
|
||||
BLACKBOX_CACHE=blackbox.$DRIVER.cache
|
||||
if [ -f "$BLACKBOX_CACHE" ]
|
||||
then
|
||||
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
|
||||
fi
|
||||
|
||||
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
|
||||
then
|
||||
make -C $DRIVER_PATH clean > /dev/null
|
||||
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
|
||||
# export performance monitor class identifier
|
||||
export PERF_CLASS=$PERF_CLASS
|
||||
|
||||
status=0
|
||||
|
||||
if [ $DEBUG -eq 1 ]
|
||||
# ensure config update
|
||||
make -C $VORTEX_HOME/hw config > /dev/null
|
||||
|
||||
# ensure the stub driver is present
|
||||
make -C $VORTEX_HOME/runtime/stub > /dev/null
|
||||
|
||||
if [ $DEBUG -ne 0 ]
|
||||
then
|
||||
# driver initialization
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
||||
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
else
|
||||
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
||||
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
fi
|
||||
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
fi
|
||||
|
||||
# running application
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > run.log 2>&1"
|
||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > run.log 2>&1
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
else
|
||||
echo "running: make -C $APP_PATH run-$DRIVER > run.log 2>&1"
|
||||
make -C $APP_PATH run-$DRIVER > run.log 2>&1
|
||||
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
|
||||
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
|
||||
status=$?
|
||||
fi
|
||||
|
||||
|
@ -167,17 +201,17 @@ then
|
|||
mv -f $APP_PATH/trace.vcd .
|
||||
fi
|
||||
else
|
||||
echo "driver initialization..."
|
||||
# driver initialization
|
||||
if [ $SCOPE -eq 1 ]
|
||||
then
|
||||
echo "running: SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
||||
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
else
|
||||
echo "running: CONFIGS="$CONFIGS" make -C $DRIVER_PATH"
|
||||
CONFIGS="$CONFIGS" make -C $DRIVER_PATH
|
||||
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
|
||||
fi
|
||||
|
||||
echo "running application..."
|
||||
# running application
|
||||
if [ $HAS_ARGS -eq 1 ]
|
||||
then
|
||||
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
|
||||
|
@ -190,4 +224,4 @@ else
|
|||
fi
|
||||
fi
|
||||
|
||||
exit $status
|
||||
exit $status
|
||||
|
|
|
@ -1,73 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# exit when any command fails
|
||||
set -e
|
||||
|
||||
OS_DIR=${OS_DIR:-'ubuntu/bionic'}
|
||||
SRCDIR=${SRCDIR:-'/opt'}
|
||||
DESTDIR=${DESTDIR:-'.'}
|
||||
|
||||
echo "OS_DIR=${OS_DIR}"
|
||||
echo "SRCDIR=${SRCDIR}"
|
||||
echo "DESTDIR=${DESTDIR}"
|
||||
|
||||
riscv()
|
||||
{
|
||||
echo "prebuilt riscv-gnu-toolchain..."
|
||||
tar -C $SRCDIR -cvjf riscv-gnu-toolchain.tar.bz2 riscv-gnu-toolchain
|
||||
split -b 50M riscv-gnu-toolchain.tar.bz2 "riscv-gnu-toolchain.tar.bz2.part"
|
||||
mv riscv-gnu-toolchain.tar.bz2.part* $DESTDIR/riscv-gnu-toolchain/$OS_DIR
|
||||
rm riscv-gnu-toolchain.tar.bz2
|
||||
}
|
||||
|
||||
llvm()
|
||||
{
|
||||
echo "prebuilt llvm-riscv..."
|
||||
tar -C $SRCDIR -cvjf llvm-vortex1.tar.bz2 llvm-riscv
|
||||
split -b 50M llvm-vortex1.tar.bz2 "llvm-vortex1.tar.bz2.part"
|
||||
mv llvm-vortex1.tar.bz2.part* $DESTDIR/llvm-vortex/$OS_DIR
|
||||
rm llvm-vortex1.tar.bz2
|
||||
}
|
||||
|
||||
pocl()
|
||||
{
|
||||
echo "prebuilt pocl..."
|
||||
tar -C $SRCDIR -cvjf pocl1.tar.bz2 pocl
|
||||
mv pocl1.tar.bz2 $DESTDIR/pocl/$OS_DIR
|
||||
}
|
||||
|
||||
verilator()
|
||||
{
|
||||
echo "prebuilt verilator..."
|
||||
tar -C $SRCDIR -cvjf verilator.tar.bz2 verilator
|
||||
mv verilator.tar.bz2 $DESTDIR/verilator/$OS_DIR
|
||||
}
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "usage: prebuilt [[-riscv] [-llvm] [-pocl] [-verilator] [-all] [-h|--help]]"
|
||||
}
|
||||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
-pocl ) pocl
|
||||
;;
|
||||
-verilator ) verilator
|
||||
;;
|
||||
-riscv ) riscv
|
||||
;;
|
||||
-llvm ) llvm
|
||||
;;
|
||||
-all ) riscv
|
||||
llvm
|
||||
pocl
|
||||
verilator
|
||||
;;
|
||||
-h | --help ) usage
|
||||
exit
|
||||
;;
|
||||
* ) usage
|
||||
exit 1
|
||||
esac
|
||||
shift
|
||||
done
|
297
ci/regression.sh
297
ci/regression.sh
|
@ -1,44 +1,102 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright © 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# exit when any command fails
|
||||
set -e
|
||||
|
||||
# ensure build
|
||||
make -s
|
||||
# clear blackbox cache
|
||||
rm -f blackbox.*.cache
|
||||
|
||||
unittest()
|
||||
{
|
||||
make -C tests/unittest run
|
||||
}
|
||||
|
||||
coverage()
|
||||
isa()
|
||||
{
|
||||
echo "begin coverage tests..."
|
||||
echo "begin isa tests..."
|
||||
|
||||
make -C tests/runtime run-rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim
|
||||
make -C tests/regression run-vlsim
|
||||
make -C tests/opencl run-vlsim
|
||||
make -C tests/runtime run-simx
|
||||
make -C tests/riscv/isa run-simx
|
||||
make -C tests/regression run-simx
|
||||
make -C tests/opencl run-simx
|
||||
make -C tests/riscv/isa run-rtlsim
|
||||
CONFIGS="-DDPI_DISABLE" make -C tests/riscv/isa run-rtlsim
|
||||
|
||||
echo "coverage tests done!"
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
if [ "$XLEN" == "64" ]
|
||||
then
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-64f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DEXT_D_ENABLE -DFPU_FPNEW" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-64d || true
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-64f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-rtlsim-64fx
|
||||
fi
|
||||
|
||||
make -C sim/rtlsim clean && make -C sim/rtlsim
|
||||
|
||||
echo "isa tests done!"
|
||||
}
|
||||
|
||||
tex()
|
||||
regression()
|
||||
{
|
||||
echo "begin texture tests..."
|
||||
echo "begin regression tests..."
|
||||
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-isoccer.png -osoccer_result.png -g0"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" --perf
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-itoad.png -otoad_result.png -g1" --perf
|
||||
make -C tests/kernel run-simx
|
||||
make -C tests/kernel run-rtlsim
|
||||
|
||||
echo "coverage texture done!"
|
||||
make -C tests/regression run-simx
|
||||
make -C tests/regression run-rtlsim
|
||||
|
||||
# test FPU hardware implementations
|
||||
CONFIGS="-DFPU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DFPU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
|
||||
# test local barrier
|
||||
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t19"
|
||||
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t19"
|
||||
|
||||
# test global barrier
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t20" --cores=2
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t20" --cores=2
|
||||
|
||||
# test FPU core
|
||||
|
||||
echo "regression tests done!"
|
||||
}
|
||||
|
||||
opencl()
|
||||
{
|
||||
echo "begin opencl tests..."
|
||||
|
||||
make -C tests/opencl run-simx
|
||||
make -C tests/opencl run-rtlsim
|
||||
|
||||
echo "opencl tests done!"
|
||||
}
|
||||
|
||||
cluster()
|
||||
|
@ -46,23 +104,26 @@ cluster()
|
|||
echo "begin clustering tests..."
|
||||
|
||||
# warp/threads configurations
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=demo
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
|
||||
|
||||
# cores clustering
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
|
||||
|
||||
# L2/L3
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
|
||||
|
||||
echo "clustering tests done!"
|
||||
}
|
||||
|
@ -71,11 +132,22 @@ debug()
|
|||
{
|
||||
echo "begin debugging tests..."
|
||||
|
||||
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
# test CSV trace generation
|
||||
make -C sim/simx clean && DEBUG=3 make -C sim/simx
|
||||
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim
|
||||
make -C tests/riscv/isa run-simx-32im > run_simx.log
|
||||
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
|
||||
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
|
||||
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
||||
diff trace_rtlsim.csv trace_simx.csv
|
||||
make -C sim/simx clean && make -C sim/simx
|
||||
make -C sim/rtlsim clean && make -C sim/rtlsim
|
||||
|
||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
|
||||
echo "debugging tests done!"
|
||||
}
|
||||
|
@ -84,51 +156,77 @@ config()
|
|||
{
|
||||
echo "begin configuration tests..."
|
||||
|
||||
# disable DPI
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
|
||||
|
||||
# issue width
|
||||
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
|
||||
# dispatch size
|
||||
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
|
||||
# FPU scaling
|
||||
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
|
||||
# custom program startup address
|
||||
make -C tests/regression/dogfood clean-all
|
||||
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
make -C tests/regression/dogfood clean-all
|
||||
make -C tests/regression/dogfood
|
||||
|
||||
# disabling M extension
|
||||
CONFIGS=-DEXT_M_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
CONFIGS="-DEXT_M_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
|
||||
# disabling F extension
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf=1
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf=1
|
||||
|
||||
# disable shared memory
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf=1
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf=1
|
||||
|
||||
# using Default FPU core
|
||||
FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
|
||||
# disable L1 cache
|
||||
CONFIGS="-DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
|
||||
# using FPNEW FPU core
|
||||
FPU_CORE=FPU_FPNEW ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
|
||||
# multiple L1 caches per cluster
|
||||
CONFIGS="-DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --cores=8 --warps=1 --threads=2
|
||||
|
||||
# using AXI bus
|
||||
# test AXI bus
|
||||
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
|
||||
|
||||
# adjust l1 block size to match l2
|
||||
CONFIGS="-DL1_BLOCK_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
|
||||
CONFIGS="-DL1_LINE_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
|
||||
|
||||
# test cache banking
|
||||
CONFIGS="-DDNUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
|
||||
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
|
||||
CONFIGS="-DDNUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=io_addr
|
||||
|
||||
# test cache multi-porting
|
||||
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=io_addr
|
||||
CONFIGS="-DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --debug --args="-n1"
|
||||
CONFIGS="-DL2_NUM_PORTS=2 -DDNUM_PORTS=2" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr
|
||||
CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=rtlsim --cores=4 --l2cache --app=io_addr
|
||||
CONFIGS="-DL2_NUM_PORTS=4 -DDNUM_PORTS=4" ./ci/blackbox.sh --driver=simx --cores=4 --l2cache --app=io_addr
|
||||
CONFIGS="-DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemm
|
||||
|
||||
# test 128-bit MEM block
|
||||
CONFIGS=-DMEM_BLOCK_SIZE=16 ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
|
||||
# test single-bank DRAM
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
|
||||
# test 27-bit DRAM address
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
|
||||
echo "configuration tests done!"
|
||||
}
|
||||
|
@ -138,14 +236,9 @@ stress0()
|
|||
echo "begin stress0 tests..."
|
||||
|
||||
# test verilator reset values
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=sgemm
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=sgemm
|
||||
FPU_CORE=FPU_DEFAULT CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
|
||||
FPU_CORE=FPU_DEFAULT CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=0" ./ci/blackbox.sh --driver=vlsim --app=printf
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --app=printf
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
|
||||
|
||||
echo "stress0 tests done!"
|
||||
}
|
||||
|
@ -154,51 +247,75 @@ stress1()
|
|||
{
|
||||
echo "begin stress1 tests..."
|
||||
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --clusters=2 --l3cache --app=sgemm --args="-n256"
|
||||
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
|
||||
|
||||
echo "stress1 tests done!"
|
||||
}
|
||||
|
||||
usage()
|
||||
synthesis()
|
||||
{
|
||||
echo "usage: regression [-unittest] [-coverage] [-tex] [-cluster] [-debug] [-config] [-stress[#n]] [-all] [-h|--help]"
|
||||
echo "begin synthesis tests..."
|
||||
|
||||
PREFIX=build_base make -C hw/syn/yosys clean
|
||||
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
|
||||
|
||||
echo "synthesis tests done!"
|
||||
}
|
||||
|
||||
show_usage()
|
||||
{
|
||||
echo "Vortex Regression Test"
|
||||
echo "Usage: $0 [--unittest] [--isa] [--regression] [--opencl] [--cluster] [--debug] [--config] [--stress[#n]] [--synthesis] [--all] [--h|--help]"
|
||||
}
|
||||
|
||||
start=$SECONDS
|
||||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
-unittest ) unittest
|
||||
--unittest ) unittest
|
||||
;;
|
||||
-coverage ) coverage
|
||||
--isa ) isa
|
||||
;;
|
||||
-tex ) tex
|
||||
--regression ) regression
|
||||
;;
|
||||
-cluster ) cluster
|
||||
--opencl ) opencl
|
||||
;;
|
||||
-debug ) debug
|
||||
--cluster ) cluster
|
||||
;;
|
||||
-config ) config
|
||||
--debug ) debug
|
||||
;;
|
||||
-stress0 ) stress0
|
||||
--config ) config
|
||||
;;
|
||||
-stress1 ) stress1
|
||||
--stress0 ) stress0
|
||||
;;
|
||||
-stress ) stress0
|
||||
--stress1 ) stress1
|
||||
;;
|
||||
--stress ) stress0
|
||||
stress1
|
||||
;;
|
||||
-all ) unittest
|
||||
coverage
|
||||
tex
|
||||
--synthesis ) synthesis
|
||||
;;
|
||||
--all ) unittest
|
||||
isa
|
||||
regression
|
||||
opencl
|
||||
cluster
|
||||
debug
|
||||
config
|
||||
stress0
|
||||
stress1
|
||||
synthesis
|
||||
;;
|
||||
-h | --help ) usage
|
||||
-h | --help ) show_usage
|
||||
exit
|
||||
;;
|
||||
* ) usage
|
||||
* ) show_usage
|
||||
exit 1
|
||||
esac
|
||||
shift
|
||||
done
|
||||
done
|
||||
|
||||
echo "Regression completed!"
|
||||
|
||||
duration=$(( SECONDS - start ))
|
||||
awk -v t=$duration 'BEGIN{t=int(t*1000); printf "Elapsed Time: %d:%02d:%02d\n", t/3600000, t/60000%60, t/1000%60}'
|
||||
|
|
|
@ -1,38 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
# exit when any command fails
|
||||
set -e
|
||||
|
||||
# ensure build
|
||||
make -s
|
||||
|
||||
coverage()
|
||||
{
|
||||
echo "begin coverage tests..."
|
||||
|
||||
make -C sim/simx clean
|
||||
XLEN=64 make -C sim/simx
|
||||
XLEN=64 make -C tests/riscv/isa run-simx
|
||||
|
||||
echo "coverage tests done!"
|
||||
}
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "usage: regression [-coverage] [-all] [-h|--help]"
|
||||
}
|
||||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
-coverage ) coverage
|
||||
;;
|
||||
-all ) coverage
|
||||
;;
|
||||
-h | --help ) usage
|
||||
exit
|
||||
;;
|
||||
* ) usage
|
||||
exit 1
|
||||
esac
|
||||
shift
|
||||
done
|
|
@ -1,30 +1,31 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright © 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# exit when any command fails
|
||||
set -e
|
||||
|
||||
# ensure build
|
||||
make -s
|
||||
|
||||
# clear POCL cache
|
||||
rm -rf ~/.cache/pocl
|
||||
|
||||
# rebuild runtime
|
||||
make -C runtime clean
|
||||
make -C runtime
|
||||
# force rebuild test kernels
|
||||
make -C tests clean-all
|
||||
|
||||
# rebuild drivers
|
||||
make -C driver clean
|
||||
make -C driver
|
||||
# ensure build
|
||||
make -s
|
||||
|
||||
# rebuild runtime tests
|
||||
make -C tests/runtime clean
|
||||
make -C tests/runtime
|
||||
|
||||
# rebuild regression tests
|
||||
make -C tests/regression clean-all
|
||||
make -C tests/regression
|
||||
|
||||
# rebuild opencl tests
|
||||
make -C tests/opencl clean-all
|
||||
make -C tests/opencl
|
||||
# run tests
|
||||
make -C tests/kernel run-simx
|
||||
make -C tests/regression run-simx
|
||||
make -C tests/opencl run-simx
|
|
@ -1,5 +1,18 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright © 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# exit when any command fails
|
||||
set -e
|
||||
|
||||
|
@ -7,91 +20,166 @@ REPOSITORY=https://github.com/vortexgpgpu/vortex-toolchain-prebuilt/raw/master
|
|||
|
||||
DESTDIR="${DESTDIR:=/opt}"
|
||||
|
||||
OS="${OS:=ubuntu/bionic}"
|
||||
|
||||
riscv()
|
||||
{
|
||||
for x in {a..j}
|
||||
case $OS in
|
||||
"centos/7") parts=$(eval echo {a..h}) ;;
|
||||
*) parts=$(eval echo {a..j}) ;;
|
||||
esac
|
||||
rm -f riscv-gnu-toolchain.tar.bz2.parta*
|
||||
for x in $parts
|
||||
do
|
||||
wget $REPOSITORY/riscv-gnu-toolchain/ubuntu/bionic/riscv-gnu-toolchain.tar.bz2.parta$x
|
||||
wget $REPOSITORY/riscv-gnu-toolchain/$OS/riscv-gnu-toolchain.tar.bz2.parta$x
|
||||
done
|
||||
cat riscv-gnu-toolchain.tar.bz2.parta* > riscv-gnu-toolchain.tar.bz2
|
||||
tar -xvf riscv-gnu-toolchain.tar.bz2
|
||||
rm -f riscv-gnu-toolchain.tar.bz2*
|
||||
cp -r riscv-gnu-toolchain $DESTDIR
|
||||
rm -f riscv-gnu-toolchain.tar.bz2*
|
||||
rm -rf riscv-gnu-toolchain
|
||||
}
|
||||
|
||||
riscv64()
|
||||
{
|
||||
for x in {a..j}
|
||||
case $OS in
|
||||
"centos/7") parts=$(eval echo {a..h}) ;;
|
||||
*) parts=$(eval echo {a..j}) ;;
|
||||
esac
|
||||
rm -f riscv64-gnu-toolchain.tar.bz2.parta*
|
||||
for x in $parts
|
||||
do
|
||||
wget $REPOSITORY/riscv64-gnu-toolchain/ubuntu/bionic/riscv64-gnu-toolchain.tar.bz2.parta$x
|
||||
wget $REPOSITORY/riscv64-gnu-toolchain/$OS/riscv64-gnu-toolchain.tar.bz2.parta$x
|
||||
done
|
||||
cat riscv64-gnu-toolchain.tar.bz2.parta* > riscv64-gnu-toolchain.tar.bz2
|
||||
tar -xvf riscv64-gnu-toolchain.tar.bz2
|
||||
rm -f riscv64-gnu-toolchain.tar.bz2*
|
||||
cp -r riscv64-gnu-toolchain $DESTDIR
|
||||
rm -f riscv64-gnu-toolchain.tar.bz2*
|
||||
rm -rf riscv64-gnu-toolchain
|
||||
}
|
||||
|
||||
llvm()
|
||||
llvm-vortex()
|
||||
{
|
||||
for x in {a..b}
|
||||
case $OS in
|
||||
"centos/7") parts=$(eval echo {a..b}) ;;
|
||||
*) parts=$(eval echo {a..b}) ;;
|
||||
esac
|
||||
echo $parts
|
||||
rm -f llvm-vortex.tar.bz2.parta*
|
||||
for x in $parts
|
||||
do
|
||||
wget $REPOSITORY/llvm-vortex/ubuntu/bionic/llvm-vortex1.tar.bz2.parta$x
|
||||
wget $REPOSITORY/llvm-vortex/$OS/llvm-vortex.tar.bz2.parta$x
|
||||
done
|
||||
cat llvm-vortex1.tar.bz2.parta* > llvm-vortex1.tar.bz2
|
||||
tar -xvf llvm-vortex1.tar.bz2
|
||||
rm -f llvm-vortex1.tar.bz2*
|
||||
cp -r llvm-riscv $DESTDIR
|
||||
rm -rf llvm-riscv
|
||||
cat llvm-vortex.tar.bz2.parta* > llvm-vortex.tar.bz2
|
||||
tar -xvf llvm-vortex.tar.bz2
|
||||
cp -r llvm-vortex $DESTDIR
|
||||
rm -f llvm-vortex.tar.bz2*
|
||||
rm -rf llvm-vortex
|
||||
}
|
||||
|
||||
llvm-pocl()
|
||||
{
|
||||
case $OS in
|
||||
"centos/7") parts=$(eval echo {a..b}) ;;
|
||||
*) parts=$(eval echo {a..b}) ;;
|
||||
esac
|
||||
echo $parts
|
||||
rm -f llvm-pocl.tar.bz2.parta*
|
||||
for x in $parts
|
||||
do
|
||||
wget $REPOSITORY/llvm-pocl/$OS/llvm-pocl.tar.bz2.parta$x
|
||||
done
|
||||
cat llvm-pocl.tar.bz2.parta* > llvm-pocl.tar.bz2
|
||||
tar -xvf llvm-pocl.tar.bz2
|
||||
cp -r llvm-pocl $DESTDIR
|
||||
rm -f llvm-pocl.tar.bz2*
|
||||
rm -rf llvm-pocl
|
||||
}
|
||||
|
||||
pocl()
|
||||
{
|
||||
wget $REPOSITORY/pocl/ubuntu/bionic/pocl1.tar.bz2
|
||||
tar -xvf pocl1.tar.bz2
|
||||
rm -f pocl1.tar.bz2
|
||||
wget $REPOSITORY/pocl/$OS/pocl.tar.bz2
|
||||
tar -xvf pocl.tar.bz2
|
||||
rm -f pocl.tar.bz2
|
||||
cp -r pocl $DESTDIR
|
||||
rm -rf pocl
|
||||
}
|
||||
|
||||
verilator()
|
||||
{
|
||||
wget $REPOSITORY/verilator/ubuntu/bionic/verilator.tar.bz2
|
||||
wget $REPOSITORY/verilator/$OS/verilator.tar.bz2
|
||||
tar -xvf verilator.tar.bz2
|
||||
rm -f verilator.tar.bz2
|
||||
cp -r verilator $DESTDIR
|
||||
rm -f verilator.tar.bz2
|
||||
rm -rf verilator
|
||||
}
|
||||
|
||||
usage()
|
||||
sv2v()
|
||||
{
|
||||
echo "usage: toolchain_install [[-riscv] [-riscv64] [-llvm] [-pocl] [-verilator] [-all] [-h|--help]]"
|
||||
wget $REPOSITORY/sv2v/$OS/sv2v.tar.bz2
|
||||
tar -xvf sv2v.tar.bz2
|
||||
rm -f sv2v.tar.bz2
|
||||
cp -r sv2v $DESTDIR
|
||||
rm -rf sv2v
|
||||
}
|
||||
|
||||
yosys()
|
||||
{
|
||||
case $OS in
|
||||
"centos/7") parts=$(eval echo {a..c}) ;;
|
||||
*) parts=$(eval echo {a..c}) ;;
|
||||
esac
|
||||
echo $parts
|
||||
rm -f yosys.tar.bz2.parta*
|
||||
for x in $parts
|
||||
do
|
||||
wget $REPOSITORY/yosys/$OS/yosys.tar.bz2.parta$x
|
||||
done
|
||||
cat yosys.tar.bz2.parta* > yosys.tar.bz2
|
||||
tar -xvf yosys.tar.bz2
|
||||
cp -r yosys $DESTDIR
|
||||
rm -f yosys.tar.bz2*
|
||||
rm -rf yosys
|
||||
}
|
||||
|
||||
show_usage()
|
||||
{
|
||||
echo "Install Pre-built Vortex Toolchain"
|
||||
echo "Usage: $0 [[--riscv] [--riscv64] [--llvm-vortex] [--llvm-pocl] [--pocl] [--verilator] [--sv2v] [--yosys] [--all] [-h|--help]]"
|
||||
}
|
||||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
-pocl ) pocl
|
||||
--pocl ) pocl
|
||||
;;
|
||||
-verilator ) verilator
|
||||
;;
|
||||
-riscv ) riscv
|
||||
;;
|
||||
-riscv64 ) riscv64
|
||||
;;
|
||||
-llvm ) llvm
|
||||
--verilator ) verilator
|
||||
;;
|
||||
-all ) riscv
|
||||
riscv64
|
||||
llvm
|
||||
pocl
|
||||
verilator
|
||||
;;
|
||||
-h | --help ) usage
|
||||
exit
|
||||
;;
|
||||
* ) usage
|
||||
exit 1
|
||||
--riscv ) riscv
|
||||
;;
|
||||
--riscv64 ) riscv64
|
||||
;;
|
||||
--llvm-vortex ) llvm-vortex
|
||||
;;
|
||||
--llvm-pocl ) llvm-pocl
|
||||
;;
|
||||
--sv2v ) sv2v
|
||||
;;
|
||||
--yosys ) yosys
|
||||
;;
|
||||
--all ) riscv
|
||||
riscv64
|
||||
llvm-vortex
|
||||
llvm-pocl
|
||||
pocl
|
||||
verilator
|
||||
sv2v
|
||||
yosys
|
||||
;;
|
||||
-h | --help ) show_usage
|
||||
exit
|
||||
;;
|
||||
* ) show_usage
|
||||
exit 1
|
||||
esac
|
||||
shift
|
||||
done
|
133
ci/toolchain_prebuilt.sh
Executable file
133
ci/toolchain_prebuilt.sh
Executable file
|
@ -0,0 +1,133 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Copyright © 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# exit when any command fails
|
||||
set -e
|
||||
|
||||
OS_DIR=${OS_DIR:-'ubuntu/bionic'}
|
||||
SRCDIR=${SRCDIR:-'/opt'}
|
||||
DESTDIR=${DESTDIR:-'.'}
|
||||
|
||||
echo "OS_DIR=${OS_DIR}"
|
||||
echo "SRCDIR=${SRCDIR}"
|
||||
echo "DESTDIR=${DESTDIR}"
|
||||
|
||||
riscv()
|
||||
{
|
||||
echo "prebuilt riscv-gnu-toolchain..."
|
||||
tar -C $SRCDIR -cvjf riscv-gnu-toolchain.tar.bz2 riscv-gnu-toolchain
|
||||
split -b 50M riscv-gnu-toolchain.tar.bz2 "riscv-gnu-toolchain.tar.bz2.part"
|
||||
mv riscv-gnu-toolchain.tar.bz2.part* $DESTDIR/riscv-gnu-toolchain/$OS_DIR
|
||||
rm riscv-gnu-toolchain.tar.bz2
|
||||
}
|
||||
|
||||
riscv64()
|
||||
{
|
||||
echo "prebuilt riscv64-gnu-toolchain..."
|
||||
tar -C $SRCDIR -cvjf riscv64-gnu-toolchain.tar.bz2 riscv64-gnu-toolchain
|
||||
split -b 50M riscv64-gnu-toolchain.tar.bz2 "riscv64-gnu-toolchain.tar.bz2.part"
|
||||
mv riscv64-gnu-toolchain.tar.bz2.part* $DESTDIR/riscv64-gnu-toolchain/$OS_DIR
|
||||
rm riscv64-gnu-toolchain.tar.bz2
|
||||
}
|
||||
|
||||
llvm-vortex()
|
||||
{
|
||||
echo "prebuilt llvm-vortex..."
|
||||
tar -C $SRCDIR -cvjf llvm-vortex.tar.bz2 llvm-vortex
|
||||
split -b 50M llvm-vortex.tar.bz2 "llvm-vortex.tar.bz2.part"
|
||||
mv llvm-vortex.tar.bz2.part* $DESTDIR/llvm-vortex/$OS_DIR
|
||||
rm llvm-vortex.tar.bz2
|
||||
}
|
||||
|
||||
llvm-pocl()
|
||||
{
|
||||
echo "prebuilt llvm-pocl..."
|
||||
tar -C $SRCDIR -cvjf llvm-pocl.tar.bz2 llvm-pocl
|
||||
split -b 50M llvm-pocl.tar.bz2 "llvm-pocl.tar.bz2.part"
|
||||
mv llvm-pocl.tar.bz2.part* $DESTDIR/llvm-pocl/$OS_DIR
|
||||
rm llvm-pocl.tar.bz2
|
||||
}
|
||||
|
||||
pocl()
|
||||
{
|
||||
echo "prebuilt pocl..."
|
||||
tar -C $SRCDIR -cvjf pocl.tar.bz2 pocl
|
||||
mv pocl.tar.bz2 $DESTDIR/pocl/$OS_DIR
|
||||
}
|
||||
|
||||
verilator()
|
||||
{
|
||||
echo "prebuilt verilator..."
|
||||
tar -C $SRCDIR -cvjf verilator.tar.bz2 verilator
|
||||
mv verilator.tar.bz2 $DESTDIR/verilator/$OS_DIR
|
||||
}
|
||||
|
||||
sv2v()
|
||||
{
|
||||
echo "prebuilt sv2v..."
|
||||
tar -C $SRCDIR -cvjf sv2v.tar.bz2 sv2v
|
||||
mv sv2v.tar.bz2 $DESTDIR/sv2v/$OS_DIR
|
||||
}
|
||||
|
||||
yosys()
|
||||
{
|
||||
echo "prebuilt yosys..."
|
||||
tar -C $SRCDIR -cvjf yosys.tar.bz2 yosys
|
||||
split -b 50M yosys.tar.bz2 "yosys.tar.bz2.part"
|
||||
mv yosys.tar.bz2.part* $DESTDIR/yosys/$OS_DIR
|
||||
rm yosys.tar.bz2
|
||||
}
|
||||
|
||||
show_usage()
|
||||
{
|
||||
echo "Setup Pre-built Vortex Toolchain"
|
||||
echo "Usage: $0 [[--riscv] [--llvm-vortex] [--llvm-pocl] [--pocl] [--verilator] [--sv2v] [-yosys] [--all] [-h|--help]]"
|
||||
}
|
||||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
--pocl ) pocl
|
||||
;;
|
||||
--verilator ) verilator
|
||||
;;
|
||||
--riscv ) riscv
|
||||
;;
|
||||
--riscv64 ) riscv64
|
||||
;;
|
||||
--llvm-vortex ) llvm-vortex
|
||||
;;
|
||||
--llvm-pocl ) llvm-pocl
|
||||
;;
|
||||
--sv2v ) sv2v
|
||||
;;
|
||||
--yosys ) yosys
|
||||
;;
|
||||
--all ) riscv
|
||||
riscv64
|
||||
llvm-vortex
|
||||
llvm-pocl
|
||||
pocl
|
||||
verilator
|
||||
sv2v
|
||||
yosys
|
||||
;;
|
||||
-h | --help ) show_usage
|
||||
exit
|
||||
;;
|
||||
* ) show_usage
|
||||
exit 1
|
||||
esac
|
||||
shift
|
||||
done
|
245
ci/trace_csv.py
Executable file
245
ci/trace_csv.py
Executable file
|
@ -0,0 +1,245 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
# Copyright © 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import csv
|
||||
import re
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description='CPU trace log to CSV format converter.')
|
||||
parser.add_argument('-t', '--type', default='simx', help='log type (rtlsim or simx)')
|
||||
parser.add_argument('-o', '--csv', default='trace.csv', help='Output CSV file')
|
||||
parser.add_argument('log', help='Input log file')
|
||||
return parser.parse_args()
|
||||
|
||||
def parse_simx(log_filename):
|
||||
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
|
||||
instr_pattern = r"Instr (0x[0-9a-fA-F]+):"
|
||||
opcode_pattern = r"Instr 0x[0-9a-fA-F]+: ([0-9a-zA-Z_\.]+)"
|
||||
core_id_pattern = r"cid=(\d+)"
|
||||
warp_id_pattern = r"wid=(\d+)"
|
||||
tmask_pattern = r"tmask=(\d+)"
|
||||
operands_pattern = r"Src\d+ Reg: (.+)"
|
||||
destination_pattern = r"Dest Reg: (.+)"
|
||||
uuid_pattern = r"#(\d+)"
|
||||
entries = []
|
||||
with open(log_filename, 'r') as log_file:
|
||||
instr_data = None
|
||||
for lineno, line in enumerate(log_file, start=1):
|
||||
if line.startswith("DEBUG Fetch:"):
|
||||
if instr_data:
|
||||
entries.append(instr_data)
|
||||
instr_data = {}
|
||||
instr_data["lineno"] = lineno
|
||||
instr_data["PC"] = re.search(pc_pattern, line).group(1)
|
||||
instr_data["core_id"] = re.search(core_id_pattern, line).group(1)
|
||||
instr_data["warp_id"] = re.search(warp_id_pattern, line).group(1)
|
||||
instr_data["tmask"] = re.search(tmask_pattern, line).group(1)
|
||||
instr_data["uuid"] = re.search(uuid_pattern, line).group(1)
|
||||
elif line.startswith("DEBUG Instr"):
|
||||
instr_data["instr"] = re.search(instr_pattern, line).group(1)
|
||||
instr_data["opcode"] = re.search(opcode_pattern, line).group(1)
|
||||
elif line.startswith("DEBUG Src"):
|
||||
src_reg = re.search(operands_pattern, line).group(1)
|
||||
instr_data["operands"] = (instr_data["operands"] + ', ' + src_reg) if 'operands' in instr_data else src_reg
|
||||
elif line.startswith("DEBUG Dest"):
|
||||
instr_data["destination"] = re.search(destination_pattern, line).group(1)
|
||||
if instr_data:
|
||||
entries.append(instr_data)
|
||||
return entries
|
||||
|
||||
def reverse_binary(bin_str):
|
||||
return bin_str[::-1]
|
||||
|
||||
def bin_to_array(bin_str):
|
||||
return [int(bit) for bit in bin_str]
|
||||
|
||||
def append_reg(text, value, sep):
|
||||
if sep:
|
||||
text += ", "
|
||||
ivalue = int(value)
|
||||
if (ivalue >= 32):
|
||||
text += "f" + str(ivalue % 32)
|
||||
else:
|
||||
text += "x" + value
|
||||
sep = True
|
||||
return text, sep
|
||||
|
||||
def append_imm(text, value, sep):
|
||||
if sep:
|
||||
text += ", "
|
||||
text += value
|
||||
sep = True
|
||||
return text, sep
|
||||
|
||||
def append_value(text, reg, value, tmask_arr, sep):
|
||||
text, sep = append_reg(text, reg, sep)
|
||||
text += "={"
|
||||
for i in range(len(tmask_arr)):
|
||||
if i != 0:
|
||||
text += ", "
|
||||
if tmask_arr[i]:
|
||||
text += value[i]
|
||||
else:
|
||||
text +="-"
|
||||
text += "}"
|
||||
return text, sep
|
||||
|
||||
def parse_rtlsim(log_filename):
|
||||
line_pattern = r"\d+: core(\d+)-(decode|issue|commit)"
|
||||
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
|
||||
instr_pattern = r"instr=(0x[0-9a-fA-F]+)"
|
||||
ex_pattern = r"ex=([a-zA-Z]+)"
|
||||
op_pattern = r"op=([\?0-9a-zA-Z_\.]+)"
|
||||
warp_id_pattern = r"wid=(\d+)"
|
||||
tmask_pattern = r"tmask=(\d+)"
|
||||
wb_pattern = r"wb=(\d)"
|
||||
opds_pattern = r"opds=(\d+)"
|
||||
use_imm_pattern = r"use_imm=(\d)"
|
||||
imm_pattern = r"imm=(0x[0-9a-fA-F]+)"
|
||||
rd_pattern = r"rd=(\d+)"
|
||||
rs1_pattern = r"rs1=(\d+)"
|
||||
rs2_pattern = r"rs2=(\d+)"
|
||||
rs3_pattern = r"rs3=(\d+)"
|
||||
rs1_data_pattern = r"rs1_data=\{(.+?)\}"
|
||||
rs2_data_pattern = r"rs2_data=\{(.+?)\}"
|
||||
rs3_data_pattern = r"rs3_data=\{(.+?)\}"
|
||||
rd_data_pattern = r"data=\{(.+?)\}"
|
||||
eop_pattern = r"eop=(\d)"
|
||||
uuid_pattern = r"#(\d+)"
|
||||
entries = []
|
||||
with open(log_filename, 'r') as log_file:
|
||||
instr_data = {}
|
||||
for lineno, line in enumerate(log_file, start=1):
|
||||
line_match = re.search(line_pattern, line)
|
||||
if line_match:
|
||||
PC = re.search(pc_pattern, line).group(1)
|
||||
warp_id = re.search(warp_id_pattern, line).group(1)
|
||||
tmask = re.search(tmask_pattern, line).group(1)
|
||||
uuid = re.search(uuid_pattern, line).group(1)
|
||||
core_id = line_match.group(1)
|
||||
stage = line_match.group(2)
|
||||
if stage == "decode":
|
||||
trace = {}
|
||||
trace["uuid"] = uuid
|
||||
trace["PC"] = PC
|
||||
trace["core_id"] = core_id
|
||||
trace["warp_id"] = warp_id
|
||||
trace["tmask"] = reverse_binary(tmask)
|
||||
trace["instr"] = re.search(instr_pattern, line).group(1)
|
||||
trace["opcode"] = re.search(op_pattern, line).group(1)
|
||||
trace["opds"] = bin_to_array(re.search(opds_pattern, line).group(1))
|
||||
trace["rd"] = re.search(rd_pattern, line).group(1)
|
||||
trace["rs1"] = re.search(rs1_pattern, line).group(1)
|
||||
trace["rs2"] = re.search(rs2_pattern, line).group(1)
|
||||
trace["rs3"] = re.search(rs3_pattern, line).group(1)
|
||||
trace["use_imm"] = re.search(use_imm_pattern, line).group(1) == "1"
|
||||
trace["imm"] = re.search(imm_pattern, line).group(1)
|
||||
instr_data[uuid] = trace
|
||||
elif stage == "issue":
|
||||
if uuid in instr_data:
|
||||
trace = instr_data[uuid]
|
||||
trace["lineno"] = lineno
|
||||
opds = trace["opds"]
|
||||
if opds[1]:
|
||||
trace["rs1_data"] = re.search(rs1_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if opds[2]:
|
||||
trace["rs2_data"] = re.search(rs2_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if opds[3]:
|
||||
trace["rs3_data"] = re.search(rs3_data_pattern, line).group(1).split(', ')[::-1]
|
||||
trace["issued"] = True
|
||||
instr_data[uuid] = trace
|
||||
elif stage == "commit":
|
||||
if uuid in instr_data:
|
||||
trace = instr_data[uuid]
|
||||
if "issued" in trace:
|
||||
opds = trace["opds"]
|
||||
dst_tmask_arr = bin_to_array(tmask)[::-1]
|
||||
wb = re.search(wb_pattern, line).group(1) == "1"
|
||||
if wb:
|
||||
rd_data = re.search(rd_data_pattern, line).group(1).split(', ')[::-1]
|
||||
if 'rd_data' in trace:
|
||||
merged_rd_data = trace['rd_data']
|
||||
for i in range(len(dst_tmask_arr)):
|
||||
if dst_tmask_arr[i] == 1:
|
||||
merged_rd_data[i] = rd_data[i]
|
||||
trace['rd_data'] = merged_rd_data
|
||||
else:
|
||||
trace['rd_data'] = rd_data
|
||||
instr_data[uuid] = trace
|
||||
eop = re.search(eop_pattern, line).group(1) == "1"
|
||||
if eop:
|
||||
tmask_arr = bin_to_array(trace["tmask"])
|
||||
destination = ''
|
||||
if wb:
|
||||
destination, sep = append_value(destination, trace["rd"], trace['rd_data'], tmask_arr, False)
|
||||
del trace['rd_data']
|
||||
trace["destination"] = destination
|
||||
operands = ''
|
||||
sep = False
|
||||
if opds[1]:
|
||||
operands, sep = append_value(operands, trace["rs1"], trace["rs1_data"], tmask_arr, sep)
|
||||
del trace["rs1_data"]
|
||||
if opds[2]:
|
||||
operands, sep = append_value(operands, trace["rs2"], trace["rs2_data"], tmask_arr, sep)
|
||||
del trace["rs2_data"]
|
||||
if opds[3]:
|
||||
operands, sep = append_value(operands, trace["rs3"], trace["rs3_data"], tmask_arr, sep)
|
||||
del trace["rs3_data"]
|
||||
trace["operands"] = operands
|
||||
del trace["opds"]
|
||||
del trace["rd"]
|
||||
del trace["rs1"]
|
||||
del trace["rs2"]
|
||||
del trace["rs3"]
|
||||
del trace["use_imm"]
|
||||
del trace["imm"]
|
||||
del trace["issued"]
|
||||
del instr_data[uuid]
|
||||
entries.append(trace)
|
||||
return entries
|
||||
|
||||
def write_csv(log_filename, csv_filename, log_type):
|
||||
entries = None
|
||||
|
||||
# parse log file
|
||||
if log_type == "rtlsim":
|
||||
entries = parse_rtlsim(log_filename)
|
||||
elif log_type == "simx":
|
||||
entries = parse_simx(log_filename)
|
||||
else:
|
||||
print('Error: invalid log type')
|
||||
sys.exit()
|
||||
|
||||
# sort entries by uuid
|
||||
entries.sort(key=lambda x: (int(x['core_id']), int(x['warp_id']), int(x['lineno'])))
|
||||
for entry in entries:
|
||||
del entry['lineno']
|
||||
|
||||
# write to CSV
|
||||
with open(csv_filename, 'w', newline='') as csv_file:
|
||||
fieldnames = ["uuid", "PC", "opcode", "instr", "core_id", "warp_id", "tmask", "operands", "destination"]
|
||||
writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
|
||||
writer.writeheader()
|
||||
for entry in entries:
|
||||
writer.writerow(entry)
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
write_csv(args.log, args.csv, args.type)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,4 +1,18 @@
|
|||
#!/usr/bin/env python
|
||||
|
||||
# Copyright © 2019-2023
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import time
|
||||
import threading
|
||||
|
|
|
@ -40,9 +40,9 @@ VX.cache.v is the top module of the cache verilog code located in the `/hw/rtl/c
|
|||
- Core Response Merge
|
||||
- Cache accesses one line at a time. As a result, each request may not come back in the same response. This module tries to recombine the responses by thread ID.
|
||||
|
||||
### VX_bank.v
|
||||
### VX_cache_bank.v
|
||||
|
||||
VX_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
|
||||
VX_cache_bank.v is the verilog code that handles cache bank functionality and is located in the `/hw/rtl/cache` directory.
|
||||
|
||||

|
||||
|
||||
|
|
|
@ -3,38 +3,39 @@
|
|||
The directory/file layout of the Vortex codebase is as followed:
|
||||
|
||||
- `hw`:
|
||||
- `rtl`: hardware rtl sources
|
||||
- `cache`: cache subsystem code
|
||||
- `fp_cores`: floating point unit code
|
||||
- `rtl`: hardware rtl sources
|
||||
- `core`: core pipeline
|
||||
- `cache`: cache subsystem
|
||||
- `mem`: memory subsystem
|
||||
- `fpu`: floating point unit
|
||||
- `interfaces`: interfaces for inter-module communication
|
||||
- `libs`: general-purpose RTL modules
|
||||
- `libs`: general-purpose RTL modules
|
||||
- `syn`: synthesis directory
|
||||
- `opae`: OPAE synthesis scripts
|
||||
- `quartus`: Quartus synthesis scripts
|
||||
- `altera`: Altera synthesis scripts
|
||||
- `xilinx`: Xilinx synthesis scripts
|
||||
- `synopsys`: Synopsys synthesis scripts
|
||||
- `modelsim`: Modelsim synthesis scripts
|
||||
- `yosys`: Yosys synthesis scripts
|
||||
- `unit_tests`: unit tests for some hardware components
|
||||
- `driver`: host drivers repository
|
||||
- `runtime`: host runtime software APIs
|
||||
- `include`: Vortex driver public headers
|
||||
- `stub`: Vortex stub driver library
|
||||
- `fpga`: software driver that uses Intel OPAE FPGA
|
||||
- `asesim`: software driver that uses Intel ASE simulator
|
||||
- `vlsim`: software driver that uses vlsim simulator
|
||||
- `opae`: software driver that uses Intel OPAE API with device targets=fpga|asesim|opaesim
|
||||
- `xrt`: software driver that uses Xilinx XRT API with device targets=hw|hw_emu|sw_emu
|
||||
- `rtlsim`: software driver that uses rtlsim simulator
|
||||
- `simx`: software driver that uses simX simulator
|
||||
- `runtime`: kernel runtime software
|
||||
- `kernel`: GPU kernel software APIs
|
||||
- `include`: Vortex runtime public headers
|
||||
- `linker`: linker file for compiling kernels
|
||||
- `src`: runtime implementation
|
||||
- `sim`:
|
||||
- `vlsim`: AFU RTL simulator
|
||||
- `opaesim`: Intel OPAE AFU RTL simulator
|
||||
- `rtlsim`: processor RTL simulator
|
||||
- `simX`: cycle approximate simulator for vortex
|
||||
- `tests`: tests repository.
|
||||
- `runtime`: runtime tests
|
||||
- `regression`: regression tests
|
||||
- `riscv`: RISC-V standard tests
|
||||
- `riscv`: RISC-V conformance tests
|
||||
- `kernel`: kernel tests
|
||||
- `regression`: regression tests
|
||||
- `opencl`: opencl benchmarks and tests
|
||||
- `ci`: continuous integration scripts
|
||||
- `miscs`: miscellaneous resources.
|
||||
|
|
|
@ -1,29 +1,37 @@
|
|||
# Debugging Vortex Hardware
|
||||
# Debugging Vortex GPU
|
||||
|
||||
## Testing changes to the RTL or simulator GPU driver.
|
||||
|
||||
The Blackbox utility script will not pick up your changes if the h/w configuration is the same as during teh last run.
|
||||
To force the utility to build the driver, you need pass the --rebuild=1 option when running tests.
|
||||
Using --rebuild=0 will prevent the rebuild even if the h/w configuration is different from last run.
|
||||
|
||||
$ ./ci/blackbox.sh --driver=simx --app=demo --rebuild=1
|
||||
|
||||
## SimX Debugging
|
||||
|
||||
SimX cycle-approximate simulator allows faster debugging of Vortex kernels' execution.
|
||||
The recommended method to enable debugging is to pass the `--debug` flag to `blackbox` tool when running a program.
|
||||
The recommended method to enable debugging is to pass the `--debug=<level>` flag to `blackbox` tool when running a program.
|
||||
|
||||
// Running demo program on SimX in debug mode
|
||||
$ ./ci/blackbox.sh --driver=simx --app=demo --debug
|
||||
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=1
|
||||
|
||||
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (decoded instruction, register states, pipeline states, etc..). You can increase the verbosity level of the trace by changing the `DEBUG_LEVEL` variable to a value [1-5] (default is 3).
|
||||
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (decoded instruction, register states, pipeline states, etc..). You can increase the verbosity of the trace by changing the debug level.
|
||||
|
||||
// Using SimX in debug mode with verbose level 4
|
||||
$ CONFIGS=-DDEBUG_LEVEL=4 ./ci/blackbox.sh --driver=simx --app=demo --debug
|
||||
// Using SimX in debug mode with verbose level 3
|
||||
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=3
|
||||
|
||||
## RTL Debugging
|
||||
|
||||
To debug the processor RTL, you need to use VLSIM or RTLSIM driver. VLSIM simulates the full processor including the AFU command processor (using `/rtl/afu/vortex_afu.sv` as top module). RTLSIM simulates the Vortex processor only (using `/rtl/Vortex.v` as top module).
|
||||
To debug the processor RTL, you need to use VLSIM or RTLSIM driver. VLSIM simulates the full processor including the AFU command processor (using `/rtl/afu/opae/vortex_afu.sv` as top module). RTLSIM simulates the Vortex processor only (using `/rtl/Vortex.v` as top module).
|
||||
|
||||
The recommended method to enable debugging is to pass the `--debug` flag to `blackbox` tool when running a program.
|
||||
|
||||
// Running demo program on vlsim in debug mode
|
||||
$ ./ci/blackbox.sh --driver=vlsim --app=demo --debug
|
||||
// Running demo program on the opae simulator in debug mode
|
||||
$ TARGET=opaesim ./ci/blackbox.sh --driver=opae --app=demo --debug=1
|
||||
|
||||
// Running demo program on rtlsim in debug mode
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
|
||||
|
||||
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave] (http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
|
||||
|
||||
|
@ -32,7 +40,7 @@ A debug trace `run.log` is generated in the current directory during the program
|
|||
Debugging the FPGA directly may be necessary to investigate runtime bugs that the RTL simulation cannot catch. We have implemented an in-house scope analyzer for Vortex that works when the FPGA is running. To enable the FPGA scope analyzer, the FPGA bitstream should be built using `SCOPE=1` flag
|
||||
|
||||
& cd /hw/syn/opae
|
||||
$ CONFIGS=-DSCOPE=1 make fpga-4c
|
||||
$ CONFIGS="-DSCOPE=1" TARGET=fpga make
|
||||
|
||||
When running the program on the FPGA, you need to pass the `--scope` flag to the `blackbox` tool.
|
||||
|
||||
|
@ -40,4 +48,18 @@ When running the program on the FPGA, you need to pass the `--scope` flag to the
|
|||
$ ./ci/blackbox.sh --driver=fpga --app=demo --scope
|
||||
|
||||
|
||||
A waveform trace `trace.vcd` will be generated in the current directory during the program execution. This trace includes a limited set of signals that are defined in `/hw/scripts/scope.json`. You can expand your signals' selection by updating the json file.
|
||||
A waveform trace `trace.vcd` will be generated in the current directory during the program execution. This trace includes a limited set of signals that are defined in `/hw/scripts/scope.json`. You can expand your signals' selection by updating the json file.
|
||||
|
||||
## Analyzing Vortex trace log
|
||||
|
||||
When debugging Vortex RTL or SimX Simulator, reading the trace run.log file can be overwhelming when the trace gets really large.
|
||||
We provide a trace sanitizer tool under ./hw/scripts/trace_csv.py that you can use to convert the large trace into a CSV file containing all the instructions that executed with their source and destination operands.
|
||||
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=3 --log=run_rtlsim.log
|
||||
$ ./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
|
||||
|
||||
$ ./ci/blackbox.sh --driver=simx --app=demo --debug=3 --log=run_simx.log
|
||||
$ ./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
||||
|
||||
The first column in the CSV trace is UUID (universal unique identifier) of the instruction and the content is sorted by the UUID. You can use the UUID to trace the same instruction running on either the RTL hw or SimX simulator.
|
||||
This can be very effective if you want to use SimX to debugging your RTL hardware by comparing CSV traces.
|
|
@ -1,128 +0,0 @@
|
|||
# Execute OpenCL on Vortex backend
|
||||
|
||||
## Requirements
|
||||
- [Vortex](https://github.com/vortexgpgpu/vortex)
|
||||
- [POCL for Vortex](https://github.com/vortexgpgpu/pocl)
|
||||
- [riscv-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain)
|
||||
- [llvm-riscv](https://github.com/llvm-mirror/llvm)
|
||||
|
||||
For installation, please see [Build Instructions](../README.md) for more details.
|
||||
|
||||
**For Ubuntu18.04 users, you can directly download pre-build toolchains with [toolchain_install.sh](https://github.com/vortexgpgpu/vortex/blob/master/ci/toolchain_install.sh) script.**
|
||||
```bash
|
||||
# please modify the DESTDIR variable in the script before execution
|
||||
bash toolchain_install.sh -all
|
||||
```
|
||||
Assuming we have installed all dependencies in `/opt` path, we can get the following environment:
|
||||
```bash
|
||||
tree -L 2 /opt
|
||||
'''
|
||||
/opt/
|
||||
├── llvm-riscv
|
||||
│ ├── bin
|
||||
│ ├── include
|
||||
│ ├── lib
|
||||
│ ├── libexec
|
||||
│ └── share
|
||||
├── pocl
|
||||
│ ├── compiler
|
||||
│ └── runtime
|
||||
├── riscv-gnu-toolchain
|
||||
│ ├── bin
|
||||
│ ├── drops
|
||||
│ ├── include
|
||||
│ ├── lib
|
||||
│ ├── libexec
|
||||
│ ├── riscv32-unknown-elf
|
||||
│ ├── share
|
||||
│ └── var
|
||||
└── verilator
|
||||
├── bin
|
||||
├── examples
|
||||
├── include
|
||||
├── verilator-config.cmake
|
||||
└── verilator-config-version.cmake
|
||||
'''
|
||||
```
|
||||
## Execute OpenCL on Vortex
|
||||
In this tutorial, we show the example of executing a vecadd programs on SIMX backend.
|
||||
To execute a OpenCL program on Vortex, we have the following steps:
|
||||
- Compile the [OpenCL kernels](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/kernel.cl) into risc-v binary by POCL compiler.
|
||||
- Compile the [OpenCL host](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/main.cc) and link with Vortex driver(```-lvortex```).
|
||||
- Execute the compiled host programs on a backend.
|
||||
|
||||
Thus, we can write a Makefile as following:
|
||||
```Makefile
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
OPTS ?= -n64
|
||||
|
||||
# please edit these two variable to your environment
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
PROJECT = vecadd
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
```
|
||||
|
||||
First, build the host program.
|
||||
```bash
|
||||
make all
|
||||
```
|
||||
If we want to execute on SIMX, we can execute the command below.
|
||||
```bash
|
||||
make run-simx
|
||||
```
|
|
@ -17,22 +17,16 @@ OPAE Build
|
|||
------------------
|
||||
|
||||
The FPGA has to following configuration options:
|
||||
- 1 core fpga (fpga-1c)
|
||||
- 2 cores fpga (fpga-2c)
|
||||
- 4 cores fpga (fpga-4c)
|
||||
- 8 cores fpga (fpga-8c)
|
||||
- 16 cores fpga (fpga-16c)
|
||||
- 32 cores fpga (fpga-32c)
|
||||
- 64 cores fpga (fpga-64c)
|
||||
- DEVICE_FAMILY=arria10 | stratix10
|
||||
- NUM_CORES=#n
|
||||
|
||||
Command line:
|
||||
|
||||
$ cd hw/syn/opae
|
||||
$ make fpga-<num-of-cores>c
|
||||
$ cd hw/syn/altera/opae
|
||||
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
|
||||
|
||||
Example: `make fpga-4c`
|
||||
|
||||
A new folder (ex: `build_fpga_4c`) will be created and the build will start and take ~30-480 min to complete.
|
||||
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
|
||||
Setting TARGET=ase will build the project for simulation using Intel ASE.
|
||||
|
||||
|
||||
OPAE Build Configuration
|
||||
|
@ -45,35 +39,32 @@ The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware
|
|||
|
||||
You configure the syntesis build from the command line:
|
||||
|
||||
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make fpga-4c
|
||||
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
|
||||
|
||||
OPAE Build Progress
|
||||
-------------------
|
||||
|
||||
You could check the last 10 lines in the build log for possible errors until build completion.
|
||||
|
||||
$ tail -n 10 ./build_fpga_<num-of-cores>c/build.log
|
||||
$ tail -n 10 <build_dir>/build.log
|
||||
|
||||
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
|
||||
|
||||
$ ps -u <username>
|
||||
|
||||
|
||||
If the build fails and you need to restart it, clean up the build folder using the following command:
|
||||
|
||||
$ make clean-fpga-<num-of-cores>c
|
||||
|
||||
Example: `make clean-fpga-4c`
|
||||
$ make clean
|
||||
|
||||
The file `vortex_afu.gbs` should exist when the build is done:
|
||||
|
||||
$ ls -lsa ./build_fpga_<num-of-cores>c/vortex_afu.gbs
|
||||
$ ls -lsa <build_dir>/vortex_afu.gbs
|
||||
|
||||
|
||||
Signing the bitstream and Programming the FPGA
|
||||
----------------------------------------------
|
||||
|
||||
$ cd ./build_fpga_<num-of-cores>c
|
||||
$ cd <build_dir>
|
||||
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
|
||||
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
|
||||
|
||||
|
|
|
@ -11,7 +11,6 @@
|
|||
- [Debugging](debugging.md)
|
||||
- [Useful Links](references.md)
|
||||
|
||||
|
||||
## Installation
|
||||
|
||||
- Refer to the build instructions in [README](../README.md).
|
||||
|
@ -22,9 +21,11 @@ Running Vortex simulators with different configurations:
|
|||
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
|
||||
- Run demo driver test with vlsim driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=vlsim --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
|
||||
- Run demo driver test with opae driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
|
||||
|
||||
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
|
||||
|
||||
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
|
|
@ -10,7 +10,7 @@ SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant
|
|||
|
||||
### FGPA Simulation
|
||||
|
||||
The current target FPGA for simulation is the Arria10 Intel Accelerator Card v1.0. The guide to build the fpga with specific configurations is located [here.](https://github.com/vortexgpgpu/vortex-dev/blob/master/doc/FPGA_Startup_Guide.md)
|
||||
The current target FPGA for simulation is the Arria10 Intel Accelerator Card v1.0. The guide to build the fpga with specific configurations is located [here.](fpga_setup.md)
|
||||
|
||||
### How to Test
|
||||
|
||||
|
@ -22,15 +22,15 @@ Running tests under specific drivers (rtlsim,simx,fpga) is done using the script
|
|||
- *Threads* - used to specify the number of threads (smallest unit of computation) within a configuration.
|
||||
- *L2cache* - used to enable the shard l2cache among the Vortex cores.
|
||||
- *L3cache* - used to enable the shared l3cache among the Vortex clusters.
|
||||
- *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, vlsim, fpga, or simx).
|
||||
- *Driver* - used to specify which driver to run the Vortex simulation (either rtlsim, opae, xrt, simx).
|
||||
- *Debug* - used to enable debug mode for the Vortex simulation.
|
||||
- *Perf* - used to enable the detailed performance counters within the Vortex simulation.
|
||||
- *App* - used to specify which test/benchmark to run in the Vortex simulation. The main choices are vecadd, sgemm, basic, demo, and dogfood. Other tests/benchmarks are located in the `/benchmarks/opencl` folder though not all of them work wit the current version of Vortex.
|
||||
- *Args* - used to pass additional arguments to the application.
|
||||
|
||||
Example use of command line arguments: Run the sgemm benchmark using the vlsim driver with a Vortex configuration of 1 cluster, 4 cores, 4 warps, and 4 threads.
|
||||
Example use of command line arguments: Run the sgemm benchmark using the opae driver with a Vortex configuration of 1 cluster, 4 cores, 4 warps, and 4 threads.
|
||||
|
||||
$ ./ci/blackbox.sh --clusters=1 --cores=4 --warps=4 --threads=4 --driver=vlsim --app=sgemm
|
||||
$ ./ci/blackbox.sh --clusters=1 --cores=4 --warps=4 --threads=4 --driver=opae --app=sgemm
|
||||
|
||||
Output from terminal:
|
||||
```
|
||||
|
|
33
docs/testing.md
Normal file
33
docs/testing.md
Normal file
|
@ -0,0 +1,33 @@
|
|||
# Testing
|
||||
|
||||
## Running a Vortex application
|
||||
|
||||
The framework provides a utility script: blakcbox.sh under the /ci/ folder for executing applications in the tests tree.
|
||||
You can query the commandline options of the tool using:
|
||||
|
||||
$ ./ci/blakcbox.sh --help
|
||||
|
||||
To execute sgemm test program on the simx driver and passing "-n10" as argument to sgemm:
|
||||
|
||||
$ ./ci/blakcbox.sh --driver=simx --app=sgemm --args="-n10"
|
||||
|
||||
You can execute the same application of a GPU architecture with 2 cores:
|
||||
|
||||
$ ./ci/blakcbox.sh --core=2 --driver=simx --app=sgemm --args="-n10"
|
||||
|
||||
When excuting, Blackbox needs to recompile the driver if the desired architecture changes.
|
||||
It tracks the latest configuration in a file under the current directory blackbox.<driver>.cache.
|
||||
To avoid having to rebuild the driver all the time, Blackbox checks if the latest cached configuration matches the current.
|
||||
|
||||
## Running Benchmarks
|
||||
|
||||
The Vortex test suite is located under the /test/ folder
|
||||
You can execute the default regression suite by running the following commands at the root folder.
|
||||
|
||||
$ make -C tests/regression run-simx
|
||||
$ make -C tests/regression run-rtlsim
|
||||
|
||||
You can execute the default opncl suite by running the following commands at the root folder.
|
||||
|
||||
$ make -C tests/opencl run-simx
|
||||
$ make -C tests/opencl run-rtlsim
|
|
@ -1,29 +0,0 @@
|
|||
all: stub rtlsim simx vlsim
|
||||
|
||||
stub:
|
||||
$(MAKE) -C stub
|
||||
|
||||
fpga:
|
||||
$(MAKE) -C fpga
|
||||
|
||||
asesim:
|
||||
$(MAKE) -C asesim
|
||||
|
||||
vlsim:
|
||||
$(MAKE) -C vlsim
|
||||
|
||||
rtlsim:
|
||||
$(MAKE) -C rtlsim
|
||||
|
||||
simx:
|
||||
$(MAKE) -C simx
|
||||
|
||||
clean:
|
||||
$(MAKE) clean -C stub
|
||||
$(MAKE) clean -C fpga
|
||||
$(MAKE) clean -C asesim
|
||||
$(MAKE) clean -C vlsim
|
||||
$(MAKE) clean -C rtlsim
|
||||
$(MAKE) clean -C simx
|
||||
|
||||
.PHONY: all stub fpga asesim vlsim rtlsim simx clean
|
|
@ -1,73 +0,0 @@
|
|||
OPAE_HOME ?= /tools/opae/1.4.0
|
||||
|
||||
RTL_DIR=../../hw/rtl
|
||||
|
||||
SCRIPT_DIR=../../hw/scripts
|
||||
|
||||
OPAE_SYN_DIR=../../hw/syn/opae
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I. -I../include -I../../hw -I$(OPAE_HOME)/include -I$(OPAE_SYN_DIR)
|
||||
|
||||
LDFLAGS += -L$(OPAE_HOME)/lib -luuid -lopae-c-ase
|
||||
|
||||
# stack execution protection
|
||||
LDFLAGS +=-z noexecstack
|
||||
|
||||
# data relocation and projection
|
||||
LDFLAGS +=-z relro -z now
|
||||
|
||||
# stack buffer overrun detection
|
||||
CXXFLAGS +=-fstack-protector
|
||||
|
||||
# Position independent code
|
||||
CXXFLAGS += -fPIC
|
||||
|
||||
# Add external configuration
|
||||
CXXFLAGS += $(CONFIGS)
|
||||
|
||||
# Dump perf stats
|
||||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared
|
||||
|
||||
PROJECT = libvortex.so
|
||||
|
||||
SRCS = ../common/opae.cpp ../common/vx_utils.cpp
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable scope analyzer
|
||||
ifdef SCOPE
|
||||
CXXFLAGS += -DSCOPE
|
||||
SRCS += ../common/vx_scope.cpp
|
||||
SCOPE_H = scope-defs.h
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
all: $(PROJECT)
|
||||
|
||||
$(OPAE_SYN_DIR)/vortex_afu.h:
|
||||
$(MAKE) -C $(OPAE_SYN_DIR) vortex_afu.h
|
||||
|
||||
scope-defs.h: $(SCRIPT_DIR)/scope.json
|
||||
$(SCRIPT_DIR)/scope.py $(CONFIGS) -cc scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json
|
||||
|
||||
# generate scope data
|
||||
scope: scope-defs.h
|
||||
|
||||
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
|
||||
$(CXX) $(CXXFLAGS) -DUSE_ASE $(SRCS) $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o scope-defs.h
|
|
@ -1,535 +0,0 @@
|
|||
#include <stdint.h>
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <cstdlib>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
#include <cmath>
|
||||
#include <sstream>
|
||||
#include <unordered_map>
|
||||
#include <list>
|
||||
|
||||
#if defined(USE_FPGA) || defined(USE_ASE)
|
||||
#include <opae/fpga.h>
|
||||
#include <uuid/uuid.h>
|
||||
#elif defined(USE_VLSIM)
|
||||
#include <fpga.h>
|
||||
#endif
|
||||
|
||||
#include "vx_utils.h"
|
||||
#include "vx_malloc.h"
|
||||
#include <vortex.h>
|
||||
#include <VX_config.h>
|
||||
#include "vortex_afu.h"
|
||||
|
||||
#ifdef SCOPE
|
||||
#include "vx_scope.h"
|
||||
#endif
|
||||
|
||||
#define CHECK_RES(_expr) \
|
||||
do { \
|
||||
fpga_result res = _expr; \
|
||||
if (res == FPGA_OK) \
|
||||
break; \
|
||||
printf("[VXDRV] Error: '%s' returned %d, %s!\n", \
|
||||
#_expr, (int)res, fpgaErrStr(res)); \
|
||||
return -1; \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#define CMD_MEM_READ AFU_IMAGE_CMD_MEM_READ
|
||||
#define CMD_MEM_WRITE AFU_IMAGE_CMD_MEM_WRITE
|
||||
#define CMD_RUN AFU_IMAGE_CMD_RUN
|
||||
|
||||
#define MMIO_CMD_TYPE (AFU_IMAGE_MMIO_CMD_TYPE * 4)
|
||||
#define MMIO_IO_ADDR (AFU_IMAGE_MMIO_IO_ADDR * 4)
|
||||
#define MMIO_MEM_ADDR (AFU_IMAGE_MMIO_MEM_ADDR * 4)
|
||||
#define MMIO_DATA_SIZE (AFU_IMAGE_MMIO_DATA_SIZE * 4)
|
||||
#define MMIO_DEV_CAPS (AFU_IMAGE_MMIO_DEV_CAPS * 4)
|
||||
#define MMIO_STATUS (AFU_IMAGE_MMIO_STATUS * 4)
|
||||
|
||||
#define STATUS_STATE_BITS 8
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: mem_allocator(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_BASE_ADDR + LOCAL_MEM_SIZE,
|
||||
4096,
|
||||
CACHE_BLOCK_SIZE)
|
||||
{}
|
||||
|
||||
~vx_device() {}
|
||||
|
||||
fpga_handle fpga;
|
||||
vortex::MemoryAllocator mem_allocator;
|
||||
unsigned version;
|
||||
unsigned num_cores;
|
||||
unsigned num_warps;
|
||||
unsigned num_threads;
|
||||
};
|
||||
|
||||
typedef struct vx_buffer_ {
|
||||
uint64_t wsid;
|
||||
void* host_ptr;
|
||||
uint64_t io_addr;
|
||||
vx_device_h hdevice;
|
||||
uint64_t size;
|
||||
} vx_buffer_t;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
class AutoPerfDump {
|
||||
private:
|
||||
std::list<vx_device_h> devices_;
|
||||
|
||||
public:
|
||||
AutoPerfDump() {}
|
||||
|
||||
~AutoPerfDump() {
|
||||
for (auto device : devices_) {
|
||||
vx_dump_perf(device, stdout);
|
||||
}
|
||||
}
|
||||
|
||||
void add_device(vx_device_h device) {
|
||||
devices_.push_back(device);
|
||||
}
|
||||
|
||||
void remove_device(vx_device_h device) {
|
||||
devices_.remove(device);
|
||||
}
|
||||
};
|
||||
|
||||
AutoPerfDump gAutoPerfDump;
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
*value = device->version;
|
||||
break;
|
||||
case VX_CAPS_MAX_CORES:
|
||||
*value = device->num_cores;
|
||||
break;
|
||||
case VX_CAPS_MAX_WARPS:
|
||||
*value = device->num_warps;
|
||||
break;
|
||||
case VX_CAPS_MAX_THREADS:
|
||||
*value = device->num_threads;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
*value = LOCAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_ALLOC_BASE_ADDR:
|
||||
*value = ALLOC_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_KERNEL_BASE_ADDR:
|
||||
*value = STARTUP_ADDR;
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "[VXDRV] Error: invalid caps id: %d\n", caps_id);
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_open(vx_device_h* hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
fpga_handle accel_handle;
|
||||
vx_device* device;
|
||||
|
||||
#ifndef USE_VLSIM
|
||||
fpga_result res;
|
||||
fpga_token accel_token;
|
||||
fpga_properties filter = nullptr;
|
||||
fpga_guid guid;
|
||||
uint32_t num_matches;
|
||||
|
||||
// Set up a filter that will search for an accelerator
|
||||
CHECK_RES(fpgaGetProperties(nullptr, &filter));
|
||||
res = fpgaPropertiesSetObjectType(filter, FPGA_ACCELERATOR);
|
||||
if (res != FPGA_OK) {
|
||||
fprintf(stderr, "[VXDRV] Error: fpgaGetProperties() returned %d, %s!\n", (int)res, fpgaErrStr(res));
|
||||
fpgaDestroyProperties(&filter);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Add the desired UUID to the filter
|
||||
uuid_parse(AFU_ACCEL_UUID, guid);
|
||||
res = fpgaPropertiesSetGUID(filter, guid);
|
||||
if (res != FPGA_OK) {
|
||||
fprintf(stderr, "[VXDRV] Error: fpgaPropertiesSetGUID() returned %d, %s!\n", (int)res, fpgaErrStr(res));
|
||||
fpgaDestroyProperties(&filter);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Do the search across the available FPGA contexts
|
||||
num_matches = 1;
|
||||
res = fpgaEnumerate(&filter, 1, &accel_token, 1, &num_matches);
|
||||
if (res != FPGA_OK) {
|
||||
fprintf(stderr, "[VXDRV] Error: fpgaEnumerate() returned %d, %s!\n", (int)res, fpgaErrStr(res));
|
||||
fpgaDestroyProperties(&filter);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Not needed anymore
|
||||
fpgaDestroyProperties(&filter);
|
||||
|
||||
if (num_matches < 1) {
|
||||
fprintf(stderr, "[VXDRV] Error: accelerator %s not found!\n", AFU_ACCEL_UUID);
|
||||
fpgaDestroyToken(&accel_token);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Open accelerator
|
||||
res = fpgaOpen(accel_token, &accel_handle, 0);
|
||||
if (res != FPGA_OK) {
|
||||
fprintf(stderr, "[VXDRV] Error: fpgaOpen() returned %d, %s!\n", (int)res, fpgaErrStr(res));
|
||||
fpgaDestroyToken(&accel_token);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Done with token
|
||||
fpgaDestroyToken(&accel_token);
|
||||
#else
|
||||
// Open accelerator
|
||||
CHECK_RES(fpgaOpen(NULL, &accel_handle, 0));
|
||||
#endif
|
||||
|
||||
// allocate device object
|
||||
device = new vx_device();
|
||||
if (nullptr == device) {
|
||||
fpgaClose(accel_handle);
|
||||
return -1;
|
||||
}
|
||||
|
||||
device->fpga = accel_handle;
|
||||
|
||||
{
|
||||
// Load device CAPS
|
||||
uint64_t dev_caps;
|
||||
int ret = fpgaReadMMIO64(device->fpga, 0, MMIO_DEV_CAPS, &dev_caps);
|
||||
if (ret != FPGA_OK) {
|
||||
fpgaClose(accel_handle);
|
||||
return ret;
|
||||
}
|
||||
device->version = (dev_caps >> 0) & 0xffff;
|
||||
device->num_cores = (dev_caps >> 16) & 0xffff;
|
||||
device->num_warps = (dev_caps >> 32) & 0xffff;
|
||||
device->num_threads = (dev_caps >> 48) & 0xffff;
|
||||
#ifndef NDEBUG
|
||||
fprintf(stdout, "[VXDRV] DEVCAPS: version=%d, num_cores=%d, num_warps=%d, num_threads=%d\n",
|
||||
device->version, device->num_cores, device->num_warps, device->num_threads);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef SCOPE
|
||||
{
|
||||
int ret = vx_scope_start(accel_handle, 0, -1);
|
||||
if (ret != 0) {
|
||||
fpgaClose(accel_handle);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
*hdevice = device;
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
gAutoPerfDump.add_device(*hdevice);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_close(vx_device_h hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
#ifdef SCOPE
|
||||
vx_scope_stop(device->fpga);
|
||||
#endif
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
gAutoPerfDump.remove_device(hdevice);
|
||||
vx_dump_perf(hdevice, stdout);
|
||||
#endif
|
||||
|
||||
fpgaClose(device->fpga);
|
||||
|
||||
delete device;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_maddr
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->mem_allocator.allocate(size, dev_maddr);
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_maddr) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->mem_allocator.release(dev_maddr);
|
||||
}
|
||||
|
||||
extern int vx_buf_alloc(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
fpga_result res;
|
||||
void* host_ptr;
|
||||
uint64_t wsid;
|
||||
uint64_t io_addr;
|
||||
vx_buffer_t* buffer;
|
||||
|
||||
if (nullptr == hdevice
|
||||
|| 0 >= size
|
||||
|| nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
size_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
res = fpgaPrepareBuffer(device->fpga, asize, &host_ptr, &wsid, 0);
|
||||
if (FPGA_OK != res) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Get the physical address of the buffer in the accelerator
|
||||
res = fpgaGetIOAddress(device->fpga, wsid, &io_addr);
|
||||
if (FPGA_OK != res) {
|
||||
fpgaReleaseBuffer(device->fpga, wsid);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// allocate buffer object
|
||||
buffer = (vx_buffer_t*)malloc(sizeof(vx_buffer_t));
|
||||
if (nullptr == buffer) {
|
||||
fpgaReleaseBuffer(device->fpga, wsid);
|
||||
return -1;
|
||||
}
|
||||
|
||||
buffer->wsid = wsid;
|
||||
buffer->host_ptr = host_ptr;
|
||||
buffer->io_addr = io_addr;
|
||||
buffer->hdevice = hdevice;
|
||||
buffer->size = asize;
|
||||
|
||||
*hbuffer = buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern void* vx_host_ptr(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return nullptr;
|
||||
|
||||
vx_buffer_t* buffer = ((vx_buffer_t*)hbuffer);
|
||||
return buffer->host_ptr;
|
||||
}
|
||||
|
||||
extern int vx_buf_free(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
vx_buffer_t* buffer = ((vx_buffer_t*)hbuffer);
|
||||
vx_device *device = ((vx_device*)buffer->hdevice);
|
||||
|
||||
fpgaReleaseBuffer(device->fpga, buffer->wsid);
|
||||
|
||||
free(buffer);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
std::unordered_map<uint32_t, std::stringstream> print_bufs;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
struct timespec sleep_time;
|
||||
|
||||
#if defined(USE_ASE)
|
||||
sleep_time.tv_sec = 1;
|
||||
sleep_time.tv_nsec = 0;
|
||||
#else
|
||||
sleep_time.tv_sec = 0;
|
||||
sleep_time.tv_nsec = 1000000;
|
||||
#endif
|
||||
|
||||
// to milliseconds
|
||||
uint64_t sleep_time_ms = (sleep_time.tv_sec * 1000) + (sleep_time.tv_nsec / 1000000);
|
||||
|
||||
for (;;) {
|
||||
uint64_t status;
|
||||
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status));
|
||||
|
||||
// check for console data
|
||||
uint32_t cout_data = status >> STATUS_STATE_BITS;
|
||||
if (cout_data & 0x1) {
|
||||
// retrieve console data
|
||||
do {
|
||||
char cout_char = (cout_data >> 1) & 0xff;
|
||||
uint32_t cout_tid = (cout_data >> 9) & 0xff;
|
||||
auto& ss_buf = print_bufs[cout_tid];
|
||||
ss_buf << cout_char;
|
||||
if (cout_char == '\n') {
|
||||
std::cout << std::dec << "#" << cout_tid << ": " << ss_buf.str() << std::flush;
|
||||
ss_buf.str("");
|
||||
}
|
||||
CHECK_RES(fpgaReadMMIO64(device->fpga, 0, MMIO_STATUS, &status));
|
||||
cout_data = status >> STATUS_STATE_BITS;
|
||||
} while (cout_data & 0x1);
|
||||
}
|
||||
|
||||
uint32_t state = status & ((1 << STATUS_STATE_BITS)-1);
|
||||
|
||||
if (0 == state || 0 == timeout) {
|
||||
for (auto& buf : print_bufs) {
|
||||
auto str = buf.second.str();
|
||||
if (!str.empty()) {
|
||||
std::cout << "#" << buf.first << ": " << str << std::endl;
|
||||
}
|
||||
}
|
||||
if (state != 0) {
|
||||
fprintf(stdout, "[VXDRV] ready-wait timed out: state=%d\n", state);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
nanosleep(&sleep_time, nullptr);
|
||||
timeout -= sleep_time_ms;
|
||||
};
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer);
|
||||
vx_device *device = ((vx_device*)buffer->hdevice);
|
||||
|
||||
uint64_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
// check alignment
|
||||
if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE))
|
||||
return -1;
|
||||
if (!is_aligned(buffer->io_addr + src_offset, CACHE_BLOCK_SIZE))
|
||||
return -1;
|
||||
|
||||
// bound checking
|
||||
if (src_offset + asize > buffer->size)
|
||||
return -1;
|
||||
if (dev_maddr + asize > dev_mem_size)
|
||||
return -1;
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
|
||||
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_IO_ADDR, (buffer->io_addr + src_offset) >> ls_shift));
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift));
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_DATA_SIZE, asize >> ls_shift));
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_WRITE));
|
||||
|
||||
// Wait for the write operation to finish
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
vx_buffer_t *buffer = ((vx_buffer_t*)hbuffer);
|
||||
vx_device *device = ((vx_device*)buffer->hdevice);
|
||||
|
||||
uint64_t dev_mem_size = LOCAL_MEM_SIZE;
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
|
||||
// check alignment
|
||||
if (!is_aligned(dev_maddr, CACHE_BLOCK_SIZE))
|
||||
return -1;
|
||||
if (!is_aligned(buffer->io_addr + dest_offset, CACHE_BLOCK_SIZE))
|
||||
return -1;
|
||||
|
||||
// bound checking
|
||||
if (dest_offset + asize > buffer->size)
|
||||
return -1;
|
||||
if (dev_maddr + asize > dev_mem_size)
|
||||
return -1;
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
auto ls_shift = (int)std::log2(CACHE_BLOCK_SIZE);
|
||||
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_IO_ADDR, (buffer->io_addr + dest_offset) >> ls_shift));
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_MEM_ADDR, dev_maddr >> ls_shift));
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_DATA_SIZE, asize >> ls_shift));
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_MEM_READ));
|
||||
|
||||
// Wait for the write operation to finish
|
||||
if (vx_ready_wait(buffer->hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
// Ensure ready for new command
|
||||
if (vx_ready_wait(hdevice, MAX_TIMEOUT) != 0)
|
||||
return -1;
|
||||
|
||||
// start execution
|
||||
CHECK_RES(fpgaWriteMMIO64(device->fpga, 0, MMIO_CMD_TYPE, CMD_RUN));
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,250 +0,0 @@
|
|||
#include "vx_scope.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
#include <assert.h>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <VX_config.h>
|
||||
#include <vortex_afu.h>
|
||||
#include <scope-defs.h>
|
||||
|
||||
#define FRAME_FLUSH_SIZE 100
|
||||
|
||||
#define CHECK_RES(_expr) \
|
||||
do { \
|
||||
fpga_result res = _expr; \
|
||||
if (res == FPGA_OK) \
|
||||
break; \
|
||||
printf("OPAE Error: '%s' returned %d, %s!\n", \
|
||||
#_expr, (int)res, fpgaErrStr(res)); \
|
||||
return -1; \
|
||||
} while (false)
|
||||
|
||||
#define MMIO_SCOPE_READ (AFU_IMAGE_MMIO_SCOPE_READ * 4)
|
||||
#define MMIO_SCOPE_WRITE (AFU_IMAGE_MMIO_SCOPE_WRITE * 4)
|
||||
|
||||
#define CMD_GET_VALID 0
|
||||
#define CMD_GET_DATA 1
|
||||
#define CMD_GET_WIDTH 2
|
||||
#define CMD_GET_COUNT 3
|
||||
#define CMD_SET_START 4
|
||||
#define CMD_SET_STOP 5
|
||||
#define CMD_GET_OFFSET 6
|
||||
|
||||
static constexpr int num_modules = sizeof(scope_modules) / sizeof(scope_module_t);
|
||||
|
||||
static constexpr int num_taps = sizeof(scope_taps) / sizeof(scope_tap_t);
|
||||
|
||||
constexpr int calcFrameWidth(int index = 0) {
|
||||
return (index < num_taps) ? (scope_taps[index].width + calcFrameWidth(index + 1)) : 0;
|
||||
}
|
||||
|
||||
static constexpr int fwidth = calcFrameWidth();
|
||||
|
||||
#ifdef HANG_TIMEOUT
|
||||
static std::thread g_timeout_thread;
|
||||
static std::mutex g_timeout_mutex;
|
||||
|
||||
static void timeout_callback(fpga_handle fpga) {
|
||||
std::this_thread::sleep_for(std::chrono::seconds{HANG_TIMEOUT});
|
||||
vx_scope_stop(fpga);
|
||||
fpgaClose(fpga);
|
||||
exit(0);
|
||||
}
|
||||
#endif
|
||||
|
||||
uint64_t print_clock(std::ofstream& ofs, uint64_t delta, uint64_t timestamp) {
|
||||
while (delta != 0) {
|
||||
ofs << '#' << timestamp++ << std::endl;
|
||||
ofs << "b0 0" << std::endl;
|
||||
ofs << '#' << timestamp++ << std::endl;
|
||||
ofs << "b1 0" << std::endl;
|
||||
--delta;
|
||||
}
|
||||
return timestamp;
|
||||
}
|
||||
|
||||
void dump_taps(std::ofstream& ofs, int module) {
|
||||
for (int i = 0; i < num_taps; ++i) {
|
||||
auto& tap = scope_taps[i];
|
||||
if (tap.module != module)
|
||||
continue;
|
||||
ofs << "$var reg " << tap.width << " " << (i + 1) << " " << tap.name << " $end" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
void dump_module(std::ofstream& ofs, int parent) {
|
||||
for (auto& module : scope_modules) {
|
||||
if (module.parent != parent)
|
||||
continue;
|
||||
if (module.name[0] == '*') {
|
||||
ofs << "$var reg 1 0 clk $end" << std::endl;
|
||||
} else {
|
||||
ofs << "$scope module " << module.name << " $end" << std::endl;
|
||||
}
|
||||
dump_module(ofs, module.index);
|
||||
dump_taps(ofs, module.index);
|
||||
if (module.name[0] != '*') {
|
||||
ofs << "$upscope $end" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int vx_scope_start(fpga_handle hfpga, uint64_t start_time, uint64_t stop_time) {
|
||||
if (nullptr == hfpga)
|
||||
return -1;
|
||||
|
||||
if (stop_time != uint64_t(-1)) {
|
||||
// set stop time
|
||||
uint64_t cmd_stop = ((stop_time << 3) | CMD_SET_STOP);
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_stop));
|
||||
std::cout << "scope stop time: " << std::dec << stop_time << "s" << std::endl;
|
||||
}
|
||||
|
||||
// start recording
|
||||
uint64_t cmd_delay = ((start_time << 3) | CMD_SET_START);
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_delay));
|
||||
std::cout << "scope start time: " << std::dec << start_time << "s" << std::endl;
|
||||
|
||||
#ifdef HANG_TIMEOUT
|
||||
g_timeout_thread = std::thread(timeout_callback, hfpga);
|
||||
g_timeout_thread.detach();
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int vx_scope_stop(fpga_handle hfpga) {
|
||||
#ifdef HANG_TIMEOUT
|
||||
if (!g_timeout_mutex.try_lock())
|
||||
return 0;
|
||||
#endif
|
||||
|
||||
if (nullptr == hfpga)
|
||||
return -1;
|
||||
|
||||
// forced stop
|
||||
uint64_t cmd_stop = ((0 << 3) | CMD_SET_STOP);
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, cmd_stop));
|
||||
|
||||
std::cout << "scope trace dump begin..." << std::endl;
|
||||
|
||||
std::ofstream ofs("trace.vcd");
|
||||
|
||||
ofs << "$version Generated by Vortex Scope $end" << std::endl;
|
||||
ofs << "$timescale 1 ns $end" << std::endl;
|
||||
ofs << "$scope module TOP $end" << std::endl;
|
||||
|
||||
dump_module(ofs, -1);
|
||||
dump_taps(ofs, -1);
|
||||
ofs << "$upscope $end" << std::endl;
|
||||
ofs << "enddefinitions $end" << std::endl;
|
||||
|
||||
uint64_t frame_width, max_frames, data_valid, offset, delta;
|
||||
uint64_t timestamp = 0;
|
||||
uint64_t frame_offset = 0;
|
||||
uint64_t frame_no = 0;
|
||||
int signal_id = 0;
|
||||
int signal_offset = 0;
|
||||
|
||||
// wait for recording to terminate
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID));
|
||||
do {
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
|
||||
if (data_valid)
|
||||
break;
|
||||
std::this_thread::sleep_for(std::chrono::seconds(1));
|
||||
} while (true);
|
||||
|
||||
// get frame width
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_WIDTH));
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &frame_width));
|
||||
std::cout << "scope::frame_width=" << std::dec << frame_width << std::endl;
|
||||
|
||||
if (fwidth != (int)frame_width) {
|
||||
std::cerr << "invalid frame_width: expecting " << std::dec << fwidth << "!" << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
|
||||
// get max frames
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_COUNT));
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &max_frames));
|
||||
std::cout << "scope::max_frames=" << std::dec << max_frames << std::endl;
|
||||
|
||||
// get offset
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_OFFSET));
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &offset));
|
||||
|
||||
// get data
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_DATA));
|
||||
|
||||
// print clock header
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta));
|
||||
timestamp = print_clock(ofs, offset + delta + 2, timestamp);
|
||||
signal_id = num_taps;
|
||||
|
||||
std::vector<char> signal_data(frame_width+1);
|
||||
|
||||
do {
|
||||
if (frame_no == (max_frames-1)) {
|
||||
// verify last frame is valid
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID));
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
|
||||
assert(data_valid == 1);
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_DATA));
|
||||
}
|
||||
|
||||
// read next data words
|
||||
uint64_t word;
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &word));
|
||||
|
||||
do {
|
||||
int signal_width = scope_taps[signal_id-1].width;
|
||||
int word_offset = frame_offset % 64;
|
||||
|
||||
signal_data[signal_width - signal_offset - 1] = ((word >> word_offset) & 0x1) ? '1' : '0';
|
||||
|
||||
++signal_offset;
|
||||
++frame_offset;
|
||||
|
||||
if (signal_offset == signal_width) {
|
||||
signal_data[signal_width] = 0; // string null termination
|
||||
ofs << 'b' << signal_data.data() << ' ' << signal_id << std::endl;
|
||||
signal_offset = 0;
|
||||
--signal_id;
|
||||
}
|
||||
|
||||
if (frame_offset == frame_width) {
|
||||
assert(0 == signal_offset);
|
||||
frame_offset = 0;
|
||||
++frame_no;
|
||||
|
||||
if (frame_no != max_frames) {
|
||||
// print clock header
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &delta));
|
||||
timestamp = print_clock(ofs, delta + 1, timestamp);
|
||||
signal_id = num_taps;
|
||||
if (0 == (frame_no % FRAME_FLUSH_SIZE)) {
|
||||
ofs << std::flush;
|
||||
std::cout << "*** " << frame_no << "/" << max_frames << " frames" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} while ((frame_offset % 64) != 0);
|
||||
|
||||
} while (frame_no != max_frames);
|
||||
|
||||
std::cout << "scope trace dump done! - " << (timestamp/2) << " cycles" << std::endl;
|
||||
|
||||
// verify data not valid
|
||||
CHECK_RES(fpgaWriteMMIO64(hfpga, 0, MMIO_SCOPE_WRITE, CMD_GET_VALID));
|
||||
CHECK_RES(fpgaReadMMIO64(hfpga, 0, MMIO_SCOPE_READ, &data_valid));
|
||||
assert(data_valid == 0);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,19 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef USE_VLSIM
|
||||
#include <fpga.h>
|
||||
#else
|
||||
#include <opae/fpga.h>
|
||||
#endif
|
||||
|
||||
#if defined(USE_FPGA)
|
||||
#define HANG_TIMEOUT 60
|
||||
#else
|
||||
#define HANG_TIMEOUT (30*60)
|
||||
#endif
|
||||
|
||||
int vx_scope_start(fpga_handle hfpga, uint64_t start_time = 0, uint64_t stop_time = -1);
|
||||
|
||||
int vx_scope_stop(fpga_handle hfpga);
|
|
@ -1,356 +0,0 @@
|
|||
#include "vx_utils.h"
|
||||
#include <iostream>
|
||||
#include <fstream>
|
||||
#include <cstring>
|
||||
#include <vortex.h>
|
||||
#include <VX_config.h>
|
||||
#include <assert.h>
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return (size + alignment - 1) & ~(alignment - 1);
|
||||
}
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment) {
|
||||
assert(0 == (alignment & (alignment - 1)));
|
||||
return 0 == (addr & (alignment - 1));
|
||||
}
|
||||
|
||||
extern int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size) {
|
||||
int err = 0;
|
||||
|
||||
if (NULL == content || 0 == size)
|
||||
return -1;
|
||||
|
||||
uint32_t buffer_transfer_size = 65536; // 64 KB
|
||||
uint64_t kernel_base_addr;
|
||||
err = vx_dev_caps(device, VX_CAPS_KERNEL_BASE_ADDR, &kernel_base_addr);
|
||||
if (err != 0)
|
||||
return -1;
|
||||
|
||||
// allocate device buffer
|
||||
vx_buffer_h buffer;
|
||||
err = vx_buf_alloc(device, buffer_transfer_size, &buffer);
|
||||
if (err != 0)
|
||||
return -1;
|
||||
|
||||
// get buffer address
|
||||
auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
|
||||
|
||||
//
|
||||
// upload content
|
||||
//
|
||||
|
||||
uint64_t offset = 0;
|
||||
while (offset < size) {
|
||||
auto chunk_size = std::min<uint64_t>(buffer_transfer_size, size - offset);
|
||||
std::memcpy(buf_ptr, (uint8_t*)content + offset, chunk_size);
|
||||
|
||||
/*printf("*** Upload Kernel to 0x%0x: data=", kernel_base_addr + offset);
|
||||
for (int i = 0, n = ((chunk_size+7)/8); i < n; ++i) {
|
||||
printf("%08x", ((uint64_t*)((uint8_t*)content + offset))[n-1-i]);
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
err = vx_copy_to_dev(buffer, kernel_base_addr + offset, chunk_size, 0);
|
||||
if (err != 0) {
|
||||
vx_buf_free(buffer);
|
||||
return err;
|
||||
}
|
||||
offset += chunk_size;
|
||||
}
|
||||
|
||||
vx_buf_free(buffer);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
|
||||
std::ifstream ifs(filename);
|
||||
if (!ifs) {
|
||||
std::cout << "error: " << filename << " not found" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// read file content
|
||||
ifs.seekg(0, ifs.end);
|
||||
auto size = ifs.tellg();
|
||||
auto content = new char [size];
|
||||
ifs.seekg(0, ifs.beg);
|
||||
ifs.read(content, size);
|
||||
|
||||
// upload
|
||||
int err = vx_upload_kernel_bytes(device, content, size);
|
||||
|
||||
// release buffer
|
||||
delete[] content;
|
||||
|
||||
return err;
|
||||
}
|
||||
|
||||
/*static uint32_t get_csr_32(const uint32_t* buffer, int addr) {
|
||||
uint32_t value_lo = buffer[addr - CSR_MPM_BASE];
|
||||
return value_lo;
|
||||
}*/
|
||||
|
||||
static uint64_t get_csr_64(const uint32_t* buffer, int addr) {
|
||||
uint32_t value_lo = buffer[addr - CSR_MPM_BASE];
|
||||
uint32_t value_hi = buffer[addr - CSR_MPM_BASE + 32];
|
||||
return (uint64_t(value_hi) << 32) | value_lo;
|
||||
}
|
||||
|
||||
extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
||||
int ret = 0;
|
||||
|
||||
uint64_t instrs = 0;
|
||||
uint64_t cycles = 0;
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
// PERF: pipeline stalls
|
||||
uint64_t ibuffer_stalls = 0;
|
||||
uint64_t scoreboard_stalls = 0;
|
||||
uint64_t lsu_stalls = 0;
|
||||
uint64_t fpu_stalls = 0;
|
||||
uint64_t csr_stalls = 0;
|
||||
uint64_t alu_stalls = 0;
|
||||
uint64_t gpu_stalls = 0;
|
||||
// PERF: decode
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
uint64_t branches = 0;
|
||||
// PERF: Icache
|
||||
uint64_t icache_reads = 0;
|
||||
uint64_t icache_read_misses = 0;
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads = 0;
|
||||
uint64_t dcache_writes = 0;
|
||||
uint64_t dcache_read_misses = 0;
|
||||
uint64_t dcache_write_misses = 0;
|
||||
uint64_t dcache_bank_stalls = 0;
|
||||
uint64_t dcache_mshr_stalls = 0;
|
||||
// PERF: shared memory
|
||||
uint64_t smem_reads = 0;
|
||||
uint64_t smem_writes = 0;
|
||||
uint64_t smem_bank_stalls = 0;
|
||||
// PERF: memory
|
||||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// PERF: texunit
|
||||
uint64_t tex_mem_reads = 0;
|
||||
uint64_t tex_mem_lat = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
uint64_t num_cores;
|
||||
ret = vx_dev_caps(device, VX_CAPS_MAX_CORES, &num_cores);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
vx_buffer_h staging_buf;
|
||||
ret = vx_buf_alloc(device, 64 * sizeof(uint32_t), &staging_buf);
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
|
||||
auto staging_ptr = (uint32_t*)vx_host_ptr(staging_buf);
|
||||
|
||||
for (unsigned core_id = 0; core_id < num_cores; ++core_id) {
|
||||
ret = vx_copy_from_dev(staging_buf, IO_CSR_ADDR + 64 * sizeof(uint32_t) * core_id, 64 * sizeof(uint32_t), 0);
|
||||
if (ret != 0) {
|
||||
vx_buf_free(staging_buf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
uint64_t instrs_per_core = get_csr_64(staging_ptr, CSR_MINSTRET);
|
||||
uint64_t cycles_per_core = get_csr_64(staging_ptr, CSR_MCYCLE);
|
||||
float IPC = (float)(double(instrs_per_core) / double(cycles_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: instrs=%ld, cycles=%ld, IPC=%f\n", core_id, instrs_per_core, cycles_per_core, IPC);
|
||||
instrs += instrs_per_core;
|
||||
cycles = std::max<uint64_t>(cycles_per_core, cycles);
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
// PERF: pipeline
|
||||
// ibuffer_stall
|
||||
uint64_t ibuffer_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_IBUF_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: ibuffer stalls=%ld\n", core_id, ibuffer_stalls_per_core);
|
||||
ibuffer_stalls += ibuffer_stalls_per_core;
|
||||
// scoreboard_stall
|
||||
uint64_t scoreboard_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_SCRB_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: scoreboard stalls=%ld\n", core_id, scoreboard_stalls_per_core);
|
||||
scoreboard_stalls += scoreboard_stalls_per_core;
|
||||
// alu_stall
|
||||
uint64_t alu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_ALU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: alu unit stalls=%ld\n", core_id, alu_stalls_per_core);
|
||||
alu_stalls += alu_stalls_per_core;
|
||||
// lsu_stall
|
||||
uint64_t lsu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_LSU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: lsu unit stalls=%ld\n", core_id, lsu_stalls_per_core);
|
||||
lsu_stalls += lsu_stalls_per_core;
|
||||
// csr_stall
|
||||
uint64_t csr_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_CSR_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: csr unit stalls=%ld\n", core_id, csr_stalls_per_core);
|
||||
csr_stalls += csr_stalls_per_core;
|
||||
// fpu_stall
|
||||
uint64_t fpu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_FPU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: fpu unit stalls=%ld\n", core_id, fpu_stalls_per_core);
|
||||
fpu_stalls += fpu_stalls_per_core;
|
||||
// gpu_stall
|
||||
uint64_t gpu_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_GPU_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core);
|
||||
gpu_stalls += gpu_stalls_per_core;
|
||||
|
||||
// PERF: decode
|
||||
// loads
|
||||
uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
// stores
|
||||
uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
// branches
|
||||
uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: branches=%ld\n", core_id, branches_per_core);
|
||||
branches += branches_per_core;
|
||||
|
||||
// PERF: Icache
|
||||
// total reads
|
||||
uint64_t icache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reads=%ld\n", core_id, icache_reads_per_core);
|
||||
icache_reads += icache_reads_per_core;
|
||||
// read misses
|
||||
uint64_t icache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R);
|
||||
int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio);
|
||||
icache_read_misses += icache_miss_r_per_core;
|
||||
|
||||
// PERF: Dcache
|
||||
// total reads
|
||||
uint64_t dcache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_READS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reads=%ld\n", core_id, dcache_reads_per_core);
|
||||
dcache_reads += dcache_reads_per_core;
|
||||
// total write
|
||||
uint64_t dcache_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_WRITES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache writes=%ld\n", core_id, dcache_writes_per_core);
|
||||
dcache_writes += dcache_writes_per_core;
|
||||
// read misses
|
||||
uint64_t dcache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_R);
|
||||
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_miss_r_per_core) / double(dcache_reads_per_core))) * 100);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache read misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_r_per_core, dcache_read_hit_ratio);
|
||||
dcache_read_misses += dcache_miss_r_per_core;
|
||||
// read misses
|
||||
uint64_t dcache_miss_w_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MISS_W);
|
||||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_miss_w_per_core) / double(dcache_writes_per_core))) * 100);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache write misses=%ld (hit ratio=%d%%)\n", core_id, dcache_miss_w_per_core, dcache_write_hit_ratio);
|
||||
dcache_write_misses += dcache_miss_w_per_core;
|
||||
// bank_stalls
|
||||
uint64_t dcache_bank_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_BANK_ST);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads_per_core + dcache_writes_per_core) / double(dcache_reads_per_core + dcache_writes_per_core + dcache_bank_st_per_core)) * 100);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache bank stalls=%ld (utilization=%d%%)\n", core_id, dcache_bank_st_per_core, dcache_bank_utilization);
|
||||
dcache_bank_stalls += dcache_bank_st_per_core;
|
||||
// mshr_stalls
|
||||
uint64_t dcache_mshr_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core);
|
||||
dcache_mshr_stalls += dcache_mshr_st_per_core;
|
||||
|
||||
// PERF: SMEM
|
||||
// total reads
|
||||
uint64_t smem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_SMEM_READS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem reads=%ld\n", core_id, smem_reads_per_core);
|
||||
smem_reads += smem_reads_per_core;
|
||||
// total write
|
||||
uint64_t smem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_SMEM_WRITES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem writes=%ld\n", core_id, smem_writes_per_core);
|
||||
smem_writes += smem_writes_per_core;
|
||||
// bank_stalls
|
||||
uint64_t smem_bank_st_per_core = get_csr_64(staging_ptr, CSR_MPM_SMEM_BANK_ST);
|
||||
int smem_bank_utilization = (int)((double(smem_reads_per_core + smem_writes_per_core) / double(smem_reads_per_core + smem_writes_per_core + smem_bank_st_per_core)) * 100);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: smem bank stalls=%ld (utilization=%d%%)\n", core_id, smem_bank_st_per_core, smem_bank_utilization);
|
||||
smem_bank_stalls += smem_bank_st_per_core;
|
||||
|
||||
// PERF: memory
|
||||
uint64_t mem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_READS);
|
||||
uint64_t mem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES);
|
||||
uint64_t mem_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT);
|
||||
int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
mem_reads += mem_reads_per_core;
|
||||
mem_writes += mem_writes_per_core;
|
||||
mem_lat += mem_lat_per_core;
|
||||
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// total reads
|
||||
uint64_t tex_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_READS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory reads=%ld\n", core_id, tex_reads_per_core);
|
||||
tex_mem_reads += tex_reads_per_core;
|
||||
|
||||
// read latency
|
||||
uint64_t tex_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_LAT);
|
||||
int tex_avg_lat = (int)(double(tex_lat_per_core) / double(tex_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory latency=%d cycles\n", core_id, tex_avg_lat);
|
||||
tex_mem_lat += tex_lat_per_core;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
float IPC = (float)(double(instrs) / double(cycles));
|
||||
fprintf(stream, "PERF: instrs=%ld, cycles=%ld, IPC=%f\n", instrs, cycles, IPC);
|
||||
|
||||
#ifdef PERF_ENABLE
|
||||
int icache_read_hit_ratio = (int)((1.0 - (double(icache_read_misses) / double(icache_reads))) * 100);
|
||||
int dcache_read_hit_ratio = (int)((1.0 - (double(dcache_read_misses) / double(dcache_reads))) * 100);
|
||||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
|
||||
fprintf(stream, "PERF: alu unit stalls=%ld\n", alu_stalls);
|
||||
fprintf(stream, "PERF: lsu unit stalls=%ld\n", lsu_stalls);
|
||||
fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls);
|
||||
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
|
||||
fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
fprintf(stream, "PERF: branches=%ld\n", branches);
|
||||
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
|
||||
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
|
||||
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
|
||||
fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
|
||||
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
|
||||
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
|
||||
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
|
||||
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads));
|
||||
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
|
||||
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// release allocated resources
|
||||
vx_buf_free(staging_buf);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Deprecated API functions
|
||||
|
||||
extern int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
return vx_buf_alloc(hdevice, size, hbuffer);
|
||||
}
|
||||
|
||||
extern int vx_buf_release(vx_buffer_h hbuffer) {
|
||||
return vx_buf_free(hbuffer);
|
||||
}
|
||||
|
||||
extern int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) {
|
||||
return vx_mem_alloc(hdevice, size, dev_maddr);
|
||||
}
|
|
@ -1,11 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
uint64_t aligned_size(uint64_t size, uint64_t alignment);
|
||||
|
||||
bool is_aligned(uint64_t addr, uint64_t alignment);
|
||||
|
||||
#define CACHE_BLOCK_SIZE 64
|
||||
#define ALLOC_BASE_ADDR 0x00000000
|
||||
#define LOCAL_MEM_SIZE 4294967296 // 4 GB
|
|
@ -1,75 +0,0 @@
|
|||
OPAE_HOME ?= /tools/opae/1.4.0
|
||||
|
||||
RTL_DIR=../../hw/rtl
|
||||
|
||||
SCRIPT_DIR=../../hw/scripts
|
||||
|
||||
OPAE_SYN_DIR=../../hw/syn/opae
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I. -I../include -I../../hw -I$(OPAE_HOME)/include -I$(OPAE_SYN_DIR)
|
||||
|
||||
LDFLAGS += -L$(OPAE_HOME)/lib -luuid -lopae-c
|
||||
|
||||
#SCOPE=1
|
||||
|
||||
# stack execution protection
|
||||
LDFLAGS +=-z noexecstack
|
||||
|
||||
# data relocation and projection
|
||||
LDFLAGS +=-z relro -z now
|
||||
|
||||
# stack buffer overrun detection
|
||||
CXXFLAGS +=-fstack-protector
|
||||
|
||||
# Position independent code
|
||||
CXXFLAGS += -fPIC
|
||||
|
||||
# Add external configuration
|
||||
CXXFLAGS += $(CONFIGS)
|
||||
|
||||
# Dump perf stats
|
||||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared
|
||||
|
||||
PROJECT = libvortex.so
|
||||
|
||||
SRCS = ../common/opae.cpp ../common/vx_utils.cpp
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable scope analyzer
|
||||
ifdef SCOPE
|
||||
CXXFLAGS += -DSCOPE
|
||||
SRCS += ../common/vx_scope.cpp
|
||||
SCOPE_H = scope-defs.h
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
all: $(PROJECT)
|
||||
|
||||
$(OPAE_SYN_DIR)/vortex_afu.h:
|
||||
$(MAKE) -C $(OPAE_SYN_DIR) vortex_afu.h
|
||||
|
||||
scope-defs.h: $(SCRIPT_DIR)/scope.json
|
||||
$(SCRIPT_DIR)/scope.py $(CONFIGS) -cc scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json
|
||||
|
||||
# generate scope data
|
||||
scope: scope-defs.h
|
||||
|
||||
$(PROJECT): $(SRCS) $(OPAE_SYN_DIR)/vortex_afu.h $(SCOPE_H)
|
||||
$(CXX) $(CXXFLAGS) -DUSE_FPGA $^ $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o scope-defs.h
|
|
@ -1,84 +0,0 @@
|
|||
#ifndef __VX_DRIVER_H__
|
||||
#define __VX_DRIVER_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef void* vx_device_h;
|
||||
|
||||
typedef void* vx_buffer_h;
|
||||
|
||||
// device caps ids
|
||||
#define VX_CAPS_VERSION 0x0
|
||||
#define VX_CAPS_MAX_CORES 0x1
|
||||
#define VX_CAPS_MAX_WARPS 0x2
|
||||
#define VX_CAPS_MAX_THREADS 0x3
|
||||
#define VX_CAPS_CACHE_LINE_SIZE 0x4
|
||||
#define VX_CAPS_LOCAL_MEM_SIZE 0x5
|
||||
#define VX_CAPS_ALLOC_BASE_ADDR 0x6
|
||||
#define VX_CAPS_KERNEL_BASE_ADDR 0x7
|
||||
|
||||
#define MAX_TIMEOUT (60*60*1000) // 1hr
|
||||
|
||||
// open the device and connect to it
|
||||
int vx_dev_open(vx_device_h* hdevice);
|
||||
|
||||
// Close the device when all the operations are done
|
||||
int vx_dev_close(vx_device_h hdevice);
|
||||
|
||||
// return device configurations
|
||||
int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value);
|
||||
|
||||
// Allocate shared buffer with device
|
||||
int vx_buf_alloc(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer);
|
||||
|
||||
// release buffer
|
||||
int vx_buf_free(vx_buffer_h hbuffer);
|
||||
|
||||
// Get host pointer address
|
||||
void* vx_host_ptr(vx_buffer_h hbuffer);
|
||||
|
||||
// allocate device memory and return address
|
||||
int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr);
|
||||
|
||||
// release device memory
|
||||
int vx_mem_free(vx_device_h hdevice, uint64_t dev_maddr);
|
||||
|
||||
// Copy bytes from buffer to device local memory
|
||||
int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset);
|
||||
|
||||
// Copy bytes from device local memory to buffer
|
||||
int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dst_offset);
|
||||
|
||||
// Start device execution
|
||||
int vx_start(vx_device_h hdevice);
|
||||
|
||||
// Wait for device ready with milliseconds timeout
|
||||
int vx_ready_wait(vx_device_h hdevice, uint64_t timeout);
|
||||
|
||||
////////////////////////////// UTILITY FUNCIONS ///////////////////////////////
|
||||
|
||||
// upload kernel bytes to device
|
||||
int vx_upload_kernel_bytes(vx_device_h device, const void* content, uint64_t size);
|
||||
|
||||
// upload kernel file to device
|
||||
int vx_upload_kernel_file(vx_device_h device, const char* filename);
|
||||
|
||||
// dump performance counters
|
||||
int vx_dump_perf(vx_device_h device, FILE* stream);
|
||||
|
||||
//////////////////////////// DEPRECATED FUNCTIONS /////////////////////////////
|
||||
int vx_alloc_dev_mem(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr);
|
||||
int vx_alloc_shared_mem(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer);
|
||||
int vx_buf_release(vx_buffer_h hbuffer);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // __VX_DRIVER_H__
|
|
@ -1,355 +0,0 @@
|
|||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <future>
|
||||
#include <list>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <vx_malloc.h>
|
||||
#include <vx_utils.h>
|
||||
#include <VX_config.h>
|
||||
#include <mem.h>
|
||||
#include <util.h>
|
||||
#include <processor.h>
|
||||
|
||||
#define RAM_PAGE_SIZE 4096
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device;
|
||||
class vx_buffer {
|
||||
public:
|
||||
vx_buffer(uint64_t size, vx_device* device)
|
||||
: size_(size)
|
||||
, device_(device) {
|
||||
auto aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
data_ = malloc(aligned_asize);
|
||||
}
|
||||
|
||||
~vx_buffer() {
|
||||
if (data_) {
|
||||
free(data_);
|
||||
}
|
||||
}
|
||||
|
||||
void* data() const {
|
||||
return data_;
|
||||
}
|
||||
|
||||
uint64_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
vx_device* device() const {
|
||||
return device_;
|
||||
}
|
||||
|
||||
private:
|
||||
uint64_t size_;
|
||||
vx_device* device_;
|
||||
void* data_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: ram_(RAM_PAGE_SIZE)
|
||||
, mem_allocator_(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_BASE_ADDR + LOCAL_MEM_SIZE,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
{
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
|
||||
int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) {
|
||||
return mem_allocator_.allocate(size, dev_maddr);
|
||||
}
|
||||
|
||||
int free_local_mem(uint64_t dev_maddr) {
|
||||
return mem_allocator_.release(dev_maddr);
|
||||
}
|
||||
|
||||
int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
/*printf("VXDRV: upload %ld bytes from 0x%lx:", size, uintptr_t((uint8_t*)src + src_offset));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", dest_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)src + src_offset + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
ram_.write((const uint8_t*)src + src_offset, dest_addr, asize);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
|
||||
|
||||
/*printf("VXDRV: download %ld bytes to 0x%lx:", size, uintptr_t((uint8_t*)dest + dest_offset));
|
||||
for (int i = 0; i < (asize / CACHE_BLOCK_SIZE); ++i) {
|
||||
printf("\n0x%08lx=", src_addr + i * CACHE_BLOCK_SIZE);
|
||||
for (int j = 0; j < CACHE_BLOCK_SIZE; ++j) {
|
||||
printf("%02x", *((uint8_t*)dest + dest_offset + i * CACHE_BLOCK_SIZE + CACHE_BLOCK_SIZE - 1 - j));
|
||||
}
|
||||
}
|
||||
printf("\n");*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start() {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
RAM ram_;
|
||||
Processor processor_;
|
||||
MemoryAllocator mem_allocator_;
|
||||
std::future<void> future_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
class AutoPerfDump {
|
||||
private:
|
||||
std::list<vx_device_h> devices_;
|
||||
|
||||
public:
|
||||
AutoPerfDump() {}
|
||||
|
||||
~AutoPerfDump() {
|
||||
for (auto device : devices_) {
|
||||
vx_dump_perf(device, stdout);
|
||||
}
|
||||
}
|
||||
|
||||
void add_device(vx_device_h device) {
|
||||
devices_.push_back(device);
|
||||
}
|
||||
|
||||
void remove_device(vx_device_h device) {
|
||||
devices_.remove(device);
|
||||
}
|
||||
};
|
||||
|
||||
AutoPerfDump gAutoPerfDump;
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
*value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_MAX_CORES:
|
||||
*value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_MAX_WARPS:
|
||||
*value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_MAX_THREADS:
|
||||
*value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
*value = LOCAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_ALLOC_BASE_ADDR:
|
||||
*value = ALLOC_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_KERNEL_BASE_ADDR:
|
||||
*value = STARTUP_ADDR;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_open(vx_device_h* hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
*hdevice = new vx_device();
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
gAutoPerfDump.add_device(*hdevice);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_close(vx_device_h hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
gAutoPerfDump.remove_device(hdevice);
|
||||
vx_dump_perf(hdevice, stdout);
|
||||
#endif
|
||||
|
||||
delete device;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_maddr
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->alloc_local_mem(size, dev_maddr);
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_maddr) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->free_local_mem(dev_maddr);
|
||||
}
|
||||
|
||||
extern int vx_buf_alloc(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| 0 >= size
|
||||
|| nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
auto buffer = new vx_buffer(size, device);
|
||||
if (nullptr == buffer->data()) {
|
||||
delete buffer;
|
||||
return -1;
|
||||
}
|
||||
|
||||
*hbuffer = buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern void* vx_host_ptr(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return nullptr;
|
||||
|
||||
vx_buffer* buffer = ((vx_buffer*)hbuffer);
|
||||
|
||||
return buffer->data();
|
||||
}
|
||||
|
||||
extern int vx_buf_free(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
vx_buffer* buffer = ((vx_buffer*)hbuffer);
|
||||
|
||||
delete buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
auto buffer = (vx_buffer*)hbuffer;
|
||||
|
||||
if (size + src_offset > buffer->size())
|
||||
return -1;
|
||||
|
||||
return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
auto buffer = (vx_buffer*)hbuffer;
|
||||
|
||||
if (size + dest_offset > buffer->size())
|
||||
return -1;
|
||||
|
||||
return buffer->device()->download(buffer->data(), dev_maddr, size, dest_offset);
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
return device->start();
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
return device->wait(timeout);
|
||||
}
|
|
@ -1,357 +0,0 @@
|
|||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <iostream>
|
||||
#include <future>
|
||||
#include <chrono>
|
||||
|
||||
#include <vortex.h>
|
||||
#include <vx_utils.h>
|
||||
#include <vx_malloc.h>
|
||||
|
||||
#include <VX_config.h>
|
||||
|
||||
#include <util.h>
|
||||
|
||||
#include <processor.h>
|
||||
#include <archdef.h>
|
||||
#include <mem.h>
|
||||
#include <constants.h>
|
||||
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device;
|
||||
|
||||
class vx_buffer {
|
||||
public:
|
||||
vx_buffer(uint64_t size, vx_device* device)
|
||||
: size_(size)
|
||||
, device_(device) {
|
||||
uint64_t aligned_asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
data_ = malloc(aligned_asize);
|
||||
}
|
||||
|
||||
~vx_buffer() {
|
||||
if (data_) {
|
||||
free(data_);
|
||||
}
|
||||
}
|
||||
|
||||
void* data() const {
|
||||
return data_;
|
||||
}
|
||||
|
||||
uint64_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
vx_device* device() const {
|
||||
return device_;
|
||||
}
|
||||
|
||||
private:
|
||||
uint64_t size_;
|
||||
vx_device* device_;
|
||||
void* data_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class vx_device {
|
||||
public:
|
||||
vx_device()
|
||||
: arch_(NUM_CORES * NUM_CLUSTERS, NUM_WARPS, NUM_THREADS)
|
||||
, ram_(RAM_PAGE_SIZE)
|
||||
, processor_(arch_)
|
||||
, mem_allocator_(
|
||||
ALLOC_BASE_ADDR,
|
||||
ALLOC_BASE_ADDR + LOCAL_MEM_SIZE,
|
||||
RAM_PAGE_SIZE,
|
||||
CACHE_BLOCK_SIZE)
|
||||
{
|
||||
// attach memory module
|
||||
processor_.attach_ram(&ram_);
|
||||
}
|
||||
|
||||
~vx_device() {
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
}
|
||||
|
||||
int alloc_local_mem(uint64_t size, uint64_t* dev_maddr) {
|
||||
return mem_allocator_.allocate(size, dev_maddr);
|
||||
}
|
||||
|
||||
int free_local_mem(uint64_t dev_maddr) {
|
||||
return mem_allocator_.release(dev_maddr);
|
||||
}
|
||||
|
||||
int upload(const void* src, uint64_t dest_addr, uint64_t size, uint64_t src_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (dest_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.write((const uint8_t*)src + src_offset, dest_addr, asize);
|
||||
|
||||
/*printf("VXDRV: upload %d bytes to 0x%x\n", size, dest_addr);
|
||||
for (int i = 0; i < size; i += 4) {
|
||||
printf("mem-write: 0x%x <- 0x%x\n", dest_addr + i, *(uint32_t*)((uint8_t*)src + src_offset + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int download(void* dest, uint64_t src_addr, uint64_t size, uint64_t dest_offset) {
|
||||
uint64_t asize = aligned_size(size, CACHE_BLOCK_SIZE);
|
||||
if (src_addr + asize > LOCAL_MEM_SIZE)
|
||||
return -1;
|
||||
|
||||
ram_.read((uint8_t*)dest + dest_offset, src_addr, asize);
|
||||
|
||||
/*printf("VXDRV: download %d bytes from 0x%x\n", size, src_addr);
|
||||
for (int i = 0; i < size; i += 4) {
|
||||
printf("mem-read: 0x%x -> 0x%x\n", src_addr + i, *(uint32_t*)((uint8_t*)dest + dest_offset + i));
|
||||
}*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int start() {
|
||||
// ensure prior run completed
|
||||
if (future_.valid()) {
|
||||
future_.wait();
|
||||
}
|
||||
|
||||
// start new run
|
||||
future_ = std::async(std::launch::async, [&]{
|
||||
processor_.run();
|
||||
});
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int wait(uint64_t timeout) {
|
||||
if (!future_.valid())
|
||||
return 0;
|
||||
uint64_t timeout_sec = timeout / 1000;
|
||||
std::chrono::seconds wait_time(1);
|
||||
for (;;) {
|
||||
// wait for 1 sec and check status
|
||||
auto status = future_.wait_for(wait_time);
|
||||
if (status == std::future_status::ready
|
||||
|| 0 == timeout_sec--)
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
private:
|
||||
ArchDef arch_;
|
||||
RAM ram_;
|
||||
Processor processor_;
|
||||
MemoryAllocator mem_allocator_;
|
||||
std::future<void> future_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
class AutoPerfDump {
|
||||
private:
|
||||
std::list<vx_device_h> devices_;
|
||||
|
||||
public:
|
||||
AutoPerfDump() {}
|
||||
|
||||
~AutoPerfDump() {
|
||||
for (auto device : devices_) {
|
||||
vx_dump_perf(device, stdout);
|
||||
}
|
||||
}
|
||||
|
||||
void add_device(vx_device_h device) {
|
||||
devices_.push_back(device);
|
||||
}
|
||||
|
||||
void remove_device(vx_device_h device) {
|
||||
devices_.remove(device);
|
||||
}
|
||||
};
|
||||
|
||||
AutoPerfDump gAutoPerfDump;
|
||||
#endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
extern int vx_dev_open(vx_device_h* hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
*hdevice = new vx_device();
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
gAutoPerfDump.add_device(*hdevice);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_close(vx_device_h hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
#ifdef DUMP_PERF_STATS
|
||||
gAutoPerfDump.remove_device(hdevice);
|
||||
vx_dump_perf(hdevice, stdout);
|
||||
#endif
|
||||
|
||||
delete device;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_dev_caps(vx_device_h hdevice, uint32_t caps_id, uint64_t *value) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
switch (caps_id) {
|
||||
case VX_CAPS_VERSION:
|
||||
*value = IMPLEMENTATION_ID;
|
||||
break;
|
||||
case VX_CAPS_MAX_CORES:
|
||||
*value = NUM_CORES * NUM_CLUSTERS;
|
||||
break;
|
||||
case VX_CAPS_MAX_WARPS:
|
||||
*value = NUM_WARPS;
|
||||
break;
|
||||
case VX_CAPS_MAX_THREADS:
|
||||
*value = NUM_THREADS;
|
||||
break;
|
||||
case VX_CAPS_CACHE_LINE_SIZE:
|
||||
*value = CACHE_BLOCK_SIZE;
|
||||
break;
|
||||
case VX_CAPS_LOCAL_MEM_SIZE:
|
||||
*value = LOCAL_MEM_SIZE;
|
||||
break;
|
||||
case VX_CAPS_ALLOC_BASE_ADDR:
|
||||
*value = ALLOC_BASE_ADDR;
|
||||
break;
|
||||
case VX_CAPS_KERNEL_BASE_ADDR:
|
||||
*value = STARTUP_ADDR;
|
||||
break;
|
||||
default:
|
||||
std::cout << "invalid caps id: " << caps_id << std::endl;
|
||||
std::abort();
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h hdevice, uint64_t size, uint64_t* dev_maddr) {
|
||||
if (nullptr == hdevice
|
||||
|| nullptr == dev_maddr
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->alloc_local_mem(size, dev_maddr);
|
||||
}
|
||||
|
||||
extern int vx_mem_free(vx_device_h hdevice, uint64_t dev_maddr) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
return device->free_local_mem(dev_maddr);
|
||||
}
|
||||
|
||||
extern int vx_buf_alloc(vx_device_h hdevice, uint64_t size, vx_buffer_h* hbuffer) {
|
||||
if (nullptr == hdevice
|
||||
|| 0 >= size
|
||||
|| nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
auto buffer = new vx_buffer(size, device);
|
||||
if (nullptr == buffer->data()) {
|
||||
delete buffer;
|
||||
return -1;
|
||||
}
|
||||
|
||||
*hbuffer = buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern void* vx_host_ptr(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return nullptr;
|
||||
|
||||
vx_buffer* buffer = ((vx_buffer*)hbuffer);
|
||||
|
||||
return buffer->data();
|
||||
}
|
||||
|
||||
extern int vx_buf_free(vx_buffer_h hbuffer) {
|
||||
if (nullptr == hbuffer)
|
||||
return -1;
|
||||
|
||||
vx_buffer* buffer = ((vx_buffer*)hbuffer);
|
||||
|
||||
delete buffer;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t src_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
auto buffer = (vx_buffer*)hbuffer;
|
||||
|
||||
if (size + src_offset > buffer->size())
|
||||
return -1;
|
||||
|
||||
return buffer->device()->upload(buffer->data(), dev_maddr, size, src_offset);
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h hbuffer, uint64_t dev_maddr, uint64_t size, uint64_t dest_offset) {
|
||||
if (nullptr == hbuffer
|
||||
|| 0 >= size)
|
||||
return -1;
|
||||
|
||||
auto buffer = (vx_buffer*)hbuffer;
|
||||
|
||||
if (size + dest_offset > buffer->size())
|
||||
return -1;
|
||||
|
||||
return buffer->device()->download(buffer->data(), dev_maddr, size, dest_offset);
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h hdevice) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
return device->start();
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h hdevice, uint64_t timeout) {
|
||||
if (nullptr == hdevice)
|
||||
return -1;
|
||||
|
||||
vx_device *device = ((vx_device*)hdevice);
|
||||
|
||||
return device->wait(timeout);
|
||||
}
|
|
@ -1,49 +0,0 @@
|
|||
#include <vortex.h>
|
||||
|
||||
extern int vx_dev_open(vx_device_h* /*hdevice*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_dev_close(vx_device_h /*hdevice*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_dev_caps(vx_device_h /*hdevice*/, uint32_t /*caps_id*/, uint64_t* /*value*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_mem_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, uint64_t* /*dev_maddr*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
int vx_mem_free(vx_device_h /*hdevice*/, uint64_t /*dev_maddr*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_buf_alloc(vx_device_h /*hdevice*/, uint64_t /*size*/, vx_buffer_h* /*hbuffer*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern void* vx_host_ptr(vx_buffer_h /*hbuffer*/) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
extern int vx_buf_free(vx_buffer_h /*hbuffer*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_copy_to_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*src_offset*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_copy_from_dev(vx_buffer_h /*hbuffer*/, uint64_t /*dev_maddr*/, uint64_t /*size*/, uint64_t /*dest_offset*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_start(vx_device_h /*hdevice*/) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
extern int vx_ready_wait(vx_device_h /*hdevice*/, uint64_t /*timeout*/) {
|
||||
return -1;
|
||||
}
|
|
@ -1,60 +0,0 @@
|
|||
VLSIM_DIR = ../../sim/vlsim
|
||||
|
||||
RTL_DIR=../../hw/rtl
|
||||
|
||||
SCRIPT_DIR=../../hw/scripts
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I. -I../include -I../../hw -I$(VLSIM_DIR)
|
||||
|
||||
# Position independent code
|
||||
CXXFLAGS += -fPIC
|
||||
|
||||
# Add external configuration
|
||||
CXXFLAGS += $(CONFIGS)
|
||||
|
||||
# Dump perf stats
|
||||
CXXFLAGS += -DDUMP_PERF_STATS
|
||||
|
||||
LDFLAGS += -shared -pthread
|
||||
LDFLAGS += -L. -lopae-c-vlsim
|
||||
|
||||
SRCS = ../common/opae.cpp ../common/vx_utils.cpp
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
# Enable scope analyzer
|
||||
ifdef SCOPE
|
||||
CXXFLAGS += -DSCOPE
|
||||
SRCS += ../common/vx_scope.cpp
|
||||
SCOPE_H = scope-defs.h
|
||||
endif
|
||||
|
||||
# Enable perf counters
|
||||
ifdef PERF
|
||||
CXXFLAGS += -DPERF_ENABLE
|
||||
endif
|
||||
|
||||
PROJECT = libvortex.so
|
||||
|
||||
all: $(PROJECT)
|
||||
|
||||
scope-defs.h: $(SCRIPT_DIR)/scope.json
|
||||
$(SCRIPT_DIR)/scope.py $(CONFIGS) -cc scope-defs.h -vl $(RTL_DIR)/scope-defs.vh $(SCRIPT_DIR)/scope.json
|
||||
|
||||
# generate scope data
|
||||
scope: scope-defs.h
|
||||
|
||||
$(PROJECT): $(SRCS) $(SCOPE_H)
|
||||
DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) ../../driver/vlsim/libopae-c-vlsim.so
|
||||
$(CXX) $(CXXFLAGS) -DUSE_VLSIM $(SRCS) $(LDFLAGS) -o $(PROJECT)
|
||||
|
||||
clean:
|
||||
DESTDIR=../../driver/vlsim $(MAKE) -C $(VLSIM_DIR) clean
|
||||
rm -rf libopae-c-vlsim.so $(PROJECT) *.o scope-defs.h
|
3
hw/.gitignore
vendored
3
hw/.gitignore
vendored
|
@ -1 +1,2 @@
|
|||
obj_dir/*
|
||||
VX_config.h
|
||||
VX_types.h
|
13
hw/Makefile
13
hw/Makefile
|
@ -1,12 +1,17 @@
|
|||
RTL_DIR=./rtl
|
||||
SCRIPT_DIR=./scripts
|
||||
|
||||
all: VX_config.h
|
||||
all: config
|
||||
|
||||
config: VX_config.h VX_types.h
|
||||
|
||||
VX_config.h: $(RTL_DIR)/VX_config.vh
|
||||
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_config.vh -o VX_config.h
|
||||
|
||||
clean:
|
||||
rm -f VX_config.h
|
||||
VX_types.h: $(RTL_DIR)/VX_types.vh
|
||||
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_types.vh -o VX_types.h
|
||||
|
||||
.PHONY: VX_config.h
|
||||
clean:
|
||||
rm -f VX_config.h VX_types.h
|
||||
|
||||
.PHONY: VX_config.h VX_types.h
|
|
@ -1,3 +1,16 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <unordered_map>
|
||||
|
@ -5,167 +18,323 @@
|
|||
#include <mutex>
|
||||
#include <iostream>
|
||||
#include <rvfloats.h>
|
||||
#include <util.h>
|
||||
#include "svdpi.h"
|
||||
#include "verilated_vpi.h"
|
||||
#include "VX_config.h"
|
||||
|
||||
extern "C" {
|
||||
void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fmul(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
|
||||
void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fdiv(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fsqrt(bool enable, int dst_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
|
||||
void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_ftou(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result);
|
||||
|
||||
void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags);
|
||||
void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result);
|
||||
void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
|
||||
void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
|
||||
void dpi_fsgnjx(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result);
|
||||
|
||||
void dpi_fclss(bool enable, int a, int* result);
|
||||
void dpi_fsgnj(bool enable, int a, int b, int* result);
|
||||
void dpi_fsgnjn(bool enable, int a, int b, int* result);
|
||||
void dpi_fsgnjx(bool enable, int a, int b, int* result);
|
||||
|
||||
void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags);
|
||||
void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags);
|
||||
void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags);
|
||||
void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags);
|
||||
void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags);
|
||||
void dpi_flt(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fle(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_feq(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fmin(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
|
||||
void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags);
|
||||
}
|
||||
|
||||
void dpi_fadd(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fadd_s(a, b, (*frm & 0x7), fflags);
|
||||
inline uint64_t nan_box(uint32_t value) {
|
||||
#ifdef FPU_RV64F
|
||||
return value | 0xffffffff00000000;
|
||||
#else
|
||||
return value;
|
||||
#endif
|
||||
}
|
||||
|
||||
void dpi_fsub(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fsub_s(a, b, (*frm & 0x7), fflags);
|
||||
inline bool is_nan_boxed(uint64_t value) {
|
||||
#ifdef FPU_RV64F
|
||||
return (uint32_t(value >> 32) == 0xffffffff);
|
||||
#else
|
||||
__unused (value);
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
void dpi_fmul(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fmul_s(a, b, (*frm & 0x7), fflags);
|
||||
inline int64_t check_boxing(int64_t a) {
|
||||
if (!is_nan_boxed(a)) {
|
||||
return nan_box(0x7fc00000); // NaN
|
||||
}
|
||||
return a;
|
||||
}
|
||||
|
||||
void dpi_fmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fmadd_s(a, b, c, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fadd_d(a, b, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fadd_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fsub(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fmsub_s(a, b, c, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fsub_d(a, b, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fsub_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fnmadd(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fmul(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fnmadd_s(a, b, c, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fmul_d(a, b, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fmul_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fnmsub(bool enable, int a, int b, int c, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fnmsub_s(a, b, c, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fmadd_d(a, b, c, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fdiv(bool enable, int a, int b, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fdiv_s(a, b, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fmsub_d(a, b, c, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fsqrt(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fnmadd(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fsqrt_s(a, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fnmadd_d(a, b, c, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fnmadd_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_ftoi(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fnmsub(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t c, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_ftoi_s(a, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fnmsub_d(a, b, c, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fnmsub_s(check_boxing(a), check_boxing(b), check_boxing(c), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_ftou(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fdiv(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_ftou_s(a, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fdiv_d(a, b, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fdiv_s(check_boxing(a), check_boxing(b), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_itof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fsqrt(bool enable, int dst_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_itof_s(a, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fsqrt_d(a, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fsqrt_s(check_boxing(a), (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_utof(bool enable, int a, const svBitVecVal* frm, int* result, svBitVecVal* fflags) {
|
||||
void dpi_ftoi(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_utof_s(a, (*frm & 0x7), fflags);
|
||||
if (dst_fmt) {
|
||||
if (src_fmt) {
|
||||
*result = rv_ftol_d(a, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = rv_ftol_s(check_boxing(a), (*frm & 0x7), fflags);
|
||||
}
|
||||
} else {
|
||||
if (src_fmt) {
|
||||
*result = sext<uint64_t>(rv_ftoi_d(a, (*frm & 0x7), fflags), 32);
|
||||
} else {
|
||||
*result = sext<uint64_t>(rv_ftoi_s(check_boxing(a), (*frm & 0x7), fflags), 32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_flt(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
|
||||
void dpi_ftou(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_flt_s(a, b, fflags);
|
||||
if (dst_fmt) {
|
||||
if (src_fmt) {
|
||||
*result = rv_ftolu_d(a, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = rv_ftolu_s(check_boxing(a), (*frm & 0x7), fflags);
|
||||
}
|
||||
} else {
|
||||
if (src_fmt) {
|
||||
*result = sext<uint64_t>(rv_ftou_d(a, (*frm & 0x7), fflags), 32);
|
||||
} else {
|
||||
*result = sext<uint64_t>(rv_ftou_s(check_boxing(a), (*frm & 0x7), fflags), 32);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fle(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
|
||||
void dpi_itof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fle_s(a, b, fflags);
|
||||
if (dst_fmt) {
|
||||
if (src_fmt) {
|
||||
*result = rv_ltof_d(a, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = rv_itof_d(a, (*frm & 0x7), fflags);
|
||||
}
|
||||
} else {
|
||||
if (src_fmt) {
|
||||
*result = nan_box(rv_ltof_s(a, (*frm & 0x7), fflags));
|
||||
} else {
|
||||
*result = nan_box(rv_itof_s(a, (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_feq(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
|
||||
void dpi_utof(bool enable, int dst_fmt, int src_fmt, int64_t a, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_feq_s(a, b, fflags);
|
||||
if (dst_fmt) {
|
||||
if (src_fmt) {
|
||||
*result = rv_lutof_d(a, (*frm & 0x7), fflags);
|
||||
} else {
|
||||
*result = rv_utof_d(a, (*frm & 0x7), fflags);
|
||||
}
|
||||
} else {
|
||||
if (src_fmt) {
|
||||
*result = nan_box(rv_lutof_s(a, (*frm & 0x7), fflags));
|
||||
} else {
|
||||
*result = nan_box(rv_utof_s(a, (*frm & 0x7), fflags));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fmin(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
|
||||
void dpi_f2f(bool enable, int dst_fmt, int64_t a, int64_t* result) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fmin_s(a, b, fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_ftod((int32_t)check_boxing(a));
|
||||
} else {
|
||||
*result = nan_box(rv_dtof(a));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fmax(bool enable, int a, int b, int* result, svBitVecVal* fflags) {
|
||||
void dpi_fclss(bool enable, int dst_fmt, int64_t a, int64_t* result) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fmax_s(a, b, fflags);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fclss_d(a);
|
||||
} else {
|
||||
*result = rv_fclss_s(check_boxing(a));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fclss(bool enable, int a, int* result) {
|
||||
void dpi_fsgnj(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fclss_s(a);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fsgnj_d(a, b);
|
||||
} else {
|
||||
*result = nan_box(rv_fsgnj_s(check_boxing(a), check_boxing(b)));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fsgnj(bool enable, int a, int b, int* result) {
|
||||
void dpi_fsgnjn(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fsgnj_s(a, b);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fsgnjn_d(a, b);
|
||||
} else {
|
||||
*result = nan_box(rv_fsgnjn_s(check_boxing(a), check_boxing(b)));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fsgnjn(bool enable, int a, int b, int* result) {
|
||||
void dpi_fsgnjx(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fsgnjn_s(a, b);
|
||||
if (dst_fmt) {
|
||||
*result = rv_fsgnjx_d(a, b);
|
||||
} else {
|
||||
*result = nan_box(rv_fsgnjx_s(check_boxing(a), check_boxing(b)));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fsgnjx(bool enable, int a, int b, int* result) {
|
||||
void dpi_flt(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
*result = rv_fsgnjx_s(a, b);
|
||||
if (dst_fmt) {
|
||||
*result = rv_flt_d(a, b, fflags);
|
||||
} else {
|
||||
*result = rv_flt_s(check_boxing(a), check_boxing(b), fflags);
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fle(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
if (dst_fmt) {
|
||||
*result = rv_fle_d(a, b, fflags);
|
||||
} else {
|
||||
*result = rv_fle_s(check_boxing(a), check_boxing(b), fflags);
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_feq(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
if (dst_fmt) {
|
||||
*result = rv_feq_d(a, b, fflags);
|
||||
} else {
|
||||
*result = rv_feq_s(check_boxing(a), check_boxing(b), fflags);
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fmin(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
if (dst_fmt) {
|
||||
*result = rv_fmin_d(a, b, fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fmin_s(check_boxing(a), check_boxing(b), fflags));
|
||||
}
|
||||
}
|
||||
|
||||
void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, svBitVecVal* fflags) {
|
||||
if (!enable)
|
||||
return;
|
||||
if (dst_fmt) {
|
||||
*result = rv_fmax_d(a, b, fflags);
|
||||
} else {
|
||||
*result = nan_box(rv_fmax_s(check_boxing(a), check_boxing(b), fflags));
|
||||
}
|
||||
}
|
|
@ -1,31 +1,47 @@
|
|||
`ifndef FLOAT_DPI
|
||||
`define FLOAT_DPI
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import "DPI-C" function void dpi_fadd(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fsub(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmul(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fnmadd(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fnmsub(input logic enable, input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
`ifndef FLOAT_DPI_VH
|
||||
`define FLOAT_DPI_VH
|
||||
|
||||
import "DPI-C" function void dpi_fdiv(input logic enable, input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fsqrt(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
`include "VX_config.vh"
|
||||
|
||||
import "DPI-C" function void dpi_ftoi(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_ftou(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_itof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_utof(input logic enable, input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fadd(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fsub(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmul(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmadd(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmsub(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fnmadd(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fnmsub(input logic enable, input int dst_fmt, input longint a, input longint b, input longint c, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
|
||||
import "DPI-C" function void dpi_fclss(input logic enable, input int a, output int result);
|
||||
import "DPI-C" function void dpi_fsgnj(input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" function void dpi_fsgnjn(input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" function void dpi_fsgnjx(input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" function void dpi_fdiv(input logic enable, input int dst_fmt, input longint a, input longint b, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fsqrt(input logic enable, input int dst_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
|
||||
import "DPI-C" function void dpi_flt(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fle(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_feq(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmin(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmax(input logic enable, input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_ftoi(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_ftou(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_itof(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_utof(input logic enable, input int dst_fmt, input int src_fmt, input longint a, input bit[2:0] frm, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_f2f(input logic enable, input int dst_fmt, input longint a, output longint result);
|
||||
|
||||
`endif
|
||||
import "DPI-C" function void dpi_fclss(input logic enable, input int dst_fmt, input longint a, output longint result);
|
||||
import "DPI-C" function void dpi_fsgnj(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result);
|
||||
import "DPI-C" function void dpi_fsgnjn(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result);
|
||||
import "DPI-C" function void dpi_fsgnjx(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result);
|
||||
|
||||
import "DPI-C" function void dpi_flt(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fle(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_feq(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmin(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
|
||||
import "DPI-C" function void dpi_fmax(input logic enable, input int dst_fmt, input longint a, input longint b, output longint result, output bit[4:0] fflags);
|
||||
|
||||
`endif
|
||||
|
|
|
@ -1,23 +1,57 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
#include <iostream>
|
||||
|
||||
#include "svdpi.h"
|
||||
#include "verilated_vpi.h"
|
||||
#include "VX_config.h"
|
||||
|
||||
#include "uuid_gen.h"
|
||||
|
||||
#ifdef XLEN_64
|
||||
#define iword_t int64_t
|
||||
#define uword_t uint64_t
|
||||
#define idword_t __int128_t
|
||||
#define udword_t __uint128_t
|
||||
#else
|
||||
#define iword_t int32_t
|
||||
#define uword_t uint32_t
|
||||
#define idword_t int64_t
|
||||
#define udword_t uint64_t
|
||||
#endif
|
||||
|
||||
#ifndef DEBUG_LEVEL
|
||||
#define DEBUG_LEVEL 3
|
||||
#endif
|
||||
|
||||
extern "C" {
|
||||
void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth);
|
||||
void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder);
|
||||
void dpi_imul(bool enable, bool is_signed_a, bool is_signed_b, iword_t a, iword_t b, iword_t* resultl, iword_t* resulth);
|
||||
void dpi_idiv(bool enable, bool is_signed, iword_t a, iword_t b, iword_t* quotient, iword_t* remainder);
|
||||
|
||||
int dpi_register();
|
||||
void dpi_assert(int inst, bool cond, int delay);
|
||||
|
||||
void dpi_trace(const char* format, ...);
|
||||
void dpi_trace(int level, const char* format, ...);
|
||||
void dpi_trace_start();
|
||||
void dpi_trace_stop();
|
||||
|
||||
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC);
|
||||
}
|
||||
|
||||
bool sim_trace_enabled();
|
||||
|
@ -93,49 +127,54 @@ void dpi_assert(int inst, bool cond, int delay) {
|
|||
}
|
||||
}
|
||||
|
||||
void dpi_imul(bool enable, int a, int b, bool is_signed_a, bool is_signed_b, int* resultl, int* resulth) {
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void dpi_imul(bool enable, bool is_signed_a, bool is_signed_b, iword_t a, iword_t b, iword_t* resultl, iword_t* resulth) {
|
||||
if (!enable)
|
||||
return;
|
||||
udword_t first = *(uword_t*)&a;
|
||||
udword_t second = *(uword_t*)&b;
|
||||
|
||||
udword_t mask = udword_t(-1) << (8 * sizeof(iword_t));
|
||||
|
||||
uint64_t first = *(uint32_t*)&a;
|
||||
uint64_t second = *(uint32_t*)&b;
|
||||
|
||||
if (is_signed_a && (first & 0x80000000)) {
|
||||
first |= 0xFFFFFFFF00000000;
|
||||
if (is_signed_a && a < 0) {
|
||||
first |= mask;
|
||||
}
|
||||
|
||||
if (is_signed_b && (second & 0x80000000)) {
|
||||
second |= 0xFFFFFFFF00000000;
|
||||
if (is_signed_b && b < 0) {
|
||||
second |= mask;
|
||||
}
|
||||
|
||||
uint64_t result;
|
||||
udword_t result;
|
||||
if (is_signed_a || is_signed_b) {
|
||||
result = (int64_t)first * (int64_t)second;
|
||||
result = idword_t(first) * idword_t(second);
|
||||
} else {
|
||||
result = first * second;
|
||||
}
|
||||
|
||||
*resultl = result & 0xFFFFFFFF;
|
||||
*resulth = (result >> 32) & 0xFFFFFFFF;
|
||||
}
|
||||
|
||||
*resultl = iword_t(result);
|
||||
*resulth = iword_t(result >> (8 * sizeof(iword_t)));
|
||||
}
|
||||
|
||||
void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* remainder) {
|
||||
void dpi_idiv(bool enable, bool is_signed, iword_t a, iword_t b, iword_t* quotient, iword_t* remainder) {
|
||||
if (!enable)
|
||||
return;
|
||||
|
||||
uint32_t dividen = *(uint32_t*)&a;
|
||||
uint32_t divisor = *(uint32_t*)&b;
|
||||
uword_t dividen = a;
|
||||
uword_t divisor = b;
|
||||
|
||||
auto inf_neg = uword_t(1) << (XLEN-1);
|
||||
|
||||
if (is_signed) {
|
||||
if (b == 0) {
|
||||
*quotient = -1;
|
||||
*remainder = dividen;
|
||||
} else if (dividen == 0x80000000 && divisor == 0xffffffff) {
|
||||
} else if (dividen == inf_neg && divisor == -1) {
|
||||
*remainder = 0;
|
||||
*quotient = dividen;
|
||||
} else {
|
||||
*quotient = (int32_t)dividen / (int32_t)divisor;
|
||||
*remainder = (int32_t)dividen % (int32_t)divisor;
|
||||
*quotient = (iword_t)dividen / (iword_t)divisor;
|
||||
*remainder = (iword_t)dividen % (iword_t)divisor;
|
||||
}
|
||||
} else {
|
||||
if (b == 0) {
|
||||
|
@ -148,7 +187,11 @@ void dpi_idiv(bool enable, int a, int b, bool is_signed, int* quotient, int* rem
|
|||
}
|
||||
}
|
||||
|
||||
void dpi_trace(const char* format, ...) {
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
void dpi_trace(int level, const char* format, ...) {
|
||||
if (level > DEBUG_LEVEL)
|
||||
return;
|
||||
if (!sim_trace_enabled())
|
||||
return;
|
||||
va_list va;
|
||||
|
@ -163,4 +206,28 @@ void dpi_trace_start() {
|
|||
|
||||
void dpi_trace_stop() {
|
||||
sim_trace_enable(false);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
std::unordered_map<uint32_t, std::shared_ptr<vortex::UUIDGenerator>> g_uuid_gens;
|
||||
|
||||
uint64_t dpi_uuid_gen(bool reset, int wid, uint64_t PC) {
|
||||
if (reset) {
|
||||
g_uuid_gens.clear();
|
||||
return 0;
|
||||
}
|
||||
std::shared_ptr<vortex::UUIDGenerator> uuid_gen;
|
||||
auto it = g_uuid_gens.find(wid);
|
||||
if (it == g_uuid_gens.end()) {
|
||||
uuid_gen = std::make_shared<vortex::UUIDGenerator>();
|
||||
g_uuid_gens.emplace(wid, uuid_gen);
|
||||
} else {
|
||||
uuid_gen = it->second;
|
||||
}
|
||||
uint32_t instr_uuid = uuid_gen->get_uuid(PC);
|
||||
uint32_t instr_id = instr_uuid & 0xffff;
|
||||
uint32_t instr_ref = instr_uuid >> 16;
|
||||
uint64_t uuid = (uint64_t(instr_ref) << 32) | (wid << 16) | instr_id;
|
||||
return uuid;
|
||||
}
|
|
@ -1,14 +1,37 @@
|
|||
`ifndef UTIL_DPI
|
||||
`define UTIL_DPI
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
import "DPI-C" function void dpi_imul(input logic enable, input int a, input int b, input logic is_signed_a, input logic is_signed_b, output int resultl, output int resulth);
|
||||
import "DPI-C" function void dpi_idiv(input logic enable, input int a, input int b, input logic is_signed, output int quotient, output int remainder);
|
||||
`ifndef UTIL_DPI_VH
|
||||
`define UTIL_DPI_VH
|
||||
|
||||
`include "VX_config.vh"
|
||||
|
||||
`ifdef XLEN_64
|
||||
`define INT_TYPE longint
|
||||
`else
|
||||
`define INT_TYPE int
|
||||
`endif
|
||||
|
||||
import "DPI-C" function void dpi_imul(input logic enable, input logic is_signed_a, input logic is_signed_b, input `INT_TYPE a, input `INT_TYPE b, output `INT_TYPE resultl, output `INT_TYPE resulth);
|
||||
import "DPI-C" function void dpi_idiv(input logic enable, input logic is_signed, input `INT_TYPE a, input `INT_TYPE b, output `INT_TYPE quotient, output `INT_TYPE remainder);
|
||||
|
||||
import "DPI-C" function int dpi_register();
|
||||
import "DPI-C" function void dpi_assert(int inst, input logic cond, input int delay);
|
||||
|
||||
import "DPI-C" function void dpi_trace(input string format /*verilator sformat*/);
|
||||
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
|
||||
import "DPI-C" function void dpi_trace_start();
|
||||
import "DPI-C" function void dpi_trace_stop();
|
||||
|
||||
`endif
|
||||
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid, input longint PC);
|
||||
|
||||
`endif
|
||||
|
|
1
hw/rtl/.gitignore
vendored
1
hw/rtl/.gitignore
vendored
|
@ -1 +0,0 @@
|
|||
/VX_user_config.vh
|
|
@ -1,235 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_alu_unit #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Inputs
|
||||
VX_alu_req_if.slave alu_req_if,
|
||||
|
||||
// Outputs
|
||||
VX_branch_ctl_if.master branch_ctl_if,
|
||||
VX_commit_if.master alu_commit_if
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
reg [`NUM_THREADS-1:0][31:0] alu_result;
|
||||
wire [`NUM_THREADS-1:0][31:0] add_result;
|
||||
wire [`NUM_THREADS-1:0][32:0] sub_result;
|
||||
wire [`NUM_THREADS-1:0][31:0] shr_result;
|
||||
reg [`NUM_THREADS-1:0][31:0] msc_result;
|
||||
|
||||
wire ready_in;
|
||||
|
||||
`UNUSED_VAR (alu_req_if.op_mod)
|
||||
wire is_br_op = `INST_ALU_IS_BR(alu_req_if.op_mod);
|
||||
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(alu_req_if.op_type);
|
||||
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(alu_req_if.op_type);
|
||||
wire alu_signed = `INST_ALU_SIGNED(alu_op);
|
||||
wire [1:0] alu_op_class = `INST_ALU_OP_CLASS(alu_op);
|
||||
wire is_sub = (alu_op == `INST_ALU_SUB);
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data;
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data;
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.use_PC ? {`NUM_THREADS{alu_req_if.PC}} : alu_in1;
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.use_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_in2_less = (alu_req_if.use_imm && ~is_br_op) ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
|
||||
wire [32:0] sub_in2 = {alu_signed & alu_in2_less[i][31], alu_in2_less[i]};
|
||||
assign sub_result[i] = sub_in1 - sub_in2;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire [32:0] shr_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
|
||||
assign shr_result[i] = 32'($signed(shr_in1) >>> alu_in2_imm[i][4:0]);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
always @(*) begin
|
||||
case (alu_op)
|
||||
`INST_ALU_AND: msc_result[i] = alu_in1[i] & alu_in2_imm[i];
|
||||
`INST_ALU_OR: msc_result[i] = alu_in1[i] | alu_in2_imm[i];
|
||||
`INST_ALU_XOR: msc_result[i] = alu_in1[i] ^ alu_in2_imm[i];
|
||||
//`INST_ALU_SLL,
|
||||
default: msc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0];
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
always @(*) begin
|
||||
case (alu_op_class)
|
||||
2'b00: alu_result[i] = add_result[i]; // ADD, LUI, AUIPC
|
||||
2'b01: alu_result[i] = {31'b0, sub_result[i][32]}; // SLTU, SLT
|
||||
2'b10: alu_result[i] = is_sub ? sub_result[i][31:0] // SUB
|
||||
: shr_result[i]; // SRL, SRA
|
||||
// 2'b11,
|
||||
default: alu_result[i] = msc_result[i]; // AND, OR, XOR, SLL
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// branch
|
||||
|
||||
wire is_jal = is_br_op && (br_op == `INST_BR_JAL || br_op == `INST_BR_JALR);
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{alu_req_if.next_PC}} : alu_result;
|
||||
|
||||
wire [31:0] br_dest = add_result[alu_req_if.tid];
|
||||
wire [32:0] cmp_result = sub_result[alu_req_if.tid];
|
||||
|
||||
wire is_less = cmp_result[32];
|
||||
wire is_equal = ~(| cmp_result[31:0]);
|
||||
|
||||
// output
|
||||
|
||||
wire alu_valid_in;
|
||||
wire alu_ready_in;
|
||||
wire alu_valid_out;
|
||||
wire alu_ready_out;
|
||||
wire [`UUID_BITS-1:0] alu_uuid;
|
||||
wire [`NW_BITS-1:0] alu_wid;
|
||||
wire [`NUM_THREADS-1:0] alu_tmask;
|
||||
wire [31:0] alu_PC;
|
||||
wire [`NR_BITS-1:0] alu_rd;
|
||||
wire alu_wb;
|
||||
wire [`NUM_THREADS-1:0][31:0] alu_data;
|
||||
|
||||
wire [`INST_BR_BITS-1:0] br_op_r;
|
||||
wire [31:0] br_dest_r;
|
||||
wire is_less_r;
|
||||
wire is_equal_r;
|
||||
wire is_br_op_r;
|
||||
|
||||
assign alu_ready_in = alu_ready_out || ~alu_valid_out;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (alu_ready_in),
|
||||
.data_in ({alu_valid_in, alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
|
||||
.data_out ({alu_valid_out, alu_uuid, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r})
|
||||
);
|
||||
|
||||
`UNUSED_VAR (br_op_r)
|
||||
wire br_neg = `INST_BR_NEG(br_op_r);
|
||||
wire br_less = `INST_BR_LESS(br_op_r);
|
||||
wire br_static = `INST_BR_STATIC(br_op_r);
|
||||
|
||||
assign branch_ctl_if.valid = alu_valid_out && alu_ready_out && is_br_op_r;
|
||||
assign branch_ctl_if.taken = ((br_less ? is_less_r : is_equal_r) ^ br_neg) | br_static;
|
||||
assign branch_ctl_if.wid = alu_wid;
|
||||
assign branch_ctl_if.dest = br_dest_r;
|
||||
|
||||
`ifdef EXT_M_ENABLE
|
||||
|
||||
wire mul_valid_in;
|
||||
wire mul_ready_in;
|
||||
wire mul_valid_out;
|
||||
wire mul_ready_out;
|
||||
wire [`UUID_BITS-1:0] mul_uuid;
|
||||
wire [`NW_BITS-1:0] mul_wid;
|
||||
wire [`NUM_THREADS-1:0] mul_tmask;
|
||||
wire [31:0] mul_PC;
|
||||
wire [`NR_BITS-1:0] mul_rd;
|
||||
wire mul_wb;
|
||||
wire [`NUM_THREADS-1:0][31:0] mul_data;
|
||||
|
||||
wire [`INST_MUL_BITS-1:0] mul_op = `INST_MUL_BITS'(alu_req_if.op_type);
|
||||
|
||||
VX_muldiv muldiv (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Inputs
|
||||
.alu_op (mul_op),
|
||||
.uuid_in (alu_req_if.uuid),
|
||||
.wid_in (alu_req_if.wid),
|
||||
.tmask_in (alu_req_if.tmask),
|
||||
.PC_in (alu_req_if.PC),
|
||||
.rd_in (alu_req_if.rd),
|
||||
.wb_in (alu_req_if.wb),
|
||||
.alu_in1 (alu_req_if.rs1_data),
|
||||
.alu_in2 (alu_req_if.rs2_data),
|
||||
|
||||
// Outputs
|
||||
.wid_out (mul_wid),
|
||||
.uuid_out (mul_uuid),
|
||||
.tmask_out (mul_tmask),
|
||||
.PC_out (mul_PC),
|
||||
.rd_out (mul_rd),
|
||||
.wb_out (mul_wb),
|
||||
.data_out (mul_data),
|
||||
|
||||
// handshake
|
||||
.valid_in (mul_valid_in),
|
||||
.ready_in (mul_ready_in),
|
||||
.valid_out (mul_valid_out),
|
||||
.ready_out (mul_ready_out)
|
||||
);
|
||||
|
||||
wire is_mul_op = `INST_ALU_IS_MUL(alu_req_if.op_mod);
|
||||
|
||||
assign ready_in = is_mul_op ? mul_ready_in : alu_ready_in;
|
||||
|
||||
assign alu_valid_in = alu_req_if.valid && ~is_mul_op;
|
||||
assign mul_valid_in = alu_req_if.valid && is_mul_op;
|
||||
|
||||
assign alu_commit_if.valid = alu_valid_out || mul_valid_out;
|
||||
assign alu_commit_if.uuid = alu_valid_out ? alu_uuid : mul_uuid;
|
||||
assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid;
|
||||
assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask;
|
||||
assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC;
|
||||
assign alu_commit_if.rd = alu_valid_out ? alu_rd : mul_rd;
|
||||
assign alu_commit_if.wb = alu_valid_out ? alu_wb : mul_wb;
|
||||
assign alu_commit_if.data = alu_valid_out ? alu_data : mul_data;
|
||||
|
||||
assign alu_ready_out = alu_commit_if.ready;
|
||||
assign mul_ready_out = alu_commit_if.ready & ~alu_valid_out; // ALU takes priority
|
||||
|
||||
`else
|
||||
|
||||
assign ready_in = alu_ready_in;
|
||||
|
||||
assign alu_valid_in = alu_req_if.valid;
|
||||
|
||||
assign alu_commit_if.valid = alu_valid_out;
|
||||
assign alu_commit_if.uuid = alu_uuid;
|
||||
assign alu_commit_if.wid = alu_wid;
|
||||
assign alu_commit_if.tmask = alu_tmask;
|
||||
assign alu_commit_if.PC = alu_PC;
|
||||
assign alu_commit_if.rd = alu_rd;
|
||||
assign alu_commit_if.wb = alu_wb;
|
||||
assign alu_commit_if.data = alu_data;
|
||||
|
||||
assign alu_ready_out = alu_commit_if.ready;
|
||||
|
||||
`endif
|
||||
|
||||
assign alu_commit_if.eop = 1'b1;
|
||||
|
||||
// can accept new request?
|
||||
assign alu_req_if.ready = ready_in;
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (branch_ctl_if.valid) begin
|
||||
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h (#%0d)\n",
|
||||
$time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest, alu_uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,159 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_cache_arb #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter LANES = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter TAG_IN_WIDTH = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter BUFFERED_REQ = 0,
|
||||
parameter BUFFERED_RSP = 0,
|
||||
parameter TYPE = "R",
|
||||
|
||||
localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE),
|
||||
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||
localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// input requests
|
||||
input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in,
|
||||
|
||||
// output request
|
||||
output wire [LANES-1:0] req_valid_out,
|
||||
output wire [LANES-1:0] req_rw_out,
|
||||
output wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_out,
|
||||
output wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_out,
|
||||
output wire [LANES-1:0][DATA_WIDTH-1:0] req_data_out,
|
||||
output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out,
|
||||
input wire [LANES-1:0] req_ready_out,
|
||||
|
||||
// input response
|
||||
input wire rsp_valid_in,
|
||||
input wire [LANES-1:0] rsp_tmask_in,
|
||||
input wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_in,
|
||||
input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in,
|
||||
output wire rsp_ready_in,
|
||||
|
||||
// output responses
|
||||
output wire [NUM_REQS-1:0] rsp_valid_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_out,
|
||||
output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out,
|
||||
input wire [NUM_REQS-1:0] rsp_ready_out
|
||||
);
|
||||
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;
|
||||
|
||||
if (NUM_REQS > 1) begin
|
||||
|
||||
wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_in_merged;
|
||||
wire [LANES-1:0][REQ_DATAW-1:0] req_data_out_merged;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
for (genvar j = 0; j < LANES; ++j) begin
|
||||
wire [TAG_OUT_WIDTH-1:0] req_tag_in_w;
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (TAG_IN_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_insert (
|
||||
.data_in (req_tag_in[i][j]),
|
||||
.sel_in (LOG_NUM_REQS'(i)),
|
||||
.data_out (req_tag_in_w)
|
||||
);
|
||||
|
||||
assign req_data_in_merged[i][j] = {req_tag_in_w, req_addr_in[i][j], req_rw_in[i][j], req_byteen_in[i][j], req_data_in[i][j]};
|
||||
end
|
||||
end
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.LANES (LANES),
|
||||
.DATAW (REQ_DATAW),
|
||||
.BUFFERED (BUFFERED_REQ),
|
||||
.TYPE (TYPE)
|
||||
) req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in_merged),
|
||||
.ready_in (req_ready_in),
|
||||
.valid_out (req_valid_out),
|
||||
.data_out (req_data_out_merged),
|
||||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < LANES; ++i) begin
|
||||
assign {req_tag_out[i], req_addr_out[i], req_rw_out[i], req_byteen_out[i], req_data_out[i]} = req_data_out_merged[i];
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged;
|
||||
|
||||
wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];
|
||||
|
||||
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (TAG_OUT_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_remove (
|
||||
.data_in (rsp_tag_in),
|
||||
.data_out (rsp_tag_in_w)
|
||||
);
|
||||
|
||||
VX_stream_demux #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.LANES (1),
|
||||
.DATAW (RSP_DATAW),
|
||||
.BUFFERED (BUFFERED_RSP)
|
||||
) rsp_demux (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (rsp_sel),
|
||||
.valid_in (rsp_valid_in),
|
||||
.data_in ({rsp_tmask_in, rsp_tag_in_w, rsp_data_in}),
|
||||
.ready_in (rsp_ready_in),
|
||||
.valid_out (rsp_valid_out),
|
||||
.data_out (rsp_data_out_merged),
|
||||
.ready_out (rsp_ready_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
assign {rsp_tmask_out[i], rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i];
|
||||
end
|
||||
|
||||
end else begin
|
||||
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
assign req_valid_out = req_valid_in;
|
||||
assign req_tag_out = req_tag_in;
|
||||
assign req_addr_out = req_addr_in;
|
||||
assign req_rw_out = req_rw_in;
|
||||
assign req_byteen_out = req_byteen_in;
|
||||
assign req_data_out = req_data_in;
|
||||
assign req_ready_in = req_ready_out;
|
||||
|
||||
assign rsp_valid_out = rsp_valid_in;
|
||||
assign rsp_tmask_out = rsp_tmask_in;
|
||||
assign rsp_tag_out = rsp_tag_in;
|
||||
assign rsp_data_out = rsp_data_in;
|
||||
assign rsp_ready_in = rsp_ready_out;
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
|
@ -1,195 +1,155 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_cluster #(
|
||||
module VX_cluster import VX_gpu_pkg::*; #(
|
||||
parameter CLUSTER_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_cluster
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`L2_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`L2_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`L2_MEM_DATA_WIDTH-1:0] mem_req_data,
|
||||
output wire [`L2_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.master mem_perf_if,
|
||||
VX_mem_perf_if.slave perf_memsys_total_if,
|
||||
`endif
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`L2_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`L2_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
// Memory
|
||||
VX_mem_bus_if.master mem_bus_if,
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
`STATIC_ASSERT((`L2_ENABLE == 0 || `NUM_CORES > 1), ("invalid parameter"))
|
||||
output wire busy
|
||||
);
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_mem_req_valid;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_req_rw;
|
||||
wire [`NUM_CORES-1:0][`DCACHE_MEM_BYTEEN_WIDTH-1:0] per_core_mem_req_byteen;
|
||||
wire [`NUM_CORES-1:0][`DCACHE_MEM_ADDR_WIDTH-1:0] per_core_mem_req_addr;
|
||||
wire [`NUM_CORES-1:0][`DCACHE_MEM_DATA_WIDTH-1:0] per_core_mem_req_data;
|
||||
wire [`NUM_CORES-1:0][`L1_MEM_TAG_WIDTH-1:0] per_core_mem_req_tag;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_req_ready;
|
||||
`ifdef SCOPE
|
||||
localparam scope_socket = 0;
|
||||
`SCOPE_IO_SWITCH (scope_socket + `NUM_SOCKETS);
|
||||
`endif
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_mem_rsp_valid;
|
||||
wire [`NUM_CORES-1:0][`DCACHE_MEM_DATA_WIDTH-1:0] per_core_mem_rsp_data;
|
||||
wire [`NUM_CORES-1:0][`L1_MEM_TAG_WIDTH-1:0] per_core_mem_rsp_tag;
|
||||
wire [`NUM_CORES-1:0] per_core_mem_rsp_ready;
|
||||
`ifdef GBAR_ENABLE
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_busy;
|
||||
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
|
||||
VX_gbar_bus_if gbar_bus_if();
|
||||
|
||||
for (genvar i = 0; i < `NUM_CORES; i++) begin
|
||||
`RESET_RELAY (gbar_reset, reset);
|
||||
|
||||
`RESET_RELAY (core_reset);
|
||||
VX_gbar_arb #(
|
||||
.NUM_REQS (`NUM_SOCKETS),
|
||||
.OUT_REG ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
|
||||
) gbar_arb (
|
||||
.clk (clk),
|
||||
.reset (gbar_reset),
|
||||
.bus_in_if (per_socket_gbar_bus_if),
|
||||
.bus_out_if (gbar_bus_if)
|
||||
);
|
||||
|
||||
VX_core #(
|
||||
.CORE_ID(i + (CLUSTER_ID * `NUM_CORES))
|
||||
) core (
|
||||
`SCOPE_BIND_VX_cluster_core(i)
|
||||
VX_gbar_unit #(
|
||||
.INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID))
|
||||
) gbar_unit (
|
||||
.clk (clk),
|
||||
.reset (gbar_reset),
|
||||
.gbar_bus_if (gbar_bus_if)
|
||||
);
|
||||
`endif
|
||||
|
||||
.clk (clk),
|
||||
.reset (core_reset),
|
||||
|
||||
.mem_req_valid (per_core_mem_req_valid[i]),
|
||||
.mem_req_rw (per_core_mem_req_rw [i]),
|
||||
.mem_req_byteen (per_core_mem_req_byteen[i]),
|
||||
.mem_req_addr (per_core_mem_req_addr [i]),
|
||||
.mem_req_data (per_core_mem_req_data [i]),
|
||||
.mem_req_tag (per_core_mem_req_tag [i]),
|
||||
.mem_req_ready (per_core_mem_req_ready[i]),
|
||||
|
||||
.mem_rsp_valid (per_core_mem_rsp_valid[i]),
|
||||
.mem_rsp_data (per_core_mem_rsp_data [i]),
|
||||
.mem_rsp_tag (per_core_mem_rsp_tag [i]),
|
||||
.mem_rsp_ready (per_core_mem_rsp_ready[i]),
|
||||
|
||||
.busy (per_core_busy [i])
|
||||
);
|
||||
end
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_ARB_TAG_WIDTH)
|
||||
) per_socket_dcache_bus_if[`NUM_SOCKETS * DCACHE_NUM_REQS]();
|
||||
|
||||
assign busy = (| per_core_busy);
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (ICACHE_ARB_TAG_WIDTH)
|
||||
) per_socket_icache_bus_if[`NUM_SOCKETS]();
|
||||
|
||||
`RESET_RELAY (mem_unit_reset, reset);
|
||||
|
||||
VX_mem_unit #(
|
||||
.CLUSTER_ID (CLUSTER_ID)
|
||||
) mem_unit (
|
||||
.clk (clk),
|
||||
.reset (mem_unit_reset),
|
||||
|
||||
if (`L2_ENABLE) begin
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_l2cache_if();
|
||||
.mem_perf_if (mem_perf_if),
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (l2_reset);
|
||||
.dcache_bus_if (per_socket_dcache_bus_if),
|
||||
|
||||
.icache_bus_if (per_socket_icache_bus_if),
|
||||
|
||||
VX_cache #(
|
||||
.CACHE_ID (`L2_CACHE_ID),
|
||||
.CACHE_SIZE (`L2_CACHE_SIZE),
|
||||
.CACHE_LINE_SIZE (`L2_CACHE_LINE_SIZE),
|
||||
.NUM_BANKS (`L2_NUM_BANKS),
|
||||
.NUM_PORTS (`L2_NUM_PORTS),
|
||||
.WORD_SIZE (`L2_WORD_SIZE),
|
||||
.NUM_REQS (`L2_NUM_REQS),
|
||||
.CREQ_SIZE (`L2_CREQ_SIZE),
|
||||
.CRSQ_SIZE (`L2_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`L2_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L2_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`L2_MREQ_SIZE),
|
||||
.WRITE_ENABLE (1),
|
||||
.CORE_TAG_WIDTH (`L1_MEM_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (0),
|
||||
.MEM_TAG_WIDTH (`L2_MEM_TAG_WIDTH),
|
||||
.NC_ENABLE (1)
|
||||
) l2cache (
|
||||
`SCOPE_BIND_VX_cluster_l2cache
|
||||
|
||||
.clk (clk),
|
||||
.reset (l2_reset),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_SOCKETS-1:0] per_socket_sim_ebreak;
|
||||
wire [`NUM_SOCKETS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_socket_sim_wb_value;
|
||||
assign sim_ebreak = per_socket_sim_ebreak[0];
|
||||
assign sim_wb_value = per_socket_sim_wb_value[0];
|
||||
`UNUSED_VAR (per_socket_sim_ebreak)
|
||||
`UNUSED_VAR (per_socket_sim_wb_value)
|
||||
|
||||
VX_dcr_bus_if socket_dcr_bus_tmp_if();
|
||||
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
|
||||
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
|
||||
assign socket_dcr_bus_tmp_if.write_data = dcr_bus_if.write_data;
|
||||
|
||||
wire [`NUM_SOCKETS-1:0] per_socket_busy;
|
||||
|
||||
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
|
||||
|
||||
// Generate all sockets
|
||||
for (genvar i = 0; i < `NUM_SOCKETS; ++i) begin
|
||||
|
||||
`RESET_RELAY (socket_reset, reset);
|
||||
|
||||
VX_socket #(
|
||||
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + i)
|
||||
) socket (
|
||||
`SCOPE_IO_BIND (scope_socket+i)
|
||||
.clk (clk),
|
||||
.reset (socket_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_l2cache_if),
|
||||
.mem_perf_if (perf_memsys_total_if),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (socket_dcr_bus_if),
|
||||
|
||||
.dcache_bus_if (per_socket_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
|
||||
|
||||
.icache_bus_if (per_socket_icache_bus_if[i]),
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (per_socket_gbar_bus_if[i]),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
.core_req_valid (per_core_mem_req_valid),
|
||||
.core_req_rw (per_core_mem_req_rw),
|
||||
.core_req_byteen (per_core_mem_req_byteen),
|
||||
.core_req_addr (per_core_mem_req_addr),
|
||||
.core_req_data (per_core_mem_req_data),
|
||||
.core_req_tag (per_core_mem_req_tag),
|
||||
.core_req_ready (per_core_mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.core_rsp_valid (per_core_mem_rsp_valid),
|
||||
.core_rsp_data (per_core_mem_rsp_data),
|
||||
.core_rsp_tag (per_core_mem_rsp_tag),
|
||||
.core_rsp_ready (per_core_mem_rsp_ready),
|
||||
`UNUSED_PIN (core_rsp_tmask),
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (mem_req_valid),
|
||||
.mem_req_rw (mem_req_rw),
|
||||
.mem_req_byteen (mem_req_byteen),
|
||||
.mem_req_addr (mem_req_addr),
|
||||
.mem_req_data (mem_req_data),
|
||||
.mem_req_tag (mem_req_tag),
|
||||
.mem_req_ready (mem_req_ready),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (mem_rsp_valid),
|
||||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_data (mem_rsp_data),
|
||||
.mem_rsp_ready (mem_rsp_ready)
|
||||
.sim_ebreak (per_socket_sim_ebreak[i]),
|
||||
.sim_wb_value (per_socket_sim_wb_value[i]),
|
||||
.busy (per_socket_busy[i])
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
||||
`RESET_RELAY (mem_arb_reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_REQS (`NUM_CORES),
|
||||
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
|
||||
.TAG_IN_WIDTH (`L1_MEM_TAG_WIDTH),
|
||||
.TYPE ("R"),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (1)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (mem_arb_reset),
|
||||
|
||||
// Core request
|
||||
.req_valid_in (per_core_mem_req_valid),
|
||||
.req_rw_in (per_core_mem_req_rw),
|
||||
.req_byteen_in (per_core_mem_req_byteen),
|
||||
.req_addr_in (per_core_mem_req_addr),
|
||||
.req_data_in (per_core_mem_req_data),
|
||||
.req_tag_in (per_core_mem_req_tag),
|
||||
.req_ready_in (per_core_mem_req_ready),
|
||||
|
||||
// Memory request
|
||||
.req_valid_out (mem_req_valid),
|
||||
.req_rw_out (mem_req_rw),
|
||||
.req_byteen_out (mem_req_byteen),
|
||||
.req_addr_out (mem_req_addr),
|
||||
.req_data_out (mem_req_data),
|
||||
.req_tag_out (mem_req_tag),
|
||||
.req_ready_out (mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.rsp_valid_out (per_core_mem_rsp_valid),
|
||||
.rsp_data_out (per_core_mem_rsp_data),
|
||||
.rsp_tag_out (per_core_mem_rsp_tag),
|
||||
.rsp_ready_out (per_core_mem_rsp_ready),
|
||||
|
||||
// Memory response
|
||||
.rsp_valid_in (mem_rsp_valid),
|
||||
.rsp_tag_in (mem_rsp_tag),
|
||||
.rsp_data_in (mem_rsp_data),
|
||||
.rsp_ready_in (mem_rsp_ready)
|
||||
);
|
||||
|
||||
end
|
||||
|
||||
`BUFFER_BUSY (busy, (| per_socket_busy), (`NUM_SOCKETS > 1));
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -1,138 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_commit #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// inputs
|
||||
VX_commit_if.slave alu_commit_if,
|
||||
VX_commit_if.slave ld_commit_if,
|
||||
VX_commit_if.slave st_commit_if,
|
||||
VX_commit_if.slave csr_commit_if,
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_commit_if.slave fpu_commit_if,
|
||||
`endif
|
||||
VX_commit_if.slave gpu_commit_if,
|
||||
|
||||
// outputs
|
||||
VX_writeback_if.master writeback_if,
|
||||
VX_cmt_to_csr_if.master cmt_to_csr_if
|
||||
);
|
||||
// CSRs update
|
||||
|
||||
wire alu_commit_fire = alu_commit_if.valid && alu_commit_if.ready;
|
||||
wire ld_commit_fire = ld_commit_if.valid && ld_commit_if.ready;
|
||||
wire st_commit_fire = st_commit_if.valid && st_commit_if.ready;
|
||||
wire csr_commit_fire = csr_commit_if.valid && csr_commit_if.ready;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire fpu_commit_fire = fpu_commit_if.valid && fpu_commit_if.ready;
|
||||
`endif
|
||||
wire gpu_commit_fire = gpu_commit_if.valid && gpu_commit_if.ready;
|
||||
|
||||
wire commit_fire = alu_commit_fire
|
||||
|| ld_commit_fire
|
||||
|| st_commit_fire
|
||||
|| csr_commit_fire
|
||||
`ifdef EXT_F_ENABLE
|
||||
|| fpu_commit_fire
|
||||
`endif
|
||||
|| gpu_commit_fire;
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [(6*`NUM_THREADS)-1:0] commit_tmask;
|
||||
`else
|
||||
wire [(5*`NUM_THREADS)-1:0] commit_tmask;
|
||||
`endif
|
||||
|
||||
wire [$clog2($bits(commit_tmask)+1)-1:0] commit_size;
|
||||
|
||||
assign commit_tmask = {
|
||||
{`NUM_THREADS{alu_commit_fire}} & alu_commit_if.tmask,
|
||||
{`NUM_THREADS{ld_commit_fire}} & ld_commit_if.tmask,
|
||||
{`NUM_THREADS{st_commit_fire}} & st_commit_if.tmask,
|
||||
{`NUM_THREADS{csr_commit_fire}} & csr_commit_if.tmask,
|
||||
`ifdef EXT_F_ENABLE
|
||||
{`NUM_THREADS{fpu_commit_fire}} & fpu_commit_if.tmask,
|
||||
`endif
|
||||
{`NUM_THREADS{gpu_commit_fire}} & gpu_commit_if.tmask
|
||||
};
|
||||
|
||||
`POP_COUNT(commit_size, commit_tmask);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + $bits(commit_size)),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({commit_fire, commit_size}),
|
||||
.data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size})
|
||||
);
|
||||
|
||||
// Writeback
|
||||
|
||||
VX_writeback #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) writeback (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.ld_commit_if (ld_commit_if),
|
||||
.csr_commit_if (csr_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.gpu_commit_if (gpu_commit_if),
|
||||
.writeback_if (writeback_if)
|
||||
);
|
||||
|
||||
// store and gpu commits don't writeback
|
||||
assign st_commit_if.ready = 1'b1;
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (alu_commit_if.valid && alu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", alu_commit_if.uuid);
|
||||
end
|
||||
if (ld_commit_if.valid && ld_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd);
|
||||
`TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", ld_commit_if.uuid);
|
||||
end
|
||||
if (st_commit_if.valid && st_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d (#%0d)\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd, st_commit_if.uuid);
|
||||
end
|
||||
if (csr_commit_if.valid && csr_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd);
|
||||
`TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", csr_commit_if.uuid);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", fpu_commit_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (gpu_commit_if.valid && gpu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", gpu_commit_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,7 +1,49 @@
|
|||
`ifndef VX_CONFIG
|
||||
`define VX_CONFIG
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef XLEN
|
||||
`ifndef VX_CONFIG_VH
|
||||
`define VX_CONFIG_VH
|
||||
|
||||
`ifndef MIN
|
||||
`define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
`endif
|
||||
|
||||
`ifndef MAX
|
||||
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
`endif
|
||||
|
||||
`ifndef CLAMP
|
||||
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
|
||||
`endif
|
||||
|
||||
`ifndef UP
|
||||
`define UP(x) (((x) != 0) ? (x) : 1)
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// 32 bit XLEN as default.
|
||||
`ifndef XLEN_32
|
||||
`ifndef XLEN_64
|
||||
`define XLEN_32
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
`define XLEN 64
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_32
|
||||
`define XLEN 32
|
||||
`endif
|
||||
|
||||
|
@ -25,54 +67,127 @@
|
|||
`define NUM_BARRIERS 4
|
||||
`endif
|
||||
|
||||
`ifndef L2_ENABLE
|
||||
`define L2_ENABLE 0
|
||||
`ifndef SOCKET_SIZE
|
||||
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
|
||||
`endif
|
||||
|
||||
`ifndef L3_ENABLE
|
||||
`define L3_ENABLE 0
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_ENABLED 1
|
||||
`else
|
||||
`define L2_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifndef SM_ENABLE
|
||||
`define SM_ENABLE 1
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_ENABLED 1
|
||||
`else
|
||||
`define L3_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef L1_DISABLE
|
||||
`define ICACHE_DISABLE
|
||||
`define DCACHE_DISABLE
|
||||
`endif
|
||||
|
||||
`ifndef MEM_BLOCK_SIZE
|
||||
`define MEM_BLOCK_SIZE 64
|
||||
`endif
|
||||
|
||||
`ifndef L1_BLOCK_SIZE
|
||||
`define L1_BLOCK_SIZE ((`L2_ENABLE || `L3_ENABLE) ? 16 : `MEM_BLOCK_SIZE)
|
||||
`ifndef MEM_ADDR_WIDTH
|
||||
`ifdef XLEN_64
|
||||
`define MEM_ADDR_WIDTH 48
|
||||
`else
|
||||
`define MEM_ADDR_WIDTH 32
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef L1_LINE_SIZE
|
||||
`ifdef L1_DISABLE
|
||||
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 4 : `MEM_BLOCK_SIZE)
|
||||
`else
|
||||
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 16 : `MEM_BLOCK_SIZE)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
|
||||
`ifndef STARTUP_ADDR
|
||||
`define STARTUP_ADDR 64'h180000000
|
||||
`endif
|
||||
|
||||
`ifndef STACK_BASE_ADDR
|
||||
`define STACK_BASE_ADDR 64'h1FF000000
|
||||
`endif
|
||||
|
||||
`else
|
||||
|
||||
`ifndef STARTUP_ADDR
|
||||
`define STARTUP_ADDR 32'h80000000
|
||||
`endif
|
||||
|
||||
`ifndef IO_BASE_ADDR
|
||||
`define IO_BASE_ADDR 32'hFF000000
|
||||
`ifndef STACK_BASE_ADDR
|
||||
`define STACK_BASE_ADDR 32'hFF000000
|
||||
`endif
|
||||
|
||||
`ifndef IO_ADDR_SIZE
|
||||
`define IO_ADDR_SIZE (32'hFFFFFFFF - `IO_BASE_ADDR + 1)
|
||||
`endif
|
||||
|
||||
`ifndef IO_COUT_ADDR
|
||||
`define IO_COUT_ADDR (32'hFFFFFFFF - `MEM_BLOCK_SIZE + 1)
|
||||
`endif
|
||||
|
||||
`ifndef IO_COUT_SIZE
|
||||
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
|
||||
`endif
|
||||
|
||||
`ifndef IO_CSR_ADDR
|
||||
`define IO_CSR_ADDR `IO_BASE_ADDR
|
||||
`endif
|
||||
|
||||
`ifndef SMEM_BASE_ADDR
|
||||
`define SMEM_BASE_ADDR `IO_BASE_ADDR
|
||||
`define SMEM_BASE_ADDR `STACK_BASE_ADDR
|
||||
`endif
|
||||
|
||||
`ifndef SMEM_LOG_SIZE
|
||||
`define SMEM_LOG_SIZE 14
|
||||
`endif
|
||||
|
||||
`ifndef IO_BASE_ADDR
|
||||
`define IO_BASE_ADDR (`SMEM_BASE_ADDR + (1 << `SMEM_LOG_SIZE))
|
||||
`endif
|
||||
|
||||
`ifndef IO_COUT_ADDR
|
||||
`define IO_COUT_ADDR `IO_BASE_ADDR
|
||||
`endif
|
||||
`define IO_COUT_SIZE `MEM_BLOCK_SIZE
|
||||
|
||||
`ifndef IO_CSR_ADDR
|
||||
`define IO_CSR_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
|
||||
`endif
|
||||
`define IO_CSR_SIZE (4 * 64 * `NUM_CORES * `NUM_CLUSTERS)
|
||||
|
||||
`ifndef STACK_LOG2_SIZE
|
||||
`define STACK_LOG2_SIZE 13
|
||||
`endif
|
||||
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
|
||||
|
||||
`define RESET_DELAY 8
|
||||
|
||||
`ifndef STALL_TIMEOUT
|
||||
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
||||
`endif
|
||||
|
||||
`ifndef FPU_FPNEW
|
||||
`ifndef FPU_DSP
|
||||
`ifndef FPU_DPI
|
||||
`ifdef SYNTHESIS
|
||||
`define FPU_DSP
|
||||
`else
|
||||
`define FPU_DPI
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`ifndef DPI_DISABLE
|
||||
`define IMUL_DPI
|
||||
`define IDIV_DPI
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef DEBUG_LEVEL
|
||||
`define DEBUG_LEVEL 3
|
||||
`endif
|
||||
|
||||
// ISA Extensions /////////////////////////////////////////////////////////////
|
||||
|
||||
`ifndef EXT_M_DISABLE
|
||||
`define EXT_M_ENABLE
|
||||
`endif
|
||||
|
@ -81,230 +196,278 @@
|
|||
`define EXT_F_ENABLE
|
||||
`endif
|
||||
|
||||
// Device identification
|
||||
`ifdef EXT_D_ENABLE
|
||||
`define FLEN_64
|
||||
`else
|
||||
`define FLEN_32
|
||||
`endif
|
||||
|
||||
`ifdef FLEN_64
|
||||
`define FLEN 64
|
||||
`endif
|
||||
|
||||
`ifdef FLEN_32
|
||||
`define FLEN 32
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
`ifdef FLEN_32
|
||||
`define FPU_RV64F
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`define ISA_STD_A 0
|
||||
`define ISA_STD_C 2
|
||||
`define ISA_STD_D 3
|
||||
`define ISA_STD_E 4
|
||||
`define ISA_STD_F 5
|
||||
`define ISA_STD_H 7
|
||||
`define ISA_STD_I 8
|
||||
`define ISA_STD_N 13
|
||||
`define ISA_STD_Q 16
|
||||
`define ISA_STD_S 18
|
||||
`define ISA_STD_U 20
|
||||
|
||||
`define ISA_EXT_TEX 0
|
||||
`define ISA_EXT_RASTER 1
|
||||
`define ISA_EXT_ROP 2
|
||||
|
||||
`ifdef EXT_A_ENABLE
|
||||
`define EXT_A_ENABLED 1
|
||||
`else
|
||||
`define EXT_A_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_C_ENABLE
|
||||
`define EXT_C_ENABLED 1
|
||||
`else
|
||||
`define EXT_C_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_D_ENABLE
|
||||
`define EXT_D_ENABLED 1
|
||||
`else
|
||||
`define EXT_D_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define EXT_F_ENABLED 1
|
||||
`else
|
||||
`define EXT_F_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_M_ENABLE
|
||||
`define EXT_M_ENABLED 1
|
||||
`else
|
||||
`define EXT_M_ENABLED 0
|
||||
`endif
|
||||
|
||||
`define ISA_X_ENABLED 0
|
||||
|
||||
`define MISA_EXT 0
|
||||
|
||||
`define MISA_STD (`EXT_A_ENABLED << 0) /* A - Atomic Instructions extension */ \
|
||||
| (0 << 1) /* B - Tentatively reserved for Bit operations extension */ \
|
||||
| (`EXT_C_ENABLED << 2) /* C - Compressed extension */ \
|
||||
| (`EXT_D_ENABLED << 3) /* D - Double precsision floating-point extension */ \
|
||||
| (0 << 4) /* E - RV32E base ISA */ \
|
||||
| (`EXT_F_ENABLED << 5) /* F - Single precsision floating-point extension */ \
|
||||
| (0 << 6) /* G - Additional standard extensions present */ \
|
||||
| (0 << 7) /* H - Hypervisor mode implemented */ \
|
||||
| (1 << 8) /* I - RV32I/64I/128I base ISA */ \
|
||||
| (0 << 9) /* J - Reserved */ \
|
||||
| (0 << 10) /* K - Reserved */ \
|
||||
| (0 << 11) /* L - Tentatively reserved for Bit operations extension */ \
|
||||
| (`EXT_M_ENABLED << 12) /* M - Integer Multiply/Divide extension */ \
|
||||
| (0 << 13) /* N - User level interrupts supported */ \
|
||||
| (0 << 14) /* O - Reserved */ \
|
||||
| (0 << 15) /* P - Tentatively reserved for Packed-SIMD extension */ \
|
||||
| (0 << 16) /* Q - Quad-precision floating-point extension */ \
|
||||
| (0 << 17) /* R - Reserved */ \
|
||||
| (0 << 18) /* S - Supervisor mode implemented */ \
|
||||
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
|
||||
| (1 << 20) /* U - User mode implemented */ \
|
||||
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
|
||||
| (0 << 22) /* W - Reserved */ \
|
||||
| (`ISA_X_ENABLED << 23) /* X - Non-standard extensions present */ \
|
||||
| (0 << 24) /* Y - Reserved */ \
|
||||
| (0 << 25) /* Z - Reserved */
|
||||
|
||||
// Device identification //////////////////////////////////////////////////////
|
||||
|
||||
`define VENDOR_ID 0
|
||||
`define ARCHITECTURE_ID 0
|
||||
`define IMPLEMENTATION_ID 0
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Pipeline Configuration /////////////////////////////////////////////////////
|
||||
|
||||
`ifndef LATENCY_IMUL
|
||||
`define LATENCY_IMUL 3
|
||||
// Issue width
|
||||
`ifndef ISSUE_WIDTH
|
||||
`define ISSUE_WIDTH `MIN(`NUM_WARPS, 4)
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FNCP
|
||||
`define LATENCY_FNCP 2
|
||||
// Number of ALU units
|
||||
`ifndef NUM_ALU_LANES
|
||||
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2)
|
||||
`endif
|
||||
`ifndef NUM_ALU_BLOCKS
|
||||
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FMA
|
||||
`define LATENCY_FMA 4
|
||||
// Number of FPU units
|
||||
`ifndef NUM_FPU_LANES
|
||||
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2)
|
||||
`endif
|
||||
`ifndef NUM_FPU_BLOCKS
|
||||
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FDIV
|
||||
`ifdef ALTERA_S10
|
||||
`define LATENCY_FDIV 34
|
||||
`else
|
||||
`define LATENCY_FDIV 15
|
||||
`endif
|
||||
// Number of LSU units
|
||||
`ifndef NUM_LSU_LANES
|
||||
`define NUM_LSU_LANES `MIN(`NUM_THREADS, 4)
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FSQRT
|
||||
`ifdef ALTERA_S10
|
||||
`define LATENCY_FSQRT 25
|
||||
`else
|
||||
`define LATENCY_FSQRT 10
|
||||
// Number of SFU units
|
||||
`ifndef NUM_SFU_LANES
|
||||
`define NUM_SFU_LANES `MIN(`NUM_THREADS, 4)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FDIVSQRT
|
||||
`define LATENCY_FDIVSQRT 32
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FCVT
|
||||
`define LATENCY_FCVT 5
|
||||
`endif
|
||||
|
||||
`define RESET_DELAY 6
|
||||
|
||||
// CSR Addresses //////////////////////////////////////////////////////////////
|
||||
|
||||
// User Floating-Point CSRs
|
||||
`define CSR_FFLAGS 12'h001
|
||||
`define CSR_FRM 12'h002
|
||||
`define CSR_FCSR 12'h003
|
||||
|
||||
`define CSR_SATP 12'h180
|
||||
|
||||
`define CSR_PMPCFG0 12'h3A0
|
||||
`define CSR_PMPADDR0 12'h3B0
|
||||
|
||||
`define CSR_MSTATUS 12'h300
|
||||
`define CSR_MISA 12'h301
|
||||
`define CSR_MEDELEG 12'h302
|
||||
`define CSR_MIDELEG 12'h303
|
||||
`define CSR_MIE 12'h304
|
||||
`define CSR_MTVEC 12'h305
|
||||
|
||||
`define CSR_MEPC 12'h341
|
||||
|
||||
// Machine Performance-monitoring counters
|
||||
`define CSR_MPM_BASE 12'hB00
|
||||
`define CSR_MPM_BASE_H 12'hB80
|
||||
// PERF: pipeline
|
||||
`define CSR_MCYCLE 12'hB00
|
||||
`define CSR_MCYCLE_H 12'hB80
|
||||
`define CSR_MPM_RESERVED 12'hB01
|
||||
`define CSR_MPM_RESERVED_H 12'hB81
|
||||
`define CSR_MINSTRET 12'hB02
|
||||
`define CSR_MINSTRET_H 12'hB82
|
||||
`define CSR_MPM_IBUF_ST 12'hB03
|
||||
`define CSR_MPM_IBUF_ST_H 12'hB83
|
||||
`define CSR_MPM_SCRB_ST 12'hB04
|
||||
`define CSR_MPM_SCRB_ST_H 12'hB84
|
||||
`define CSR_MPM_ALU_ST 12'hB05
|
||||
`define CSR_MPM_ALU_ST_H 12'hB85
|
||||
`define CSR_MPM_LSU_ST 12'hB06
|
||||
`define CSR_MPM_LSU_ST_H 12'hB86
|
||||
`define CSR_MPM_CSR_ST 12'hB07
|
||||
`define CSR_MPM_CSR_ST_H 12'hB87
|
||||
`define CSR_MPM_FPU_ST 12'hB08
|
||||
`define CSR_MPM_FPU_ST_H 12'hB88
|
||||
`define CSR_MPM_GPU_ST 12'hB09
|
||||
`define CSR_MPM_GPU_ST_H 12'hB89
|
||||
// PERF: decode
|
||||
`define CSR_MPM_LOADS 12'hB0A
|
||||
`define CSR_MPM_LOADS_H 12'hB8A
|
||||
`define CSR_MPM_STORES 12'hB0B
|
||||
`define CSR_MPM_STORES_H 12'hB8B
|
||||
`define CSR_MPM_BRANCHES 12'hB0C
|
||||
`define CSR_MPM_BRANCHES_H 12'hB8C
|
||||
// PERF: icache
|
||||
`define CSR_MPM_ICACHE_READS 12'hB0D // total reads
|
||||
`define CSR_MPM_ICACHE_READS_H 12'hB8D
|
||||
`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses
|
||||
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E
|
||||
// PERF: dcache
|
||||
`define CSR_MPM_DCACHE_READS 12'hB0F // total reads
|
||||
`define CSR_MPM_DCACHE_READS_H 12'hB8F
|
||||
`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes
|
||||
`define CSR_MPM_DCACHE_WRITES_H 12'hB90
|
||||
`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses
|
||||
`define CSR_MPM_DCACHE_MISS_R_H 12'hB91
|
||||
`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses
|
||||
`define CSR_MPM_DCACHE_MISS_W_H 12'hB92
|
||||
`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts
|
||||
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93
|
||||
`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls
|
||||
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94
|
||||
// PERF: smem
|
||||
`define CSR_MPM_SMEM_READS 12'hB15 // total reads
|
||||
`define CSR_MPM_SMEM_READS_H 12'hB95
|
||||
`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes
|
||||
`define CSR_MPM_SMEM_WRITES_H 12'hB96
|
||||
`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB97
|
||||
// PERF: memory
|
||||
`define CSR_MPM_MEM_READS 12'hB18 // memory reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB98
|
||||
`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define CSR_MPM_MEM_LAT 12'hB1A // memory latency
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9A
|
||||
// PERF: texunit
|
||||
`define CSR_MPM_TEX_READS 12'hB1B // texture accesses
|
||||
`define CSR_MPM_TEX_READS_H 12'hB9B
|
||||
`define CSR_MPM_TEX_LAT 12'hB1C // texture latency
|
||||
`define CSR_MPM_TEX_LAT_H 12'hB9C
|
||||
|
||||
// Machine Information Registers
|
||||
`define CSR_MVENDORID 12'hF11
|
||||
`define CSR_MARCHID 12'hF12
|
||||
`define CSR_MIMPID 12'hF13
|
||||
`define CSR_MHARTID 12'hF14
|
||||
|
||||
// User SIMT CSRs
|
||||
`define CSR_WTID 12'hCC0
|
||||
`define CSR_LTID 12'hCC1
|
||||
`define CSR_GTID 12'hCC2
|
||||
`define CSR_LWID 12'hCC3
|
||||
`define CSR_GWID `CSR_MHARTID
|
||||
`define CSR_GCID 12'hCC5
|
||||
`define CSR_TMASK 12'hCC4
|
||||
|
||||
// Machine SIMT CSRs
|
||||
`define CSR_NT 12'hFC0
|
||||
`define CSR_NW 12'hFC1
|
||||
`define CSR_NC 12'hFC2
|
||||
|
||||
////////// Texture Units //////////////////////////////////////////////////////
|
||||
|
||||
`define NUM_TEX_UNITS 2
|
||||
`define TEX_SUBPIXEL_BITS 8
|
||||
|
||||
`define TEX_DIM_BITS 15
|
||||
`define TEX_LOD_MAX `TEX_DIM_BITS
|
||||
`define TEX_LOD_BITS 4
|
||||
|
||||
`define TEX_FXD_BITS 32
|
||||
`define TEX_FXD_FRAC (`TEX_DIM_BITS+`TEX_SUBPIXEL_BITS)
|
||||
|
||||
`define TEX_STATE_ADDR 0
|
||||
`define TEX_STATE_WIDTH 1
|
||||
`define TEX_STATE_HEIGHT 2
|
||||
`define TEX_STATE_FORMAT 3
|
||||
`define TEX_STATE_FILTER 4
|
||||
`define TEX_STATE_WRAPU 5
|
||||
`define TEX_STATE_WRAPV 6
|
||||
`define TEX_STATE_MIPOFF(lod) (7+(lod))
|
||||
`define NUM_TEX_STATES (`TEX_STATE_MIPOFF(`TEX_LOD_MAX)+1)
|
||||
|
||||
`define CSR_TEX_UNIT 12'hFD0
|
||||
|
||||
`define CSR_TEX_STATE_BEGIN 12'hFD1
|
||||
`define CSR_TEX_ADDR (`CSR_TEX_STATE_BEGIN+`TEX_STATE_ADDR)
|
||||
`define CSR_TEX_WIDTH (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WIDTH)
|
||||
`define CSR_TEX_HEIGHT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_HEIGHT)
|
||||
`define CSR_TEX_FORMAT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FORMAT)
|
||||
`define CSR_TEX_FILTER (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FILTER)
|
||||
`define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU)
|
||||
`define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV)
|
||||
`define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod))
|
||||
`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN+`NUM_TEX_STATES)
|
||||
|
||||
`define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN)
|
||||
|
||||
// Pipeline Queues ////////////////////////////////////////////////////////////
|
||||
|
||||
// Size of Instruction Buffer
|
||||
`ifndef IBUF_SIZE
|
||||
`define IBUF_SIZE 2
|
||||
`define IBUF_SIZE (2 * (`NUM_WARPS / `ISSUE_WIDTH))
|
||||
`endif
|
||||
|
||||
// Size of LSU Request Queue
|
||||
`ifndef LSUQ_SIZE
|
||||
`define LSUQ_SIZE (`NUM_WARPS * 2)
|
||||
`define LSUQ_SIZE (2 * (`NUM_THREADS / `NUM_LSU_LANES))
|
||||
`endif
|
||||
|
||||
// LSU Duplicate Address Check
|
||||
`ifdef LSU_DUP
|
||||
`define LSU_DUP_ENABLED 1
|
||||
`else
|
||||
`define LSU_DUP_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
`define GBAR_ENABLED 1
|
||||
`else
|
||||
`define GBAR_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_IMUL
|
||||
`ifdef VIVADO
|
||||
`define LATENCY_IMUL 4
|
||||
`endif
|
||||
`ifdef QUARTUS
|
||||
`define LATENCY_IMUL 3
|
||||
`endif
|
||||
`ifndef LATENCY_IMUL
|
||||
`define LATENCY_IMUL 4
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Floating-Point Units ///////////////////////////////////////////////////////
|
||||
|
||||
// Size of FPU Request Queue
|
||||
`ifndef FPUQ_SIZE
|
||||
`define FPUQ_SIZE 8
|
||||
`ifndef FPU_REQ_QUEUE_SIZE
|
||||
`define FPU_REQ_QUEUE_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
|
||||
`endif
|
||||
|
||||
// Texture Unit Request Queue
|
||||
`ifndef TEXQ_SIZE
|
||||
`define TEXQ_SIZE (`NUM_WARPS * 2)
|
||||
// FNCP Latency
|
||||
`ifndef LATENCY_FNCP
|
||||
`define LATENCY_FNCP 2
|
||||
`endif
|
||||
|
||||
// FMA Latency
|
||||
`ifndef LATENCY_FMA
|
||||
`ifdef FPU_DPI
|
||||
`define LATENCY_FMA 4
|
||||
`endif
|
||||
`ifdef FPU_FPNEW
|
||||
`define LATENCY_FMA 4
|
||||
`endif
|
||||
`ifdef FPU_DSP
|
||||
`ifdef QUARTUS
|
||||
`define LATENCY_FMA 4
|
||||
`endif
|
||||
`ifdef VIVADO
|
||||
`define LATENCY_FMA 16
|
||||
`endif
|
||||
`ifndef LATENCY_FMA
|
||||
`define LATENCY_FMA 4
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// FDIV Latency
|
||||
`ifndef LATENCY_FDIV
|
||||
`ifdef FPU_DPI
|
||||
`define LATENCY_FDIV 15
|
||||
`endif
|
||||
`ifdef FPU_FPNEW
|
||||
`define LATENCY_FDIV 16
|
||||
`endif
|
||||
`ifdef FPU_DSP
|
||||
`ifdef QUARTUS
|
||||
`define LATENCY_FDIV 15
|
||||
`endif
|
||||
`ifdef VIVADO
|
||||
`define LATENCY_FDIV 28
|
||||
`endif
|
||||
`ifndef LATENCY_FDIV
|
||||
`define LATENCY_FDIV 16
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// FSQRT Latency
|
||||
`ifndef LATENCY_FSQRT
|
||||
`ifdef FPU_DPI
|
||||
`define LATENCY_FSQRT 10
|
||||
`endif
|
||||
`ifdef FPU_FPNEW
|
||||
`define LATENCY_FSQRT 16
|
||||
`endif
|
||||
`ifdef FPU_DSP
|
||||
`ifdef QUARTUS
|
||||
`define LATENCY_FSQRT 10
|
||||
`endif
|
||||
`ifdef VIVADO
|
||||
`define LATENCY_FSQRT 28
|
||||
`endif
|
||||
`ifndef LATENCY_FSQRT
|
||||
`define LATENCY_FSQRT 16
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// FCVT Latency
|
||||
`ifndef LATENCY_FCVT
|
||||
`define LATENCY_FCVT 5
|
||||
`endif
|
||||
|
||||
// Icache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Size of cache in bytes
|
||||
`ifndef ICACHE_SIZE
|
||||
`define ICACHE_SIZE 16384
|
||||
// Cache Enable
|
||||
`ifndef ICACHE_DISABLE
|
||||
`define ICACHE_ENABLE
|
||||
`endif
|
||||
`ifdef ICACHE_ENABLE
|
||||
`define ICACHE_ENABLED 1
|
||||
`else
|
||||
`define ICACHE_ENABLED 0
|
||||
`define NUM_ICACHES 0
|
||||
`endif
|
||||
|
||||
// Core Request Queue Size
|
||||
`ifndef ICACHE_CREQ_SIZE
|
||||
`define ICACHE_CREQ_SIZE 0
|
||||
// Number of Cache Units
|
||||
`ifndef NUM_ICACHES
|
||||
`define NUM_ICACHES `UP(`NUM_CORES / 4)
|
||||
`endif
|
||||
|
||||
// Cache Size
|
||||
`ifndef ICACHE_SIZE
|
||||
`define ICACHE_SIZE 16384
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
|
@ -314,7 +477,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef ICACHE_MSHR_SIZE
|
||||
`define ICACHE_MSHR_SIZE `NUM_WARPS
|
||||
`define ICACHE_MSHR_SIZE 16
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -327,26 +490,38 @@
|
|||
`define ICACHE_MRSQ_SIZE 0
|
||||
`endif
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef ICACHE_NUM_WAYS
|
||||
`define ICACHE_NUM_WAYS 2
|
||||
`endif
|
||||
|
||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
||||
// Size of cache in bytes
|
||||
// Cache Enable
|
||||
`ifndef DCACHE_DISABLE
|
||||
`define DCACHE_ENABLE
|
||||
`endif
|
||||
`ifdef DCACHE_ENABLE
|
||||
`define DCACHE_ENABLED 1
|
||||
`else
|
||||
`define DCACHE_ENABLED 0
|
||||
`define NUM_DCACHES 0
|
||||
`define DCACHE_NUM_BANKS 1
|
||||
`endif
|
||||
|
||||
// Number of Cache Units
|
||||
`ifndef NUM_DCACHES
|
||||
`define NUM_DCACHES `UP(`NUM_CORES / 4)
|
||||
`endif
|
||||
|
||||
// Cache Size
|
||||
`ifndef DCACHE_SIZE
|
||||
`define DCACHE_SIZE 16384
|
||||
`endif
|
||||
|
||||
// Number of banks
|
||||
// Number of Banks
|
||||
`ifndef DCACHE_NUM_BANKS
|
||||
`define DCACHE_NUM_BANKS `NUM_THREADS
|
||||
`endif
|
||||
|
||||
// Number of ports per bank
|
||||
`ifndef DCACHE_NUM_PORTS
|
||||
`define DCACHE_NUM_PORTS 1
|
||||
`endif
|
||||
|
||||
// Core Request Queue Size
|
||||
`ifndef DCACHE_CREQ_SIZE
|
||||
`define DCACHE_CREQ_SIZE 0
|
||||
`define DCACHE_NUM_BANKS (`NUM_LSU_LANES)
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
|
@ -356,7 +531,7 @@
|
|||
|
||||
// Miss Handling Register Size
|
||||
`ifndef DCACHE_MSHR_SIZE
|
||||
`define DCACHE_MSHR_SIZE `LSUQ_SIZE
|
||||
`define DCACHE_MSHR_SIZE 16
|
||||
`endif
|
||||
|
||||
// Memory Request Queue Size
|
||||
|
@ -369,54 +544,42 @@
|
|||
`define DCACHE_MRSQ_SIZE 0
|
||||
`endif
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef DCACHE_NUM_WAYS
|
||||
`define DCACHE_NUM_WAYS 2
|
||||
`endif
|
||||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
|
||||
// per thread stack size
|
||||
`ifndef STACK_LOG2_SIZE
|
||||
`define STACK_LOG2_SIZE 10
|
||||
`ifndef SM_DISABLE
|
||||
`define SM_ENABLE
|
||||
`endif
|
||||
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
|
||||
|
||||
// Size of cache in bytes
|
||||
`ifndef SMEM_SIZE
|
||||
`define SMEM_SIZE (`STACK_SIZE * `NUM_WARPS * `NUM_THREADS)
|
||||
`ifdef SM_ENABLE
|
||||
`define SM_ENABLED 1
|
||||
`else
|
||||
`define SM_ENABLED 0
|
||||
`define SMEM_NUM_BANKS 1
|
||||
`endif
|
||||
|
||||
// Number of banks
|
||||
// Number of Banks
|
||||
`ifndef SMEM_NUM_BANKS
|
||||
`define SMEM_NUM_BANKS `NUM_THREADS
|
||||
`endif
|
||||
|
||||
// Core Request Queue Size
|
||||
`ifndef SMEM_CREQ_SIZE
|
||||
`define SMEM_CREQ_SIZE 2
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
`ifndef SMEM_CRSQ_SIZE
|
||||
`define SMEM_CRSQ_SIZE 2
|
||||
`define SMEM_NUM_BANKS (`NUM_LSU_LANES)
|
||||
`endif
|
||||
|
||||
// L2cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
// Size of cache in bytes
|
||||
// Cache Size
|
||||
`ifndef L2_CACHE_SIZE
|
||||
`define L2_CACHE_SIZE 131072
|
||||
`ifdef ALTERA_S10
|
||||
`define L2_CACHE_SIZE 2097152
|
||||
`else
|
||||
`define L2_CACHE_SIZE 1048576
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Number of banks
|
||||
// Number of Banks
|
||||
`ifndef L2_NUM_BANKS
|
||||
`define L2_NUM_BANKS ((`NUM_CORES < 4) ? `NUM_CORES : 4)
|
||||
`endif
|
||||
|
||||
// Number of ports per bank
|
||||
`ifndef L2_NUM_PORTS
|
||||
`define L2_NUM_PORTS 1
|
||||
`endif
|
||||
|
||||
// Core Request Queue Size
|
||||
`ifndef L2_CREQ_SIZE
|
||||
`define L2_CREQ_SIZE 0
|
||||
`define L2_NUM_BANKS 2
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
|
@ -439,26 +602,25 @@
|
|||
`define L2_MRSQ_SIZE 0
|
||||
`endif
|
||||
|
||||
// Number of Associative Ways
|
||||
`ifndef L2_NUM_WAYS
|
||||
`define L2_NUM_WAYS 4
|
||||
`endif
|
||||
|
||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
||||
// Size of cache in bytes
|
||||
// Cache Size
|
||||
`ifndef L3_CACHE_SIZE
|
||||
`ifdef ALTERA_S10
|
||||
`define L3_CACHE_SIZE 2097152
|
||||
`else
|
||||
`define L3_CACHE_SIZE 1048576
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Number of banks
|
||||
// Number of Banks
|
||||
`ifndef L3_NUM_BANKS
|
||||
`define L3_NUM_BANKS ((`NUM_CLUSTERS < 4) ? `NUM_CORES : 4)
|
||||
`endif
|
||||
|
||||
// Number of ports per bank
|
||||
`ifndef L3_NUM_PORTS
|
||||
`define L3_NUM_PORTS 1
|
||||
`endif
|
||||
|
||||
// Core Request Queue Size
|
||||
`ifndef L3_CREQ_SIZE
|
||||
`define L3_CREQ_SIZE 0
|
||||
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS)
|
||||
`endif
|
||||
|
||||
// Core Response Queue Size
|
||||
|
@ -481,4 +643,9 @@
|
|||
`define L3_MRSQ_SIZE 0
|
||||
`endif
|
||||
|
||||
`endif
|
||||
// Number of Associative Ways
|
||||
`ifndef L3_NUM_WAYS
|
||||
`define L3_NUM_WAYS 4
|
||||
`endif
|
||||
|
||||
`endif // VX_CONFIG_VH
|
||||
|
|
|
@ -1,156 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_core #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_core
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [`DCACHE_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
|
||||
output wire [`DCACHE_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`DCACHE_MEM_DATA_WIDTH-1:0] mem_req_data,
|
||||
output wire [`L1_MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory reponse
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`DCACHE_MEM_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [`L1_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_memsys_if perf_memsys_if();
|
||||
`endif
|
||||
|
||||
VX_mem_req_if #(
|
||||
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
|
||||
.TAG_WIDTH (`L1_MEM_TAG_WIDTH)
|
||||
) mem_req_if();
|
||||
|
||||
VX_mem_rsp_if #(
|
||||
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
|
||||
.TAG_WIDTH (`L1_MEM_TAG_WIDTH)
|
||||
) mem_rsp_if();
|
||||
|
||||
assign mem_req_valid = mem_req_if.valid;
|
||||
assign mem_req_rw = mem_req_if.rw;
|
||||
assign mem_req_byteen= mem_req_if.byteen;
|
||||
assign mem_req_addr = mem_req_if.addr;
|
||||
assign mem_req_data = mem_req_if.data;
|
||||
assign mem_req_tag = mem_req_if.tag;
|
||||
assign mem_req_if.ready = mem_req_ready;
|
||||
|
||||
assign mem_rsp_if.valid = mem_rsp_valid;
|
||||
assign mem_rsp_if.data = mem_rsp_data;
|
||||
assign mem_rsp_if.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_rsp_if.ready;
|
||||
|
||||
//--
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`DCACHE_NUM_REQS),
|
||||
.WORD_SIZE (`DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
|
||||
) dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`DCACHE_NUM_REQS),
|
||||
.WORD_SIZE (`DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
|
||||
) dcache_rsp_if();
|
||||
|
||||
VX_icache_req_if #(
|
||||
.WORD_SIZE (`ICACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
|
||||
) icache_req_if();
|
||||
|
||||
VX_icache_rsp_if #(
|
||||
.WORD_SIZE (`ICACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
|
||||
) icache_rsp_if();
|
||||
|
||||
VX_pipeline #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) pipeline (
|
||||
`SCOPE_BIND_VX_core_pipeline
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
`endif
|
||||
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
|
||||
// Dcache core request
|
||||
.dcache_req_valid (dcache_req_if.valid),
|
||||
.dcache_req_rw (dcache_req_if.rw),
|
||||
.dcache_req_byteen (dcache_req_if.byteen),
|
||||
.dcache_req_addr (dcache_req_if.addr),
|
||||
.dcache_req_data (dcache_req_if.data),
|
||||
.dcache_req_tag (dcache_req_if.tag),
|
||||
.dcache_req_ready (dcache_req_if.ready),
|
||||
|
||||
// Dcache core reponse
|
||||
.dcache_rsp_valid (dcache_rsp_if.valid),
|
||||
.dcache_rsp_tmask (dcache_rsp_if.tmask),
|
||||
.dcache_rsp_data (dcache_rsp_if.data),
|
||||
.dcache_rsp_tag (dcache_rsp_if.tag),
|
||||
.dcache_rsp_ready (dcache_rsp_if.ready),
|
||||
|
||||
// Icache core request
|
||||
.icache_req_valid (icache_req_if.valid),
|
||||
.icache_req_addr (icache_req_if.addr),
|
||||
.icache_req_tag (icache_req_if.tag),
|
||||
.icache_req_ready (icache_req_if.ready),
|
||||
|
||||
// Icache core reponse
|
||||
.icache_rsp_valid (icache_rsp_if.valid),
|
||||
.icache_rsp_data (icache_rsp_if.data),
|
||||
.icache_rsp_tag (icache_rsp_if.tag),
|
||||
.icache_rsp_ready (icache_rsp_if.ready),
|
||||
|
||||
// Status
|
||||
.busy(busy)
|
||||
);
|
||||
|
||||
//--
|
||||
|
||||
VX_mem_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) mem_unit (
|
||||
`SCOPE_BIND_VX_core_mem_unit
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
`endif
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Core <-> Dcache
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if),
|
||||
|
||||
// Core <-> Icache
|
||||
.icache_req_if (icache_req_if),
|
||||
.icache_rsp_if (icache_rsp_if),
|
||||
|
||||
// Memory
|
||||
.mem_req_if (mem_req_if),
|
||||
.mem_rsp_if (mem_rsp_if)
|
||||
);
|
||||
|
||||
endmodule
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,265 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_csr_data #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_perf_tex_if.slave perf_tex_if,
|
||||
`endif
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
VX_perf_pipeline_if.slave perf_pipeline_if,
|
||||
`endif
|
||||
|
||||
VX_cmt_to_csr_if.slave cmt_to_csr_if,
|
||||
VX_fetch_to_csr_if.slave fetch_to_csr_if,
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_to_csr_if.slave fpu_to_csr_if,
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_tex_csr_if.master tex_csr_if,
|
||||
`endif
|
||||
|
||||
input wire read_enable,
|
||||
input wire [`UUID_BITS-1:0] read_uuid,
|
||||
input wire[`CSR_ADDR_BITS-1:0] read_addr,
|
||||
input wire[`NW_BITS-1:0] read_wid,
|
||||
output wire[31:0] read_data,
|
||||
|
||||
input wire write_enable,
|
||||
input wire [`UUID_BITS-1:0] write_uuid,
|
||||
input wire[`CSR_ADDR_BITS-1:0] write_addr,
|
||||
input wire[`NW_BITS-1:0] write_wid,
|
||||
input wire[31:0] write_data,
|
||||
|
||||
input wire busy
|
||||
);
|
||||
import fpu_types::*;
|
||||
|
||||
reg [`CSR_WIDTH-1:0] csr_satp;
|
||||
reg [`CSR_WIDTH-1:0] csr_mstatus;
|
||||
reg [`CSR_WIDTH-1:0] csr_medeleg;
|
||||
reg [`CSR_WIDTH-1:0] csr_mideleg;
|
||||
reg [`CSR_WIDTH-1:0] csr_mie;
|
||||
reg [`CSR_WIDTH-1:0] csr_mtvec;
|
||||
reg [`CSR_WIDTH-1:0] csr_mepc;
|
||||
reg [`CSR_WIDTH-1:0] csr_pmpcfg [0:0];
|
||||
reg [`CSR_WIDTH-1:0] csr_pmpaddr [0:0];
|
||||
reg [63:0] csr_cycle;
|
||||
reg [63:0] csr_instret;
|
||||
|
||||
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FFLAGS_BITS-1:0] fcsr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
fcsr <= '0;
|
||||
end else begin
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_to_csr_if.write_enable) begin
|
||||
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
|
||||
| fpu_to_csr_if.write_fflags;
|
||||
end
|
||||
`endif
|
||||
if (write_enable) begin
|
||||
case (write_addr)
|
||||
`CSR_FFLAGS: fcsr[write_wid][`FFLAGS_BITS-1:0] <= write_data[`FFLAGS_BITS-1:0];
|
||||
`CSR_FRM: fcsr[write_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS] <= write_data[`INST_FRM_BITS-1:0];
|
||||
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFLAGS_BITS+`INST_FRM_BITS-1:0];
|
||||
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
default: begin
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ASSERT((write_addr == `CSR_TEX_UNIT)
|
||||
|| (write_addr >= `CSR_TEX_STATE_BEGIN
|
||||
&& write_addr < `CSR_TEX_STATE_END),
|
||||
("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
|
||||
`else
|
||||
`ASSERT(~write_enable, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
|
||||
`endif
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
`UNUSED_VAR (write_data)
|
||||
|
||||
// TEX CSRs
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
assign tex_csr_if.write_enable = write_enable;
|
||||
assign tex_csr_if.write_addr = write_addr;
|
||||
assign tex_csr_if.write_data = write_data;
|
||||
assign tex_csr_if.write_uuid = write_uuid;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
csr_cycle <= 0;
|
||||
csr_instret <= 0;
|
||||
end else begin
|
||||
if (busy) begin
|
||||
csr_cycle <= csr_cycle + 1;
|
||||
end
|
||||
if (cmt_to_csr_if.valid) begin
|
||||
csr_instret <= csr_instret + 64'(cmt_to_csr_if.commit_size);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
reg [31:0] read_data_r;
|
||||
reg read_addr_valid_r;
|
||||
|
||||
always @(*) begin
|
||||
read_data_r = 'x;
|
||||
read_addr_valid_r = 1;
|
||||
case (read_addr)
|
||||
`CSR_FFLAGS : read_data_r = 32'(fcsr[read_wid][`FFLAGS_BITS-1:0]);
|
||||
`CSR_FRM : read_data_r = 32'(fcsr[read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS]);
|
||||
`CSR_FCSR : read_data_r = 32'(fcsr[read_wid]);
|
||||
|
||||
`CSR_WTID ,
|
||||
`CSR_LTID ,
|
||||
`CSR_LWID : read_data_r = 32'(read_wid);
|
||||
`CSR_GTID ,
|
||||
/*`CSR_MHARTID ,*/
|
||||
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
|
||||
`CSR_GCID : read_data_r = CORE_ID;
|
||||
|
||||
`CSR_TMASK : read_data_r = 32'(fetch_to_csr_if.thread_masks[read_wid]);
|
||||
|
||||
`CSR_NT : read_data_r = `NUM_THREADS;
|
||||
`CSR_NW : read_data_r = `NUM_WARPS;
|
||||
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;
|
||||
|
||||
`CSR_MCYCLE : read_data_r = csr_cycle[31:0];
|
||||
`CSR_MCYCLE_H : read_data_r = 32'(csr_cycle[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MINSTRET : read_data_r = csr_instret[31:0];
|
||||
`CSR_MINSTRET_H : read_data_r = 32'(csr_instret[`PERF_CTR_BITS-1:32]);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
// PERF: pipeline
|
||||
`CSR_MPM_IBUF_ST : read_data_r = perf_pipeline_if.ibf_stalls[31:0];
|
||||
`CSR_MPM_IBUF_ST_H : read_data_r = 32'(perf_pipeline_if.ibf_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SCRB_ST : read_data_r = perf_pipeline_if.scb_stalls[31:0];
|
||||
`CSR_MPM_SCRB_ST_H : read_data_r = 32'(perf_pipeline_if.scb_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ALU_ST : read_data_r = perf_pipeline_if.alu_stalls[31:0];
|
||||
`CSR_MPM_ALU_ST_H : read_data_r = 32'(perf_pipeline_if.alu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_LSU_ST : read_data_r = perf_pipeline_if.lsu_stalls[31:0];
|
||||
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
|
||||
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`else
|
||||
`CSR_MPM_FPU_ST : read_data_r = '0;
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = '0;
|
||||
`endif
|
||||
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
|
||||
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: decode
|
||||
`CSR_MPM_LOADS : read_data_r = perf_pipeline_if.loads[31:0];
|
||||
`CSR_MPM_LOADS_H : read_data_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_STORES : read_data_r = perf_pipeline_if.stores[31:0];
|
||||
`CSR_MPM_STORES_H : read_data_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_BRANCHES : read_data_r = perf_pipeline_if.branches[31:0];
|
||||
`CSR_MPM_BRANCHES_H : read_data_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: icache
|
||||
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0];
|
||||
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
|
||||
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0];
|
||||
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0];
|
||||
`CSR_MPM_DCACHE_WRITES_H : read_data_r = 32'(perf_memsys_if.dcache_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MISS_R : read_data_r = perf_memsys_if.dcache_read_misses[31:0];
|
||||
`CSR_MPM_DCACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.dcache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MISS_W : read_data_r = perf_memsys_if.dcache_write_misses[31:0];
|
||||
`CSR_MPM_DCACHE_MISS_W_H : read_data_r = 32'(perf_memsys_if.dcache_write_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_BANK_ST : read_data_r = perf_memsys_if.dcache_bank_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
|
||||
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
|
||||
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
|
||||
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: memory
|
||||
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0];
|
||||
`CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0];
|
||||
`CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0];
|
||||
`CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
// PERF: texunit
|
||||
`CSR_MPM_TEX_READS : read_data_r = perf_tex_if.mem_reads[31:0];
|
||||
`CSR_MPM_TEX_READS_H : read_data_r = 32'(perf_tex_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_TEX_LAT : read_data_r = perf_tex_if.mem_latency[31:0];
|
||||
`CSR_MPM_TEX_LAT_H : read_data_r = 32'(perf_tex_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
`endif
|
||||
// PERF: reserved
|
||||
`CSR_MPM_RESERVED : read_data_r = '0;
|
||||
`CSR_MPM_RESERVED_H : read_data_r = '0;
|
||||
`endif
|
||||
|
||||
`CSR_SATP : read_data_r = 32'(csr_satp);
|
||||
|
||||
`CSR_MSTATUS : read_data_r = 32'(csr_mstatus);
|
||||
`CSR_MISA : read_data_r = `ISA_CODE;
|
||||
`CSR_MEDELEG : read_data_r = 32'(csr_medeleg);
|
||||
`CSR_MIDELEG : read_data_r = 32'(csr_mideleg);
|
||||
`CSR_MIE : read_data_r = 32'(csr_mie);
|
||||
`CSR_MTVEC : read_data_r = 32'(csr_mtvec);
|
||||
|
||||
`CSR_MEPC : read_data_r = 32'(csr_mepc);
|
||||
|
||||
`CSR_PMPCFG0 : read_data_r = 32'(csr_pmpcfg[0]);
|
||||
`CSR_PMPADDR0 : read_data_r = 32'(csr_pmpaddr[0]);
|
||||
|
||||
`CSR_MVENDORID : read_data_r = `VENDOR_ID;
|
||||
`CSR_MARCHID : read_data_r = `ARCHITECTURE_ID;
|
||||
`CSR_MIMPID : read_data_r = `IMPLEMENTATION_ID;
|
||||
|
||||
default: begin
|
||||
if ((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
|
||||
|| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32))) begin
|
||||
read_addr_valid_r = 1;
|
||||
end else
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
if ((read_addr == `CSR_TEX_UNIT)
|
||||
|| (read_addr >= `CSR_TEX_STATE_BEGIN
|
||||
&& read_addr < `CSR_TEX_STATE_END)) begin
|
||||
read_addr_valid_r = 1;
|
||||
end else
|
||||
`endif
|
||||
read_addr_valid_r = 0;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: %0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
assign read_data = read_data_r;
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign fpu_to_csr_if.read_frm = fcsr[fpu_to_csr_if.read_wid][`INST_FRM_BITS+`FFLAGS_BITS-1:`FFLAGS_BITS];
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,151 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_csr_unit #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_perf_tex_if.slave perf_tex_if,
|
||||
`endif
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
VX_perf_pipeline_if.slave perf_pipeline_if,
|
||||
`endif
|
||||
|
||||
VX_cmt_to_csr_if.slave cmt_to_csr_if,
|
||||
VX_fetch_to_csr_if.slave fetch_to_csr_if,
|
||||
VX_csr_req_if.slave csr_req_if,
|
||||
VX_commit_if.master csr_commit_if,
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_to_csr_if.slave fpu_to_csr_if,
|
||||
input wire[`NUM_WARPS-1:0] fpu_pending,
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_tex_csr_if.master tex_csr_if,
|
||||
`endif
|
||||
|
||||
output wire[`NUM_WARPS-1:0] pending,
|
||||
input wire busy
|
||||
);
|
||||
wire csr_we_s1;
|
||||
wire [`CSR_ADDR_BITS-1:0] csr_addr_s1;
|
||||
wire [31:0] csr_read_data;
|
||||
wire [31:0] csr_read_data_s1;
|
||||
wire [31:0] csr_updated_data_s1;
|
||||
|
||||
wire write_enable = csr_commit_if.valid && csr_we_s1;
|
||||
|
||||
wire [31:0] csr_req_data = csr_req_if.use_imm ? 32'(csr_req_if.imm) : csr_req_if.rs1_data;
|
||||
|
||||
VX_csr_data #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) csr_data (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.read_enable (csr_req_if.valid),
|
||||
.read_uuid (csr_req_if.uuid),
|
||||
.read_addr (csr_req_if.addr),
|
||||
.read_wid (csr_req_if.wid),
|
||||
.read_data (csr_read_data),
|
||||
.write_enable (write_enable),
|
||||
.write_uuid (csr_commit_if.uuid),
|
||||
.write_addr (csr_addr_s1),
|
||||
.write_wid (csr_commit_if.wid),
|
||||
.write_data (csr_updated_data_s1),
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
wire write_hazard = (csr_addr_s1 == csr_req_if.addr)
|
||||
&& (csr_commit_if.wid == csr_req_if.wid)
|
||||
&& csr_commit_if.valid;
|
||||
|
||||
wire [31:0] csr_read_data_qual = write_hazard ? csr_updated_data_s1 : csr_read_data;
|
||||
|
||||
reg [31:0] csr_updated_data;
|
||||
reg csr_we_s0_unqual;
|
||||
|
||||
always @(*) begin
|
||||
csr_we_s0_unqual = (csr_req_data != 0);
|
||||
case (csr_req_if.op_type)
|
||||
`INST_CSR_RW: begin
|
||||
csr_updated_data = csr_req_data;
|
||||
csr_we_s0_unqual = 1;
|
||||
end
|
||||
`INST_CSR_RS: begin
|
||||
csr_updated_data = csr_read_data_qual | csr_req_data;
|
||||
end
|
||||
//`INST_CSR_RC
|
||||
default: begin
|
||||
csr_updated_data = csr_read_data_qual & ~csr_req_data;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire stall_in = fpu_pending[csr_req_if.wid];
|
||||
`else
|
||||
wire stall_in = 0;
|
||||
`endif
|
||||
|
||||
wire csr_req_valid = csr_req_if.valid && !stall_in;
|
||||
|
||||
wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({csr_req_valid, csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}),
|
||||
.data_out ({csr_commit_if.valid, csr_commit_if.uuid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1})
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign csr_commit_if.data[i] = (csr_addr_s1 == `CSR_WTID) ? i :
|
||||
(csr_addr_s1 == `CSR_LTID
|
||||
|| csr_addr_s1 == `CSR_GTID) ? (csr_read_data_s1 * `NUM_THREADS + i) :
|
||||
csr_read_data_s1;
|
||||
end
|
||||
|
||||
assign csr_commit_if.eop = 1'b1;
|
||||
|
||||
// can accept new request?
|
||||
assign csr_req_if.ready = ~(stall_out || stall_in);
|
||||
|
||||
// pending request
|
||||
reg [`NUM_WARPS-1:0] pending_r;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
pending_r <= 0;
|
||||
end else begin
|
||||
if (csr_commit_if.valid && csr_commit_if.ready) begin
|
||||
pending_r[csr_commit_if.wid] <= 0;
|
||||
end
|
||||
if (csr_req_if.valid && csr_req_if.ready) begin
|
||||
pending_r[csr_req_if.wid] <= 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
assign pending = pending_r;
|
||||
|
||||
endmodule
|
|
@ -1,495 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
`include "VX_trace_instr.vh"
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define USED_IREG(r) \
|
||||
r``_r = {1'b0, ``r}
|
||||
|
||||
`define USED_FREG(r) \
|
||||
r``_r = {1'b1, ``r}
|
||||
`else
|
||||
`define USED_IREG(r) \
|
||||
r``_r = ``r
|
||||
`endif
|
||||
|
||||
module VX_decode #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if.decode perf_decode_if,
|
||||
`endif
|
||||
|
||||
// inputs
|
||||
VX_ifetch_rsp_if.slave ifetch_rsp_if,
|
||||
|
||||
// outputs
|
||||
VX_decode_if.master decode_if,
|
||||
VX_wstall_if.master wstall_if,
|
||||
VX_join_if.master join_if
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
reg [`EX_BITS-1:0] ex_type;
|
||||
reg [`INST_OP_BITS-1:0] op_type;
|
||||
reg [`INST_MOD_BITS-1:0] op_mod;
|
||||
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
|
||||
reg [31:0] imm;
|
||||
reg use_rd, use_PC, use_imm;
|
||||
reg is_join, is_wstall;
|
||||
|
||||
wire [31:0] instr = ifetch_rsp_if.data;
|
||||
wire [6:0] opcode = instr[6:0];
|
||||
wire [1:0] func2 = instr[26:25];
|
||||
wire [2:0] func3 = instr[14:12];
|
||||
wire [6:0] func7 = instr[31:25];
|
||||
wire [11:0] u_12 = instr[31:20];
|
||||
|
||||
wire [4:0] rd = instr[11:7];
|
||||
wire [4:0] rs1 = instr[19:15];
|
||||
wire [4:0] rs2 = instr[24:20];
|
||||
wire [4:0] rs3 = instr[31:27];
|
||||
|
||||
wire [19:0] upper_imm = {func7, rs2, rs1, func3};
|
||||
wire [11:0] alu_imm = (func3[0] && ~func3[1]) ? {{7{1'b0}}, rs2} : u_12;
|
||||
wire [11:0] s_imm = {func7, rd};
|
||||
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
|
||||
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
|
||||
|
||||
`UNUSED_VAR (rs3)
|
||||
|
||||
always @(*) begin
|
||||
|
||||
ex_type = 0;
|
||||
op_type = 'x;
|
||||
op_mod = 0;
|
||||
rd_r = 0;
|
||||
rs1_r = 0;
|
||||
rs2_r = 0;
|
||||
rs3_r = 0;
|
||||
imm = 'x;
|
||||
use_imm = 0;
|
||||
use_PC = 0;
|
||||
use_rd = 0;
|
||||
is_join = 0;
|
||||
is_wstall = 0;
|
||||
|
||||
case (opcode)
|
||||
`INST_I: begin
|
||||
ex_type = `EX_ALU;
|
||||
case (func3)
|
||||
3'h0: op_type = `INST_OP_BITS'(`INST_ALU_ADD);
|
||||
3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL);
|
||||
3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT);
|
||||
3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU);
|
||||
3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR);
|
||||
3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL);
|
||||
3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR);
|
||||
3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND);
|
||||
default:;
|
||||
endcase
|
||||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
imm = {{20{alu_imm[11]}}, alu_imm};
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`INST_R: begin
|
||||
ex_type = `EX_ALU;
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (func7[0]) begin
|
||||
case (func3)
|
||||
3'h0: op_type = `INST_OP_BITS'(`INST_MUL_MUL);
|
||||
3'h1: op_type = `INST_OP_BITS'(`INST_MUL_MULH);
|
||||
3'h2: op_type = `INST_OP_BITS'(`INST_MUL_MULHSU);
|
||||
3'h3: op_type = `INST_OP_BITS'(`INST_MUL_MULHU);
|
||||
3'h4: op_type = `INST_OP_BITS'(`INST_MUL_DIV);
|
||||
3'h5: op_type = `INST_OP_BITS'(`INST_MUL_DIVU);
|
||||
3'h6: op_type = `INST_OP_BITS'(`INST_MUL_REM);
|
||||
3'h7: op_type = `INST_OP_BITS'(`INST_MUL_REMU);
|
||||
default:;
|
||||
endcase
|
||||
op_mod = 2;
|
||||
end else
|
||||
`endif
|
||||
begin
|
||||
case (func3)
|
||||
3'h0: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SUB) : `INST_OP_BITS'(`INST_ALU_ADD);
|
||||
3'h1: op_type = `INST_OP_BITS'(`INST_ALU_SLL);
|
||||
3'h2: op_type = `INST_OP_BITS'(`INST_ALU_SLT);
|
||||
3'h3: op_type = `INST_OP_BITS'(`INST_ALU_SLTU);
|
||||
3'h4: op_type = `INST_OP_BITS'(`INST_ALU_XOR);
|
||||
3'h5: op_type = (func7[5]) ? `INST_OP_BITS'(`INST_ALU_SRA) : `INST_OP_BITS'(`INST_ALU_SRL);
|
||||
3'h6: op_type = `INST_OP_BITS'(`INST_ALU_OR);
|
||||
3'h7: op_type = `INST_OP_BITS'(`INST_ALU_AND);
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`INST_LUI: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
|
||||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
imm = {upper_imm, 12'(0)};
|
||||
`USED_IREG (rd);
|
||||
rs1_r = 0;
|
||||
end
|
||||
`INST_AUIPC: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
|
||||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
use_PC = 1;
|
||||
imm = {upper_imm, 12'(0)};
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
`INST_JAL: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_BR_JAL);
|
||||
op_mod = 1;
|
||||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
use_PC = 1;
|
||||
is_wstall = 1;
|
||||
imm = {{11{jal_imm[20]}}, jal_imm};
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
`INST_JALR: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_BR_JALR);
|
||||
op_mod = 1;
|
||||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
is_wstall = 1;
|
||||
imm = {{20{u_12[11]}}, u_12};
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`INST_B: begin
|
||||
ex_type = `EX_ALU;
|
||||
case (func3)
|
||||
3'h0: op_type = `INST_OP_BITS'(`INST_BR_EQ);
|
||||
3'h1: op_type = `INST_OP_BITS'(`INST_BR_NE);
|
||||
3'h4: op_type = `INST_OP_BITS'(`INST_BR_LT);
|
||||
3'h5: op_type = `INST_OP_BITS'(`INST_BR_GE);
|
||||
3'h6: op_type = `INST_OP_BITS'(`INST_BR_LTU);
|
||||
3'h7: op_type = `INST_OP_BITS'(`INST_BR_GEU);
|
||||
default:;
|
||||
endcase
|
||||
op_mod = 1;
|
||||
use_imm = 1;
|
||||
use_PC = 1;
|
||||
is_wstall = 1;
|
||||
imm = {{19{b_imm[12]}}, b_imm};
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`INST_FENCE: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_mod = `INST_MOD_BITS'(1);
|
||||
end
|
||||
`INST_SYS : begin
|
||||
if (func3[1:0] != 0) begin
|
||||
ex_type = `EX_CSR;
|
||||
op_type = `INST_OP_BITS'(func3[1:0]);
|
||||
use_rd = 1;
|
||||
use_imm = func3[2];
|
||||
imm[`CSR_ADDR_BITS-1:0] = u_12; // addr
|
||||
`USED_IREG (rd);
|
||||
if (func3[2]) begin
|
||||
imm[`CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm
|
||||
end else begin
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
end else begin
|
||||
ex_type = `EX_ALU;
|
||||
case (u_12)
|
||||
12'h000: op_type = `INST_OP_BITS'(`INST_BR_ECALL);
|
||||
12'h001: op_type = `INST_OP_BITS'(`INST_BR_EBREAK);
|
||||
12'h002: op_type = `INST_OP_BITS'(`INST_BR_URET);
|
||||
12'h102: op_type = `INST_OP_BITS'(`INST_BR_SRET);
|
||||
12'h302: op_type = `INST_OP_BITS'(`INST_BR_MRET);
|
||||
default:;
|
||||
endcase
|
||||
op_mod = 1;
|
||||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
use_PC = 1;
|
||||
is_wstall = 1;
|
||||
imm = 32'd4;
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`INST_FL,
|
||||
`endif
|
||||
`INST_L: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b0, func3});
|
||||
use_rd = 1;
|
||||
imm = {{20{u_12[11]}}, u_12};
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (opcode[2]) begin
|
||||
`USED_FREG (rd);
|
||||
end else
|
||||
`endif
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`INST_FS,
|
||||
`endif
|
||||
`INST_S: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b1, func3});
|
||||
imm = {{20{s_imm[11]}}, s_imm};
|
||||
`USED_IREG (rs1);
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (opcode[2]) begin
|
||||
`USED_FREG (rs2);
|
||||
end else
|
||||
`endif
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`INST_FMADD,
|
||||
`INST_FMSUB,
|
||||
`INST_FNMSUB,
|
||||
`INST_FNMADD: begin
|
||||
ex_type = `EX_FPU;
|
||||
op_type = `INST_OP_BITS'(opcode[3:0]);
|
||||
op_mod = func3;
|
||||
use_rd = 1;
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
`USED_FREG (rs3);
|
||||
end
|
||||
`INST_FCI: begin
|
||||
ex_type = `EX_FPU;
|
||||
op_mod = func3;
|
||||
use_rd = 1;
|
||||
case (func7)
|
||||
7'h00, // FADD
|
||||
7'h04, // FSUB
|
||||
7'h08, // FMUL
|
||||
7'h0C: begin // FDIV
|
||||
op_type = `INST_OP_BITS'(func7[3:0]);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
7'h2C: begin
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
end
|
||||
7'h50: begin
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_CMP);
|
||||
`USED_IREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
7'h60: begin
|
||||
op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTWUS) : `INST_OP_BITS'(`INST_FPU_CVTWS);
|
||||
`USED_IREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
end
|
||||
7'h68: begin
|
||||
op_type = (instr[20]) ? `INST_OP_BITS'(`INST_FPU_CVTSWU) : `INST_OP_BITS'(`INST_FPU_CVTSW);
|
||||
`USED_FREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
7'h10: begin
|
||||
// FSGNJ=0, FSGNJN=1, FSGNJX=2
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod = {1'b0, func3[1:0]};
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
7'h14: begin
|
||||
// FMIN=3, FMAX=4
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod = func3[0] ? 4 : 3;
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
7'h70: begin
|
||||
if (func3[0]) begin
|
||||
// FCLASS
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_CLASS);
|
||||
end else begin
|
||||
// FMV.X.W=5
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod = 5;
|
||||
end
|
||||
`USED_IREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
end
|
||||
7'h78: begin
|
||||
// FMV.W.X=6
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_mod = 6;
|
||||
`USED_FREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`endif
|
||||
`INST_GPGPU: begin
|
||||
ex_type = `EX_GPU;
|
||||
case (func3)
|
||||
3'h0: begin
|
||||
op_type = rs2[0] ? `INST_OP_BITS'(`INST_GPU_PRED) : `INST_OP_BITS'(`INST_GPU_TMC);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h1: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_WSPAWN);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h2: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_SPLIT);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h3: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_JOIN);
|
||||
is_join = 1;
|
||||
end
|
||||
3'h4: begin
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_BAR);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h5: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(`INST_LSU_LW);
|
||||
op_mod = `INST_MOD_BITS'(2);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
`INST_GPU: begin
|
||||
case (func3)
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
3'h0: begin
|
||||
ex_type = `EX_GPU;
|
||||
op_type = `INST_OP_BITS'(`INST_GPU_TEX);
|
||||
op_mod = `INST_MOD_BITS'(func2);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`USED_IREG (rs3);
|
||||
end
|
||||
`endif
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
||||
`UNUSED_VAR (func2)
|
||||
|
||||
// disable write to integer register r0
|
||||
wire wb = use_rd && (| rd_r);
|
||||
|
||||
assign decode_if.valid = ifetch_rsp_if.valid;
|
||||
assign decode_if.uuid = ifetch_rsp_if.uuid;
|
||||
assign decode_if.wid = ifetch_rsp_if.wid;
|
||||
assign decode_if.tmask = ifetch_rsp_if.tmask;
|
||||
assign decode_if.PC = ifetch_rsp_if.PC;
|
||||
assign decode_if.ex_type = ex_type;
|
||||
assign decode_if.op_type = op_type;
|
||||
assign decode_if.op_mod = op_mod;
|
||||
assign decode_if.wb = wb;
|
||||
assign decode_if.rd = rd_r;
|
||||
assign decode_if.rs1 = rs1_r;
|
||||
assign decode_if.rs2 = rs2_r;
|
||||
assign decode_if.rs3 = rs3_r;
|
||||
assign decode_if.imm = imm;
|
||||
assign decode_if.use_PC = use_PC;
|
||||
assign decode_if.use_imm = use_imm;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire ifetch_rsp_fire = ifetch_rsp_if.valid && ifetch_rsp_if.ready;
|
||||
|
||||
assign join_if.valid = ifetch_rsp_fire && is_join;
|
||||
assign join_if.wid = ifetch_rsp_if.wid;
|
||||
|
||||
assign wstall_if.valid = ifetch_rsp_fire;
|
||||
assign wstall_if.wid = ifetch_rsp_if.wid;
|
||||
assign wstall_if.stalled = is_wstall;
|
||||
|
||||
assign ifetch_rsp_if.ready = decode_if.ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle;
|
||||
|
||||
wire [`NUM_THREADS-1:0] perf_loads_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_stores_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_branches_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}};
|
||||
|
||||
`POP_COUNT(perf_loads_per_cycle, perf_loads_per_mask);
|
||||
`POP_COUNT(perf_stores_per_cycle, perf_stores_per_mask);
|
||||
`POP_COUNT(perf_branches_per_cycle, perf_branches_per_mask);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_branches;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_loads <= 0;
|
||||
perf_stores <= 0;
|
||||
perf_branches <= 0;
|
||||
end else begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle);
|
||||
perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_decode_if.loads = perf_loads;
|
||||
assign perf_decode_if.stores = perf_stores;
|
||||
assign perf_decode_if.branches = perf_branches;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
dpi_trace("%d: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.PC);
|
||||
trace_ex_type(decode_if.ex_type);
|
||||
dpi_trace(", op=");
|
||||
trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b (#%0d)\n",
|
||||
decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,416 +1,425 @@
|
|||
`ifndef VX_DEFINE
|
||||
`define VX_DEFINE
|
||||
|
||||
`include "VX_platform.vh"
|
||||
`include "VX_config.vh"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NW_BITS `LOG2UP(`NUM_WARPS)
|
||||
|
||||
`define NT_BITS `LOG2UP(`NUM_THREADS)
|
||||
|
||||
`define NC_BITS `LOG2UP(`NUM_CORES)
|
||||
|
||||
`define NB_BITS `LOG2UP(`NUM_BARRIERS)
|
||||
|
||||
`define NUM_IREGS 32
|
||||
|
||||
`define NRI_BITS `LOG2UP(`NUM_IREGS)
|
||||
|
||||
`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS (2 * `NUM_IREGS)
|
||||
`else
|
||||
`define NUM_REGS `NUM_IREGS
|
||||
`endif
|
||||
|
||||
`define NR_BITS `LOG2UP(`NUM_REGS)
|
||||
|
||||
`define CSR_ADDR_BITS 12
|
||||
|
||||
`define CSR_WIDTH 12
|
||||
|
||||
`define PERF_CTR_BITS 44
|
||||
|
||||
`define UUID_BITS 44
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define EX_NOP 3'h0
|
||||
`define EX_ALU 3'h1
|
||||
`define EX_LSU 3'h2
|
||||
`define EX_CSR 3'h3
|
||||
`define EX_FPU 3'h4
|
||||
`define EX_GPU 3'h5
|
||||
`define EX_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_LUI 7'b0110111
|
||||
`define INST_AUIPC 7'b0010111
|
||||
`define INST_JAL 7'b1101111
|
||||
`define INST_JALR 7'b1100111
|
||||
`define INST_B 7'b1100011 // branch instructions
|
||||
`define INST_L 7'b0000011 // load instructions
|
||||
`define INST_S 7'b0100011 // store instructions
|
||||
`define INST_I 7'b0010011 // immediate instructions
|
||||
`define INST_R 7'b0110011 // register instructions
|
||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||
`define INST_SYS 7'b1110011 // system instructions
|
||||
|
||||
`define INST_FL 7'b0000111 // float load instruction
|
||||
`define INST_FS 7'b0100111 // float store instruction
|
||||
`define INST_FMADD 7'b1000011
|
||||
`define INST_FMSUB 7'b1000111
|
||||
`define INST_FNMSUB 7'b1001011
|
||||
`define INST_FNMADD 7'b1001111
|
||||
`define INST_FCI 7'b1010011 // float common instructions
|
||||
|
||||
`define INST_GPGPU 7'b1101011
|
||||
`define INST_GPU 7'b1011011
|
||||
|
||||
`define INST_TEX 7'b0101011
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_FRM_RNE 3'b000 // round to nearest even
|
||||
`define INST_FRM_RTZ 3'b001 // round to zero
|
||||
`define INST_FRM_RDN 3'b010 // round to -inf
|
||||
`define INST_FRM_RUP 3'b011 // round to +inf
|
||||
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
|
||||
`define INST_FRM_DYN 3'b111 // dynamic mode
|
||||
`define INST_FRM_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_OP_BITS 4
|
||||
`define INST_MOD_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_ALU_ADD 4'b0000
|
||||
`define INST_ALU_LUI 4'b0010
|
||||
`define INST_ALU_AUIPC 4'b0011
|
||||
`define INST_ALU_SLTU 4'b0100
|
||||
`define INST_ALU_SLT 4'b0101
|
||||
`define INST_ALU_SRL 4'b1000
|
||||
`define INST_ALU_SRA 4'b1001
|
||||
`define INST_ALU_SUB 4'b1011
|
||||
`define INST_ALU_AND 4'b1100
|
||||
`define INST_ALU_OR 4'b1101
|
||||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
`define INST_ALU_OTHER 4'b0111
|
||||
`define INST_ALU_BITS 4
|
||||
`define INST_ALU_OP(x) x[`INST_ALU_BITS-1:0]
|
||||
`define INST_ALU_OP_CLASS(x) x[3:2]
|
||||
`define INST_ALU_SIGNED(x) x[0]
|
||||
`define INST_ALU_IS_BR(x) x[0]
|
||||
`define INST_ALU_IS_MUL(x) x[1]
|
||||
|
||||
`define INST_BR_EQ 4'b0000
|
||||
`define INST_BR_NE 4'b0010
|
||||
`define INST_BR_LTU 4'b0100
|
||||
`define INST_BR_GEU 4'b0110
|
||||
`define INST_BR_LT 4'b0101
|
||||
`define INST_BR_GE 4'b0111
|
||||
`define INST_BR_JAL 4'b1000
|
||||
`define INST_BR_JALR 4'b1001
|
||||
`define INST_BR_ECALL 4'b1010
|
||||
`define INST_BR_EBREAK 4'b1011
|
||||
`define INST_BR_URET 4'b1100
|
||||
`define INST_BR_SRET 4'b1101
|
||||
`define INST_BR_MRET 4'b1110
|
||||
`define INST_BR_OTHER 4'b1111
|
||||
`define INST_BR_BITS 4
|
||||
`define INST_BR_NEG(x) x[1]
|
||||
`define INST_BR_LESS(x) x[2]
|
||||
`define INST_BR_STATIC(x) x[3]
|
||||
|
||||
`define INST_MUL_MUL 3'h0
|
||||
`define INST_MUL_MULH 3'h1
|
||||
`define INST_MUL_MULHSU 3'h2
|
||||
`define INST_MUL_MULHU 3'h3
|
||||
`define INST_MUL_DIV 3'h4
|
||||
`define INST_MUL_DIVU 3'h5
|
||||
`define INST_MUL_REM 3'h6
|
||||
`define INST_MUL_REMU 3'h7
|
||||
`define INST_MUL_BITS 3
|
||||
`define INST_MUL_IS_DIV(x) x[2]
|
||||
|
||||
`define INST_FMT_B 3'b000
|
||||
`define INST_FMT_H 3'b001
|
||||
`define INST_FMT_W 3'b010
|
||||
`define INST_FMT_BU 3'b100
|
||||
`define INST_FMT_HU 3'b101
|
||||
|
||||
`define INST_LSU_LB 4'b0000
|
||||
`define INST_LSU_LH 4'b0001
|
||||
`define INST_LSU_LW 4'b0010
|
||||
`define INST_LSU_LBU 4'b0100
|
||||
`define INST_LSU_LHU 4'b0101
|
||||
`define INST_LSU_SB 4'b1000
|
||||
`define INST_LSU_SH 4'b1001
|
||||
`define INST_LSU_SW 4'b1010
|
||||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(x) x[2:0]
|
||||
`define INST_LSU_WSIZE(x) x[1:0]
|
||||
`define INST_LSU_IS_MEM(x) (3'h0 == x)
|
||||
`define INST_LSU_IS_FENCE(x) (3'h1 == x)
|
||||
`define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
|
||||
|
||||
`define INST_FENCE_BITS 1
|
||||
`define INST_FENCE_D 1'h0
|
||||
`define INST_FENCE_I 1'h1
|
||||
|
||||
`define INST_CSR_RW 2'h1
|
||||
`define INST_CSR_RS 2'h2
|
||||
`define INST_CSR_RC 2'h3
|
||||
`define INST_CSR_OTHER 2'h0
|
||||
`define INST_CSR_BITS 2
|
||||
|
||||
`define INST_FPU_ADD 4'h0
|
||||
`define INST_FPU_SUB 4'h4
|
||||
`define INST_FPU_MUL 4'h8
|
||||
`define INST_FPU_DIV 4'hC
|
||||
`define INST_FPU_CVTWS 4'h1 // FCVT.W.S
|
||||
`define INST_FPU_CVTWUS 4'h5 // FCVT.WU.S
|
||||
`define INST_FPU_CVTSW 4'h9 // FCVT.S.W
|
||||
`define INST_FPU_CVTSWU 4'hD // FCVT.S.WU
|
||||
`define INST_FPU_SQRT 4'h2
|
||||
`define INST_FPU_CLASS 4'h6
|
||||
`define INST_FPU_CMP 4'hA
|
||||
`define INST_FPU_MISC 4'hE // SGNJ, SGNJN, SGNJX, FMIN, FMAX, MVXW, MVWX
|
||||
`define INST_FPU_MADD 4'h3
|
||||
`define INST_FPU_MSUB 4'h7
|
||||
`define INST_FPU_NMSUB 4'hB
|
||||
`define INST_FPU_NMADD 4'hF
|
||||
`define INST_FPU_BITS 4
|
||||
|
||||
`define INST_GPU_TMC 4'h0
|
||||
`define INST_GPU_WSPAWN 4'h1
|
||||
`define INST_GPU_SPLIT 4'h2
|
||||
`define INST_GPU_JOIN 4'h3
|
||||
`define INST_GPU_BAR 4'h4
|
||||
`define INST_GPU_PRED 4'h5
|
||||
`define INST_GPU_TEX 4'h6
|
||||
`define INST_GPU_BITS 4
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef EXT_M_ENABLE
|
||||
`define ISA_EXT_M (1 << 12)
|
||||
`else
|
||||
`define ISA_EXT_M 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define ISA_EXT_F (1 << 5)
|
||||
`else
|
||||
`define ISA_EXT_F 0
|
||||
`endif
|
||||
|
||||
`define ISA_CODE (0 << 0) // A - Atomic Instructions extension \
|
||||
| (0 << 1) // B - Tentatively reserved for Bit operations extension \
|
||||
| (0 << 2) // C - Compressed extension \
|
||||
| (0 << 3) // D - Double precsision floating-point extension \
|
||||
| (0 << 4) // E - RV32E base ISA \
|
||||
|`ISA_EXT_F // F - Single precsision floating-point extension \
|
||||
| (0 << 6) // G - Additional standard extensions present \
|
||||
| (0 << 7) // H - Hypervisor mode implemented \
|
||||
| (1 << 8) // I - RV32I/64I/128I base ISA \
|
||||
| (0 << 9) // J - Reserved \
|
||||
| (0 << 10) // K - Reserved \
|
||||
| (0 << 11) // L - Tentatively reserved for Bit operations extension \
|
||||
|`ISA_EXT_M // M - Integer Multiply/Divide extension \
|
||||
| (0 << 13) // N - User level interrupts supported \
|
||||
| (0 << 14) // O - Reserved \
|
||||
| (0 << 15) // P - Tentatively reserved for Packed-SIMD extension \
|
||||
| (0 << 16) // Q - Quad-precision floating-point extension \
|
||||
| (0 << 17) // R - Reserved \
|
||||
| (0 << 18) // S - Supervisor mode implemented \
|
||||
| (0 << 19) // T - Tentatively reserved for Transactional Memory extension \
|
||||
| (1 << 20) // U - User mode implemented \
|
||||
| (0 << 21) // V - Tentatively reserved for Vector extension \
|
||||
| (0 << 22) // W - Reserved \
|
||||
| (1 << 23) // X - Non-standard extensions present \
|
||||
| (0 << 24) // Y - Reserved \
|
||||
| (0 << 25) // Z - Reserved
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BIT 1
|
||||
|
||||
// texture tag bits
|
||||
`define TEX_TAG_BIT 1
|
||||
|
||||
// cache address type bits
|
||||
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BIT + `SM_ENABLE)
|
||||
|
||||
////////////////////////// Icache Configurable Knobs //////////////////////////
|
||||
|
||||
// Cache ID
|
||||
`define ICACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 0)
|
||||
|
||||
// Word size in bytes
|
||||
`define ICACHE_WORD_SIZE 4
|
||||
|
||||
// Block size in bytes
|
||||
`define ICACHE_LINE_SIZE `L1_BLOCK_SIZE
|
||||
|
||||
// TAG sharing enable
|
||||
`define ICACHE_CORE_TAG_ID_BITS `NW_BITS
|
||||
|
||||
// Core request tag bits
|
||||
`define ICACHE_CORE_TAG_WIDTH (`UUID_BITS + `ICACHE_CORE_TAG_ID_BITS)
|
||||
|
||||
// Memory request data bits
|
||||
`define ICACHE_MEM_DATA_WIDTH (`ICACHE_LINE_SIZE * 8)
|
||||
|
||||
// Memory request address bits
|
||||
`define ICACHE_MEM_ADDR_WIDTH (32 - `CLOG2(`ICACHE_LINE_SIZE))
|
||||
|
||||
// Memory request tag bits
|
||||
`define ICACHE_MEM_TAG_WIDTH `CLOG2(`ICACHE_MSHR_SIZE)
|
||||
|
||||
////////////////////////// Dcache Configurable Knobs //////////////////////////
|
||||
|
||||
// Cache ID
|
||||
`define DCACHE_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 1)
|
||||
|
||||
// Word size in bytes
|
||||
`define DCACHE_WORD_SIZE 4
|
||||
|
||||
// Block size in bytes
|
||||
`define DCACHE_LINE_SIZE `L1_BLOCK_SIZE
|
||||
|
||||
// Core request tag bits
|
||||
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`define LSU_TAG_ID_BITS `MAX(`LSUQ_ADDR_BITS, 2)
|
||||
`define LSU_TEX_DCACHE_TAG_BITS (`UUID_BITS + `LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS + `TEX_TAG_BIT)
|
||||
`else
|
||||
`define LSU_TAG_ID_BITS `LSUQ_ADDR_BITS
|
||||
`define DCACHE_CORE_TAG_ID_BITS (`LSU_TAG_ID_BITS + `CACHE_ADDR_TYPE_BITS)
|
||||
`endif
|
||||
`define DCACHE_CORE_TAG_WIDTH (`UUID_BITS + `DCACHE_CORE_TAG_ID_BITS)
|
||||
|
||||
// Memory request data bits
|
||||
`define DCACHE_MEM_DATA_WIDTH (`DCACHE_LINE_SIZE * 8)
|
||||
|
||||
// Memory request address bits
|
||||
`define DCACHE_MEM_ADDR_WIDTH (32 - `CLOG2(`DCACHE_LINE_SIZE))
|
||||
|
||||
// Memory byte enable bits
|
||||
`define DCACHE_MEM_BYTEEN_WIDTH `DCACHE_LINE_SIZE
|
||||
|
||||
// Input request size
|
||||
`define DCACHE_NUM_REQS `NUM_THREADS
|
||||
|
||||
// Memory request tag bits
|
||||
`define _DMEM_ADDR_RATIO_W $clog2(`DCACHE_LINE_SIZE / `DCACHE_WORD_SIZE)
|
||||
`define _DNC_MEM_TAG_WIDTH ($clog2(`DCACHE_NUM_REQS) + `_DMEM_ADDR_RATIO_W + `DCACHE_CORE_TAG_WIDTH)
|
||||
`define DCACHE_MEM_TAG_WIDTH `MAX((`CLOG2(`DCACHE_NUM_BANKS) + `CLOG2(`DCACHE_MSHR_SIZE) + `NC_TAG_BIT), `_DNC_MEM_TAG_WIDTH)
|
||||
|
||||
// Merged D-cache/I-cache memory tag
|
||||
`define L1_MEM_TAG_WIDTH (`MAX(`ICACHE_MEM_TAG_WIDTH, `DCACHE_MEM_TAG_WIDTH) + `CLOG2(2))
|
||||
|
||||
////////////////////////// SM Configurable Knobs //////////////////////////////
|
||||
|
||||
// Cache ID
|
||||
`define SMEM_ID (32'(`L3_ENABLE) + 32'(`L2_ENABLE) * `NUM_CLUSTERS + CORE_ID * 3 + 2)
|
||||
|
||||
// Word size in bytes
|
||||
`define SMEM_WORD_SIZE 4
|
||||
|
||||
// bank address offset
|
||||
`define SMEM_BANK_ADDR_OFFSET `CLOG2(`STACK_SIZE / `SMEM_WORD_SIZE)
|
||||
|
||||
// Input request size
|
||||
`define SMEM_NUM_REQS `NUM_THREADS
|
||||
|
||||
////////////////////////// L2cache Configurable Knobs /////////////////////////
|
||||
|
||||
// Cache ID
|
||||
`define L2_CACHE_ID (32'(`L3_ENABLE) + CLUSTER_ID)
|
||||
|
||||
// Word size in bytes
|
||||
`define L2_WORD_SIZE `DCACHE_LINE_SIZE
|
||||
|
||||
// Block size in bytes
|
||||
`define L2_CACHE_LINE_SIZE ((`L2_ENABLE) ? `MEM_BLOCK_SIZE : `L2_WORD_SIZE)
|
||||
|
||||
// Input request tag bits
|
||||
`define L2_CORE_TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH + `CLOG2(`NUM_CORES))
|
||||
|
||||
// Memory request data bits
|
||||
`define L2_MEM_DATA_WIDTH (`L2_CACHE_LINE_SIZE * 8)
|
||||
|
||||
// Memory request address bits
|
||||
`define L2_MEM_ADDR_WIDTH (32 - `CLOG2(`L2_CACHE_LINE_SIZE))
|
||||
|
||||
// Memory byte enable bits
|
||||
`define L2_MEM_BYTEEN_WIDTH `L2_CACHE_LINE_SIZE
|
||||
|
||||
// Input request size
|
||||
`define L2_NUM_REQS `NUM_CORES
|
||||
|
||||
// Memory request tag bits
|
||||
`define _L2_MEM_ADDR_RATIO_W $clog2(`L2_CACHE_LINE_SIZE / `L2_WORD_SIZE)
|
||||
`define _L2_NC_MEM_TAG_WIDTH ($clog2(`L2_NUM_REQS) + `_L2_MEM_ADDR_RATIO_W + `L1_MEM_TAG_WIDTH)
|
||||
`define _L2_MEM_TAG_WIDTH `MAX((`CLOG2(`L2_NUM_BANKS) + `CLOG2(`L2_MSHR_SIZE) + `NC_TAG_BIT), `_L2_NC_MEM_TAG_WIDTH)
|
||||
`define L2_MEM_TAG_WIDTH ((`L2_ENABLE) ? `_L2_MEM_TAG_WIDTH : (`L1_MEM_TAG_WIDTH + `CLOG2(`L2_NUM_REQS)))
|
||||
|
||||
////////////////////////// L3cache Configurable Knobs /////////////////////////
|
||||
|
||||
// Cache ID
|
||||
`define L3_CACHE_ID 0
|
||||
|
||||
// Word size in bytes
|
||||
`define L3_WORD_SIZE `L2_CACHE_LINE_SIZE
|
||||
|
||||
// Block size in bytes
|
||||
`define L3_CACHE_LINE_SIZE ((`L3_ENABLE) ? `MEM_BLOCK_SIZE : `L3_WORD_SIZE)
|
||||
|
||||
// Input request tag bits
|
||||
`define L3_CORE_TAG_WIDTH (`L2_CORE_TAG_WIDTH + `CLOG2(`NUM_CLUSTERS))
|
||||
|
||||
// Memory request data bits
|
||||
`define L3_MEM_DATA_WIDTH (`L3_CACHE_LINE_SIZE * 8)
|
||||
|
||||
// Memory request address bits
|
||||
`define L3_MEM_ADDR_WIDTH (32 - `CLOG2(`L3_CACHE_LINE_SIZE))
|
||||
|
||||
// Memory byte enable bits
|
||||
`define L3_MEM_BYTEEN_WIDTH `L3_CACHE_LINE_SIZE
|
||||
|
||||
// Input request size
|
||||
`define L3_NUM_REQS `NUM_CLUSTERS
|
||||
|
||||
// Memory request tag bits
|
||||
`define _L3_MEM_ADDR_RATIO_W $clog2(`L3_CACHE_LINE_SIZE / `L3_WORD_SIZE)
|
||||
`define _L3_NC_MEM_TAG_WIDTH ($clog2(`L3_NUM_REQS) + `_L3_MEM_ADDR_RATIO_W + `L2_MEM_TAG_WIDTH)
|
||||
`define _L3_MEM_TAG_WIDTH `MAX((`CLOG2(`L3_NUM_BANKS) + `CLOG2(`L3_MSHR_SIZE) + `NC_TAG_BIT), `_L3_NC_MEM_TAG_WIDTH)
|
||||
`define L3_MEM_TAG_WIDTH ((`L3_ENABLE) ? `_L3_MEM_TAG_WIDTH : (`L2_MEM_TAG_WIDTH + `CLOG2(`L3_NUM_REQS)))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_MEM_BYTEEN_WIDTH
|
||||
`define VX_MEM_ADDR_WIDTH `L3_MEM_ADDR_WIDTH
|
||||
`define VX_MEM_DATA_WIDTH `L3_MEM_DATA_WIDTH
|
||||
`define VX_MEM_TAG_WIDTH `L3_MEM_TAG_WIDTH
|
||||
`define VX_CORE_TAG_WIDTH `L3_CORE_TAG_WIDTH
|
||||
`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)
|
||||
|
||||
`define TO_FULL_ADDR(x) {x, (32-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`include "VX_fpu_types.vh"
|
||||
`include "VX_gpu_types.vh"
|
||||
|
||||
`endif
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_DEFINE_VH
|
||||
`define VX_DEFINE_VH
|
||||
|
||||
`include "VX_platform.vh"
|
||||
`include "VX_config.vh"
|
||||
`include "VX_types.vh"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NW_BITS `CLOG2(`NUM_WARPS)
|
||||
`define NC_WIDTH `UP(`NC_BITS)
|
||||
|
||||
`define NT_BITS `CLOG2(`NUM_THREADS)
|
||||
`define NW_WIDTH `UP(`NW_BITS)
|
||||
|
||||
`define NC_BITS `CLOG2(`NUM_CORES)
|
||||
`define NT_WIDTH `UP(`NT_BITS)
|
||||
|
||||
`define NB_BITS `CLOG2(`NUM_BARRIERS)
|
||||
`define NB_WIDTH `UP(`NB_BITS)
|
||||
|
||||
`define NUM_IREGS 32
|
||||
|
||||
`define NRI_BITS `CLOG2(`NUM_IREGS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS (2 * `NUM_IREGS)
|
||||
`else
|
||||
`define NUM_REGS `NUM_IREGS
|
||||
`endif
|
||||
|
||||
`define NR_BITS `CLOG2(`NUM_REGS)
|
||||
|
||||
`define PERF_CTR_BITS 44
|
||||
|
||||
`ifndef NDEBUG
|
||||
`define UUID_WIDTH 44
|
||||
`else
|
||||
`define UUID_WIDTH 1
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define EX_ALU 0
|
||||
`define EX_LSU 1
|
||||
`define EX_SFU 2
|
||||
`define EX_FPU 3
|
||||
|
||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_LUI 7'b0110111
|
||||
`define INST_AUIPC 7'b0010111
|
||||
`define INST_JAL 7'b1101111
|
||||
`define INST_JALR 7'b1100111
|
||||
`define INST_B 7'b1100011 // branch instructions
|
||||
`define INST_L 7'b0000011 // load instructions
|
||||
`define INST_S 7'b0100011 // store instructions
|
||||
`define INST_I 7'b0010011 // immediate instructions
|
||||
`define INST_R 7'b0110011 // register instructions
|
||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||
`define INST_SYS 7'b1110011 // system instructions
|
||||
|
||||
// RV64I instruction specific opcodes (for any W instruction)
|
||||
`define INST_I_W 7'b0011011 // W type immediate instructions
|
||||
`define INST_R_W 7'b0111011 // W type register instructions
|
||||
|
||||
`define INST_FL 7'b0000111 // float load instruction
|
||||
`define INST_FS 7'b0100111 // float store instruction
|
||||
`define INST_FMADD 7'b1000011
|
||||
`define INST_FMSUB 7'b1000111
|
||||
`define INST_FNMSUB 7'b1001011
|
||||
`define INST_FNMADD 7'b1001111
|
||||
`define INST_FCI 7'b1010011 // float common instructions
|
||||
|
||||
// Custom extension opcodes
|
||||
`define INST_EXT1 7'b0001011 // 0x0B
|
||||
`define INST_EXT2 7'b0101011 // 0x2B
|
||||
`define INST_EXT3 7'b1011011 // 0x5B
|
||||
`define INST_EXT4 7'b1111011 // 0x7B
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_FRM_RNE 3'b000 // round to nearest even
|
||||
`define INST_FRM_RTZ 3'b001 // round to zero
|
||||
`define INST_FRM_RDN 3'b010 // round to -inf
|
||||
`define INST_FRM_RUP 3'b011 // round to +inf
|
||||
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
|
||||
`define INST_FRM_DYN 3'b111 // dynamic mode
|
||||
`define INST_FRM_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_OP_BITS 4
|
||||
`define INST_MOD_BITS 3
|
||||
`define INST_FMT_BITS 2
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_ALU_ADD 4'b0000
|
||||
`define INST_ALU_LUI 4'b0010
|
||||
`define INST_ALU_AUIPC 4'b0011
|
||||
`define INST_ALU_SLTU 4'b0100
|
||||
`define INST_ALU_SLT 4'b0101
|
||||
`define INST_ALU_SUB 4'b0111
|
||||
`define INST_ALU_SRL 4'b1000
|
||||
`define INST_ALU_SRA 4'b1001
|
||||
`define INST_ALU_AND 4'b1100
|
||||
`define INST_ALU_OR 4'b1101
|
||||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
`define INST_ALU_OTHER 4'b0111
|
||||
`define INST_ALU_BITS 4
|
||||
`define INST_ALU_CLASS(op) op[3:2]
|
||||
`define INST_ALU_SIGNED(op) op[0]
|
||||
`define INST_ALU_IS_SUB(op) op[1]
|
||||
`define INST_ALU_IS_BR(mod) mod[0]
|
||||
`define INST_ALU_IS_M(mod) mod[1]
|
||||
`define INST_ALU_IS_W(mod) mod[2]
|
||||
|
||||
`define INST_BR_EQ 4'b0000
|
||||
`define INST_BR_NE 4'b0010
|
||||
`define INST_BR_LTU 4'b0100
|
||||
`define INST_BR_GEU 4'b0110
|
||||
`define INST_BR_LT 4'b0101
|
||||
`define INST_BR_GE 4'b0111
|
||||
`define INST_BR_JAL 4'b1000
|
||||
`define INST_BR_JALR 4'b1001
|
||||
`define INST_BR_ECALL 4'b1010
|
||||
`define INST_BR_EBREAK 4'b1011
|
||||
`define INST_BR_URET 4'b1100
|
||||
`define INST_BR_SRET 4'b1101
|
||||
`define INST_BR_MRET 4'b1110
|
||||
`define INST_BR_OTHER 4'b1111
|
||||
`define INST_BR_BITS 4
|
||||
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
|
||||
`define INST_BR_IS_NEG(op) op[1]
|
||||
`define INST_BR_IS_LESS(op) op[2]
|
||||
`define INST_BR_IS_STATIC(op) op[3]
|
||||
|
||||
`define INST_M_MUL 3'b000
|
||||
`define INST_M_MULHU 3'b001
|
||||
`define INST_M_MULH 3'b010
|
||||
`define INST_M_MULHSU 3'b011
|
||||
`define INST_M_DIV 3'b100
|
||||
`define INST_M_DIVU 3'b101
|
||||
`define INST_M_REM 3'b110
|
||||
`define INST_M_REMU 3'b111
|
||||
`define INST_M_BITS 3
|
||||
`define INST_M_SIGNED(op) (~op[0])
|
||||
`define INST_M_IS_MULX(op) (~op[2])
|
||||
`define INST_M_IS_MULH(op) (op[1:0] != 0)
|
||||
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
|
||||
`define INST_M_IS_REM(op) op[1]
|
||||
|
||||
`define INST_FMT_B 3'b000
|
||||
`define INST_FMT_H 3'b001
|
||||
`define INST_FMT_W 3'b010
|
||||
`define INST_FMT_D 3'b011
|
||||
`define INST_FMT_BU 3'b100
|
||||
`define INST_FMT_HU 3'b101
|
||||
`define INST_FMT_WU 3'b110
|
||||
|
||||
`define INST_LSU_LB 4'b0000
|
||||
`define INST_LSU_LH 4'b0001
|
||||
`define INST_LSU_LW 4'b0010
|
||||
`define INST_LSU_LD 4'b0011 // new for RV64I LD
|
||||
`define INST_LSU_LBU 4'b0100
|
||||
`define INST_LSU_LHU 4'b0101
|
||||
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
|
||||
`define INST_LSU_SB 4'b1000
|
||||
`define INST_LSU_SH 4'b1001
|
||||
`define INST_LSU_SW 4'b1010
|
||||
`define INST_LSU_SD 4'b1011 // new for RV64I SD
|
||||
`define INST_LSU_FENCE 4'b1111
|
||||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(op) op[2:0]
|
||||
`define INST_LSU_WSIZE(op) op[1:0]
|
||||
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
|
||||
|
||||
`define INST_FENCE_BITS 1
|
||||
`define INST_FENCE_D 1'h0
|
||||
`define INST_FENCE_I 1'h1
|
||||
|
||||
`define INST_FPU_ADD 4'b0000
|
||||
`define INST_FPU_SUB 4'b0001
|
||||
`define INST_FPU_MUL 4'b0010
|
||||
`define INST_FPU_DIV 4'b0011
|
||||
`define INST_FPU_SQRT 4'b0100
|
||||
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
|
||||
`define INST_FPU_F2F 4'b0110
|
||||
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||
`define INST_FPU_F2I 4'b1000
|
||||
`define INST_FPU_F2U 4'b1001
|
||||
`define INST_FPU_I2F 4'b1010
|
||||
`define INST_FPU_U2F 4'b1011
|
||||
`define INST_FPU_MADD 4'b1100
|
||||
`define INST_FPU_MSUB 4'b1101
|
||||
`define INST_FPU_NMSUB 4'b1110
|
||||
`define INST_FPU_NMADD 4'b1111
|
||||
`define INST_FPU_BITS 4
|
||||
`define INST_FPU_IS_W(mod) (mod[4])
|
||||
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
|
||||
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
|
||||
|
||||
`define INST_SFU_TMC 4'h0
|
||||
`define INST_SFU_WSPAWN 4'h1
|
||||
`define INST_SFU_SPLIT 4'h2
|
||||
`define INST_SFU_JOIN 4'h3
|
||||
`define INST_SFU_BAR 4'h4
|
||||
`define INST_SFU_PRED 4'h5
|
||||
`define INST_SFU_CSRRW 4'h6
|
||||
`define INST_SFU_CSRRS 4'h7
|
||||
`define INST_SFU_CSRRC 4'h8
|
||||
`define INST_SFU_TEX 4'h9
|
||||
`define INST_SFU_RASTER 4'hA
|
||||
`define INST_SFU_ROP 4'hB
|
||||
`define INST_SFU_CMOV 4'hC
|
||||
`define INST_SFU_BITS 4
|
||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BITS 1
|
||||
|
||||
// cache address type bits
|
||||
`ifdef SM_ENABLE
|
||||
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
|
||||
`else
|
||||
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
|
||||
`endif
|
||||
|
||||
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
|
||||
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
|
||||
|
||||
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
||||
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
|
||||
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
|
||||
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
|
||||
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L2_LINE_SIZE `L1_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L3_LINE_SIZE `L2_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
||||
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
|
||||
|
||||
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
|
||||
`define VX_DCR_DATA_WIDTH 32
|
||||
|
||||
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define BUFFER_BUSY(dst, src, enable) \
|
||||
logic __busy; \
|
||||
if (enable) begin \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__busy <= 1'b0; \
|
||||
end else begin \
|
||||
__busy <= src; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
assign __busy = src; \
|
||||
end \
|
||||
assign dst = __busy
|
||||
|
||||
`define POP_COUNT_EX(out, in, model) \
|
||||
VX_popcount #( \
|
||||
.N ($bits(in)), \
|
||||
.MODEL (model) \
|
||||
) __``out ( \
|
||||
.data_in (in), \
|
||||
.data_out (out) \
|
||||
)
|
||||
|
||||
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF(dst, src) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data = src.req_data; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data = dst.rsp_data; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
|
||||
assign dst.req_valid = src.req_valid; \
|
||||
assign dst.req_data.rw = src.req_data.rw; \
|
||||
assign dst.req_data.byteen = src.req_data.byteen; \
|
||||
assign dst.req_data.addr = src.req_data.addr; \
|
||||
assign dst.req_data.data = src.req_data.data; \
|
||||
if (TD != TS) \
|
||||
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
|
||||
else \
|
||||
assign dst.req_data.tag = src.req_data.tag; \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
|
||||
logic [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __``dst; \
|
||||
if (enable) begin \
|
||||
always @(posedge clk) begin \
|
||||
__``dst <= {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
end else begin \
|
||||
assign __``dst = {src.write_valid, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
VX_dcr_bus_if dst(); \
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = __``dst
|
||||
|
||||
`define PERF_REDUCE(dst, src, field, width, count) \
|
||||
wire [count-1:0][width-1:0] __reduce_add_i_``src``field; \
|
||||
wire [width-1:0] __reduce_add_o_``dst``field; \
|
||||
reg [width-1:0] __reduce_add_r_``dst``field; \
|
||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
assign __reduce_add_i_``src``field[__i] = ``src[__i].``field; \
|
||||
end \
|
||||
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_``dst``field ( \
|
||||
__reduce_add_i_``src``field, \
|
||||
__reduce_add_o_``dst``field \
|
||||
); \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
__reduce_add_r_``dst``field <= '0; \
|
||||
end else begin \
|
||||
__reduce_add_r_``dst``field <= __reduce_add_o_``dst``field; \
|
||||
end \
|
||||
end \
|
||||
assign ``dst.``field = __reduce_add_r_``dst``field
|
||||
|
||||
`define PERF_CACHE_ADD(dst, src, count) \
|
||||
`PERF_REDUCE (dst, src, reads, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, writes, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, read_misses, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, write_misses, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, bank_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, mshr_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, mem_stalls, `PERF_CTR_BITS, count); \
|
||||
`PERF_REDUCE (dst, src, crsp_stalls, `PERF_CTR_BITS, count)
|
||||
|
||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||
if (block_size != 1) begin \
|
||||
if (block_size != `NUM_WARPS) begin \
|
||||
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
||||
end else begin \
|
||||
assign dst = `NW_WIDTH'(block_idx); \
|
||||
end \
|
||||
end else begin \
|
||||
assign dst = src; \
|
||||
end
|
||||
|
||||
`define TO_DISPATCH_DATA(data, tid) \
|
||||
{data.uuid, data.wis, data.tmask, data.op_type, data.op_mod, data.wb, data.use_PC, data.use_imm, data.PC, data.imm, data.rd, tid, data.rs1_data, data.rs2_data, data.rs3_data}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`endif // VX_DEFINE_VH
|
||||
|
|
|
@ -1,159 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_dispatch (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// inputs
|
||||
VX_ibuffer_if.slave ibuffer_if,
|
||||
VX_gpr_rsp_if.slave gpr_rsp_if,
|
||||
|
||||
// outputs
|
||||
VX_alu_req_if.master alu_req_if,
|
||||
VX_lsu_req_if.master lsu_req_if,
|
||||
VX_csr_req_if.master csr_req_if,
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_req_if.master fpu_req_if,
|
||||
`endif
|
||||
VX_gpu_req_if.master gpu_req_if
|
||||
);
|
||||
wire [`NT_BITS-1:0] tid;
|
||||
wire alu_req_ready;
|
||||
wire lsu_req_ready;
|
||||
wire csr_req_ready;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire fpu_req_ready;
|
||||
`endif
|
||||
wire gpu_req_ready;
|
||||
|
||||
VX_lzc #(
|
||||
.N (`NUM_THREADS)
|
||||
) tid_select (
|
||||
.in_i (ibuffer_if.tmask),
|
||||
.cnt_o (tid),
|
||||
`UNUSED_PIN (valid_o)
|
||||
);
|
||||
|
||||
wire [31:0] next_PC = ibuffer_if.PC + 4;
|
||||
|
||||
// ALU unit
|
||||
|
||||
wire alu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU);
|
||||
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) alu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (alu_req_valid),
|
||||
.ready_in (alu_req_ready),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
|
||||
.data_out ({alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
|
||||
.valid_out (alu_req_if.valid),
|
||||
.ready_out (alu_req_if.ready)
|
||||
);
|
||||
|
||||
// lsu unit
|
||||
|
||||
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
|
||||
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type);
|
||||
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
|
||||
wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
|
||||
.OUT_REG (1)
|
||||
) lsu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (lsu_req_valid),
|
||||
.ready_in (lsu_req_ready),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}),
|
||||
.data_out ({lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}),
|
||||
.valid_out (lsu_req_if.valid),
|
||||
.ready_out (lsu_req_if.ready)
|
||||
);
|
||||
|
||||
// csr unit
|
||||
|
||||
wire csr_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR);
|
||||
wire [`INST_CSR_BITS-1:0] csr_op_type = `INST_CSR_BITS'(ibuffer_if.op_type);
|
||||
wire [`CSR_ADDR_BITS-1:0] csr_addr = ibuffer_if.imm[`CSR_ADDR_BITS-1:0];
|
||||
wire [`NRI_BITS-1:0] csr_imm = ibuffer_if.imm[`CSR_ADDR_BITS +: `NRI_BITS];
|
||||
wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid];
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
|
||||
.OUT_REG (1)
|
||||
) csr_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (csr_req_valid),
|
||||
.ready_in (csr_req_ready),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}),
|
||||
.data_out ({csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}),
|
||||
.valid_out (csr_req_if.valid),
|
||||
.ready_out (csr_req_if.ready)
|
||||
);
|
||||
|
||||
// fpu unit
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire fpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU);
|
||||
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) fpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (fpu_req_valid),
|
||||
.ready_in (fpu_req_ready),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
|
||||
.valid_out (fpu_req_if.valid),
|
||||
.ready_out (fpu_req_if.ready)
|
||||
);
|
||||
`else
|
||||
`UNUSED_VAR (gpr_rsp_if.rs3_data)
|
||||
`endif
|
||||
|
||||
// gpu unit
|
||||
|
||||
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
|
||||
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) gpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (gpu_req_valid),
|
||||
.ready_in (gpu_req_ready),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({gpu_req_if.uuid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
|
||||
.valid_out (gpu_req_if.valid),
|
||||
.ready_out (gpu_req_if.ready)
|
||||
);
|
||||
|
||||
// can take next request?
|
||||
reg ready_r;
|
||||
always @(*) begin
|
||||
case (ibuffer_if.ex_type)
|
||||
`EX_ALU: ready_r = alu_req_ready;
|
||||
`EX_LSU: ready_r = lsu_req_ready;
|
||||
`EX_CSR: ready_r = csr_req_ready;
|
||||
`ifdef EXT_F_ENABLE
|
||||
`EX_FPU: ready_r = fpu_req_ready;
|
||||
`endif
|
||||
`EX_GPU: ready_r = gpu_req_ready;
|
||||
default: ready_r = 1'b1; // ignore NOPs
|
||||
endcase
|
||||
end
|
||||
assign ibuffer_if.ready = ready_r;
|
||||
|
||||
endmodule
|
|
@ -1,237 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_execute #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_execute
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Dcache interface
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
|
||||
// commit interface
|
||||
VX_cmt_to_csr_if.slave cmt_to_csr_if,
|
||||
|
||||
// fetch interface
|
||||
VX_fetch_to_csr_if.slave fetch_to_csr_if,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
VX_perf_pipeline_if.slave perf_pipeline_if,
|
||||
`endif
|
||||
|
||||
// inputs
|
||||
VX_alu_req_if.slave alu_req_if,
|
||||
VX_lsu_req_if.slave lsu_req_if,
|
||||
VX_csr_req_if.slave csr_req_if,
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_req_if.slave fpu_req_if,
|
||||
`endif
|
||||
VX_gpu_req_if.slave gpu_req_if,
|
||||
|
||||
// outputs
|
||||
VX_branch_ctl_if.master branch_ctl_if,
|
||||
VX_warp_ctl_if.master warp_ctl_if,
|
||||
VX_commit_if.master alu_commit_if,
|
||||
VX_commit_if.master ld_commit_if,
|
||||
VX_commit_if.master st_commit_if,
|
||||
VX_commit_if.master csr_commit_if,
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_commit_if.master fpu_commit_if,
|
||||
`endif
|
||||
VX_commit_if.master gpu_commit_if,
|
||||
|
||||
input wire busy
|
||||
);
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_rsp_if();
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_rsp_if();
|
||||
|
||||
VX_tex_csr_if tex_csr_if();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if perf_tex_if();
|
||||
`endif
|
||||
|
||||
VX_cache_arb #(
|
||||
.NUM_REQS (2),
|
||||
.LANES (`NUM_THREADS),
|
||||
.DATA_SIZE (4),
|
||||
.TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS),
|
||||
.TAG_SEL_IDX (`NC_TAG_BIT + `SM_ENABLE)
|
||||
) tex_lsu_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Tex/LSU request
|
||||
.req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}),
|
||||
.req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}),
|
||||
.req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
|
||||
.req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}),
|
||||
.req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}),
|
||||
.req_tag_in ({tex_dcache_req_if.tag, lsu_dcache_req_if.tag}),
|
||||
.req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}),
|
||||
|
||||
// Dcache request
|
||||
.req_valid_out (dcache_req_if.valid),
|
||||
.req_rw_out (dcache_req_if.rw),
|
||||
.req_byteen_out (dcache_req_if.byteen),
|
||||
.req_addr_out (dcache_req_if.addr),
|
||||
.req_data_out (dcache_req_if.data),
|
||||
.req_tag_out (dcache_req_if.tag),
|
||||
.req_ready_out (dcache_req_if.ready),
|
||||
|
||||
// Dcache response
|
||||
.rsp_valid_in (dcache_rsp_if.valid),
|
||||
.rsp_tmask_in (dcache_rsp_if.tmask),
|
||||
.rsp_tag_in (dcache_rsp_if.tag),
|
||||
.rsp_data_in (dcache_rsp_if.data),
|
||||
.rsp_ready_in (dcache_rsp_if.ready),
|
||||
|
||||
// Tex/LSU response
|
||||
.rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
|
||||
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
|
||||
.rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}),
|
||||
.rsp_tag_out ({tex_dcache_rsp_if.tag, lsu_dcache_rsp_if.tag}),
|
||||
.rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [`NUM_WARPS-1:0] csr_pending;
|
||||
wire [`NUM_WARPS-1:0] fpu_pending;
|
||||
VX_fpu_to_csr_if fpu_to_csr_if();
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (alu_reset);
|
||||
`RESET_RELAY (lsu_reset);
|
||||
`RESET_RELAY (csr_reset);
|
||||
`RESET_RELAY (gpu_reset);
|
||||
|
||||
VX_alu_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) alu_unit (
|
||||
.clk (clk),
|
||||
.reset (alu_reset),
|
||||
.alu_req_if (alu_req_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.alu_commit_if (alu_commit_if)
|
||||
);
|
||||
|
||||
VX_lsu_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) lsu_unit (
|
||||
`SCOPE_BIND_VX_execute_lsu_unit
|
||||
.clk (clk),
|
||||
.reset (lsu_reset),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.dcache_req_if (lsu_dcache_req_if),
|
||||
.dcache_rsp_if (lsu_dcache_rsp_if),
|
||||
`else
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if),
|
||||
`endif
|
||||
.lsu_req_if (lsu_req_if),
|
||||
.ld_commit_if (ld_commit_if),
|
||||
.st_commit_if (st_commit_if)
|
||||
);
|
||||
|
||||
VX_csr_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) csr_unit (
|
||||
.clk (clk),
|
||||
.reset (csr_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
.csr_req_if (csr_req_if),
|
||||
.csr_commit_if (csr_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
.fpu_pending (fpu_pending),
|
||||
.pending (csr_pending),
|
||||
`else
|
||||
`UNUSED_PIN (pending),
|
||||
`endif
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`RESET_RELAY (fpu_reset);
|
||||
|
||||
VX_fpu_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) fpu_unit (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
.fpu_req_if (fpu_req_if),
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
.csr_pending (csr_pending),
|
||||
.pending (fpu_pending)
|
||||
);
|
||||
`endif
|
||||
|
||||
VX_gpu_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) gpu_unit (
|
||||
`SCOPE_BIND_VX_execute_gpu_unit
|
||||
.clk (clk),
|
||||
.reset (gpu_reset),
|
||||
.gpu_req_if (gpu_req_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.dcache_req_if (tex_dcache_req_if),
|
||||
.dcache_rsp_if (tex_dcache_rsp_if),
|
||||
`endif
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.gpu_commit_if (gpu_commit_if)
|
||||
);
|
||||
|
||||
// special workaround to get RISC-V tests Pass/Fail status
|
||||
wire ebreak /* verilator public */;
|
||||
assign ebreak = alu_req_if.valid && alu_req_if.ready
|
||||
&& `INST_ALU_IS_BR(alu_req_if.op_mod)
|
||||
&& (`INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_EBREAK
|
||||
|| `INST_BR_BITS'(alu_req_if.op_type) == `INST_BR_ECALL);
|
||||
|
||||
endmodule
|
|
@ -1,68 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_fetch #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_fetch
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Icache interface
|
||||
VX_icache_req_if.master icache_req_if,
|
||||
VX_icache_rsp_if.slave icache_rsp_if,
|
||||
|
||||
// inputs
|
||||
VX_wstall_if.slave wstall_if,
|
||||
VX_join_if.slave join_if,
|
||||
VX_branch_ctl_if.slave branch_ctl_if,
|
||||
VX_warp_ctl_if.slave warp_ctl_if,
|
||||
|
||||
// outputs
|
||||
VX_ifetch_rsp_if.master ifetch_rsp_if,
|
||||
|
||||
// csr interface
|
||||
VX_fetch_to_csr_if.master fetch_to_csr_if,
|
||||
|
||||
// busy status
|
||||
output wire busy
|
||||
);
|
||||
|
||||
VX_ifetch_req_if ifetch_req_if();
|
||||
|
||||
VX_warp_sched #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) warp_sched (
|
||||
`SCOPE_BIND_VX_fetch_warp_sched
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.wstall_if (wstall_if),
|
||||
.join_if (join_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
|
||||
.ifetch_req_if (ifetch_req_if),
|
||||
|
||||
.fetch_to_csr_if (fetch_to_csr_if),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
VX_icache_stage #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) icache_stage (
|
||||
`SCOPE_BIND_VX_fetch_icache_stage
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.icache_rsp_if (icache_rsp_if),
|
||||
.icache_req_if (icache_req_if),
|
||||
|
||||
.ifetch_req_if (ifetch_req_if),
|
||||
.ifetch_rsp_if (ifetch_rsp_if)
|
||||
);
|
||||
|
||||
endmodule
|
|
@ -1,219 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_fpu_unit #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_fpu_req_if.slave fpu_req_if,
|
||||
VX_fpu_to_csr_if.master fpu_to_csr_if,
|
||||
VX_commit_if.master fpu_commit_if,
|
||||
|
||||
input wire[`NUM_WARPS-1:0] csr_pending,
|
||||
output wire[`NUM_WARPS-1:0] pending
|
||||
);
|
||||
import fpu_types::*;
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
localparam FPUQ_BITS = `LOG2UP(`FPUQ_SIZE);
|
||||
|
||||
wire ready_in;
|
||||
wire valid_out;
|
||||
wire ready_out;
|
||||
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
|
||||
wire has_fflags;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags;
|
||||
wire [`NUM_THREADS-1:0][31:0] result;
|
||||
|
||||
wire [FPUQ_BITS-1:0] tag_in, tag_out;
|
||||
wire fpuq_full;
|
||||
|
||||
wire fpuq_push = fpu_req_if.valid && fpu_req_if.ready;
|
||||
wire fpuq_pop = valid_out && ready_out;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
|
||||
.SIZE (`FPUQ_SIZE)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.acquire_slot (fpuq_push),
|
||||
.write_addr (tag_in),
|
||||
.read_addr (tag_out),
|
||||
.release_addr (tag_out),
|
||||
.write_data ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}),
|
||||
.read_data ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}),
|
||||
.release_slot (fpuq_pop),
|
||||
.full (fpuq_full),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
|
||||
// can accept new request?
|
||||
assign fpu_req_if.ready = ready_in && ~fpuq_full && !csr_pending[fpu_req_if.wid];
|
||||
|
||||
wire valid_in = fpu_req_if.valid && ~fpuq_full && !csr_pending[fpu_req_if.wid];
|
||||
|
||||
// resolve dynamic FRM from CSR
|
||||
assign fpu_to_csr_if.read_wid = fpu_req_if.wid;
|
||||
wire [`INST_FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `INST_FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;
|
||||
|
||||
`ifdef FPU_DPI
|
||||
|
||||
VX_fpu_dpi #(
|
||||
.TAGW (FPUQ_BITS)
|
||||
) fpu_dpi (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.valid_in (valid_in),
|
||||
.ready_in (ready_in),
|
||||
|
||||
.tag_in (tag_in),
|
||||
|
||||
.op_type (fpu_req_if.op_type),
|
||||
.frm (fpu_frm),
|
||||
|
||||
.dataa (fpu_req_if.rs1_data),
|
||||
.datab (fpu_req_if.rs2_data),
|
||||
.datac (fpu_req_if.rs3_data),
|
||||
.result (result),
|
||||
|
||||
.has_fflags (has_fflags),
|
||||
.fflags (fflags),
|
||||
|
||||
.tag_out (tag_out),
|
||||
|
||||
.ready_out (ready_out),
|
||||
.valid_out (valid_out)
|
||||
);
|
||||
|
||||
`elsif FPU_FPNEW
|
||||
|
||||
VX_fpu_fpnew #(
|
||||
.FMULADD (1),
|
||||
.FDIVSQRT (1),
|
||||
.FNONCOMP (1),
|
||||
.FCONV (1),
|
||||
.TAGW (FPUQ_BITS)
|
||||
) fpu_fpnew (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.valid_in (valid_in),
|
||||
.ready_in (ready_in),
|
||||
|
||||
.tag_in (tag_in),
|
||||
|
||||
.op_type (fpu_req_if.op_type),
|
||||
.frm (fpu_frm),
|
||||
|
||||
.dataa (fpu_req_if.rs1_data),
|
||||
.datab (fpu_req_if.rs2_data),
|
||||
.datac (fpu_req_if.rs3_data),
|
||||
.result (result),
|
||||
|
||||
.has_fflags (has_fflags),
|
||||
.fflags (fflags),
|
||||
|
||||
.tag_out (tag_out),
|
||||
|
||||
.ready_out (ready_out),
|
||||
.valid_out (valid_out)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
VX_fpu_fpga #(
|
||||
.TAGW (FPUQ_BITS)
|
||||
) fpu_fpga (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.valid_in (valid_in),
|
||||
.ready_in (ready_in),
|
||||
|
||||
.tag_in (tag_in),
|
||||
|
||||
.op_type (fpu_req_if.op_type),
|
||||
.frm (fpu_frm),
|
||||
|
||||
.dataa (fpu_req_if.rs1_data),
|
||||
.datab (fpu_req_if.rs2_data),
|
||||
.datac (fpu_req_if.rs3_data),
|
||||
.result (result),
|
||||
|
||||
.has_fflags (has_fflags),
|
||||
.fflags (fflags),
|
||||
|
||||
.tag_out (tag_out),
|
||||
|
||||
.ready_out (ready_out),
|
||||
.valid_out (valid_out)
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
reg has_fflags_r;
|
||||
fflags_t fflags_r;
|
||||
|
||||
fflags_t rsp_fflags;
|
||||
always @(*) begin
|
||||
rsp_fflags = '0;
|
||||
for (integer i = 0; i < `NUM_THREADS; i++) begin
|
||||
if (rsp_tmask[i]) begin
|
||||
rsp_fflags.NX |= fflags[i].NX;
|
||||
rsp_fflags.UF |= fflags[i].UF;
|
||||
rsp_fflags.OF |= fflags[i].OF;
|
||||
rsp_fflags.DZ |= fflags[i].DZ;
|
||||
rsp_fflags.NV |= fflags[i].NV;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({valid_out, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}),
|
||||
.data_out ({fpu_commit_if.valid, fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r})
|
||||
);
|
||||
|
||||
assign fpu_commit_if.eop = 1'b1;
|
||||
|
||||
assign ready_out = ~stall_out;
|
||||
|
||||
// CSR fflags Update
|
||||
assign fpu_to_csr_if.write_enable = fpu_commit_if.valid && fpu_commit_if.ready && has_fflags_r;
|
||||
assign fpu_to_csr_if.write_wid = fpu_commit_if.wid;
|
||||
assign fpu_to_csr_if.write_fflags = fflags_r;
|
||||
|
||||
// pending request
|
||||
reg [`NUM_WARPS-1:0] pending_r;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
pending_r <= 0;
|
||||
end else begin
|
||||
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
|
||||
pending_r[fpu_commit_if.wid] <= 0;
|
||||
end
|
||||
if (fpu_req_if.valid && fpu_req_if.ready) begin
|
||||
pending_r[fpu_req_if.wid] <= 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
assign pending = pending_r;
|
||||
|
||||
endmodule
|
|
@ -1,91 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_gpr_stage #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// inputs
|
||||
VX_writeback_if.slave writeback_if,
|
||||
VX_gpr_req_if.slave gpr_req_if,
|
||||
|
||||
// outputs
|
||||
VX_gpr_rsp_if.master gpr_rsp_if
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
localparam RAM_SIZE = `NUM_WARPS * `NUM_REGS;
|
||||
|
||||
// ensure r0 never gets written, which can happen before the reset
|
||||
wire write_enable = writeback_if.valid && (writeback_if.rd != 0);
|
||||
|
||||
wire [`NUM_THREADS-1:0] wren;
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign wren[i] = write_enable && writeback_if.tmask[i];
|
||||
end
|
||||
|
||||
wire [$clog2(RAM_SIZE)-1:0] waddr, raddr1, raddr2;
|
||||
assign waddr = {writeback_if.wid, writeback_if.rd};
|
||||
assign raddr1 = {gpr_req_if.wid, gpr_req_if.rs1};
|
||||
assign raddr2 = {gpr_req_if.wid, gpr_req_if.rs2};
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
VX_dp_ram #(
|
||||
.DATAW (32),
|
||||
.SIZE (RAM_SIZE),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0)
|
||||
) dp_ram1 (
|
||||
.clk (clk),
|
||||
.wren (wren[i]),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data[i]),
|
||||
.raddr (raddr1),
|
||||
.rdata (gpr_rsp_if.rs1_data[i])
|
||||
);
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32),
|
||||
.SIZE (RAM_SIZE),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0)
|
||||
) dp_ram2 (
|
||||
.clk (clk),
|
||||
.wren (wren[i]),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data[i]),
|
||||
.raddr (raddr2),
|
||||
.rdata (gpr_rsp_if.rs2_data[i])
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [$clog2(RAM_SIZE)-1:0] raddr3;
|
||||
assign raddr3 = {gpr_req_if.wid, gpr_req_if.rs3};
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
VX_dp_ram #(
|
||||
.DATAW (32),
|
||||
.SIZE (RAM_SIZE),
|
||||
.INIT_ENABLE (1),
|
||||
.INIT_VALUE (0)
|
||||
) dp_ram3 (
|
||||
.clk (clk),
|
||||
.wren (wren[i]),
|
||||
.waddr (waddr),
|
||||
.wdata (writeback_if.data[i]),
|
||||
.raddr (raddr3),
|
||||
.rdata (gpr_rsp_if.rs3_data[i])
|
||||
);
|
||||
end
|
||||
`else
|
||||
`UNUSED_VAR (gpr_req_if.rs3)
|
||||
assign gpr_rsp_if.rs3_data = 'x;
|
||||
`endif
|
||||
|
||||
assign writeback_if.ready = 1'b1;
|
||||
|
||||
endmodule
|
218
hw/rtl/VX_gpu_pkg.sv
Normal file
218
hw/rtl/VX_gpu_pkg.sv
Normal file
|
@ -0,0 +1,218 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_GPU_PKG_VH
|
||||
`define VX_GPU_PKG_VH
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
package VX_gpu_pkg;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
} tmc_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [`NUM_WARPS-1:0] wmask;
|
||||
logic [`XLEN-1:0] pc;
|
||||
} wspawn_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic is_dvg;
|
||||
logic [`NUM_THREADS-1:0] then_tmask;
|
||||
logic [`NUM_THREADS-1:0] else_tmask;
|
||||
logic [`XLEN-1:0] next_pc;
|
||||
} split_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic is_dvg;
|
||||
} join_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [`NB_WIDTH-1:0] id;
|
||||
logic is_global;
|
||||
`ifdef GBAR_ENABLE
|
||||
logic [`MAX(`NW_WIDTH, `NC_WIDTH)-1:0] size_m1;
|
||||
`else
|
||||
logic [`NW_WIDTH-1:0] size_m1;
|
||||
`endif
|
||||
} barrier_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`XLEN-1:0] startup_addr;
|
||||
logic [7:0] mpm_class;
|
||||
} base_dcrs_t;
|
||||
|
||||
/* verilator lint_off UNUSED */
|
||||
|
||||
////////////////////////// Icache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam ICACHE_WORD_SIZE = 4;
|
||||
localparam ICACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(ICACHE_WORD_SIZE));
|
||||
|
||||
// Block size in bytes
|
||||
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Core request tag Id bits
|
||||
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
|
||||
|
||||
// Core request tag bits
|
||||
localparam ICACHE_TAG_WIDTH = (`UUID_WIDTH + ICACHE_TAG_ID_BITS);
|
||||
localparam ICACHE_ARB_TAG_WIDTH = (ICACHE_TAG_WIDTH + `CLOG2(`SOCKET_SIZE));
|
||||
|
||||
// Memory request data bits
|
||||
localparam ICACHE_MEM_DATA_WIDTH = (ICACHE_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam DCACHE_WORD_SIZE = (`XLEN / 8);
|
||||
localparam DCACHE_ADDR_WIDTH = (`MEM_ADDR_WIDTH - `CLOG2(DCACHE_WORD_SIZE));
|
||||
|
||||
// Block size in bytes
|
||||
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam DCACHE_NUM_REQS = `MAX(`DCACHE_NUM_BANKS, `SMEM_NUM_BANKS);
|
||||
|
||||
// Memory request size
|
||||
localparam LSU_MEM_REQS = `NUM_LSU_LANES;
|
||||
|
||||
// Batch select bits
|
||||
localparam DCACHE_NUM_BATCHES = ((LSU_MEM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
|
||||
localparam DCACHE_BATCH_SEL_BITS = `CLOG2(DCACHE_NUM_BATCHES);
|
||||
|
||||
// Core request tag Id bits
|
||||
localparam LSUQ_TAG_BITS = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS);
|
||||
localparam DCACHE_TAG_ID_BITS = (LSUQ_TAG_BITS + `CACHE_ADDR_TYPE_BITS);
|
||||
|
||||
// Core request tag bits
|
||||
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
|
||||
localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
|
||||
localparam DCACHE_ARB_TAG_WIDTH = (DCACHE_NOSM_TAG_WIDTH + `CLOG2(`SOCKET_SIZE));
|
||||
|
||||
// Memory request data bits
|
||||
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef DCACHE_ENABLE
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES);
|
||||
`else
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_ARB_TAG_WIDTH, `NUM_SOCKETS, `NUM_DCACHES);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||
|
||||
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
|
||||
|
||||
localparam NUM_L1_OUTPUTS = 2;
|
||||
|
||||
/////////////////////////////// L2 Parameters /////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam L2_NUM_REQS = NUM_L1_OUTPUTS;
|
||||
|
||||
// Core request tag bits
|
||||
localparam L2_TAG_WIDTH = L1_MEM_TAG_WIDTH;
|
||||
|
||||
// Memory request data bits
|
||||
localparam L2_MEM_DATA_WIDTH = (`L2_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef L2_ENABLE
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
`else
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L3 Parameters /////////////////////////////
|
||||
|
||||
// Word size in bytes
|
||||
localparam L3_WORD_SIZE = `L2_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam L3_NUM_REQS = `NUM_CLUSTERS;
|
||||
|
||||
// Core request tag bits
|
||||
localparam L3_TAG_WIDTH = L2_MEM_TAG_WIDTH;
|
||||
|
||||
// Memory request data bits
|
||||
localparam L3_MEM_DATA_WIDTH = (`L3_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef L3_ENABLE
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
`else
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
/////////////////////////////// Issue parameters //////////////////////////
|
||||
|
||||
localparam ISSUE_IDX_W = `LOG2UP(`ISSUE_WIDTH);
|
||||
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
|
||||
localparam ISSUE_WIS_W = `LOG2UP(ISSUE_RATIO);
|
||||
localparam ISSUE_ADDRW = `LOG2UP(`NUM_REGS * (ISSUE_RATIO));
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
function logic [ISSUE_IDX_W-1:0] wid_to_isw(
|
||||
input logic [`NW_WIDTH-1:0] wid
|
||||
);
|
||||
if (`ISSUE_WIDTH > 1) begin
|
||||
wid_to_isw = ISSUE_IDX_W'(wid);
|
||||
end else begin
|
||||
wid_to_isw = 0;
|
||||
end
|
||||
endfunction
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
function logic [`NW_WIDTH-1:0] wis_to_wid(
|
||||
input logic [ISSUE_WIS_W-1:0] wis,
|
||||
input logic [ISSUE_IDX_W-1:0] isw
|
||||
);
|
||||
wis_to_wid = `NW_WIDTH'({wis, isw} >> (ISSUE_IDX_W-`CLOG2(`ISSUE_WIDTH)));
|
||||
endfunction
|
||||
|
||||
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
|
||||
input logic [`NW_WIDTH-1:0] wid
|
||||
);
|
||||
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
|
||||
endfunction
|
||||
|
||||
function logic [ISSUE_ADDRW-1:0] wis_to_addr(
|
||||
input logic [`NR_BITS-1:0] rid,
|
||||
input logic [ISSUE_WIS_W-1:0] wis
|
||||
);
|
||||
wis_to_addr = ISSUE_ADDRW'({rid, wis} >> (ISSUE_WIS_W-`CLOG2(ISSUE_RATIO)));
|
||||
endfunction
|
||||
|
||||
endpackage
|
||||
|
||||
`endif // VX_GPU_PKG_VH
|
|
@ -1,43 +0,0 @@
|
|||
`ifndef VX_GPU_TYPES
|
||||
`define VX_GPU_TYPES
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
package gpu_types;
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
} gpu_tmc_t;
|
||||
|
||||
`define GPU_TMC_BITS $bits(gpu_types::gpu_tmc_t)
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [`NUM_WARPS-1:0] wmask;
|
||||
logic [31:0] pc;
|
||||
} gpu_wspawn_t;
|
||||
|
||||
`define GPU_WSPAWN_BITS $bits(gpu_types::gpu_wspawn_t)
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic diverged;
|
||||
logic [`NUM_THREADS-1:0] then_tmask;
|
||||
logic [`NUM_THREADS-1:0] else_tmask;
|
||||
logic [31:0] pc;
|
||||
} gpu_split_t;
|
||||
|
||||
`define GPU_SPLIT_BITS $bits(gpu_types::gpu_split_t)
|
||||
|
||||
typedef struct packed {
|
||||
logic valid;
|
||||
logic [`NB_BITS-1:0] id;
|
||||
logic [`NW_BITS-1:0] size_m1;
|
||||
} gpu_barrier_t;
|
||||
|
||||
`define GPU_BARRIER_BITS $bits(gpu_types::gpu_barrier_t)
|
||||
|
||||
endpackage
|
||||
|
||||
`endif
|
|
@ -1,220 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_gpu_unit #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_gpu_unit
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Inputs
|
||||
VX_gpu_req_if.slave gpu_req_if,
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if.master perf_tex_if,
|
||||
`endif
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
VX_tex_csr_if.slave tex_csr_if,
|
||||
`endif
|
||||
|
||||
// Outputs
|
||||
VX_warp_ctl_if.master warp_ctl_if,
|
||||
VX_commit_if.master gpu_commit_if
|
||||
);
|
||||
import gpu_types::*;
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
|
||||
localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW);
|
||||
|
||||
wire rsp_valid;
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
|
||||
wire [RSP_DATAW-1:0] rsp_data, rsp_data_r;
|
||||
|
||||
gpu_tmc_t tmc;
|
||||
gpu_wspawn_t wspawn;
|
||||
gpu_barrier_t barrier;
|
||||
gpu_split_t split;
|
||||
|
||||
wire [WCTL_DATAW-1:0] warp_ctl_data;
|
||||
wire is_warp_ctl;
|
||||
|
||||
wire stall_in, stall_out;
|
||||
|
||||
wire is_wspawn = (gpu_req_if.op_type == `INST_GPU_WSPAWN);
|
||||
wire is_tmc = (gpu_req_if.op_type == `INST_GPU_TMC);
|
||||
wire is_split = (gpu_req_if.op_type == `INST_GPU_SPLIT);
|
||||
wire is_bar = (gpu_req_if.op_type == `INST_GPU_BAR);
|
||||
wire is_pred = (gpu_req_if.op_type == `INST_GPU_PRED);
|
||||
|
||||
wire [31:0] rs1_data = gpu_req_if.rs1_data[gpu_req_if.tid];
|
||||
wire [31:0] rs2_data = gpu_req_if.rs2_data[gpu_req_if.tid];
|
||||
|
||||
wire [`NUM_THREADS-1:0] taken_tmask;
|
||||
wire [`NUM_THREADS-1:0] not_taken_tmask;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire taken = (gpu_req_if.rs1_data[i] != 0);
|
||||
assign taken_tmask[i] = gpu_req_if.tmask[i] & taken;
|
||||
assign not_taken_tmask[i] = gpu_req_if.tmask[i] & ~taken;
|
||||
end
|
||||
|
||||
// tmc
|
||||
|
||||
wire [`NUM_THREADS-1:0] pred_mask = (taken_tmask != 0) ? taken_tmask : gpu_req_if.tmask;
|
||||
|
||||
assign tmc.valid = is_tmc || is_pred;
|
||||
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
|
||||
|
||||
// wspawn
|
||||
|
||||
wire [31:0] wspawn_pc = rs2_data;
|
||||
wire [`NUM_WARPS-1:0] wspawn_wmask;
|
||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||
assign wspawn_wmask[i] = (i < rs1_data);
|
||||
end
|
||||
assign wspawn.valid = is_wspawn;
|
||||
assign wspawn.wmask = wspawn_wmask;
|
||||
assign wspawn.pc = wspawn_pc;
|
||||
|
||||
// split
|
||||
|
||||
assign split.valid = is_split;
|
||||
assign split.diverged = (| taken_tmask) && (| not_taken_tmask);
|
||||
assign split.then_tmask = taken_tmask;
|
||||
assign split.else_tmask = not_taken_tmask;
|
||||
assign split.pc = gpu_req_if.next_PC;
|
||||
|
||||
// barrier
|
||||
|
||||
assign barrier.valid = is_bar;
|
||||
assign barrier.id = rs1_data[`NB_BITS-1:0];
|
||||
assign barrier.size_m1 = (`NW_BITS)'(rs2_data - 1);
|
||||
|
||||
// pack warp ctl result
|
||||
assign warp_ctl_data = {tmc, wspawn, split, barrier};
|
||||
|
||||
// texture
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
|
||||
`UNUSED_VAR (gpu_req_if.op_mod)
|
||||
|
||||
VX_tex_req_if tex_req_if();
|
||||
VX_tex_rsp_if tex_rsp_if();
|
||||
|
||||
wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX);
|
||||
|
||||
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
|
||||
assign tex_req_if.uuid = gpu_req_if.uuid;
|
||||
assign tex_req_if.wid = gpu_req_if.wid;
|
||||
assign tex_req_if.tmask = gpu_req_if.tmask;
|
||||
assign tex_req_if.PC = gpu_req_if.PC;
|
||||
assign tex_req_if.rd = gpu_req_if.rd;
|
||||
assign tex_req_if.wb = gpu_req_if.wb;
|
||||
|
||||
assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0];
|
||||
assign tex_req_if.coords[0] = gpu_req_if.rs1_data;
|
||||
assign tex_req_if.coords[1] = gpu_req_if.rs2_data;
|
||||
assign tex_req_if.lod = gpu_req_if.rs3_data;
|
||||
|
||||
VX_tex_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) tex_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.tex_req_if (tex_req_if),
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.tex_rsp_if (tex_rsp_if),
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if)
|
||||
);
|
||||
|
||||
assign tex_rsp_if.ready = !stall_out;
|
||||
|
||||
assign stall_in = (is_tex && ~tex_req_if.ready)
|
||||
|| (~is_tex && (tex_rsp_if.valid || stall_out));
|
||||
|
||||
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
|
||||
|
||||
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
|
||||
assign rsp_uuid = tex_rsp_if.valid ? tex_rsp_if.uuid : gpu_req_if.uuid;
|
||||
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
|
||||
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
|
||||
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
|
||||
assign rsp_rd = tex_rsp_if.rd;
|
||||
assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb;
|
||||
assign rsp_data = tex_rsp_if.valid ? RSP_DATAW'(tex_rsp_if.data) : RSP_DATAW'(warp_ctl_data);
|
||||
|
||||
`else
|
||||
|
||||
`UNUSED_VAR (gpu_req_if.op_mod)
|
||||
`UNUSED_VAR (gpu_req_if.rs3_data)
|
||||
`UNUSED_VAR (gpu_req_if.wb)
|
||||
`UNUSED_VAR (gpu_req_if.rd)
|
||||
|
||||
assign stall_in = stall_out;
|
||||
assign is_warp_ctl = 1;
|
||||
|
||||
assign rsp_valid = gpu_req_if.valid;
|
||||
assign rsp_uuid = gpu_req_if.uuid;
|
||||
assign rsp_wid = gpu_req_if.wid;
|
||||
assign rsp_tmask = gpu_req_if.tmask;
|
||||
assign rsp_PC = gpu_req_if.PC;
|
||||
assign rsp_rd = 0;
|
||||
assign rsp_wb = 0;
|
||||
assign rsp_data = RSP_DATAW'(warp_ctl_data);
|
||||
|
||||
`endif
|
||||
|
||||
wire is_warp_ctl_r;
|
||||
|
||||
// output
|
||||
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
|
||||
);
|
||||
|
||||
assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0];
|
||||
assign gpu_commit_if.eop = 1'b1;
|
||||
|
||||
// warp control reponse
|
||||
|
||||
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier} = rsp_data_r[WCTL_DATAW-1:0];
|
||||
|
||||
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready && is_warp_ctl_r;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
|
||||
// can accept new request?
|
||||
assign gpu_req_if.ready = ~stall_in;
|
||||
|
||||
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_uuid, gpu_commit_if.uuid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_barrier, warp_ctl_if.barrier.valid);
|
||||
|
||||
endmodule
|
|
@ -1,210 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_ibuffer #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// inputs
|
||||
VX_decode_if.slave decode_if,
|
||||
|
||||
// outputs
|
||||
VX_ibuffer_if.master ibuffer_if
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam DATAW = `UUID_BITS + `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
|
||||
localparam ADDRW = $clog2(`IBUF_SIZE+1);
|
||||
localparam NWARPSW = $clog2(`NUM_WARPS+1);
|
||||
|
||||
reg [`NUM_WARPS-1:0][ADDRW-1:0] used_r;
|
||||
reg [`NUM_WARPS-1:0] full_r, empty_r, alm_empty_r;
|
||||
|
||||
wire [`NUM_WARPS-1:0] q_full, q_empty, q_alm_empty;
|
||||
wire [DATAW-1:0] q_data_in;
|
||||
wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev;
|
||||
reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out;
|
||||
|
||||
wire enq_fire = decode_if.valid && decode_if.ready;
|
||||
wire deq_fire = ibuffer_if.valid && ibuffer_if.ready;
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
|
||||
wire writing = enq_fire && (i == decode_if.wid);
|
||||
wire reading = deq_fire && (i == ibuffer_if.wid);
|
||||
|
||||
wire going_empty = empty_r[i] || (alm_empty_r[i] && reading);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`IBUF_SIZE),
|
||||
.OUT_REG (1)
|
||||
) queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (writing && !going_empty),
|
||||
.data_in (q_data_in),
|
||||
.ready_out(reading),
|
||||
.data_out (q_data_prev[i]),
|
||||
`UNUSED_PIN (ready_in),
|
||||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
used_r[i] <= 0;
|
||||
full_r[i] <= 0;
|
||||
empty_r[i] <= 1;
|
||||
alm_empty_r[i] <= 1;
|
||||
end else begin
|
||||
if (writing) begin
|
||||
if (!reading) begin
|
||||
empty_r[i] <= 0;
|
||||
if (used_r[i] == 1)
|
||||
alm_empty_r[i] <= 0;
|
||||
if (used_r[i] == ADDRW'(`IBUF_SIZE))
|
||||
full_r[i] <= 1;
|
||||
end
|
||||
end else if (reading) begin
|
||||
full_r[i] <= 0;
|
||||
if (used_r[i] == ADDRW'(1))
|
||||
empty_r[i] <= 1;
|
||||
if (used_r[i] == ADDRW'(2))
|
||||
alm_empty_r[i] <= 1;
|
||||
end
|
||||
used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading)));
|
||||
end
|
||||
|
||||
if (writing && going_empty) begin
|
||||
q_data_out[i] <= q_data_in;
|
||||
end else if (reading) begin
|
||||
q_data_out[i] <= q_data_prev[i];
|
||||
end
|
||||
end
|
||||
|
||||
assign q_full[i] = full_r[i];
|
||||
assign q_empty[i] = empty_r[i];
|
||||
assign q_alm_empty[i] = alm_empty_r[i];
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
reg [`NUM_WARPS-1:0] valid_table, valid_table_n;
|
||||
reg [`NW_BITS-1:0] deq_wid, deq_wid_n;
|
||||
reg [`NW_BITS-1:0] deq_wid_rr, deq_wid_rr_n;
|
||||
reg deq_valid, deq_valid_n;
|
||||
reg [DATAW-1:0] deq_instr, deq_instr_n;
|
||||
reg [NWARPSW-1:0] num_warps;
|
||||
|
||||
`UNUSED_VAR (deq_instr)
|
||||
|
||||
// calculate valid table
|
||||
always @(*) begin
|
||||
valid_table_n = valid_table;
|
||||
if (deq_fire) begin
|
||||
valid_table_n[deq_wid] = !q_alm_empty[deq_wid];
|
||||
end
|
||||
if (enq_fire) begin
|
||||
valid_table_n[decode_if.wid] = 1;
|
||||
end
|
||||
end
|
||||
|
||||
// round-robin warp scheduling
|
||||
VX_rr_arbiter #(
|
||||
.NUM_REQS (`NUM_WARPS)
|
||||
) rr_arbiter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.requests (valid_table_n),
|
||||
.grant_index (deq_wid_rr_n),
|
||||
`UNUSED_PIN (grant_valid),
|
||||
`UNUSED_PIN (grant_onehot),
|
||||
`UNUSED_PIN (enable)
|
||||
);
|
||||
|
||||
// schedule the next instruction to issue
|
||||
always @(*) begin
|
||||
if (num_warps > 1) begin
|
||||
deq_valid_n = 1;
|
||||
deq_wid_n = deq_wid_rr;
|
||||
deq_instr_n = q_data_out[deq_wid_rr];
|
||||
end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin
|
||||
deq_valid_n = 1;
|
||||
deq_wid_n = deq_wid;
|
||||
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
|
||||
end else begin
|
||||
deq_valid_n = enq_fire;
|
||||
deq_wid_n = decode_if.wid;
|
||||
deq_instr_n = q_data_in;
|
||||
end
|
||||
end
|
||||
|
||||
wire warp_added = enq_fire && q_empty[decode_if.wid];
|
||||
wire warp_removed = deq_fire && ~(enq_fire && decode_if.wid == deq_wid) && q_alm_empty[deq_wid];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
valid_table <= 0;
|
||||
deq_valid <= 0;
|
||||
num_warps <= 0;
|
||||
end else begin
|
||||
valid_table <= valid_table_n;
|
||||
deq_valid <= deq_valid_n;
|
||||
|
||||
|
||||
if (warp_added && !warp_removed) begin
|
||||
num_warps <= num_warps + NWARPSW'(1);
|
||||
end else if (warp_removed && !warp_added) begin
|
||||
num_warps <= num_warps - NWARPSW'(1);
|
||||
end
|
||||
end
|
||||
|
||||
deq_wid <= deq_wid_n;
|
||||
deq_wid_rr <= deq_wid_rr_n;
|
||||
deq_instr <= deq_instr_n;
|
||||
end
|
||||
|
||||
assign decode_if.ready = ~q_full[decode_if.wid];
|
||||
|
||||
assign q_data_in = {decode_if.uuid,
|
||||
decode_if.tmask,
|
||||
decode_if.PC,
|
||||
decode_if.ex_type,
|
||||
decode_if.op_type,
|
||||
decode_if.op_mod,
|
||||
decode_if.wb,
|
||||
decode_if.use_PC,
|
||||
decode_if.use_imm,
|
||||
decode_if.imm,
|
||||
decode_if.rd,
|
||||
decode_if.rs1,
|
||||
decode_if.rs2,
|
||||
decode_if.rs3};
|
||||
|
||||
assign ibuffer_if.valid = deq_valid;
|
||||
assign ibuffer_if.wid = deq_wid;
|
||||
assign {ibuffer_if.uuid,
|
||||
ibuffer_if.tmask,
|
||||
ibuffer_if.PC,
|
||||
ibuffer_if.ex_type,
|
||||
ibuffer_if.op_type,
|
||||
ibuffer_if.op_mod,
|
||||
ibuffer_if.wb,
|
||||
ibuffer_if.use_PC,
|
||||
ibuffer_if.use_imm,
|
||||
ibuffer_if.imm,
|
||||
ibuffer_if.rd,
|
||||
ibuffer_if.rs1,
|
||||
ibuffer_if.rs2,
|
||||
ibuffer_if.rs3} = deq_instr;
|
||||
|
||||
// scoreboard forwarding
|
||||
assign ibuffer_if.wid_n = deq_wid_n;
|
||||
assign ibuffer_if.rd_n = deq_instr_n[3*`NR_BITS +: `NR_BITS];
|
||||
assign ibuffer_if.rs1_n = deq_instr_n[2*`NR_BITS +: `NR_BITS];
|
||||
assign ibuffer_if.rs2_n = deq_instr_n[1*`NR_BITS +: `NR_BITS];
|
||||
assign ibuffer_if.rs3_n = deq_instr_n[0*`NR_BITS +: `NR_BITS];
|
||||
|
||||
endmodule
|
|
@ -1,102 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_icache_stage #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_icache_stage
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Icache interface
|
||||
VX_icache_req_if.master icache_req_if,
|
||||
VX_icache_rsp_if.slave icache_rsp_if,
|
||||
|
||||
// request
|
||||
VX_ifetch_req_if.slave ifetch_req_if,
|
||||
|
||||
// reponse
|
||||
VX_ifetch_rsp_if.master ifetch_rsp_if
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
localparam OUT_REG = 0;
|
||||
|
||||
wire [`NW_BITS-1:0] req_tag, rsp_tag;
|
||||
|
||||
wire icache_req_fire = icache_req_if.valid && icache_req_if.ready;
|
||||
|
||||
assign req_tag = ifetch_req_if.wid;
|
||||
assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0];
|
||||
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32 + `NUM_THREADS + `UUID_BITS),
|
||||
.SIZE (`NUM_WARPS),
|
||||
.LUTRAM (1)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
.wren (icache_req_fire),
|
||||
.waddr (req_tag),
|
||||
.wdata ({ifetch_req_if.PC, ifetch_req_if.tmask, ifetch_req_if.uuid}),
|
||||
.raddr (rsp_tag),
|
||||
.rdata ({rsp_PC, rsp_tmask, rsp_uuid})
|
||||
);
|
||||
|
||||
`RUNTIME_ASSERT((!ifetch_req_if.valid || ifetch_req_if.PC >= `STARTUP_ADDR),
|
||||
("%t: *** invalid PC=%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid))
|
||||
|
||||
// Icache Request
|
||||
assign icache_req_if.valid = ifetch_req_if.valid;
|
||||
assign icache_req_if.addr = ifetch_req_if.PC[31:2];
|
||||
assign icache_req_if.tag = {ifetch_req_if.uuid, req_tag};
|
||||
|
||||
// Can accept new request?
|
||||
assign ifetch_req_if.ready = icache_req_if.ready;
|
||||
|
||||
wire [`NW_BITS-1:0] rsp_wid = rsp_tag;
|
||||
|
||||
wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + `UUID_BITS),
|
||||
.RESETW (1),
|
||||
.DEPTH (OUT_REG)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data, rsp_uuid}),
|
||||
.data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid})
|
||||
);
|
||||
|
||||
// Can accept new response?
|
||||
assign icache_rsp_if.ready = ~stall_out;
|
||||
|
||||
`SCOPE_ASSIGN (icache_req_fire, icache_req_fire);
|
||||
`SCOPE_ASSIGN (icache_req_uuid, ifetch_req_if.uuid);
|
||||
`SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0});
|
||||
`SCOPE_ASSIGN (icache_req_tag, req_tag);
|
||||
|
||||
`SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready);
|
||||
`SCOPE_ASSIGN (icache_rsp_uuid, rsp_uuid);
|
||||
`SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data);
|
||||
`SCOPE_ASSIGN (icache_rsp_tag, rsp_tag);
|
||||
|
||||
`ifdef DBG_TRACE_CORE_ICACHE
|
||||
always @(posedge clk) begin
|
||||
if (icache_req_fire) begin
|
||||
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, ifetch_req_if.uuid);
|
||||
end
|
||||
if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin
|
||||
dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,68 +0,0 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
module VX_ipdom_stack #(
|
||||
parameter WIDTH = 1,
|
||||
parameter DEPTH = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire pair,
|
||||
input wire [WIDTH - 1:0] q1,
|
||||
input wire [WIDTH - 1:0] q2,
|
||||
output wire [WIDTH - 1:0] d,
|
||||
input wire push,
|
||||
input wire pop,
|
||||
output wire index,
|
||||
output wire empty,
|
||||
output wire full
|
||||
);
|
||||
localparam ADDRW = $clog2(DEPTH);
|
||||
|
||||
reg is_part [DEPTH-1:0];
|
||||
|
||||
reg [ADDRW-1:0] rd_ptr, wr_ptr;
|
||||
|
||||
wire [WIDTH-1:0] d1, d2;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
rd_ptr <= 0;
|
||||
wr_ptr <= 0;
|
||||
end else begin
|
||||
if (push) begin
|
||||
rd_ptr <= wr_ptr;
|
||||
wr_ptr <= wr_ptr + ADDRW'(1);
|
||||
end else if (pop) begin
|
||||
wr_ptr <= wr_ptr - ADDRW'(is_part[rd_ptr]);
|
||||
rd_ptr <= rd_ptr - ADDRW'(is_part[rd_ptr]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (WIDTH * 2),
|
||||
.SIZE (DEPTH),
|
||||
.LUTRAM (1)
|
||||
) store (
|
||||
.clk (clk),
|
||||
.wren (push),
|
||||
.waddr (wr_ptr),
|
||||
.wdata ({q2, q1}),
|
||||
.raddr (rd_ptr),
|
||||
.rdata ({d2, d1})
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (push) begin
|
||||
is_part[wr_ptr] <= ~pair;
|
||||
end else if (pop) begin
|
||||
is_part[rd_ptr] <= 1;
|
||||
end
|
||||
end
|
||||
|
||||
assign index = is_part[rd_ptr];
|
||||
assign d = index ? d1 : d2;
|
||||
assign empty = (ADDRW'(0) == wr_ptr);
|
||||
assign full = (ADDRW'(DEPTH-1) == wr_ptr);
|
||||
|
||||
endmodule
|
|
@ -1,256 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_issue #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_issue
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if.issue perf_issue_if,
|
||||
`endif
|
||||
|
||||
VX_decode_if.slave decode_if,
|
||||
VX_writeback_if.slave writeback_if,
|
||||
|
||||
VX_alu_req_if.master alu_req_if,
|
||||
VX_lsu_req_if.master lsu_req_if,
|
||||
VX_csr_req_if.master csr_req_if,
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_req_if.master fpu_req_if,
|
||||
`endif
|
||||
VX_gpu_req_if.master gpu_req_if
|
||||
);
|
||||
VX_ibuffer_if ibuffer_if();
|
||||
VX_gpr_req_if gpr_req_if();
|
||||
VX_gpr_rsp_if gpr_rsp_if();
|
||||
VX_writeback_if sboard_wb_if();
|
||||
VX_ibuffer_if scoreboard_if();
|
||||
VX_ibuffer_if dispatch_if();
|
||||
|
||||
// GPR request interface
|
||||
assign gpr_req_if.wid = ibuffer_if.wid;
|
||||
assign gpr_req_if.rs1 = ibuffer_if.rs1;
|
||||
assign gpr_req_if.rs2 = ibuffer_if.rs2;
|
||||
assign gpr_req_if.rs3 = ibuffer_if.rs3;
|
||||
|
||||
// scoreboard writeback interface
|
||||
assign sboard_wb_if.valid = writeback_if.valid;
|
||||
assign sboard_wb_if.uuid = writeback_if.uuid;
|
||||
assign sboard_wb_if.wid = writeback_if.wid;
|
||||
assign sboard_wb_if.PC = writeback_if.PC;
|
||||
assign sboard_wb_if.rd = writeback_if.rd;
|
||||
assign sboard_wb_if.eop = writeback_if.eop;
|
||||
|
||||
// scoreboard interface
|
||||
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
|
||||
assign scoreboard_if.uuid = ibuffer_if.uuid;
|
||||
assign scoreboard_if.wid = ibuffer_if.wid;
|
||||
assign scoreboard_if.PC = ibuffer_if.PC;
|
||||
assign scoreboard_if.wb = ibuffer_if.wb;
|
||||
assign scoreboard_if.rd = ibuffer_if.rd;
|
||||
assign scoreboard_if.rd_n = ibuffer_if.rd_n;
|
||||
assign scoreboard_if.rs1_n = ibuffer_if.rs1_n;
|
||||
assign scoreboard_if.rs2_n = ibuffer_if.rs2_n;
|
||||
assign scoreboard_if.rs3_n = ibuffer_if.rs3_n;
|
||||
assign scoreboard_if.wid_n = ibuffer_if.wid_n;
|
||||
|
||||
// dispatch interface
|
||||
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
|
||||
assign dispatch_if.uuid = ibuffer_if.uuid;
|
||||
assign dispatch_if.wid = ibuffer_if.wid;
|
||||
assign dispatch_if.tmask = ibuffer_if.tmask;
|
||||
assign dispatch_if.PC = ibuffer_if.PC;
|
||||
assign dispatch_if.ex_type = ibuffer_if.ex_type;
|
||||
assign dispatch_if.op_type = ibuffer_if.op_type;
|
||||
assign dispatch_if.op_mod = ibuffer_if.op_mod;
|
||||
assign dispatch_if.wb = ibuffer_if.wb;
|
||||
assign dispatch_if.rd = ibuffer_if.rd;
|
||||
assign dispatch_if.rs1 = ibuffer_if.rs1;
|
||||
assign dispatch_if.imm = ibuffer_if.imm;
|
||||
assign dispatch_if.use_PC = ibuffer_if.use_PC;
|
||||
assign dispatch_if.use_imm = ibuffer_if.use_imm;
|
||||
|
||||
// issue the instruction
|
||||
assign ibuffer_if.ready = scoreboard_if.ready && dispatch_if.ready;
|
||||
|
||||
`RESET_RELAY (ibuf_reset);
|
||||
`RESET_RELAY (scoreboard_reset);
|
||||
`RESET_RELAY (gpr_reset);
|
||||
`RESET_RELAY (dispatch_reset);
|
||||
|
||||
VX_ibuffer #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) ibuffer (
|
||||
.clk (clk),
|
||||
.reset (ibuf_reset),
|
||||
.decode_if (decode_if),
|
||||
.ibuffer_if (ibuffer_if)
|
||||
);
|
||||
|
||||
VX_scoreboard #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) scoreboard (
|
||||
.clk (clk),
|
||||
.reset (scoreboard_reset),
|
||||
.writeback_if(sboard_wb_if),
|
||||
.ibuffer_if (scoreboard_if)
|
||||
);
|
||||
|
||||
VX_gpr_stage #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) gpr_stage (
|
||||
.clk (clk),
|
||||
.reset (gpr_reset),
|
||||
.writeback_if (writeback_if),
|
||||
.gpr_req_if (gpr_req_if),
|
||||
.gpr_rsp_if (gpr_rsp_if)
|
||||
);
|
||||
|
||||
VX_dispatch dispatch (
|
||||
.clk (clk),
|
||||
.reset (dispatch_reset),
|
||||
.ibuffer_if (dispatch_if),
|
||||
.gpr_rsp_if (gpr_rsp_if),
|
||||
.alu_req_if (alu_req_if),
|
||||
.lsu_req_if (lsu_req_if),
|
||||
.csr_req_if (csr_req_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_req_if (fpu_req_if),
|
||||
`endif
|
||||
.gpu_req_if (gpu_req_if)
|
||||
);
|
||||
|
||||
`SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready);
|
||||
`SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid);
|
||||
`SCOPE_ASSIGN (issue_tmask, ibuffer_if.tmask);
|
||||
`SCOPE_ASSIGN (issue_ex_type, ibuffer_if.ex_type);
|
||||
`SCOPE_ASSIGN (issue_op_type, ibuffer_if.op_type);
|
||||
`SCOPE_ASSIGN (issue_op_mod, ibuffer_if.op_mod);
|
||||
`SCOPE_ASSIGN (issue_wb, ibuffer_if.wb);
|
||||
`SCOPE_ASSIGN (issue_rd, ibuffer_if.rd);
|
||||
`SCOPE_ASSIGN (issue_rs1, ibuffer_if.rs1);
|
||||
`SCOPE_ASSIGN (issue_rs2, ibuffer_if.rs2);
|
||||
`SCOPE_ASSIGN (issue_rs3, ibuffer_if.rs3);
|
||||
`SCOPE_ASSIGN (issue_imm, ibuffer_if.imm);
|
||||
`SCOPE_ASSIGN (issue_use_pc, ibuffer_if.use_PC);
|
||||
`SCOPE_ASSIGN (issue_use_imm, ibuffer_if.use_imm);
|
||||
`SCOPE_ASSIGN (scoreboard_delay, !scoreboard_if.ready);
|
||||
`SCOPE_ASSIGN (dispatch_delay, !dispatch_if.ready);
|
||||
`SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data);
|
||||
`SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data);
|
||||
`SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data);
|
||||
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
|
||||
`SCOPE_ASSIGN (writeback_uuid, writeback_if.uuid);
|
||||
`SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask);
|
||||
`SCOPE_ASSIGN (writeback_rd, writeback_if.rd);
|
||||
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
|
||||
`SCOPE_ASSIGN (writeback_eop, writeback_if.eop);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_scb_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_alu_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_lsu_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_csr_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_gpu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_fpu_stalls;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_ibf_stalls <= 0;
|
||||
perf_scb_stalls <= 0;
|
||||
perf_alu_stalls <= 0;
|
||||
perf_lsu_stalls <= 0;
|
||||
perf_csr_stalls <= 0;
|
||||
perf_gpu_stalls <= 0;
|
||||
`ifdef EXT_F_ENABLE
|
||||
perf_fpu_stalls <= 0;
|
||||
`endif
|
||||
end else begin
|
||||
if (decode_if.valid & ~decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (scoreboard_if.valid & ~scoreboard_if.ready) begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (dispatch_if.valid & ~dispatch_if.ready) begin
|
||||
case (dispatch_if.ex_type)
|
||||
`EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
`EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
|
||||
`endif
|
||||
`EX_LSU: perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
|
||||
`EX_CSR: perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
|
||||
//`EX_GPU:
|
||||
default: perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
||||
assign perf_issue_if.alu_stalls = perf_alu_stalls;
|
||||
assign perf_issue_if.lsu_stalls = perf_lsu_stalls;
|
||||
assign perf_issue_if.csr_stalls = perf_csr_stalls;
|
||||
assign perf_issue_if.gpu_stalls = perf_gpu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign perf_issue_if.fpu_stalls = perf_fpu_stalls;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (alu_req_if.valid && alu_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rd=%0d, rs1_data=",
|
||||
$time, CORE_ID, alu_req_if.wid, alu_req_if.PC, alu_req_if.tmask, alu_req_if.rd);
|
||||
`TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace(", rs2_data=");
|
||||
`TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", alu_req_if.uuid);
|
||||
end
|
||||
if (lsu_req_if.valid && lsu_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=",
|
||||
$time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, lsu_req_if.tmask, lsu_req_if.rd, lsu_req_if.offset);
|
||||
`TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS);
|
||||
dpi_trace(", data=");
|
||||
`TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", lsu_req_if.uuid);
|
||||
end
|
||||
if (csr_req_if.valid && csr_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=",
|
||||
$time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr);
|
||||
`TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", csr_req_if.uuid);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_req_if.valid && fpu_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rd=%0d, rs1_data=",
|
||||
$time, CORE_ID, fpu_req_if.wid, fpu_req_if.PC, fpu_req_if.tmask, fpu_req_if.rd);
|
||||
`TRACE_ARRAY1D(fpu_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace(", rs2_data=");
|
||||
`TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(", rs3_data=");
|
||||
`TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", fpu_req_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (gpu_req_if.valid && gpu_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rd=%0d, rs1_data=",
|
||||
$time, CORE_ID, gpu_req_if.wid, gpu_req_if.PC, gpu_req_if.tmask, gpu_req_if.rd);
|
||||
`TRACE_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace(", rs2_data=");
|
||||
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(", rs3_data=");
|
||||
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
|
||||
dpi_trace(" (#%0d)\n", gpu_req_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,372 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_lsu_unit #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_lsu_unit
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Dcache interface
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
|
||||
// inputs
|
||||
VX_lsu_req_if.slave lsu_req_if,
|
||||
|
||||
// outputs
|
||||
VX_commit_if.master ld_commit_if,
|
||||
VX_commit_if.master st_commit_if
|
||||
);
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = 32 - MEM_ASHIFT;
|
||||
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
|
||||
|
||||
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
|
||||
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
|
||||
`STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
|
||||
wire req_valid;
|
||||
wire [`UUID_BITS-1:0] req_uuid;
|
||||
wire [`NUM_THREADS-1:0] req_tmask;
|
||||
wire [`NUM_THREADS-1:0][31:0] req_addr;
|
||||
wire [`INST_LSU_BITS-1:0] req_type;
|
||||
wire [`NUM_THREADS-1:0][31:0] req_data;
|
||||
wire [`NR_BITS-1:0] req_rd;
|
||||
wire req_wb;
|
||||
wire [`NW_BITS-1:0] req_wid;
|
||||
wire [31:0] req_pc;
|
||||
wire req_is_dup;
|
||||
wire req_is_prefetch;
|
||||
|
||||
wire mbuf_empty;
|
||||
|
||||
wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type;
|
||||
|
||||
// full address calculation
|
||||
wire [`NUM_THREADS-1:0][31:0] full_addr;
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign full_addr[i] = lsu_req_if.base_addr[i] + lsu_req_if.offset;
|
||||
end
|
||||
|
||||
// detect duplicate addresses
|
||||
wire [`NUM_THREADS-2:0] addr_matches;
|
||||
for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin
|
||||
assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1];
|
||||
end
|
||||
|
||||
wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
// is non-cacheable address
|
||||
wire is_addr_nc = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT));
|
||||
if (`SM_ENABLE) begin
|
||||
// is shared memory address
|
||||
wire is_addr_sm = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT))
|
||||
& (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT));
|
||||
assign lsu_addr_type[i] = {is_addr_nc, is_addr_sm};
|
||||
end else begin
|
||||
assign lsu_addr_type[i] = is_addr_nc;
|
||||
end
|
||||
end
|
||||
|
||||
// fence stalls the pipeline until all pending requests are sent
|
||||
wire fence_wait = lsu_req_if.is_fence && (req_valid || !mbuf_empty);
|
||||
|
||||
wire ready_in;
|
||||
wire stall_in = ~ready_in && req_valid;
|
||||
|
||||
wire lsu_valid = lsu_req_if.valid && ~fence_wait;
|
||||
|
||||
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
) req_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_in),
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
);
|
||||
|
||||
// Can accept new request?
|
||||
assign lsu_req_if.ready = ~stall_in && ~fence_wait;
|
||||
|
||||
wire [`UUID_BITS-1:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [31:0] rsp_pc;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
wire [`INST_LSU_BITS-1:0] rsp_type;
|
||||
wire rsp_is_dup;
|
||||
wire rsp_is_prefetch;
|
||||
|
||||
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
|
||||
wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
||||
reg [`NUM_THREADS-1:0] req_sent_mask;
|
||||
reg is_req_start;
|
||||
|
||||
wire [`LSUQ_ADDR_BITS-1:0] mbuf_waddr, mbuf_raddr;
|
||||
wire mbuf_full;
|
||||
|
||||
`UNUSED_VAR (rsp_type)
|
||||
`UNUSED_VAR (rsp_is_prefetch)
|
||||
|
||||
wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_offset, rsp_offset;
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
assign req_offset[i] = req_addr[i][1:0];
|
||||
end
|
||||
|
||||
wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
|
||||
|
||||
wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
|
||||
|
||||
wire [`NUM_THREADS-1:0] req_tmask_dup = req_tmask & {{(`NUM_THREADS-1){~req_is_dup}}, 1'b1};
|
||||
|
||||
wire mbuf_push = ~mbuf_full
|
||||
&& (| ({`NUM_THREADS{req_valid}} & req_tmask_dup & dcache_req_if.ready))
|
||||
&& is_req_start // first submission only
|
||||
&& req_wb; // loads only
|
||||
|
||||
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
|
||||
|
||||
assign mbuf_raddr = dcache_rsp_if.tag[`CACHE_ADDR_TYPE_BITS +: `LSUQ_ADDR_BITS];
|
||||
`UNUSED_VAR (dcache_rsp_if.tag)
|
||||
|
||||
// do not writeback from software prefetch
|
||||
wire req_wb2 = req_wb && ~req_is_prefetch;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`UUID_BITS + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
|
||||
.SIZE (`LSUQ_SIZE)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.write_addr (mbuf_waddr),
|
||||
.acquire_slot (mbuf_push),
|
||||
.read_addr (mbuf_raddr),
|
||||
.write_data ({req_uuid, req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_uuid, rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.release_addr (mbuf_raddr),
|
||||
.release_slot (mbuf_pop),
|
||||
.full (mbuf_full),
|
||||
.empty (mbuf_empty)
|
||||
);
|
||||
|
||||
wire dcache_req_ready = &(dcache_req_if.ready | req_sent_mask | ~req_tmask_dup);
|
||||
|
||||
wire [`NUM_THREADS-1:0] req_sent_mask_n = req_sent_mask | dcache_req_fire;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
req_sent_mask <= 0;
|
||||
is_req_start <= 1;
|
||||
end else begin
|
||||
if (dcache_req_ready) begin
|
||||
req_sent_mask <= 0;
|
||||
is_req_start <= 1;
|
||||
end else begin
|
||||
req_sent_mask <= req_sent_mask_n;
|
||||
is_req_start <= (0 == req_sent_mask_n);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// need to hold the acquired tag index until the full request is submitted
|
||||
reg [`LSUQ_ADDR_BITS-1:0] req_tag_hold;
|
||||
wire [`LSUQ_ADDR_BITS-1:0] req_tag = is_req_start ? mbuf_waddr : req_tag_hold;
|
||||
always @(posedge clk) begin
|
||||
if (mbuf_push) begin
|
||||
req_tag_hold <= mbuf_waddr;
|
||||
end
|
||||
end
|
||||
|
||||
assign rsp_rem_mask_n = rsp_rem_mask[mbuf_raddr] & ~dcache_rsp_if.tmask;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (mbuf_push) begin
|
||||
rsp_rem_mask[mbuf_waddr] <= req_tmask_dup;
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
rsp_rem_mask[mbuf_raddr] <= rsp_rem_mask_n;
|
||||
end
|
||||
end
|
||||
|
||||
// ensure all dependencies for the requests are resolved
|
||||
wire req_dep_ready = (req_wb && ~(mbuf_full && is_req_start))
|
||||
|| (~req_wb && st_commit_if.ready);
|
||||
|
||||
// DCache Request
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
|
||||
reg [3:0] mem_req_byteen;
|
||||
reg [31:0] mem_req_data;
|
||||
|
||||
always @(*) begin
|
||||
mem_req_byteen = {4{req_wb}};
|
||||
case (`INST_LSU_WSIZE(req_type))
|
||||
0: mem_req_byteen[req_offset[i]] = 1;
|
||||
1: begin
|
||||
mem_req_byteen[req_offset[i]] = 1;
|
||||
mem_req_byteen[{req_offset[i][1], 1'b1}] = 1;
|
||||
end
|
||||
default : mem_req_byteen = {4{1'b1}};
|
||||
endcase
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
mem_req_data = req_data[i];
|
||||
case (req_offset[i])
|
||||
1: mem_req_data[31:8] = req_data[i][23:0];
|
||||
2: mem_req_data[31:16] = req_data[i][15:0];
|
||||
3: mem_req_data[31:24] = req_data[i][7:0];
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
||||
assign dcache_req_if.valid[i] = req_valid && req_dep_ready && req_tmask_dup[i] && !req_sent_mask[i];
|
||||
assign dcache_req_if.rw[i] = ~req_wb;
|
||||
assign dcache_req_if.addr[i] = req_addr[i][31:2];
|
||||
assign dcache_req_if.byteen[i] = mem_req_byteen;
|
||||
assign dcache_req_if.data[i] = mem_req_data;
|
||||
assign dcache_req_if.tag[i] = {req_uuid, `LSU_TAG_ID_BITS'(req_tag), req_addr_type[i]};
|
||||
end
|
||||
|
||||
assign ready_in = req_dep_ready && dcache_req_ready;
|
||||
|
||||
// send store commit
|
||||
|
||||
wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready;
|
||||
|
||||
assign st_commit_if.valid = is_store_rsp;
|
||||
assign st_commit_if.uuid = req_uuid;
|
||||
assign st_commit_if.wid = req_wid;
|
||||
assign st_commit_if.tmask = req_tmask;
|
||||
assign st_commit_if.PC = req_pc;
|
||||
assign st_commit_if.rd = 0;
|
||||
assign st_commit_if.wb = 0;
|
||||
assign st_commit_if.eop = 1'b1;
|
||||
assign st_commit_if.data = 0;
|
||||
|
||||
// load response formatting
|
||||
|
||||
reg [`NUM_THREADS-1:0][31:0] rsp_data;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask_qual;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire [31:0] rsp_data32 = (i == 0 || rsp_is_dup) ? dcache_rsp_if.data[0] : dcache_rsp_if.data[i];
|
||||
wire [15:0] rsp_data16 = rsp_offset[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
|
||||
wire [7:0] rsp_data8 = rsp_offset[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
|
||||
|
||||
always @(*) begin
|
||||
case (`INST_LSU_FMT(rsp_type))
|
||||
`INST_FMT_B: rsp_data[i] = 32'(signed'(rsp_data8));
|
||||
`INST_FMT_H: rsp_data[i] = 32'(signed'(rsp_data16));
|
||||
`INST_FMT_BU: rsp_data[i] = 32'(unsigned'(rsp_data8));
|
||||
`INST_FMT_HU: rsp_data[i] = 32'(unsigned'(rsp_data16));
|
||||
default: rsp_data[i] = rsp_data32;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
assign rsp_tmask_qual = rsp_is_dup ? rsp_tmask : dcache_rsp_if.tmask;
|
||||
|
||||
// send load commit
|
||||
|
||||
wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1),
|
||||
.RESETW (1)
|
||||
) rsp_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!load_rsp_stall),
|
||||
.data_in ({dcache_rsp_if.valid, rsp_uuid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
|
||||
.data_out ({ld_commit_if.valid, ld_commit_if.uuid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
|
||||
);
|
||||
|
||||
// Can accept new cache response?
|
||||
assign dcache_rsp_if.ready = ~load_rsp_stall;
|
||||
|
||||
// scope registration
|
||||
`SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire);
|
||||
`SCOPE_ASSIGN (dcache_req_uuid, req_uuid);
|
||||
`SCOPE_ASSIGN (dcache_req_addr, req_addr);
|
||||
`SCOPE_ASSIGN (dcache_req_rw, ~req_wb);
|
||||
`SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen);
|
||||
`SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data);
|
||||
`SCOPE_ASSIGN (dcache_req_tag, req_tag);
|
||||
`SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_fire}});
|
||||
`SCOPE_ASSIGN (dcache_rsp_uuid, rsp_uuid);
|
||||
`SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data);
|
||||
`SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr);
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + `UUID_BITS + 64 + 1)-1:0] pending_reqs;
|
||||
wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
pending_reqs <= '0;
|
||||
end begin
|
||||
if (mbuf_push) begin
|
||||
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, req_uuid, $time, 1'b1};
|
||||
end
|
||||
if (mbuf_pop) begin
|
||||
pending_reqs[mbuf_raddr] <= '0;
|
||||
end
|
||||
end
|
||||
|
||||
for (integer i = 0; i < `LSUQ_SIZE; ++i) begin
|
||||
if (pending_reqs[i][0]) begin
|
||||
`ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout,
|
||||
("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+`UUID_BITS+`NR_BITS+32 +: `NW_BITS],
|
||||
pending_reqs[i][1+64+`UUID_BITS+`NR_BITS +: 32],
|
||||
pending_reqs[i][1+64+`UUID_BITS +: `NR_BITS],
|
||||
pending_reqs[i][1+64 +: `UUID_BITS]));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CORE_DCACHE
|
||||
wire dcache_req_fire_any = (| dcache_req_fire);
|
||||
always @(posedge clk) begin
|
||||
if (lsu_req_if.valid && fence_wait) begin
|
||||
dpi_trace("%d: *** D$%0d fence wait\n", $time, CORE_ID);
|
||||
end
|
||||
if (dcache_req_fire_any) begin
|
||||
if (dcache_req_if.rw[0]) begin
|
||||
dpi_trace("%d: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
|
||||
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
|
||||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
dpi_trace(", data=");
|
||||
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
|
||||
dpi_trace(", (#%0d)\n", req_uuid);
|
||||
end else begin
|
||||
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
|
||||
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
|
||||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
dpi_trace(", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid);
|
||||
end
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
|
||||
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
|
||||
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
|
||||
dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,146 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_mem_arb #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter DATA_WIDTH = 1,
|
||||
parameter ADDR_WIDTH = 1,
|
||||
parameter TAG_IN_WIDTH = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter BUFFERED_REQ = 0,
|
||||
parameter BUFFERED_RSP = 0,
|
||||
parameter TYPE = "P",
|
||||
|
||||
parameter DATA_SIZE = (DATA_WIDTH / 8),
|
||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||
parameter TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// input requests
|
||||
input wire [NUM_REQS-1:0] req_valid_in,
|
||||
input wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
|
||||
input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] req_addr_in,
|
||||
input wire [NUM_REQS-1:0] req_rw_in,
|
||||
input wire [NUM_REQS-1:0][DATA_SIZE-1:0] req_byteen_in,
|
||||
input wire [NUM_REQS-1:0][DATA_WIDTH-1:0] req_data_in,
|
||||
output wire [NUM_REQS-1:0] req_ready_in,
|
||||
|
||||
// output request
|
||||
output wire req_valid_out,
|
||||
output wire [TAG_OUT_WIDTH-1:0] req_tag_out,
|
||||
output wire [ADDR_WIDTH-1:0] req_addr_out,
|
||||
output wire req_rw_out,
|
||||
output wire [DATA_SIZE-1:0] req_byteen_out,
|
||||
output wire [DATA_WIDTH-1:0] req_data_out,
|
||||
input wire req_ready_out,
|
||||
|
||||
// input response
|
||||
input wire rsp_valid_in,
|
||||
input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in,
|
||||
input wire [DATA_WIDTH-1:0] rsp_data_in,
|
||||
output wire rsp_ready_in,
|
||||
|
||||
// output responses
|
||||
output wire [NUM_REQS-1:0] rsp_valid_out,
|
||||
output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out,
|
||||
output wire [NUM_REQS-1:0][DATA_WIDTH-1:0] rsp_data_out,
|
||||
input wire [NUM_REQS-1:0] rsp_ready_out
|
||||
);
|
||||
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam RSP_DATAW = TAG_IN_WIDTH + DATA_WIDTH;
|
||||
|
||||
if (NUM_REQS > 1) begin
|
||||
|
||||
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in_merged;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
wire [TAG_OUT_WIDTH-1:0] req_tag_in_w;
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (TAG_IN_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_insert (
|
||||
.data_in (req_tag_in[i]),
|
||||
.sel_in (LOG_NUM_REQS'(i)),
|
||||
.data_out (req_tag_in_w)
|
||||
);
|
||||
|
||||
assign req_data_in_merged[i] = {req_tag_in_w, req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]};
|
||||
end
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.DATAW (REQ_DATAW),
|
||||
.BUFFERED (BUFFERED_REQ),
|
||||
.TYPE (TYPE)
|
||||
) req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in_merged),
|
||||
.ready_in (req_ready_in),
|
||||
.valid_out (req_valid_out),
|
||||
.data_out ({req_tag_out, req_addr_out, req_rw_out, req_byteen_out, req_data_out}),
|
||||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged;
|
||||
|
||||
wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];
|
||||
|
||||
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (TAG_OUT_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_remove (
|
||||
.data_in (rsp_tag_in),
|
||||
.data_out (rsp_tag_in_w)
|
||||
);
|
||||
|
||||
VX_stream_demux #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.DATAW (RSP_DATAW),
|
||||
.BUFFERED (BUFFERED_RSP)
|
||||
) rsp_demux (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (rsp_sel),
|
||||
.valid_in (rsp_valid_in),
|
||||
.data_in ({rsp_tag_in_w, rsp_data_in}),
|
||||
.ready_in (rsp_ready_in),
|
||||
.valid_out (rsp_valid_out),
|
||||
.data_out (rsp_data_out_merged),
|
||||
.ready_out (rsp_ready_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
assign {rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i];
|
||||
end
|
||||
|
||||
end else begin
|
||||
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
assign req_valid_out = req_valid_in;
|
||||
assign req_tag_out = req_tag_in;
|
||||
assign req_addr_out = req_addr_in;
|
||||
assign req_rw_out = req_rw_in;
|
||||
assign req_byteen_out = req_byteen_in;
|
||||
assign req_data_out = req_data_in;
|
||||
assign req_ready_in = req_ready_out;
|
||||
|
||||
assign rsp_valid_out = rsp_valid_in;
|
||||
assign rsp_tag_out = rsp_tag_in;
|
||||
assign rsp_data_out = rsp_data_in;
|
||||
assign rsp_ready_in = rsp_ready_out;
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
|
@ -1,420 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_mem_unit # (
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_mem_unit
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_memsys_if.master perf_memsys_if,
|
||||
`endif
|
||||
|
||||
// Core <-> Dcache
|
||||
VX_dcache_req_if.slave dcache_req_if,
|
||||
VX_dcache_rsp_if.master dcache_rsp_if,
|
||||
|
||||
// Core <-> Icache
|
||||
VX_icache_req_if.slave icache_req_if,
|
||||
VX_icache_rsp_if.master icache_rsp_if,
|
||||
|
||||
// Memory
|
||||
VX_mem_req_if.master mem_req_if,
|
||||
VX_mem_rsp_if.slave mem_rsp_if
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_icache_if(), perf_dcache_if(), perf_smem_if();
|
||||
`endif
|
||||
|
||||
VX_mem_req_if #(
|
||||
.DATA_WIDTH (`ICACHE_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`ICACHE_MEM_ADDR_WIDTH),
|
||||
.TAG_WIDTH (`ICACHE_MEM_TAG_WIDTH)
|
||||
) icache_mem_req_if();
|
||||
|
||||
VX_mem_rsp_if #(
|
||||
.DATA_WIDTH (`ICACHE_MEM_DATA_WIDTH),
|
||||
.TAG_WIDTH (`ICACHE_MEM_TAG_WIDTH)
|
||||
) icache_mem_rsp_if();
|
||||
|
||||
VX_mem_req_if #(
|
||||
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
|
||||
.TAG_WIDTH (`DCACHE_MEM_TAG_WIDTH)
|
||||
) dcache_mem_req_if();
|
||||
|
||||
VX_mem_rsp_if #(
|
||||
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
|
||||
.TAG_WIDTH (`DCACHE_MEM_TAG_WIDTH)
|
||||
) dcache_mem_rsp_if();
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`DCACHE_NUM_REQS),
|
||||
.WORD_SIZE (`DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
|
||||
) dcache_req_tmp_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`DCACHE_NUM_REQS),
|
||||
.WORD_SIZE (`DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
|
||||
) dcache_rsp_tmp_if();
|
||||
|
||||
`RESET_RELAY (icache_reset);
|
||||
`RESET_RELAY (dcache_reset);
|
||||
`RESET_RELAY (mem_arb_reset);
|
||||
|
||||
VX_cache #(
|
||||
.CACHE_ID (`ICACHE_ID),
|
||||
.CACHE_SIZE (`ICACHE_SIZE),
|
||||
.CACHE_LINE_SIZE (`ICACHE_LINE_SIZE),
|
||||
.NUM_BANKS (1),
|
||||
.WORD_SIZE (`ICACHE_WORD_SIZE),
|
||||
.NUM_REQS (1),
|
||||
.CREQ_SIZE (`ICACHE_CREQ_SIZE),
|
||||
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
|
||||
.WRITE_ENABLE (0),
|
||||
.CORE_TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (`ICACHE_CORE_TAG_ID_BITS),
|
||||
.MEM_TAG_WIDTH (`ICACHE_MEM_TAG_WIDTH)
|
||||
) icache (
|
||||
`SCOPE_BIND_VX_mem_unit_icache
|
||||
|
||||
.clk (clk),
|
||||
.reset (icache_reset),
|
||||
|
||||
// Core request
|
||||
.core_req_valid (icache_req_if.valid),
|
||||
.core_req_rw (1'b0),
|
||||
.core_req_byteen ('b0),
|
||||
.core_req_addr (icache_req_if.addr),
|
||||
.core_req_data ('x),
|
||||
.core_req_tag (icache_req_if.tag),
|
||||
.core_req_ready (icache_req_if.ready),
|
||||
|
||||
// Core response
|
||||
.core_rsp_valid (icache_rsp_if.valid),
|
||||
.core_rsp_data (icache_rsp_if.data),
|
||||
.core_rsp_tag (icache_rsp_if.tag),
|
||||
.core_rsp_ready (icache_rsp_if.ready),
|
||||
`UNUSED_PIN (core_rsp_tmask),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_icache_if),
|
||||
`endif
|
||||
|
||||
// Memory Request
|
||||
.mem_req_valid (icache_mem_req_if.valid),
|
||||
.mem_req_rw (icache_mem_req_if.rw),
|
||||
.mem_req_byteen (icache_mem_req_if.byteen),
|
||||
.mem_req_addr (icache_mem_req_if.addr),
|
||||
.mem_req_data (icache_mem_req_if.data),
|
||||
.mem_req_tag (icache_mem_req_if.tag),
|
||||
.mem_req_ready (icache_mem_req_if.ready),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (icache_mem_rsp_if.valid),
|
||||
.mem_rsp_data (icache_mem_rsp_if.data),
|
||||
.mem_rsp_tag (icache_mem_rsp_if.tag),
|
||||
.mem_rsp_ready (icache_mem_rsp_if.ready)
|
||||
);
|
||||
|
||||
VX_cache #(
|
||||
.CACHE_ID (`DCACHE_ID),
|
||||
.CACHE_SIZE (`DCACHE_SIZE),
|
||||
.CACHE_LINE_SIZE (`DCACHE_LINE_SIZE),
|
||||
.NUM_BANKS (`DCACHE_NUM_BANKS),
|
||||
.NUM_PORTS (`DCACHE_NUM_PORTS),
|
||||
.WORD_SIZE (`DCACHE_WORD_SIZE),
|
||||
.NUM_REQS (`DCACHE_NUM_REQS),
|
||||
.CREQ_SIZE (`DCACHE_CREQ_SIZE),
|
||||
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
|
||||
.WRITE_ENABLE (1),
|
||||
.CORE_TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE),
|
||||
.CORE_TAG_ID_BITS (`DCACHE_CORE_TAG_ID_BITS-`SM_ENABLE),
|
||||
.MEM_TAG_WIDTH (`DCACHE_MEM_TAG_WIDTH),
|
||||
.NC_ENABLE (1)
|
||||
) dcache (
|
||||
`SCOPE_BIND_VX_mem_unit_dcache
|
||||
|
||||
.clk (clk),
|
||||
.reset (dcache_reset),
|
||||
|
||||
// Core req
|
||||
.core_req_valid (dcache_req_tmp_if.valid),
|
||||
.core_req_rw (dcache_req_tmp_if.rw),
|
||||
.core_req_byteen (dcache_req_tmp_if.byteen),
|
||||
.core_req_addr (dcache_req_tmp_if.addr),
|
||||
.core_req_data (dcache_req_tmp_if.data),
|
||||
.core_req_tag (dcache_req_tmp_if.tag),
|
||||
.core_req_ready (dcache_req_tmp_if.ready),
|
||||
|
||||
// Core response
|
||||
.core_rsp_valid (dcache_rsp_tmp_if.valid),
|
||||
.core_rsp_tmask (dcache_rsp_tmp_if.tmask),
|
||||
.core_rsp_data (dcache_rsp_tmp_if.data),
|
||||
.core_rsp_tag (dcache_rsp_tmp_if.tag),
|
||||
.core_rsp_ready (dcache_rsp_tmp_if.ready),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_dcache_if),
|
||||
`endif
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (dcache_mem_req_if.valid),
|
||||
.mem_req_rw (dcache_mem_req_if.rw),
|
||||
.mem_req_byteen (dcache_mem_req_if.byteen),
|
||||
.mem_req_addr (dcache_mem_req_if.addr),
|
||||
.mem_req_data (dcache_mem_req_if.data),
|
||||
.mem_req_tag (dcache_mem_req_if.tag),
|
||||
.mem_req_ready (dcache_mem_req_if.ready),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (dcache_mem_rsp_if.valid),
|
||||
.mem_rsp_data (dcache_mem_rsp_if.data),
|
||||
.mem_rsp_tag (dcache_mem_rsp_if.tag),
|
||||
.mem_rsp_ready (dcache_mem_rsp_if.ready)
|
||||
);
|
||||
|
||||
if (`SM_ENABLE) begin
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`DCACHE_NUM_REQS),
|
||||
.WORD_SIZE (`DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
|
||||
) smem_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`DCACHE_NUM_REQS),
|
||||
.WORD_SIZE (`DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE)
|
||||
) smem_rsp_if();
|
||||
|
||||
`RESET_RELAY (smem_arb_reset);
|
||||
`RESET_RELAY (smem_reset);
|
||||
|
||||
VX_smem_arb #(
|
||||
.NUM_REQS (2),
|
||||
.LANES (`NUM_THREADS),
|
||||
.DATA_SIZE (4),
|
||||
.TAG_IN_WIDTH (`DCACHE_CORE_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (0), // SM flag
|
||||
.TYPE ("P"),
|
||||
.BUFFERED_REQ (2),
|
||||
.BUFFERED_RSP (1)
|
||||
) smem_arb (
|
||||
.clk (clk),
|
||||
.reset (smem_arb_reset),
|
||||
|
||||
// input request
|
||||
.req_valid_in (dcache_req_if.valid),
|
||||
.req_rw_in (dcache_req_if.rw),
|
||||
.req_byteen_in (dcache_req_if.byteen),
|
||||
.req_addr_in (dcache_req_if.addr),
|
||||
.req_data_in (dcache_req_if.data),
|
||||
.req_tag_in (dcache_req_if.tag),
|
||||
.req_ready_in (dcache_req_if.ready),
|
||||
|
||||
// output requests
|
||||
.req_valid_out ({smem_req_if.valid, dcache_req_tmp_if.valid}),
|
||||
.req_rw_out ({smem_req_if.rw, dcache_req_tmp_if.rw}),
|
||||
.req_byteen_out ({smem_req_if.byteen, dcache_req_tmp_if.byteen}),
|
||||
.req_addr_out ({smem_req_if.addr, dcache_req_tmp_if.addr}),
|
||||
.req_data_out ({smem_req_if.data, dcache_req_tmp_if.data}),
|
||||
.req_tag_out ({smem_req_if.tag, dcache_req_tmp_if.tag}),
|
||||
.req_ready_out ({smem_req_if.ready, dcache_req_tmp_if.ready}),
|
||||
|
||||
// input responses
|
||||
.rsp_valid_in ({smem_rsp_if.valid, dcache_rsp_tmp_if.valid}),
|
||||
.rsp_tmask_in ({smem_rsp_if.tmask, dcache_rsp_tmp_if.tmask}),
|
||||
.rsp_data_in ({smem_rsp_if.data, dcache_rsp_tmp_if.data}),
|
||||
.rsp_tag_in ({smem_rsp_if.tag, dcache_rsp_tmp_if.tag}),
|
||||
.rsp_ready_in ({smem_rsp_if.ready, dcache_rsp_tmp_if.ready}),
|
||||
|
||||
// output response
|
||||
.rsp_valid_out (dcache_rsp_if.valid),
|
||||
.rsp_tmask_out (dcache_rsp_if.tmask),
|
||||
.rsp_tag_out (dcache_rsp_if.tag),
|
||||
.rsp_data_out (dcache_rsp_if.data),
|
||||
.rsp_ready_out (dcache_rsp_if.ready)
|
||||
);
|
||||
|
||||
VX_shared_mem #(
|
||||
.CACHE_ID (`SMEM_ID),
|
||||
.CACHE_SIZE (`SMEM_SIZE),
|
||||
.NUM_BANKS (`SMEM_NUM_BANKS),
|
||||
.WORD_SIZE (`SMEM_WORD_SIZE),
|
||||
.NUM_REQS (`SMEM_NUM_REQS),
|
||||
.CREQ_SIZE (`SMEM_CREQ_SIZE),
|
||||
.CRSQ_SIZE (`SMEM_CRSQ_SIZE),
|
||||
.CORE_TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH-`SM_ENABLE),
|
||||
.CORE_TAG_ID_BITS (`DCACHE_CORE_TAG_ID_BITS-`SM_ENABLE),
|
||||
.BANK_ADDR_OFFSET (`SMEM_BANK_ADDR_OFFSET)
|
||||
) smem (
|
||||
.clk (clk),
|
||||
.reset (smem_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_smem_if),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
.core_req_valid (smem_req_if.valid),
|
||||
.core_req_rw (smem_req_if.rw),
|
||||
.core_req_byteen (smem_req_if.byteen),
|
||||
.core_req_addr (smem_req_if.addr),
|
||||
.core_req_data (smem_req_if.data),
|
||||
.core_req_tag (smem_req_if.tag),
|
||||
.core_req_ready (smem_req_if.ready),
|
||||
|
||||
// Core response
|
||||
.core_rsp_valid (smem_rsp_if.valid),
|
||||
.core_rsp_tmask (smem_rsp_if.tmask),
|
||||
.core_rsp_data (smem_rsp_if.data),
|
||||
.core_rsp_tag (smem_rsp_if.tag),
|
||||
.core_rsp_ready (smem_rsp_if.ready)
|
||||
);
|
||||
end else begin
|
||||
// core to D-cache request
|
||||
for (genvar i = 0; i < `DCACHE_NUM_REQS; ++i) begin
|
||||
VX_skid_buffer #(
|
||||
.DATAW ((32-`CLOG2(`DCACHE_WORD_SIZE)) + 1 + `DCACHE_WORD_SIZE + (8*`DCACHE_WORD_SIZE) + `DCACHE_CORE_TAG_WIDTH)
|
||||
) req_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (dcache_req_if.valid[i]),
|
||||
.data_in ({dcache_req_if.addr[i], dcache_req_if.rw[i], dcache_req_if.byteen[i], dcache_req_if.data[i], dcache_req_if.tag[i]}),
|
||||
.ready_in (dcache_req_if.ready[i]),
|
||||
.valid_out (dcache_req_tmp_if.valid[i]),
|
||||
.data_out ({dcache_req_tmp_if.addr[i], dcache_req_tmp_if.rw[i], dcache_req_tmp_if.byteen[i], dcache_req_tmp_if.data[i], dcache_req_tmp_if.tag[i]}),
|
||||
.ready_out (dcache_req_tmp_if.ready[i])
|
||||
);
|
||||
end
|
||||
|
||||
// D-cache to core reponse
|
||||
assign dcache_rsp_if.valid = dcache_rsp_tmp_if.valid;
|
||||
assign dcache_rsp_if.tmask = dcache_rsp_tmp_if.tmask;
|
||||
assign dcache_rsp_if.tag = dcache_rsp_tmp_if.tag;
|
||||
assign dcache_rsp_if.data = dcache_rsp_tmp_if.data;
|
||||
assign dcache_rsp_tmp_if.ready = dcache_rsp_if.ready;
|
||||
end
|
||||
|
||||
wire [`DCACHE_MEM_TAG_WIDTH-1:0] icache_mem_req_tag = `DCACHE_MEM_TAG_WIDTH'(icache_mem_req_if.tag);
|
||||
wire [`DCACHE_MEM_TAG_WIDTH-1:0] icache_mem_rsp_tag;
|
||||
assign icache_mem_rsp_if.tag = icache_mem_rsp_tag[`ICACHE_MEM_TAG_WIDTH-1:0];
|
||||
`UNUSED_VAR (icache_mem_rsp_tag)
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_REQS (2),
|
||||
.DATA_WIDTH (`DCACHE_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`DCACHE_MEM_ADDR_WIDTH),
|
||||
.TAG_IN_WIDTH (`DCACHE_MEM_TAG_WIDTH),
|
||||
.TYPE ("R"),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (2)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (mem_arb_reset),
|
||||
|
||||
// Source request
|
||||
.req_valid_in ({dcache_mem_req_if.valid, icache_mem_req_if.valid}),
|
||||
.req_rw_in ({dcache_mem_req_if.rw, icache_mem_req_if.rw}),
|
||||
.req_byteen_in ({dcache_mem_req_if.byteen, icache_mem_req_if.byteen}),
|
||||
.req_addr_in ({dcache_mem_req_if.addr, icache_mem_req_if.addr}),
|
||||
.req_data_in ({dcache_mem_req_if.data, icache_mem_req_if.data}),
|
||||
.req_tag_in ({dcache_mem_req_if.tag, icache_mem_req_tag}),
|
||||
.req_ready_in ({dcache_mem_req_if.ready, icache_mem_req_if.ready}),
|
||||
|
||||
// Memory request
|
||||
.req_valid_out (mem_req_if.valid),
|
||||
.req_rw_out (mem_req_if.rw),
|
||||
.req_byteen_out (mem_req_if.byteen),
|
||||
.req_addr_out (mem_req_if.addr),
|
||||
.req_data_out (mem_req_if.data),
|
||||
.req_tag_out (mem_req_if.tag),
|
||||
.req_ready_out (mem_req_if.ready),
|
||||
|
||||
// Source response
|
||||
.rsp_valid_out ({dcache_mem_rsp_if.valid, icache_mem_rsp_if.valid}),
|
||||
.rsp_data_out ({dcache_mem_rsp_if.data, icache_mem_rsp_if.data}),
|
||||
.rsp_tag_out ({dcache_mem_rsp_if.tag, icache_mem_rsp_tag}),
|
||||
.rsp_ready_out ({dcache_mem_rsp_if.ready, icache_mem_rsp_if.ready}),
|
||||
|
||||
// Memory response
|
||||
.rsp_valid_in (mem_rsp_if.valid),
|
||||
.rsp_tag_in (mem_rsp_if.tag),
|
||||
.rsp_data_in (mem_rsp_if.data),
|
||||
.rsp_ready_in (mem_rsp_if.ready)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
`UNUSED_VAR (perf_dcache_if.mem_stalls)
|
||||
`UNUSED_VAR (perf_dcache_if.crsp_stalls)
|
||||
|
||||
assign perf_memsys_if.icache_reads = perf_icache_if.reads;
|
||||
assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses;
|
||||
assign perf_memsys_if.dcache_reads = perf_dcache_if.reads;
|
||||
assign perf_memsys_if.dcache_writes = perf_dcache_if.writes;
|
||||
assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses;
|
||||
assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses;
|
||||
assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls;
|
||||
assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls;
|
||||
|
||||
if (`SM_ENABLE) begin
|
||||
assign perf_memsys_if.smem_reads = perf_smem_if.reads;
|
||||
assign perf_memsys_if.smem_writes = perf_smem_if.writes;
|
||||
assign perf_memsys_if.smem_bank_stalls = perf_smem_if.bank_stalls;
|
||||
end else begin
|
||||
assign perf_memsys_if.smem_reads = 0;
|
||||
assign perf_memsys_if.smem_writes = 0;
|
||||
assign perf_memsys_if.smem_bank_stalls = 0;
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_pending_reads <= 0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= perf_mem_pending_reads +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw))));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_writes <= 0;
|
||||
perf_mem_lat <= 0;
|
||||
end else begin
|
||||
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
|
||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_memsys_if.mem_reads = perf_mem_reads;
|
||||
assign perf_memsys_if.mem_writes = perf_mem_writes;
|
||||
assign perf_memsys_if.mem_latency = perf_mem_lat;
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,226 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_muldiv (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Inputs
|
||||
input wire [`INST_MUL_BITS-1:0] alu_op,
|
||||
input wire [`UUID_BITS-1:0] uuid_in,
|
||||
input wire [`NW_BITS-1:0] wid_in,
|
||||
input wire [`NUM_THREADS-1:0] tmask_in,
|
||||
input wire [31:0] PC_in,
|
||||
input wire [`NR_BITS-1:0] rd_in,
|
||||
input wire wb_in,
|
||||
input wire [`NUM_THREADS-1:0][31:0] alu_in1,
|
||||
input wire [`NUM_THREADS-1:0][31:0] alu_in2,
|
||||
|
||||
// Outputs
|
||||
output wire [`UUID_BITS-1:0] uuid_out,
|
||||
output wire [`NW_BITS-1:0] wid_out,
|
||||
output wire [`NUM_THREADS-1:0] tmask_out,
|
||||
output wire [31:0] PC_out,
|
||||
output wire [`NR_BITS-1:0] rd_out,
|
||||
output wire wb_out,
|
||||
output wire [`NUM_THREADS-1:0][31:0] data_out,
|
||||
|
||||
// handshake
|
||||
input wire valid_in,
|
||||
output wire ready_in,
|
||||
output wire valid_out,
|
||||
input wire ready_out
|
||||
);
|
||||
|
||||
wire is_div_op = `INST_MUL_IS_DIV(alu_op);
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] mul_result;
|
||||
wire [`UUID_BITS-1:0] mul_uuid_out;
|
||||
wire [`NW_BITS-1:0] mul_wid_out;
|
||||
wire [`NUM_THREADS-1:0] mul_tmask_out;
|
||||
wire [31:0] mul_PC_out;
|
||||
wire [`NR_BITS-1:0] mul_rd_out;
|
||||
wire mul_wb_out;
|
||||
|
||||
wire stall_out;
|
||||
|
||||
wire mul_valid_out;
|
||||
wire mul_valid_in = valid_in && !is_div_op;
|
||||
wire mul_ready_in = ~stall_out || ~mul_valid_out;
|
||||
|
||||
wire is_mulh_in = (alu_op != `INST_MUL_MUL);
|
||||
wire is_signed_mul_a = (alu_op != `INST_MUL_MULHU);
|
||||
wire is_signed_mul_b = (alu_op != `INST_MUL_MULHU && alu_op != `INST_MUL_MULHSU);
|
||||
|
||||
`ifdef IMUL_DPI
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] mul_result_tmp;
|
||||
|
||||
wire mul_fire_in = mul_valid_in && mul_ready_in;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire [31:0] mul_resultl, mul_resulth;
|
||||
always @(*) begin
|
||||
dpi_imul (mul_fire_in, alu_in1[i], alu_in2[i], is_signed_mul_a, is_signed_mul_b, mul_resultl, mul_resulth);
|
||||
end
|
||||
assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : mul_resultl;
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (mul_ready_in),
|
||||
.data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}),
|
||||
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result})
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
wire is_mulh_out;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire [32:0] mul_in1 = {is_signed_mul_a & alu_in1[i][31], alu_in1[i]};
|
||||
wire [32:0] mul_in2 = {is_signed_mul_b & alu_in2[i][31], alu_in2[i]};
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
wire [65:0] mul_result_tmp;
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
VX_multiplier #(
|
||||
.WIDTHA (33),
|
||||
.WIDTHB (33),
|
||||
.WIDTHP (66),
|
||||
.SIGNED (1),
|
||||
.LATENCY (`LATENCY_IMUL)
|
||||
) multiplier (
|
||||
.clk (clk),
|
||||
.enable (mul_ready_in),
|
||||
.dataa (mul_in1),
|
||||
.datab (mul_in2),
|
||||
.result (mul_result_tmp)
|
||||
);
|
||||
|
||||
assign mul_result[i] = is_mulh_out ? mul_result_tmp[63:32] : mul_result_tmp[31:0];
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (mul_ready_in),
|
||||
.data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}),
|
||||
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out})
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result;
|
||||
wire [`UUID_BITS-1:0] div_uuid_out;
|
||||
wire [`NW_BITS-1:0] div_wid_out;
|
||||
wire [`NUM_THREADS-1:0] div_tmask_out;
|
||||
wire [31:0] div_PC_out;
|
||||
wire [`NR_BITS-1:0] div_rd_out;
|
||||
wire div_wb_out;
|
||||
|
||||
wire is_rem_op_in = (alu_op == `INST_MUL_REM) || (alu_op == `INST_MUL_REMU);
|
||||
wire is_signed_div = (alu_op == `INST_MUL_DIV) || (alu_op == `INST_MUL_REM);
|
||||
wire div_valid_in = valid_in && is_div_op;
|
||||
wire div_ready_out = ~stall_out && ~mul_valid_out; // arbitration prioritizes MUL
|
||||
wire div_ready_in;
|
||||
wire div_valid_out;
|
||||
|
||||
`ifdef IDIV_DPI
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result_tmp;
|
||||
|
||||
wire div_fire_in = div_valid_in && div_ready_in;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
wire [31:0] div_quotient, div_remainder;
|
||||
always @(*) begin
|
||||
dpi_idiv (div_fire_in, alu_in1[i], alu_in2[i], is_signed_div, div_quotient, div_remainder);
|
||||
end
|
||||
assign div_result_tmp[i] = is_rem_op_in ? div_remainder : div_quotient;
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) div_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (div_ready_in),
|
||||
.data_in ({div_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}),
|
||||
.data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result})
|
||||
);
|
||||
|
||||
assign div_ready_in = div_ready_out || ~div_valid_out;
|
||||
|
||||
`else
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
|
||||
wire is_rem_op_out;
|
||||
|
||||
VX_serial_div #(
|
||||
.WIDTHN (32),
|
||||
.WIDTHD (32),
|
||||
.WIDTHQ (32),
|
||||
.WIDTHR (32),
|
||||
.LANES (`NUM_THREADS),
|
||||
.TAGW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
|
||||
) divide (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (div_valid_in),
|
||||
.ready_in (div_ready_in),
|
||||
.signed_mode(is_signed_div),
|
||||
.tag_in ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}),
|
||||
.numer (alu_in1),
|
||||
.denom (alu_in2),
|
||||
.quotient (div_result_tmp),
|
||||
.remainder (rem_result_tmp),
|
||||
.ready_out (div_ready_out),
|
||||
.valid_out (div_valid_out),
|
||||
.tag_out ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
|
||||
);
|
||||
|
||||
assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;
|
||||
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire rsp_valid = mul_valid_out || div_valid_out;
|
||||
wire [`UUID_BITS-1:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out;
|
||||
wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out;
|
||||
wire [31:0] rsp_PC = mul_valid_out ? mul_PC_out : div_PC_out;
|
||||
wire [`NR_BITS-1:0] rsp_rd = mul_valid_out ? mul_rd_out : div_rd_out;
|
||||
wire rsp_wb = mul_valid_out ? mul_wb_out : div_wb_out;
|
||||
wire [`NUM_THREADS-1:0][31:0] rsp_data = mul_valid_out ? mul_result : div_result;
|
||||
|
||||
assign stall_out = ~ready_out && valid_out;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
|
||||
.data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out})
|
||||
);
|
||||
|
||||
// can accept new request?
|
||||
assign ready_in = is_div_op ? div_ready_in : mul_ready_in;
|
||||
|
||||
endmodule
|
|
@ -1,261 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_pipeline #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_pipeline
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Dcache core request
|
||||
output wire [`NUM_THREADS-1:0] dcache_req_valid,
|
||||
output wire [`NUM_THREADS-1:0] dcache_req_rw,
|
||||
output wire [`NUM_THREADS-1:0][3:0] dcache_req_byteen,
|
||||
output wire [`NUM_THREADS-1:0][29:0] dcache_req_addr,
|
||||
output wire [`NUM_THREADS-1:0][31:0] dcache_req_data,
|
||||
output wire [`NUM_THREADS-1:0][`DCACHE_CORE_TAG_WIDTH-1:0] dcache_req_tag,
|
||||
input wire [`NUM_THREADS-1:0] dcache_req_ready,
|
||||
|
||||
// Dcache core reponse
|
||||
input wire dcache_rsp_valid,
|
||||
input wire [`NUM_THREADS-1:0] dcache_rsp_tmask,
|
||||
input wire [`NUM_THREADS-1:0][31:0] dcache_rsp_data,
|
||||
input wire [`DCACHE_CORE_TAG_WIDTH-1:0] dcache_rsp_tag,
|
||||
output wire dcache_rsp_ready,
|
||||
|
||||
// Icache core request
|
||||
output wire icache_req_valid,
|
||||
output wire [29:0] icache_req_addr,
|
||||
output wire [`ICACHE_CORE_TAG_WIDTH-1:0] icache_req_tag,
|
||||
input wire icache_req_ready,
|
||||
|
||||
// Icache core response
|
||||
input wire icache_rsp_valid,
|
||||
input wire [31:0] icache_rsp_data,
|
||||
input wire [`ICACHE_CORE_TAG_WIDTH-1:0] icache_rsp_tag,
|
||||
output wire icache_rsp_ready,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
`endif
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
//
|
||||
// Dcache request
|
||||
//
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
|
||||
) dcache_req_if();
|
||||
|
||||
assign dcache_req_valid = dcache_req_if.valid;
|
||||
assign dcache_req_rw = dcache_req_if.rw;
|
||||
assign dcache_req_byteen = dcache_req_if.byteen;
|
||||
assign dcache_req_addr = dcache_req_if.addr;
|
||||
assign dcache_req_data = dcache_req_if.data;
|
||||
assign dcache_req_tag = dcache_req_if.tag;
|
||||
assign dcache_req_if.ready = dcache_req_ready;
|
||||
|
||||
//
|
||||
// Dcache response
|
||||
//
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`DCACHE_CORE_TAG_WIDTH)
|
||||
) dcache_rsp_if();
|
||||
|
||||
assign dcache_rsp_if.valid = dcache_rsp_valid;
|
||||
assign dcache_rsp_if.tmask = dcache_rsp_tmask;
|
||||
assign dcache_rsp_if.data = dcache_rsp_data;
|
||||
assign dcache_rsp_if.tag = dcache_rsp_tag;
|
||||
assign dcache_rsp_ready = dcache_rsp_if.ready;
|
||||
|
||||
//
|
||||
// Icache request
|
||||
//
|
||||
|
||||
VX_icache_req_if #(
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
|
||||
) icache_req_if();
|
||||
|
||||
assign icache_req_valid = icache_req_if.valid;
|
||||
assign icache_req_addr = icache_req_if.addr;
|
||||
assign icache_req_tag = icache_req_if.tag;
|
||||
assign icache_req_if.ready = icache_req_ready;
|
||||
|
||||
//
|
||||
// Icache response
|
||||
//
|
||||
|
||||
VX_icache_rsp_if #(
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`ICACHE_CORE_TAG_WIDTH)
|
||||
) icache_rsp_if();
|
||||
|
||||
assign icache_rsp_if.valid = icache_rsp_valid;
|
||||
assign icache_rsp_if.data = icache_rsp_data;
|
||||
assign icache_rsp_if.tag = icache_rsp_tag;
|
||||
assign icache_rsp_ready = icache_rsp_if.ready;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_fetch_to_csr_if fetch_to_csr_if();
|
||||
VX_cmt_to_csr_if cmt_to_csr_if();
|
||||
VX_decode_if decode_if();
|
||||
VX_branch_ctl_if branch_ctl_if();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
VX_ifetch_rsp_if ifetch_rsp_if();
|
||||
VX_alu_req_if alu_req_if();
|
||||
VX_lsu_req_if lsu_req_if();
|
||||
VX_csr_req_if csr_req_if();
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_fpu_req_if fpu_req_if();
|
||||
`endif
|
||||
VX_gpu_req_if gpu_req_if();
|
||||
VX_writeback_if writeback_if();
|
||||
VX_wstall_if wstall_if();
|
||||
VX_join_if join_if();
|
||||
VX_commit_if alu_commit_if();
|
||||
VX_commit_if ld_commit_if();
|
||||
VX_commit_if st_commit_if();
|
||||
VX_commit_if csr_commit_if();
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_commit_if fpu_commit_if();
|
||||
`endif
|
||||
VX_commit_if gpu_commit_if();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if perf_pipeline_if();
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (fetch_reset);
|
||||
`RESET_RELAY (decode_reset);
|
||||
`RESET_RELAY (issue_reset);
|
||||
`RESET_RELAY (execute_reset);
|
||||
`RESET_RELAY (commit_reset);
|
||||
|
||||
VX_fetch #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) fetch (
|
||||
`SCOPE_BIND_VX_pipeline_fetch
|
||||
.clk (clk),
|
||||
.reset (fetch_reset),
|
||||
.icache_req_if (icache_req_if),
|
||||
.icache_rsp_if (icache_rsp_if),
|
||||
.wstall_if (wstall_if),
|
||||
.join_if (join_if),
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.ifetch_rsp_if (ifetch_rsp_if),
|
||||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
VX_decode #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_decode_if (perf_pipeline_if.decode),
|
||||
`endif
|
||||
.ifetch_rsp_if (ifetch_rsp_if),
|
||||
.decode_if (decode_if),
|
||||
.wstall_if (wstall_if),
|
||||
.join_if (join_if)
|
||||
);
|
||||
|
||||
VX_issue #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) issue (
|
||||
`SCOPE_BIND_VX_pipeline_issue
|
||||
|
||||
.clk (clk),
|
||||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_issue_if (perf_pipeline_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
.writeback_if (writeback_if),
|
||||
|
||||
.alu_req_if (alu_req_if),
|
||||
.lsu_req_if (lsu_req_if),
|
||||
.csr_req_if (csr_req_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_req_if (fpu_req_if),
|
||||
`endif
|
||||
.gpu_req_if (gpu_req_if)
|
||||
);
|
||||
|
||||
VX_execute #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) execute (
|
||||
`SCOPE_BIND_VX_pipeline_execute
|
||||
|
||||
.clk (clk),
|
||||
.reset (execute_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if (perf_pipeline_if),
|
||||
`endif
|
||||
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if),
|
||||
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
|
||||
.alu_req_if (alu_req_if),
|
||||
.lsu_req_if (lsu_req_if),
|
||||
.csr_req_if (csr_req_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_req_if (fpu_req_if),
|
||||
`endif
|
||||
.gpu_req_if (gpu_req_if),
|
||||
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.branch_ctl_if (branch_ctl_if),
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.ld_commit_if (ld_commit_if),
|
||||
.st_commit_if (st_commit_if),
|
||||
.csr_commit_if (csr_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.gpu_commit_if (gpu_commit_if),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
VX_commit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) commit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
||||
.alu_commit_if (alu_commit_if),
|
||||
.ld_commit_if (ld_commit_if),
|
||||
.st_commit_if (st_commit_if),
|
||||
.csr_commit_if (csr_commit_if),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
`endif
|
||||
.gpu_commit_if (gpu_commit_if),
|
||||
|
||||
.writeback_if (writeback_if),
|
||||
.cmt_to_csr_if (cmt_to_csr_if)
|
||||
);
|
||||
|
||||
endmodule
|
|
@ -1,5 +1,18 @@
|
|||
`ifndef VX_PLATFORM
|
||||
`define VX_PLATFORM
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_PLATFORM_VH
|
||||
`define VX_PLATFORM_VH
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "util_dpi.vh"
|
||||
|
@ -9,8 +22,36 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`ifdef VIVADO
|
||||
`define STRING
|
||||
`else
|
||||
`define STRING string
|
||||
`endif
|
||||
|
||||
`ifdef SYNTHESIS
|
||||
`define TRACING_ON
|
||||
`define TRACING_OFF
|
||||
`ifndef NDEBUG
|
||||
`define DEBUG_BLOCK(x) x
|
||||
`else
|
||||
`define DEBUG_BLOCK(x)
|
||||
`endif
|
||||
`define IGNORE_UNOPTFLAT_BEGIN
|
||||
`define IGNORE_UNOPTFLAT_END
|
||||
`define IGNORE_UNUSED_BEGIN
|
||||
`define IGNORE_UNUSED_END
|
||||
`define IGNORE_WARNINGS_BEGIN
|
||||
`define IGNORE_WARNINGS_END
|
||||
`define UNUSED_PARAM(x)
|
||||
`define UNUSED_SPARAM(x)
|
||||
`define UNUSED_VAR(x)
|
||||
`define UNUSED_PIN(x) . x ()
|
||||
`define UNUSED_ARG(x) x
|
||||
`define TRACE(level, args) $write args
|
||||
`else
|
||||
`ifdef VERILATOR
|
||||
`define TRACING_ON /* verilator tracing_on */
|
||||
`define TRACING_OFF /* verilator tracing_off */
|
||||
`ifndef NDEBUG
|
||||
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
|
||||
x \
|
||||
|
@ -19,6 +60,10 @@
|
|||
`define DEBUG_BLOCK(x)
|
||||
`endif
|
||||
|
||||
`define IGNORE_UNOPTFLAT_BEGIN /* verilator lint_off UNOPTFLAT */
|
||||
|
||||
`define IGNORE_UNOPTFLAT_END /* verilator lint_off UNOPTFLAT */
|
||||
|
||||
`define IGNORE_UNUSED_BEGIN /* verilator lint_off UNUSED */
|
||||
|
||||
`define IGNORE_UNUSED_END /* verilator lint_on UNUSED */
|
||||
|
@ -30,7 +75,9 @@
|
|||
/* verilator lint_off UNDRIVEN */ \
|
||||
/* verilator lint_off DECLFILENAME */ \
|
||||
/* verilator lint_off IMPLICIT */ \
|
||||
/* verilator lint_off IMPORTSTAR */
|
||||
/* verilator lint_off PINMISSING */ \
|
||||
/* verilator lint_off IMPORTSTAR */ \
|
||||
/* verilator lint_off UNSIGNED */
|
||||
|
||||
`define IGNORE_WARNINGS_END /* verilator lint_on UNUSED */ \
|
||||
/* verilator lint_on PINCONNECTEMPTY */ \
|
||||
|
@ -39,68 +86,80 @@
|
|||
/* verilator lint_on UNDRIVEN */ \
|
||||
/* verilator lint_on DECLFILENAME */ \
|
||||
/* verilator lint_on IMPLICIT */ \
|
||||
/* verilator lint_on IMPORTSTAR */
|
||||
/* verilator lint_off PINMISSING */ \
|
||||
/* verilator lint_on IMPORTSTAR */ \
|
||||
/* verilator lint_on UNSIGNED */
|
||||
|
||||
`define UNUSED_PARAM(x) /* verilator lint_off UNUSED */ \
|
||||
localparam __``x = x; \
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
`define UNUSED_VAR(x) always @(x) begin end
|
||||
`define UNUSED_SPARAM(x) /* verilator lint_off UNUSED */ \
|
||||
localparam `STRING __``x = x; \
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
|
||||
. x () \
|
||||
/* verilator lint_on PINCONNECTEMPTY */
|
||||
`define UNUSED_VAR(x) if (1) begin \
|
||||
/* verilator lint_off UNUSED */ \
|
||||
wire [$bits(x)-1:0] __x = x; \
|
||||
/* verilator lint_on UNUSED */ \
|
||||
end
|
||||
|
||||
`define ERROR(msg) \
|
||||
$error msg
|
||||
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
|
||||
. x () \
|
||||
/* verilator lint_on PINCONNECTEMPTY */
|
||||
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
|
||||
x \
|
||||
/* verilator lint_on UNUSED */
|
||||
`define TRACE(level, args) dpi_trace(level, $sformatf args)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`define ASSERT(cond, msg) \
|
||||
assert(cond) else $error msg
|
||||
|
||||
`define STATIC_ASSERT(cond, msg) \
|
||||
`ifdef SIMULATION
|
||||
`define STATIC_ASSERT(cond, msg) \
|
||||
generate \
|
||||
if (!(cond)) $error msg; \
|
||||
endgenerate
|
||||
|
||||
`define RUNTIME_ASSERT(cond, msg) \
|
||||
always @(posedge clk) begin \
|
||||
assert(cond) else $error msg; \
|
||||
end
|
||||
`define ERROR(msg) \
|
||||
$error msg
|
||||
|
||||
`define TRACING_ON /* verilator tracing_on */
|
||||
`define TRACING_OFF /* verilator tracing_off */
|
||||
`define ASSERT(cond, msg) \
|
||||
assert(cond) else $error msg
|
||||
|
||||
`else // SYNTHESIS
|
||||
|
||||
`define DEBUG_BLOCK(x)
|
||||
`define IGNORE_UNUSED_BEGIN
|
||||
`define IGNORE_UNUSED_END
|
||||
`define IGNORE_WARNINGS_BEGIN
|
||||
`define IGNORE_WARNINGS_END
|
||||
`define UNUSED_PARAM(x)
|
||||
`define UNUSED_VAR(x)
|
||||
`define UNUSED_PIN(x) . x ()
|
||||
`define ERROR(msg)
|
||||
`define ASSERT(cond, msg) if (cond);
|
||||
`define STATIC_ASSERT(cond, msg)
|
||||
`define RUNTIME_ASSERT(cond, msg)
|
||||
`define TRACING_ON
|
||||
`define TRACING_OFF
|
||||
|
||||
`endif // SYNTHESIS
|
||||
`define RUNTIME_ASSERT(cond, msg) \
|
||||
always @(posedge clk) begin \
|
||||
assert(cond) else $error msg; \
|
||||
end
|
||||
`else
|
||||
`define STATIC_ASSERT(cond, msg)
|
||||
`define ERROR(msg) //
|
||||
`define ASSERT(cond, msg) //
|
||||
`define RUNTIME_ASSERT(cond, msg)
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef QUARTUS
|
||||
`define MAX_FANOUT 4
|
||||
`define IF_DATA_SIZE(x) $bits(x.data)
|
||||
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
|
||||
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
|
||||
`define DISABLE_BRAM (* ramstyle = "logic" *)
|
||||
`define PRESERVE_REG (* preserve *)
|
||||
`define PRESERVE_NET (* preserve *)
|
||||
`elsif VIVADO
|
||||
`define MAX_FANOUT 4
|
||||
`define IF_DATA_SIZE(x) $bits(x.data)
|
||||
`define USE_FAST_BRAM (* ram_style = "distributed" *)
|
||||
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
|
||||
`define DISABLE_BRAM (* ram_style = "registers" *)
|
||||
`define PRESERVE_NET (* keep = "true" *)
|
||||
`else
|
||||
`define MAX_FANOUT 4
|
||||
`define IF_DATA_SIZE(x) x.DATA_WIDTH
|
||||
`define USE_FAST_BRAM
|
||||
`define NO_RW_RAM_CHECK
|
||||
`define DISABLE_BRAM
|
||||
`define PRESERVE_REG
|
||||
`define PRESERVE_NET
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -112,52 +171,105 @@
|
|||
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)
|
||||
`define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
|
||||
|
||||
`define ABS(x) (($signed(x) < 0) ? (-$signed(x)) : (x));
|
||||
`define ABS(x) (((x) < 0) ? (-(x)) : (x));
|
||||
|
||||
`ifndef MIN
|
||||
`define MIN(x, y) (((x) < (y)) ? (x) : (y))
|
||||
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
`endif
|
||||
|
||||
`define UP(x) (((x) > 0) ? (x) : 1)
|
||||
`ifndef MAX
|
||||
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
`endif
|
||||
|
||||
`ifndef CLAMP
|
||||
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
|
||||
`endif
|
||||
|
||||
`ifndef UP
|
||||
`define UP(x) (((x) != 0) ? (x) : 1)
|
||||
`endif
|
||||
|
||||
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
|
||||
|
||||
`define LTRIM(x, s) x[s-1:0]
|
||||
|
||||
`define TRACE_ARRAY1D(a, m) \
|
||||
dpi_trace("{"); \
|
||||
for (integer i = (m-1); i >= 0; --i) begin \
|
||||
if (i != (m-1)) dpi_trace(", "); \
|
||||
dpi_trace("0x%0h", a[i]); \
|
||||
`define TRACE_ARRAY1D(lvl, arr, m) \
|
||||
`TRACE(lvl, ("{")); \
|
||||
for (integer __i = (m-1); __i >= 0; --__i) begin \
|
||||
if (__i != (m-1)) `TRACE(lvl, (", ")); \
|
||||
`TRACE(lvl, ("0x%0h", arr[__i])); \
|
||||
end \
|
||||
dpi_trace("}"); \
|
||||
`TRACE(lvl, ("}"));
|
||||
|
||||
`define TRACE_ARRAY2D(a, m, n) \
|
||||
dpi_trace("{"); \
|
||||
for (integer i = n-1; i >= 0; --i) begin \
|
||||
if (i != (n-1)) dpi_trace(", "); \
|
||||
dpi_trace("{"); \
|
||||
for (integer j = (m-1); j >= 0; --j) begin \
|
||||
if (j != (m-1)) dpi_trace(", "); \
|
||||
dpi_trace("0x%0h", a[i][j]); \
|
||||
`define TRACE_ARRAY2D(lvl, arr, m, n) \
|
||||
`TRACE(lvl, ("{")); \
|
||||
for (integer __i = n-1; __i >= 0; --__i) begin \
|
||||
if (__i != (n-1)) `TRACE(lvl, (", ")); \
|
||||
`TRACE(lvl, ("{")); \
|
||||
for (integer __j = (m-1); __j >= 0; --__j) begin \
|
||||
if (__j != (m-1)) `TRACE(lvl, (", "));\
|
||||
`TRACE(lvl, ("0x%0h", arr[__i][__j])); \
|
||||
end \
|
||||
dpi_trace("}"); \
|
||||
`TRACE(lvl, ("}")); \
|
||||
end \
|
||||
dpi_trace("}")
|
||||
`TRACE(lvl, ("}"))
|
||||
|
||||
`define RESET_RELAY(signal) \
|
||||
wire signal; \
|
||||
VX_reset_relay __``signal ( \
|
||||
.clk (clk), \
|
||||
.reset (reset), \
|
||||
.reset_o (signal) \
|
||||
`define RESET_RELAY_EX(dst, src, size, fanout) \
|
||||
wire [size-1:0] dst; \
|
||||
VX_reset_relay #(.N(size), .MAX_FANOUT(fanout)) __``dst ( \
|
||||
.clk (clk), \
|
||||
.reset (src), \
|
||||
.reset_o (dst) \
|
||||
)
|
||||
|
||||
`define POP_COUNT(out, in) \
|
||||
VX_popcount #( \
|
||||
.N ($bits(in)) \
|
||||
) __``out ( \
|
||||
.in_i (in), \
|
||||
.cnt_o (out) \
|
||||
)
|
||||
`define RESET_RELAY_EN(dst, src, enable) \
|
||||
`RESET_RELAY_EX (dst, src, 1, ((enable) ? 0 : -1))
|
||||
|
||||
`endif
|
||||
`define RESET_RELAY(dst, src) \
|
||||
`RESET_RELAY_EX (dst, src, 1, 0)
|
||||
|
||||
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4-> 2
|
||||
`define OUT_REG_TO_EB_SIZE(out_reg) `MIN(out_reg, 2)
|
||||
|
||||
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2
|
||||
`define OUT_REG_TO_EB_REG(out_reg) ((out_reg & 1) + ((out_reg >> 2) << 1))
|
||||
|
||||
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
|
||||
`define _REPEAT_0(f,s)
|
||||
`define _REPEAT_1(f,s) `f(0)
|
||||
`define _REPEAT_2(f,s) `f(1) `s `_REPEAT_1(f,s)
|
||||
`define _REPEAT_3(f,s) `f(2) `s `_REPEAT_2(f,s)
|
||||
`define _REPEAT_4(f,s) `f(3) `s `_REPEAT_3(f,s)
|
||||
`define _REPEAT_5(f,s) `f(4) `s `_REPEAT_4(f,s)
|
||||
`define _REPEAT_6(f,s) `f(5) `s `_REPEAT_5(f,s)
|
||||
`define _REPEAT_7(f,s) `f(6) `s `_REPEAT_6(f,s)
|
||||
`define _REPEAT_8(f,s) `f(7) `s `_REPEAT_7(f,s)
|
||||
`define _REPEAT_9(f,s) `f(8) `s `_REPEAT_8(f,s)
|
||||
`define _REPEAT_10(f,s) `f(9) `s `_REPEAT_9(f,s)
|
||||
`define _REPEAT_11(f,s) `f(10) `s `_REPEAT_10(f,s)
|
||||
`define _REPEAT_12(f,s) `f(11) `s `_REPEAT_11(f,s)
|
||||
`define _REPEAT_13(f,s) `f(12) `s `_REPEAT_12(f,s)
|
||||
`define _REPEAT_14(f,s) `f(13) `s `_REPEAT_13(f,s)
|
||||
`define _REPEAT_15(f,s) `f(14) `s `_REPEAT_14(f,s)
|
||||
`define _REPEAT_16(f,s) `f(15) `s `_REPEAT_15(f,s)
|
||||
`define _REPEAT_17(f,s) `f(16) `s `_REPEAT_16(f,s)
|
||||
`define _REPEAT_18(f,s) `f(17) `s `_REPEAT_17(f,s)
|
||||
`define _REPEAT_19(f,s) `f(18) `s `_REPEAT_18(f,s)
|
||||
`define _REPEAT_20(f,s) `f(19) `s `_REPEAT_19(f,s)
|
||||
`define _REPEAT_21(f,s) `f(20) `s `_REPEAT_20(f,s)
|
||||
`define _REPEAT_22(f,s) `f(21) `s `_REPEAT_21(f,s)
|
||||
`define _REPEAT_23(f,s) `f(22) `s `_REPEAT_22(f,s)
|
||||
`define _REPEAT_24(f,s) `f(23) `s `_REPEAT_23(f,s)
|
||||
`define _REPEAT_25(f,s) `f(24) `s `_REPEAT_24(f,s)
|
||||
`define _REPEAT_26(f,s) `f(25) `s `_REPEAT_25(f,s)
|
||||
`define _REPEAT_27(f,s) `f(26) `s `_REPEAT_26(f,s)
|
||||
`define _REPEAT_28(f,s) `f(27) `s `_REPEAT_27(f,s)
|
||||
`define _REPEAT_29(f,s) `f(28) `s `_REPEAT_28(f,s)
|
||||
`define _REPEAT_30(f,s) `f(29) `s `_REPEAT_29(f,s)
|
||||
`define _REPEAT_31(f,s) `f(30) `s `_REPEAT_30(f,s)
|
||||
`define _REPEAT_32(f,s) `f(31) `s `_REPEAT_31(f,s)
|
||||
|
||||
`define REPEAT_COMMA ,
|
||||
`define REPEAT_SEMICOLON ;
|
||||
|
||||
`endif // VX_PLATFORM_VH
|
||||
|
|
|
@ -1,89 +1,68 @@
|
|||
`ifndef VX_SCOPE
|
||||
`define VX_SCOPE
|
||||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_SCOPE_VH
|
||||
`define VX_SCOPE_VH
|
||||
|
||||
`ifdef SCOPE
|
||||
|
||||
`include "scope-defs.vh"
|
||||
`define SCOPE_IO_DECL \
|
||||
input wire scope_reset, \
|
||||
input wire scope_bus_in, \
|
||||
output wire scope_bus_out,
|
||||
|
||||
`define SCOPE_ASSIGN(d,s) assign scope_``d = s
|
||||
`define SCOPE_IO_SWITCH(__count) \
|
||||
wire scope_bus_in_w [__count]; \
|
||||
wire scope_bus_out_w [__count]; \
|
||||
`RESET_RELAY_EX(scope_reset_w, scope_reset, __count, 4); \
|
||||
VX_scope_switch #( \
|
||||
.N (__count) \
|
||||
) scope_switch ( \
|
||||
.clk (clk), \
|
||||
.reset (scope_reset), \
|
||||
.req_in (scope_bus_in), \
|
||||
.rsp_out (scope_bus_out), \
|
||||
.req_out (scope_bus_in_w), \
|
||||
.rsp_in (scope_bus_out_w) \
|
||||
);
|
||||
|
||||
`define SCOPE_SIZE 1024
|
||||
`define SCOPE_IO_BIND(__i) \
|
||||
.scope_reset (scope_reset_w[__i]), \
|
||||
.scope_bus_in (scope_bus_in_w[__i]), \
|
||||
.scope_bus_out (scope_bus_out_w[__i]),
|
||||
|
||||
`define SCOPE_IO_UNUSED() \
|
||||
`UNUSED_VAR (scope_reset); \
|
||||
`UNUSED_VAR (scope_bus_in); \
|
||||
assign scope_bus_out = 0;
|
||||
|
||||
`define SCOPE_IO_UNUSED_W(__i) \
|
||||
`UNUSED_VAR (scope_reset_w[__i]); \
|
||||
`UNUSED_VAR (scope_bus_in_w[__i]); \
|
||||
assign scope_bus_out_w[__i] = 0;
|
||||
|
||||
`else
|
||||
|
||||
`define SCOPE_IO_VX_icache_stage
|
||||
`define SCOPE_IO_DECL
|
||||
|
||||
`define SCOPE_IO_VX_fetch
|
||||
`define SCOPE_IO_SWITCH(__count)
|
||||
|
||||
`define SCOPE_BIND_VX_fetch_icache_stage
|
||||
`define SCOPE_IO_BIND(__i)
|
||||
|
||||
`define SCOPE_BIND_VX_fetch_warp_sched
|
||||
`define SCOPE_IO_UNUSED_W(__i)
|
||||
|
||||
`define SCOPE_IO_VX_warp_sched
|
||||
|
||||
`define SCOPE_IO_VX_pipeline
|
||||
|
||||
`define SCOPE_BIND_VX_pipeline_fetch
|
||||
|
||||
`define SCOPE_IO_VX_core
|
||||
|
||||
`define SCOPE_BIND_VX_core_pipeline
|
||||
|
||||
`define SCOPE_IO_VX_cluster
|
||||
|
||||
`define SCOPE_BIND_VX_cluster_core(__i__)
|
||||
|
||||
`define SCOPE_IO_Vortex
|
||||
|
||||
`define SCOPE_BIND_Vortex_cluster(__i__)
|
||||
|
||||
`define SCOPE_BIND_afu_vortex
|
||||
|
||||
`define SCOPE_IO_VX_lsu_unit
|
||||
|
||||
`define SCOPE_IO_VX_gpu_unit
|
||||
|
||||
`define SCOPE_IO_VX_execute
|
||||
|
||||
`define SCOPE_BIND_VX_execute_lsu_unit
|
||||
|
||||
`define SCOPE_BIND_VX_execute_gpu_unit
|
||||
|
||||
`define SCOPE_BIND_VX_pipeline_execute
|
||||
|
||||
`define SCOPE_IO_VX_issue
|
||||
|
||||
`define SCOPE_BIND_VX_pipeline_issue
|
||||
|
||||
`define SCOPE_IO_VX_bank
|
||||
|
||||
`define SCOPE_IO_VX_cache
|
||||
|
||||
`define SCOPE_BIND_VX_cache_bank(__i__)
|
||||
|
||||
`define SCOPE_BIND_Vortex_l3cache
|
||||
|
||||
`define SCOPE_BIND_VX_cluster_l2cache
|
||||
|
||||
`define SCOPE_IO_VX_mem_unit
|
||||
|
||||
`define SCOPE_BIND_VX_mem_unit_dcache
|
||||
|
||||
`define SCOPE_BIND_VX_core_mem_unit
|
||||
|
||||
`define SCOPE_BIND_VX_mem_unit_icache
|
||||
|
||||
`define SCOPE_BIND_VX_mem_unit_smem
|
||||
|
||||
`define SCOPE_DECL_SIGNALS
|
||||
|
||||
`define SCOPE_DATA_LIST
|
||||
|
||||
`define SCOPE_UPDATE_LIST
|
||||
|
||||
`define SCOPE_TRIGGER
|
||||
|
||||
`define SCOPE_ASSIGN(d,s)
|
||||
`define SCOPE_IO_UNUSED(__i)
|
||||
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`endif // VX_SCOPE_VH
|
||||
|
|
|
@ -1,85 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_scoreboard #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_ibuffer_if.slave ibuffer_if,
|
||||
VX_writeback_if.slave writeback_if
|
||||
);
|
||||
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
|
||||
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
|
||||
|
||||
wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop;
|
||||
|
||||
always @(*) begin
|
||||
inuse_regs_n = inuse_regs;
|
||||
if (reserve_reg) begin
|
||||
inuse_regs_n[ibuffer_if.wid][ibuffer_if.rd] = 1;
|
||||
end
|
||||
if (release_reg) begin
|
||||
inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
inuse_regs <= '0;
|
||||
end else begin
|
||||
inuse_regs <= inuse_regs_n;
|
||||
end
|
||||
end
|
||||
|
||||
reg deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3;
|
||||
|
||||
always @(posedge clk) begin
|
||||
deq_inuse_rd <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rd_n];
|
||||
deq_inuse_rs1 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs1_n];
|
||||
deq_inuse_rs2 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs2_n];
|
||||
deq_inuse_rs3 <= inuse_regs_n[ibuffer_if.wid_n][ibuffer_if.rs3_n];
|
||||
end
|
||||
|
||||
assign writeback_if.ready = 1'b1;
|
||||
|
||||
assign ibuffer_if.ready = ~(deq_inuse_rd
|
||||
| deq_inuse_rs1
|
||||
| deq_inuse_rs2
|
||||
| deq_inuse_rs3);
|
||||
|
||||
`UNUSED_VAR (writeback_if.PC)
|
||||
|
||||
reg [31:0] deadlock_ctr;
|
||||
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
deadlock_ctr <= 0;
|
||||
end else begin
|
||||
`ifdef DBG_TRACE_CORE_PIPELINE
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (release_reg) begin
|
||||
`ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0,
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd,writeback_if.uuid));
|
||||
end
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
deadlock_ctr <= deadlock_ctr + 1;
|
||||
`ASSERT(deadlock_ctr < deadlock_timeout,
|
||||
("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid));
|
||||
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
|
||||
deadlock_ctr <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
endmodule
|
|
@ -1,160 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_smem_arb #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter LANES = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter TAG_IN_WIDTH = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter BUFFERED_REQ = 0,
|
||||
parameter BUFFERED_RSP = 0,
|
||||
parameter TYPE = "P",
|
||||
|
||||
parameter ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
|
||||
parameter DATA_WIDTH = (8 * DATA_SIZE),
|
||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||
parameter TAG_OUT_WIDTH = TAG_IN_WIDTH - LOG_NUM_REQS
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// input request
|
||||
input wire [LANES-1:0] req_valid_in,
|
||||
input wire [LANES-1:0] req_rw_in,
|
||||
input wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_in,
|
||||
input wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_in,
|
||||
input wire [LANES-1:0][DATA_WIDTH-1:0] req_data_in,
|
||||
input wire [LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
|
||||
output wire [LANES-1:0] req_ready_in,
|
||||
|
||||
// output requests
|
||||
output wire [NUM_REQS-1:0][LANES-1:0] req_valid_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0] req_rw_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0] req_ready_out,
|
||||
|
||||
// input responses
|
||||
input wire [NUM_REQS-1:0] rsp_valid_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_in,
|
||||
input wire [NUM_REQS-1:0][TAG_OUT_WIDTH-1:0] rsp_tag_in,
|
||||
output wire [NUM_REQS-1:0] rsp_ready_in,
|
||||
|
||||
// output response
|
||||
output wire rsp_valid_out,
|
||||
output wire [LANES-1:0] rsp_tmask_out,
|
||||
output wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_out,
|
||||
output wire [TAG_IN_WIDTH-1:0] rsp_tag_out,
|
||||
input wire rsp_ready_out
|
||||
);
|
||||
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;
|
||||
|
||||
if (NUM_REQS > 1) begin
|
||||
|
||||
wire [LANES-1:0][REQ_DATAW-1:0] req_data_in_merged;
|
||||
wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_out_merged;
|
||||
|
||||
wire [LANES-1:0][LOG_NUM_REQS-1:0] req_sel;
|
||||
wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_in_w;
|
||||
|
||||
for (genvar i = 0; i < LANES; ++i) begin
|
||||
assign req_sel[i] = req_tag_in[i][TAG_SEL_IDX +: LOG_NUM_REQS];
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (TAG_IN_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_remove (
|
||||
.data_in (req_tag_in[i]),
|
||||
.data_out (req_tag_in_w[i])
|
||||
);
|
||||
|
||||
assign req_data_in_merged[i] = {req_tag_in_w[i], req_addr_in[i], req_rw_in[i], req_byteen_in[i], req_data_in[i]};
|
||||
end
|
||||
|
||||
VX_stream_demux #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.LANES (LANES),
|
||||
.DATAW (REQ_DATAW),
|
||||
.BUFFERED (BUFFERED_REQ)
|
||||
) req_demux (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (req_sel),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in_merged),
|
||||
.ready_in (req_ready_in),
|
||||
.valid_out (req_valid_out),
|
||||
.data_out (req_data_out_merged),
|
||||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
for (genvar j = 0; j < LANES; ++j) begin
|
||||
assign {req_tag_out[i][j], req_addr_out[i][j], req_rw_out[i][j], req_byteen_out[i][j], req_data_out[i][j]} = req_data_out_merged[i][j];
|
||||
end
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_in_merged;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (TAG_OUT_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_insert (
|
||||
.data_in (rsp_tag_in[i]),
|
||||
.sel_in (LOG_NUM_REQS'(i)),
|
||||
.data_out (rsp_tag_in_w)
|
||||
);
|
||||
|
||||
assign rsp_data_in_merged[i] = {rsp_tag_in_w, rsp_tmask_in[i], rsp_data_in[i]};
|
||||
end
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.LANES (1),
|
||||
.DATAW (RSP_DATAW),
|
||||
.BUFFERED (BUFFERED_RSP),
|
||||
.TYPE (TYPE)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_valid_in),
|
||||
.data_in (rsp_data_in_merged),
|
||||
.ready_in (rsp_ready_in),
|
||||
.valid_out (rsp_valid_out),
|
||||
.data_out ({rsp_tag_out, rsp_tmask_out, rsp_data_out}),
|
||||
.ready_out (rsp_ready_out)
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
assign req_valid_out = req_valid_in;
|
||||
assign req_tag_out = req_tag_in;
|
||||
assign req_addr_out = req_addr_in;
|
||||
assign req_rw_out = req_rw_in;
|
||||
assign req_byteen_out = req_byteen_in;
|
||||
assign req_data_out = req_data_in;
|
||||
assign req_ready_in = req_ready_out;
|
||||
|
||||
assign rsp_valid_out = rsp_valid_in;
|
||||
assign rsp_tmask_out = rsp_tmask_in;
|
||||
assign rsp_tag_out = rsp_tag_in;
|
||||
assign rsp_data_out = rsp_data_in;
|
||||
assign rsp_ready_in = rsp_ready_out;
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
187
hw/rtl/VX_socket.sv
Normal file
187
hw/rtl/VX_socket.sv
Normal file
|
@ -0,0 +1,187 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_socket import VX_gpu_pkg::*; #(
|
||||
parameter SOCKET_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if.slave mem_perf_if,
|
||||
`endif
|
||||
|
||||
VX_dcr_bus_if.slave dcr_bus_if,
|
||||
|
||||
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS],
|
||||
|
||||
VX_mem_bus_if.master icache_bus_if,
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if.master gbar_bus_if,
|
||||
`endif
|
||||
|
||||
// simulation helper signals
|
||||
output wire sim_ebreak,
|
||||
output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
|
||||
|
||||
`RESET_RELAY (gbar_arb_reset, reset);
|
||||
|
||||
VX_gbar_arb #(
|
||||
.NUM_REQS (`SOCKET_SIZE),
|
||||
.OUT_REG ((`SOCKET_SIZE > 1) ? 2 : 0)
|
||||
) gbar_arb (
|
||||
.clk (clk),
|
||||
.reset (gbar_arb_reset),
|
||||
.bus_in_if (per_core_gbar_bus_if),
|
||||
.bus_out_if (gbar_bus_if)
|
||||
);
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
||||
|
||||
`RESET_RELAY (dcache_arb_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_ARB_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[1]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||
) per_core_dcache_bus_tmp_if[`SOCKET_SIZE]();
|
||||
|
||||
for (genvar j = 0; j < `SOCKET_SIZE; ++j) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (per_core_dcache_bus_tmp_if[j], per_core_dcache_bus_if[j * DCACHE_NUM_REQS + i]);
|
||||
end
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (`SOCKET_SIZE),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (`CACHE_ADDR_TYPE_BITS),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0),
|
||||
.OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0)
|
||||
) dcache_arb (
|
||||
.clk (clk),
|
||||
.reset (dcache_arb_reset),
|
||||
.bus_in_if (per_core_dcache_bus_tmp_if),
|
||||
.bus_out_if (dcache_bus_tmp_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[0]);
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (ICACHE_TAG_WIDTH)
|
||||
) per_core_icache_bus_if[`SOCKET_SIZE]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (ICACHE_ARB_TAG_WIDTH)
|
||||
) icache_bus_tmp_if[1]();
|
||||
|
||||
`RESET_RELAY (icache_arb_reset, reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (`SOCKET_SIZE),
|
||||
.NUM_OUTPUTS (1),
|
||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (ICACHE_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (0),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ ((`SOCKET_SIZE > 1) ? 2 : 0),
|
||||
.OUT_REG_RSP ((`SOCKET_SIZE > 1) ? 2 : 0)
|
||||
) icache_arb (
|
||||
.clk (clk),
|
||||
.reset (icache_arb_reset),
|
||||
.bus_in_if (per_core_icache_bus_if),
|
||||
.bus_out_if (icache_bus_tmp_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (icache_bus_if, icache_bus_tmp_if[0]);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`SOCKET_SIZE-1:0] per_core_sim_ebreak;
|
||||
wire [`SOCKET_SIZE-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_core_sim_wb_value;
|
||||
assign sim_ebreak = per_core_sim_ebreak[0];
|
||||
assign sim_wb_value = per_core_sim_wb_value[0];
|
||||
`UNUSED_VAR (per_core_sim_ebreak)
|
||||
`UNUSED_VAR (per_core_sim_wb_value)
|
||||
|
||||
wire [`SOCKET_SIZE-1:0] per_core_busy;
|
||||
|
||||
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
|
||||
|
||||
`SCOPE_IO_SWITCH (`SOCKET_SIZE)
|
||||
|
||||
// Generate all cores
|
||||
for (genvar i = 0; i < `SOCKET_SIZE; ++i) begin
|
||||
|
||||
`RESET_RELAY (core_reset, reset);
|
||||
|
||||
VX_core #(
|
||||
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + i)
|
||||
) core (
|
||||
`SCOPE_IO_BIND (i)
|
||||
|
||||
.clk (clk),
|
||||
.reset (core_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.mem_perf_if (mem_perf_if),
|
||||
`endif
|
||||
|
||||
.dcr_bus_if (core_dcr_bus_if),
|
||||
|
||||
.dcache_bus_if (per_core_dcache_bus_if[i * DCACHE_NUM_REQS +: DCACHE_NUM_REQS]),
|
||||
|
||||
.icache_bus_if (per_core_icache_bus_if[i]),
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
.gbar_bus_if (per_core_gbar_bus_if[i]),
|
||||
`endif
|
||||
|
||||
.sim_ebreak (per_core_sim_ebreak[i]),
|
||||
.sim_wb_value (per_core_sim_wb_value[i]),
|
||||
.busy (per_core_busy[i])
|
||||
);
|
||||
end
|
||||
|
||||
`BUFFER_BUSY (busy, (| per_core_busy), (`SOCKET_SIZE > 1));
|
||||
|
||||
endmodule
|
|
@ -1,148 +0,0 @@
|
|||
`ifndef VX_TRACE_INSTR
|
||||
`define VX_TRACE_INSTR
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
task trace_ex_type (
|
||||
input [`EX_BITS-1:0] ex_type
|
||||
);
|
||||
case (ex_type)
|
||||
`EX_ALU: dpi_trace("ALU");
|
||||
`EX_LSU: dpi_trace("LSU");
|
||||
`EX_CSR: dpi_trace("CSR");
|
||||
`EX_FPU: dpi_trace("FPU");
|
||||
`EX_GPU: dpi_trace("GPU");
|
||||
default: dpi_trace("NOP");
|
||||
endcase
|
||||
endtask
|
||||
|
||||
task trace_ex_op (
|
||||
input [`EX_BITS-1:0] ex_type,
|
||||
input [`INST_OP_BITS-1:0] op_type,
|
||||
input [`INST_MOD_BITS-1:0] op_mod
|
||||
);
|
||||
case (ex_type)
|
||||
`EX_ALU: begin
|
||||
if (`INST_ALU_IS_BR(op_mod)) begin
|
||||
case (`INST_BR_BITS'(op_type))
|
||||
`INST_BR_EQ: dpi_trace("BEQ");
|
||||
`INST_BR_NE: dpi_trace("BNE");
|
||||
`INST_BR_LT: dpi_trace("BLT");
|
||||
`INST_BR_GE: dpi_trace("BGE");
|
||||
`INST_BR_LTU: dpi_trace("BLTU");
|
||||
`INST_BR_GEU: dpi_trace("BGEU");
|
||||
`INST_BR_JAL: dpi_trace("JAL");
|
||||
`INST_BR_JALR: dpi_trace("JALR");
|
||||
`INST_BR_ECALL: dpi_trace("ECALL");
|
||||
`INST_BR_EBREAK:dpi_trace("EBREAK");
|
||||
`INST_BR_URET: dpi_trace("URET");
|
||||
`INST_BR_SRET: dpi_trace("SRET");
|
||||
`INST_BR_MRET: dpi_trace("MRET");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end else if (`INST_ALU_IS_MUL(op_mod)) begin
|
||||
case (`INST_MUL_BITS'(op_type))
|
||||
`INST_MUL_MUL: dpi_trace("MUL");
|
||||
`INST_MUL_MULH: dpi_trace("MULH");
|
||||
`INST_MUL_MULHSU:dpi_trace("MULHSU");
|
||||
`INST_MUL_MULHU: dpi_trace("MULHU");
|
||||
`INST_MUL_DIV: dpi_trace("DIV");
|
||||
`INST_MUL_DIVU: dpi_trace("DIVU");
|
||||
`INST_MUL_REM: dpi_trace("REM");
|
||||
`INST_MUL_REMU: dpi_trace("REMU");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end else begin
|
||||
case (`INST_ALU_BITS'(op_type))
|
||||
`INST_ALU_ADD: dpi_trace("ADD");
|
||||
`INST_ALU_SUB: dpi_trace("SUB");
|
||||
`INST_ALU_SLL: dpi_trace("SLL");
|
||||
`INST_ALU_SRL: dpi_trace("SRL");
|
||||
`INST_ALU_SRA: dpi_trace("SRA");
|
||||
`INST_ALU_SLT: dpi_trace("SLT");
|
||||
`INST_ALU_SLTU: dpi_trace("SLTU");
|
||||
`INST_ALU_XOR: dpi_trace("XOR");
|
||||
`INST_ALU_OR: dpi_trace("OR");
|
||||
`INST_ALU_AND: dpi_trace("AND");
|
||||
`INST_ALU_LUI: dpi_trace("LUI");
|
||||
`INST_ALU_AUIPC: dpi_trace("AUIPC");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end
|
||||
end
|
||||
`EX_LSU: begin
|
||||
if (op_mod == 0) begin
|
||||
case (`INST_LSU_BITS'(op_type))
|
||||
`INST_LSU_LB: dpi_trace("LB");
|
||||
`INST_LSU_LH: dpi_trace("LH");
|
||||
`INST_LSU_LW: dpi_trace("LW");
|
||||
`INST_LSU_LBU:dpi_trace("LBU");
|
||||
`INST_LSU_LHU:dpi_trace("LHU");
|
||||
`INST_LSU_SB: dpi_trace("SB");
|
||||
`INST_LSU_SH: dpi_trace("SH");
|
||||
`INST_LSU_SW: dpi_trace("SW");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end else if (op_mod == 1) begin
|
||||
case (`INST_FENCE_BITS'(op_type))
|
||||
`INST_FENCE_D: dpi_trace("DFENCE");
|
||||
`INST_FENCE_I: dpi_trace("IFENCE");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end
|
||||
end
|
||||
`EX_CSR: begin
|
||||
case (`INST_CSR_BITS'(op_type))
|
||||
`INST_CSR_RW: dpi_trace("CSRW");
|
||||
`INST_CSR_RS: dpi_trace("CSRS");
|
||||
`INST_CSR_RC: dpi_trace("CSRC");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end
|
||||
`EX_FPU: begin
|
||||
case (`INST_FPU_BITS'(op_type))
|
||||
`INST_FPU_ADD: dpi_trace("ADD");
|
||||
`INST_FPU_SUB: dpi_trace("SUB");
|
||||
`INST_FPU_MUL: dpi_trace("MUL");
|
||||
`INST_FPU_DIV: dpi_trace("DIV");
|
||||
`INST_FPU_SQRT: dpi_trace("SQRT");
|
||||
`INST_FPU_MADD: dpi_trace("MADD");
|
||||
`INST_FPU_NMSUB: dpi_trace("NMSUB");
|
||||
`INST_FPU_NMADD: dpi_trace("NMADD");
|
||||
`INST_FPU_CVTWS: dpi_trace("CVTWS");
|
||||
`INST_FPU_CVTWUS:dpi_trace("CVTWUS");
|
||||
`INST_FPU_CVTSW: dpi_trace("CVTSW");
|
||||
`INST_FPU_CVTSWU:dpi_trace("CVTSWU");
|
||||
`INST_FPU_CLASS: dpi_trace("CLASS");
|
||||
`INST_FPU_CMP: dpi_trace("CMP");
|
||||
`INST_FPU_MISC: begin
|
||||
case (op_mod)
|
||||
0: dpi_trace("SGNJ");
|
||||
1: dpi_trace("SGNJN");
|
||||
2: dpi_trace("SGNJX");
|
||||
3: dpi_trace("MIN");
|
||||
4: dpi_trace("MAX");
|
||||
5: dpi_trace("MVXW");
|
||||
6: dpi_trace("MVWX");
|
||||
endcase
|
||||
end
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end
|
||||
`EX_GPU: begin
|
||||
case (`INST_GPU_BITS'(op_type))
|
||||
`INST_GPU_TMC: dpi_trace("TMC");
|
||||
`INST_GPU_WSPAWN:dpi_trace("WSPAWN");
|
||||
`INST_GPU_SPLIT: dpi_trace("SPLIT");
|
||||
`INST_GPU_JOIN: dpi_trace("JOIN");
|
||||
`INST_GPU_BAR: dpi_trace("BAR");
|
||||
`INST_GPU_PRED: dpi_trace("PRED");
|
||||
`INST_GPU_TEX: dpi_trace("TEX");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
endtask
|
||||
|
||||
`endif
|
177
hw/rtl/VX_types.vh
Normal file
177
hw/rtl/VX_types.vh
Normal file
|
@ -0,0 +1,177 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VX_TYPES_VH
|
||||
`define VX_TYPES_VH
|
||||
|
||||
// Device configuration registers
|
||||
|
||||
`define VX_CSR_ADDR_BITS 12
|
||||
`define VX_DCR_ADDR_BITS 12
|
||||
|
||||
`define VX_DCR_BASE_STATE_BEGIN 12'h001
|
||||
`define VX_DCR_BASE_STARTUP_ADDR0 12'h001
|
||||
`define VX_DCR_BASE_STARTUP_ADDR1 12'h002
|
||||
`define VX_DCR_BASE_MPM_CLASS 12'h003
|
||||
`define VX_DCR_BASE_STATE_END 12'h004
|
||||
|
||||
`define VX_DCR_BASE_STATE(addr) ((addr) - `VX_DCR_BASE_STATE_BEGIN)
|
||||
`define VX_DCR_BASE_STATE_COUNT (`VX_DCR_BASE_STATE_END-`VX_DCR_BASE_STATE_BEGIN)
|
||||
|
||||
// Machine Performance-monitoring counters classes
|
||||
|
||||
`define VX_DCR_MPM_CLASS_NONE 0
|
||||
`define VX_DCR_MPM_CLASS_CORE 1
|
||||
`define VX_DCR_MPM_CLASS_MEM 2
|
||||
`define VX_DCR_MPM_CLASS_TEX 3
|
||||
`define VX_DCR_MPM_CLASS_RASTER 4
|
||||
`define VX_DCR_MPM_CLASS_ROP 5
|
||||
|
||||
// User Floating-Point CSRs
|
||||
|
||||
`define VX_CSR_FFLAGS 12'h001
|
||||
`define VX_CSR_FRM 12'h002
|
||||
`define VX_CSR_FCSR 12'h003
|
||||
|
||||
`define VX_CSR_SATP 12'h180
|
||||
|
||||
`define VX_CSR_PMPCFG0 12'h3A0
|
||||
`define VX_CSR_PMPADDR0 12'h3B0
|
||||
|
||||
`define VX_CSR_MSTATUS 12'h300
|
||||
`define VX_CSR_MISA 12'h301
|
||||
`define VX_CSR_MEDELEG 12'h302
|
||||
`define VX_CSR_MIDELEG 12'h303
|
||||
`define VX_CSR_MIE 12'h304
|
||||
`define VX_CSR_MTVEC 12'h305
|
||||
|
||||
`define VX_CSR_MEPC 12'h341
|
||||
|
||||
`define VX_CSR_MNSTATUS 12'h744
|
||||
|
||||
`define VX_CSR_MPM_BASE 12'hB00
|
||||
`define VX_CSR_MPM_BASE_H 12'hB80
|
||||
|
||||
// Machine Performance-monitoring core counters
|
||||
// PERF: Standard
|
||||
`define VX_CSR_MCYCLE 12'hB00
|
||||
`define VX_CSR_MCYCLE_H 12'hB80
|
||||
`define VX_CSR_MPM_RESERVED 12'hB01
|
||||
`define VX_CSR_MPM_RESERVED_H 12'hB81
|
||||
`define VX_CSR_MINSTRET 12'hB02
|
||||
`define VX_CSR_MINSTRET_H 12'hB82
|
||||
// PERF: pipeline
|
||||
`define VX_CSR_MPM_IBUF_ST 12'hB03
|
||||
`define VX_CSR_MPM_IBUF_ST_H 12'hB83
|
||||
`define VX_CSR_MPM_SCRB_ST 12'hB04
|
||||
`define VX_CSR_MPM_SCRB_ST_H 12'hB84
|
||||
`define VX_CSR_MPM_ALU_ST 12'hB05
|
||||
`define VX_CSR_MPM_ALU_ST_H 12'hB85
|
||||
`define VX_CSR_MPM_LSU_ST 12'hB06
|
||||
`define VX_CSR_MPM_LSU_ST_H 12'hB86
|
||||
`define VX_CSR_MPM_FPU_ST 12'hB07
|
||||
`define VX_CSR_MPM_FPU_ST_H 12'hB87
|
||||
`define VX_CSR_MPM_SFU_ST 12'hB08
|
||||
`define VX_CSR_MPM_SFU_ST_H 12'hB88
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_IFETCHES 12'hB0A
|
||||
`define VX_CSR_MPM_IFETCHES_H 12'hB8A
|
||||
`define VX_CSR_MPM_LOADS 12'hB0B
|
||||
`define VX_CSR_MPM_LOADS_H 12'hB8B
|
||||
`define VX_CSR_MPM_STORES 12'hB0C
|
||||
`define VX_CSR_MPM_STORES_H 12'hB8C
|
||||
`define VX_CSR_MPM_IFETCH_LAT 12'hB0D
|
||||
`define VX_CSR_MPM_IFETCH_LAT_H 12'hB8D
|
||||
`define VX_CSR_MPM_LOAD_LAT 12'hB0E
|
||||
`define VX_CSR_MPM_LOAD_LAT_H 12'hB8E
|
||||
|
||||
// Machine Performance-monitoring memory counters
|
||||
// PERF: icache
|
||||
`define VX_CSR_MPM_ICACHE_READS 12'hB03 // total reads
|
||||
`define VX_CSR_MPM_ICACHE_READS_H 12'hB83
|
||||
`define VX_CSR_MPM_ICACHE_MISS_R 12'hB04 // read misses
|
||||
`define VX_CSR_MPM_ICACHE_MISS_R_H 12'hB84
|
||||
// PERF: dcache
|
||||
`define VX_CSR_MPM_DCACHE_READS 12'hB05 // total reads
|
||||
`define VX_CSR_MPM_DCACHE_READS_H 12'hB85
|
||||
`define VX_CSR_MPM_DCACHE_WRITES 12'hB06 // total writes
|
||||
`define VX_CSR_MPM_DCACHE_WRITES_H 12'hB86
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R 12'hB07 // read misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_R_H 12'hB87
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W 12'hB08 // write misses
|
||||
`define VX_CSR_MPM_DCACHE_MISS_W_H 12'hB88
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST 12'hB09 // bank conflicts
|
||||
`define VX_CSR_MPM_DCACHE_BANK_ST_H 12'hB89
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST 12'hB0A // MSHR stalls
|
||||
`define VX_CSR_MPM_DCACHE_MSHR_ST_H 12'hB8A
|
||||
// PERF: smem
|
||||
`define VX_CSR_MPM_SMEM_READS 12'hB0B // memory reads
|
||||
`define VX_CSR_MPM_SMEM_READS_H 12'hB8B
|
||||
`define VX_CSR_MPM_SMEM_WRITES 12'hB0C // memory writes
|
||||
`define VX_CSR_MPM_SMEM_WRITES_H 12'hB8C
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST 12'hB0D // bank conflicts
|
||||
`define VX_CSR_MPM_SMEM_BANK_ST_H 12'hB8D
|
||||
// PERF: l2cache
|
||||
`define VX_CSR_MPM_L2CACHE_READS 12'hB0E // total reads
|
||||
`define VX_CSR_MPM_L2CACHE_READS_H 12'hB8E
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES 12'hB0F // total writes
|
||||
`define VX_CSR_MPM_L2CACHE_WRITES_H 12'hB8F
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R 12'hB10 // read misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_R_H 12'hB90
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W 12'hB11 // write misses
|
||||
`define VX_CSR_MPM_L2CACHE_MISS_W_H 12'hB91
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST 12'hB12 // bank conflicts
|
||||
`define VX_CSR_MPM_L2CACHE_BANK_ST_H 12'hB92
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST 12'hB13 // MSHR stalls
|
||||
`define VX_CSR_MPM_L2CACHE_MSHR_ST_H 12'hB93
|
||||
// PERF: l3cache
|
||||
`define VX_CSR_MPM_L3CACHE_READS 12'hB14 // total reads
|
||||
`define VX_CSR_MPM_L3CACHE_READS_H 12'hB94
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES 12'hB15 // total writes
|
||||
`define VX_CSR_MPM_L3CACHE_WRITES_H 12'hB95
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R 12'hB16 // read misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_R_H 12'hB96
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W 12'hB17 // write misses
|
||||
`define VX_CSR_MPM_L3CACHE_MISS_W_H 12'hB97
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST 12'hB18 // bank conflicts
|
||||
`define VX_CSR_MPM_L3CACHE_BANK_ST_H 12'hB98
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST 12'hB19 // MSHR stalls
|
||||
`define VX_CSR_MPM_L3CACHE_MSHR_ST_H 12'hB99
|
||||
// PERF: memory
|
||||
`define VX_CSR_MPM_MEM_READS 12'hB1A // total reads
|
||||
`define VX_CSR_MPM_MEM_READS_H 12'hB9A
|
||||
`define VX_CSR_MPM_MEM_WRITES 12'hB1B // total writes
|
||||
`define VX_CSR_MPM_MEM_WRITES_H 12'hB9B
|
||||
`define VX_CSR_MPM_MEM_LAT 12'hB1C // memory latency
|
||||
`define VX_CSR_MPM_MEM_LAT_H 12'hB9C
|
||||
|
||||
// Machine Information Registers
|
||||
|
||||
`define VX_CSR_MVENDORID 12'hF11
|
||||
`define VX_CSR_MARCHID 12'hF12
|
||||
`define VX_CSR_MIMPID 12'hF13
|
||||
`define VX_CSR_MHARTID 12'hF14
|
||||
|
||||
// GPGU CSRs
|
||||
|
||||
`define VX_CSR_THREAD_ID 12'hCC0
|
||||
`define VX_CSR_WARP_ID 12'hCC1
|
||||
`define VX_CSR_CORE_ID 12'hCC2
|
||||
`define VX_CSR_WARP_MASK 12'hCC3
|
||||
`define VX_CSR_THREAD_MASK 12'hCC4 // warning! this value is also used in LLVM
|
||||
|
||||
`define VX_CSR_NUM_THREADS 12'hFC0
|
||||
`define VX_CSR_NUM_WARPS 12'hFC1
|
||||
`define VX_CSR_NUM_CORES 12'hFC2
|
||||
|
||||
`endif // VX_TYPES_VH
|
|
@ -1,254 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_warp_sched #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
`SCOPE_IO_VX_warp_sched
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
VX_warp_ctl_if.slave warp_ctl_if,
|
||||
VX_wstall_if.slave wstall_if,
|
||||
VX_join_if.slave join_if,
|
||||
VX_branch_ctl_if.slave branch_ctl_if,
|
||||
|
||||
VX_ifetch_req_if.master ifetch_req_if,
|
||||
|
||||
VX_fetch_to_csr_if.master fetch_to_csr_if,
|
||||
|
||||
output wire busy
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
wire join_else;
|
||||
wire [31:0] join_pc;
|
||||
wire [`NUM_THREADS-1:0] join_tmask;
|
||||
|
||||
reg [`NUM_WARPS-1:0] active_warps, active_warps_n; // real active warps (updated when a warp is activated or disabled)
|
||||
reg [`NUM_WARPS-1:0] stalled_warps; // asserted when a branch/gpgpu instructions are issued
|
||||
|
||||
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks;
|
||||
reg [`NUM_WARPS-1:0][31:0] warp_pcs;
|
||||
|
||||
// barriers
|
||||
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks; // warps waiting on barrier
|
||||
wire reached_barrier_limit; // the expected number of warps reached the barrier
|
||||
|
||||
// wspawn
|
||||
reg [31:0] wspawn_pc;
|
||||
reg [`NUM_WARPS-1:0] use_wspawn;
|
||||
|
||||
wire [`NW_BITS-1:0] schedule_wid;
|
||||
wire [`NUM_THREADS-1:0] schedule_tmask;
|
||||
wire [31:0] schedule_pc;
|
||||
wire schedule_valid;
|
||||
wire warp_scheduled;
|
||||
|
||||
reg [`UUID_BITS-1:0] issued_instrs;
|
||||
|
||||
wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready;
|
||||
|
||||
wire tmc_active = (warp_ctl_if.tmc.tmask != 0);
|
||||
|
||||
always @(*) begin
|
||||
active_warps_n = active_warps;
|
||||
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
|
||||
active_warps_n = warp_ctl_if.wspawn.wmask;
|
||||
end
|
||||
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
|
||||
active_warps_n[warp_ctl_if.wid] = tmc_active;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
barrier_masks <= '0;
|
||||
use_wspawn <= '0;
|
||||
stalled_warps <= '0;
|
||||
warp_pcs <= '0;
|
||||
active_warps <= '0;
|
||||
thread_masks <= '0;
|
||||
issued_instrs <= '0;
|
||||
|
||||
// activate first warp
|
||||
warp_pcs[0] <= `STARTUP_ADDR;
|
||||
active_warps[0] <= 1;
|
||||
thread_masks[0] <= 1;
|
||||
end else begin
|
||||
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
|
||||
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
|
||||
wspawn_pc <= warp_ctl_if.wspawn.pc;
|
||||
end
|
||||
|
||||
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
|
||||
stalled_warps[warp_ctl_if.wid] <= 0;
|
||||
if (reached_barrier_limit) begin
|
||||
barrier_masks[warp_ctl_if.barrier.id] <= 0;
|
||||
end else begin
|
||||
barrier_masks[warp_ctl_if.barrier.id][warp_ctl_if.wid] <= 1;
|
||||
end
|
||||
end
|
||||
|
||||
if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
|
||||
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.tmask;
|
||||
stalled_warps[warp_ctl_if.wid] <= 0;
|
||||
end
|
||||
|
||||
if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
|
||||
stalled_warps[warp_ctl_if.wid] <= 0;
|
||||
if (warp_ctl_if.split.diverged) begin
|
||||
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_tmask;
|
||||
end
|
||||
end
|
||||
|
||||
// Branch
|
||||
if (branch_ctl_if.valid) begin
|
||||
if (branch_ctl_if.taken) begin
|
||||
warp_pcs[branch_ctl_if.wid] <= branch_ctl_if.dest;
|
||||
end
|
||||
stalled_warps[branch_ctl_if.wid] <= 0;
|
||||
end
|
||||
|
||||
if (warp_scheduled) begin
|
||||
// stall the warp until decode stage
|
||||
stalled_warps[schedule_wid] <= 1;
|
||||
|
||||
// release wspawn
|
||||
use_wspawn[schedule_wid] <= 0;
|
||||
if (use_wspawn[schedule_wid]) begin
|
||||
thread_masks[schedule_wid] <= 1;
|
||||
end
|
||||
|
||||
issued_instrs <= issued_instrs + 1;
|
||||
end
|
||||
|
||||
if (ifetch_req_fire) begin
|
||||
warp_pcs[ifetch_req_if.wid] <= ifetch_req_if.PC + 4;
|
||||
end
|
||||
|
||||
if (wstall_if.valid) begin
|
||||
stalled_warps[wstall_if.wid] <= wstall_if.stalled;
|
||||
end
|
||||
|
||||
// join handling
|
||||
if (join_if.valid) begin
|
||||
if (join_else) begin
|
||||
warp_pcs[join_if.wid] <= join_pc;
|
||||
end
|
||||
thread_masks[join_if.wid] <= join_tmask;
|
||||
end
|
||||
|
||||
active_warps <= active_warps_n;
|
||||
end
|
||||
end
|
||||
|
||||
// export thread mask register
|
||||
assign fetch_to_csr_if.thread_masks = thread_masks;
|
||||
|
||||
// calculate active barrier status
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
wire [`NW_BITS:0] active_barrier_count;
|
||||
`IGNORE_UNUSED_END
|
||||
wire [`NUM_WARPS-1:0] barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
|
||||
`POP_COUNT(active_barrier_count, barrier_mask);
|
||||
|
||||
assign reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1);
|
||||
|
||||
reg [`NUM_WARPS-1:0] barrier_stalls;
|
||||
always @(*) begin
|
||||
barrier_stalls = barrier_masks[0];
|
||||
for (integer i = 1; i < `NUM_BARRIERS; ++i) begin
|
||||
barrier_stalls |= barrier_masks[i];
|
||||
end
|
||||
end
|
||||
|
||||
// split/join stack management
|
||||
|
||||
wire [(32+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
|
||||
wire ipdom_index [`NUM_WARPS-1:0];
|
||||
|
||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||
wire push = warp_ctl_if.valid
|
||||
&& warp_ctl_if.split.valid
|
||||
&& (i == warp_ctl_if.wid);
|
||||
|
||||
wire pop = join_if.valid && (i == join_if.wid);
|
||||
|
||||
wire [`NUM_THREADS-1:0] else_tmask = warp_ctl_if.split.else_tmask;
|
||||
wire [`NUM_THREADS-1:0] orig_tmask = thread_masks[warp_ctl_if.wid];
|
||||
|
||||
wire [(32+`NUM_THREADS)-1:0] q_else = {warp_ctl_if.split.pc, else_tmask};
|
||||
wire [(32+`NUM_THREADS)-1:0] q_end = {32'b0, orig_tmask};
|
||||
|
||||
VX_ipdom_stack #(
|
||||
.WIDTH (32+`NUM_THREADS),
|
||||
.DEPTH (2 ** (`NT_BITS+1))
|
||||
) ipdom_stack (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (push),
|
||||
.pop (pop),
|
||||
.pair (warp_ctl_if.split.diverged),
|
||||
.q1 (q_end),
|
||||
.q2 (q_else),
|
||||
.d (ipdom_data[i]),
|
||||
.index (ipdom_index[i]),
|
||||
`UNUSED_PIN (empty),
|
||||
`UNUSED_PIN (full)
|
||||
);
|
||||
end
|
||||
|
||||
assign {join_pc, join_tmask} = ipdom_data[join_if.wid];
|
||||
assign join_else = ~ipdom_index[join_if.wid];
|
||||
|
||||
// schedule the next ready warp
|
||||
|
||||
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
|
||||
|
||||
VX_lzc #(
|
||||
.N (`NUM_WARPS)
|
||||
) wid_select (
|
||||
.in_i (ready_warps),
|
||||
.cnt_o (schedule_wid),
|
||||
.valid_o (schedule_valid)
|
||||
);
|
||||
|
||||
wire [`NUM_WARPS-1:0][(`NUM_THREADS + 32)-1:0] schedule_data;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
|
||||
assign schedule_data[i] = {(use_wspawn[i] ? `NUM_THREADS'(1) : thread_masks[i]),
|
||||
(use_wspawn[i] ? wspawn_pc : warp_pcs[i])};
|
||||
end
|
||||
|
||||
assign {schedule_tmask, schedule_pc} = schedule_data[schedule_wid];
|
||||
|
||||
wire stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid;
|
||||
|
||||
assign warp_scheduled = schedule_valid && ~stall_out;
|
||||
|
||||
wire [`UUID_BITS-1:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + `UUID_BITS'(CORE_ID);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `UUID_BITS + `NUM_THREADS + 32 + `NW_BITS),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}),
|
||||
.data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
|
||||
);
|
||||
|
||||
assign busy = (active_warps != 0);
|
||||
|
||||
`SCOPE_ASSIGN (wsched_scheduled, warp_scheduled);
|
||||
`SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid);
|
||||
`SCOPE_ASSIGN (wsched_active_warps, active_warps);
|
||||
`SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps);
|
||||
`SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid);
|
||||
`SCOPE_ASSIGN (wsched_schedule_tmask, schedule_tmask);
|
||||
`SCOPE_ASSIGN (wsched_schedule_pc, schedule_pc);
|
||||
|
||||
endmodule
|
|
@ -1,113 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_writeback #(
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// inputs
|
||||
VX_commit_if.slave alu_commit_if,
|
||||
VX_commit_if.slave ld_commit_if,
|
||||
VX_commit_if.slave csr_commit_if,
|
||||
`ifdef EXT_F_ENABLE
|
||||
VX_commit_if.slave fpu_commit_if,
|
||||
`endif
|
||||
VX_commit_if.slave gpu_commit_if,
|
||||
|
||||
// outputs
|
||||
VX_writeback_if.master writeback_if
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
localparam NUM_RSPS = 5;
|
||||
`else
|
||||
localparam NUM_RSPS = 4;
|
||||
`endif
|
||||
|
||||
wire wb_valid;
|
||||
wire [`NW_BITS-1:0] wb_wid;
|
||||
wire [31:0] wb_PC;
|
||||
wire [`NUM_THREADS-1:0] wb_tmask;
|
||||
wire [`NR_BITS-1:0] wb_rd;
|
||||
wire [`NUM_THREADS-1:0][31:0] wb_data;
|
||||
wire wb_eop;
|
||||
|
||||
wire [NUM_RSPS-1:0] rsp_valid;
|
||||
wire [NUM_RSPS-1:0][DATAW-1:0] rsp_data;
|
||||
wire [NUM_RSPS-1:0] rsp_ready;
|
||||
wire stall;
|
||||
|
||||
assign rsp_valid = {
|
||||
gpu_commit_if.valid && gpu_commit_if.wb,
|
||||
csr_commit_if.valid && csr_commit_if.wb,
|
||||
alu_commit_if.valid && alu_commit_if.wb,
|
||||
`ifdef EXT_F_ENABLE
|
||||
fpu_commit_if.valid && fpu_commit_if.wb,
|
||||
`endif
|
||||
ld_commit_if.valid && ld_commit_if.wb
|
||||
};
|
||||
|
||||
assign rsp_data = {
|
||||
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
|
||||
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
|
||||
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
|
||||
`ifdef EXT_F_ENABLE
|
||||
{fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop},
|
||||
`endif
|
||||
{ ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop}
|
||||
};
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_RSPS),
|
||||
.DATAW (DATAW),
|
||||
.BUFFERED (1),
|
||||
.TYPE ("R")
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_valid),
|
||||
.data_in (rsp_data),
|
||||
.ready_in (rsp_ready),
|
||||
.valid_out (wb_valid),
|
||||
.data_out ({wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
|
||||
.ready_out (~stall)
|
||||
);
|
||||
|
||||
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb;
|
||||
assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb;
|
||||
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
|
||||
`else
|
||||
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
|
||||
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
|
||||
`endif
|
||||
|
||||
assign stall = ~writeback_if.ready && writeback_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + DATAW),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({wb_valid, wb_wid, wb_PC, wb_tmask, wb_rd, wb_data, wb_eop}),
|
||||
.data_out ({writeback_if.valid, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.data, writeback_if.eop})
|
||||
);
|
||||
|
||||
// special workaround to get RISC-V tests Pass/Fail status
|
||||
reg [31:0] last_wb_value [`NUM_REGS-1:0] /* verilator public */;
|
||||
always @(posedge clk) begin
|
||||
if (writeback_if.valid && writeback_if.ready) begin
|
||||
last_wb_value[writeback_if.rd] <= writeback_if.data[0];
|
||||
end
|
||||
end
|
||||
|
||||
endmodule
|
369
hw/rtl/Vortex.sv
369
hw/rtl/Vortex.sv
|
@ -1,7 +1,20 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module Vortex (
|
||||
`SCOPE_IO_Vortex
|
||||
module Vortex import VX_gpu_pkg::*; (
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
|
@ -22,204 +35,224 @@ module Vortex (
|
|||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// DCR write request
|
||||
input wire dcr_wr_valid,
|
||||
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
`STATIC_ASSERT((`L3_ENABLE == 0 || `NUM_CLUSTERS > 1), ("invalid parameter"))
|
||||
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_valid;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_rw;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2_MEM_BYTEEN_WIDTH-1:0] per_cluster_mem_req_byteen;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2_MEM_ADDR_WIDTH-1:0] per_cluster_mem_req_addr;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_req_data;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_req_tag;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_req_ready;
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_if[`NUM_CLUSTERS]();
|
||||
VX_mem_perf_if perf_memsys_total_if();
|
||||
VX_cache_perf_if perf_l3cache_if();
|
||||
`endif
|
||||
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_valid;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2_MEM_DATA_WIDTH-1:0] per_cluster_mem_rsp_data;
|
||||
wire [`NUM_CLUSTERS-1:0][`L2_MEM_TAG_WIDTH-1:0] per_cluster_mem_rsp_tag;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_mem_rsp_ready;
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L3_LINE_SIZE),
|
||||
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
|
||||
) mem_bus_if();
|
||||
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
|
||||
assign mem_req_valid = mem_bus_if.req_valid;
|
||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||
assign mem_req_byteen= mem_bus_if.req_data.byteen;
|
||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
||||
assign mem_req_data = mem_bus_if.req_data.data;
|
||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
||||
assign mem_bus_if.req_ready = mem_req_ready;
|
||||
|
||||
for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin
|
||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
||||
|
||||
`RESET_RELAY (cluster_reset);
|
||||
wire mem_req_fire = mem_req_valid && mem_req_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
`UNUSED_VAR (mem_req_fire)
|
||||
`UNUSED_VAR (mem_rsp_fire)
|
||||
|
||||
wire sim_ebreak /* verilator public */;
|
||||
wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value /* verilator public */;
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_sim_ebreak;
|
||||
wire [`NUM_CLUSTERS-1:0][`NUM_REGS-1:0][`XLEN-1:0] per_cluster_sim_wb_value;
|
||||
assign sim_ebreak = per_cluster_sim_ebreak[0];
|
||||
assign sim_wb_value = per_cluster_sim_wb_value[0];
|
||||
`UNUSED_VAR (per_cluster_sim_ebreak)
|
||||
`UNUSED_VAR (per_cluster_sim_wb_value)
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L2_LINE_SIZE),
|
||||
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
|
||||
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
|
||||
|
||||
VX_dcr_bus_if dcr_bus_if();
|
||||
assign dcr_bus_if.write_valid = dcr_wr_valid;
|
||||
assign dcr_bus_if.write_addr = dcr_wr_addr;
|
||||
assign dcr_bus_if.write_data = dcr_wr_data;
|
||||
|
||||
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
|
||||
|
||||
`SCOPE_IO_SWITCH (`NUM_CLUSTERS)
|
||||
|
||||
// Generate all clusters
|
||||
for (genvar i = 0; i < `NUM_CLUSTERS; ++i) begin
|
||||
|
||||
`RESET_RELAY (cluster_reset, reset);
|
||||
|
||||
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
|
||||
|
||||
VX_cluster #(
|
||||
.CLUSTER_ID(i)
|
||||
.CLUSTER_ID (i)
|
||||
) cluster (
|
||||
`SCOPE_BIND_Vortex_cluster(i)
|
||||
`SCOPE_IO_BIND (i)
|
||||
|
||||
.clk (clk),
|
||||
.reset (cluster_reset),
|
||||
|
||||
.mem_req_valid (per_cluster_mem_req_valid [i]),
|
||||
.mem_req_rw (per_cluster_mem_req_rw [i]),
|
||||
.mem_req_byteen (per_cluster_mem_req_byteen[i]),
|
||||
.mem_req_addr (per_cluster_mem_req_addr [i]),
|
||||
.mem_req_data (per_cluster_mem_req_data [i]),
|
||||
.mem_req_tag (per_cluster_mem_req_tag [i]),
|
||||
.mem_req_ready (per_cluster_mem_req_ready [i]),
|
||||
|
||||
.mem_rsp_valid (per_cluster_mem_rsp_valid [i]),
|
||||
.mem_rsp_data (per_cluster_mem_rsp_data [i]),
|
||||
.mem_rsp_tag (per_cluster_mem_rsp_tag [i]),
|
||||
.mem_rsp_ready (per_cluster_mem_rsp_ready [i]),
|
||||
|
||||
.busy (per_cluster_busy [i])
|
||||
);
|
||||
end
|
||||
|
||||
assign busy = (| per_cluster_busy);
|
||||
|
||||
if (`L3_ENABLE) begin
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_cache_if perf_l3cache_if();
|
||||
`endif
|
||||
|
||||
`RESET_RELAY (l3_reset);
|
||||
|
||||
VX_cache #(
|
||||
.CACHE_ID (`L3_CACHE_ID),
|
||||
.CACHE_SIZE (`L3_CACHE_SIZE),
|
||||
.CACHE_LINE_SIZE (`L3_CACHE_LINE_SIZE),
|
||||
.NUM_BANKS (`L3_NUM_BANKS),
|
||||
.NUM_PORTS (`L3_NUM_PORTS),
|
||||
.WORD_SIZE (`L3_WORD_SIZE),
|
||||
.NUM_REQS (`L3_NUM_REQS),
|
||||
.CREQ_SIZE (`L3_CREQ_SIZE),
|
||||
.CRSQ_SIZE (`L3_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`L3_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L3_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`L3_MREQ_SIZE),
|
||||
.WRITE_ENABLE (1),
|
||||
.CORE_TAG_WIDTH (`L2_MEM_TAG_WIDTH),
|
||||
.CORE_TAG_ID_BITS (0),
|
||||
.MEM_TAG_WIDTH (`L3_MEM_TAG_WIDTH),
|
||||
.NC_ENABLE (1)
|
||||
) l3cache (
|
||||
`SCOPE_BIND_Vortex_l3cache
|
||||
|
||||
.clk (clk),
|
||||
.reset (l3_reset),
|
||||
.reset (cluster_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_cache_if (perf_l3cache_if),
|
||||
.mem_perf_if (mem_perf_if[i]),
|
||||
.perf_memsys_total_if (perf_memsys_total_if),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
.core_req_valid (per_cluster_mem_req_valid),
|
||||
.core_req_rw (per_cluster_mem_req_rw),
|
||||
.core_req_byteen (per_cluster_mem_req_byteen),
|
||||
.core_req_addr (per_cluster_mem_req_addr),
|
||||
.core_req_data (per_cluster_mem_req_data),
|
||||
.core_req_tag (per_cluster_mem_req_tag),
|
||||
.core_req_ready (per_cluster_mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.core_rsp_valid (per_cluster_mem_rsp_valid),
|
||||
.core_rsp_data (per_cluster_mem_rsp_data),
|
||||
.core_rsp_tag (per_cluster_mem_rsp_tag),
|
||||
.core_rsp_ready (per_cluster_mem_rsp_ready),
|
||||
`UNUSED_PIN (core_rsp_tmask),
|
||||
|
||||
// Memory request
|
||||
.mem_req_valid (mem_req_valid),
|
||||
.mem_req_rw (mem_req_rw),
|
||||
.mem_req_byteen (mem_req_byteen),
|
||||
.mem_req_addr (mem_req_addr),
|
||||
.mem_req_data (mem_req_data),
|
||||
.mem_req_tag (mem_req_tag),
|
||||
.mem_req_ready (mem_req_ready),
|
||||
|
||||
// Memory response
|
||||
.mem_rsp_valid (mem_rsp_valid),
|
||||
.mem_rsp_data (mem_rsp_data),
|
||||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_ready (mem_rsp_ready)
|
||||
);
|
||||
|
||||
end else begin
|
||||
|
||||
`RESET_RELAY (mem_arb_reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_REQS (`NUM_CLUSTERS),
|
||||
.DATA_WIDTH (`L3_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`L3_MEM_ADDR_WIDTH),
|
||||
.TAG_IN_WIDTH (`L2_MEM_TAG_WIDTH),
|
||||
.TYPE ("R"),
|
||||
.BUFFERED_REQ (1),
|
||||
.BUFFERED_RSP (1)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (mem_arb_reset),
|
||||
|
||||
// Core request
|
||||
.req_valid_in (per_cluster_mem_req_valid),
|
||||
.req_rw_in (per_cluster_mem_req_rw),
|
||||
.req_byteen_in (per_cluster_mem_req_byteen),
|
||||
.req_addr_in (per_cluster_mem_req_addr),
|
||||
.req_data_in (per_cluster_mem_req_data),
|
||||
.req_tag_in (per_cluster_mem_req_tag),
|
||||
.req_ready_in (per_cluster_mem_req_ready),
|
||||
|
||||
// Memory request
|
||||
.req_valid_out (mem_req_valid),
|
||||
.req_rw_out (mem_req_rw),
|
||||
.req_byteen_out (mem_req_byteen),
|
||||
.req_addr_out (mem_req_addr),
|
||||
.req_data_out (mem_req_data),
|
||||
.req_tag_out (mem_req_tag),
|
||||
.req_ready_out (mem_req_ready),
|
||||
|
||||
// Core response
|
||||
.rsp_valid_out (per_cluster_mem_rsp_valid),
|
||||
.rsp_data_out (per_cluster_mem_rsp_data),
|
||||
.rsp_tag_out (per_cluster_mem_rsp_tag),
|
||||
.rsp_ready_out (per_cluster_mem_rsp_ready),
|
||||
|
||||
// Memory response
|
||||
.rsp_valid_in (mem_rsp_valid),
|
||||
.rsp_tag_in (mem_rsp_tag),
|
||||
.rsp_data_in (mem_rsp_data),
|
||||
.rsp_ready_in (mem_rsp_ready)
|
||||
);
|
||||
.dcr_bus_if (cluster_dcr_bus_if),
|
||||
|
||||
.mem_bus_if (per_cluster_mem_bus_if[i]),
|
||||
|
||||
.sim_ebreak (per_cluster_sim_ebreak[i]),
|
||||
.sim_wb_value (per_cluster_sim_wb_value[i]),
|
||||
|
||||
.busy (per_cluster_busy[i])
|
||||
);
|
||||
end
|
||||
|
||||
`SCOPE_ASSIGN (reset, reset);
|
||||
`SCOPE_ASSIGN (mem_req_fire, mem_req_valid && mem_req_ready);
|
||||
`SCOPE_ASSIGN (mem_req_addr, `TO_FULL_ADDR(mem_req_addr));
|
||||
`SCOPE_ASSIGN (mem_req_rw, mem_req_rw);
|
||||
`SCOPE_ASSIGN (mem_req_byteen, mem_req_byteen);
|
||||
`SCOPE_ASSIGN (mem_req_data, mem_req_data);
|
||||
`SCOPE_ASSIGN (mem_req_tag, mem_req_tag);
|
||||
`SCOPE_ASSIGN (mem_rsp_fire, mem_rsp_valid && mem_rsp_ready);
|
||||
`SCOPE_ASSIGN (mem_rsp_data, mem_rsp_data);
|
||||
`SCOPE_ASSIGN (mem_rsp_tag, mem_rsp_tag);
|
||||
`SCOPE_ASSIGN (busy, busy);
|
||||
`BUFFER_BUSY (busy, (| per_cluster_busy), (`NUM_CLUSTERS > 1));
|
||||
|
||||
`RESET_RELAY (l3_reset, reset);
|
||||
|
||||
VX_cache_wrap #(
|
||||
.INSTANCE_ID ("l3cache"),
|
||||
.CACHE_SIZE (`L3_CACHE_SIZE),
|
||||
.LINE_SIZE (`L3_LINE_SIZE),
|
||||
.NUM_BANKS (`L3_NUM_BANKS),
|
||||
.NUM_WAYS (`L3_NUM_WAYS),
|
||||
.WORD_SIZE (L3_WORD_SIZE),
|
||||
.NUM_REQS (L3_NUM_REQS),
|
||||
.CRSQ_SIZE (`L3_CRSQ_SIZE),
|
||||
.MSHR_SIZE (`L3_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`L3_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`L3_MREQ_SIZE),
|
||||
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.CORE_OUT_REG (2),
|
||||
.MEM_OUT_REG (2),
|
||||
.NC_ENABLE (1),
|
||||
.PASSTHRU (!`L3_ENABLED)
|
||||
) l3cache (
|
||||
.clk (clk),
|
||||
.reset (l3_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf_if (perf_l3cache_if),
|
||||
`endif
|
||||
|
||||
.core_bus_if (per_cluster_mem_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, icache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, dcache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, smem_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_reads, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_writes, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_read_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_write_misses, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_bank_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
`PERF_REDUCE (perf_memsys_total_if, mem_perf_if, l2cache_mshr_stalls, `PERF_CTR_BITS, `NUM_CLUSTERS);
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
assign perf_memsys_total_if.l3cache_reads = perf_l3cache_if.reads;
|
||||
assign perf_memsys_total_if.l3cache_writes = perf_l3cache_if.writes;
|
||||
assign perf_memsys_total_if.l3cache_read_misses = perf_l3cache_if.read_misses;
|
||||
assign perf_memsys_total_if.l3cache_write_misses= perf_l3cache_if.write_misses;
|
||||
assign perf_memsys_total_if.l3cache_bank_stalls = perf_l3cache_if.bank_stalls;
|
||||
assign perf_memsys_total_if.l3cache_mshr_stalls = perf_l3cache_if.mshr_stalls;
|
||||
`else
|
||||
assign perf_memsys_total_if.l3cache_reads = '0;
|
||||
assign perf_memsys_total_if.l3cache_writes = '0;
|
||||
assign perf_memsys_total_if.l3cache_read_misses = '0;
|
||||
assign perf_memsys_total_if.l3cache_write_misses= '0;
|
||||
assign perf_memsys_total_if.l3cache_bank_stalls = '0;
|
||||
assign perf_memsys_total_if.l3cache_mshr_stalls = '0;
|
||||
`endif
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
|
||||
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= '0;
|
||||
perf_mem_writes <= '0;
|
||||
perf_mem_lat <= '0;
|
||||
end else begin
|
||||
if (mem_req_fire && ~mem_bus_if.req_data.rw) begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
if (mem_req_fire && mem_bus_if.req_data.rw) begin
|
||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'(1);
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_memsys_total_if.mem_reads = perf_mem_reads;
|
||||
assign perf_memsys_total_if.mem_writes = perf_mem_writes;
|
||||
assign perf_memsys_total_if.mem_latency = perf_mem_lat;
|
||||
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CORE_MEM
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_valid && mem_req_ready) begin
|
||||
if (mem_req_fire) begin
|
||||
if (mem_req_rw)
|
||||
dpi_trace("%d: MEM Wr Req: addr=%0h, tag=%0h, byteen=%0h data=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data);
|
||||
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h data=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
|
||||
else
|
||||
dpi_trace("%d: MEM Rd Req: addr=%0h, tag=%0h, byteen=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen);
|
||||
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
|
||||
end
|
||||
if (mem_rsp_valid && mem_rsp_ready) begin
|
||||
dpi_trace("%d: MEM Rsp: tag=%0h, data=%0h\n", $time, mem_rsp_tag, mem_rsp_data);
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(1, ("%d: MEM Rsp: tag=0x%0h, data=0x%0h\n", $time, mem_rsp_tag, mem_rsp_data));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
|
||||
`ifndef NDEBUG
|
||||
`ifdef SIMULATION
|
||||
always @(posedge clk) begin
|
||||
$fflush(); // flush stdout buffer
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
endmodule
|
||||
|
|
|
@ -1,65 +1,91 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module Vortex_axi #(
|
||||
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
|
||||
parameter AXI_ADDR_WIDTH = 32,
|
||||
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
|
||||
parameter AXI_STROBE_WIDTH = (`VX_MEM_DATA_WIDTH / 8)
|
||||
module Vortex_axi import VX_gpu_pkg::*; #(
|
||||
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
|
||||
parameter AXI_ADDR_WIDTH = `XLEN,
|
||||
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
|
||||
parameter AXI_NUM_BANKS = 1
|
||||
)(
|
||||
`SCOPE_IO_DECL
|
||||
|
||||
// Clock
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// AXI write request address channel
|
||||
output wire [AXI_TID_WIDTH-1:0] m_axi_awid,
|
||||
output wire [AXI_ADDR_WIDTH-1:0] m_axi_awaddr,
|
||||
output wire [7:0] m_axi_awlen,
|
||||
output wire [2:0] m_axi_awsize,
|
||||
output wire [1:0] m_axi_awburst,
|
||||
output wire m_axi_awlock,
|
||||
output wire [3:0] m_axi_awcache,
|
||||
output wire [2:0] m_axi_awprot,
|
||||
output wire [3:0] m_axi_awqos,
|
||||
output wire m_axi_awvalid,
|
||||
input wire m_axi_awready,
|
||||
output wire m_axi_awvalid [AXI_NUM_BANKS],
|
||||
input wire m_axi_awready [AXI_NUM_BANKS],
|
||||
output wire [AXI_ADDR_WIDTH-1:0] m_axi_awaddr [AXI_NUM_BANKS],
|
||||
output wire [AXI_TID_WIDTH-1:0] m_axi_awid [AXI_NUM_BANKS],
|
||||
output wire [7:0] m_axi_awlen [AXI_NUM_BANKS],
|
||||
output wire [2:0] m_axi_awsize [AXI_NUM_BANKS],
|
||||
output wire [1:0] m_axi_awburst [AXI_NUM_BANKS],
|
||||
output wire [1:0] m_axi_awlock [AXI_NUM_BANKS],
|
||||
output wire [3:0] m_axi_awcache [AXI_NUM_BANKS],
|
||||
output wire [2:0] m_axi_awprot [AXI_NUM_BANKS],
|
||||
output wire [3:0] m_axi_awqos [AXI_NUM_BANKS],
|
||||
output wire [3:0] m_axi_awregion [AXI_NUM_BANKS],
|
||||
|
||||
// AXI write request data channel
|
||||
output wire [AXI_DATA_WIDTH-1:0] m_axi_wdata,
|
||||
output wire [AXI_STROBE_WIDTH-1:0] m_axi_wstrb,
|
||||
output wire m_axi_wlast,
|
||||
output wire m_axi_wvalid,
|
||||
input wire m_axi_wready,
|
||||
output wire m_axi_wvalid [AXI_NUM_BANKS],
|
||||
input wire m_axi_wready [AXI_NUM_BANKS],
|
||||
output wire [AXI_DATA_WIDTH-1:0] m_axi_wdata [AXI_NUM_BANKS],
|
||||
output wire [AXI_DATA_WIDTH/8-1:0] m_axi_wstrb [AXI_NUM_BANKS],
|
||||
output wire m_axi_wlast [AXI_NUM_BANKS],
|
||||
|
||||
// AXI write response channel
|
||||
input wire [AXI_TID_WIDTH-1:0] m_axi_bid,
|
||||
input wire [1:0] m_axi_bresp,
|
||||
input wire m_axi_bvalid,
|
||||
output wire m_axi_bready,
|
||||
input wire m_axi_bvalid [AXI_NUM_BANKS],
|
||||
output wire m_axi_bready [AXI_NUM_BANKS],
|
||||
input wire [AXI_TID_WIDTH-1:0] m_axi_bid [AXI_NUM_BANKS],
|
||||
input wire [1:0] m_axi_bresp [AXI_NUM_BANKS],
|
||||
|
||||
// AXI read request channel
|
||||
output wire [AXI_TID_WIDTH-1:0] m_axi_arid,
|
||||
output wire [AXI_ADDR_WIDTH-1:0] m_axi_araddr,
|
||||
output wire [7:0] m_axi_arlen,
|
||||
output wire [2:0] m_axi_arsize,
|
||||
output wire [1:0] m_axi_arburst,
|
||||
output wire m_axi_arlock,
|
||||
output wire [3:0] m_axi_arcache,
|
||||
output wire [2:0] m_axi_arprot,
|
||||
output wire [3:0] m_axi_arqos,
|
||||
output wire m_axi_arvalid,
|
||||
input wire m_axi_arready,
|
||||
output wire m_axi_arvalid [AXI_NUM_BANKS],
|
||||
input wire m_axi_arready [AXI_NUM_BANKS],
|
||||
output wire [AXI_ADDR_WIDTH-1:0] m_axi_araddr [AXI_NUM_BANKS],
|
||||
output wire [AXI_TID_WIDTH-1:0] m_axi_arid [AXI_NUM_BANKS],
|
||||
output wire [7:0] m_axi_arlen [AXI_NUM_BANKS],
|
||||
output wire [2:0] m_axi_arsize [AXI_NUM_BANKS],
|
||||
output wire [1:0] m_axi_arburst [AXI_NUM_BANKS],
|
||||
output wire [1:0] m_axi_arlock [AXI_NUM_BANKS],
|
||||
output wire [3:0] m_axi_arcache [AXI_NUM_BANKS],
|
||||
output wire [2:0] m_axi_arprot [AXI_NUM_BANKS],
|
||||
output wire [3:0] m_axi_arqos [AXI_NUM_BANKS],
|
||||
output wire [3:0] m_axi_arregion [AXI_NUM_BANKS],
|
||||
|
||||
// AXI read response channel
|
||||
input wire [AXI_TID_WIDTH-1:0] m_axi_rid,
|
||||
input wire [AXI_DATA_WIDTH-1:0] m_axi_rdata,
|
||||
input wire [1:0] m_axi_rresp,
|
||||
input wire m_axi_rlast,
|
||||
input wire m_axi_rvalid,
|
||||
output wire m_axi_rready,
|
||||
input wire m_axi_rvalid [AXI_NUM_BANKS],
|
||||
output wire m_axi_rready [AXI_NUM_BANKS],
|
||||
input wire [AXI_DATA_WIDTH-1:0] m_axi_rdata [AXI_NUM_BANKS],
|
||||
input wire m_axi_rlast [AXI_NUM_BANKS],
|
||||
input wire [AXI_TID_WIDTH-1:0] m_axi_rid [AXI_NUM_BANKS],
|
||||
input wire [1:0] m_axi_rresp [AXI_NUM_BANKS],
|
||||
|
||||
// DCR write request
|
||||
input wire dcr_wr_valid,
|
||||
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
|
||||
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `XLEN), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH))
|
||||
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
|
||||
|
||||
wire mem_req_valid;
|
||||
wire mem_req_rw;
|
||||
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen;
|
||||
|
@ -72,16 +98,33 @@ module Vortex_axi #(
|
|||
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data;
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
|
||||
wire mem_rsp_ready;
|
||||
|
||||
wire [`XLEN-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
|
||||
wire [`XLEN-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
|
||||
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
|
||||
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
|
||||
|
||||
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
|
||||
assign m_axi_awaddr[i] = `XLEN'(m_axi_awaddr_unqual[i]);
|
||||
assign m_axi_araddr[i] = `XLEN'(m_axi_araddr_unqual[i]);
|
||||
|
||||
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
|
||||
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
|
||||
|
||||
assign m_axi_rid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_rid[i]);
|
||||
assign m_axi_bid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_bid[i]);
|
||||
end
|
||||
|
||||
VX_axi_adapter #(
|
||||
.VX_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
|
||||
.VX_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
|
||||
.VX_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
|
||||
.VX_BYTEEN_WIDTH (AXI_STROBE_WIDTH),
|
||||
.AXI_DATA_WIDTH (AXI_DATA_WIDTH),
|
||||
.AXI_ADDR_WIDTH (AXI_ADDR_WIDTH),
|
||||
.AXI_TID_WIDTH (AXI_TID_WIDTH),
|
||||
.AXI_STROBE_WIDTH (AXI_STROBE_WIDTH)
|
||||
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
|
||||
.ADDR_WIDTH (`XLEN),
|
||||
.TAG_WIDTH (`VX_MEM_TAG_WIDTH),
|
||||
.NUM_BANKS (AXI_NUM_BANKS),
|
||||
.OUT_REG_RSP((AXI_NUM_BANKS > 1) ? 2 : 0)
|
||||
) axi_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -98,9 +141,11 @@ module Vortex_axi #(
|
|||
.mem_rsp_data (mem_rsp_data),
|
||||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_ready (mem_rsp_ready),
|
||||
|
||||
.m_axi_awid (m_axi_awid),
|
||||
.m_axi_awaddr (m_axi_awaddr),
|
||||
|
||||
.m_axi_awvalid (m_axi_awvalid),
|
||||
.m_axi_awready (m_axi_awready),
|
||||
.m_axi_awaddr (m_axi_awaddr_unqual),
|
||||
.m_axi_awid (m_axi_awid_unqual),
|
||||
.m_axi_awlen (m_axi_awlen),
|
||||
.m_axi_awsize (m_axi_awsize),
|
||||
.m_axi_awburst (m_axi_awburst),
|
||||
|
@ -108,22 +153,23 @@ module Vortex_axi #(
|
|||
.m_axi_awcache (m_axi_awcache),
|
||||
.m_axi_awprot (m_axi_awprot),
|
||||
.m_axi_awqos (m_axi_awqos),
|
||||
.m_axi_awvalid (m_axi_awvalid),
|
||||
.m_axi_awready (m_axi_awready),
|
||||
.m_axi_awregion (m_axi_awregion),
|
||||
|
||||
.m_axi_wvalid (m_axi_wvalid),
|
||||
.m_axi_wready (m_axi_wready),
|
||||
.m_axi_wdata (m_axi_wdata),
|
||||
.m_axi_wstrb (m_axi_wstrb),
|
||||
.m_axi_wlast (m_axi_wlast),
|
||||
.m_axi_wvalid (m_axi_wvalid),
|
||||
.m_axi_wready (m_axi_wready),
|
||||
|
||||
.m_axi_bid (m_axi_bid),
|
||||
.m_axi_bresp (m_axi_bresp),
|
||||
|
||||
.m_axi_bvalid (m_axi_bvalid),
|
||||
.m_axi_bready (m_axi_bready),
|
||||
.m_axi_bid (m_axi_bid_unqual),
|
||||
.m_axi_bresp (m_axi_bresp),
|
||||
|
||||
.m_axi_arid (m_axi_arid),
|
||||
.m_axi_araddr (m_axi_araddr),
|
||||
.m_axi_arvalid (m_axi_arvalid),
|
||||
.m_axi_arready (m_axi_arready),
|
||||
.m_axi_araddr (m_axi_araddr_unqual),
|
||||
.m_axi_arid (m_axi_arid_unqual),
|
||||
.m_axi_arlen (m_axi_arlen),
|
||||
.m_axi_arsize (m_axi_arsize),
|
||||
.m_axi_arburst (m_axi_arburst),
|
||||
|
@ -131,18 +177,21 @@ module Vortex_axi #(
|
|||
.m_axi_arcache (m_axi_arcache),
|
||||
.m_axi_arprot (m_axi_arprot),
|
||||
.m_axi_arqos (m_axi_arqos),
|
||||
.m_axi_arvalid (m_axi_arvalid),
|
||||
.m_axi_arready (m_axi_arready),
|
||||
.m_axi_arregion (m_axi_arregion),
|
||||
|
||||
.m_axi_rid (m_axi_rid),
|
||||
.m_axi_rdata (m_axi_rdata),
|
||||
.m_axi_rresp (m_axi_rresp),
|
||||
.m_axi_rlast (m_axi_rlast),
|
||||
.m_axi_rvalid (m_axi_rvalid),
|
||||
.m_axi_rready (m_axi_rready)
|
||||
.m_axi_rready (m_axi_rready),
|
||||
.m_axi_rdata (m_axi_rdata),
|
||||
.m_axi_rlast (m_axi_rlast) ,
|
||||
.m_axi_rid (m_axi_rid_unqual),
|
||||
.m_axi_rresp (m_axi_rresp)
|
||||
);
|
||||
|
||||
`SCOPE_IO_SWITCH (1)
|
||||
|
||||
Vortex vortex (
|
||||
`SCOPE_IO_BIND (0)
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
|
@ -159,7 +208,11 @@ module Vortex_axi #(
|
|||
.mem_rsp_tag (mem_rsp_tag),
|
||||
.mem_rsp_ready (mem_rsp_ready),
|
||||
|
||||
.dcr_wr_valid (dcr_wr_valid),
|
||||
.dcr_wr_addr (dcr_wr_addr),
|
||||
.dcr_wr_data (dcr_wr_data),
|
||||
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
endmodule
|
||||
endmodule
|
||||
|
|
|
@ -1,176 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_avs_wrapper #(
|
||||
parameter AVS_DATA_WIDTH = 1,
|
||||
parameter AVS_ADDR_WIDTH = 1,
|
||||
parameter AVS_BURST_WIDTH = 1,
|
||||
parameter AVS_BANKS = 1,
|
||||
parameter REQ_TAG_WIDTH = 1,
|
||||
parameter RD_QUEUE_SIZE = 1,
|
||||
|
||||
parameter AVS_BYTEENW = (AVS_DATA_WIDTH / 8),
|
||||
parameter RD_QUEUE_ADDR_WIDTH = $clog2(RD_QUEUE_SIZE+1)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Memory request
|
||||
input wire mem_req_valid,
|
||||
input wire mem_req_rw,
|
||||
input wire [AVS_BYTEENW-1:0] mem_req_byteen,
|
||||
input wire [AVS_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
input wire [AVS_DATA_WIDTH-1:0] mem_req_data,
|
||||
input wire [REQ_TAG_WIDTH-1:0] mem_req_tag,
|
||||
output wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
output wire mem_rsp_valid,
|
||||
output wire [AVS_DATA_WIDTH-1:0] mem_rsp_data,
|
||||
output wire [REQ_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
input wire mem_rsp_ready,
|
||||
|
||||
// AVS bus
|
||||
output wire [AVS_DATA_WIDTH-1:0] avs_writedata [AVS_BANKS],
|
||||
input wire [AVS_DATA_WIDTH-1:0] avs_readdata [AVS_BANKS],
|
||||
output wire [AVS_ADDR_WIDTH-1:0] avs_address [AVS_BANKS],
|
||||
input wire avs_waitrequest [AVS_BANKS],
|
||||
output wire avs_write [AVS_BANKS],
|
||||
output wire avs_read [AVS_BANKS],
|
||||
output wire [AVS_BYTEENW-1:0] avs_byteenable [AVS_BANKS],
|
||||
output wire [AVS_BURST_WIDTH-1:0] avs_burstcount [AVS_BANKS],
|
||||
input avs_readdatavalid [AVS_BANKS]
|
||||
);
|
||||
|
||||
localparam BANK_ADDRW = `LOG2UP(AVS_BANKS);
|
||||
|
||||
// Requests handling
|
||||
|
||||
wire [AVS_BANKS-1:0] avs_reqq_push, avs_reqq_pop, avs_reqq_ready;
|
||||
wire [AVS_BANKS-1:0][REQ_TAG_WIDTH-1:0] avs_reqq_tag_out;
|
||||
wire [AVS_BANKS-1:0] req_queue_going_full;
|
||||
wire [AVS_BANKS-1:0][RD_QUEUE_ADDR_WIDTH-1:0] req_queue_size;
|
||||
wire [BANK_ADDRW-1:0] req_bank_sel;
|
||||
|
||||
if (AVS_BANKS >= 2) begin
|
||||
assign req_bank_sel = mem_req_addr[BANK_ADDRW-1:0];
|
||||
end else begin
|
||||
assign req_bank_sel = 0;
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < AVS_BANKS; i++) begin
|
||||
assign avs_reqq_ready[i] = !req_queue_going_full[i] && !avs_waitrequest[i];
|
||||
assign avs_reqq_push[i] = mem_req_valid && !mem_req_rw && avs_reqq_ready[i] && (req_bank_sel == i);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < AVS_BANKS; i++) begin
|
||||
VX_pending_size #(
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) pending_size (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (avs_reqq_push[i]),
|
||||
.decr (avs_reqq_pop[i]),
|
||||
.full (req_queue_going_full[i]),
|
||||
.size (req_queue_size[i]),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
`UNUSED_VAR (req_queue_size)
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (REQ_TAG_WIDTH),
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) rd_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_reqq_push[i]),
|
||||
.pop (avs_reqq_pop[i]),
|
||||
.data_in (mem_req_tag),
|
||||
.data_out (avs_reqq_tag_out[i]),
|
||||
`UNUSED_PIN (empty),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < AVS_BANKS; i++) begin
|
||||
assign avs_read[i] = mem_req_valid && !mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
|
||||
assign avs_write[i] = mem_req_valid && mem_req_rw && !req_queue_going_full[i] && (req_bank_sel == i);
|
||||
assign avs_address[i] = mem_req_addr;
|
||||
assign avs_byteenable[i] = mem_req_byteen;
|
||||
assign avs_writedata[i] = mem_req_data;
|
||||
assign avs_burstcount[i] = AVS_BURST_WIDTH'(1);
|
||||
end
|
||||
|
||||
if (AVS_BANKS >= 2) begin
|
||||
assign mem_req_ready = avs_reqq_ready[req_bank_sel];
|
||||
end else begin
|
||||
assign mem_req_ready = avs_reqq_ready;
|
||||
end
|
||||
|
||||
// Responses handling
|
||||
|
||||
wire [AVS_BANKS-1:0] rsp_arb_valid_in;
|
||||
wire [AVS_BANKS-1:0][AVS_DATA_WIDTH+REQ_TAG_WIDTH-1:0] rsp_arb_data_in;
|
||||
wire [AVS_BANKS-1:0] rsp_arb_ready_in;
|
||||
|
||||
wire [AVS_BANKS-1:0][AVS_DATA_WIDTH-1:0] avs_rspq_data_out;
|
||||
wire [AVS_BANKS-1:0] avs_rspq_empty;
|
||||
|
||||
for (genvar i = 0; i < AVS_BANKS; i++) begin
|
||||
VX_fifo_queue #(
|
||||
.DATAW (AVS_DATA_WIDTH),
|
||||
.SIZE (RD_QUEUE_SIZE)
|
||||
) rd_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (avs_readdatavalid[i]),
|
||||
.pop (avs_reqq_pop[i]),
|
||||
.data_in (avs_readdata[i]),
|
||||
.data_out (avs_rspq_data_out[i]),
|
||||
.empty (avs_rspq_empty[i]),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < AVS_BANKS; i++) begin
|
||||
assign rsp_arb_valid_in[i] = !avs_rspq_empty[i];
|
||||
assign rsp_arb_data_in[i] = {avs_rspq_data_out[i], avs_reqq_tag_out[i]};
|
||||
assign avs_reqq_pop[i] = rsp_arb_valid_in[i] && rsp_arb_ready_in[i];
|
||||
end
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (AVS_BANKS),
|
||||
.DATAW (AVS_DATA_WIDTH + REQ_TAG_WIDTH),
|
||||
.TYPE ("R")
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_arb_valid_in),
|
||||
.data_in (rsp_arb_data_in),
|
||||
.ready_in (rsp_arb_ready_in),
|
||||
.valid_out (mem_rsp_valid),
|
||||
.data_out ({mem_rsp_data, mem_rsp_tag}),
|
||||
.ready_out (mem_rsp_ready)
|
||||
);
|
||||
|
||||
`ifdef DBG_TRACE_AFU
|
||||
always @(posedge clk) begin
|
||||
if (mem_req_valid && mem_req_ready) begin
|
||||
if (mem_req_rw) begin
|
||||
dpi_trace("%d: AVS Wr Req: addr=%0h, byteen=%0h, tag=%0h, data=%0h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, mem_req_data);
|
||||
end else begin
|
||||
dpi_trace("%d: AVS Rd Req: addr=%0h, byteen=%0h, tag=%0h, pending=%0d\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_byteen, mem_req_tag, req_queue_size);
|
||||
end
|
||||
end
|
||||
if (mem_rsp_valid && mem_rsp_ready) begin
|
||||
dpi_trace("%d: AVS Rd Rsp: tag=%0h, data=%0h, pending=%0d\n", $time, mem_rsp_tag, mem_rsp_data, req_queue_size);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -1,181 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_to_mem #(
|
||||
parameter SRC_DATA_WIDTH = 1,
|
||||
parameter SRC_ADDR_WIDTH = 1,
|
||||
parameter DST_DATA_WIDTH = 1,
|
||||
parameter DST_ADDR_WIDTH = 1,
|
||||
parameter SRC_TAG_WIDTH = 1,
|
||||
parameter DST_TAG_WIDTH = 1,
|
||||
parameter SRC_DATA_SIZE = (SRC_DATA_WIDTH / 8),
|
||||
parameter DST_DATA_SIZE = (DST_DATA_WIDTH / 8)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire mem_req_valid_in,
|
||||
input wire [SRC_ADDR_WIDTH-1:0] mem_req_addr_in,
|
||||
input wire mem_req_rw_in,
|
||||
input wire [SRC_DATA_SIZE-1:0] mem_req_byteen_in,
|
||||
input wire [SRC_DATA_WIDTH-1:0] mem_req_data_in,
|
||||
input wire [SRC_TAG_WIDTH-1:0] mem_req_tag_in,
|
||||
output wire mem_req_ready_in,
|
||||
|
||||
output wire mem_req_valid_out,
|
||||
output wire [DST_ADDR_WIDTH-1:0] mem_req_addr_out,
|
||||
output wire mem_req_rw_out,
|
||||
output wire [DST_DATA_SIZE-1:0] mem_req_byteen_out,
|
||||
output wire [DST_DATA_WIDTH-1:0] mem_req_data_out,
|
||||
output wire [DST_TAG_WIDTH-1:0] mem_req_tag_out,
|
||||
input wire mem_req_ready_out,
|
||||
|
||||
input wire mem_rsp_valid_in,
|
||||
input wire [DST_DATA_WIDTH-1:0] mem_rsp_data_in,
|
||||
input wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in,
|
||||
output wire mem_rsp_ready_in,
|
||||
|
||||
output wire mem_rsp_valid_out,
|
||||
output wire [SRC_DATA_WIDTH-1:0] mem_rsp_data_out,
|
||||
output wire [SRC_TAG_WIDTH-1:0] mem_rsp_tag_out,
|
||||
input wire mem_rsp_ready_out
|
||||
);
|
||||
`STATIC_ASSERT ((DST_TAG_WIDTH >= SRC_TAG_WIDTH), ("oops!"))
|
||||
|
||||
localparam DST_LDATAW = $clog2(DST_DATA_WIDTH);
|
||||
localparam SRC_LDATAW = $clog2(SRC_DATA_WIDTH);
|
||||
localparam D = `ABS(DST_LDATAW - SRC_LDATAW);
|
||||
localparam P = 2**D;
|
||||
|
||||
`UNUSED_VAR (mem_rsp_tag_in)
|
||||
|
||||
if (DST_LDATAW > SRC_LDATAW) begin
|
||||
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
wire [D-1:0] req_idx = mem_req_addr_in[D-1:0];
|
||||
wire [D-1:0] rsp_idx = mem_rsp_tag_in[D-1:0];
|
||||
|
||||
wire [SRC_ADDR_WIDTH-D-1:0] mem_req_addr_in_qual = mem_req_addr_in[SRC_ADDR_WIDTH-1:D];
|
||||
|
||||
wire [P-1:0][SRC_DATA_WIDTH-1:0] mem_rsp_data_in_w = mem_rsp_data_in;
|
||||
|
||||
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH - D)) begin
|
||||
`UNUSED_VAR (mem_req_addr_in_qual)
|
||||
assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
|
||||
end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH - D)) begin
|
||||
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
|
||||
end else begin
|
||||
assign mem_req_addr_out = mem_req_addr_in_qual;
|
||||
end
|
||||
|
||||
assign mem_req_valid_out = mem_req_valid_in;
|
||||
assign mem_req_rw_out = mem_req_rw_in;
|
||||
assign mem_req_byteen_out = DST_DATA_SIZE'(mem_req_byteen_in) << ((DST_LDATAW-3)'(req_idx) << (SRC_LDATAW-3));
|
||||
assign mem_req_data_out = DST_DATA_WIDTH'(mem_req_data_in) << ((DST_LDATAW'(req_idx)) << SRC_LDATAW);
|
||||
assign mem_req_tag_out = DST_TAG_WIDTH'({mem_req_tag_in, req_idx});
|
||||
assign mem_req_ready_in = mem_req_ready_out;
|
||||
|
||||
assign mem_rsp_valid_out = mem_rsp_valid_in;
|
||||
assign mem_rsp_data_out = mem_rsp_data_in_w[rsp_idx];
|
||||
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in[SRC_TAG_WIDTH+D-1:D]);
|
||||
assign mem_rsp_ready_in = mem_rsp_ready_out;
|
||||
|
||||
end else if (DST_LDATAW < SRC_LDATAW) begin
|
||||
|
||||
reg [D-1:0] req_ctr, rsp_ctr;
|
||||
|
||||
reg [P-1:0][DST_DATA_WIDTH-1:0] mem_rsp_data_out_r, mem_rsp_data_out_n;
|
||||
|
||||
wire mem_req_out_fire = mem_req_valid_out && mem_req_ready_out;
|
||||
wire mem_rsp_in_fire = mem_rsp_valid_in && mem_rsp_ready_in;
|
||||
|
||||
wire [P-1:0][DST_DATA_WIDTH-1:0] mem_req_data_in_w = mem_req_data_in;
|
||||
wire [P-1:0][DST_DATA_SIZE-1:0] mem_req_byteen_in_w = mem_req_byteen_in;
|
||||
|
||||
always @(*) begin
|
||||
mem_rsp_data_out_n = mem_rsp_data_out_r;
|
||||
if (mem_rsp_in_fire) begin
|
||||
mem_rsp_data_out_n[rsp_ctr] = mem_rsp_data_in;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
req_ctr <= 0;
|
||||
rsp_ctr <= 0;
|
||||
end else begin
|
||||
if (mem_req_out_fire) begin
|
||||
req_ctr <= req_ctr + 1;
|
||||
end
|
||||
if (mem_rsp_in_fire) begin
|
||||
rsp_ctr <= rsp_ctr + 1;
|
||||
end
|
||||
end
|
||||
mem_rsp_data_out_r <= mem_rsp_data_out_n;
|
||||
end
|
||||
|
||||
reg [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_r;
|
||||
wire [DST_TAG_WIDTH-1:0] mem_rsp_tag_in_w;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (mem_rsp_in_fire) begin
|
||||
mem_rsp_tag_in_r <= mem_rsp_tag_in;
|
||||
end
|
||||
end
|
||||
assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in;
|
||||
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in),
|
||||
("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_w, mem_rsp_tag_in))
|
||||
|
||||
wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};
|
||||
|
||||
if (DST_ADDR_WIDTH < (SRC_ADDR_WIDTH + D)) begin
|
||||
`UNUSED_VAR (mem_req_addr_in_qual)
|
||||
assign mem_req_addr_out = mem_req_addr_in_qual[DST_ADDR_WIDTH-1:0];
|
||||
end else if (DST_ADDR_WIDTH > (SRC_ADDR_WIDTH + D)) begin
|
||||
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in_qual);
|
||||
end else begin
|
||||
assign mem_req_addr_out = mem_req_addr_in_qual;
|
||||
end
|
||||
|
||||
assign mem_req_valid_out = mem_req_valid_in;
|
||||
assign mem_req_rw_out = mem_req_rw_in;
|
||||
assign mem_req_byteen_out = mem_req_byteen_in_w[req_ctr];
|
||||
assign mem_req_data_out = mem_req_data_in_w[req_ctr];
|
||||
assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in);
|
||||
assign mem_req_ready_in = mem_req_ready_out && (req_ctr == (P-1));
|
||||
|
||||
assign mem_rsp_valid_out = mem_rsp_valid_in && (rsp_ctr == (P-1));
|
||||
assign mem_rsp_data_out = mem_rsp_data_out_n;
|
||||
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in);
|
||||
assign mem_rsp_ready_in = mem_rsp_ready_out;
|
||||
|
||||
end else begin
|
||||
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
if (DST_ADDR_WIDTH < SRC_ADDR_WIDTH) begin
|
||||
`UNUSED_VAR (mem_req_addr_in)
|
||||
assign mem_req_addr_out = mem_req_addr_in[DST_ADDR_WIDTH-1:0];
|
||||
end else if (DST_ADDR_WIDTH > SRC_ADDR_WIDTH) begin
|
||||
assign mem_req_addr_out = DST_ADDR_WIDTH'(mem_req_addr_in);
|
||||
end else begin
|
||||
assign mem_req_addr_out = mem_req_addr_in;
|
||||
end
|
||||
|
||||
assign mem_req_valid_out = mem_req_valid_in;
|
||||
assign mem_req_rw_out = mem_req_rw_in;
|
||||
assign mem_req_byteen_out = mem_req_byteen_in;
|
||||
assign mem_req_data_out = mem_req_data_in;
|
||||
assign mem_req_tag_out = DST_TAG_WIDTH'(mem_req_tag_in);
|
||||
assign mem_req_ready_in = mem_req_ready_out;
|
||||
|
||||
assign mem_rsp_valid_out = mem_rsp_valid_in;
|
||||
assign mem_rsp_data_out = mem_rsp_data_in;
|
||||
assign mem_rsp_tag_out = SRC_TAG_WIDTH'(mem_rsp_tag_in);
|
||||
assign mem_rsp_ready_in = mem_rsp_ready_out;
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
|
@ -241,4 +241,4 @@ typedef union packed {
|
|||
t_ccip_c0_ReqMmioHdr reqMmioHdr;
|
||||
} t_if_ccip_c0_RxHdr;
|
||||
|
||||
endpackage
|
||||
endpackage
|
|
@ -45,4 +45,4 @@ begin
|
|||
pck_af2cp_sTx_T1 = pck_af2cp_sTx_T0_q;
|
||||
end
|
||||
|
||||
endmodule
|
||||
endmodule
|
|
@ -58,4 +58,4 @@ package local_mem_cfg_pkg;
|
|||
|
||||
endpackage // local_mem_cfg_pkg
|
||||
|
||||
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY
|
||||
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY
|
1093
hw/rtl/afu/opae/vortex_afu.sv
Normal file
1093
hw/rtl/afu/opae/vortex_afu.sv
Normal file
File diff suppressed because it is too large
Load diff
39
hw/rtl/afu/opae/vortex_afu.vh
Normal file
39
hw/rtl/afu/opae/vortex_afu.vh
Normal file
|
@ -0,0 +1,39 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VORTEX_AFU_VH
|
||||
`define VORTEX_AFU_VH
|
||||
|
||||
`define AFU_ACCEL_NAME "vortex_afu"
|
||||
`define AFU_ACCEL_UUID 128'h35F9452B_25C2_434C_93D5_6F8C60DB361C
|
||||
|
||||
`define AFU_IMAGE_CMD_MEM_READ 1
|
||||
`define AFU_IMAGE_CMD_MEM_WRITE 2
|
||||
`define AFU_IMAGE_CMD_RUN 3
|
||||
`define AFU_IMAGE_CMD_DCR_WRITE 4
|
||||
`define AFU_IMAGE_CMD_MAX_VALUE 4
|
||||
|
||||
`define AFU_IMAGE_MMIO_CMD_TYPE 10
|
||||
`define AFU_IMAGE_MMIO_CMD_ARG0 12
|
||||
`define AFU_IMAGE_MMIO_CMD_ARG1 14
|
||||
`define AFU_IMAGE_MMIO_CMD_ARG2 16
|
||||
`define AFU_IMAGE_MMIO_STATUS 18
|
||||
`define AFU_IMAGE_MMIO_SCOPE_READ 20
|
||||
`define AFU_IMAGE_MMIO_SCOPE_WRITE 22
|
||||
`define AFU_IMAGE_MMIO_DEV_CAPS 24
|
||||
`define AFU_IMAGE_MMIO_ISA_CAPS 26
|
||||
|
||||
`define AFU_IMAGE_POWER 0
|
||||
`define AFU_TOP_IFC "ccip_std_afu_avalon_mm"
|
||||
|
||||
`endif // VORTEX_AFU_VH
|
File diff suppressed because it is too large
Load diff
|
@ -1,44 +0,0 @@
|
|||
`ifndef __VORTEX_AFU__
|
||||
`define __VORTEX_AFU__
|
||||
|
||||
`include "ccip_if_pkg.sv"
|
||||
|
||||
`define PLATFORM_PROVIDES_LOCAL_MEMORY
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BANKS
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BANKS 2
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH 26
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH 512
|
||||
`endif
|
||||
|
||||
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
|
||||
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
|
||||
`endif
|
||||
|
||||
`include "local_mem_cfg_pkg.sv"
|
||||
|
||||
`define AFU_ACCEL_NAME "vortex_afu"
|
||||
`define AFU_ACCEL_UUID 128'h35f9452b_25c2_434c_93d5_6f8c60db361c
|
||||
|
||||
`define AFU_IMAGE_CMD_MEM_READ 1
|
||||
`define AFU_IMAGE_CMD_MEM_WRITE 2
|
||||
`define AFU_IMAGE_CMD_RUN 3
|
||||
`define AFU_IMAGE_MMIO_CMD_TYPE 10
|
||||
`define AFU_IMAGE_MMIO_DATA_SIZE 16
|
||||
`define AFU_IMAGE_MMIO_IO_ADDR 12
|
||||
`define AFU_IMAGE_MMIO_MEM_ADDR 14
|
||||
`define AFU_IMAGE_MMIO_SCOPE_READ 20
|
||||
`define AFU_IMAGE_MMIO_SCOPE_WRITE 22
|
||||
`define AFU_IMAGE_MMIO_DEV_CAPS 24
|
||||
`define AFU_IMAGE_MMIO_STATUS 18
|
||||
|
||||
`define AFU_IMAGE_POWER 0
|
||||
`define AFU_TOP_IFC "ccip_std_afu_avalon_mm"
|
||||
|
||||
`endif
|
419
hw/rtl/afu/xrt/VX_afu_ctrl.sv
Normal file
419
hw/rtl/afu/xrt/VX_afu_ctrl.sv
Normal file
|
@ -0,0 +1,419 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "vortex_afu.vh"
|
||||
|
||||
module VX_afu_ctrl #(
|
||||
parameter AXI_ADDR_WIDTH = 8,
|
||||
parameter AXI_DATA_WIDTH = 32,
|
||||
parameter AXI_NUM_BANKS = 1
|
||||
) (
|
||||
// axi4 lite slave signals
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire clk_en,
|
||||
|
||||
input wire s_axi_awvalid,
|
||||
input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
|
||||
output wire s_axi_awready,
|
||||
|
||||
input wire s_axi_wvalid,
|
||||
input wire [AXI_DATA_WIDTH-1:0] s_axi_wdata,
|
||||
input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
|
||||
output wire s_axi_wready,
|
||||
|
||||
output wire s_axi_bvalid,
|
||||
output wire [1:0] s_axi_bresp,
|
||||
input wire s_axi_bready,
|
||||
|
||||
input wire s_axi_arvalid,
|
||||
input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
|
||||
output wire s_axi_arready,
|
||||
|
||||
output wire s_axi_rvalid,
|
||||
output wire [AXI_DATA_WIDTH-1:0] s_axi_rdata,
|
||||
output wire [1:0] s_axi_rresp,
|
||||
input wire s_axi_rready,
|
||||
|
||||
output wire ap_reset,
|
||||
output wire ap_start,
|
||||
input wire ap_done,
|
||||
input wire ap_ready,
|
||||
input wire ap_idle,
|
||||
output wire interrupt,
|
||||
|
||||
`ifdef SCOPE
|
||||
input wire scope_bus_in,
|
||||
output wire scope_bus_out,
|
||||
`endif
|
||||
|
||||
output wire [63:0] mem_base [AXI_NUM_BANKS],
|
||||
|
||||
output wire dcr_wr_valid,
|
||||
output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
output wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
|
||||
);
|
||||
|
||||
// Address Info
|
||||
// 0x00 : Control signals
|
||||
// bit 0 - ap_start (Read/Write/COH)
|
||||
// bit 1 - ap_done (Read/COR)
|
||||
// bit 2 - ap_idle (Read)
|
||||
// bit 3 - ap_ready (Read)
|
||||
// bit 4 - ap_reset (Write)
|
||||
// bit 7 - auto_restart (Read/Write)
|
||||
// others - reserved
|
||||
// 0x04 : Global Interrupt Enable Register
|
||||
// bit 0 - Global Interrupt Enable (Read/Write)
|
||||
// others - reserved
|
||||
// 0x08 : IP Interrupt Enable Register (Read/Write)
|
||||
// bit 0 - Channel 0 (ap_done)
|
||||
// bit 1 - Channel 1 (ap_ready)
|
||||
// others - reserved
|
||||
// 0x0c : IP Interrupt Status Register (Read/TOW)
|
||||
// bit 0 - Channel 0 (ap_done)
|
||||
// bit 1 - Channel 1 (ap_ready)
|
||||
// others - reserved
|
||||
// 0x10 : Low 32-bit Data signal of DEV_CAPS
|
||||
// 0x14 : High 32-bit Data signal of DEV_CAPS
|
||||
// 0x18 : Control signal of DEV_CAPS
|
||||
// 0x1C : Low 32-bit Data signal of ISA_CAPS
|
||||
// 0x20 : High 32-bit Data signal of ISA_CAPS
|
||||
// 0x24 : Control signal of ISA_CAPS
|
||||
// 0x28 : Low 32-bit Data signal of DCR
|
||||
// 0x2C : High 32-bit Data signal of DCR
|
||||
// 0x30 : Control signal of DCR
|
||||
// 0x34 : Low 32-bit Data signal of SCP
|
||||
// 0x38 : High 32-bit Data signal of SCP
|
||||
// 0x3C : Control signal of SCP
|
||||
// 0x40 : Low 32-bit Data signal of MEM
|
||||
// 0x44 : High 32-bit Data signal of MEM
|
||||
// 0x48 : Control signal of MEM
|
||||
// (SC = Self Clear, COR = Clear on Read, TOW = Toggle on Write, COH = Clear on Handshake)
|
||||
|
||||
// Parameters
|
||||
localparam
|
||||
ADDR_AP_CTRL = 8'h00,
|
||||
ADDR_GIE = 8'h04,
|
||||
ADDR_IER = 8'h08,
|
||||
ADDR_ISR = 8'h0C,
|
||||
|
||||
ADDR_DEV_0 = 8'h10,
|
||||
ADDR_DEV_1 = 8'h14,
|
||||
ADDR_DEV_CTRL = 8'h18,
|
||||
|
||||
ADDR_ISA_0 = 8'h1C,
|
||||
ADDR_ISA_1 = 8'h20,
|
||||
ADDR_ISA_CTRL = 8'h24,
|
||||
|
||||
ADDR_DCR_0 = 8'h28,
|
||||
ADDR_DCR_1 = 8'h2C,
|
||||
ADDR_DCR_CTRL = 8'h30,
|
||||
|
||||
ADDR_SCP_0 = 8'h34,
|
||||
ADDR_SCP_1 = 8'h38,
|
||||
ADDR_SCP_CTRL = 8'h3C,
|
||||
|
||||
ADDR_MEM_0 = 8'h40,
|
||||
ADDR_MEM_1 = 8'h44,
|
||||
ADDR_MEM_CTRL = 8'h48,
|
||||
|
||||
ADDR_BITS = 8;
|
||||
|
||||
localparam
|
||||
WSTATE_IDLE = 2'd0,
|
||||
WSTATE_DATA = 2'd1,
|
||||
WSTATE_RESP = 2'd2;
|
||||
|
||||
localparam
|
||||
RSTATE_IDLE = 2'd0,
|
||||
RSTATE_DATA = 2'd1;
|
||||
|
||||
// device caps
|
||||
wire [63:0] dev_caps = {16'b0,
|
||||
8'(`SM_ENABLED ? `SMEM_LOG_SIZE : 0),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
8'(`NUM_THREADS),
|
||||
8'(`IMPLEMENTATION_ID)};
|
||||
|
||||
wire [63:0] isa_caps = {32'(`MISA_EXT),
|
||||
2'(`CLOG2(`XLEN)-4),
|
||||
30'(`MISA_STD)};
|
||||
|
||||
reg [1:0] wstate;
|
||||
reg [ADDR_BITS-1:0] waddr;
|
||||
wire [31:0] wmask;
|
||||
wire s_axi_aw_fire;
|
||||
wire s_axi_w_fire;
|
||||
|
||||
reg [1:0] rstate;
|
||||
reg [31:0] rdata;
|
||||
wire [ADDR_BITS-1:0] raddr;
|
||||
wire s_axi_ar_fire;
|
||||
|
||||
reg ap_reset_r;
|
||||
reg ap_start_r;
|
||||
reg auto_restart_r;
|
||||
reg gie_r;
|
||||
reg [1:0] ier_r;
|
||||
reg [1:0] isr_r;
|
||||
reg [63:0] mem_r [AXI_NUM_BANKS];
|
||||
reg [31:0] dcra_r;
|
||||
reg [31:0] dcrv_r;
|
||||
reg dcr_wr_valid_r;
|
||||
|
||||
`ifdef SCOPE
|
||||
|
||||
reg [63:0] scope_bus_wdata;
|
||||
reg [63:0] scope_bus_rdata;
|
||||
reg [5:0] scope_bus_ctr;
|
||||
|
||||
reg cmd_scope_reading;
|
||||
reg cmd_scope_writing;
|
||||
reg scope_bus_out_r;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
cmd_scope_reading <= 0;
|
||||
cmd_scope_writing <= 0;
|
||||
scope_bus_ctr <= '0;
|
||||
scope_bus_out_r <= 0;
|
||||
end else if (clk_en) begin
|
||||
if (s_axi_w_fire && waddr == ADDR_SCP_0) begin
|
||||
scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask);
|
||||
end
|
||||
if (s_axi_w_fire && waddr == ADDR_SCP_1) begin
|
||||
scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
|
||||
cmd_scope_writing <= 1;
|
||||
scope_bus_out_r <= 1;
|
||||
scope_bus_ctr <= 63;
|
||||
end
|
||||
if (scope_bus_in) begin
|
||||
cmd_scope_reading <= 1;
|
||||
scope_bus_ctr <= 63;
|
||||
end
|
||||
if (cmd_scope_reading) begin
|
||||
scope_bus_rdata <= {scope_bus_rdata[62:0], scope_bus_in};
|
||||
scope_bus_ctr <= scope_bus_ctr - 1;
|
||||
if (scope_bus_ctr == 0) begin
|
||||
cmd_scope_reading <= 0;
|
||||
end
|
||||
end
|
||||
if (cmd_scope_writing) begin
|
||||
scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr);
|
||||
scope_bus_ctr <= scope_bus_ctr - 1;
|
||||
if (scope_bus_ctr == 0) begin
|
||||
cmd_scope_writing <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign scope_bus_out = scope_bus_out_r;
|
||||
|
||||
`endif
|
||||
|
||||
// AXI Write
|
||||
|
||||
assign s_axi_awready = (wstate == WSTATE_IDLE);
|
||||
assign s_axi_wready = (wstate == WSTATE_DATA);
|
||||
assign s_axi_bvalid = (wstate == WSTATE_RESP);
|
||||
assign s_axi_bresp = 2'b00; // OKAY
|
||||
|
||||
assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
|
||||
assign s_axi_w_fire = s_axi_wvalid && s_axi_wready;
|
||||
|
||||
for (genvar i = 0; i < 4; ++i) begin
|
||||
assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
|
||||
end
|
||||
|
||||
// wstate
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
wstate <= WSTATE_IDLE;
|
||||
end else if (clk_en) begin
|
||||
case (wstate)
|
||||
WSTATE_IDLE: wstate <= s_axi_awvalid ? WSTATE_DATA : WSTATE_IDLE;
|
||||
WSTATE_DATA: wstate <= s_axi_wvalid ? WSTATE_RESP : WSTATE_DATA;
|
||||
WSTATE_RESP: wstate <= s_axi_bready ? WSTATE_IDLE : WSTATE_RESP;
|
||||
default: wstate <= WSTATE_IDLE;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// waddr
|
||||
always @(posedge clk) begin
|
||||
if (clk_en) begin
|
||||
if (s_axi_aw_fire)
|
||||
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
|
||||
end
|
||||
end
|
||||
|
||||
// wdata
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
ap_start_r <= 0;
|
||||
ap_reset_r <= 0;
|
||||
auto_restart_r <= 0;
|
||||
|
||||
gie_r <= 0;
|
||||
ier_r <= '0;
|
||||
isr_r <= '0;
|
||||
|
||||
dcra_r <= '0;
|
||||
dcrv_r <= '0;
|
||||
dcr_wr_valid_r <= 0;
|
||||
|
||||
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
|
||||
mem_r[i] <= '0;
|
||||
end
|
||||
end else if (clk_en) begin
|
||||
if (ap_ready)
|
||||
ap_start_r <= auto_restart_r;
|
||||
|
||||
dcr_wr_valid_r <= 0;
|
||||
|
||||
if (s_axi_w_fire) begin
|
||||
case (waddr)
|
||||
ADDR_AP_CTRL: begin
|
||||
if (s_axi_wstrb[0]) begin
|
||||
if (s_axi_wdata[0])
|
||||
ap_start_r <= 1;
|
||||
if (s_axi_wdata[4])
|
||||
ap_reset_r <= 1;
|
||||
if (s_axi_wdata[7])
|
||||
auto_restart_r <= 1;
|
||||
end
|
||||
end
|
||||
ADDR_GIE: begin
|
||||
if (s_axi_wstrb[0])
|
||||
gie_r <= s_axi_wdata[0];
|
||||
end
|
||||
ADDR_IER: begin
|
||||
if (s_axi_wstrb[0])
|
||||
ier_r <= s_axi_wdata[1:0];
|
||||
end
|
||||
ADDR_ISR: begin
|
||||
if (s_axi_wstrb[0])
|
||||
isr_r <= isr_r ^ s_axi_wdata[1:0];
|
||||
end
|
||||
ADDR_DCR_0: begin
|
||||
dcra_r <= (s_axi_wdata & wmask) | (dcra_r & ~wmask);
|
||||
end
|
||||
ADDR_DCR_1: begin
|
||||
dcrv_r <= (s_axi_wdata & wmask) | (dcrv_r & ~wmask);
|
||||
dcr_wr_valid_r <= 1;
|
||||
end
|
||||
default: begin
|
||||
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
|
||||
if (waddr == (ADDR_MEM_0 + i * 12)) begin
|
||||
mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
|
||||
end
|
||||
if (waddr == (ADDR_MEM_1 + i * 12)) begin
|
||||
mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
|
||||
end
|
||||
end
|
||||
end
|
||||
endcase
|
||||
|
||||
if (ier_r[0] & ap_done)
|
||||
isr_r[0] <= 1'b1;
|
||||
if (ier_r[1] & ap_ready)
|
||||
isr_r[1] <= 1'b1;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
// AXI Read
|
||||
|
||||
assign s_axi_arready = (rstate == RSTATE_IDLE);
|
||||
assign s_axi_rvalid = (rstate == RSTATE_DATA);
|
||||
assign s_axi_rdata = rdata;
|
||||
assign s_axi_rresp = 2'b00; // OKAY
|
||||
|
||||
assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready;
|
||||
assign raddr = s_axi_araddr[ADDR_BITS-1:0];
|
||||
|
||||
// rstate
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
rstate <= RSTATE_IDLE;
|
||||
end else if (clk_en) begin
|
||||
case (rstate)
|
||||
RSTATE_IDLE: rstate <= s_axi_arvalid ? RSTATE_DATA : RSTATE_IDLE;
|
||||
RSTATE_DATA: rstate <= (s_axi_rready & s_axi_rvalid) ? RSTATE_IDLE : RSTATE_DATA;
|
||||
default: rstate <= RSTATE_IDLE;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
// rdata
|
||||
always @(posedge clk) begin
|
||||
if (clk_en) begin
|
||||
if (s_axi_ar_fire) begin
|
||||
rdata <= '0;
|
||||
case (raddr)
|
||||
ADDR_AP_CTRL: begin
|
||||
rdata[0] <= ap_start_r;
|
||||
rdata[1] <= ap_done;
|
||||
rdata[2] <= ap_idle;
|
||||
rdata[3] <= ap_ready;
|
||||
rdata[7] <= auto_restart_r;
|
||||
end
|
||||
ADDR_GIE: begin
|
||||
rdata <= 32'(gie_r);
|
||||
end
|
||||
ADDR_IER: begin
|
||||
rdata <= 32'(ier_r);
|
||||
end
|
||||
ADDR_ISR: begin
|
||||
rdata <= 32'(isr_r);
|
||||
end
|
||||
ADDR_DEV_0: begin
|
||||
rdata <= dev_caps[31:0];
|
||||
end
|
||||
ADDR_DEV_1: begin
|
||||
rdata <= dev_caps[63:32];
|
||||
end
|
||||
ADDR_ISA_0: begin
|
||||
rdata <= isa_caps[31:0];
|
||||
end
|
||||
ADDR_ISA_1: begin
|
||||
rdata <= isa_caps[63:32];
|
||||
end
|
||||
`ifdef SCOPE
|
||||
ADDR_SCP_0: begin
|
||||
rdata <= scope_bus_rdata[31:0];
|
||||
end
|
||||
ADDR_SCP_1: begin
|
||||
rdata <= scope_bus_rdata[63:32];
|
||||
end
|
||||
`endif
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign ap_reset = ap_reset_r;
|
||||
assign ap_start = ap_start_r;
|
||||
assign interrupt = gie_r & (| isr_r);
|
||||
|
||||
assign mem_base = mem_r;
|
||||
|
||||
assign dcr_wr_valid = dcr_wr_valid_r;
|
||||
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);
|
||||
assign dcr_wr_data = `VX_DCR_DATA_WIDTH'(dcrv_r);
|
||||
|
||||
endmodule
|
412
hw/rtl/afu/xrt/VX_afu_wrap.sv
Normal file
412
hw/rtl/afu/xrt/VX_afu_wrap.sv
Normal file
|
@ -0,0 +1,412 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "vortex_afu.vh"
|
||||
|
||||
module VX_afu_wrap #(
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = 16,
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = 512
|
||||
) (
|
||||
// System signals
|
||||
input wire ap_clk,
|
||||
input wire ap_rst_n,
|
||||
|
||||
// AXI4 master interface
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
|
||||
// AXI4-Lite slave interface
|
||||
input wire s_axi_ctrl_awvalid,
|
||||
output wire s_axi_ctrl_awready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
|
||||
input wire s_axi_ctrl_wvalid,
|
||||
output wire s_axi_ctrl_wready,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
output wire s_axi_ctrl_arready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
|
||||
output wire s_axi_ctrl_rvalid,
|
||||
input wire s_axi_ctrl_rready,
|
||||
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
|
||||
output wire [1:0] s_axi_ctrl_rresp,
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
input wire s_axi_ctrl_bready,
|
||||
output wire [1:0] s_axi_ctrl_bresp,
|
||||
|
||||
output wire interrupt
|
||||
);
|
||||
localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS;
|
||||
|
||||
localparam STATE_IDLE = 0;
|
||||
localparam STATE_RUN = 1;
|
||||
|
||||
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire m_axi_mem_rlast_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_rid_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [1:0] m_axi_mem_rresp_a [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
// convert memory interface to array
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
|
||||
wire clk = ap_clk;
|
||||
wire reset = ~ap_rst_n;
|
||||
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
|
||||
reg [15:0] vx_pending_writes;
|
||||
reg vx_busy_wait;
|
||||
reg vx_running;
|
||||
|
||||
wire vx_busy;
|
||||
|
||||
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
wire dcr_wr_valid;
|
||||
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
|
||||
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
|
||||
|
||||
reg state;
|
||||
|
||||
wire ap_reset;
|
||||
wire ap_start;
|
||||
wire ap_idle = ~vx_running;
|
||||
wire ap_done = ~(state == STATE_RUN || vx_pending_writes != 0);
|
||||
wire ap_ready = 1'b1;
|
||||
|
||||
`ifdef SCOPE
|
||||
wire scope_bus_in;
|
||||
wire scope_bus_out;
|
||||
wire scope_reset = reset;
|
||||
`endif
|
||||
|
||||
always @(posedge ap_clk) begin
|
||||
if (reset || ap_reset) begin
|
||||
state <= STATE_IDLE;
|
||||
vx_busy_wait <= 0;
|
||||
vx_running <= 0;
|
||||
end else begin
|
||||
case (state)
|
||||
STATE_IDLE: begin
|
||||
if (ap_start) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%d: STATE RUN\n", $time));
|
||||
`endif
|
||||
state <= STATE_RUN;
|
||||
vx_running <= 0;
|
||||
end
|
||||
end
|
||||
STATE_RUN: begin
|
||||
if (vx_running) begin
|
||||
if (vx_busy_wait) begin
|
||||
// wait until processor goes busy
|
||||
if (vx_busy) begin
|
||||
vx_busy_wait <= 0;
|
||||
end
|
||||
end else begin
|
||||
// wait until the processor is not busy
|
||||
if (~vx_busy) begin
|
||||
state <= STATE_IDLE;
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%d: AFU: End execution\n", $time));
|
||||
`TRACE(2, ("%d: STATE IDLE\n", $time));
|
||||
`endif
|
||||
end
|
||||
end
|
||||
end else begin
|
||||
// wait until the reset sequence is complete
|
||||
if (vx_reset_ctr == (`RESET_DELAY-1)) begin
|
||||
`ifdef DBG_TRACE_AFU
|
||||
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
|
||||
`endif
|
||||
vx_running <= 1;
|
||||
vx_busy_wait <= 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
reg m_axi_mem_wfire;
|
||||
reg m_axi_mem_bfire;
|
||||
|
||||
always @(*) begin
|
||||
m_axi_mem_wfire = 0;
|
||||
m_axi_mem_bfire = 0;
|
||||
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
|
||||
m_axi_mem_wfire |= m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i];
|
||||
m_axi_mem_bfire |= m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge ap_clk) begin
|
||||
if (reset || ap_reset) begin
|
||||
vx_pending_writes <= '0;
|
||||
end else begin
|
||||
if (m_axi_mem_wfire && ~m_axi_mem_bfire)
|
||||
vx_pending_writes <= vx_pending_writes + 1;
|
||||
if (~m_axi_mem_wfire && m_axi_mem_bfire)
|
||||
vx_pending_writes <= vx_pending_writes - 1;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge ap_clk) begin
|
||||
if (state == STATE_RUN) begin
|
||||
vx_reset_ctr <= vx_reset_ctr + 1;
|
||||
end else begin
|
||||
vx_reset_ctr <= '0;
|
||||
end
|
||||
end
|
||||
|
||||
VX_afu_ctrl #(
|
||||
.AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
|
||||
.AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
|
||||
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
|
||||
) afu_ctrl (
|
||||
.clk (ap_clk),
|
||||
.reset (reset || ap_reset),
|
||||
.clk_en (1'b1),
|
||||
|
||||
.s_axi_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_awready (s_axi_ctrl_awready),
|
||||
.s_axi_awaddr (s_axi_ctrl_awaddr),
|
||||
.s_axi_wvalid (s_axi_ctrl_wvalid),
|
||||
.s_axi_wready (s_axi_ctrl_wready),
|
||||
.s_axi_wdata (s_axi_ctrl_wdata),
|
||||
.s_axi_wstrb (s_axi_ctrl_wstrb),
|
||||
.s_axi_arvalid (s_axi_ctrl_arvalid),
|
||||
.s_axi_arready (s_axi_ctrl_arready),
|
||||
.s_axi_araddr (s_axi_ctrl_araddr),
|
||||
.s_axi_rvalid (s_axi_ctrl_rvalid),
|
||||
.s_axi_rready (s_axi_ctrl_rready),
|
||||
.s_axi_rdata (s_axi_ctrl_rdata),
|
||||
.s_axi_rresp (s_axi_ctrl_rresp),
|
||||
.s_axi_bvalid (s_axi_ctrl_bvalid),
|
||||
.s_axi_bready (s_axi_ctrl_bready),
|
||||
.s_axi_bresp (s_axi_ctrl_bresp),
|
||||
|
||||
.ap_reset (ap_reset),
|
||||
.ap_start (ap_start),
|
||||
.ap_done (ap_done),
|
||||
.ap_ready (ap_ready),
|
||||
.ap_idle (ap_idle),
|
||||
.interrupt (interrupt),
|
||||
|
||||
`ifdef SCOPE
|
||||
.scope_bus_in (scope_bus_out),
|
||||
.scope_bus_out (scope_bus_in),
|
||||
`endif
|
||||
|
||||
.mem_base (mem_base),
|
||||
|
||||
.dcr_wr_valid (dcr_wr_valid),
|
||||
.dcr_wr_addr (dcr_wr_addr),
|
||||
.dcr_wr_data (dcr_wr_data)
|
||||
);
|
||||
|
||||
wire [`XLEN-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
|
||||
wire [`XLEN-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
|
||||
|
||||
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
|
||||
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
|
||||
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
|
||||
end
|
||||
|
||||
`SCOPE_IO_SWITCH (2)
|
||||
|
||||
Vortex_axi #(
|
||||
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
|
||||
.AXI_ADDR_WIDTH (`XLEN),
|
||||
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
|
||||
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
|
||||
) vortex_axi (
|
||||
`SCOPE_IO_BIND (1)
|
||||
|
||||
.clk (ap_clk),
|
||||
.reset (reset || ap_reset || ~vx_running),
|
||||
|
||||
.m_axi_awvalid (m_axi_mem_awvalid_a),
|
||||
.m_axi_awready (m_axi_mem_awready_a),
|
||||
.m_axi_awaddr (m_axi_mem_awaddr_w),
|
||||
.m_axi_awid (m_axi_mem_awid_a),
|
||||
`UNUSED_PIN (m_axi_awlen),
|
||||
`UNUSED_PIN (m_axi_awsize),
|
||||
`UNUSED_PIN (m_axi_awburst),
|
||||
`UNUSED_PIN (m_axi_awlock),
|
||||
`UNUSED_PIN (m_axi_awcache),
|
||||
`UNUSED_PIN (m_axi_awprot),
|
||||
`UNUSED_PIN (m_axi_awqos),
|
||||
`UNUSED_PIN (m_axi_awregion),
|
||||
|
||||
.m_axi_wvalid (m_axi_mem_wvalid_a),
|
||||
.m_axi_wready (m_axi_mem_wready_a),
|
||||
.m_axi_wdata (m_axi_mem_wdata_a),
|
||||
.m_axi_wstrb (m_axi_mem_wstrb_a),
|
||||
.m_axi_wlast (m_axi_mem_wlast_a),
|
||||
|
||||
.m_axi_bvalid (m_axi_mem_bvalid_a),
|
||||
.m_axi_bready (m_axi_mem_bready_a),
|
||||
.m_axi_bid (m_axi_mem_bid_a),
|
||||
.m_axi_bresp (m_axi_mem_bresp_a),
|
||||
|
||||
.m_axi_arvalid (m_axi_mem_arvalid_a),
|
||||
.m_axi_arready (m_axi_mem_arready_a),
|
||||
.m_axi_araddr (m_axi_mem_araddr_w),
|
||||
.m_axi_arid (m_axi_mem_arid_a),
|
||||
.m_axi_arlen (m_axi_mem_arlen_a),
|
||||
`UNUSED_PIN (m_axi_arsize),
|
||||
`UNUSED_PIN (m_axi_arburst),
|
||||
`UNUSED_PIN (m_axi_arlock),
|
||||
`UNUSED_PIN (m_axi_arcache),
|
||||
`UNUSED_PIN (m_axi_arprot),
|
||||
`UNUSED_PIN (m_axi_arqos),
|
||||
`UNUSED_PIN (m_axi_arregion),
|
||||
|
||||
.m_axi_rvalid (m_axi_mem_rvalid_a),
|
||||
.m_axi_rready (m_axi_mem_rready_a),
|
||||
.m_axi_rdata (m_axi_mem_rdata_a),
|
||||
.m_axi_rlast (m_axi_mem_rlast_a),
|
||||
.m_axi_rid (m_axi_mem_rid_a),
|
||||
.m_axi_rresp (m_axi_mem_rresp_a),
|
||||
|
||||
.dcr_wr_valid (dcr_wr_valid),
|
||||
.dcr_wr_addr (dcr_wr_addr),
|
||||
.dcr_wr_data (dcr_wr_data),
|
||||
|
||||
.busy (vx_busy)
|
||||
);
|
||||
|
||||
// SCOPE //////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef DBG_SCOPE_AFU
|
||||
`ifdef SCOPE
|
||||
`define TRIGGERS { \
|
||||
reset, \
|
||||
ap_start, \
|
||||
ap_done, \
|
||||
ap_idle, \
|
||||
interrupt, \
|
||||
vx_busy_wait, \
|
||||
vx_busy, \
|
||||
vx_running \
|
||||
}
|
||||
|
||||
`define PROBES { \
|
||||
vx_pending_writes \
|
||||
}
|
||||
|
||||
VX_scope_tap #(
|
||||
.SCOPE_ID (0),
|
||||
.TRIGGERW ($bits(`TRIGGERS)),
|
||||
.PROBEW ($bits(`PROBES))
|
||||
) scope_tap (
|
||||
.clk(clk),
|
||||
.reset(scope_reset_w[0]),
|
||||
.start(1'b0),
|
||||
.stop(1'b0),
|
||||
.triggers(`TRIGGERS),
|
||||
.probes(`PROBES),
|
||||
.bus_in(scope_bus_in_w[0]),
|
||||
.bus_out(scope_bus_out_w[0])
|
||||
);
|
||||
`endif
|
||||
`ifdef CHIPSCOPE
|
||||
ila_afu ila_afu_inst (
|
||||
.clk (ap_clk),
|
||||
.probe0 ({
|
||||
ap_start,
|
||||
ap_done,
|
||||
ap_idle,
|
||||
interrupt
|
||||
}),
|
||||
.probe1 ({
|
||||
vx_pending_writes,
|
||||
vx_busy_wait,
|
||||
vx_busy,
|
||||
vx_running
|
||||
})
|
||||
);
|
||||
`endif
|
||||
`else
|
||||
`SCOPE_IO_UNUSED_W(0)
|
||||
`endif
|
||||
|
||||
`ifdef SIMULATION
|
||||
`ifndef VERILATOR
|
||||
// disable assertions until full reset
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] assert_delay_ctr;
|
||||
reg assert_enabled;
|
||||
initial begin
|
||||
$assertoff(0, vortex_axi);
|
||||
end
|
||||
always @(posedge ap_clk) begin
|
||||
if (reset) begin
|
||||
assert_delay_ctr <= '0;
|
||||
assert_enabled <= 0;
|
||||
end else begin
|
||||
if (~assert_enabled) begin
|
||||
if (assert_delay_ctr == (`RESET_DELAY-1)) begin
|
||||
assert_enabled <= 1;
|
||||
$asserton(0, vortex_axi); // enable assertions
|
||||
end else begin
|
||||
assert_delay_ctr <= assert_delay_ctr + 1;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_AFU
|
||||
always @(posedge ap_clk) begin
|
||||
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
|
||||
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
|
||||
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
|
||||
end
|
||||
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
|
||||
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%0h\n", $time, i, m_axi_mem_wdata_a[i]));
|
||||
end
|
||||
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
|
||||
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
|
||||
end
|
||||
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
|
||||
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
|
||||
end
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
85
hw/rtl/afu/xrt/vortex_afu.v
Normal file
85
hw/rtl/afu/xrt/vortex_afu.v
Normal file
|
@ -0,0 +1,85 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "vortex_afu.vh"
|
||||
|
||||
module vortex_afu #(
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
|
||||
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
|
||||
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
|
||||
) (
|
||||
// System signals
|
||||
input wire ap_clk,
|
||||
input wire ap_rst_n,
|
||||
|
||||
// AXI4 master interface
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
|
||||
|
||||
// AXI4-Lite slave interface
|
||||
input wire s_axi_ctrl_awvalid,
|
||||
output wire s_axi_ctrl_awready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
|
||||
input wire s_axi_ctrl_wvalid,
|
||||
output wire s_axi_ctrl_wready,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
|
||||
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
|
||||
input wire s_axi_ctrl_arvalid,
|
||||
output wire s_axi_ctrl_arready,
|
||||
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
|
||||
output wire s_axi_ctrl_rvalid,
|
||||
input wire s_axi_ctrl_rready,
|
||||
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
|
||||
output wire [1:0] s_axi_ctrl_rresp,
|
||||
output wire s_axi_ctrl_bvalid,
|
||||
input wire s_axi_ctrl_bready,
|
||||
output wire [1:0] s_axi_ctrl_bresp,
|
||||
|
||||
output wire interrupt
|
||||
);
|
||||
|
||||
VX_afu_wrap #(
|
||||
.C_S_AXI_CTRL_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
|
||||
.C_S_AXI_CTRL_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
|
||||
.C_M_AXI_MEM_ID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
|
||||
.C_M_AXI_MEM_ADDR_WIDTH (C_M_AXI_MEM_ADDR_WIDTH),
|
||||
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH)
|
||||
) afu_wrap (
|
||||
.ap_clk (ap_clk),
|
||||
.ap_rst_n (ap_rst_n),
|
||||
|
||||
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
|
||||
|
||||
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
|
||||
.s_axi_ctrl_awready (s_axi_ctrl_awready),
|
||||
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
|
||||
.s_axi_ctrl_wvalid (s_axi_ctrl_wvalid),
|
||||
.s_axi_ctrl_wready (s_axi_ctrl_wready),
|
||||
.s_axi_ctrl_wdata (s_axi_ctrl_wdata),
|
||||
.s_axi_ctrl_wstrb (s_axi_ctrl_wstrb),
|
||||
.s_axi_ctrl_arvalid (s_axi_ctrl_arvalid),
|
||||
.s_axi_ctrl_arready (s_axi_ctrl_arready),
|
||||
.s_axi_ctrl_araddr (s_axi_ctrl_araddr),
|
||||
.s_axi_ctrl_rvalid (s_axi_ctrl_rvalid),
|
||||
.s_axi_ctrl_rready (s_axi_ctrl_rready),
|
||||
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
|
||||
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
|
||||
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
|
||||
.s_axi_ctrl_bready (s_axi_ctrl_bready),
|
||||
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),
|
||||
|
||||
.interrupt (interrupt)
|
||||
);
|
||||
|
||||
endmodule
|
108
hw/rtl/afu/xrt/vortex_afu.vh
Normal file
108
hw/rtl/afu/xrt/vortex_afu.vh
Normal file
|
@ -0,0 +1,108 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`ifndef VORTEX_AFU_VH
|
||||
`define VORTEX_AFU_VH
|
||||
|
||||
`ifndef M_AXI_MEM_NUM_BANKS
|
||||
`define M_AXI_MEM_NUM_BANKS 1
|
||||
`endif
|
||||
|
||||
`ifndef M_AXI_MEM_ID_WIDTH
|
||||
`define M_AXI_MEM_ID_WIDTH 32
|
||||
`endif
|
||||
|
||||
`define GEN_AXI_MEM(i) \
|
||||
output wire m_axi_mem_``i``_awvalid, \
|
||||
input wire m_axi_mem_``i``_awready, \
|
||||
output wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_``i``_awaddr, \
|
||||
output wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_awid, \
|
||||
output wire [7:0] m_axi_mem_``i``_awlen, \
|
||||
output wire m_axi_mem_``i``_wvalid, \
|
||||
input wire m_axi_mem_``i``_wready, \
|
||||
output wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_``i``_wdata, \
|
||||
output wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_``i``_wstrb, \
|
||||
output wire m_axi_mem_``i``_wlast, \
|
||||
output wire m_axi_mem_``i``_arvalid, \
|
||||
input wire m_axi_mem_``i``_arready, \
|
||||
output wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_``i``_araddr, \
|
||||
output wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_arid, \
|
||||
output wire [7:0] m_axi_mem_``i``_arlen, \
|
||||
input wire m_axi_mem_``i``_rvalid, \
|
||||
output wire m_axi_mem_``i``_rready, \
|
||||
input wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_``i``_rdata, \
|
||||
input wire m_axi_mem_``i``_rlast, \
|
||||
input wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_rid, \
|
||||
input wire [1:0] m_axi_mem_``i``_rresp, \
|
||||
input wire m_axi_mem_``i``_bvalid, \
|
||||
output wire m_axi_mem_``i``_bready, \
|
||||
input wire [1:0] m_axi_mem_``i``_bresp, \
|
||||
input wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_``i``_bid
|
||||
|
||||
`define AXI_MEM_ARGS(i) \
|
||||
.m_axi_mem_``i``_awvalid(m_axi_mem_``i``_awvalid), \
|
||||
.m_axi_mem_``i``_awready(m_axi_mem_``i``_awready), \
|
||||
.m_axi_mem_``i``_awaddr(m_axi_mem_``i``_awaddr), \
|
||||
.m_axi_mem_``i``_awid(m_axi_mem_``i``_awid), \
|
||||
.m_axi_mem_``i``_awlen(m_axi_mem_``i``_awlen), \
|
||||
.m_axi_mem_``i``_wvalid(m_axi_mem_``i``_wvalid), \
|
||||
.m_axi_mem_``i``_wready(m_axi_mem_``i``_wready), \
|
||||
.m_axi_mem_``i``_wdata(m_axi_mem_``i``_wdata), \
|
||||
.m_axi_mem_``i``_wstrb(m_axi_mem_``i``_wstrb), \
|
||||
.m_axi_mem_``i``_wlast(m_axi_mem_``i``_wlast), \
|
||||
.m_axi_mem_``i``_arvalid(m_axi_mem_``i``_arvalid), \
|
||||
.m_axi_mem_``i``_arready(m_axi_mem_``i``_arready), \
|
||||
.m_axi_mem_``i``_araddr(m_axi_mem_``i``_araddr), \
|
||||
.m_axi_mem_``i``_arid(m_axi_mem_``i``_arid), \
|
||||
.m_axi_mem_``i``_arlen(m_axi_mem_``i``_arlen), \
|
||||
.m_axi_mem_``i``_rvalid(m_axi_mem_``i``_rvalid), \
|
||||
.m_axi_mem_``i``_rready(m_axi_mem_``i``_rready), \
|
||||
.m_axi_mem_``i``_rdata(m_axi_mem_``i``_rdata), \
|
||||
.m_axi_mem_``i``_rlast(m_axi_mem_``i``_rlast), \
|
||||
.m_axi_mem_``i``_rid(m_axi_mem_``i``_rid), \
|
||||
.m_axi_mem_``i``_rresp(m_axi_mem_``i``_rresp), \
|
||||
.m_axi_mem_``i``_bvalid(m_axi_mem_``i``_bvalid), \
|
||||
.m_axi_mem_``i``_bready(m_axi_mem_``i``_bready), \
|
||||
.m_axi_mem_``i``_bresp(m_axi_mem_``i``_bresp), \
|
||||
.m_axi_mem_``i``_bid(m_axi_mem_``i``_bid)
|
||||
|
||||
`define AXI_MEM_TO_ARRAY(i) \
|
||||
assign m_axi_mem_``i``_awvalid = m_axi_mem_awvalid_a[i]; \
|
||||
assign m_axi_mem_awready_a[i] = m_axi_mem_``i``_awready; \
|
||||
assign m_axi_mem_``i``_awaddr = m_axi_mem_awaddr_a[i]; \
|
||||
assign m_axi_mem_``i``_awid = m_axi_mem_awid_a[i]; \
|
||||
assign m_axi_mem_``i``_awlen = m_axi_mem_awlen_a[i]; \
|
||||
assign m_axi_mem_``i``_wvalid = m_axi_mem_wvalid_a[i]; \
|
||||
assign m_axi_mem_wready_a[i] = m_axi_mem_``i``_wready; \
|
||||
assign m_axi_mem_``i``_wdata = m_axi_mem_wdata_a[i]; \
|
||||
assign m_axi_mem_``i``_wstrb = m_axi_mem_wstrb_a[i]; \
|
||||
assign m_axi_mem_``i``_wlast = m_axi_mem_wlast_a[i]; \
|
||||
assign m_axi_mem_``i``_arvalid = m_axi_mem_arvalid_a[i]; \
|
||||
assign m_axi_mem_arready_a[i] = m_axi_mem_``i``_arready; \
|
||||
assign m_axi_mem_``i``_araddr = m_axi_mem_araddr_a[i]; \
|
||||
assign m_axi_mem_``i``_arid = m_axi_mem_arid_a[i]; \
|
||||
assign m_axi_mem_``i``_arlen = m_axi_mem_arlen_a[i]; \
|
||||
assign m_axi_mem_rvalid_a[i] = m_axi_mem_``i``_rvalid; \
|
||||
assign m_axi_mem_``i``_rready = m_axi_mem_rready_a[i]; \
|
||||
assign m_axi_mem_rdata_a[i] = m_axi_mem_``i``_rdata; \
|
||||
assign m_axi_mem_rlast_a[i] = m_axi_mem_``i``_rlast; \
|
||||
assign m_axi_mem_rid_a[i] = m_axi_mem_``i``_rid; \
|
||||
assign m_axi_mem_rresp_a[i] = m_axi_mem_``i``_rresp; \
|
||||
assign m_axi_mem_bvalid_a[i] = m_axi_mem_``i``_bvalid; \
|
||||
assign m_axi_mem_``i``_bready = m_axi_mem_bready_a[i]; \
|
||||
assign m_axi_mem_bresp_a[i] = m_axi_mem_``i``_bresp; \
|
||||
assign m_axi_mem_bid_a[i] = m_axi_mem_``i``_bid
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`endif // VORTEX_AFU_VH
|
511
hw/rtl/cache/VX_bank.sv
vendored
511
hw/rtl/cache/VX_bank.sv
vendored
|
@ -1,511 +0,0 @@
|
|||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_bank #(
|
||||
parameter CACHE_ID = 0,
|
||||
parameter BANK_ID = 0,
|
||||
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter CACHE_LINE_SIZE = 1,
|
||||
// Number of bankS
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of ports per banks
|
||||
parameter NUM_PORTS = 1,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 1,
|
||||
|
||||
// Core Request Queue Size
|
||||
parameter CREQ_SIZE = 1,
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 1,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 1,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 1,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
||||
// core request tag size
|
||||
parameter CORE_TAG_WIDTH = 1,
|
||||
|
||||
// bank offset from beginning of index range
|
||||
parameter BANK_ADDR_OFFSET = 0,
|
||||
|
||||
parameter MSHR_ADDR_WIDTH = $clog2(MSHR_SIZE),
|
||||
parameter WORD_SELECT_BITS = `UP(`WORD_SELECT_BITS)
|
||||
) (
|
||||
`SCOPE_IO_VX_bank
|
||||
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire perf_read_misses,
|
||||
output wire perf_write_misses,
|
||||
output wire perf_mshr_stalls,
|
||||
`endif
|
||||
|
||||
// Core Request
|
||||
input wire core_req_valid,
|
||||
input wire [NUM_PORTS-1:0] core_req_pmask,
|
||||
input wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] core_req_wsel,
|
||||
input wire [NUM_PORTS-1:0][WORD_SIZE-1:0] core_req_byteen,
|
||||
input wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_req_data,
|
||||
input wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_req_tid,
|
||||
input wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_req_tag,
|
||||
input wire core_req_rw,
|
||||
input wire [`LINE_ADDR_WIDTH-1:0] core_req_addr,
|
||||
output wire core_req_ready,
|
||||
|
||||
// Core Response
|
||||
output wire core_rsp_valid,
|
||||
output wire [NUM_PORTS-1:0] core_rsp_pmask,
|
||||
output wire [NUM_PORTS-1:0][`REQS_BITS-1:0] core_rsp_tid,
|
||||
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] core_rsp_data,
|
||||
output wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [NUM_PORTS-1:0] mem_req_pmask,
|
||||
output wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen,
|
||||
output wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel,
|
||||
output wire [`LINE_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
|
||||
output wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
|
||||
input wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// flush
|
||||
input wire flush_enable,
|
||||
input wire [`LINE_SELECT_BITS-1:0] flush_addr
|
||||
);
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
wire [`DBG_CACHE_REQ_IDW-1:0] req_id_sel, req_id_st0, req_id_st1;
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
wire [NUM_PORTS-1:0] creq_pmask;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] creq_wsel;
|
||||
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] creq_byteen;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data;
|
||||
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] creq_tid;
|
||||
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] creq_tag;
|
||||
wire creq_rw;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] creq_addr;
|
||||
|
||||
wire creq_valid, creq_ready;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + `LINE_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SELECT_BITS + WORD_SIZE + `WORD_WIDTH + `REQS_BITS + CORE_TAG_WIDTH)),
|
||||
.SIZE (CREQ_SIZE)
|
||||
) core_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.ready_in (core_req_ready),
|
||||
.valid_in (core_req_valid),
|
||||
.data_in ({core_req_rw, core_req_addr, core_req_pmask, core_req_wsel, core_req_byteen, core_req_data, core_req_tid, core_req_tag}),
|
||||
.data_out ({creq_rw, creq_addr, creq_pmask, creq_wsel, creq_byteen, creq_data, creq_tid, creq_tag}),
|
||||
.ready_out (creq_ready),
|
||||
.valid_out (creq_valid)
|
||||
);
|
||||
|
||||
wire mreq_alm_full;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
|
||||
wire crsq_valid, crsq_ready;
|
||||
wire crsq_stall;
|
||||
|
||||
wire mshr_valid;
|
||||
wire mshr_ready;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id;
|
||||
wire mshr_alm_full;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_dequeue_id;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] mshr_addr;
|
||||
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] mshr_tag;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mshr_wsel;
|
||||
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] mshr_tid;
|
||||
wire [NUM_PORTS-1:0] mshr_pmask;
|
||||
|
||||
wire [`LINE_ADDR_WIDTH-1:0] addr_st0, addr_st1;
|
||||
wire is_read_st0, is_read_st1;
|
||||
wire is_write_st0, is_write_st1;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] wsel_st0, wsel_st1;
|
||||
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] byteen_st0, byteen_st1;
|
||||
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] req_tid_st0, req_tid_st1;
|
||||
wire [NUM_PORTS-1:0] pmask_st0, pmask_st1;
|
||||
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] tag_st0, tag_st1;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] rdata_st1;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] wdata_st0, wdata_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1;
|
||||
wire valid_st0, valid_st1;
|
||||
wire is_fill_st0, is_fill_st1;
|
||||
wire is_mshr_st0, is_mshr_st1;
|
||||
wire miss_st0, miss_st1;
|
||||
wire is_flush_st0;
|
||||
wire mshr_pending_st0, mshr_pending_st1;
|
||||
|
||||
// prevent read-during-write hazard when accessing tags/data block RAMs
|
||||
wire rdw_fill_hazard = valid_st0 && is_fill_st0;
|
||||
wire rdw_write_hazard = valid_st0 && is_write_st0 && ~creq_rw;
|
||||
|
||||
// determine which queue to pop next in priority order
|
||||
wire mshr_grant = !flush_enable;
|
||||
wire mshr_enable = mshr_grant && mshr_valid;
|
||||
|
||||
wire mrsq_grant = !flush_enable && !mshr_enable;
|
||||
wire mrsq_enable = mrsq_grant && mem_rsp_valid;
|
||||
wire creq_grant = !flush_enable && !mshr_enable && !mrsq_enable;
|
||||
|
||||
wire creq_enable = creq_grant && creq_valid;
|
||||
|
||||
assign mshr_ready = mshr_grant
|
||||
&& !rdw_fill_hazard // prevent read-during-write hazard
|
||||
&& !crsq_stall; // ensure core_rsp_queue not full
|
||||
|
||||
|
||||
assign mem_rsp_ready = mrsq_grant
|
||||
&& !crsq_stall; // ensure core_rsp_queue not full
|
||||
|
||||
assign creq_ready = creq_grant
|
||||
&& !rdw_write_hazard // prevent read-during-write hazard
|
||||
&& !mreq_alm_full // ensure mem_req_queue not full
|
||||
&& !mshr_alm_full // ensure mshr not full
|
||||
&& !crsq_stall; // ensure core_rsp_queue not full
|
||||
|
||||
wire flush_fire = flush_enable;
|
||||
wire mshr_fire = mshr_valid && mshr_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
wire creq_fire = creq_valid && creq_ready;
|
||||
|
||||
assign req_id_sel = mshr_enable ? mshr_tag[0][`CACHE_REQ_ID_RNG] : creq_tag[0][`CACHE_REQ_ID_RNG];
|
||||
|
||||
wire [`CACHE_LINE_WIDTH-1:0] wdata_sel;
|
||||
assign wdata_sel[(NUM_PORTS * `WORD_WIDTH)-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[(NUM_PORTS * `WORD_WIDTH)-1:0] : creq_data;
|
||||
for (genvar i = NUM_PORTS * `WORD_WIDTH; i < `CACHE_LINE_WIDTH; ++i) begin
|
||||
assign wdata_sel[i] = mem_rsp_data[i];
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!crsq_stall),
|
||||
.data_in ({
|
||||
flush_fire || mshr_fire || mem_rsp_fire || creq_fire,
|
||||
flush_enable,
|
||||
mshr_enable,
|
||||
mrsq_enable,
|
||||
creq_enable && ~creq_rw,
|
||||
creq_enable && creq_rw,
|
||||
flush_enable ? `LINE_ADDR_WIDTH'(flush_addr) : (mshr_valid ? mshr_addr : (mem_rsp_valid ? mem_rsp_addr : creq_addr)),
|
||||
wdata_sel,
|
||||
mshr_valid ? mshr_wsel : creq_wsel,
|
||||
creq_byteen,
|
||||
mshr_valid ? mshr_tid : creq_tid,
|
||||
mshr_valid ? mshr_pmask : creq_pmask,
|
||||
mshr_valid ? mshr_tag : creq_tag,
|
||||
mshr_valid ? mshr_dequeue_id : mem_rsp_id
|
||||
}),
|
||||
.data_out ({valid_st0, is_flush_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_st0})
|
||||
);
|
||||
|
||||
assign req_id_st0 = tag_st0[0][`CACHE_REQ_ID_RNG];
|
||||
|
||||
wire do_fill_st0 = valid_st0 && is_fill_st0;
|
||||
wire do_flush_st0 = valid_st0 && is_flush_st0;
|
||||
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_flush_st0);
|
||||
|
||||
wire tag_match_st0;
|
||||
|
||||
VX_tag_access #(
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_ID (CACHE_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.BANK_ADDR_OFFSET (BANK_ADDR_OFFSET)
|
||||
) tag_access (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.req_id (req_id_st0),
|
||||
|
||||
.stall (crsq_stall),
|
||||
|
||||
// read/Fill
|
||||
.lookup (do_lookup_st0),
|
||||
.addr (addr_st0),
|
||||
.fill (do_fill_st0),
|
||||
.flush (do_flush_st0),
|
||||
.tag_match (tag_match_st0)
|
||||
);
|
||||
|
||||
// we have a core request hit
|
||||
assign miss_st0 = (is_read_st0 || is_write_st0) && ~tag_match_st0;
|
||||
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_a_st0 = (is_read_st0 || is_write_st0) ? mshr_alloc_id : mshr_id_st0;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `LINE_ADDR_WIDTH + `CACHE_LINE_WIDTH + NUM_PORTS * (WORD_SELECT_BITS + WORD_SIZE + `REQS_BITS + 1 + CORE_TAG_WIDTH) + MSHR_ADDR_WIDTH + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!crsq_stall),
|
||||
.data_in ({valid_st0, is_mshr_st0, is_fill_st0, is_read_st0, is_write_st0, miss_st0, addr_st0, wdata_st0, wsel_st0, byteen_st0, req_tid_st0, pmask_st0, tag_st0, mshr_id_a_st0, mshr_pending_st0}),
|
||||
.data_out ({valid_st1, is_mshr_st1, is_fill_st1, is_read_st1, is_write_st1, miss_st1, addr_st1, wdata_st1, wsel_st1, byteen_st1, req_tid_st1, pmask_st1, tag_st1, mshr_id_st1, mshr_pending_st1})
|
||||
);
|
||||
|
||||
assign req_id_st1 = tag_st1[0][`CACHE_REQ_ID_RNG];
|
||||
|
||||
wire do_read_st0 = valid_st0 && is_read_st0;
|
||||
wire do_read_st1 = valid_st1 && is_read_st1;
|
||||
wire do_fill_st1 = valid_st1 && is_fill_st1;
|
||||
wire do_write_st1 = valid_st1 && is_write_st1;
|
||||
wire do_mshr_st1 = valid_st1 && is_mshr_st1;
|
||||
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] creq_data_st1 = wdata_st1[0 +: NUM_PORTS * `WORD_WIDTH];
|
||||
`UNUSED_VAR (wdata_st1)
|
||||
|
||||
VX_data_access #(
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_ID (CACHE_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.CACHE_LINE_SIZE(CACHE_LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_PORTS (NUM_PORTS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE)
|
||||
) data_access (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.req_id (req_id_st1),
|
||||
|
||||
.stall (crsq_stall),
|
||||
|
||||
.read (do_read_st1 || do_mshr_st1),
|
||||
.fill (do_fill_st1),
|
||||
.write (do_write_st1 && !miss_st1),
|
||||
.addr (addr_st1),
|
||||
.wsel (wsel_st1),
|
||||
.pmask (pmask_st1),
|
||||
.byteen (byteen_st1),
|
||||
.fill_data (wdata_st1),
|
||||
.write_data (creq_data_st1),
|
||||
.read_data (rdata_st1)
|
||||
);
|
||||
|
||||
wire mshr_allocate = do_read_st0 && !crsq_stall;
|
||||
wire mshr_replay = do_fill_st0 && !crsq_stall;
|
||||
wire mshr_lookup = mshr_allocate;
|
||||
wire mshr_release = do_read_st1 && !miss_st1 && !crsq_stall;
|
||||
|
||||
VX_pending_size #(
|
||||
.SIZE (MSHR_SIZE)
|
||||
) mshr_pending_size (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (creq_fire && ~creq_rw),
|
||||
.decr (mshr_fire || mshr_release),
|
||||
.full (mshr_alm_full),
|
||||
`UNUSED_PIN (size),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
|
||||
VX_miss_resrv #(
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_ID (CACHE_ID),
|
||||
.CACHE_LINE_SIZE (CACHE_LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_PORTS (NUM_PORTS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.CORE_TAG_WIDTH (CORE_TAG_WIDTH)
|
||||
) miss_resrv (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.deq_req_id (req_id_sel),
|
||||
.lkp_req_id (req_id_st0),
|
||||
.rel_req_id (req_id_st1),
|
||||
|
||||
// allocate
|
||||
.allocate_valid (mshr_allocate),
|
||||
.allocate_addr (addr_st0),
|
||||
.allocate_data ({wsel_st0, tag_st0, req_tid_st0, pmask_st0}),
|
||||
.allocate_id (mshr_alloc_id),
|
||||
`UNUSED_PIN (allocate_ready),
|
||||
|
||||
// lookup
|
||||
.lookup_valid (mshr_lookup),
|
||||
.lookup_replay (mshr_replay),
|
||||
.lookup_id (mshr_alloc_id),
|
||||
.lookup_addr (addr_st0),
|
||||
.lookup_match (mshr_pending_st0),
|
||||
|
||||
// fill
|
||||
.fill_valid (mem_rsp_fire),
|
||||
.fill_id (mem_rsp_id),
|
||||
.fill_addr (mem_rsp_addr),
|
||||
|
||||
// dequeue
|
||||
.dequeue_valid (mshr_valid),
|
||||
.dequeue_id (mshr_dequeue_id),
|
||||
.dequeue_addr (mshr_addr),
|
||||
.dequeue_data ({mshr_wsel, mshr_tag, mshr_tid, mshr_pmask}),
|
||||
.dequeue_ready (mshr_ready),
|
||||
|
||||
// release
|
||||
.release_valid (mshr_release),
|
||||
.release_id (mshr_id_st1)
|
||||
);
|
||||
|
||||
// Enqueue core response
|
||||
|
||||
wire [NUM_PORTS-1:0] crsq_pmask;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] crsq_data;
|
||||
wire [NUM_PORTS-1:0][`REQS_BITS-1:0] crsq_tid;
|
||||
wire [NUM_PORTS-1:0][CORE_TAG_WIDTH-1:0] crsq_tag;
|
||||
|
||||
assign crsq_valid = (do_read_st1 && !miss_st1)
|
||||
|| do_mshr_st1;
|
||||
|
||||
assign crsq_stall = crsq_valid && !crsq_ready;
|
||||
|
||||
assign crsq_pmask = pmask_st1;
|
||||
assign crsq_tid = req_tid_st1;
|
||||
assign crsq_data = rdata_st1;
|
||||
assign crsq_tag = tag_st1;
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (NUM_PORTS * (CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS)),
|
||||
.SIZE (CRSQ_SIZE),
|
||||
.OUT_REG (1)
|
||||
) core_rsp_req (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (crsq_valid),
|
||||
.data_in ({crsq_tag, crsq_pmask, crsq_data, crsq_tid}),
|
||||
.ready_in (crsq_ready),
|
||||
.valid_out (core_rsp_valid),
|
||||
.data_out ({core_rsp_tag, core_rsp_pmask, core_rsp_data, core_rsp_tid}),
|
||||
.ready_out (core_rsp_ready)
|
||||
);
|
||||
|
||||
// Enqueue memory request
|
||||
|
||||
wire mreq_push, mreq_pop, mreq_empty;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mreq_data;
|
||||
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mreq_byteen;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mreq_wsel;
|
||||
wire [NUM_PORTS-1:0] mreq_pmask;
|
||||
wire [`LINE_ADDR_WIDTH-1:0] mreq_addr;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mreq_id;
|
||||
wire mreq_rw;
|
||||
|
||||
assign mreq_push = (do_read_st1 && miss_st1 && !mshr_pending_st1)
|
||||
|| do_write_st1;
|
||||
|
||||
assign mreq_pop = mem_req_valid && mem_req_ready;
|
||||
|
||||
assign mreq_rw = WRITE_ENABLE && is_write_st1;
|
||||
assign mreq_addr = addr_st1;
|
||||
assign mreq_id = mshr_id_st1;
|
||||
assign mreq_pmask= pmask_st1;
|
||||
assign mreq_wsel = wsel_st1;
|
||||
assign mreq_byteen = byteen_st1;
|
||||
assign mreq_data = creq_data_st1;
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)),
|
||||
.SIZE (MREQ_SIZE),
|
||||
.ALM_FULL (MREQ_SIZE-2),
|
||||
.OUT_REG (1 == NUM_BANKS)
|
||||
) mem_req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (mreq_push),
|
||||
.pop (mreq_pop),
|
||||
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_pmask, mreq_byteen, mreq_wsel, mreq_data}),
|
||||
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_pmask, mem_req_byteen, mem_req_wsel, mem_req_data}),
|
||||
.empty (mreq_empty),
|
||||
.alm_full (mreq_alm_full),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
assign mem_req_valid = !mreq_empty;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`SCOPE_ASSIGN (valid_st0, valid_st0);
|
||||
`SCOPE_ASSIGN (valid_st1, valid_st1);
|
||||
`SCOPE_ASSIGN (is_fill_st0, is_fill_st0);
|
||||
`SCOPE_ASSIGN (is_mshr_st0, is_mshr_st0);
|
||||
`SCOPE_ASSIGN (miss_st0, miss_st0);
|
||||
`SCOPE_ASSIGN (crsq_stall, crsq_stall);
|
||||
`SCOPE_ASSIGN (mreq_alm_full, mreq_alm_full);
|
||||
`SCOPE_ASSIGN (mshr_alm_full, mshr_alm_full);
|
||||
`SCOPE_ASSIGN (addr_st0, `LINE_TO_BYTE_ADDR(addr_st0, BANK_ID));
|
||||
`SCOPE_ASSIGN (addr_st1, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID));
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_read_misses = do_read_st1 && miss_st1;
|
||||
assign perf_write_misses = do_write_st1 && miss_st1;
|
||||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CACHE_BANK
|
||||
wire crsq_fire = crsq_valid && crsq_ready;
|
||||
wire pipeline_stall = (mshr_valid || mem_rsp_valid || creq_valid)
|
||||
&& ~(mshr_fire || mem_rsp_fire || creq_fire);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (pipeline_stall) begin
|
||||
dpi_trace("%d: *** cache%0d:%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, CACHE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full);
|
||||
end
|
||||
if (flush_enable) begin
|
||||
dpi_trace("%d: cache%0d:%0d flush: addr=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(flush_addr, BANK_ID));
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
dpi_trace("%d: cache%0d:%0d fill-rsp: addr=%0h, id=%0d, data=%0h\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data);
|
||||
end
|
||||
if (mshr_fire) begin
|
||||
dpi_trace("%d: cache%0d:%0d mshr-pop: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mshr_addr, BANK_ID), mshr_tag, mshr_pmask, mshr_tid, req_id_sel);
|
||||
end
|
||||
if (creq_fire) begin
|
||||
if (creq_rw)
|
||||
dpi_trace("%d: cache%0d:%0d core-wr-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d, byteen=%b, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, creq_byteen, creq_data, req_id_sel);
|
||||
else
|
||||
dpi_trace("%d: cache%0d:%0d core-rd-req: addr=%0h, tag=%0h, pmask=%b, tid=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(creq_addr, BANK_ID), creq_tag, creq_pmask, creq_tid, req_id_sel);
|
||||
end
|
||||
if (crsq_fire) begin
|
||||
dpi_trace("%d: cache%0d:%0d core-rsp: addr=%0h, tag=%0h, pmask=%b, tid=%0d, data=%0h (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_pmask, crsq_tid, crsq_data, req_id_st1);
|
||||
end
|
||||
if (mreq_push) begin
|
||||
if (is_write_st1)
|
||||
dpi_trace("%d: cache%0d:%0d writeback: addr=%0h, data=%0h, byteen=%b (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_data, mreq_byteen, req_id_st1);
|
||||
else
|
||||
dpi_trace("%d: cache%0d:%0d fill-req: addr=%0h, id=%0d (#%0d)\n", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(mreq_addr, BANK_ID), mreq_id, req_id_st1);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
1047
hw/rtl/cache/VX_cache.sv
vendored
1047
hw/rtl/cache/VX_cache.sv
vendored
File diff suppressed because it is too large
Load diff
549
hw/rtl/cache/VX_cache_bank.sv
vendored
Normal file
549
hw/rtl/cache/VX_cache_bank.sv
vendored
Normal file
|
@ -0,0 +1,549 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_bank #(
|
||||
parameter `STRING INSTANCE_ID= "",
|
||||
parameter BANK_ID = 0,
|
||||
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 1,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1024,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 16,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 1,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 1,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 1,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 1,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_REG = 0,
|
||||
|
||||
// Memory request output register
|
||||
parameter MEM_OUT_REG = 0,
|
||||
|
||||
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
|
||||
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
|
||||
parameter WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire perf_read_misses,
|
||||
output wire perf_write_misses,
|
||||
output wire perf_mshr_stalls,
|
||||
`endif
|
||||
|
||||
// Core Request
|
||||
input wire core_req_valid,
|
||||
input wire [`CS_LINE_ADDR_WIDTH-1:0] core_req_addr,
|
||||
input wire core_req_rw,
|
||||
input wire [WORD_SEL_WIDTH-1:0] core_req_wsel,
|
||||
input wire [WORD_SIZE-1:0] core_req_byteen,
|
||||
input wire [`CS_WORD_WIDTH-1:0] core_req_data,
|
||||
input wire [TAG_WIDTH-1:0] core_req_tag,
|
||||
input wire [REQ_SEL_WIDTH-1:0] core_req_idx,
|
||||
output wire core_req_ready,
|
||||
|
||||
// Core Response
|
||||
output wire core_rsp_valid,
|
||||
output wire [`CS_WORD_WIDTH-1:0] core_rsp_data,
|
||||
output wire [TAG_WIDTH-1:0] core_rsp_tag,
|
||||
output wire [REQ_SEL_WIDTH-1:0] core_rsp_idx,
|
||||
input wire core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire mem_req_rw,
|
||||
output wire [WORD_SEL_WIDTH-1:0] mem_req_wsel,
|
||||
output wire [WORD_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`CS_WORD_WIDTH-1:0] mem_req_data,
|
||||
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
|
||||
output wire mem_rsp_ready,
|
||||
|
||||
// initialization
|
||||
input wire init_enable,
|
||||
input wire [`CS_LINE_SEL_BITS-1:0] init_line_sel
|
||||
);
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
wire [`UP(UUID_WIDTH)-1:0] req_uuid_sel, req_uuid_st0, req_uuid_st1;
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
wire crsq_stall;
|
||||
wire mshr_alm_full;
|
||||
wire mreq_alm_full;
|
||||
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_rsp_addr;
|
||||
|
||||
wire replay_valid;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] replay_addr;
|
||||
wire replay_rw;
|
||||
wire [WORD_SEL_WIDTH-1:0] replay_wsel;
|
||||
wire [WORD_SIZE-1:0] replay_byteen;
|
||||
wire [`CS_WORD_WIDTH-1:0] replay_data;
|
||||
wire [TAG_WIDTH-1:0] replay_tag;
|
||||
wire [REQ_SEL_WIDTH-1:0] replay_idx;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
|
||||
wire replay_ready;
|
||||
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
|
||||
wire rw_st0, rw_st1;
|
||||
wire [WORD_SEL_WIDTH-1:0] wsel_st0, wsel_st1;
|
||||
wire [WORD_SIZE-1:0] byteen_st0, byteen_st1;
|
||||
wire [REQ_SEL_WIDTH-1:0] req_idx_st0, req_idx_st1;
|
||||
wire [TAG_WIDTH-1:0] tag_st0, tag_st1;
|
||||
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
|
||||
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
|
||||
wire valid_sel, valid_st0, valid_st1;
|
||||
wire is_init_st0;
|
||||
wire is_creq_st0, is_creq_st1;
|
||||
wire is_fill_st0, is_fill_st1;
|
||||
wire is_replay_st0, is_replay_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_tail_st0, mshr_tail_st1;
|
||||
wire mshr_pending_st0, mshr_pending_st1;
|
||||
|
||||
wire rdw_hazard_st0;
|
||||
reg rdw_hazard_st1;
|
||||
|
||||
wire pipe_stall = crsq_stall || rdw_hazard_st1;
|
||||
|
||||
// inputs arbitration:
|
||||
// mshr replay has highest priority to maximize utilization since there is no miss.
|
||||
// handle memory responses next to prevent deadlock with potential memory request from a miss.
|
||||
wire replay_grant = ~init_enable;
|
||||
wire replay_enable = replay_grant && replay_valid;
|
||||
|
||||
wire fill_grant = ~init_enable && ~replay_enable;
|
||||
wire fill_enable = fill_grant && mem_rsp_valid;
|
||||
|
||||
wire creq_grant = ~init_enable && ~replay_enable && ~fill_enable;
|
||||
wire creq_enable = creq_grant && core_req_valid;
|
||||
|
||||
assign replay_ready = replay_grant
|
||||
&& ~rdw_hazard_st0
|
||||
&& ~pipe_stall;
|
||||
|
||||
assign mem_rsp_ready = fill_grant
|
||||
&& ~pipe_stall;
|
||||
|
||||
assign core_req_ready = creq_grant
|
||||
&& ~mreq_alm_full
|
||||
&& ~mshr_alm_full
|
||||
&& ~pipe_stall;
|
||||
|
||||
wire init_fire = init_enable;
|
||||
wire replay_fire = replay_valid && replay_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
wire core_req_fire = core_req_valid && core_req_ready;
|
||||
|
||||
wire [TAG_WIDTH-1:0] mshr_creq_tag = replay_enable ? replay_tag : core_req_tag;
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign req_uuid_sel = mshr_creq_tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign req_uuid_sel = 0;
|
||||
end
|
||||
|
||||
`UNUSED_VAR (mshr_creq_tag)
|
||||
|
||||
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || core_req_fire;
|
||||
|
||||
assign addr_sel = init_enable ? `CS_LINE_ADDR_WIDTH'(init_line_sel) :
|
||||
(replay_valid ? replay_addr :
|
||||
(mem_rsp_valid ? mem_rsp_addr : core_req_addr));
|
||||
|
||||
assign data_sel[`CS_WORD_WIDTH-1:0] = (mem_rsp_valid || !WRITE_ENABLE) ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : (replay_valid ? replay_data : core_req_data);
|
||||
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
|
||||
assign data_sel[i] = mem_rsp_data[i];
|
||||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~pipe_stall),
|
||||
.data_in ({
|
||||
valid_sel,
|
||||
init_enable,
|
||||
replay_enable,
|
||||
fill_enable,
|
||||
creq_enable,
|
||||
addr_sel,
|
||||
data_sel,
|
||||
replay_valid ? replay_rw : core_req_rw,
|
||||
replay_valid ? replay_byteen : core_req_byteen,
|
||||
replay_valid ? replay_wsel : core_req_wsel,
|
||||
replay_valid ? replay_idx : core_req_idx,
|
||||
replay_valid ? replay_tag : core_req_tag,
|
||||
replay_id
|
||||
}),
|
||||
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_creq_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
|
||||
);
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign req_uuid_st0 = 0;
|
||||
end
|
||||
|
||||
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
|
||||
wire do_fill_st0 = valid_st0 && is_fill_st0;
|
||||
wire do_init_st0 = valid_st0 && is_init_st0;
|
||||
wire do_lookup_st0 = valid_st0 && ~(is_fill_st0 || is_init_st0);
|
||||
|
||||
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
|
||||
|
||||
wire [NUM_WAYS-1:0] tag_matches_st0, tag_matches_st1;
|
||||
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
|
||||
|
||||
`RESET_RELAY (tag_reset, reset);
|
||||
|
||||
VX_cache_tags #(
|
||||
.INSTANCE_ID(INSTANCE_ID),
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.UUID_WIDTH (UUID_WIDTH)
|
||||
) cache_tags (
|
||||
.clk (clk),
|
||||
.reset (tag_reset),
|
||||
|
||||
.req_uuid (req_uuid_st0),
|
||||
|
||||
.stall (pipe_stall),
|
||||
|
||||
// read/Fill
|
||||
.lookup (do_lookup_st0),
|
||||
.line_addr (addr_st0),
|
||||
.fill (do_fill_st0),
|
||||
.init (do_init_st0),
|
||||
.way_sel (way_sel_st0),
|
||||
.tag_matches(tag_matches_st0)
|
||||
);
|
||||
|
||||
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + NUM_WAYS + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~pipe_stall),
|
||||
.data_in ({valid_st0, is_replay_st0, is_fill_st0, is_creq_st0, rw_st0, addr_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_tail_st0, tag_matches_st0, way_sel_st0, mshr_pending_st0}),
|
||||
.data_out ({valid_st1, is_replay_st1, is_fill_st1, is_creq_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_tail_st1, tag_matches_st1, way_sel_st1, mshr_pending_st1})
|
||||
);
|
||||
|
||||
// we have a tag hit
|
||||
wire is_hit_st1 = (| tag_matches_st1);
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign req_uuid_st1 = 0;
|
||||
end
|
||||
|
||||
wire do_creq_rd_st1 = valid_st1 && is_creq_st1 && ~rw_st1;
|
||||
wire do_creq_wr_st1 = valid_st1 && is_creq_st1 && rw_st1;
|
||||
wire do_fill_st1 = valid_st1 && is_fill_st1;
|
||||
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
|
||||
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
|
||||
|
||||
wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1;
|
||||
wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1;
|
||||
|
||||
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
|
||||
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
|
||||
|
||||
`UNUSED_VAR (do_write_miss_st1)
|
||||
|
||||
// ensure mshr replay always get a hit
|
||||
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("runtime error: invalid mshr replay"));
|
||||
|
||||
// detect BRAM's read-during-write hazard
|
||||
assign rdw_hazard_st0 = do_fill_st0; // after a fill
|
||||
always @(posedge clk) begin
|
||||
rdw_hazard_st1 <= (do_creq_rd_st0 && do_write_hit_st1 && (addr_st0 == addr_st1))
|
||||
&& ~rdw_hazard_st1; // after a write to same address
|
||||
end
|
||||
|
||||
wire [`CS_WORD_WIDTH-1:0] write_data_st1 = data_st1[`CS_WORD_WIDTH-1:0];
|
||||
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
|
||||
|
||||
`RESET_RELAY (data_reset, reset);
|
||||
|
||||
VX_cache_data #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.BANK_ID (BANK_ID),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.UUID_WIDTH (UUID_WIDTH)
|
||||
) cache_data (
|
||||
.clk (clk),
|
||||
.reset (data_reset),
|
||||
|
||||
.req_uuid (req_uuid_st1),
|
||||
|
||||
.stall (pipe_stall),
|
||||
|
||||
.read (do_read_hit_st1 || do_replay_rd_st1),
|
||||
.fill (do_fill_st1),
|
||||
.write (do_write_hit_st1 || do_replay_wr_st1),
|
||||
.way_sel (way_sel_st1 | tag_matches_st1),
|
||||
.line_addr (addr_st1),
|
||||
.wsel (wsel_st1),
|
||||
.byteen (byteen_st1),
|
||||
.fill_data (fill_data_st1),
|
||||
.write_data (write_data_st1),
|
||||
.read_data (read_data_st1)
|
||||
);
|
||||
|
||||
wire [MSHR_SIZE-1:0] mshr_matches_st0;
|
||||
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
|
||||
wire mshr_lookup_st0 = mshr_allocate_st0;
|
||||
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
|
||||
wire mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
|
||||
|
||||
VX_pending_size #(
|
||||
.SIZE (MSHR_SIZE)
|
||||
) mshr_pending_size (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (core_req_fire),
|
||||
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
|
||||
.full (mshr_alm_full),
|
||||
`UNUSED_PIN (size),
|
||||
`UNUSED_PIN (empty)
|
||||
);
|
||||
|
||||
`RESET_RELAY (mshr_reset, reset);
|
||||
|
||||
VX_cache_mshr #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.BANK_ID (BANK_ID),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
|
||||
) cache_mshr (
|
||||
.clk (clk),
|
||||
.reset (mshr_reset),
|
||||
|
||||
.deq_req_uuid (req_uuid_sel),
|
||||
.lkp_req_uuid (req_uuid_st0),
|
||||
.fin_req_uuid (req_uuid_st1),
|
||||
|
||||
// memory fill
|
||||
.fill_valid (mem_rsp_fire),
|
||||
.fill_id (mem_rsp_id),
|
||||
.fill_addr (mem_rsp_addr),
|
||||
|
||||
// dequeue
|
||||
.dequeue_valid (replay_valid),
|
||||
.dequeue_addr (replay_addr),
|
||||
.dequeue_rw (replay_rw),
|
||||
.dequeue_data ({replay_wsel, replay_byteen, replay_data, replay_tag, replay_idx}),
|
||||
.dequeue_id (replay_id),
|
||||
.dequeue_ready (replay_ready),
|
||||
|
||||
// allocate
|
||||
.allocate_valid (mshr_allocate_st0),
|
||||
.allocate_addr (addr_st0),
|
||||
.allocate_rw (rw_st0),
|
||||
.allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}),
|
||||
.allocate_id (mshr_alloc_id_st0),
|
||||
.allocate_tail (mshr_tail_st0),
|
||||
`UNUSED_PIN (allocate_ready),
|
||||
|
||||
// lookup
|
||||
.lookup_valid (mshr_lookup_st0),
|
||||
.lookup_addr (addr_st0),
|
||||
.lookup_matches (mshr_matches_st0),
|
||||
|
||||
// finalize
|
||||
.finalize_valid (mshr_finalize_st1),
|
||||
.finalize_release(mshr_release_st1),
|
||||
.finalize_pending(mshr_pending_st1),
|
||||
.finalize_id (mshr_id_st1),
|
||||
.finalize_tail (mshr_tail_st1)
|
||||
);
|
||||
|
||||
// ignore allocated id from mshr matches
|
||||
wire [MSHR_SIZE-1:0] lookup_matches;
|
||||
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
|
||||
assign lookup_matches[i] = (i != mshr_alloc_id_st0) && mshr_matches_st0[i];
|
||||
end
|
||||
assign mshr_pending_st0 = (| lookup_matches);
|
||||
|
||||
// schedule core response
|
||||
|
||||
wire crsq_valid, crsq_ready;
|
||||
wire [`CS_WORD_WIDTH-1:0] crsq_data;
|
||||
wire [REQ_SEL_WIDTH-1:0] crsq_idx;
|
||||
wire [TAG_WIDTH-1:0] crsq_tag;
|
||||
|
||||
assign crsq_valid = do_read_hit_st1 || do_replay_rd_st1;
|
||||
assign crsq_idx = req_idx_st1;
|
||||
assign crsq_data = read_data_st1;
|
||||
assign crsq_tag = tag_st1;
|
||||
|
||||
`RESET_RELAY (crsp_reset, reset);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
|
||||
.SIZE (CRSQ_SIZE),
|
||||
.OUT_REG (CORE_OUT_REG)
|
||||
) core_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (crsp_reset),
|
||||
.valid_in (crsq_valid && ~rdw_hazard_st1),
|
||||
.ready_in (crsq_ready),
|
||||
.data_in ({crsq_tag, crsq_data, crsq_idx}),
|
||||
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
|
||||
.valid_out (core_rsp_valid),
|
||||
.ready_out (core_rsp_ready)
|
||||
);
|
||||
|
||||
assign crsq_stall = crsq_valid && ~crsq_ready;
|
||||
|
||||
// schedule memory request
|
||||
|
||||
wire mreq_push, mreq_pop, mreq_empty;
|
||||
wire [`CS_WORD_WIDTH-1:0] mreq_data;
|
||||
wire [WORD_SIZE-1:0] mreq_byteen;
|
||||
wire [WORD_SEL_WIDTH-1:0] mreq_wsel;
|
||||
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_addr;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mreq_id;
|
||||
wire mreq_rw;
|
||||
|
||||
assign mreq_push = (do_read_miss_st1 && ~mshr_pending_st1)
|
||||
|| do_creq_wr_st1;
|
||||
|
||||
assign mreq_pop = mem_req_valid && mem_req_ready;
|
||||
|
||||
assign mreq_rw = WRITE_ENABLE && rw_st1;
|
||||
assign mreq_addr = addr_st1;
|
||||
assign mreq_id = mshr_id_st1;
|
||||
assign mreq_wsel = wsel_st1;
|
||||
assign mreq_byteen = byteen_st1;
|
||||
assign mreq_data = write_data_st1;
|
||||
|
||||
`RESET_RELAY (mreq_reset, reset);
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + `CS_WORD_WIDTH),
|
||||
.DEPTH (MREQ_SIZE),
|
||||
.ALM_FULL (MREQ_SIZE-2),
|
||||
.OUT_REG (MEM_OUT_REG)
|
||||
) mem_req_queue (
|
||||
.clk (clk),
|
||||
.reset (mreq_reset),
|
||||
.push (mreq_push),
|
||||
.pop (mreq_pop),
|
||||
.data_in ({mreq_rw, mreq_addr, mreq_id, mreq_byteen, mreq_wsel, mreq_data}),
|
||||
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_wsel, mem_req_data}),
|
||||
.empty (mreq_empty),
|
||||
.alm_full (mreq_alm_full),
|
||||
`UNUSED_PIN (full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
assign mem_req_valid = ~mreq_empty;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign perf_read_misses = do_read_miss_st1;
|
||||
assign perf_write_misses = do_write_miss_st1;
|
||||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_CACHE_BANK
|
||||
wire crsq_fire = crsq_valid && crsq_ready;
|
||||
wire pipeline_stall = (replay_valid || mem_rsp_valid || core_req_valid)
|
||||
&& ~(replay_fire || mem_rsp_fire || core_req_fire);
|
||||
always @(posedge clk) begin
|
||||
if (pipeline_stall) begin
|
||||
`TRACE(3, ("%d: *** %s-bank%0d stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID, BANK_ID, crsq_stall, mreq_alm_full, mshr_alm_full));
|
||||
end
|
||||
if (init_enable) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d init: addr=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(init_line_sel, BANK_ID)));
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
|
||||
end
|
||||
if (replay_fire) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
|
||||
end
|
||||
if (core_req_fire) begin
|
||||
if (core_req_rw)
|
||||
`TRACE(2, ("%d: %s-bank%0d core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
|
||||
else
|
||||
`TRACE(2, ("%d: %s-bank%0d core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
|
||||
end
|
||||
if (crsq_fire) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsq_tag, crsq_idx, crsq_data, req_uuid_st1));
|
||||
end
|
||||
if (mreq_push) begin
|
||||
if (do_creq_wr_st1)
|
||||
`TRACE(2, ("%d: %s-bank%0d writethrough: addr=0x%0h, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_byteen, mreq_data, req_uuid_st1));
|
||||
else
|
||||
`TRACE(2, ("%d: %s-bank%0d fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(mreq_addr, BANK_ID), mreq_id, req_uuid_st1));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
348
hw/rtl/cache/VX_cache_bypass.sv
vendored
Normal file
348
hw/rtl/cache/VX_cache_bypass.sv
vendored
Normal file
|
@ -0,0 +1,348 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
module VX_cache_bypass #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter NC_TAG_BIT = 0,
|
||||
|
||||
parameter NC_ENABLE = 0,
|
||||
parameter PASSTHRU = 0,
|
||||
|
||||
parameter CORE_ADDR_WIDTH = 1,
|
||||
parameter CORE_DATA_SIZE = 1,
|
||||
parameter CORE_TAG_IN_WIDTH = 1,
|
||||
|
||||
parameter MEM_ADDR_WIDTH = 1,
|
||||
parameter MEM_DATA_SIZE = 1,
|
||||
parameter MEM_TAG_IN_WIDTH = 1,
|
||||
parameter MEM_TAG_OUT_WIDTH = 1,
|
||||
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
parameter CORE_DATA_WIDTH = CORE_DATA_SIZE * 8,
|
||||
parameter MEM_DATA_WIDTH = MEM_DATA_SIZE * 8,
|
||||
parameter CORE_TAG_OUT_WIDTH= CORE_TAG_IN_WIDTH - NC_ENABLE
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Core request in
|
||||
input wire [NUM_REQS-1:0] core_req_valid_in,
|
||||
input wire [NUM_REQS-1:0] core_req_rw_in,
|
||||
input wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_in,
|
||||
input wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_in,
|
||||
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_in,
|
||||
input wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_req_tag_in,
|
||||
output wire [NUM_REQS-1:0] core_req_ready_in,
|
||||
|
||||
// Core request out
|
||||
output wire [NUM_REQS-1:0] core_req_valid_out,
|
||||
output wire [NUM_REQS-1:0] core_req_rw_out,
|
||||
output wire [NUM_REQS-1:0][CORE_ADDR_WIDTH-1:0] core_req_addr_out,
|
||||
output wire [NUM_REQS-1:0][CORE_DATA_SIZE-1:0] core_req_byteen_out,
|
||||
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_req_data_out,
|
||||
output wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_req_tag_out,
|
||||
input wire [NUM_REQS-1:0] core_req_ready_out,
|
||||
|
||||
// Core response in
|
||||
input wire [NUM_REQS-1:0] core_rsp_valid_in,
|
||||
input wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_in,
|
||||
input wire [NUM_REQS-1:0][CORE_TAG_OUT_WIDTH-1:0] core_rsp_tag_in,
|
||||
output wire [NUM_REQS-1:0] core_rsp_ready_in,
|
||||
|
||||
// Core response out
|
||||
output wire [NUM_REQS-1:0] core_rsp_valid_out,
|
||||
output wire [NUM_REQS-1:0][CORE_DATA_WIDTH-1:0] core_rsp_data_out,
|
||||
output wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_out,
|
||||
input wire [NUM_REQS-1:0] core_rsp_ready_out,
|
||||
|
||||
// Memory request in
|
||||
input wire mem_req_valid_in,
|
||||
input wire mem_req_rw_in,
|
||||
input wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_in,
|
||||
input wire [MEM_DATA_SIZE-1:0] mem_req_byteen_in,
|
||||
input wire [MEM_DATA_WIDTH-1:0] mem_req_data_in,
|
||||
input wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_in,
|
||||
output wire mem_req_ready_in,
|
||||
|
||||
// Memory request out
|
||||
output wire mem_req_valid_out,
|
||||
output wire mem_req_rw_out,
|
||||
output wire [MEM_ADDR_WIDTH-1:0] mem_req_addr_out,
|
||||
output wire [MEM_DATA_SIZE-1:0] mem_req_byteen_out,
|
||||
output wire [MEM_DATA_WIDTH-1:0] mem_req_data_out,
|
||||
output wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_out,
|
||||
input wire mem_req_ready_out,
|
||||
|
||||
// Memory response in
|
||||
input wire mem_rsp_valid_in,
|
||||
input wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_in,
|
||||
input wire [MEM_TAG_OUT_WIDTH-1:0] mem_rsp_tag_in,
|
||||
output wire mem_rsp_ready_in,
|
||||
|
||||
// Memory response out
|
||||
output wire mem_rsp_valid_out,
|
||||
output wire [MEM_DATA_WIDTH-1:0] mem_rsp_data_out,
|
||||
output wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_out,
|
||||
input wire mem_rsp_ready_out
|
||||
);
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
|
||||
localparam MUX_DATAW = CORE_TAG_IN_WIDTH + CORE_DATA_WIDTH + CORE_DATA_SIZE + CORE_ADDR_WIDTH + 1;
|
||||
|
||||
localparam WORDS_PER_LINE = MEM_DATA_SIZE / CORE_DATA_SIZE;
|
||||
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
|
||||
localparam CORE_TAG_ID_BITS = CORE_TAG_IN_WIDTH - UUID_WIDTH;
|
||||
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
|
||||
|
||||
localparam MEM_TAG_OUT_NC_WIDTH = MEM_TAG_OUT_WIDTH - 1 + NC_ENABLE;
|
||||
|
||||
// core request handling
|
||||
|
||||
wire [NUM_REQS-1:0] core_req_valid_in_nc;
|
||||
wire [NUM_REQS-1:0] core_req_nc_idxs;
|
||||
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
|
||||
wire [NUM_REQS-1:0] core_req_nc_sel;
|
||||
wire core_req_nc_valid;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (PASSTHRU != 0) begin
|
||||
assign core_req_nc_idxs[i] = 1'b1;
|
||||
end else begin
|
||||
assign core_req_nc_idxs[i] = core_req_tag_in[i][NC_TAG_BIT];
|
||||
end
|
||||
end
|
||||
|
||||
assign core_req_valid_in_nc = core_req_valid_in & core_req_nc_idxs;
|
||||
|
||||
wire core_req_in_fire = | (core_req_valid_in & core_req_ready_in);
|
||||
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.TYPE (PASSTHRU ? "R" : "P"),
|
||||
.LOCK_ENABLE (1)
|
||||
) req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.unlock (core_req_in_fire),
|
||||
.requests (core_req_valid_in_nc),
|
||||
.grant_index (core_req_nc_idx),
|
||||
.grant_onehot (core_req_nc_sel),
|
||||
.grant_valid (core_req_nc_valid)
|
||||
);
|
||||
|
||||
assign core_req_valid_out = core_req_valid_in & ~core_req_nc_idxs;
|
||||
assign core_req_rw_out = core_req_rw_in;
|
||||
assign core_req_addr_out = core_req_addr_in;
|
||||
assign core_req_byteen_out = core_req_byteen_in;
|
||||
assign core_req_data_out = core_req_data_in;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_bits_remove #(
|
||||
.N (CORE_TAG_IN_WIDTH),
|
||||
.S (NC_ENABLE),
|
||||
.POS (NC_TAG_BIT)
|
||||
) core_req_tag_nc_remove (
|
||||
.data_in (core_req_tag_in[i]),
|
||||
.data_out (core_req_tag_out[i])
|
||||
);
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_ready_in[i] = core_req_valid_in_nc[i] ? (~mem_req_valid_in && mem_req_ready_out && core_req_nc_sel[i])
|
||||
: core_req_ready_out[i];
|
||||
end
|
||||
|
||||
// memory request handling
|
||||
|
||||
assign mem_req_valid_out = mem_req_valid_in || core_req_nc_valid;
|
||||
assign mem_req_ready_in = mem_req_ready_out;
|
||||
|
||||
wire [CORE_TAG_IN_WIDTH-1:0] core_req_tag_in_sel;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_data_in_sel;
|
||||
wire [CORE_DATA_SIZE-1:0] core_req_byteen_in_sel;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_addr_in_sel;
|
||||
wire core_req_rw_in_sel;
|
||||
|
||||
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_nc_mux_in[i] = {core_req_tag_in[i], core_req_data_in[i], core_req_byteen_in[i], core_req_addr_in[i], core_req_rw_in[i]};
|
||||
end
|
||||
assign {core_req_tag_in_sel, core_req_data_in_sel, core_req_byteen_in_sel, core_req_addr_in_sel, core_req_rw_in_sel} = core_req_nc_mux_in[core_req_nc_idx];
|
||||
|
||||
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_tag_in_sel[CORE_TAG_ID_BITS-1:0];
|
||||
|
||||
assign mem_req_rw_out = mem_req_valid_in ? mem_req_rw_in : core_req_rw_in_sel;
|
||||
assign mem_req_addr_out = mem_req_valid_in ? mem_req_addr_in : core_req_addr_in_sel[WSEL_BITS +: MEM_ADDR_WIDTH];
|
||||
|
||||
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin
|
||||
reg [WORDS_PER_LINE-1:0][CORE_DATA_SIZE-1:0] mem_req_byteen_in_r;
|
||||
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
|
||||
|
||||
wire [WSEL_BITS-1:0] req_wsel = core_req_addr_in_sel[WSEL_BITS-1:0];
|
||||
|
||||
always @(*) begin
|
||||
mem_req_byteen_in_r = '0;
|
||||
mem_req_byteen_in_r[req_wsel] = core_req_byteen_in_sel;
|
||||
|
||||
mem_req_data_in_r = 'x;
|
||||
mem_req_data_in_r[req_wsel] = core_req_data_in_sel;
|
||||
end
|
||||
|
||||
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : mem_req_byteen_in_r;
|
||||
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : mem_req_data_in_r;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
|
||||
end else begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
|
||||
end
|
||||
end else begin
|
||||
assign mem_req_byteen_out = mem_req_valid_in ? mem_req_byteen_in : core_req_byteen_in_sel;
|
||||
assign mem_req_data_out = mem_req_valid_in ? mem_req_data_in : core_req_data_in_sel;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
|
||||
end else begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
|
||||
end
|
||||
end
|
||||
|
||||
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_req_tag_bypass;
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign mem_req_tag_bypass = {core_req_tag_in_sel[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
|
||||
end else begin
|
||||
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
|
||||
end
|
||||
|
||||
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_bypass_nc;
|
||||
wire [(MEM_TAG_IN_WIDTH + 1)-1:0] mem_req_tag_in_nc;
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_OUT_NC_WIDTH),
|
||||
.S (NC_ENABLE ? 0 : 1),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_req_tag_bypass_nc_insert (
|
||||
.data_in (mem_req_tag_bypass),
|
||||
.sel_in (1'b0),
|
||||
.data_out (mem_req_tag_bypass_nc)
|
||||
);
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_IN_WIDTH),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_req_tag_in_nc_insert (
|
||||
.data_in (mem_req_tag_in),
|
||||
.sel_in (1'b0),
|
||||
.data_out (mem_req_tag_in_nc)
|
||||
);
|
||||
|
||||
assign mem_req_tag_out = mem_req_valid_in ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : mem_req_tag_bypass_nc;
|
||||
|
||||
// core response handling
|
||||
|
||||
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_in_nc;
|
||||
|
||||
wire is_mem_rsp_nc;
|
||||
if (PASSTHRU != 0) begin
|
||||
assign is_mem_rsp_nc = mem_rsp_valid_in;
|
||||
end else begin
|
||||
assign is_mem_rsp_nc = mem_rsp_valid_in && mem_rsp_tag_in[NC_TAG_BIT];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_bits_insert #(
|
||||
.N (CORE_TAG_OUT_WIDTH),
|
||||
.S (NC_ENABLE),
|
||||
.POS (NC_TAG_BIT)
|
||||
) core_rsp_tag_in_nc_insert (
|
||||
.data_in (core_rsp_tag_in[i]),
|
||||
.sel_in ('0),
|
||||
.data_out (core_rsp_tag_in_nc[i])
|
||||
);
|
||||
end
|
||||
|
||||
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_rsp_tag_in_nc;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_OUT_WIDTH),
|
||||
.S (NC_ENABLE ? 0 : 1),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_rsp_tag_in_nc_remove (
|
||||
.data_in (mem_rsp_tag_in),
|
||||
.data_out (mem_rsp_tag_in_nc)
|
||||
);
|
||||
|
||||
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign rsp_idx = mem_rsp_tag_in_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
|
||||
end else begin
|
||||
assign rsp_idx = 1'b0;
|
||||
end
|
||||
|
||||
reg [NUM_REQS-1:0] rsp_nc_valid_r;
|
||||
always @(*) begin
|
||||
rsp_nc_valid_r = '0;
|
||||
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
|
||||
end
|
||||
|
||||
assign core_rsp_valid_out = core_rsp_valid_in | rsp_nc_valid_r;
|
||||
assign core_rsp_ready_in = core_rsp_ready_out;
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin
|
||||
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_in_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ?
|
||||
core_rsp_data_in[i] : mem_rsp_data_in[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
|
||||
end
|
||||
end else begin
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_data_out[i] = core_rsp_valid_in[i] ? core_rsp_data_in[i] : mem_rsp_data_in;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : {mem_rsp_tag_in_nc[MEM_TAG_OUT_NC_WIDTH-1 -: UUID_WIDTH], mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0]};
|
||||
end else begin
|
||||
assign core_rsp_tag_out[i] = core_rsp_valid_in[i] ? core_rsp_tag_in_nc[i] : mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0];
|
||||
end
|
||||
end
|
||||
|
||||
// memory response handling
|
||||
|
||||
if (PASSTHRU != 0) begin
|
||||
assign mem_rsp_valid_out = 1'b0;
|
||||
end else begin
|
||||
assign mem_rsp_valid_out = mem_rsp_valid_in && ~mem_rsp_tag_in[NC_TAG_BIT];
|
||||
end
|
||||
|
||||
assign mem_rsp_data_out = mem_rsp_data_in;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_IN_WIDTH + 1),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_rsp_tag_out_remove (
|
||||
.data_in (mem_rsp_tag_in[(MEM_TAG_IN_WIDTH + 1)-1:0]),
|
||||
.data_out (mem_rsp_tag_out)
|
||||
);
|
||||
|
||||
assign mem_rsp_ready_in = is_mem_rsp_nc ? (~core_rsp_valid_in[rsp_idx] && core_rsp_ready_out[rsp_idx]) : mem_rsp_ready_out;
|
||||
|
||||
endmodule
|
368
hw/rtl/cache/VX_cache_cluster.sv
vendored
Normal file
368
hw/rtl/cache/VX_cache_cluster.sv
vendored
Normal file
|
@ -0,0 +1,368 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_cluster #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
|
||||
parameter NUM_UNITS = 1,
|
||||
parameter NUM_INPUTS = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
// Number of requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 8,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_REG = 0,
|
||||
|
||||
// Memory request output register
|
||||
parameter MEM_OUT_REG = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_cache_perf_if.master cache_perf_if,
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
|
||||
VX_mem_bus_if.master mem_bus_if
|
||||
);
|
||||
localparam NUM_CACHES = `UP(NUM_UNITS);
|
||||
localparam PASSTHRU = (NUM_UNITS == 0);
|
||||
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
|
||||
|
||||
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_cache_perf_if perf_cache_unit_if[NUM_CACHES]();
|
||||
`PERF_CACHE_ADD (cache_perf_if, perf_cache_unit_if, NUM_CACHES);
|
||||
`endif
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) cache_mem_bus_if[NUM_CACHES]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (ARB_TAG_WIDTH)
|
||||
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
|
||||
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) core_bus_tmp_if[NUM_INPUTS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (ARB_TAG_WIDTH)
|
||||
) arb_core_bus_tmp_if[NUM_CACHES]();
|
||||
|
||||
for (genvar j = 0; j < NUM_INPUTS; ++j) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
|
||||
end
|
||||
|
||||
`RESET_RELAY (cache_arb_reset, reset);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.NUM_OUTPUTS (NUM_CACHES),
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
|
||||
.OUT_REG_RSP ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
|
||||
) cache_arb (
|
||||
.clk (clk),
|
||||
.reset (cache_arb_reset),
|
||||
.bus_in_if (core_bus_tmp_if),
|
||||
.bus_out_if (arb_core_bus_tmp_if)
|
||||
);
|
||||
|
||||
for (genvar k = 0; k < NUM_CACHES; ++k) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_if[k * NUM_REQS + i], arb_core_bus_tmp_if[k]);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_CACHES; ++i) begin
|
||||
|
||||
`RESET_RELAY (cache_reset, reset);
|
||||
|
||||
VX_cache_wrap #(
|
||||
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (ARB_TAG_WIDTH),
|
||||
.CORE_OUT_REG ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_REG),
|
||||
.MEM_OUT_REG ((NUM_CACHES > 1) ? 2 : MEM_OUT_REG),
|
||||
.NC_ENABLE (NC_ENABLE),
|
||||
.PASSTHRU (PASSTHRU)
|
||||
) cache_wrap (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf_if (perf_cache_unit_if[i]),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (cache_reset),
|
||||
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
|
||||
.mem_bus_if (cache_mem_bus_if[i])
|
||||
);
|
||||
end
|
||||
|
||||
`RESET_RELAY (mem_arb_reset, reset);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
|
||||
) mem_bus_tmp_if[1]();
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (NUM_CACHES),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG_REQ ((NUM_CACHES > 1) ? 2 : 0),
|
||||
.OUT_REG_RSP ((NUM_CACHES > 1) ? 2 : 0)
|
||||
) mem_arb (
|
||||
.clk (clk),
|
||||
.reset (mem_arb_reset),
|
||||
.bus_in_if (cache_mem_bus_if),
|
||||
.bus_out_if (mem_bus_tmp_if)
|
||||
);
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
|
||||
|
||||
endmodule
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
module VX_cache_cluster_top #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
|
||||
parameter NUM_UNITS = 2,
|
||||
parameter NUM_INPUTS = 4,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 16,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 16,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = 16,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 1,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_REG = 2,
|
||||
|
||||
// Memory request output register
|
||||
parameter MEM_OUT_REG = 2,
|
||||
|
||||
parameter NUM_CACHES = `UP(NUM_UNITS),
|
||||
parameter PASSTHRU = (NUM_UNITS == 0),
|
||||
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
|
||||
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS))
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// Core request
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
|
||||
|
||||
// Core response
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready
|
||||
);
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) core_bus_if[NUM_INPUTS * NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH)
|
||||
) mem_bus_if();
|
||||
|
||||
// Core request
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
||||
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
|
||||
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
|
||||
end
|
||||
end
|
||||
|
||||
// Core response
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
||||
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
|
||||
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
|
||||
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
|
||||
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
|
||||
end
|
||||
end
|
||||
|
||||
// Memory request
|
||||
assign mem_req_valid = mem_bus_if.req_valid;
|
||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||
assign mem_req_byteen = mem_bus_if.req_data.byteen;
|
||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
||||
assign mem_req_data = mem_bus_if.req_data.data;
|
||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
||||
assign mem_bus_if.req_ready = mem_req_ready;
|
||||
|
||||
// Memory response
|
||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
||||
|
||||
VX_cache_cluster #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.NUM_UNITS (NUM_UNITS),
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.CORE_OUT_REG (CORE_OUT_REG),
|
||||
.MEM_OUT_REG (MEM_OUT_REG)
|
||||
) cache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf_if (perf_icache_if),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.core_bus_if (core_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
endmodule
|
152
hw/rtl/cache/VX_cache_data.sv
vendored
Normal file
152
hw/rtl/cache/VX_cache_data.sv
vendored
Normal file
|
@ -0,0 +1,152 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_data #(
|
||||
parameter `STRING INSTANCE_ID= "",
|
||||
parameter BANK_ID = 0,
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1024,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 16,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 1,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 1,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 1,
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`IGNORE_UNUSED_BEGIN
|
||||
input wire[`UP(UUID_WIDTH)-1:0] req_uuid,
|
||||
`IGNORE_UNUSED_END
|
||||
|
||||
input wire stall,
|
||||
|
||||
input wire read,
|
||||
input wire fill,
|
||||
input wire write,
|
||||
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
|
||||
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
|
||||
input wire [WORD_SIZE-1:0] byteen,
|
||||
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
|
||||
input wire [`CS_WORD_WIDTH-1:0] write_data,
|
||||
input wire [NUM_WAYS-1:0] way_sel,
|
||||
|
||||
output wire [`CS_WORD_WIDTH-1:0] read_data
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
`UNUSED_PARAM (BANK_ID)
|
||||
`UNUSED_PARAM (WORD_SIZE)
|
||||
`UNUSED_VAR (reset)
|
||||
`UNUSED_VAR (line_addr)
|
||||
`UNUSED_VAR (read)
|
||||
|
||||
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
|
||||
|
||||
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] wdata;
|
||||
wire [BYTEENW-1:0] wren;
|
||||
|
||||
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
|
||||
reg [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] wdata_r;
|
||||
reg [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] wren_r;
|
||||
|
||||
always @(*) begin
|
||||
wdata_r = {`CS_WORDS_PER_LINE{write_data}};
|
||||
wren_r = '0;
|
||||
wren_r[wsel] = byteen;
|
||||
end
|
||||
|
||||
// order the data layout to perform ways multiplexing last
|
||||
// this allows performing onehot encoding of the way index in parallel with BRAM read.
|
||||
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
|
||||
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
|
||||
assign wdata[i] = fill ? {NUM_WAYS{fill_data[i]}} : {NUM_WAYS{wdata_r[i]}};
|
||||
for (genvar j = 0; j < NUM_WAYS; ++j) begin
|
||||
assign wren_w[i][j] = (fill ? {WORD_SIZE{1'b1}} : wren_r[i])
|
||||
& {WORD_SIZE{((NUM_WAYS == 1) || way_sel[j])}};
|
||||
end
|
||||
end
|
||||
assign wren = wren_w;
|
||||
end else begin
|
||||
`UNUSED_VAR (write)
|
||||
`UNUSED_VAR (byteen)
|
||||
`UNUSED_VAR (write_data)
|
||||
assign wdata = fill_data;
|
||||
assign wren = fill;
|
||||
end
|
||||
|
||||
wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
|
||||
|
||||
VX_onehot_encoder #(
|
||||
.N (NUM_WAYS)
|
||||
) way_enc (
|
||||
.data_in (way_sel),
|
||||
.data_out (way_idx),
|
||||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] rdata;
|
||||
|
||||
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
|
||||
|
||||
VX_sp_ram #(
|
||||
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
|
||||
.SIZE (`CS_LINES_PER_BANK),
|
||||
.WRENW (BYTEENW),
|
||||
.NO_RWCHECK (1)
|
||||
) data_store (
|
||||
.clk (clk),
|
||||
.read (1'b1),
|
||||
.write (write || fill),
|
||||
.wren (wren),
|
||||
.addr (line_sel),
|
||||
.wdata (wdata),
|
||||
.rdata (rdata)
|
||||
);
|
||||
|
||||
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
|
||||
|
||||
if (`CS_WORDS_PER_LINE > 1) begin
|
||||
assign per_way_rdata = rdata[wsel];
|
||||
end else begin
|
||||
`UNUSED_VAR (wsel)
|
||||
assign per_way_rdata = rdata;
|
||||
end
|
||||
|
||||
assign read_data = per_way_rdata[way_idx];
|
||||
|
||||
`UNUSED_VAR (stall)
|
||||
|
||||
`ifdef DBG_TRACE_CACHE_DATA
|
||||
always @(posedge clk) begin
|
||||
if (fill && ~stall) begin
|
||||
`TRACE(3, ("%d: %s-bank%0d data-fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
|
||||
end
|
||||
if (read && ~stall) begin
|
||||
`TRACE(3, ("%d: %s-bank%0d data-read: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, read_data, req_uuid));
|
||||
end
|
||||
if (write && ~stall) begin
|
||||
`TRACE(3, ("%d: %s-bank%0d data-write: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%b, data=0x%0h (#%0d)\n", $time, INSTANCE_ID, BANK_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, byteen, write_data, req_uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue