Compare commits

...

537 commits
v2.2 ... master

Author SHA1 Message Date
tinebp
332e8eeaf9
Merge pull request #244 from vortexgpgpu/bug_fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
Bug fixes
2025-04-13 20:48:36 -07:00
tinebp
5dbfcecc21 minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-04-13 17:28:33 -07:00
tinebp
f19335023f CI migration to ubuntu 22.04 2025-04-13 14:16:05 -07:00
tinebp
6a7e402ab4
Merge pull request #239 from vortexgpgpu/bug_fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
regression fix
2025-03-17 04:33:46 -07:00
tinebp
18687d53b3
Merge branch 'master' into bug_fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-03-17 04:33:12 -07:00
tinebp
a35fb4bf1d regression fix 2025-03-17 04:30:50 -07:00
tinebp
9929c42417
Merge pull request #238 from vortexgpgpu/bug_fixes
workaroud fix for opencl kernel include in POCL
2025-03-17 04:07:52 -07:00
tinebp
06e5e2e859 workaroud fix for opencl kernel include in POCL 2025-03-17 04:04:07 -07:00
tinebp
09e89791e5
Merge pull request #237 from vortexgpgpu/bug_fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
CI versioning
2025-03-12 20:21:51 -07:00
tinebp
b35f69f486 CI versioning
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-03-12 17:38:09 -07:00
tinebp
63b41f21c6 xrt sandbox simulation
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-02-12 00:24:36 -08:00
tinebp
cc7fdf2fbd fixed github actions versioning 2025-02-11 22:03:32 -08:00
tinebp
a9352a3b64 minor update 2025-02-11 21:56:05 -08:00
tinebp
9a2709db08 xrt sandbox synthesis build fix
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2025-02-11 14:10:29 -08:00
tinebp
4785736e4d minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-01-26 22:55:22 -08:00
tinebp
38861d9aaf minor updates 2025-01-26 22:40:34 -08:00
tinebp
82b0eeded6 minor update 2025-01-26 19:35:56 -08:00
tinebp
22398c991d ramulator memory addressing bug fix + platform memory refactoring
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2025-01-26 06:28:51 -08:00
tinebp
e80ee2c819 minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-01-22 04:56:49 -08:00
tinebp
9dc1d3f688 Merge branch 'bug_fixes' 2025-01-22 02:49:55 -08:00
tinebp
0c1bc17c09
Merge pull request #222 from MichaelJSr/simx-vpu-toggle
Toggle the RISC-V Vector Extension on and off
2025-01-22 02:48:38 -08:00
tinebp
4e83c28d04 minor bug fix 2025-01-21 23:07:41 -08:00
tinebp
2c940cf509 AXI adapter bug fix
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2025-01-21 06:14:22 -08:00
tinebp
fb4527fe95 cache repl reset
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2025-01-21 01:06:22 -08:00
tinebp
d1f37fc629 minor update 2025-01-20 22:19:42 -08:00
tinebp
001a107395 bram reset bug fix 2025-01-20 22:16:05 -08:00
tinebp
fce24b9535 fixed XRT AFU done handshake
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-01-17 23:58:23 -08:00
MichaelJSr
6d27575db3 Revert some of "Added ifndef statements for the vector extension anywhere they didn't exist already" 2025-01-14 21:56:39 -08:00
MichaelJSr
a2cfeffcfe Added ifndef statements for the vector extension anywhere they didn't exist already
Added ifndef statements for the vector extension anywhere they didn't exist already

more ifdef statements

more ifdef

Update decode.cpp

Update decode.cpp

Update decode.cpp
2025-01-14 21:29:47 -08:00
MichaelJSr
cb491ddb53 test
Revert "test"

This reverts commit 393e347c2faba260f1469667596e22dc2aa16553.
2025-01-14 21:22:28 -08:00
tinebp
43b143bba6 bug fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-01-14 03:44:53 -08:00
tinebp
87297e0eca bug fixes 2025-01-14 02:21:17 -08:00
MichaelJSr
929ef1b6e2 Remove unused EXTV code, clean up code, pragma once around vpu.h 2025-01-13 16:45:13 -08:00
tinebp
83ba1cc3dc minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2025-01-11 20:23:26 -08:00
tinebp
347889c504 minor updates
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2025-01-11 03:24:06 -08:00
tinebp
083cf04afd timing optimizations 2025-01-11 03:19:55 -08:00
tinebp
84b1c8a43c BRAM optimizations 2025-01-11 03:18:11 -08:00
tinebp
43d33b942e minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-12-26 11:32:57 -08:00
tinebp
adf60e7e35 minor update 2024-12-26 10:58:13 -08:00
tinebp
8fda922570 minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-12-26 10:20:57 -08:00
tinebp
53900bee4f bug fixes 2024-12-26 10:01:36 -08:00
tinebp
704f525fd6 memory mem_coalescer miss perf counter
RTL perf counters refactoring
2024-12-26 08:00:36 -08:00
tinebp
f478bdcf25 memory coalescer misses perf counter
rtl perf interface refactoring
2024-12-26 07:56:28 -08:00
tinebp
01974e124f
Merge pull request #216 from sij814/simx2
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
Simx ICache DCache Changes
2024-12-18 03:17:14 -08:00
tinebp
100e4e3970 multi-ports fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-12-17 22:38:23 -08:00
tinebp
4819891a5e minor update 2024-12-17 18:06:52 -08:00
tinebp
066ab105eb multiports fixes 2024-12-17 16:23:08 -08:00
tinebp
a98d2e24e5 rtlsim multibanks
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-12-16 22:10:57 -08:00
sij814
572a397018 changed versions 2024-12-15 15:11:13 -08:00
sij814
cad129c64c added icache dcache overlap 2024-12-15 14:55:21 -08:00
tinebp
bae24e589c minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-12-14 02:04:50 -08:00
tinebp
461f2cbbc9 Intel Opae AFU support for multiport
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-12-13 21:20:38 -08:00
tinebp
7975a5a38c fixed AXI adapter 2024-12-12 20:52:45 -08:00
tinebp
f635d71ba4 minor fix
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-12-11 10:31:03 -08:00
tinebp
70ade222b1 multiport 2024-12-10 23:25:05 -08:00
tinebp
aa6a47eb11 minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vector, 32) (push) Has been cancelled
CI / tests (vector, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-12-05 23:35:15 -08:00
tinebp
115ff2b599 minor fixes 2024-12-05 22:38:04 -08:00
tinebp
896c59306c adding clang-format file 2024-12-05 15:58:04 -08:00
tinebp
6bbcd4ebaf vector updates with clang formatting 2024-12-05 15:55:57 -08:00
tinebp
6b23d290c3 vector ISA updates 2024-12-05 14:43:51 -08:00
tinebp
5d91fe58ad
Merge pull request #211 from MichaelJSr/riscv-vector-isa-simx-clean
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
Vector extension simx with fallbacks for testcases and clean history
2024-12-05 10:18:45 -08:00
tinebp
5891a1e592
Merge branch 'master' into riscv-vector-isa-simx-clean 2024-12-05 10:17:05 -08:00
tinebp
18ae57cc7f Merge branch 'bug_fixes'
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-12-04 22:20:52 -08:00
tinebp
a760d909cb minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-12-04 21:36:31 -08:00
tinebp
86f20b27dd SimX multi-ports memory fixes 2024-12-04 21:11:51 -08:00
tinebp
3ace9bbeda minor updates
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-12-04 06:00:19 -08:00
tinebp
30b0daf050 SimX multiports support fixes
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-12-03 05:46:33 -08:00
tinebp
24ca4f03aa minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-12-02 19:53:28 -08:00
tinebp
3b454efd56 fixes to SimX's multiports memory support 2024-12-02 17:51:42 -08:00
MichaelJSr
951746badc Commented out some vector testcases that dont pass 2024-11-28 05:13:56 -08:00
MichaelJSr
6c2cbdfec2 made -v a valid option for simx simulator 2024-11-28 02:12:01 -08:00
MichaelJSr
973fcd7845 Merge branch 'riscv-vector-isa-simx-clean' of https://github.com/MichaelJSr/vortex into riscv-vector-isa-simx-clean 2024-11-27 23:53:57 -08:00
MichaelJSr
5eecd0e987 Added case for vector-test due to different exitcode
The vector tests need the cluster exitcodes
2024-11-27 23:50:57 -08:00
MichaelJSr
073e0ddd10 Adds the riscv vector extension into simx
Added vector regression test to ci.yml
2024-11-27 23:22:22 -08:00
MichaelJSr
c05a0571c8 Added vector regression test to ci.yml 2024-11-27 13:10:08 -08:00
MichaelJSr
1e4583ac17 Adds the riscv vector extension into simx 2024-11-26 18:41:01 -08:00
tinebp
3e4bbfc9f0 minor update 2024-11-22 11:12:17 -08:00
tinebp
7c4ce74801 memory unit timing optimization 2024-11-21 16:48:41 -08:00
tinebp
18bf49d1e0 minor update 2024-11-21 16:48:18 -08:00
tinebp
180735c531 fifoqueue area optimization 2024-11-21 16:47:00 -08:00
tinebp
8d8769c710 stream_buffer area optimization
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-11-20 19:15:51 -08:00
tinebp
b0c48e7a46 stream buffer area optimization 2024-11-20 18:27:52 -08:00
tinebp
320c090613 xilinx asynchronous bram patch fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-11-19 01:57:33 -08:00
tinebp
b48b605b51 remove deprecared yosys link 2024-11-15 03:42:06 -08:00
tinebp
8230b37411 fixed opae build bug
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-11-14 11:42:21 -08:00
tinebp
5844de8c4d Merge branch 'rtl_cache'
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-11-13 22:27:11 -08:00
tinebp
dfc7b6178c cleanup old cache test
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-11-13 20:56:06 -08:00
tinebp
bffc6d9610 enabling Vivado's asynchronous bram suppot via direct netlist transformation
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-11-13 16:20:25 -08:00
Hyesoon Kim
6dbbc62b04
Merge pull request #200 from Udit8348/develop-docker-micro
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
Develop Docker Micro
2024-11-09 10:45:16 -05:00
Udit Subramanya
667fa1662d update docker for micro apptainer 2024-11-01 14:46:38 -04:00
Udit Subramanya
e73e1c2bb3 update xilinx fpga steps with environment variable steps 2024-11-01 13:56:01 -04:00
Udit Subramanya
27f3d6dde6 Merge remote-tracking branch 'origin/master' into develop-documentation 2024-10-25 13:16:28 -04:00
Udit Subramanya
d475e9d201 remove duplicate block 2024-10-25 12:59:24 -04:00
Blaise Tine
ce510d78c7 minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-10-24 05:02:46 -07:00
Blaise Tine
eecff10dea minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-24 02:51:08 -07:00
Blaise Tine
98b58606e5 merge fixes
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-10-24 02:18:00 -07:00
Blaise Tine
8b172d07ec revert xilinx's asynchronous bram workaround 2024-10-24 01:44:55 -07:00
Hyesoon Kim
f68cc95cbe
Merge branch 'master' into develop-documentation 2024-10-23 19:41:29 -04:00
Hyesoon Kim
659ad87f93
Merge pull request #188 from Udit8348/develop-docker
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
Vortex Dockerfiles
2024-10-23 19:41:06 -04:00
Blaise Tine
22ade31fd5 minor updates 2024-10-23 15:55:11 -07:00
Hyesoon Kim
2d3f4b6efc
Merge branch 'master' into develop-docker 2024-10-23 18:08:19 -04:00
Blaise Tine
cc5ac8388b minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-23 14:03:19 -07:00
Blaise Tine
ec12b50007 minor udpate 2024-10-23 13:09:34 -07:00
Blaise Tine
e7d09feb4a decode => demux 2024-10-23 13:06:45 -07:00
Blaise Tine
7ab58111d8 minor update 2024-10-23 12:30:39 -07:00
Blaise Tine
1c384c096d minor update 2024-10-23 12:27:44 -07:00
Udit Subramanya
24d018b4c9 documentation updates 2024-10-23 05:18:53 -04:00
Blaise Tine
1fa4603fa2 disable sformatf during synthesis
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-23 01:14:19 -07:00
Blaise Tine
3a3bb7b70a cleanup deleted files
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / tests (vm, 32) (push) Has been cancelled
CI / tests (vm, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-10-21 22:46:04 -07:00
Blaise Tine
ff50306833 minor update 2024-10-21 22:24:54 -07:00
Udit Subramanya
519023fb2b add citation for MICRO 21 paper 2024-10-21 15:39:10 -04:00
Udit Subramanya
8fdca0e52a correct vitis env 2024-10-21 15:38:53 -04:00
Udit Subramanya
f184b57c24 merge upstream and resolve deleted file conflict 2024-10-21 13:45:32 -04:00
Udit Subramanya
d584e7bac1 intermediate docs update 2024-10-21 13:28:57 -04:00
Blaise Tine
2b3d1f0860 minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-10-20 23:54:42 -07:00
Blaise Tine
fccbadfe25 minor update 2024-10-20 23:32:22 -07:00
Blaise Tine
1e4f0fa0bd minor update 2024-10-20 21:42:02 -07:00
Blaise Tine
22c3828bf5 minor update 2024-10-20 21:12:49 -07:00
Blaise Tine
acc1e3dfd8 minor update 2024-10-20 20:07:34 -07:00
Blaise Tine
0f380a3d78 minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-20 07:49:27 -07:00
Blaise Tine
9373e21950 minor update 2024-10-20 07:32:32 -07:00
Blaise Tine
2bd22253eb minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-19 22:14:38 -07:00
Blaise Tine
4206ffdb80 minor update 2024-10-19 21:39:34 -07:00
Blaise Tine
b6bd6467ef cache hit timing optimization 2024-10-19 20:04:51 -07:00
Blaise Tine
8f29ad58ae block ram redesign to support synthesizable write-first mode
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-18 23:54:20 -07:00
Blaise Tine
6b1091e08f minor update
Some checks failed
CI / setup (push) Has been cancelled
CI / build (32) (push) Has been cancelled
CI / build (64) (push) Has been cancelled
CI / tests (cache, 32) (push) Has been cancelled
CI / tests (cache, 64) (push) Has been cancelled
CI / tests (config1, 32) (push) Has been cancelled
CI / tests (config1, 64) (push) Has been cancelled
CI / tests (config2, 32) (push) Has been cancelled
CI / tests (config2, 64) (push) Has been cancelled
CI / tests (debug, 32) (push) Has been cancelled
CI / tests (debug, 64) (push) Has been cancelled
CI / tests (opencl, 32) (push) Has been cancelled
CI / tests (opencl, 64) (push) Has been cancelled
CI / tests (regression, 32) (push) Has been cancelled
CI / tests (regression, 64) (push) Has been cancelled
CI / tests (scope, 32) (push) Has been cancelled
CI / tests (scope, 64) (push) Has been cancelled
CI / tests (stress, 32) (push) Has been cancelled
CI / tests (stress, 64) (push) Has been cancelled
CI / tests (synthesis, 32) (push) Has been cancelled
CI / tests (synthesis, 64) (push) Has been cancelled
CI / complete (push) Has been cancelled
2024-10-17 14:07:22 -07:00
Blaise Tine
91fee5da11 minor update 2024-10-17 11:25:17 -07:00
Blaise Tine
077b682d7d minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-17 04:58:29 -07:00
Blaise Tine
5971158f43 minor update
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
2024-10-16 20:22:42 -07:00
Blaise Tine
a7ba377581 minor update 2024-10-16 18:04:11 -07:00
Blaise Tine
f695e4d754 minor update 2024-10-15 14:59:31 -07:00
Blaise Tine
e06333b3c0 minor update 2024-10-15 11:28:33 -07:00
Blaise Tine
645befdce6 minor update 2024-10-15 11:23:29 -07:00
Blaise Tine
e62b638d88 minor update 2024-10-15 10:36:05 -07:00
Blaise Tine
1d5e4f63dd minor update 2024-10-15 03:24:02 -07:00
Blaise Tine
68b78fc42f minor update 2024-10-15 02:32:17 -07:00
Blaise Tine
db98965f56 minor update 2024-10-15 02:27:07 -07:00
Blaise Tine
03a1e25828 adding cache replacement policy 2024-10-15 00:28:09 -07:00
tinebp
5d7e53f7d7
Merge pull request #194 from MichaelJSr/add-back-ecall-ebreak-traps
Add back the "ecall" and "ebreak" instruction traps for riscv-vector test functionality
2024-10-14 20:46:29 -07:00
Blaise Tine
37757fab8f fixed fifo_queue support for BRAM 2024-10-14 15:48:49 -07:00
MichaelJSr
0d04423074 Readded the ecall and ebreak instruction traps so that the riscv-vector tests run properly 2024-10-14 10:12:33 -07:00
Blaise Tine
fe5442dbb3 minor update 2024-10-13 23:34:57 -07:00
Blaise Tine
2a2fc2ae39 minor update 2024-10-13 23:25:41 -07:00
Blaise Tine
26df675e24 minor update 2024-10-13 20:08:38 -07:00
Blaise Tine
f63233334e minor update 2024-10-13 16:22:59 -07:00
Blaise Tine
9e5638c9b0 minor update 2024-10-13 12:06:55 -07:00
Blaise Tine
1d626588ef minor update 2024-10-13 11:49:12 -07:00
Blaise Tine
37f4d05393 minor update 2024-10-13 10:44:04 -07:00
Blaise Tine
9f32e5693c minor update 2024-10-13 10:41:32 -07:00
Blaise Tine
684f2e2d3d minor update 2024-10-13 03:42:51 -07:00
Blaise Tine
28bf27e951 rtl cache redesign to support xilinx bram types 2024-10-13 03:40:45 -07:00
Udit Subramanya
8155173aab add documentation based on intial feedback 2024-10-11 07:40:21 -07:00
Udit Subramanya
d3df61abb0 add initial development and production dockerfiles 2024-10-09 12:32:49 -04:00
Blaise Tine
a5381fd788 async bram optimization 2024-10-09 04:14:15 -07:00
Blaise Tine
f49084b298 improving block rams inference with registered read address. 2024-10-08 23:44:36 -07:00
Blaise Tine
ee96d4334b VX_onehot_encoder update 2024-10-08 23:01:01 -07:00
Blaise Tine
c91f9684fc minor update 2024-10-05 18:35:26 -07:00
Blaise Tine
07ce16e75c minor update 2024-10-05 17:42:26 -07:00
Blaise Tine
2eeb2ac532 fixed memory flags propagation through the cache hierarchy 2024-10-05 13:46:10 -07:00
Hyesoon Kim
91c135ac15
Merge pull request #185 from vortexgpgpu/tensor-core
Merge tensor-core and devel branch into master
2024-10-05 10:46:50 -04:00
jaewon-lee-github
faa3b9a469 Merge branch 'master' into tensor-core 2024-10-04 12:58:51 -04:00
Hyesoon Kim
847562be9e
Merge pull request #187 from vortexgpgpu/revert-181-master
Revert "Initial HBM changes for RTL"
2024-10-04 11:37:06 -04:00
Jaewon Lee
0bf79a0f05
Revert "Initial HBM changes for RTL" 2024-10-04 10:13:31 -04:00
Jaewon Lee
119805a959
Merge branch 'master' into tensor-core 2024-10-04 10:03:00 -04:00
tinebp
bc765d10bd
Merge pull request #181 from sij814/master
Initial HBM changes for RTL
2024-10-04 06:45:44 -07:00
Udit Subramanya
208c5b3804 reorg docs 2024-10-04 08:56:49 -04:00
Udit Subramanya
32b0376b28 remove old artifacts 2024-10-03 17:43:39 -04:00
Udit Subramanya
6a447350b7 remove redundant docs after consolidating 2024-10-03 17:42:47 -04:00
Udit Subramanya
dd16d70515 contributing and fpga docs 2024-10-03 17:29:21 -04:00
jaewon-lee-github
5cf6797bd3 - Change STARTUP_ADDR to use the same 0x80000000 address
- Fix environment variable for vortex kernel directories
2024-10-03 15:19:39 -04:00
jaewon-lee-github
bbc02cc013 merged with master 2024-10-03 13:44:39 -04:00
jaewon-lee-github
b7531c9de1 support 64bit 2024-10-02 17:46:01 -04:00
Jaewon Lee
6c725978b4
Merge pull request #184 from vortexgpgpu/develop
Develop
2024-10-02 15:41:35 -04:00
jaewon-lee-github
d1175a03c9 update the code accessing registers in obsoleted way 2024-10-02 14:16:57 -04:00
Blaise Tine
83badaac86 minor update 2024-10-02 11:10:33 -07:00
Blaise Tine
4b8ca42e85 minor update 2024-10-02 09:27:26 -07:00
Blaise Tine
ad7377c8ba minor udpate 2024-10-02 07:41:29 -07:00
Blaise Tine
5cb033ae13 minor update 2024-10-02 07:12:30 -07:00
Blaise Tine
44ebc12ed4 minor update 2024-10-01 00:55:45 -07:00
Blaise Tine
a3aca502b7 minor update 2024-09-30 14:20:48 -07:00
Jaewon Lee
4a606061d2
Merge branch 'develop' into tensor-core 2024-09-30 16:48:47 -04:00
Blaise Tine
ee69024841 minor update 2024-09-30 09:17:42 -07:00
Blaise Tine
6f81df5edb axi_adapter large tags support 2024-09-30 06:25:50 -07:00
Blaise Tine
1deb13c469 minor update 2024-09-30 03:36:00 -07:00
Blaise Tine
2d00cec9d3 minor update 2024-09-30 02:12:30 -07:00
Blaise Tine
a3031922ce minor update 2024-09-29 09:07:45 -07:00
Blaise Tine
60860ec684 minor update 2024-09-29 09:03:24 -07:00
Blaise Tine
cf3909a910 minor update 2024-09-29 07:52:53 -07:00
Blaise Tine
5c694a997c update scope tap testing 2024-09-29 00:09:25 -07:00
Blaise Tine
30571d716c updated scope CI test 2024-09-28 21:37:48 -07:00
Blaise Tine
b8475c65dc adjusting platform caps 2024-09-28 21:25:55 -07:00
Blaise Tine
4329e3f968 minor update 2024-09-28 20:28:57 -07:00
Blaise Tine
b634f9f47d count_leading_zeros fix 2024-09-28 20:15:03 -07:00
Blaise Tine
87e613d29d fixed XRT AFU deadlock on exit 2024-09-28 05:20:37 -07:00
Blaise Tine
eee037ffcd minor update 2024-09-27 20:59:29 -07:00
Blaise Tine
9027555e6a minor update 2024-09-27 20:30:57 -07:00
Blaise Tine
989341a77d minor udpate 2024-09-27 15:13:42 -07:00
Blaise Tine
ec8cc4c84d minor update 2024-09-27 14:21:09 -07:00
Blaise Tine
6e40162027 extending scope triggering to capture continous firing events 2024-09-27 11:36:31 -07:00
Blaise Tine
f2c970868e minor update 2024-09-27 10:02:59 -07:00
Blaise Tine
533ddffc47 cleanup multi-dimensional array to improve synthesis compatibility 2024-09-27 09:48:05 -07:00
Blaise Tine
e9f19a0bf9 fixed BRAM multi-dimensional array bug on Xilinx Vivado 2024-09-27 09:13:24 -07:00
Blaise Tine
5db1937a5e fixed scope parser array indexing 2024-09-27 07:52:38 -07:00
Blaise Tine
9a3eb74051 adding scope.py support for structs 2024-09-26 09:50:38 -07:00
Blaise Tine
27543e240e minor update 2024-09-25 19:11:40 -07:00
Blaise Tine
4f11278d2c scope_tap bug fixes and improvements 2024-09-25 10:28:19 -07:00
Blaise Tine
0e3206747a scope_tap bug fix 2024-09-24 21:46:26 -07:00
Blaise Tine
ce4f90e843 scope analyzer updates 2024-09-24 01:20:26 -07:00
Blaise Tine
a9a5ded030 bitmanip logceil fix 2024-09-23 23:54:43 -07:00
Hyesoon Kim
b5f541b891
Merge pull request #180 from vortexgpgpu/vortex_vm
Vortex vm
2024-09-24 02:48:46 -04:00
Blaise Tine
2cf483ddf5 xrt afu bug fixes 2024-09-23 21:01:24 -07:00
Blaise Tine
9a6dbdf1a9 xrtsim addressing fix 2024-09-23 08:56:57 -07:00
Blaise Tine
818522f7e4 CI scripts update 2024-09-23 05:57:08 -07:00
Blaise Tine
030071571d test memory bank interleaving 2024-09-23 04:30:28 -07:00
Blaise Tine
e5e9a5c2e9 build fix 2024-09-23 04:03:04 -07:00
Blaise Tine
406583c0bd build fix 2024-09-23 04:00:23 -07:00
Blaise Tine
29ea3041c4 build fix 2024-09-23 03:52:03 -07:00
Blaise Tine
828b8827e7 build error fix 2024-09-23 03:36:35 -07:00
Blaise Tine
a80be895ba fixed compiler errors 2024-09-23 03:05:46 -07:00
Blaise Tine
923d2bb94c mark as executable 2024-09-23 02:30:34 -07:00
Blaise Tine
e38c2c1fba xilinx xrt platforms configuration 2024-09-23 02:12:47 -07:00
Blaise Tine
8bb5e5ab8a build error fix 2024-09-22 22:47:23 -07:00
Blaise Tine
b146fab290 xrt kernel registers update 2024-09-22 22:46:55 -07:00
Blaise Tine
15ead4acf6 xrt with merge memory interface 2024-09-22 22:46:10 -07:00
Blaise Tine
f5eca75311 handling synthesis builds with simulation enabled (e.g xrt with hw_emu) 2024-09-22 22:43:48 -07:00
Blaise Tine
5e123d0507 minor update 2024-09-22 22:31:54 -07:00
Blaise Tine
54f0c8e270 scope analyzer optimization 2024-09-22 22:31:14 -07:00
Blaise Tine
b8199decf4 opaesim and xrtsim multi-bank memory support 2024-09-22 03:54:40 -07:00
Blaise Tine
00feb8b424 scope analyzer bug fixes 2024-09-21 08:39:20 -07:00
Blaise Tine
7938c7be5f synthesis updates 2024-09-20 20:35:58 -07:00
sij814
3bac7eae6a changed fpnew commit 2024-09-20 16:52:12 -07:00
Blaise Tine
a61f97f6c6 minor update 2024-09-20 08:09:46 -07:00
Jaewon Lee
5ab13559e0
Update README.md 2024-09-20 10:08:53 -04:00
jaewon-lee-github
4383631543 Add BARE mode test and print out VM info 2024-09-20 09:58:50 -04:00
jaewon-lee-github
9cc0010835 change verilator path 2024-09-20 09:19:17 -04:00
jaewon-lee-github
9902856221 VERILATOR 2024-09-20 09:05:54 -04:00
jaewon-lee-github
e5f2442353 Update Virtual Memory testing 2024-09-20 08:58:11 -04:00
Udit Subramanya
ff9d52c162 Merge remote-tracking branch 'upstream/master' into develop-documentation 2024-09-20 08:26:08 -04:00
Udit Subramanya
acc8221a7e Merge remote-tracking branch 'origin/master' into develop-documentation
Bring in latest docs, to update in this branch
2024-09-20 08:20:17 -04:00
Blaise Tine
63cce35c1a scope taps annotation 2024-09-19 23:33:23 -07:00
Blaise Tine
d2db612bb4 adding scope support to xrtsim 2024-09-19 22:33:28 -07:00
sij814
e8ce3878bb Merge branch 'master' of github.com:vortexgpgpu/vortex 2024-09-19 13:36:46 -07:00
sij814
380c36d930 merged rtlsim branch 2024-09-19 13:31:25 -07:00
sij814
4fff940e42 two different versions of bypass connection 2024-09-19 13:21:14 -07:00
Blaise Tine
2d7f9eae0a minor update 2024-09-19 04:44:00 -07:00
Blaise Tine
a37309c6b0 xrtsim implementation 2024-09-19 04:24:20 -07:00
sij814
48f86a48f6 changed mem_req_arb in VX_cache_l3.sv to accept data_out 2024-09-18 22:05:40 -07:00
Blaise Tine
f0bff2a4a2 minor update 2024-09-17 20:31:12 -07:00
Blaise Tine
8e3bd5696b xilinx synthesis debugging fixes 2024-09-17 19:52:51 -07:00
sij814
992f8d97d3 sliced the bypass requests 2024-09-17 19:47:13 -07:00
Blaise Tine
8908f3e006 minor update 2024-09-17 10:05:17 -07:00
Blaise Tine
f2c1ad7831 minor update 2024-09-17 09:56:54 -07:00
Blaise Tine
8135f72cc9 configure update 2024-09-17 06:45:22 -07:00
Blaise Tine
50458bbae0 xilinx synthesis debugging foxes 2024-09-17 06:22:07 -07:00
Jaewon Lee
5a2d4e6c26
Merge pull request #179 from vortexgpgpu/jaewon-lee-github-patch-2
Update README.md
2024-09-13 10:50:03 -04:00
Jaewon Lee
0a48d98bc1
Update README.md
It has the instruction about the other branch(Vortex_vm).
2024-09-13 09:39:28 -04:00
Udit Subramanya
dc76101068 contribution stats 2024-09-13 09:09:38 -04:00
Blaise Tine
bbe9c0372f minor update 2024-09-13 00:35:42 -07:00
Blaise Tine
263893eb7c minor update 2024-09-13 00:03:08 -07:00
Blaise Tine
b77fff764e minor update 2024-09-12 22:12:03 -07:00
Blaise Tine
145eacc451 minor update 2024-09-12 21:08:19 -07:00
Blaise Tine
1ddd1ba1cc minor update 2024-09-12 20:15:41 -07:00
Blaise Tine
49ed88e59f minor update 2024-09-12 20:12:18 -07:00
Blaise Tine
7208f251b7 minor update 2024-09-12 20:07:19 -07:00
Blaise Tine
6cf0d9f7b4 fixed generate labels lint warnings to improve hardware debugging 2024-09-12 20:00:50 -07:00
Hyesoon Kim
ccf0135d97
Merge pull request #178 from vortexgpgpu/vortex_vm
Vortex Virtual Memory Support
2024-09-12 14:12:04 -04:00
jaewon-lee-github
daec55ae95 change the ci version 2024-09-12 11:24:37 -04:00
Jaewon Lee
e91eb4aed4 merge from master branch 2024-09-12 10:32:02 -04:00
Blaise Tine
5c72685356 minor update 2024-09-11 17:27:36 -07:00
Blaise Tine
f00f96377b disable tracing on synthesis mode 2024-09-11 17:16:34 -07:00
Blaise Tine
230b29de6f minor update 2024-09-11 06:57:43 -07:00
Blaise Tine
bb9ae8576d adding uuid support to memory transactions 2024-09-11 06:47:33 -07:00
Blaise Tine
ae24264a2a minor update 2024-09-11 05:40:05 -07:00
Blaise Tine
83d65e2cf1 tracing update 2024-09-10 16:22:34 -07:00
Blaise Tine
63840a20da minor update 2024-09-09 06:10:56 -07:00
Blaise Tine
b56aa00f4f reset cleanup 2024-09-08 20:37:28 -07:00
Blaise Tine
202af1e783 rtl bug fix 2024-09-08 20:33:27 -07:00
Blaise Tine
207840a97e minor update 2024-09-08 17:49:28 -07:00
Blaise Tine
b1dc2fba42 cache read byteenable bug fix 2024-09-08 17:47:17 -07:00
Blaise Tine
cc105eaea9 tracing refactoring 2024-09-08 14:54:04 -07:00
Blaise Tine
fa11d4c502 TRACING refactoring to support vivado/quartus simulators 2024-09-08 05:26:00 -07:00
Blaise Tine
6626f9201c minor update 2024-09-08 02:46:32 -07:00
Blaise Tine
7823f5529c minor update 2024-09-08 01:38:48 -07:00
Blaise Tine
7bef62aef8 minor update 2024-09-08 01:37:20 -07:00
Blaise Tine
1a35d3fed1 fixed byteen signal on memory read 2024-09-07 21:33:45 -07:00
Blaise Tine
0cbdc3be9e opae afu x warning fixes 2024-09-07 21:32:11 -07:00
Blaise Tine
aa1489d8eb fixed trace.vcd copy 2024-09-07 03:45:23 -07:00
Blaise Tine
a75ed78bf2 fixed getopt exitcode with invalid parameters 2024-09-07 03:42:46 -07:00
Blaise Tine
2041a4ad4a xrt.ini update 2024-09-07 01:43:30 -07:00
Blaise Tine
bfbe642170 adding RTL uuigen 2024-09-07 01:36:17 -07:00
Blaise Tine
fdc62c5f98 minor update 2024-09-06 01:27:54 -07:00
Blaise Tine
e178eb1330 operands's x-propagation bug fix (caught using vivado simulator) 2024-09-05 21:35:10 -07:00
Blaise Tine
7cbb026a12 minor update 2024-09-05 21:34:44 -07:00
Blaise Tine
efc8834c75 xilinx afu reset refactoring 2024-09-05 21:32:25 -07:00
Blaise Tine
8db77ea1cd minor updates 2024-09-05 21:29:01 -07:00
Blaise Tine
cf9172b8fc minor update 2024-09-04 20:16:54 -07:00
Blaise Tine
fb0cd1c272 minor update 2024-09-04 18:24:42 -07:00
Blaise Tine
0aaca84016 minor update 2024-09-04 18:22:37 -07:00
Blaise Tine
8d1baf677d minor update 2024-09-04 18:17:27 -07:00
Blaise Tine
37555b1208 minor update 2024-09-04 15:18:39 -07:00
Blaise Tine
96fb3566a9 minor update 2024-09-04 13:44:23 -07:00
Blaise Tine
7ca9a5e87e reset relay refactory 2024-09-04 13:39:51 -07:00
Blaise Tine
039e5e2ffc minor update 2024-09-04 03:52:55 -07:00
Blaise Tine
32738e0b74 CI script update 2024-09-04 03:39:29 -07:00
Blaise Tine
fd5903fef1 minor update 2024-09-04 03:34:25 -07:00
Blaise Tine
335b53475a minor updates 2024-09-04 02:01:59 -07:00
Blaise Tine
f9230bdac3 minor update 2024-09-03 06:14:09 -07:00
Blaise Tine
19d6142023 fixed fpu serialization 2024-09-03 04:54:29 -07:00
Blaise Tine
c28449f515 minor update 2024-09-02 21:58:12 -07:00
Blaise Tine
45ed8abf22 minor update 2024-09-02 19:39:28 -07:00
Blaise Tine
d16aee3ecd minor update 2024-09-02 10:37:51 -07:00
Blaise Tine
c4df7221c6 Merge branch 'master' of https://github.com/vortexgpgpu/vortex into develop 2024-09-02 04:13:35 -07:00
Blaise Tine
33bec667c2 minor update 2024-09-02 04:12:58 -07:00
Blaise Tine
a17580375b fpu timing optimization 2024-09-02 03:11:26 -07:00
Blaise Tine
40e04a409e adding PE switch 2024-09-02 02:34:08 -07:00
Blaise Tine
d7eae0c886 minor update 2024-09-02 02:33:30 -07:00
Blaise Tine
32636fac70 minor update 2024-09-01 10:15:02 -07:00
Blaise Tine
8215089194 minor update 2024-09-01 04:03:46 -07:00
Blaise Tine
d979cf277f decoder logic specialization 2024-09-01 04:00:57 -07:00
Blaise Tine
72c63a47f3 adding read-first mode support to block ram 2024-09-01 01:19:24 -07:00
Blaise Tine
431c0cfc46 minor update 2024-08-31 02:14:08 -07:00
Blaise Tine
83ea236b84 minor update 2024-08-31 01:58:21 -07:00
Blaise Tine
01fedb066c minor updates 2024-08-31 01:57:08 -07:00
Blaise Tine
7d0c141129 minor updates 2024-08-31 01:44:41 -07:00
Blaise Tine
6eee0728fb minor update 2024-08-29 03:22:09 -07:00
Blaise Tine
fc5bb387a2 minor update 2024-08-29 03:02:50 -07:00
Blaise Tine
961b9c3d63 minor update 2024-08-29 02:41:36 -07:00
Blaise Tine
5f2bf2418b minor update 2024-08-29 02:40:54 -07:00
Blaise Tine
847dee3473 minor update 2024-08-29 01:30:54 -07:00
Blaise Tine
105f884129 migration from fpnew to latest cvfpu core to resolve fpnew bugs and feature limitations 2024-08-29 00:48:51 -07:00
Blaise Tine
fa1fd39645 minor updates 2024-08-28 21:31:09 -07:00
Blaise Tine
a38960674e SimX split.N fix 2024-08-28 21:10:05 -07:00
Blaise Tine
0f41774fea SimX's decode minor fix 2024-08-28 19:07:15 -07:00
Blaise Tine
41e41c9688 adjust SimX's split/join to match RTL. 2024-08-28 18:46:30 -07:00
Blaise Tine
74a47ebbe4 displatch unit fix 2024-08-28 04:36:13 -07:00
Blaise Tine
6c1e785004 minor update 2024-08-28 03:08:08 -07:00
Blaise Tine
4cc7426c44 minor update 2024-08-28 02:52:20 -07:00
Blaise Tine
cf42025c20 minor update 2024-08-28 01:35:55 -07:00
Blaise Tine
f4426e0127 fpu timing optimization 2024-08-28 01:27:51 -07:00
Blaise Tine
91b8c6e67a fixed xilinx fpu ip dut synthesis 2024-08-28 00:40:28 -07:00
Blaise Tine
c162d04b8f minor update 2024-08-27 03:17:01 -07:00
Blaise Tine
4480ed8b0e minor update 2024-08-27 01:19:02 -07:00
Blaise Tine
5adfd5ec68 minor update 2024-08-26 23:45:00 -07:00
Blaise Tine
6d5e71a062 minor update 2024-08-25 20:12:05 -07:00
Blaise Tine
9718a5b405 fpu timing optimization 2024-08-25 19:20:07 -07:00
Blaise Tine
51719f69bb minor update 2024-08-25 16:51:00 -07:00
Blaise Tine
2ca3439109 xrt runtime update 2024-08-25 15:52:27 -07:00
Blaise Tine
088aed022f minor update 2024-08-25 15:52:17 -07:00
Blaise Tine
df3fc150f4 minor update 2024-08-25 06:06:52 -07:00
Blaise Tine
b40441b68f minor update 2024-08-25 05:12:44 -07:00
Blaise Tine
bdcc5f5991 FPU decode optimization 2024-08-25 05:11:48 -07:00
Blaise Tine
b6879b25e3 switching to python3 dependency 2024-08-24 20:46:25 -07:00
Blaise Tine
592297582e fpu_unit timing optimization 2024-08-24 19:44:03 -07:00
Blaise Tine
e538dfa316 minor update 2024-08-24 19:11:06 -07:00
Blaise Tine
e05fe0d75b dispatch_unit speed up 2024-08-24 18:11:06 -07:00
Blaise Tine
383dc1f6b8 timing optimization 2024-08-24 17:38:01 -07:00
Blaise Tine
3b336d7fb3 register vs combinational signals naming consistency 2024-08-24 16:59:18 -07:00
Blaise Tine
4570a20eee minor update 2024-08-24 12:15:12 -07:00
Blaise Tine
10a8705161 minor update 2024-08-24 10:42:48 -07:00
Blaise Tine
1f5cc53434 minor update 2024-08-24 09:16:23 -07:00
Blaise Tine
0ed589a3bf minor update 2024-08-24 07:49:08 -07:00
Blaise Tine
cd97945d0d minor update 2024-08-24 04:51:27 -07:00
Blaise Tine
31a5ab714e xbar timing optimitzaion 2024-08-24 01:57:45 -07:00
Blaise Tine
370daf1025 fifo refactoring 2024-08-24 01:56:56 -07:00
Blaise Tine
bcf7d9f960 timing optimization 2024-08-24 01:56:14 -07:00
Blaise Tine
ade6b2c985 timing optimization 2024-08-24 01:55:25 -07:00
Blaise Tine
4f9b15d96d minor update 2024-08-24 01:54:17 -07:00
Hanran Wu
f57841608e Merge branch 'vortex_vm_rebased' into vortex_vm 2024-08-23 17:45:59 -04:00
Hanran Wu
35c15f554d Merge branch 'mranduril-vortex_vm_rebased' into vortex_vm 2024-08-23 17:45:03 -04:00
Hanran Wu
ea9560b33b merge 2024-08-23 17:44:24 -04:00
Hanran Wu
86b0bdd93c merge into vortex_vm 2024-08-23 17:20:42 -04:00
Hanran Wu
66fd2d4e2d update ci 2024-08-23 16:42:31 -04:00
Blaise Tine
6eeb8eac0f minor update 2024-08-23 00:54:48 -07:00
Blaise Tine
df99b9da0e minor update 2024-08-22 16:29:27 -07:00
sij814
7ae7ffa007 pulled master and made initial changes 2024-08-22 18:37:34 +02:00
Blaise Tine
e4bfa47895 adding test coverage for xilinx synthesis 2024-08-22 02:51:17 -07:00
Blaise Tine
ca3499f3df minor update 2024-08-21 17:54:30 -07:00
Blaise Tine
811ceb5dc0 minor update 2024-08-21 13:00:05 -07:00
Blaise Tine
177f0efc59 minor update 2024-08-21 03:39:09 -07:00
Blaise Tine
9797c6c48a minor udpate 2024-08-21 03:38:15 -07:00
Blaise Tine
771a10ea0c minor update 2024-08-20 23:31:16 -07:00
Blaise Tine
005d480bb4 minor updates 2024-08-20 23:30:44 -07:00
Blaise Tine
5e241c153c Ci script update 2024-08-19 18:36:37 -07:00
Blaise Tine
693a9f648d Ci script update 2024-08-19 18:25:38 -07:00
Blaise Tine
1814ff6d40 xilinx standalone synthesis fixes 2024-08-18 22:02:37 -07:00
Blaise Tine
2762bd53ff minor updates 2024-08-18 18:56:17 -07:00
Blaise Tine
8e9026524a synthesis of the memory unit and local memory 2024-08-18 16:03:59 -07:00
Blaise Tine
3612ceda80 minor update 2024-08-18 02:13:43 -07:00
Blaise Tine
a2b24b4ed0 xilinx non-xrt synthesis fixes 2024-08-18 02:10:34 -07:00
Blaise Tine
de47307428 minor update 2024-08-18 01:57:36 -07:00
Blaise Tine
06ef53025d minor update 2024-08-17 21:19:10 -07:00
tinebp
6c607d32fe
Merge pull request #169 from sij814/simx
simx HBM initial implementation
2024-08-17 20:24:37 -07:00
Blaise Tine
f6daf9bb84 Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop 2024-08-17 19:10:29 -07:00
tinebp
adcad92a73 extending OS support 2024-08-17 19:09:02 -07:00
Blaise Tine
51862dbc06 doc update 2024-08-17 19:05:47 -07:00
Blaise Tine
9d3d35c6b4 operands timing optimization 2024-08-17 16:03:02 -07:00
Blaise Tine
b6663eaff9 output register fix 2024-08-17 15:49:49 -07:00
Blaise Tine
a03471837c minor update 2024-08-17 15:21:13 -07:00
Blaise Tine
9638f5a6e6 minor update 2024-08-17 06:05:26 -07:00
Blaise Tine
62a4ee7a3e minor update 2024-08-17 05:32:21 -07:00
Blaise Tine
1f43d4a2fc ASE simulation fixes + docs update 2024-08-17 04:55:32 -07:00
Blaise Tine
8fe02093e2 minor udpate 2024-08-17 04:11:16 -07:00
Blaise Tine
20b82fd34d update configure to deep-copy syn directory tree 2024-08-17 04:09:50 -07:00
Blaise Tine
4b6f8efeaa removing trace_pkg to fix unsupported package dependencies 2024-08-17 04:07:10 -07:00
Blaise Tine
9fc9b43307 OPAE runtime bug fix 2024-08-17 02:18:04 -07:00
Blaise Tine
304761c6fc fixed blackbox temp driver mode with --rebuild=3 2024-08-16 22:32:35 -07:00
sij814
e34e4b790a forced memory bank change in opae 2024-08-16 16:53:18 -07:00
sij814
7a61b67170 added CAPS 2024-08-16 15:47:03 -07:00
Blaise Tine
f6ed49f19c minor update 2024-08-16 08:19:55 -07:00
Blaise Tine
d5fa26350c minor update 2024-08-16 01:35:20 -07:00
sij814
a523afbebe removed jammy 2024-08-15 22:30:32 -07:00
Blaise Tine
b83190c6e1 minor update 2024-08-15 21:29:06 -07:00
Blaise Tine
f4983cb380 core memory unit refactoring 2024-08-15 21:12:28 -07:00
Blaise Tine
65bd9afabb reset relay cleanup 2024-08-15 20:35:07 -07:00
Hanran Wu
54045fa05b skip build and tests ci stages for vm_disable due to verilator dependency 2024-08-15 23:04:08 -04:00
Hanran Wu
bc936c67a3 update ci 2024-08-15 23:02:03 -04:00
Hanran Wu
4a213e7c20 update readme 2024-08-15 23:00:14 -04:00
Hanran Wu
26df47d6e2 add a subset of tests for vm and update ci 2024-08-15 22:55:29 -04:00
sij814
d7e8fd74ff source_id = 0 2024-08-15 19:40:52 -07:00
Blaise Tine
49738672ec minor update 2024-08-15 19:34:50 -07:00
Hanran Wu
48ff4ee4e0 add VM_ENABLE flag to configure&compilation 2024-08-15 16:34:36 -04:00
Blaise Tine
aaff18cca2 bug fix 2024-08-15 05:11:51 -07:00
Blaise Tine
2b22d47dd9 minor update 2024-08-15 05:11:19 -07:00
Blaise Tine
98db249500 minor updates 2024-08-15 01:56:31 -07:00
Blaise Tine
9c346dee86 read-only cache optimization 2024-08-15 01:55:22 -07:00
Blaise Tine
58e5435f0f a priority arbiter performs better than round-robin during commit arbitration 2024-08-13 22:30:54 -07:00
Blaise Tine
cfb5cd5326 arbiter runtime assertion 2024-08-13 21:39:08 -07:00
Blaise Tine
aef1411af5 scoreboard timing optimization 2024-08-13 21:38:33 -07:00
tinebp
e23d569076
Merge pull request #171 from dhy2000/master
Same as #170
2024-08-13 18:48:07 -07:00
Blaise Tine
d6f1393627 memory coalescer timing optimization 2024-08-13 18:34:06 -07:00
sij814
ea34239b43 changes made for initial feedback 2024-08-13 16:52:27 -07:00
Hanran Wu
7528dd9c0f debug and remove travis.yml 2024-08-13 18:18:54 -04:00
Hanran Wu
19b5496f00 modify makefile to only compile simx 2024-08-13 17:54:06 -04:00
Blaise Tine
ee39da74b4 increasing reset delay 2024-08-13 04:14:02 -07:00
donghanyuan
1a9a04ac76 replace local static allocator to global static
Ensure MemoryPool construct before SimPlatform,
thus MemoryPool destruct after SimPlatform.

Avoid use-after-free issue clearing events_ of SimPlatform
after SimPortEvent's allocator is destructed.
2024-08-13 18:13:41 +08:00
Blaise Tine
76f4cd66d3 minor update 2024-08-13 03:08:48 -07:00
Blaise Tine
3ae3afc59b minor update 2024-08-12 21:34:41 -07:00
Blaise Tine
5126a7c472 minor update 2024-08-12 21:32:20 -07:00
Blaise Tine
6c1ee9bfea arbiter fixes 2024-08-12 20:08:08 -07:00
Blaise Tine
14ae4b8c13 minor update 2024-08-12 20:07:50 -07:00
Blaise Tine
2edda834c3 minor update 2024-08-12 18:11:21 -07:00
sij814
47427ab22e regression test with source_id 0 2024-08-12 16:22:30 -07:00
Blaise Tine
d74ee43a66 minor update 2024-08-12 14:19:09 -07:00
Blaise Tine
79362dea4b minor update 2024-08-12 14:01:11 -07:00
Blaise Tine
9053919e92 fixed synthesis warning 2024-08-12 05:24:46 -07:00
Blaise Tine
ed66ee2806 arbitration update 2024-08-12 04:09:56 -07:00
sij814
bab9496117 debugging segmentation fault with 8 clusters 2024-08-12 03:52:48 -07:00
sij814
de81baaabf hbm for vortex 2.2 2024-08-12 02:52:47 -07:00
Blaise Tine
6f3add273d elastic buffer lutram refactoring 2024-08-11 20:28:39 -07:00
Blaise Tine
1fb0691bc7 minor update 2024-08-11 19:50:31 -07:00
sij814
c94c3651ec configure change 22.04 2024-08-11 14:47:43 -07:00
Blaise Tine
8fb73b6da7 fair arbiter optimization 2024-08-10 22:11:49 -07:00
Blaise Tine
32a882e26f arbiters optimization 2024-08-10 18:41:10 -07:00
Blaise Tine
eaa7ed7fe2 rtl arbiter update 2024-08-10 02:38:54 -07:00
Blaise Tine
c8d0357ac6 rtl arbiter fixes 2024-08-10 00:37:56 -07:00
Blaise Tine
229641441f adding static assertion 2024-08-09 18:13:52 -07:00
Blaise Tine
42afa2472f cdiv 2024-08-09 18:11:12 -07:00
Blaise Tine
455fc8389c refactoring priority encoder 2024-08-09 13:58:19 -07:00
Blaise Tine
ab21f76aed minor update 2024-08-07 19:44:24 -07:00
Blaise Tine
f1e79f4c0f fixed toolchain install on centos/7 2024-08-07 19:44:04 -07:00
tinebp
932c435a20
Merge pull request #101 from dhy2000/master
fix #100: change return type to float
2024-08-07 18:09:44 -07:00
tinebp
aad3b26332
Merge branch 'master' into master 2024-08-07 18:09:27 -07:00
Blaise Tine
30ebb65fc3 minor update 2024-08-06 23:36:37 -07:00
Blaise Tine
0d7012e69e minor update 2024-08-06 21:27:08 -07:00
Blaise Tine
bddf276335 memory request flags refactoring 2024-08-06 19:05:22 -07:00
tinebp
09028d8cee
Merge pull request #144 from nayannair/tensor-core
dummy commit
2024-08-02 15:45:18 -07:00
Jaewon Lee
9cc3e0a459
Merge pull request #151 from mranduril/vortex_vm
Add virtual memory allocator for vortex vm
2024-07-30 13:59:14 -04:00
Hanran Wu
34f7e3c982 config ramulator2 2024-07-30 00:18:28 -04:00
Jaewon Lee
30258c04d2 Apply suggestions from code review
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-29 16:29:39 -04:00
Jaewon Lee
9db3870309 Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-29 16:29:39 -04:00
Jaewon Lee
34ef500910 Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-29 16:29:39 -04:00
Jaewon Lee
735b713613 Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-29 16:29:39 -04:00
Jaewon Lee
8d978f23ce Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-29 16:29:39 -04:00
Hanran Wu
78fc053ad5 save work before pull 2024-07-29 16:29:39 -04:00
Hanran Wu
6add1e16f6 debugged virtual memory allocator 2024-07-29 16:29:39 -04:00
Hanran Wu
49255bfa69 add virtual mem allocator addr spacereservation 2024-07-29 16:29:39 -04:00
Hanran Wu
31133ae6e9 update destructor of vx_device 2024-07-29 16:29:39 -04:00
Hanran Wu
7916684c36 vpn allocator debug complete, now pass demo&vecadd tests 2024-07-29 16:29:39 -04:00
Hanran Wu
aa45f55126 vpn allocator added but doesn't pass any tests 2024-07-29 16:29:39 -04:00
Jaewon Lee
5877cfe8ae Change STARTUP_ADDR from 0x40000000 to 0x80000000(32b) and 0x180000000(64b) 2024-07-29 16:29:39 -04:00
Jaewon Lee
52233fe13a fixed compile error 2024-07-29 16:29:39 -04:00
Jaewon Lee
6d480b3da1 satp_ is not set, then we skip VAT 2024-07-29 16:29:39 -04:00
Jaewon Lee
2e61dad11f Update README.md
Update TOOLDIR to vortex-toolchain-2024-6-14/
2024-07-29 16:29:39 -04:00
Jaewon Lee
c99e4b37b6 Update README.md 2024-07-29 16:29:02 -04:00
Jaewon Lee
3a5278a62e 64bit support 2024-07-29 15:31:47 -04:00
Jaewon Lee
e21bf9afbd Merge Vortex 2.2 2024-07-29 15:31:17 -04:00
Jaewon Lee
9942f251e0 remove # 2024-07-29 15:31:17 -04:00
Jaewon Lee
da9c51aa3f Virtual Memory Support 2024-07-29 15:31:17 -04:00
Jaewon Lee
7b80da2538 Update upload and download function in simx runtime 2024-07-29 15:31:17 -04:00
Jaewon Lee
53c547f9de Change the declaration of set_processor_satp function 2024-07-29 15:31:17 -04:00
Jaewon Lee
43a90071e1 Merge Austin's code (Preliminary) 2024-07-29 15:31:17 -04:00
Jaewon Lee
2662b6bcab Update README.md 2024-07-29 15:31:17 -04:00
Jaewon Lee
da1f4baa5d Update README.md 2024-07-29 15:29:59 -04:00
Hanran Wu
768c966681 expand MemoryUnit class defs and add some tlb-related functions 2024-07-29 15:29:59 -04:00
Jaewon Lee
ae312f9022 Update README.md 2024-07-29 15:29:36 -04:00
Jaewon Lee
e20a610e67 Update README.md 2024-07-29 15:29:20 -04:00
Hanran Wu
e7660b6ffe Merge branch 'vortex_vm' of https://github.com/mranduril/vortex into vortex_vm
add changes from pull request reviews
2024-07-29 14:35:20 -04:00
Hanran Wu
de66a1b861 save work before pull 2024-07-29 14:35:11 -04:00
Jaewon Lee
90b4a16c9b
Apply suggestions from code review
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-20 11:20:27 -04:00
Jaewon Lee
c3e657f201
Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-20 10:39:40 -04:00
Jaewon Lee
0f8e5505d3
Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-20 10:36:58 -04:00
Jaewon Lee
a23fb26a8b
Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-20 10:34:24 -04:00
Jaewon Lee
a4ee8dfa7f
Update runtime/simx/vortex.cpp
Co-authored-by: Martin Troiber <34752929+troibe@users.noreply.github.com>
2024-07-20 10:25:33 -04:00
Hanran Wu
91a1f41f99 debugged virtual memory allocator 2024-07-11 14:49:00 -04:00
Hanran Wu
b8757c539d add virtual mem allocator addr spacereservation 2024-07-10 22:39:00 -04:00
Hanran Wu
314ad3ff8a update destructor of vx_device 2024-07-09 13:42:57 -04:00
Hanran Wu
31837dd7c3 vpn allocator debug complete, now pass demo&vecadd tests 2024-07-08 17:10:19 -04:00
Hanran Wu
f0ea1acaa2 vpn allocator added but doesn't pass any tests 2024-07-08 17:07:30 -04:00
Jaewon Lee
c13e02b19f Change STARTUP_ADDR from 0x40000000 to 0x80000000(32b) and 0x180000000(64b) 2024-06-30 03:10:36 -04:00
Jaewon Lee
ccbb2243cc fixed compile error 2024-06-30 00:54:22 -04:00
Jaewon Lee
3caeeeea13 satp_ is not set, then we skip VAT 2024-06-30 00:35:26 -04:00
Jaewon Lee
b99cd97622 Merge branch 'vortex_vm' of github.com:vortexgpgpu/vortex into vortex_vm 2024-06-29 17:43:44 -04:00
Jaewon Lee
d531fa6b26 64bit support 2024-06-29 17:43:20 -04:00
Jaewon Lee
4ab015ddd9
Update README.md
Update TOOLDIR to vortex-toolchain-2024-6-14/
2024-06-28 09:48:04 -04:00
Nayan Sivakumar Nair
5e63b8f35a dummy commit 2024-06-25 23:27:18 -04:00
Nayan Sivakumar Nair
5b0fc8cbd4 Fixes for PR 2024-06-25 03:18:50 -04:00
Jaewon Lee
3d98121ab6
Update README.md 2024-06-23 11:24:10 -04:00
Jaewon Lee
02091f3d44 Merge Vortex 2.2 2024-06-22 23:55:01 -04:00
Nayan Sivakumar Nair
a378aed67c Moved tc_num, tc_size param to makefile args 2024-06-21 22:23:24 -04:00
Jaewon Lee
2271d2b286 remove # 2024-06-19 02:04:24 -04:00
Jaewon Lee
862997fc94 Virtual Memory Support 2024-06-19 01:52:22 -04:00
Jaewon Lee
62673b4b72 Update upload and download function in simx runtime 2024-06-19 01:43:11 -04:00
Jaewon Lee
01c7b5e384 Change the declaration of set_processor_satp function 2024-06-19 01:36:26 -04:00
Jaewon Lee
cfcece940e Merge Austin's code (Preliminary) 2024-06-19 01:36:26 -04:00
Jaewon Lee
d8a6ac748a Update README.md 2024-06-19 01:09:56 -04:00
Jaewon Lee
2f2974ee72 Ignore the changed on ramulator 2024-06-19 01:09:56 -04:00
Jaewon Lee
6f0af066e8 Update README.md 2024-06-19 01:09:56 -04:00
Hanran Wu
2b426693f5 expand MemoryUnit class defs and add some tlb-related functions 2024-06-19 01:09:56 -04:00
Jaewon Lee
cf3f2d4f6f Update README.md 2024-06-19 01:09:56 -04:00
Jaewon Lee
54af5eb186 Update README.md 2024-06-19 01:09:56 -04:00
Jaewon Lee
efe12ca6bf Update README.md 2024-06-19 01:09:18 -04:00
Varsha Singhania
0e3badf723 Script checkin and code cleanup 2024-06-18 02:19:57 -04:00
Varsha Singhania
99c6a1af5a Tensor cores in Vortex 2024-06-17 04:28:51 -04:00
dhy2000
b08c7403f6
fix #100: change return type to float 2024-01-26 19:56:50 +08:00
Udit Subramanya
247f91a296
Merge branch 'vortexgpgpu:master' into master 2023-12-01 08:39:18 -05:00
Nicholas Ade
afa9e4003c adding mul and divide to bfloat 2023-04-13 04:20:23 -04:00
Nicholas Ade
1b6d9bd3a5 Making the bfloat files 2023-04-12 15:01:09 -04:00
398 changed files with 54338 additions and 29992 deletions

8
.clang-format Normal file
View file

@ -0,0 +1,8 @@
Language: Cpp
BasedOnStyle: LLVM
IndentWidth: 2
TabWidth: 2
ColumnLimit: 0
UseTab: Never
BreakBeforeBraces: Attach
AlwaysBreakTemplateDeclarations: true

View file

@ -17,17 +17,17 @@ on: [push, pull_request]
jobs:
setup:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v4
with:
submodules: recursive
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
@ -36,7 +36,7 @@ jobs:
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
@ -46,7 +46,7 @@ jobs:
- name: Install Dependencies
if: steps.cache-toolchain.outputs.cache-hit != 'true' || steps.cache-thirdparty.outputs.cache-hit != 'true'
run: |
sudo bash ./ci/system_updates.sh
sudo bash ./ci/install_dependencies.sh
- name: Setup Toolchain
if: steps.cache-toolchain.outputs.cache-hit != 'true'
@ -63,7 +63,7 @@ jobs:
make -C third_party > /dev/null
build:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: setup
strategy:
matrix:
@ -71,15 +71,15 @@ jobs:
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v4
- name: Install Dependencies
run: |
sudo bash ./ci/system_updates.sh
sudo bash ./ci/install_dependencies.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
@ -88,7 +88,7 @@ jobs:
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
@ -106,31 +106,31 @@ jobs:
make tests -s > /dev/null
- name: Upload Build Artifact
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v4
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
tests:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: build
strategy:
fail-fast: false
matrix:
name: [regression, opencl, cache, config1, config2, debug, stress]
name: [regression, opencl, cache, config1, config2, debug, scope, stress, synthesis, vm, vector]
xlen: [32, 64]
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@v4
- name: Install Dependencies
run: |
sudo bash ./ci/system_updates.sh
sudo bash ./ci/install_dependencies.sh
- name: Cache Toolchain Directory
id: cache-toolchain
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: tools
key: ${{ runner.os }}-toolchain-v0.1
@ -139,7 +139,7 @@ jobs:
- name: Cache Third Party Directory
id: cache-thirdparty
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: third_party
key: ${{ runner.os }}-thirdparty-v0.1
@ -147,7 +147,7 @@ jobs:
${{ runner.os }}-thirdparty-
- name: Download Build Artifact
uses: actions/download-artifact@v2
uses: actions/download-artifact@v4
with:
name: build-${{ matrix.xlen }}
path: build${{ matrix.xlen }}
@ -161,16 +161,15 @@ jobs:
./ci/regression.sh --unittest
./ci/regression.sh --isa
./ci/regression.sh --kernel
./ci/regression.sh --synthesis
./ci/regression.sh --regression
else
./ci/regression.sh --${{ matrix.name }}
fi
complete:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
needs: tests
steps:
- name: Check Completion
run: echo "All matrix jobs passed"
run: echo "All matrix jobs passed"

3
.gitignore vendored
View file

@ -1,3 +1,4 @@
/build*
/.vscode
*.cache
*.cache
*.code-workspace

6
.gitmodules vendored
View file

@ -1,9 +1,9 @@
[submodule "third_party/fpnew"]
path = third_party/fpnew
url = https://github.com/pulp-platform/fpnew.git
[submodule "third_party/softfloat"]
path = third_party/softfloat
url = https://github.com/ucb-bar/berkeley-softfloat-3.git
[submodule "third_party/ramulator"]
path = third_party/ramulator
url = https://github.com/CMU-SAFARI/ramulator2.git
[submodule "third_party/cvfpu"]
path = third_party/cvfpu
url = https://github.com/openhwgroup/cvfpu.git

20
Dockerfile.dev Normal file
View file

@ -0,0 +1,20 @@
FROM ubuntu:20.04
LABEL "Udit Subramanya"="usubramanya3@gatech.edu"
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \
apt-get install -y build-essential valgrind git wget libpng-dev libboost-all-dev uuid-dev ccache cmake
# Third-Party Repository to Install g++11 on Ubuntu 20.04
RUN apt-get install -y manpages-dev software-properties-common
RUN add-apt-repository -y ppa:ubuntu-toolchain-r/test
RUN apt-get install -y gcc-11 g++-11
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 11
RUN update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 11
# create a directory for mounting the volume
WORKDIR /root/vortex

View file

@ -1,10 +1,35 @@
# Vortex GPGPU
Vortex is a full-stack open-source RISC-V GPGPU.
Vortex is a full-stack open-source RISC-V GPGPU. Vortex supports multiple **backend drivers**, including our C++ simulator (simx), an RTL simulator, and physical Xilinx and Altera FPGAs, all controlled by a single driver script. The chosen driver determines the corresponding code invoked to run Vortex. Generally, developers will prototype their intended design in simx before going forward with an RTL implementation. Alternatively, you can get up and running by selecting a driver of your choice and running a demo program.
## Website
Vortex news can be found on its [website](https://vortex.cc.gatech.edu/)
## Citation
```
@inproceedings{10.1145/3466752.3480128,
author = {Tine, Blaise and Yalamarthy, Krishna Praveen and Elsabbagh, Fares and Hyesoon, Kim},
title = {Vortex: Extending the RISC-V ISA for GPGPU and 3D-Graphics},
year = {2021},
isbn = {9781450385572},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3466752.3480128},
doi = {10.1145/3466752.3480128},
abstract = {The importance of open-source hardware and software has been increasing. However, despite GPUs being one of the more popular accelerators across various applications, there is very little open-source GPU infrastructure in the public domain. We argue that one of the reasons for the lack of open-source infrastructure for GPUs is rooted in the complexity of their ISA and software stacks. In this work, we first propose an ISA extension to RISC-V that supports GPGPUs and graphics. The main goal of the ISA extension proposal is to minimize the ISA changes so that the corresponding changes to the open-source ecosystem are also minimal, which makes for a sustainable development ecosystem. To demonstrate the feasibility of the minimally extended RISC-V ISA, we implemented the complete software and hardware stacks of Vortex on FPGA. Vortex is a PCIe-based soft GPU that supports OpenCL and OpenGL. Vortex can be used in a variety of applications, including machine learning, graph analytics, and graphics rendering. Vortex can scale up to 32 cores on an Altera Stratix 10 FPGA, delivering a peak performance of 25.6 GFlops at 200 Mhz.},
booktitle = {MICRO-54: 54th Annual IEEE/ACM International Symposium on Microarchitecture},
pages = {754–766},
numpages = {13},
keywords = {reconfigurable computing, memory systems., computer graphics},
location = {Virtual Event, Greece},
series = {MICRO '21}
}
```
## Specifications
- Support RISC-V RV32IMAF and RV64IMAFD
- Microarchitecture:
- configurable number of cores, warps, and threads.
- configurable number of ALU, FPU, LSU, and SFU units per core.
@ -29,48 +54,50 @@ Vortex is a full-stack open-source RISC-V GPGPU.
- `ci`: Continuous integration scripts.
- `miscs`: Miscellaneous resources.
## Build Instructions
More detailed build instructions can be found [here](docs/install_vortex.md).
## Quick Start
If you are interested in a stable release of Vortex, you can download the latest release [here](https://github.com/vortexgpgpu/vortex/releases/latest). Otherwise, you can pull the most recent, but (potentially) unstable version as shown below. The following steps demonstrate how to build and run Vortex with the default driver: SimX. If you are interested in a different backend, look [here](docs/simulation.md).
### Supported OS Platforms
- Ubuntu 18.04, 20.04
- Ubuntu 18.04, 20.04, 22.04, 24.04
- Centos 7
### Toolchain Dependencies
The following dependencies will be fetched prebuilt by `toolchain_install.sh`.
- [POCL](http://portablecl.org/)
- [LLVM](https://llvm.org/)
- [RISCV-GNU-TOOLCHAIN](https://github.com/riscv-collab/riscv-gnu-toolchain)
- [Verilator](https://www.veripool.org/verilator)
- [FpNew](https://github.com/pulp-platform/fpnew.git)
- [cvfpu](https://github.com/openhwgroup/cvfpu.git)
- [SoftFloat](https://github.com/ucb-bar/berkeley-softfloat-3.git)
- [Ramulator](https://github.com/CMU-SAFARI/ramulator.git)
- [Yosys](https://github.com/YosysHQ/yosys)
- [Sv2v](https://github.com/zachjs/sv2v)
### Install development tools
```sh
sudo apt-get install build-essential
sudo apt-get install binutils
sudo apt-get install python
sudo apt-get install uuid-dev
sudo apt-get install git
```
### Install Vortex codebase
```sh
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
git clone --depth=1 --recursive https://github.com/vortexgpgpu/vortex.git
cd vortex
```
### Install system dependencies
```sh
# ensure dependent libraries are present
sudo ./ci/install_dependencies.sh
```
### Configure your build folder
```sh
mkdir build
cd build
../configure --xlen=32 --tooldir=$HOME/tools
mkdir build
cd build
# for 32bit
../configure --xlen=32 --tooldir=$HOME/tools
# for 64bit
../configure --xlen=64 --tooldir=$HOME/tools
```
### Install prebuilt toolchain
```sh
./ci/toolchain_install.sh --all
./ci/toolchain_install.sh --all
```
### Set environment variables
### Set environment variables
```sh
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
# should always run before using the toolchain!
source ./ci/toolchain_env.sh
```
### Building Vortex
```sh
@ -88,20 +115,20 @@ make -s
make -s
make install
```
- Building Vortex 64-bit simply requires using --xlen=64 configure option.
- Building Vortex 64-bit requires setting --xlen=64 configure option.
```sh
../configure --xlen=32 --tooldir=$HOME/tools
../configure --xlen=64 --tooldir=$HOME/tools
```
- Sourcing "./ci/toolchain_env.sh" is required every time you start a new terminal. We recommend adding "source <build-path>/ci/toolchain_env.sh" to your ~/.bashrc file to automate the process at login.
```sh
echo "source <build-path>/ci/toolchain_env.sh" >> ~/.bashrc
```
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again to get it propagated into your build folder.
- Making changes to Makefiles in your source tree or adding new folders will require executing the "configure" script again without any options to get changes propagated to your build folder.
```sh
../configure
```
- To debug the GPU, you can generate a "run.log" trace. see /docs/debugging.md for more information.
- To debug the GPU, the simulation can generate a runtime trace for analysis. See /docs/debugging.md for more information.
```sh
./ci/blackbox.sh --app=demo --debug=3
```
- For additional information, check out the /docs.
- For additional information, check out the [documentation](docs/index.md)

View file

@ -13,6 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
SCRIPT_DIR=$(dirname "$0")
ROOT_DIR=$SCRIPT_DIR/..
show_usage()
{
echo "Vortex BlackBox Test Driver v1.0"
@ -29,302 +32,174 @@ show_help()
echo "--rebuild: 0=disable, 1=force, 2=auto, 3=temp"
}
SCRIPT_DIR=$(dirname "$0")
ROOT_DIR=$SCRIPT_DIR/..
DRIVER=simx
APP=sgemm
CLUSTERS=1
CORES=1
WARPS=4
THREADS=4
L2=
L3=
DEBUG=0
DEBUG_LEVEL=0
SCOPE=0
HAS_ARGS=0
PERF_CLASS=0
REBUILD=2
TEMPBUILD=0
LOGFILE=run.log
for i in "$@"
do
case $i in
--driver=*)
DRIVER=${i#*=}
shift
;;
--app=*)
APP=${i#*=}
shift
;;
--clusters=*)
CLUSTERS=${i#*=}
shift
;;
--cores=*)
CORES=${i#*=}
shift
;;
--warps=*)
WARPS=${i#*=}
shift
;;
--threads=*)
THREADS=${i#*=}
shift
;;
--l2cache)
L2=-DL2_ENABLE
shift
;;
--l3cache)
L3=-DL3_ENABLE
shift
;;
--debug=*)
DEBUG_LEVEL=${i#*=}
DEBUG=1
shift
;;
--scope)
SCOPE=1
CORES=1
shift
;;
--perf=*)
PERF_FLAG=-DPERF_ENABLE
PERF_CLASS=${i#*=}
shift
;;
--args=*)
ARGS=${i#*=}
HAS_ARGS=1
shift
;;
--rebuild=*)
REBUILD=${i#*=}
shift
;;
--log=*)
LOGFILE=${i#*=}
shift
;;
--help)
show_help
exit 0
;;
*)
show_usage
exit -1
;;
esac
done
if [ $REBUILD -eq 3 ];
then
REBUILD=1
TEMPBUILD=1
fi
case $DRIVER in
gpu)
DRIVER_PATH=
;;
simx)
DRIVER_PATH=$ROOT_DIR/runtime/simx
;;
rtlsim)
DRIVER_PATH=$ROOT_DIR/runtime/rtlsim
;;
opae)
DRIVER_PATH=$ROOT_DIR/runtime/opae
;;
xrt)
DRIVER_PATH=$ROOT_DIR/runtime/xrt
;;
*)
echo "invalid driver: $DRIVER"
exit -1
;;
esac
if [ -d "$ROOT_DIR/tests/opencl/$APP" ];
then
APP_PATH=$ROOT_DIR/tests/opencl/$APP
elif [ -d "$ROOT_DIR/tests/regression/$APP" ];
then
APP_PATH=$ROOT_DIR/tests/regression/$APP
else
echo "Application folder not found: $APP"
exit -1
fi
if [ "$DRIVER" = "gpu" ];
then
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
add_option() {
if [ -n "$1" ]; then
echo "$1 $2"
else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
echo "$2"
fi
}
DEFAULTS() {
DRIVER=simx
APP=sgemm
DEBUG=0
DEBUG_LEVEL=0
SCOPE=0
HAS_ARGS=0
PERF_CLASS=0
CONFIGS="$CONFIGS"
REBUILD=2
TEMPBUILD=0
LOGFILE=run.log
}
parse_args() {
DEFAULTS
for i in "$@"; do
case $i in
--driver=*) DRIVER=${i#*=} ;;
--app=*) APP=${i#*=} ;;
--clusters=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CLUSTERS=${i#*=}") ;;
--cores=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_CORES=${i#*=}") ;;
--warps=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_WARPS=${i#*=}") ;;
--threads=*) CONFIGS=$(add_option "$CONFIGS" "-DNUM_THREADS=${i#*=}") ;;
--l2cache) CONFIGS=$(add_option "$CONFIGS" "-DL2_ENABLE") ;;
--l3cache) CONFIGS=$(add_option "$CONFIGS" "-DL3_ENABLE") ;;
--perf=*) CONFIGS=$(add_option "$CONFIGS" "-DPERF_ENABLE"); PERF_CLASS=${i#*=} ;;
--debug=*) DEBUG=1; DEBUG_LEVEL=${i#*=} ;;
--scope) SCOPE=1; ;;
--args=*) HAS_ARGS=1; ARGS=${i#*=} ;;
--rebuild=*) REBUILD=${i#*=} ;;
--log=*) LOGFILE=${i#*=} ;;
--help) show_help; exit 0 ;;
*) show_usage; exit 1 ;;
esac
done
if [ $REBUILD -eq 3 ];
then
REBUILD=1
TEMPBUILD=1
fi
}
set_driver_path() {
case $DRIVER in
gpu) DRIVER_PATH="" ;;
simx|rtlsim|opae|xrt) DRIVER_PATH="$ROOT_DIR/runtime/$DRIVER" ;;
*) echo "Invalid driver: $DRIVER"; exit 1 ;;
esac
}
set_app_path() {
if [ -d "$ROOT_DIR/tests/opencl/$APP" ]; then
APP_PATH="$ROOT_DIR/tests/opencl/$APP"
elif [ -d "$ROOT_DIR/tests/regression/$APP" ]; then
APP_PATH="$ROOT_DIR/tests/regression/$APP"
else
echo "Application folder not found: $APP"
exit 1
fi
}
build_driver() {
local cmd_opts=""
[ $DEBUG -ne 0 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=$DEBUG_LEVEL")
[ $SCOPE -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "SCOPE=1")
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DESTDIR=\"$TEMPDIR\"")
[ -n "$CONFIGS" ] && cmd_opts=$(add_option "$cmd_opts" "CONFIGS=\"$CONFIGS\"")
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $DRIVER_PATH > /dev/null"
eval "$cmd_opts make -C $DRIVER_PATH > /dev/null"
else
echo "Running: make -C $DRIVER_PATH > /dev/null"
make -C $DRIVER_PATH > /dev/null
fi
}
run_app() {
local cmd_opts=""
[ $DEBUG -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "DEBUG=1")
[ $TEMPBUILD -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "VORTEX_RT_PATH=\"$TEMPDIR\"")
[ $HAS_ARGS -eq 1 ] && cmd_opts=$(add_option "$cmd_opts" "OPTS=\"$ARGS\"")
if [ $DEBUG -ne 0 ]; then
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
eval "$cmd_opts make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
else
echo "Running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
fi
else
if [ -n "$cmd_opts" ]; then
echo "Running: $cmd_opts make -C $APP_PATH run-$DRIVER"
eval "$cmd_opts make -C $APP_PATH run-$DRIVER"
else
echo "Running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
fi
fi
status=$?
return $status
}
main() {
    # Entry point: parse the command line, resolve driver/application paths,
    # rebuild the driver when its configuration changed, run the application
    # and exit with the application's status.
    parse_args "$@"
    set_driver_path
    set_app_path

    # execute on default installed GPU (no driver build required)
    if [ "$DRIVER" = "gpu" ]; then
        run_app
        exit $?
    fi

    if [ -n "$CONFIGS" ]; then
        echo "CONFIGS=$CONFIGS"
    fi

    # REBUILD=0 never cleans; REBUILD=1 always cleans; any other value cleans
    # only when CONFIGS/DEBUG/SCOPE differ from what the cache file recorded.
    if [ $REBUILD -ne 0 ]; then
        BLACKBOX_CACHE=blackbox.$DRIVER.cache
        LAST_CONFIGS=$(cat "$BLACKBOX_CACHE" 2>/dev/null || echo "")
        if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ]; then
            make -C "$DRIVER_PATH" clean-driver > /dev/null
            echo "$CONFIGS+$DEBUG+$SCOPE" > "$BLACKBOX_CACHE"
        fi
    fi

    # export performance monitor class identifier
    export VORTEX_PROFILING=$PERF_CLASS

    # ensure config update and that the stub driver is present
    make -C "$ROOT_DIR/hw" config > /dev/null
    make -C "$ROOT_DIR/runtime/stub" > /dev/null

    if [ $TEMPBUILD -eq 1 ]; then
        # setup temp directory (mktemp -d already creates it)
        TEMPDIR=$(mktemp -d) || { echo "Failed to create temporary directory" >&2; exit 1; }
        # register cleanup before anything is built into the directory, so it
        # is removed even when a later step fails; single quotes defer the
        # expansion and keep the path quoted when the trap runs
        trap 'rm -rf "$TEMPDIR"' EXIT
        # build stub driver
        echo "running: DESTDIR=$TEMPDIR make -C $ROOT_DIR/runtime/stub"
        DESTDIR="$TEMPDIR" make -C "$ROOT_DIR/runtime/stub" > /dev/null
    fi

    build_driver
    run_app
    status=$?

    # bring generated waveforms back to the invocation directory
    if [ $DEBUG -eq 1 ] && [ -f "$APP_PATH/trace.vcd" ]; then
        mv -f "$APP_PATH/trace.vcd" .
    fi

    if [ $SCOPE -eq 1 ] && [ -f "$APP_PATH/scope.vcd" ]; then
        mv -f "$APP_PATH/scope.vcd" .
    fi

    exit $status
}
# Inline (non-function) flow: compose the CONFIGS string, clean the driver
# when the configuration changed, build the driver, run the application's
# run-$DRIVER target and exit with that target's status.
# NOTE(review): this block ends with `exit $status`, so any statement placed
# after it in the script is never reached.

# prepend the cluster/core/warp/thread counts and the $L2, $L3, $PERF_FLAG
# values to whatever CONFIGS the caller supplied
CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_THREADS=$THREADS $L2 $L3 $PERF_FLAG $CONFIGS"
echo "CONFIGS=$CONFIGS"
# REBUILD=0 skips the check; REBUILD=1 always cleans the driver; any other
# value cleans only when CONFIGS+DEBUG+SCOPE differs from the cached record
if [ $REBUILD -ne 0 ]
then
BLACKBOX_CACHE=blackbox.$DRIVER.cache
if [ -f "$BLACKBOX_CACHE" ]
then
LAST_CONFIGS=`cat $BLACKBOX_CACHE`
fi
if [ $REBUILD -eq 1 ] || [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
then
make -C $DRIVER_PATH clean-driver > /dev/null
# remember the configuration the next driver build corresponds to
echo "$CONFIGS+$DEBUG+$SCOPE" > $BLACKBOX_CACHE
fi
fi
# export performance monitor class identifier
export VORTEX_PROFILING=$PERF_CLASS
status=0
# ensure config update
make -C $ROOT_DIR/hw config > /dev/null
# ensure the stub driver is present
make -C $ROOT_DIR/runtime/stub > /dev/null
# debug branch: the application runs with DEBUG=1 and its output goes to $LOGFILE
if [ $DEBUG -ne 0 ]
then
# running application
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
# cleanup temp directory
# (registered as an EXIT trap, so the removal happens when the script terminates)
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DEBUG=$DEBUG_LEVEL CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DEBUG=$DEBUG_LEVEL CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 OPTS=$ARGS make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1"
DEBUG=1 make -C $APP_PATH run-$DRIVER > $LOGFILE 2>&1
status=$?
fi
fi
# move the waveform trace, when one was produced, into the current directory
if [ -f "$APP_PATH/trace.vcd" ]
then
mv -f $APP_PATH/trace.vcd .
fi
else
# non-debug branch: same build/run sequence without DEBUG and without log redirection
if [ $TEMPBUILD -eq 1 ]
then
# setup temp directory
TEMPDIR=$(mktemp -d)
mkdir -p "$TEMPDIR/$DRIVER"
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: DESTDIR=$TEMPDIR/$DRIVER SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: DESTDIR=$TEMPDIR/$DRIVER CONFIGS=$CONFIGS make -C $DRIVER_PATH"
DESTDIR="$TEMPDIR/$DRIVER" CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER"
VORTEX_RT_PATH=$TEMPDIR make -C $APP_PATH run-$DRIVER
status=$?
fi
# cleanup temp directory
# (registered as an EXIT trap, so the removal happens when the script terminates)
trap "rm -rf $TEMPDIR" EXIT
else
# driver initialization
if [ $SCOPE -eq 1 ]
then
echo "running: SCOPE=1 CONFIGS=$CONFIGS make -C $DRIVER_PATH"
SCOPE=1 CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
else
echo "running: CONFIGS=$CONFIGS make -C $DRIVER_PATH"
CONFIGS="$CONFIGS" make -C $DRIVER_PATH > /dev/null
fi
# running application
if [ $HAS_ARGS -eq 1 ]
then
echo "running: OPTS=$ARGS make -C $APP_PATH run-$DRIVER"
OPTS=$ARGS make -C $APP_PATH run-$DRIVER
status=$?
else
echo "running: make -C $APP_PATH run-$DRIVER"
make -C $APP_PATH run-$DRIVER
status=$?
fi
fi
fi
# propagate the application's make status to the caller
exit $status
main "$@"

46
ci/install_dependencies.sh Executable file
View file

@ -0,0 +1,46 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Function to check if GCC version is less than 11
check_gcc_version() {
    # Returns 0 when the gcc found on PATH reports a version below 11,
    # and 1 in every other case.
    local installed
    installed=$(gcc -dumpversion)
    # dpkg performs the Debian-style version comparison; any non-zero
    # result from it is collapsed to 1
    dpkg --compare-versions "$installed" lt 11 && return 0
    return 1
}
# Refresh the package index before installing anything
apt-get update -y

# System packages required by the build
apt-get install -y \
    build-essential \
    valgrind \
    libstdc++6 \
    binutils \
    python3 \
    uuid-dev \
    ccache \
    cmake \
    libffi7

# Install GCC 11 from the toolchain PPA only when the current GCC is older
if ! check_gcc_version; then
    echo "GCC version is 11 or greater. No need to install GCC 11."
else
    echo "GCC version is less than 11. Installing GCC 11..."
    add-apt-repository -y ppa:ubuntu-toolchain-r/test
    apt-get update
    apt-get install -y g++-11 gcc-11
    # register the new compilers as alternatives for g++ and gcc
    for tool in g++ gcc; do
        update-alternatives --install /usr/bin/${tool} ${tool} /usr/bin/${tool}-11 100
    done
fi

View file

@ -19,6 +19,8 @@ set -e
# clear blackbox cache
rm -f blackbox.*.cache
# HW: add a test "VM Test" to make sure VM feature is enabled
XLEN=${XLEN:=@XLEN@}
XSIZE=$((XLEN / 8))
@ -41,31 +43,23 @@ isa()
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32f
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64d
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64fx
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP -DEXT_D_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-64fx
fi
# clean build
@ -100,10 +94,18 @@ regression()
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tgbar" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tgbar" --cores=2
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=opae --app=dogfood --args="-n1 -tbar"
./ci/blackbox.sh --driver=xrt --app=dogfood --args="-n1 -tbar"
# test temp driver mode
./ci/blackbox.sh --driver=simx --app=vecadd --rebuild=3
# test for matmul
CONFIGS="-DTC_NUM=4 -DTC_SIZE=8" ./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 --args="-n128 -d1"
echo "regression tests done!"
}
@ -124,6 +126,22 @@ opencl()
echo "opencl tests done!"
}
vm(){
    # Rebuild the simx simulator and runtime with each virtual-memory
    # configuration, then run the OpenCL and regression suites on simx.
    echo "begin vm tests..."
    local vm_configs
    for vm_configs in "-DVM_ENABLE" "-DVM_ENABLE -DVM_ADDR_MODE=BARE"; do
        make -C sim/simx clean && CONFIGS="$vm_configs" make -C sim/simx
        make -C runtime/simx clean && CONFIGS="$vm_configs" make -C runtime/simx
        make -C tests/opencl run-simx
        make -C tests/regression run-simx
    done
    echo "vm tests done!"
}
cache()
{
echo "begin cache tests..."
@ -140,27 +158,33 @@ cache()
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DDISABLE_L1" ./ci/blackbox.sh --driver=rtlsim --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=io_addr
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DL1_LINE_SIZE=$XSIZE -DLMEM_DISABLE" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache ways
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=8 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=1 -DDCACHE_NUM_WAYS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DICACHE_NUM_WAYS=4 -DDCACHE_NUM_WAYS=8" ./ci/blackbox.sh --driver=simx --app=sgemmx
# test cache banking
CONFIGS="-DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=simx --app=sgemmx
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DLMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=4" ./ci/blackbox.sh --driver=simx --app=sgemmx --threads=8
# replacement policy
CONFIGS="-DDCACHE_REPL_POLICY=0" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_REPL_POLICY=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
CONFIGS="-DDCACHE_REPL_POLICY=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx
# test writeback
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=0 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_DIRTYBYTES=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DDCACHE_WRITEBACK=1 -DDCACHE_NUM_WAYS=4" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
CONFIGS="-DSOCKET_SIZE=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --l3cache --app=mstress
# cache clustering
CONFIGS="-DSOCKET_SIZE=4 -DNUM_DCACHES=4 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --cores=4 --warps=1 --threads=2
@ -235,33 +259,39 @@ config2()
# test opaesim
./ci/blackbox.sh --driver=opae --app=printf
./ci/blackbox.sh --driver=opae --app=diverge
./ci/blackbox.sh --driver=xrt --app=diverge
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
if [ "$XLEN" == "64" ]; then
# need to disable trig on 64-bit due to a bug inside fpnew's sqrt core.
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-xtrig -xbar -xgbar"
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood --args="-xtrig -xbar -xgbar"
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood --args="-xtrig -xbar -xgbar"
else
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=xrt --app=dogfood
fi
# custom program startup address
make -C tests/regression/dogfood clean-kernel
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
STARTUP_ADDR=0x80000000 make -C tests/regression/dogfood
./ci/blackbox.sh --driver=simx --app=dogfood
./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-kernel
# disabling M & F extensions
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean && CONFIGS="-DEXT_M_DISABLE -DEXT_F_DISABLE" make -C sim/rtlsim > /dev/null && make -C tests/riscv/isa run-rtlsim-32i
make -C sim/rtlsim clean
# disabling ZICOND extension
CONFIGS="-DEXT_ZICOND_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=demo
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --app=mstress
# test 128-bit MEM block
# test 128-bit memory block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=xrt --app=mstress
# test XLEN-bit MEM block
# test XLEN-bit memory block
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=$XSIZE" ./ci/blackbox.sh --driver=simx --app=mstress
@ -269,11 +299,35 @@ config2()
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
# test single-bank memory
if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
else
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_NUM_BANKS=1" ./ci/blackbox.sh --driver=xrt --app=mstress
fi
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --app=mstress
# test larger memory address
if [ "$XLEN" == "64" ]; then
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=49" ./ci/blackbox.sh --driver=xrt --app=mstress
else
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_ADDR_WIDTH=33" ./ci/blackbox.sh --driver=xrt --app=mstress
fi
# test memory banks interleaving
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=1" ./ci/blackbox.sh --driver=opae --app=mstress
CONFIGS="-DPLATFORM_MEMORY_INTERLEAVE=0" ./ci/blackbox.sh --driver=opae --app=mstress
# test memory ports
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress
CONFIGS="-DMEM_BLOCK_SIZE=8 -DPLATFORM_MEMORY_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=opae --app=mstress --threads=8
CONFIGS="-DMEM_BLOCK_SIZE=8" ./ci/blackbox.sh --driver=xrt --app=mstress --threads=8
echo "configuration-2 tests done!"
}
@ -299,20 +353,32 @@ debug()
test_csv_trace
CONFIGS="-O0" ./ci/blackbox.sh --driver=opae --app=demo --args="-n1"
CONFIGS="-O0" ./ci/blackbox.sh --driver=xrt --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=xrt --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
CONFIGS="-DSOCKET_SIZE=1" ./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=demo --args="-n1"
echo "debugging tests done!"
}
scope()
{
    # Run the demo application with --scope on the opae and xrt drivers.
    echo "begin scope tests..."
    local drv
    for drv in opae xrt; do
        SCOPE_DEPTH=128 ./ci/blackbox.sh --driver="$drv" --app=demo --args="-n1" --scope
    done
    echo "debugging scope done!"
}
stress()
{
    # Stress runs; every configuration below sets VERILATOR_RESET_VALUE=1.
    echo "begin stress tests..."
    # test verilator reset values
    local reset_cfg="-DVERILATOR_RESET_VALUE=1"
    CONFIGS="$reset_cfg -DSOCKET_SIZE=1 -DDCACHE_WRITEBACK=1 -DL2_WRITEBACK=1 -DL3_WRITEBACK=1" \
        ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
    CONFIGS="$reset_cfg" \
        ./ci/blackbox.sh --driver=rtlsim --app=sgemmx --args="-n128" --l2cache
    CONFIGS="$reset_cfg" \
        ./ci/blackbox.sh --driver=xrt --app=sgemmx --args="-n128" --l2cache
    echo "stress tests done!"
}
@ -322,15 +388,25 @@ synthesis()
echo "begin synthesis tests..."
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys synthesis
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE -DNUM_WARPS=2 -DNUM_THREADS=2" make -C hw/syn/yosys synthesis
echo "synthesis tests done!"
}
vector()
{
    # Rebuild simx with EXT_V_ENABLE, then launch the riscv-vector-tests runner.
    echo "begin vector tests..."
    make -C sim/simx clean \
        && CONFIGS="-DEXT_V_ENABLE" make -C sim/simx
    TOOLDIR=@TOOLDIR@ \
    XLEN=@XLEN@ \
    VLEN=256 \
    REG_TESTS=1 \
        ./tests/riscv/riscv-vector-tests/run-test.sh
    echo "vector tests done!"
}
show_usage()
{
    # Print the script banner followed by a single usage line that lists
    # every test-selection flag accepted by the argument parser.
    echo "Vortex Regression Test"
    echo "Usage: $0 [--clean] [--unittest] [--isa] [--kernel] [--regression] [--opencl] [--cache] [--vm] [--config1] [--config2] [--debug] [--scope] [--stress] [--synthesis] [--vector] [--all] [-h|--help]"
}
declare -a tests=()
@ -359,6 +435,9 @@ while [ "$1" != "" ]; do
--cache )
tests+=("cache")
;;
--vm )
tests+=("vm")
;;
--config1 )
tests+=("config1")
;;
@ -368,12 +447,18 @@ while [ "$1" != "" ]; do
--debug )
tests+=("debug")
;;
--scope )
tests+=("scope")
;;
--stress )
tests+=("stress")
;;
--synthesis )
tests+=("synthesis")
;;
--vector )
tests+=("vector")
;;
--all )
tests=()
tests+=("unittest")
@ -382,11 +467,14 @@ while [ "$1" != "" ]; do
tests+=("regression")
tests+=("opencl")
tests+=("cache")
tests+=("vm")
tests+=("config1")
tests+=("config2")
tests+=("debug")
tests+=("scope")
tests+=("stress")
tests+=("synthesis")
tests+=("vector")
;;
-h | --help )
show_usage

View file

@ -1,27 +0,0 @@
#!/bin/sh
# Copyright 2019-2023
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Stop at the first command that fails
set -e

# Refresh the package index
apt-get update -y

# GCC 11 comes from the toolchain test PPA
add-apt-repository -y ppa:ubuntu-toolchain-r/test
apt-get update
apt-get install -y g++-11 gcc-11

# register the new compilers as alternatives for g++ and gcc
for tool in g++ gcc; do
    update-alternatives --install /usr/bin/${tool} ${tool} /usr/bin/${tool}-11 100
done

# Remaining system packages
apt-get install -y \
    build-essential \
    valgrind \
    libstdc++6 \
    binutils \
    python \
    uuid-dev \
    ccache

View file

@ -1,13 +1,13 @@
#!/bin/sh
# Copyright 2019-2023
#
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,7 +15,6 @@
# limitations under the License.
TOOLDIR=${TOOLDIR:=@TOOLDIR@}
export PATH=$TOOLDIR/verilator/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v

View file

@ -23,9 +23,9 @@ OSVERSION=${OSVERSION:=@OSVERSION@}
riscv32()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..h}) ;;
"ubuntu/focal") parts=$(eval echo {a..k}) ;;
*) parts=$(eval echo {a..j}) ;;
"centos/7") parts=$(eval echo {a..l}) ;;
"ubuntu/bionic") parts=$(eval echo {a..j}) ;;
*) parts=$(eval echo {a..k}) ;;
esac
rm -f riscv32-gnu-toolchain.tar.bz2.parta*
for x in $parts
@ -41,7 +41,7 @@ riscv32()
riscv64()
{
case $OSVERSION in
"centos/7") parts=$(eval echo {a..h}) ;;
"centos/7") parts=$(eval echo {a..l}) ;;
*) parts=$(eval echo {a..j}) ;;
esac
rm -f riscv64-gnu-toolchain.tar.bz2.parta*

View file

@ -44,7 +44,8 @@ def load_config(filename):
'num_barriers': int(config_match.group(7)),
}
return config
return None
print("Error: missing CONFIGS: header")
sys.exit(1)
def parse_simx(log_lines):
pc_pattern = r"PC=(0x[0-9a-fA-F]+)"
@ -274,6 +275,8 @@ def split_log_file(log_filename):
if current_sublog is not None:
sublogs.append(current_sublog)
else:
sublogs.append(log_lines)
return sublogs

View file

@ -1,4 +1,4 @@
#!/usr/bin/env python
#!/usr/bin/env python3
# Copyright 2019-2023
#

View file

@ -31,7 +31,4 @@ RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv$(XLEN)-gnu-toolchain
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
VORTEX_RT_PATH ?= $(VORTEX_HOME)/runtime
VORTEX_KN_PATH ?= $(VORTEX_HOME)/kernel
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party
THIRD_PARTY_DIR ?= $(VORTEX_HOME)/third_party

10
configure vendored
View file

@ -26,6 +26,8 @@ detect_osversion() {
case "$VERSION_CODENAME" in
bionic) osversion="ubuntu/bionic";;
focal) osversion="ubuntu/focal";;
jammy) osversion="ubuntu/focal";;
noble) osversion="ubuntu/focal";;
# Add new versions as needed
esac
;;
@ -63,7 +65,7 @@ copy_files() {
filename_no_ext="${filename%.in}"
dest_file="$dest_dir/$filename_no_ext"
mkdir -p "$dest_dir"
sed "s|@VORTEX_HOME@|$SCRIPT_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g" "$file" > "$dest_file"
sed "s|@VORTEX_HOME@|$SOURCE_DIR|g; s|@XLEN@|$XLEN|g; s|@TOOLDIR@|$TOOLDIR|g; s|@OSVERSION@|$OSVERSION|g; s|@INSTALLDIR@|$PREFIX|g; s|@BUILDDIR@|$CURRENT_DIR|g" "$file" > "$dest_file"
# apply permissions to bash scripts
read -r firstline < "$dest_file"
if [[ "$firstline" =~ ^#!.*bash ]]; then
@ -167,8 +169,8 @@ fi
SUBDIRS=("." "!ci" "!perf" "hw*" "kernel*" "runtime*" "sim*" "tests*")
# Get the directory of the script
SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
SOURCE_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
THIRD_PARTY_DIR=$SCRIPT_DIR/third_party
THIRD_PARTY_DIR=$SOURCE_DIR/third_party
copy_files "$SCRIPT_DIR" "$CURRENT_DIR"
copy_files "$SOURCE_DIR" "$CURRENT_DIR"

View file

@ -1,79 +0,0 @@
# FPGA Startup and Configuration Guide
OPAE Environment Setup
----------------------
$ source /opt/inteldevstack/init_env_user.sh
$ export OPAE_HOME=/opt/opae/1.1.2
$ export PATH=$OPAE_HOME/bin:$PATH
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
OPAE Build
------------------
The FPGA has the following configuration options:
- DEVICE_FAMILY=arria10 | stratix10
- NUM_CORES=#n
Command line:
$ cd hw/syn/altera/opae
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
Setting TARGET=ase will build the project for simulation using Intel ASE.
OPAE Build Configuration
------------------------
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per cores
- `NUM_THREADS`: Number of threads per warps
- `PERF_ENABLE`: enable the use of all profile counters
You configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
OPAE Build Progress
-------------------
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 <build_dir>/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean
The bitstream file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
Signing the bitstream and Programming the FPGA
----------------------------------------------
$ cd <build_dir>
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
Sample FPGA Run Test
--------------------
Ensure you have the correct opae runtime for the FPGA target
$ make -C runtime/opae clean
$ TARGET=FPGA make -C runtime/opae
Run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"

View file

@ -1,18 +1,37 @@
# Contributing to Vortex on Github
# Contributing to Vortex
## Github Details
- There are two main repos, `vortex` (public, this one) and `vortex-dev` (private)
- todo: Most current development is on `vortex`
- If you have a legacy version of `vortex`, you can use the releases branch or tags to access the repo at that point in time
## Github
Vortex uses Github to host its git repositories.
There are a lot of ways to use the features on Github for collaboration.
Therefore, this documentation details the standard procedure for contributing to Vortex.
Development of Vortex is consolidated to this repo, `vortex` and any associated forks.
Previously, there was active work done on a private repo named `vortex-dev`.
`vortex-dev` has officially been deprecated and fully merged into this public repo, `vortex`.
If you are returning to this project and have legacy versions of Vortex, you can use the releases branches to access older versions.
## Contribution Process
- You should create a new branch from develop that is clearly named with the feature that you want to add
- Avoid pushing directly to the `master` branch instead you will need to make a Pull Request (PR)
- There should be protections in place that prevent pushing directly to the main branch, but don't rely on it
- When you make a PR it will be tested against the continuous integration (ci) pipeline (see `continuous_integration.md`)
- It is not sufficient to just write some tests, they need to be incorporated into the ci pipeline to make sure they are run
- During a PR, you might receive feedback regarding your changes and you might need to make further commits to your branch
In an effort to keep `vortex` organized, permissions to directly create branches and push code have been limited to admins.
However, contributions are strongly encouraged and keep the project moving forward! Here is the procedure for contributing:
1. Create a fork of `vortex`
2. In your fork, create a branch from `master` whose name briefly describes the work you are adding (e.g. `develop-documentation`)
3. Make your changes on the new branch in your fork. You may create as many commits as you need, which might be common if you are making multiple iterations
4. Since you are the owner of your fork, you have full permissions to push commits to your fork
4. When you are satisfied with the changes on your fork, you can open a PR from your fork using the online interface
5. If you recently made a push, you will automatically get a prompt on Github online to create a PR, which you can press
6. Otherwise, you can go to your fork on Github online and manually create a PR (todo)
(todo): how to name and format your PR, what information you should add to the PR, does not need to be too strict if you are attending the weekly meetings*
7. Github uses the following semantics: `base repository` gets the changes from your `head repository`
8. Therefore, you should set the `base repository` to `vortexgpgpu/vortex` and the `base` branch to `master` since the master branch is protected by reviewed PRs.
9. And you should assign the `head repository` to `<your-github-username>/vortex` (which represents your fork of vortex) and the `compare` branch to the one created in step 2
10. Now that your intended PR has been specified, you should review the status. Check for merge conflicts, if all your commits are present, and all the modified files make sense
11. You can still make a PR if there are issues in step 10, just make sure the structure is correct according to steps 7-9
12. Once the PR is made, the CI pipeline will run automatically, testing your changes
13. Remember, a PR is flexible if you need to make changes to the code you can go back to your branch of the fork to commit and push any updates
14. As long as the `head repository`'s `compare` branch is the one you edited, the PR will automatically get the most recent changes
15. When all merge conflicts are resolved, changes are made, and tests pass you can have an admin merge your PR
## Creating and Adding Tests
see `testing.md`
## What Makes a Good Contribution?
- If you are contributing code changes, then review [testing.md](./testing.md) to ensure your tests are integrated into the [CI pipeline](continuous_integration.md)
- During a PR, you should consider the advice you are provided by your reviewers. Remember, you can keep adding commits to an open PR!
- If your change aims to fix an issue opened on Github, please tag that issue in the PR itself

View file

@ -33,7 +33,13 @@ The recommended method to enable debugging is to pass the `--debug` flag to `bla
// Running demo program on rtlsim in debug mode
$ ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution. You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave] (http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
A debug trace `run.log` is generated in the current directory during the program execution. The trace includes important states of the simulated processor (memory, caches, pipeline, stalls, etc..). A waveform trace `trace.vcd` is also generated in the current directory during the program execution.
By default, all library modules under the /libs/ folder are excluded from the trace to reduce the waveform file size. You can change that behavior by either explicitly commenting out `TRACING_OFF`/`TRACING_ON` inside a lib module source (e.g. VX_stream_buffer.sv) or simply enabling a full trace using the following command.
// Debugging the demo program with rtlsim in full tracing mode
$ CONFIGS="-DTRACING_ALL" ./ci/blackbox.sh --driver=rtlsim --app=demo --debug=1
You can visualize the waveform trace using any tool that can open VCD files (Modelsim, Quartus, Vivado, etc..). [GTKwave](http://gtkwave.sourceforge.net) is a great open-source scope analyzer that also works with VCD files.
## FPGA Debugging

View file

@ -1,16 +1,19 @@
# Environment Setup
These instructions apply to the development vortex repo using the updated toolchain. The updated toolchain is considered to be any commit of `master` pulled from July 2, 2023 onwards. The toolchain update in question can be viewed in this [commit](https://github.com/vortexgpgpu/vortex-dev/commit/0048496ba28d7b9a209a0e569d52d60f2b68fc04). Therefore, if you are unsure whether you are using the new toolchain or not, then you should check the `ci` folder for the existence of the `toolchain_prebuilt.sh` script. Furthermore, you should notice that the `toolchain_install.sh` script has the legacy `llvm()` split into `llvm-vortex()` and `llvm-pocl()`.
## Set Up on Your Own System
The toolchain binaries provided with Vortex are built on Ubuntu-based systems. To install Vortex on your own system, [follow these instructions](install_vortex.md).
## Servers for Georgia Tech Students and Collaborators
### Volvo
Volvo is a 64-core server provided by HPArch. You need valid credentials to access it. If you don't already have access, you can get in contact with your mentor to ask about setting your account up.
Setup on Volvo:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh volvo.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
@ -19,9 +22,11 @@ Setup on Volvo:
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
### Nio
Nio is a 20-core desktop server provided by HPArch. If you have access to Volvo, you also have access to Nio.
Setup on Nio:
1. Connect to Georgia Tech's VPN or ssh into another machine on campus
2. `ssh nio.cc.gatech.edu`
3. Clone Vortex to your home directory: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
@ -29,11 +34,12 @@ Setup on Nio:
5. `make -s` in the `vortex` root directory
6. Run a test program: `./ci/blackbox.sh --cores=2 --app=dogfood`
## Docker (Experimental)
Docker allows for isolated pre-built environments to be created, shared and used. The emulation mode required for ARM-based processors will incur a decrease in performance. Currently, the dockerfile is not included with the official vortex repository and is not actively maintained or supported.
### Setup with Docker
1. Clone repo recursively onto your local machine: `git clone --recursive https://github.com/vortexgpgpu/vortex.git`
2. Download the dockerfile from [here](https://github.gatech.edu/gist/usubramanya3/f1bf3e953faa38a6372e1292ffd0b65c) and place it in the root of the repo.
3. Build the Dockerfile into an image: `docker build --platform=linux/amd64 -t vortex -f dockerfile .`

217
docs/fpga_setup.md Normal file
View file

@ -0,0 +1,217 @@
# FPGA Startup and Configuration Guide
## Gaining Access to FPGA's with CRNCH
If you are associated with Georgia Tech (or related workshops) you can use CRNCH's server to gain remote access to FPGA's. Otherwise, you can skip to the Xilinx or Intel (Altera) synthesis steps below.
## What is CRNCH?
**C**enter for **R**esearch into **N**ovel **C**omputing **H**ierarchies
## What does CRNCH Offer?
**The Rogues Gallery (RG)**: new concept focused on developing our understanding of next-generation hardware with a focus on unorthodox and uncommon technologies. **RG** will acquire new and unique hardware (ie, the aforementioned “*rogues*”) from vendors, research labs, and startups and make this hardware available to students, faculty, and industry collaborators within a managed data center environment
## Why are the Rogues Important?
By exposing students and researchers to this set of unique hardware, we hope to foster cross-cutting discussions about hardware designs that will drive future *performance improvements in computing long after the Moore's Law era of “cheap transistors” ends*. Specifically, the Rogues Gallery contains FPGAs which can be synthesized into Vortex hardware.
## How is the Rogues Gallery Funded?
Rogues Gallery testbed is primarily supported by the National Science Foundation (NSF) under NSF Award Number [#2016701](https://www.nsf.gov/awardsearch/showAward?AWD_ID=2016701&HistoricalAwards=false)
## Rogues Gallery Documentation
You can read about RG in more detail on its official documentation [page](https://gt-crnch-rg.readthedocs.io/en/main/index.html#).
You can listen to a talk about RG [here](https://mediaspace.gatech.edu/media/Jeff%20Young%20-%20Rogues%20Gallery%20-%20CRNCH%20Summit%202021/1_lqlgr0jj)
[CRNCH Summit 2023](https://github.com/gt-crnch/crnch-summit-2023/tree/main)
## Request Access for Rogues Gallery
You should use [this form](https://crnch-rg.cc.gatech.edu/request-rogues-gallery-access/) to request access to RG's reconfigurable computing (vortex fpga) resources. You should receive an email with your ticket item being created. Once it gets processed, you should get an email confirming your access has been granted. It might take some time to get processed.
## How to Access Rogues Gallery?
There are two methods of accessing CRNCH's Rogues Gallery
1) Web-based GUI: [rg-ood.crnch.gatech.edu](http://rg-ood.crnch.gatech.edu/)
2) SSH: `ssh <your-gt-username>@rg-login.crnch.gatech.edu`
## Where should I keep my files?
The CRNCH servers have a folder called `USERSCRATCH` which can be found in your home directory: `echo $HOME`. You should keep all your files in this folder since it is available across all the Rogues Gallery Nodes.
## **What Machines are Available in the Rogues Gallery?**
Complete list of machines can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/rg-hardware.html). Furthermore, you can find detailed information about the FPGA hardware [here](https://gt-crnch-rg.readthedocs.io/en/main/reconfig/xilinx/xilinx-getting-started.html).
## Allocate an FPGA Node
Once you've connected to the CRNCH login node, you can use the Slurm scheduler to request an interactive job using `salloc`. This [page](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm.html) explains why we use Slurm to request resources. Documentation for `salloc` can be found [here](https://gt-crnch-rg.readthedocs.io/en/main/general/using-slurm-examples.html). And here.
To request 16 cores and 64GB of RAM for 6 hours on flubber9, a fpga dev node:
```bash
salloc -p rg-fpga --nodes=1 --ntasks-per-node=16 --mem=64G --nodelist flubber1 --time=06:00:00
```
Synthesis for Xilinx Boards
----------------------
Once you are logged in, you will need to complete some first time configurations. If you are interested in the Intel (Altera) synthesis steps, scroll down below.
### Source Configuration Scripts
```
# From any directory
$ source /opt/xilinx/xrt/setup.sh
$ source /tools/reconfig/xilinx/Vitis/2023.1/settings64.sh
```
### Check Installed FPGA Platforms
`platforminfo -l` which tells us the correct name of the platform installed on the current fpga node. It should be used for the `PLATFORM` variable below. Otherwise, if there is an error then there was an issue with the previous two commands.
### Install Vortex Toolchain
The Xilinx synthesis process requires verilator to generate the bitstream. Eventually, you will need the whole toolchain to run the bitstream on the FPGA. Therefore, the Vortex toolchain can be installed as follows. If you complete these steps properly, you should only need to complete them once and you can skip to `Activate Vortex Toolchain`.
```
# Make a build directory from root and configure scripts for your environment
mkdir build && cd build && ../configure --tooldir=$HOME/tools
# Install the whole prebuilt toolchain
./ci/toolchain_install.sh --all
# Add environment variables to bashrc
echo "source <full-path-to-vortex-root>/vortex/build/ci/toolchain_env.sh" >> ~/.bashrc
```
### Activate Vortex Toolchain
```
# From any directory
source ~/.bashrc
# Check environment setup
verilator --version
```
### Build the FPGA Bitstream
The root directory contains the path `hw/syn/xilinx/xrt` which has the makefile used to generate the Vortex bitstream.
```
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=1 make > build_u250_hw_1c.log 2>&1 &
```
Will run the synthesis under new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
For long-running jobs, invocation of this makefile can be made of the following form:
`[CONFIGS=<vortex macros>] [PREFIX=<prefix directory name>] [NUM_CORES=<#>] TARGET=hw|hw_emu PLATFORM=<platform baseName> nohup make > <log filename> 2>&1 &`
For example:
```bash
CONFIGS="-DL2_ENABLE -DDCACHE_SIZE=8192" PREFIX=build_4c_u280 NUM_CORES=4 TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202310_1 nohup make > build_u250_hw_4c.log 2>&1 &
```
The build is complete when the bitstream file `vortex_afu.xclbin` exists in `<prefix directory name><platform baseName>hw|hw_emu/bin`.
### Running a Program on Xilinx FPGA
The [blackbox.sh](./simulation.md) script within the build directory can be used to run a test with Vortex's xrt driver using the following command:
`FPGA_BIN_DIR=<path to bitstream directory> TARGET=hw|hw_emu PLATFORM=<platform baseName> ./ci/blackbox.sh --driver=xrt --app=<test name>`
For example:
```FPGA_BIN_DIR=<realpath> hw/syn/xilinx/xrt/build_4c_u280_xilinx_u280_gen3x16_xdma_1_202211_1_hw/bin TARGET=hw PLATFORM=xilinx_u280_gen3x16_xdma_1_202211_1 ./ci/blackbox.sh --driver=xrt --app=demo```
Synthesis for Intel (Altera) Boards
----------------------
### OPAE Environment Setup
$ source /opt/inteldevstack/init_env_user.sh
$ export OPAE_HOME=/opt/opae/1.1.2
$ export PATH=$OPAE_HOME/bin:$PATH
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
### OPAE Build
The FPGA has the following configuration options:
- DEVICE_FAMILY=arria10 | stratix10
- NUM_CORES=#n
Command line:
$ cd hw/syn/altera/opae
$ PREFIX=test1 TARGET=fpga NUM_CORES=4 make
A new folder (ex: `test1_xxx_4c`) will be created and the build will start and take ~30-480 min to complete.
Setting TARGET=ase will build the project for simulation using Intel ASE.
### OPAE Build Configuration
The hardware configuration file `/hw/rtl/VX_config.vh` defines all the hardware parameters that can be modified when building the processor. For example, the following parameters can be configured:
- `NUM_WARPS`: Number of warps per cores
- `NUM_THREADS`: Number of threads per warps
- `PERF_ENABLE`: enable the use of all profile counters
You configure the synthesis build from the command line:
$ CONFIGS="-DPERF_ENABLE -DNUM_THREADS=8" make
### OPAE Build Progress
You could check the last 10 lines in the build log for possible errors until build completion.
$ tail -n 10 <build_dir>/build.log
Check if the build is still running by looking for quartus_sh, quartus_syn, or quartus_fit programs.
$ ps -u <username>
If the build fails and you need to restart it, clean up the build folder using the following command:
$ make clean
The file `vortex_afu.gbs` should exist when the build is done:
$ ls -lsa <build_dir>/synth/vortex_afu.gbs
### Signing the bitstream and Programming the FPGA
$ cd <build_dir>
$ PACSign PR -t UPDATE -H openssl_manager -i vortex_afu.gbs -o vortex_afu_unsigned_ssl.gbs
$ fpgasupdate vortex_afu_unsigned_ssl.gbs
### Sample FPGA Run Test
Ensure you have the correct opae runtime for the FPGA target
```
$ TARGET=FPGA make -C runtime/opae
```
Run the [blackbox.sh](./simulation.md) from your Vortex build directory
```
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
```
### FPGA sample test running OpenCL sgemm kernel
You can use the `blackbox.sh` script to run the following from your Vortex build directory
$ TARGET=fpga ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n128"
### Testing Vortex using OPAE with Intel ASE Simulation
Building ASE synthesis
```$ TARGET=asesim make -C runtime/opae```
Building ASE runtime
```$ TARGET=asesim make -C runtime/opae```
Running ASE simulation
```$ ASE_LOG=0 ASE_WORKDIR=<build_dir>/synth/work TARGET=asesim ./ci/blackbox.sh --driver=opae --app=sgemm --args="-n16"```

View file

@ -2,32 +2,8 @@
## Table of Contents
- [Codebase Layout](codebase.md)
- [Microarchitecture](microarchitecture.md)
- [Cache Subsystem](cache_subsystem.md)
- [Software](software.md)
- [Simulation](simulation.md)
- [Altera FPGA Setup Guide](altera_fpga_guide.md)
- [Xilinx FPGA Setup Guide](xilinx_fpga_guide.md)
- [Debugging](debugging.md)
- [Useful Links](references.md)
## Installation
- For the different environments Vortex supports, [read this document](environment_setup.md).
- To install on your own system, [follow this document](install_vortex.md).
## Quick Start Scenarios
Running Vortex simulators with different configurations:
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
- Run demo driver test with opae driver and Vortex config of 1 clusters, 4 cores, 4 warps, 2 threads
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 cluster, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood
- [Codebase Layout](codebase.md): Summary of repo file tree
- [Microarchitecture](microarchitecture.md): Vortex Pipeline and cache microarchitectural details and reconfigurability
- [Simulation](simulation.md): Details for building and running each simulation driver
- [Contributing](contributing.md): Process for contributing your own features including repo semantics and testing
- [Debugging](debugging.md): Debugging configurations for each Vortex driver

View file

@ -77,4 +77,7 @@ Vortex has a 6-stage pipeline:
- Sockets
- Grouping multiple cores sharing L1 cache
- Clusters
- Grouping of sockets sharing L2 cache
- Grouping of sockets sharing L2 cache
### Vortex Cache Subsystem
More details about the cache subsystem are provided [here](./cache_subsystem.md).

View file

@ -6,13 +6,16 @@
### Cycle-Approximate Simulation
SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simX` folder.
SimX is a C++ cycle-level in-house simulator developed for Vortex. The relevant files are located in the `simx` folder. The [readme](README.md) has the most detailed instructions for building and running simX.
- To install on your own system, [follow this document](install_vortex.md).
- For the different Georgia Tech environments Vortex supports, [read this document](environment_setup.md).
### FPGA Simulation
The current target FPGA for simulation is the Arria10 Intel Accelerator Card v1.0. The guide to build the fpga with specific configurations is located [here.](fpga_setup.md)
The guide to build the fpga with specific configurations is located [here.](fpga_setup.md) You can find instructions for both Xilinx and Altera based FPGAs.
### How to Test
### How to Test (using `blackbox.sh`)
Running tests under specific drivers (rtlsim,simx,fpga) is done using the script named `blackbox.sh` located in the `ci` folder. Running command `./ci/blackbox.sh --help` from the Vortex root directory will display the following command line arguments for `blackbox.sh`:
@ -47,4 +50,20 @@ PERF: core1: instrs=90693, cycles=53108, IPC=1.707709
PERF: core2: instrs=90849, cycles=53107, IPC=1.710678
PERF: core3: instrs=90836, cycles=50347, IPC=1.804199
PERF: instrs=363180, cycles=53108, IPC=6.838518
```
```
## Additional Quick Start Scenarios
Running Vortex simulators with different configurations and drivers is supported. For example:
- Run basic driver test with rtlsim driver and Vortex config of 2 clusters, 2 cores, 2 warps, 4 threads
$ ./ci/blackbox.sh --driver=rtlsim --clusters=2 --cores=2 --warps=2 --threads=4 --app=basic
- Run demo driver test with opae driver and Vortex config of 1 cluster, 4 cores, 4 warps, 2 threads
$ ./ci/blackbox.sh --driver=opae --clusters=1 --cores=4 --warps=4 --threads=2 --app=demo
- Run dogfood driver test with simx driver and Vortex config of 4 clusters, 4 cores, 8 warps, 6 threads
$ ./ci/blackbox.sh --driver=simx --clusters=4 --cores=4 --warps=8 --threads=6 --app=dogfood

View file

@ -2,7 +2,7 @@
## Running a Vortex application
The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree.
The framework provides a utility script: blackbox.sh under the /ci/ folder for executing applications in the tests tree. It gets copied into the `build` directory with all the environment variables resolved, so you should run it from the `build` directory as follows:
You can query the commandline options of the tool using:
$ ./ci/blackbox.sh --help
@ -49,4 +49,4 @@ Compile your test: `$ make -C tests/regression/<test-name>`
Run your test: `$ ./ci/blackbox.sh --driver=simx --app=<test-name> --debug`
## Adding Your Tests to the CI Pipeline
See `continuous_integration.md`
If you are a contributor, then you will need to add tests that integrate into the continuous integration pipeline. Remember, Pull Requests cannot be merged unless new code has tests and existing tests do not regress. Furthermore, if you are contributing a new feature, it is recommended that you add the ability to enable / disable the new feature that you are adding. See more at [contributing.md](contributing.md) and [continuous_integration.md](continuous_integration.md).

View file

@ -1,36 +0,0 @@
# FPGA Startup and Configuration Guide
XRT Environment Setup
----------------------
$ source /opt/xilinx/Vitis/2023.1/settings64.sh
$ source /opt/xilinx/xrt/setup.sh
Check Installed FPGA Platforms
------------------------------
$ platforminfo -l
Build FPGA image
----------------
$ cd hw/syn/xilinx/xrt
$ PREFIX=test1 PLATFORM=xilinx_u50_gen3x16_xdma_5_202210_1 TARGET=hw NUM_CORES=4 make
Will run the synthesis under new build directory: BUILD_DIR := "\<PREFIX>\_\<PLATFORM>\_\<TARGET>"
The generated bitstream will be located under <BUILD_DIR>/bin/vortex_afu.xclbin
Sample FPGA Run Test
--------------------
Ensure you have the correct opae runtime for the FPGA target
$ make -C runtime/xrt clean
$ TARGET=hw make -C runtime/xrt
Run the following from your Vortex build directory
$ TARGET=hw FPGA_BIN_DIR=<BUILD_DIR>/bin ./ci/blackbox.sh --driver=xrt --app=sgemm --args="-n128"

View file

@ -47,8 +47,6 @@ extern "C" {
void dpi_trace(int level, const char* format, ...);
void dpi_trace_start();
void dpi_trace_stop();
uint64_t dpi_uuid_gen(bool reset, int wid);
}
bool sim_trace_enabled();
@ -204,17 +202,3 @@ void dpi_trace_start() {
void dpi_trace_stop() {
sim_trace_enable(false);
}
///////////////////////////////////////////////////////////////////////////////
// Running counters used by dpi_uuid_gen(), one entry per `wid` value seen so far.
std::unordered_map<uint32_t, uint32_t> g_uuid_gens;

// Produce the next 64-bit identifier for `wid`.
//   reset : when true, all counters are discarded and 0 is returned.
//   wid   : selects the counter to advance; it is also placed in the upper
//           32 bits of the result.
// Returns (wid << 32) | <counter value before the increment>.
uint64_t dpi_uuid_gen(bool reset, int wid) {
  if (!reset) {
    // operator[] inserts a zero-initialised counter on first use of this wid.
    uint32_t& counter = g_uuid_gens[wid];
    const uint32_t seq = counter;
    ++counter;
    return (uint64_t(wid) << 32) | seq;
  }
  g_uuid_gens.clear();
  return 0;
}

View file

@ -30,6 +30,4 @@ import "DPI-C" function void dpi_trace(input int level, input string format /*ve
import "DPI-C" function void dpi_trace_start();
import "DPI-C" function void dpi_trace_stop();
import "DPI-C" function longint dpi_uuid_gen(input logic reset, input int wid);
`endif

View file

@ -24,14 +24,14 @@ module VX_cluster import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
// DCRs
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master mem_bus_if,
VX_mem_bus_if.master mem_bus_if [`L2_MEM_PORTS],
// Status
output wire busy
@ -43,12 +43,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.icache = 'x;
assign mem_perf_tmp_if.dcache = 'x;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
cache_perf_t l2_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.l2cache = l2_perf;
end
`endif
`ifdef GBAR_ENABLE
@ -56,23 +56,21 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
VX_gbar_bus_if gbar_bus_if();
`RESET_RELAY (gbar_reset, reset);
VX_gbar_arb #(
.NUM_REQS (`NUM_SOCKETS),
.OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // bgar_unit has no backpressure
) gbar_arb (
.clk (clk),
.reset (gbar_reset),
.reset (reset),
.bus_in_if (per_socket_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
VX_gbar_unit #(
.INSTANCE_ID ($sformatf("gbar%0d", CLUSTER_ID))
.INSTANCE_ID (`SFORMATF(("gbar%0d", CLUSTER_ID)))
) gbar_unit (
.clk (clk),
.reset (gbar_reset),
.reset (reset),
.gbar_bus_if (gbar_bus_if)
);
@ -81,18 +79,19 @@ module VX_cluster import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) per_socket_mem_bus_if[`NUM_SOCKETS]();
) per_socket_mem_bus_if[`NUM_SOCKETS * `L1_MEM_PORTS]();
`RESET_RELAY (l2_reset, reset);
VX_cache_wrap #(
.INSTANCE_ID ($sformatf("%s-l2cache", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-l2cache", INSTANCE_ID))),
.CACHE_SIZE (`L2_CACHE_SIZE),
.LINE_SIZE (`L2_LINE_SIZE),
.NUM_BANKS (`L2_NUM_BANKS),
.NUM_WAYS (`L2_NUM_WAYS),
.WORD_SIZE (L2_WORD_SIZE),
.NUM_REQS (L2_NUM_REQS),
.MEM_PORTS (`L2_MEM_PORTS),
.CRSQ_SIZE (`L2_CRSQ_SIZE),
.MSHR_SIZE (`L2_MSHR_SIZE),
.MRSQ_SIZE (`L2_MRSQ_SIZE),
@ -100,17 +99,19 @@ module VX_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (L2_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_WRITEBACK),
.DIRTY_BYTES (`L2_DIRTYBYTES),
.REPL_POLICY (`L2_REPL_POLICY),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (3),
.NC_ENABLE (1),
.PASSTHRU (!`L2_ENABLED)
) l2cache (
.clk (clk),
.reset (l2_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.l2cache),
.cache_perf (l2_perf),
`endif
.core_bus_if (per_socket_mem_bus_if),
.mem_bus_if (mem_bus_if)
@ -118,24 +119,20 @@ module VX_cluster import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
VX_dcr_bus_if socket_dcr_bus_tmp_if();
assign socket_dcr_bus_tmp_if.write_valid = dcr_bus_if.write_valid && (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
assign socket_dcr_bus_tmp_if.write_addr = dcr_bus_if.write_addr;
assign socket_dcr_bus_tmp_if.write_data = dcr_bus_if.write_data;
wire [`NUM_SOCKETS-1:0] per_socket_busy;
VX_dcr_bus_if socket_dcr_bus_if();
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, socket_dcr_bus_tmp_if, (`NUM_SOCKETS > 1));
// Generate all sockets
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : sockets
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : g_sockets
`RESET_RELAY (socket_reset, reset);
VX_dcr_bus_if socket_dcr_bus_if();
wire is_base_dcr_addr = (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, dcr_bus_if, is_base_dcr_addr, (`NUM_SOCKETS > 1))
VX_socket #(
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
.INSTANCE_ID ($sformatf("%s-socket%0d", INSTANCE_ID, socket_id))
.INSTANCE_ID (`SFORMATF(("%s-socket%0d", INSTANCE_ID, socket_id)))
) socket (
`SCOPE_IO_BIND (scope_socket+socket_id)
@ -143,12 +140,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
.reset (socket_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.sysmem_perf (sysmem_perf_tmp),
`endif
.dcr_bus_if (socket_dcr_bus_if),
.mem_bus_if (per_socket_mem_bus_if[socket_id]),
.mem_bus_if (per_socket_mem_bus_if[socket_id * `L1_MEM_PORTS +: `L1_MEM_PORTS]),
`ifdef GBAR_ENABLE
.gbar_bus_if (per_socket_gbar_bus_if[socket_id]),
@ -158,6 +155,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
);
end
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, (`NUM_SOCKETS > 1));
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, 1, (`NUM_SOCKETS > 1));
endmodule

View file

@ -31,7 +31,6 @@
`endif
///////////////////////////////////////////////////////////////////////////////
`ifndef EXT_M_DISABLE
`define EXT_M_ENABLE
`endif
@ -86,6 +85,10 @@
`endif
`endif
`ifndef VLEN
`define VLEN 256
`endif
`ifndef NUM_CLUSTERS
`define NUM_CLUSTERS 1
`endif
@ -110,6 +113,24 @@
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
`endif
// Size of Tensor Core
`ifndef TC_SIZE
`define TC_SIZE 8
`endif
// Number of TCs per Warp
`ifndef TC_NUM
`define TC_NUM 4
`endif
`ifndef NUM_TCU_LANES
`define NUM_TCU_LANES `TC_NUM
`endif
`ifndef NUM_TCU_BLOCKS
`define NUM_TCU_BLOCKS `ISSUE_WIDTH
`endif
`ifdef L2_ENABLE
`define L2_ENABLED 1
`else
@ -151,6 +172,28 @@
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`endif
// Platform memory parameters
`ifndef PLATFORM_MEMORY_NUM_BANKS
`define PLATFORM_MEMORY_NUM_BANKS 2
`endif
`ifndef PLATFORM_MEMORY_ADDR_WIDTH
`ifdef XLEN_64
`define PLATFORM_MEMORY_ADDR_WIDTH 48
`else
`define PLATFORM_MEMORY_ADDR_WIDTH 32
`endif
`endif
`ifndef PLATFORM_MEMORY_DATA_SIZE
`define PLATFORM_MEMORY_DATA_SIZE 64
`endif
`ifndef PLATFORM_MEMORY_INTERLEAVE
`define PLATFORM_MEMORY_INTERLEAVE 1
`endif
`ifdef XLEN_64
`ifndef STACK_BASE_ADDR
@ -169,7 +212,14 @@
`define IO_BASE_ADDR 64'h000000040
`endif
`else
`ifdef VM_ENABLE
`ifndef PAGE_TABLE_BASE_ADDR
`define PAGE_TABLE_BASE_ADDR 64'h0F0000000
`endif
`endif
`else // XLEN_32
`ifndef STACK_BASE_ADDR
`define STACK_BASE_ADDR 32'hFFFF0000
@ -187,6 +237,13 @@
`define IO_BASE_ADDR 32'h00000040
`endif
`ifdef VM_ENABLE
`ifndef PAGE_TABLE_BASE_ADDR
`define PAGE_TABLE_BASE_ADDR 32'hF0000000
`endif
`endif
`endif
`define IO_END_ADDR `USER_BASE_ADDR
@ -214,15 +271,17 @@
`endif
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
`define RESET_DELAY 8
`define RESET_DELAY 8
`ifndef STALL_TIMEOUT
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif
`ifndef SV_DPI
`ifndef DPI_DISABLE
`define DPI_DISABLE
`endif
`endif
`ifndef FPU_FPNEW
`ifndef FPU_DSP
@ -251,6 +310,59 @@
`define DEBUG_LEVEL 3
`endif
`ifndef MEM_PAGE_SIZE
`define MEM_PAGE_SIZE (4096)
`endif
`ifndef MEM_PAGE_LOG2_SIZE
`define MEM_PAGE_LOG2_SIZE (12)
`endif
// Virtual Memory Configuration ///////////////////////////////////////////////////////
// Everything in this section is compiled only when VM_ENABLE is defined.
// Each macro is guarded by ifndef, so a value defined earlier is left untouched.
`ifdef VM_ENABLE
// Defaults selected when XLEN_32 is defined.
`ifdef XLEN_32
// Default address-translation mode name for this branch.
`ifndef VM_ADDR_MODE
`define VM_ADDR_MODE SV32 //or BARE
`endif
// Number of page-table levels.
`ifndef PT_LEVEL
`define PT_LEVEL (2)
`endif
// Size of one page-table entry (presumably in bytes -- TODO confirm).
`ifndef PTE_SIZE
`define PTE_SIZE (4)
`endif
// Entries per page table; 1024 * 4 equals the 4096 default of MEM_PAGE_SIZE.
`ifndef NUM_PTE_ENTRY
`define NUM_PTE_ENTRY (1024)
`endif
// Upper bound on page-table storage (1<<23); units not visible here -- TODO confirm.
`ifndef PT_SIZE_LIMIT
`define PT_SIZE_LIMIT (1<<23)
`endif
// Defaults selected when XLEN_32 is not defined.
`else
// Default address-translation mode name for this branch.
`ifndef VM_ADDR_MODE
`define VM_ADDR_MODE SV39 //or BARE
`endif
// Number of page-table levels.
`ifndef PT_LEVEL
`define PT_LEVEL (3)
`endif
// Size of one page-table entry (presumably in bytes -- TODO confirm).
`ifndef PTE_SIZE
`define PTE_SIZE (8)
`endif
// Entries per page table; 512 * 8 equals the 4096 default of MEM_PAGE_SIZE.
`ifndef NUM_PTE_ENTRY
`define NUM_PTE_ENTRY (512)
`endif
// Upper bound on page-table storage (1<<25); units not visible here -- TODO confirm.
`ifndef PT_SIZE_LIMIT
`define PT_SIZE_LIMIT (1<<25)
`endif
`endif
// Settings shared by both branches above.
// NOTE(review): MEM_PAGE_SIZE is written here without the macro-expansion
// backtick, so in SystemVerilog PT_SIZE expands to a bare identifier rather
// than to the MEM_PAGE_SIZE macro value -- confirm this is intentional.
`ifndef PT_SIZE
`define PT_SIZE MEM_PAGE_SIZE
`endif
// TLB size (presumably a number of entries -- TODO confirm).
`ifndef TLB_SIZE
`define TLB_SIZE (32)
`endif
`endif
// Pipeline Configuration /////////////////////////////////////////////////////
// Issue width
@ -478,7 +590,16 @@
// Number of Associative Ways
`ifndef ICACHE_NUM_WAYS
`define ICACHE_NUM_WAYS 1
`define ICACHE_NUM_WAYS 4
`endif
// Replacement Policy
`ifndef ICACHE_REPL_POLICY
`define ICACHE_REPL_POLICY 1
`endif
`ifndef ICACHE_MEM_PORTS
`define ICACHE_MEM_PORTS 1
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
@ -507,7 +628,7 @@
// Number of Banks
`ifndef DCACHE_NUM_BANKS
`define DCACHE_NUM_BANKS `MIN(`NUM_LSU_LANES, 4)
`define DCACHE_NUM_BANKS `MIN(DCACHE_NUM_REQS, 16)
`endif
// Core Response Queue Size
@ -527,12 +648,12 @@
// Memory Response Queue Size
`ifndef DCACHE_MRSQ_SIZE
`define DCACHE_MRSQ_SIZE 0
`define DCACHE_MRSQ_SIZE 4
`endif
// Number of Associative Ways
`ifndef DCACHE_NUM_WAYS
`define DCACHE_NUM_WAYS 1
`define DCACHE_NUM_WAYS 4
`endif
// Enable Cache Writeback
@ -540,6 +661,25 @@
`define DCACHE_WRITEBACK 0
`endif
// Enable Cache Dirty bytes
`ifndef DCACHE_DIRTYBYTES
`define DCACHE_DIRTYBYTES `DCACHE_WRITEBACK
`endif
// Replacement Policy
`ifndef DCACHE_REPL_POLICY
`define DCACHE_REPL_POLICY 1
`endif
// Number of Memory Ports
`ifndef L1_MEM_PORTS
`ifdef L1_DISABLE
`define L1_MEM_PORTS `MIN(DCACHE_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L1_MEM_PORTS `MIN(`DCACHE_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
// LMEM Configurable Knobs ////////////////////////////////////////////////////
`ifndef LMEM_DISABLE
@ -562,16 +702,12 @@
// Cache Size
`ifndef L2_CACHE_SIZE
`ifdef ALTERA_S10
`define L2_CACHE_SIZE 2097152
`else
`define L2_CACHE_SIZE 1048576
`endif
`endif
// Number of Banks
`ifndef L2_NUM_BANKS
`define L2_NUM_BANKS `MIN(4, `NUM_SOCKETS)
`define L2_NUM_BANKS `MIN(L2_NUM_REQS, 16)
`endif
// Core Response Queue Size
@ -591,12 +727,12 @@
// Memory Response Queue Size
`ifndef L2_MRSQ_SIZE
`define L2_MRSQ_SIZE 0
`define L2_MRSQ_SIZE 4
`endif
// Number of Associative Ways
`ifndef L2_NUM_WAYS
`define L2_NUM_WAYS 2
`define L2_NUM_WAYS 8
`endif
// Enable Cache Writeback
@ -604,20 +740,35 @@
`define L2_WRITEBACK 0
`endif
// Enable Cache Dirty bytes
`ifndef L2_DIRTYBYTES
`define L2_DIRTYBYTES `L2_WRITEBACK
`endif
// Replacement Policy
`ifndef L2_REPL_POLICY
`define L2_REPL_POLICY 1
`endif
// Number of Memory Ports
`ifndef L2_MEM_PORTS
`ifdef L2_ENABLE
`define L2_MEM_PORTS `MIN(`L2_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L2_MEM_PORTS `MIN(L2_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Cache Size
`ifndef L3_CACHE_SIZE
`ifdef ALTERA_S10
`define L3_CACHE_SIZE 2097152
`else
`define L3_CACHE_SIZE 1048576
`endif
`endif
// Number of Banks
`ifndef L3_NUM_BANKS
`define L3_NUM_BANKS `MIN(4, `NUM_CLUSTERS)
`define L3_NUM_BANKS `MIN(L3_NUM_REQS, 16)
`endif
// Core Response Queue Size
@ -637,12 +788,12 @@
// Memory Response Queue Size
`ifndef L3_MRSQ_SIZE
`define L3_MRSQ_SIZE 0
`define L3_MRSQ_SIZE 4
`endif
// Number of Associative Ways
`ifndef L3_NUM_WAYS
`define L3_NUM_WAYS 4
`define L3_NUM_WAYS 8
`endif
// Enable Cache Writeback
@ -650,6 +801,25 @@
`define L3_WRITEBACK 0
`endif
// Enable Cache Dirty bytes
`ifndef L3_DIRTYBYTES
`define L3_DIRTYBYTES `L3_WRITEBACK
`endif
// Replacement Policy
`ifndef L3_REPL_POLICY
`define L3_REPL_POLICY 1
`endif
// Number of L3 memory ports (only defined here when not already overridden).
// Without L3_ENABLE the first MIN operand is L3_NUM_REQS; with L3_ENABLE it
// is the L3 bank count. The second operand is always the platform memory
// bank count.
// L3_NUM_REQS is a plain identifier (not a macro), so it must be visible at
// every place where L3_MEM_PORTS is expanded.
`ifndef L3_MEM_PORTS
`ifndef L3_ENABLE
`define L3_MEM_PORTS `MIN(L3_NUM_REQS, `PLATFORM_MEMORY_NUM_BANKS)
`else
`define L3_MEM_PORTS `MIN(`L3_NUM_BANKS, `PLATFORM_MEMORY_NUM_BANKS)
`endif
`endif
// ISA Extensions /////////////////////////////////////////////////////////////
`ifdef EXT_A_ENABLE
@ -682,6 +852,12 @@
`define EXT_M_ENABLED 0
`endif
// EXT_V_ENABLED is the numeric form of the EXT_V_ENABLE switch:
// 1 when EXT_V_ENABLE is defined, 0 otherwise.
`ifndef EXT_V_ENABLE
`define EXT_V_ENABLED 0
`else
`define EXT_V_ENABLED 1
`endif
`ifdef EXT_ZICOND_ENABLE
`define EXT_ZICOND_ENABLED 1
`else
@ -698,7 +874,7 @@
`define ISA_STD_N 13
`define ISA_STD_Q 16
`define ISA_STD_S 18
`define ISA_STD_U 20
`define ISA_STD_V 21
`define ISA_EXT_ICACHE 0
`define ISA_EXT_DCACHE 1
@ -735,7 +911,7 @@
| (0 << 18) /* S - Supervisor mode implemented */ \
| (0 << 19) /* T - Tentatively reserved for Transactional Memory extension */ \
| (1 << 20) /* U - User mode implemented */ \
| (0 << 21) /* V - Tentatively reserved for Vector extension */ \
| (`EXT_V_ENABLED << 21) /* V - Tentatively reserved for Vector extension */ \
| (0 << 22) /* W - Reserved */ \
| (1 << 23) /* X - Non-standard extensions present */ \
| (0 << 24) /* Y - Reserved */ \

View file

@ -50,10 +50,16 @@
`define PERF_CTR_BITS 44
// UUID configuration.
// UUIDs are enabled (UUID_ENABLE defined, UUID_WIDTH = 44) in debug builds
// (NDEBUG undefined) and in SCOPE builds. Only a release build without SCOPE
// leaves UUID_ENABLE undefined and shrinks UUID_WIDTH to 1.
`ifdef NDEBUG
`ifndef SCOPE
`define UUID_WIDTH 1
`else
`define UUID_ENABLE
`define UUID_WIDTH 44
`endif
`else
`define UUID_ENABLE
`define UUID_WIDTH 44
`endif
`define PC_BITS (`XLEN-1)
`define OFFSET_BITS 12
@ -227,22 +233,19 @@
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b0110
`define INST_FPU_MISC 4'b0111 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_F2I 4'b1000
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_ADD 4'b0000 // SUB=fmt[1]
`define INST_FPU_MUL 4'b0001
`define INST_FPU_MADD 4'b0010 // SUB=fmt[1]
`define INST_FPU_NMADD 4'b0011 // SUB=fmt[1]
`define INST_FPU_DIV 4'b0100
`define INST_FPU_SQRT 4'b0101
`define INST_FPU_F2I 4'b1000 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_F2U 4'b1001 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_I2F 4'b1010 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_U2F 4'b1011 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
`define INST_FPU_CMP 4'b1100 // frm: LE=0, LT=1, EQ=2
`define INST_FPU_F2F 4'b1101 // fmt[0]: F32=0, F64=1
`define INST_FPU_MISC 4'b1110 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
`define INST_FPU_BITS 4
`define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3)
`define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4)
@ -267,14 +270,14 @@
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width) \
(uuid_width + `CLOG2(mshr_size) + `CLOG2(`CDIV(num_banks, mem_ports)))
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width) \
(`CLOG2(`CDIV(num_reqs, mem_ports)) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, uuid_width) \
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), `CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width)) + 1)
///////////////////////////////////////////////////////////////////////////////
@ -284,14 +287,14 @@
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks, mem_ports, uuid_width), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
///////////////////////////////////////////////////////////////////////////////
@ -303,11 +306,12 @@
`define L1_ENABLE
`endif
// Address-type flag indices and the resulting flag-vector width.
// The width is ADDR_TYPE_LOCAL plus LMEM_ENABLED, so the LOCAL index only
// fits inside the vector when LMEM_ENABLED contributes 1
// (NOTE(review): assumes LMEM_ENABLED is 0 or 1 - confirm).
`define ADDR_TYPE_FLUSH 0
`define ADDR_TYPE_IO 1
`define ADDR_TYPE_LOCAL 2 // should be last since optional
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
// Memory-request flag indices and the resulting flag-vector width.
// The width is MEM_REQ_FLAG_LOCAL plus LMEM_ENABLED, so the LOCAL index only
// fits inside the vector when LMEM_ENABLED contributes 1
// (NOTE(review): assumes LMEM_ENABLED is 0 or 1 - confirm).
`define MEM_REQ_FLAG_FLUSH 0
`define MEM_REQ_FLAG_IO 1
`define MEM_REQ_FLAG_LOCAL 2 // should be last since optional
`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED)
`define VX_MEM_PORTS `L3_MEM_PORTS
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
@ -320,12 +324,23 @@
///////////////////////////////////////////////////////////////////////////////
`define BUFFER_EX(dst, src, ena, latency) \
`define NEG_EDGE(dst, src) \
VX_edge_trigger #( \
.POS (0), \
.INIT (0) \
) __neg_edge`__LINE__ ( \
.clk (clk), \
.reset (1'b0), \
.data_in (src), \
.data_out (dst) \
)
`define BUFFER_EX(dst, src, ena, resetw, latency) \
VX_pipe_register #( \
.DATAW ($bits(dst)), \
.RESETW ($bits(dst)), \
.RESETW (resetw), \
.DEPTH (latency) \
) __``dst``__ ( \
) __buffer_ex`__LINE__ ( \
.clk (clk), \
.reset (reset), \
.enable (ena), \
@ -333,13 +348,13 @@
.data_out (dst) \
)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, 1)
`define BUFFER(dst, src) `BUFFER_EX(dst, src, 1'b1, $bits(dst), 1)
`define POP_COUNT_EX(out, in, model) \
VX_popcount #( \
.N ($bits(in)), \
.MODEL (model) \
) __``out``__ ( \
) __pop_count_ex`__LINE__ ( \
.data_in (in), \
.data_out (out) \
)
@ -359,50 +374,114 @@
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_MEM_BUS_IF_X(dst, src, TD, TS) \
`define ASSIGN_VX_MEM_BUS_RO_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.rw = 0; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.atype = src.req_data.atype; \
assign dst.req_data.data = src.req_data.data; \
if (TD != TS) \
assign dst.req_data.tag = {src.req_data.tag, {(TD-TS){1'b0}}}; \
else \
assign dst.req_data.tag = src.req_data.tag; \
assign dst.req_data.data = '0; \
assign dst.req_data.byteen = '1; \
assign dst.req_data.flags = src.req_data.flags; \
assign dst.req_data.tag = src.req_data.tag; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
assign src.rsp_data.tag = dst.rsp_data.tag[TD-1 -: TS]; \
assign src.rsp_data.tag = dst.rsp_data.tag; \
assign dst.rsp_ready = src.rsp_ready
`define ASSIGN_VX_LSU_MEM_IF(dst, src) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data = src.req_data; \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data = dst.rsp_data; \
assign dst.rsp_ready = src.rsp_ready
`define BUFFER_DCR_BUS_IF(dst, src, enable) \
if (enable) begin \
reg [(1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH)-1:0] __dst; \
always @(posedge clk) begin \
__dst <= {src.write_valid, src.write_addr, src.write_data}; \
`define ASSIGN_VX_MEM_BUS_IF_EX(dst, src, TD, TS, UUID) \
assign dst.req_valid = src.req_valid; \
assign dst.req_data.rw = src.req_data.rw; \
assign dst.req_data.addr = src.req_data.addr; \
assign dst.req_data.data = src.req_data.data; \
assign dst.req_data.byteen = src.req_data.byteen; \
assign dst.req_data.flags = src.req_data.flags; \
/* verilator lint_off GENUNNAMED */ \
if (TD != TS) begin \
if (UUID != 0) begin \
if (TD > TS) begin \
assign dst.req_data.tag = {src.req_data.tag.uuid, {(TD-TS){1'b0}}, src.req_data.tag.value}; \
end else begin \
assign dst.req_data.tag = {src.req_data.tag.uuid, src.req_data.tag.value[TD-UUID-1:0]}; \
end \
end else begin \
if (TD > TS) begin \
assign dst.req_data.tag = {{(TD-TS){1'b0}}, src.req_data.tag}; \
end else begin \
assign dst.req_data.tag = src.req_data.tag[TD-1:0]; \
end \
end \
assign {dst.write_valid, dst.write_addr, dst.write_data} = __dst; \
end else begin \
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid, src.write_addr, src.write_data}; \
end
assign dst.req_data.tag = src.req_data.tag; \
end \
/* verilator lint_on GENUNNAMED */ \
assign src.req_ready = dst.req_ready; \
assign src.rsp_valid = dst.rsp_valid; \
assign src.rsp_data.data = dst.rsp_data.data; \
/* verilator lint_off GENUNNAMED */ \
if (TD != TS) begin \
if (UUID != 0) begin \
if (TD > TS) begin \
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, dst.rsp_data.tag.value[TS-UUID-1:0]}; \
end else begin \
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, {(TS-TD){1'b0}}, dst.rsp_data.tag.value}; \
end \
end else begin \
if (TD > TS) begin \
assign src.rsp_data.tag = dst.rsp_data.tag[TS-1:0]; \
end else begin \
assign src.rsp_data.tag = {{(TS-TD){1'b0}}, dst.rsp_data.tag}; \
end \
end \
end else begin \
assign src.rsp_data.tag = dst.rsp_data.tag; \
end \
/* verilator lint_on GENUNNAMED */ \
assign dst.rsp_ready = src.rsp_ready
// Ties off a memory bus interface from the requesting side: req_valid,
// req_data and rsp_ready are driven to zero, while req_ready, rsp_valid and
// rsp_data are marked as intentionally unused.
// The expansion already ends with ';'.
`define INIT_VX_MEM_BUS_IF(itf) \
    `UNUSED_VAR (itf.req_ready) \
    `UNUSED_VAR (itf.rsp_valid) \
    `UNUSED_VAR (itf.rsp_data) \
    assign itf.req_valid = 0; \
    assign itf.req_data = '0; \
    assign itf.rsp_ready = 0;
// Ties off a memory bus interface from the responding side: req_ready,
// rsp_valid and rsp_data are driven to zero, while req_valid, req_data and
// rsp_ready are marked as intentionally unused.
`define UNUSED_VX_MEM_BUS_IF(itf) \
    assign itf.req_ready = 0; \
    assign itf.rsp_valid = 0; \
    assign itf.rsp_data = '0; \
    `UNUSED_VAR (itf.req_valid) \
    `UNUSED_VAR (itf.req_data) \
    `UNUSED_VAR (itf.rsp_ready)
// Forwards a DCR write bus from src to dst, gating write_valid with ena.
//   latency == 0 : purely combinational connection.
//   latency != 0 : {write_valid, write_addr, write_data} goes through a
//                  VX_pipe_register of depth latency; the register is
//                  always enabled and its reset input is tied to 1'b0.
// Expands to an unnamed generate-if, so it must be used at module/generate
// scope where a signal named clk is visible.
`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \
    /* verilator lint_off GENUNNAMED */ \
    if ((latency) == 0) begin \
        assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid && ena, src.write_addr, src.write_data}; \
    end else begin \
        VX_pipe_register #( \
            .DATAW (1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH), \
            .DEPTH (latency) \
        ) pipe_reg ( \
            .clk (clk), \
            .reset (1'b0), \
            .enable (1'b1), \
            .data_in ({src.write_valid && ena, src.write_addr, src.write_data}), \
            .data_out ({dst.write_valid, dst.write_addr, dst.write_data}) \
        ); \
    end \
    /* verilator lint_on GENUNNAMED */
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
/* verilator lint_off GENUNNAMED */ \
if (count > 1) begin \
wire [count-1:0][width-1:0] __reduce_add_i_field; \
wire [width-1:0] __reduce_add_o_field; \
for (genvar __i = 0; __i < count; ++__i) begin \
assign __reduce_add_i_field[__i] = src[__i].``field; \
end \
VX_reduce #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
__reduce_add_i_field, \
__reduce_add_o_field \
); \
@ -421,9 +500,11 @@
end \
end else begin \
assign dst.``field = src[0].``field; \
end
end \
/* verilator lint_on GENUNNAMED */
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
/* verilator lint_off GENUNNAMED */ \
if (block_size != 1) begin \
if (block_size != `NUM_WARPS) begin \
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
@ -432,6 +513,7 @@
end \
end else begin \
assign dst = src; \
end
end \
/* verilator lint_on GENUNNAMED */
`endif // VX_DEFINE_VH

View file

@ -73,6 +73,17 @@ package VX_gpu_pkg;
logic [`PERF_CTR_BITS-1:0] crsp_stalls;
} cache_perf_t;
// Local-memory performance counters; every field is PERF_CTR_BITS wide.
// Packed struct: field order defines the bit layout, so do not reorder.
// NOTE(review): the per-field meanings are inferred from the names - confirm
// against the logic that drives them.
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads; // presumably read request count
logic [`PERF_CTR_BITS-1:0] writes; // presumably write request count
logic [`PERF_CTR_BITS-1:0] bank_stalls; // presumably bank-conflict stall count
logic [`PERF_CTR_BITS-1:0] crsp_stalls; // presumably core-response stall count
} lmem_perf_t;
// Coalescer performance counters: a single PERF_CTR_BITS-wide miss counter.
// NOTE(review): what exactly counts as a "miss" is not visible here - confirm.
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] misses;
} coalescer_perf_t;
typedef struct packed {
logic [`PERF_CTR_BITS-1:0] reads;
logic [`PERF_CTR_BITS-1:0] writes;
@ -92,6 +103,26 @@ package VX_gpu_pkg;
logic [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] sfu_uses;
} issue_perf_t;
// Aggregate of the memory-system performance counters: one cache_perf_t per
// cache (icache, dcache, l2cache, l3cache) plus the local-memory, coalescer
// and memory counter structs.
// Packed struct: field order defines the bit layout, so do not reorder.
typedef struct packed {
cache_perf_t icache;
cache_perf_t dcache;
cache_perf_t l2cache;
cache_perf_t l3cache;
lmem_perf_t lmem;
coalescer_perf_t coalescer;
mem_perf_t mem;
} sysmem_perf_t;
// Pipeline performance counters: the scheduler and issue counter structs
// followed by five PERF_CTR_BITS-wide scalar counters.
// Packed struct: field order defines the bit layout, so do not reorder.
// NOTE(review): the scalar meanings are inferred from the names - confirm
// against the logic that drives them.
typedef struct packed {
sched_perf_t sched;
issue_perf_t issue;
logic [`PERF_CTR_BITS-1:0] ifetches; // presumably instruction fetch count
logic [`PERF_CTR_BITS-1:0] loads; // presumably load count
logic [`PERF_CTR_BITS-1:0] stores; // presumably store count
logic [`PERF_CTR_BITS-1:0] ifetch_latency; // presumably accumulated fetch latency
logic [`PERF_CTR_BITS-1:0] load_latency; // presumably accumulated load latency
} pipeline_perf_t;
//////////////////////// instruction arguments ////////////////////////////
typedef struct packed {
@ -145,6 +176,7 @@ package VX_gpu_pkg;
localparam LSU_TAG_ID_BITS = (`CLOG2(`LSUQ_IN_SIZE) + `CLOG2(LSU_MEM_BATCHES));
localparam LSU_TAG_WIDTH = (`UUID_WIDTH + LSU_TAG_ID_BITS);
localparam LSU_NUM_REQS = `NUM_LSU_BLOCKS * `NUM_LSU_LANES;
localparam LMEM_TAG_WIDTH = LSU_TAG_WIDTH + `CLOG2(`NUM_LSU_BLOCKS);
////////////////////////// Icache Parameters //////////////////////////////
@ -166,9 +198,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, 1, `NUM_ICACHES, `UUID_WIDTH);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, 1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
@ -180,7 +212,7 @@ package VX_gpu_pkg;
// Block size in bytes
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size
// Input request size (using coalesced memory blocks)
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
@ -197,26 +229,27 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES, `UUID_WIDTH);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, `L1_MEM_PORTS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
// arbitrate between icache and dcache
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
localparam ICACHE_MEM_ARB_IDX = 0;
localparam DCACHE_MEM_ARB_IDX = ICACHE_MEM_ARB_IDX + 1;
// Word size in bytes
localparam L2_WORD_SIZE = `L1_LINE_SIZE;
// Input request size
localparam L2_NUM_REQS = `NUM_SOCKETS;
localparam L2_NUM_REQS = `NUM_SOCKETS * `L1_MEM_PORTS;
// Core request tag bits
localparam L2_TAG_WIDTH = L1_MEM_ARB_TAG_WIDTH;
@ -226,9 +259,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH, `UUID_WIDTH);
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_MEM_PORTS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
/////////////////////////////// L3 Parameters /////////////////////////////
@ -237,7 +270,7 @@ package VX_gpu_pkg;
localparam L3_WORD_SIZE = `L2_LINE_SIZE;
// Input request size
localparam L3_NUM_REQS = `NUM_CLUSTERS;
localparam L3_NUM_REQS = `NUM_CLUSTERS * `L2_MEM_PORTS;
// Core request tag bits
localparam L3_TAG_WIDTH = L2_MEM_TAG_WIDTH;
@ -247,9 +280,9 @@ package VX_gpu_pkg;
// Memory request tag bits
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH, `UUID_WIDTH);
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_MEM_PORTS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/////////////////////////////// Issue parameters //////////////////////////
@ -308,6 +341,430 @@ package VX_gpu_pkg;
`IGNORE_UNUSED_END
////////////////////////////////// Tracing ////////////////////////////////////
`ifdef SIMULATION
`ifdef SV_DPI
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
`endif
// Emits the name of an execution unit ("ALU", "LSU", "SFU" or, when
// EXT_F_ENABLE is defined, "FPU") through the TRACE macro at the given
// level. Any other ex_type value is reported as "?".
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
    if (ex_type == `EX_ALU) begin
        `TRACE(level, ("ALU"))
    end else if (ex_type == `EX_LSU) begin
        `TRACE(level, ("LSU"))
    end else if (ex_type == `EX_SFU) begin
        `TRACE(level, ("SFU"))
`ifdef EXT_F_ENABLE
    end else if (ex_type == `EX_FPU) begin
        `TRACE(level, ("FPU"))
`endif
    end else begin
        `TRACE(level, ("?"))
    end
endtask
// Emits the mnemonic of a decoded instruction through the TRACE macro.
//   level   - trace verbosity level forwarded to TRACE
//   ex_type - execution unit selector (EX_ALU / EX_LSU / EX_SFU / EX_FPU)
//   op_type - unit-specific opcode; it is cast to the selected unit's opcode
//             width (INST_ALU_BITS, INST_BR_BITS, ...) before matching
//   op_args - decoded arguments used to pick the mnemonic variant (is_w,
//             use_imm, is_float, is_neg, fmt, frm)
// Unrecognized combinations print "?". The EX_FPU branch only exists when
// EXT_F_ENABLE is defined.
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input VX_gpu_pkg::op_args_t op_args
);
case (ex_type)
`EX_ALU: begin
// ALU ops are split by xtype: arithmetic, branch, or mul/div.
case (op_args.alu.xtype)
`ALU_TYPE_ARITH: begin
// is_w selects the *W (word) mnemonics, use_imm the immediate forms.
if (op_args.alu.is_w) begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDIW"))
`INST_ALU_SLL: `TRACE(level, ("SLLIW"))
`INST_ALU_SRL: `TRACE(level, ("SRLIW"))
`INST_ALU_SRA: `TRACE(level, ("SRAIW"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDW"))
`INST_ALU_SUB: `TRACE(level, ("SUBW"))
`INST_ALU_SLL: `TRACE(level, ("SLLW"))
`INST_ALU_SRL: `TRACE(level, ("SRLW"))
`INST_ALU_SRA: `TRACE(level, ("SRAW"))
default: `TRACE(level, ("?"))
endcase
end
end else begin
if (op_args.alu.use_imm) begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADDI"))
`INST_ALU_SLL: `TRACE(level, ("SLLI"))
`INST_ALU_SRL: `TRACE(level, ("SRLI"))
`INST_ALU_SRA: `TRACE(level, ("SRAI"))
`INST_ALU_SLT: `TRACE(level, ("SLTI"))
`INST_ALU_SLTU: `TRACE(level, ("SLTIU"))
`INST_ALU_XOR: `TRACE(level, ("XORI"))
`INST_ALU_OR: `TRACE(level, ("ORI"))
`INST_ALU_AND: `TRACE(level, ("ANDI"))
`INST_ALU_LUI: `TRACE(level, ("LUI"))
`INST_ALU_AUIPC: `TRACE(level, ("AUIPC"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_ALU_BITS'(op_type))
`INST_ALU_ADD: `TRACE(level, ("ADD"))
`INST_ALU_SUB: `TRACE(level, ("SUB"))
`INST_ALU_SLL: `TRACE(level, ("SLL"))
`INST_ALU_SRL: `TRACE(level, ("SRL"))
`INST_ALU_SRA: `TRACE(level, ("SRA"))
`INST_ALU_SLT: `TRACE(level, ("SLT"))
`INST_ALU_SLTU: `TRACE(level, ("SLTU"))
`INST_ALU_XOR: `TRACE(level, ("XOR"))
`INST_ALU_OR: `TRACE(level, ("OR"))
`INST_ALU_AND: `TRACE(level, ("AND"))
`INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"))
`INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"))
default: `TRACE(level, ("?"))
endcase
end
end
end
`ALU_TYPE_BRANCH: begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: `TRACE(level, ("BEQ"))
`INST_BR_NE: `TRACE(level, ("BNE"))
`INST_BR_LT: `TRACE(level, ("BLT"))
`INST_BR_GE: `TRACE(level, ("BGE"))
`INST_BR_LTU: `TRACE(level, ("BLTU"))
`INST_BR_GEU: `TRACE(level, ("BGEU"))
`INST_BR_JAL: `TRACE(level, ("JAL"))
`INST_BR_JALR: `TRACE(level, ("JALR"))
`INST_BR_ECALL: `TRACE(level, ("ECALL"))
`INST_BR_EBREAK:`TRACE(level, ("EBREAK"))
`INST_BR_URET: `TRACE(level, ("URET"))
`INST_BR_SRET: `TRACE(level, ("SRET"))
`INST_BR_MRET: `TRACE(level, ("MRET"))
default: `TRACE(level, ("?"))
endcase
end
`ALU_TYPE_MULDIV: begin
if (op_args.alu.is_w) begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MULW"))
`INST_M_DIV: `TRACE(level, ("DIVW"))
`INST_M_DIVU: `TRACE(level, ("DIVUW"))
`INST_M_REM: `TRACE(level, ("REMW"))
`INST_M_REMU: `TRACE(level, ("REMUW"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_M_BITS'(op_type))
`INST_M_MUL: `TRACE(level, ("MUL"))
`INST_M_MULH: `TRACE(level, ("MULH"))
`INST_M_MULHSU:`TRACE(level, ("MULHSU"))
`INST_M_MULHU: `TRACE(level, ("MULHU"))
`INST_M_DIV: `TRACE(level, ("DIV"))
`INST_M_DIVU: `TRACE(level, ("DIVU"))
`INST_M_REM: `TRACE(level, ("REM"))
`INST_M_REMU: `TRACE(level, ("REMU"))
default: `TRACE(level, ("?"))
endcase
end
end
default: `TRACE(level, ("?"))
endcase
end
`EX_LSU: begin
// is_float selects the floating-point load/store mnemonics.
if (op_args.lsu.is_float) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LW: `TRACE(level, ("FLW"))
`INST_LSU_LD: `TRACE(level, ("FLD"))
`INST_LSU_SW: `TRACE(level, ("FSW"))
`INST_LSU_SD: `TRACE(level, ("FSD"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LB: `TRACE(level, ("LB"))
`INST_LSU_LH: `TRACE(level, ("LH"))
`INST_LSU_LW: `TRACE(level, ("LW"))
`INST_LSU_LD: `TRACE(level, ("LD"))
`INST_LSU_LBU:`TRACE(level, ("LBU"))
`INST_LSU_LHU:`TRACE(level, ("LHU"))
`INST_LSU_LWU:`TRACE(level, ("LWU"))
`INST_LSU_SB: `TRACE(level, ("SB"))
`INST_LSU_SH: `TRACE(level, ("SH"))
`INST_LSU_SW: `TRACE(level, ("SW"))
`INST_LSU_SD: `TRACE(level, ("SD"))
`INST_LSU_FENCE:`TRACE(level,("FENCE"))
default: `TRACE(level, ("?"))
endcase
end
end
`EX_SFU: begin
// Warp-control ops use wctl.is_neg for the ".N" variants; CSR ops use
// csr.use_imm for the "I" variants.
case (`INST_SFU_BITS'(op_type))
`INST_SFU_TMC: `TRACE(level, ("TMC"))
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"))
`INST_SFU_SPLIT: begin
if (op_args.wctl.is_neg) begin
`TRACE(level, ("SPLIT.N"))
end else begin
`TRACE(level, ("SPLIT"))
end
end
`INST_SFU_JOIN: `TRACE(level, ("JOIN"))
`INST_SFU_BAR: `TRACE(level, ("BAR"))
`INST_SFU_PRED: begin
if (op_args.wctl.is_neg) begin
`TRACE(level, ("PRED.N"))
end else begin
`TRACE(level, ("PRED"))
end
end
`INST_SFU_CSRRW: begin
if (op_args.csr.use_imm) begin
`TRACE(level, ("CSRRWI"))
end else begin
`TRACE(level, ("CSRRW"))
end
end
`INST_SFU_CSRRS: begin
if (op_args.csr.use_imm) begin
`TRACE(level, ("CSRRSI"))
end else begin
`TRACE(level, ("CSRRS"))
end
end
`INST_SFU_CSRRC: begin
if (op_args.csr.use_imm) begin
`TRACE(level, ("CSRRCI"))
end else begin
`TRACE(level, ("CSRRC"))
end
end
default: `TRACE(level, ("?"))
endcase
end
`ifdef EXT_F_ENABLE
`EX_FPU: begin
// fmt[0] selects the .D (set) or .S (clear) mnemonic. For ADD/MADD/NMADD
// fmt[1] selects the subtracting variant; for the int<->float conversions
// fmt[1] selects the L (set) or W (clear) integer form.
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: begin
if (op_args.fpu.fmt[1]) begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FSUB.D"))
end else begin
`TRACE(level, ("FSUB.S"))
end
end else begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FADD.D"))
end else begin
`TRACE(level, ("FADD.S"))
end
end
end
`INST_FPU_MADD: begin
if (op_args.fpu.fmt[1]) begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FMSUB.D"))
end else begin
`TRACE(level, ("FMSUB.S"))
end
end else begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FMADD.D"))
end else begin
`TRACE(level, ("FMADD.S"))
end
end
end
`INST_FPU_NMADD: begin
if (op_args.fpu.fmt[1]) begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FNMSUB.D"))
end else begin
`TRACE(level, ("FNMSUB.S"))
end
end else begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FNMADD.D"))
end else begin
`TRACE(level, ("FNMADD.S"))
end
end
end
`INST_FPU_MUL: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FMUL.D"))
end else begin
`TRACE(level, ("FMUL.S"))
end
end
`INST_FPU_DIV: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FDIV.D"))
end else begin
`TRACE(level, ("FDIV.S"))
end
end
`INST_FPU_SQRT: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FSQRT.D"))
end else begin
`TRACE(level, ("FSQRT.S"))
end
end
`INST_FPU_CMP: begin
// The low two bits of frm select the comparison: 0=LE, 1=LT, 2=EQ.
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.D"))
1: `TRACE(level, ("FLT.D"))
2: `TRACE(level, ("FEQ.D"))
default: `TRACE(level, ("?"))
endcase
end else begin
case (op_args.fpu.frm[1:0])
0: `TRACE(level, ("FLE.S"))
1: `TRACE(level, ("FLT.S"))
2: `TRACE(level, ("FEQ.S"))
default: `TRACE(level, ("?"))
endcase
end
end
`INST_FPU_F2F: begin
if (op_args.fpu.fmt[0]) begin
`TRACE(level, ("FCVT.D.S"))
end else begin
`TRACE(level, ("FCVT.S.D"))
end
end
`INST_FPU_F2I: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.D"))
end else begin
`TRACE(level, ("FCVT.W.D"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.L.S"))
end else begin
`TRACE(level, ("FCVT.W.S"))
end
end
end
`INST_FPU_F2U: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.D"))
end else begin
`TRACE(level, ("FCVT.WU.D"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.LU.S"))
end else begin
`TRACE(level, ("FCVT.WU.S"))
end
end
end
`INST_FPU_I2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.L"))
end else begin
`TRACE(level, ("FCVT.D.W"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.L"))
end else begin
`TRACE(level, ("FCVT.S.W"))
end
end
end
`INST_FPU_U2F: begin
if (op_args.fpu.fmt[0]) begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.D.LU"))
end else begin
`TRACE(level, ("FCVT.D.WU"))
end
end else begin
if (op_args.fpu.fmt[1]) begin
`TRACE(level, ("FCVT.S.LU"))
end else begin
`TRACE(level, ("FCVT.S.WU"))
end
end
end
`INST_FPU_MISC: begin
// frm selects the misc operation. These two case statements list the
// values 0..7 and have no default, so any other frm value (including
// X/Z) prints nothing.
if (op_args.fpu.fmt[0]) begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.D"))
1: `TRACE(level, ("FSGNJN.D"))
2: `TRACE(level, ("FSGNJX.D"))
3: `TRACE(level, ("FCLASS.D"))
4: `TRACE(level, ("FMV.X.D"))
5: `TRACE(level, ("FMV.D.X"))
6: `TRACE(level, ("FMIN.D"))
7: `TRACE(level, ("FMAX.D"))
endcase
end else begin
case (op_args.fpu.frm)
0: `TRACE(level, ("FSGNJ.S"))
1: `TRACE(level, ("FSGNJN.S"))
2: `TRACE(level, ("FSGNJX.S"))
3: `TRACE(level, ("FCLASS.S"))
4: `TRACE(level, ("FMV.X.S"))
5: `TRACE(level, ("FMV.S.X"))
6: `TRACE(level, ("FMIN.S"))
7: `TRACE(level, ("FMAX.S"))
endcase
end
end
default: `TRACE(level, ("?"))
endcase
end
`endif
default: `TRACE(level, ("?"))
endcase
endtask
// Emits the unit-specific instruction arguments through the TRACE macro:
//   ALU - use_PC, use_imm and imm
//   LSU - offset
//   SFU - addr, use_imm and imm, only for CSR operations
//   FPU - fmt and frm (only when EXT_F_ENABLE is defined)
// Nothing is printed for any other ex_type.
task trace_op_args(input int level,
    input [`EX_BITS-1:0] ex_type,
    input [`INST_OP_BITS-1:0] op_type,
    input VX_gpu_pkg::op_args_t op_args
);
    if (ex_type == `EX_ALU) begin
        `TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm))
    end else if (ex_type == `EX_LSU) begin
        `TRACE(level, (", offset=0x%0h", op_args.lsu.offset))
    end else if (ex_type == `EX_SFU) begin
        if (`INST_SFU_IS_CSR(op_type)) begin
            `TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm))
        end
    end
`ifdef EXT_F_ENABLE
    else if (ex_type == `EX_FPU) begin
        `TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm))
    end
`endif
endtask
// Emits the symbolic name of a base DCR address through the TRACE macro at
// the given level; addresses without a known name are printed as "?".
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
    case (addr)
        `VX_DCR_BASE_STARTUP_ADDR0: begin
            `TRACE(level, ("STARTUP_ADDR0"))
        end
        `VX_DCR_BASE_STARTUP_ADDR1: begin
            `TRACE(level, ("STARTUP_ADDR1"))
        end
        `VX_DCR_BASE_STARTUP_ARG0: begin
            `TRACE(level, ("STARTUP_ARG0"))
        end
        `VX_DCR_BASE_STARTUP_ARG1: begin
            `TRACE(level, ("STARTUP_ARG1"))
        end
        `VX_DCR_BASE_MPM_CLASS: begin
            `TRACE(level, ("MPM_CLASS"))
        end
        default: begin
            `TRACE(level, ("?"))
        end
    endcase
endtask
`endif
endpackage
`endif // VX_GPU_PKG_VH

View file

@ -22,36 +22,34 @@
///////////////////////////////////////////////////////////////////////////////
`ifdef VIVADO
`define STRING
`else
`define STRING string
`endif
`ifdef SIMULATION
`ifdef SYNTHESIS
`define TRACING_ON
`define TRACING_OFF
`ifndef NDEBUG
`define DEBUG_BLOCK(x) x
`else
`define DEBUG_BLOCK(x)
`endif
`define IGNORE_UNOPTFLAT_BEGIN
`define IGNORE_UNOPTFLAT_END
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_SPARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`else
`ifdef VERILATOR
// Elaboration-time check (Verilator variant): expands to an unnamed
// generate-if that raises $error with msg when cond is false. msg must be a
// parenthesized $error argument list, e.g. ("bad width: %0d", W).
// The last body line deliberately has no trailing line continuation, so the
// macro text ends here and cannot absorb whatever line follows it.
`define STATIC_ASSERT(cond, msg) \
    /* verilator lint_off GENUNNAMED */ \
    if (!(cond)) $error msg; \
    /* verilator lint_on GENUNNAMED */
`define ERROR(msg) \
$error msg
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
if (!reset) begin \
`ASSERT(cond, msg); \
end \
end
`ifndef TRACING_ALL
`define TRACING_ON /* verilator tracing_on */
`define TRACING_OFF /* verilator tracing_off */
`else
`define TRACING_ON
`define TRACING_OFF
`endif
`ifndef NDEBUG
`define DEBUG_BLOCK(x) /* verilator lint_off UNUSED */ \
x \
@ -100,74 +98,99 @@
localparam `STRING __``x = x; \
/* verilator lint_on UNUSED */
`define UNUSED_VAR(x) if (1) begin \
`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \
if (1) begin \
/* verilator lint_off UNUSED */ \
wire [$bits(x)-1:0] __x = x; \
wire [$bits(x)-1:0] __unused = x; \
/* verilator lint_on UNUSED */ \
end
end \
/* verilator lint_on GENUNNAMED */
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
. x () \
/* verilator lint_on PINCONNECTEMPTY */
`define UNUSED_ARG(x) /* verilator lint_off UNUSED */ \
x \
/* verilator lint_on UNUSED */
`endif
`ifdef SV_DPI
`define TRACE(level, args) dpi_trace(level, $sformatf args)
`define TRACE(level, args) dpi_trace(level, $sformatf args);
`else
`define TRACE(level, args) if (level <= `DEBUG_LEVEL) $write args
`define TRACE(level, args) \
if (level <= `DEBUG_LEVEL) begin \
$write args; \
end
`endif
`endif
`define SFORMATF(x) $sformatf x
`ifdef SIMULATION
`define STATIC_ASSERT(cond, msg) \
generate \
if (!(cond)) $error msg; \
endgenerate
`else // SYNTHESIS
`define ERROR(msg) \
$error msg
`define STATIC_ASSERT(cond, msg)
`define ERROR(msg) //
`define ASSERT(cond, msg) //
`define RUNTIME_ASSERT(cond, msg)
`define ASSERT(cond, msg) \
assert(cond) else $error msg
`define DEBUG_BLOCK(x)
`define TRACE(level, args)
`define SFORMATF(x) ""
`define TRACING_ON
`define TRACING_OFF
`define IGNORE_UNOPTFLAT_BEGIN
`define IGNORE_UNOPTFLAT_END
`define IGNORE_UNUSED_BEGIN
`define IGNORE_UNUSED_END
`define IGNORE_WARNINGS_BEGIN
`define IGNORE_WARNINGS_END
`define UNUSED_PARAM(x)
`define UNUSED_SPARAM(x)
`define UNUSED_VAR(x)
`define UNUSED_PIN(x) . x ()
`define UNUSED_ARG(x) x
`define RUNTIME_ASSERT(cond, msg) \
always @(posedge clk) begin \
assert(cond) else $error msg; \
end
`else
`define STATIC_ASSERT(cond, msg)
`define ERROR(msg) //
`define ASSERT(cond, msg) //
`define RUNTIME_ASSERT(cond, msg)
`endif
///////////////////////////////////////////////////////////////////////////////
// Toolchain-specific helper macros. Exactly one of the three branches is
// active: QUARTUS, VIVADO, or the generic fallback in which every attribute
// macro expands to nothing.
//
//   MAX_FANOUT        - constant 8 in every branch.
//   IF_DATA_SIZE(x)   - $bits(x.data) for QUARTUS/VIVADO, x.DATA_WIDTH otherwise.
//   FORCE_BRAM(d,w)   - true when d >= 16, w >= 128, or d*w >= 256
//                       (d/w presumably depth/width - confirm at call sites).
//                       Arguments are parenthesized so that expression
//                       arguments (e.g. A+B) expand with the intended grouping.
//   USE_BLOCK_BRAM, USE_FAST_BRAM, NO_RW_RAM_CHECK, RW_RAM_CHECK,
//   DISABLE_BRAM, PRESERVE_NET, BLACKBOX_CELL
//                     - vendor attribute instances; empty in the fallback.
//   STRING            - 'string' for QUARTUS and the fallback, empty for VIVADO.
//   ASYNC_BRAM_PATCH  - defined only for VIVADO when SIMULATION is not defined.
`ifdef QUARTUS
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) $bits(x.data)
`define FORCE_BRAM(d,w) ((d) >= 16 || (w) >= 128 || ((d) * (w)) >= 256)
`define USE_BLOCK_BRAM (* ramstyle = "block" *)
`define USE_FAST_BRAM (* ramstyle = "MLAB, no_rw_check" *)
`define NO_RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams off" *)
`define RW_RAM_CHECK (* altera_attribute = "-name add_pass_through_logic_to_inferred_rams on" *)
`define DISABLE_BRAM (* ramstyle = "logic" *)
`define PRESERVE_NET (* preserve *)
`define BLACKBOX_CELL (* black_box *)
`define STRING string
`elsif VIVADO
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) $bits(x.data)
`define FORCE_BRAM(d,w) ((d) >= 16 || (w) >= 128 || ((d) * (w)) >= 256)
`define USE_BLOCK_BRAM (* ram_style = "block" *)
`define USE_FAST_BRAM (* ram_style = "distributed" *)
`define NO_RW_RAM_CHECK (* rw_addr_collision = "no" *)
`define RW_RAM_CHECK (* rw_addr_collision = "yes" *)
`define DISABLE_BRAM (* ram_style = "registers" *)
`define PRESERVE_NET (* keep = "true" *)
`define BLACKBOX_CELL (* black_box *)
`define STRING
`ifndef SIMULATION
`define ASYNC_BRAM_PATCH
`endif
`else
`define MAX_FANOUT 8
`define IF_DATA_SIZE(x) x.DATA_WIDTH
`define FORCE_BRAM(d,w) ((d) >= 16 || (w) >= 128 || ((d) * (w)) >= 256)
`define USE_BLOCK_BRAM
`define USE_FAST_BRAM
`define NO_RW_RAM_CHECK
`define RW_RAM_CHECK
`define DISABLE_BRAM
`define PRESERVE_NET
`define BLACKBOX_CELL
`define STRING string
`endif
///////////////////////////////////////////////////////////////////////////////
@ -192,7 +215,7 @@
`define CLAMP(x, lo, hi) (((x) > (hi)) ? (hi) : (((x) < (lo)) ? (lo) : (x)))
`define UP(x) (((x) != 0) ? (x) : 1)
`define UP(x) (((x) > 0) ? (x) : 1)
// Ceiling division: (n + d - 1) / d using integer division.
// Every argument use is parenthesized so expression arguments
// (e.g. a conditional or shift expression) keep their intended grouping.
`define CDIV(n,d) (((n) + (d) - 1) / (d))
@ -204,23 +227,23 @@
`define SEXT(len, x) {{(len-$bits(x)+1){x[$bits(x)-1]}}, x[$bits(x)-2:0]}
`define TRACE_ARRAY1D(lvl, fmt, arr, n) \
`TRACE(lvl, ("{")); \
`TRACE(lvl, ("{")) \
for (integer __i = (n-1); __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, (fmt, arr[__i])); \
if (__i != (n-1)) `TRACE(lvl, (", ")) \
`TRACE(lvl, (fmt, arr[__i])) \
end \
`TRACE(lvl, ("}"));
`TRACE(lvl, ("}"))
`define TRACE_ARRAY2D(lvl, fmt, arr, m, n) \
`TRACE(lvl, ("{")); \
`TRACE(lvl, ("{")) \
for (integer __i = n-1; __i >= 0; --__i) begin \
if (__i != (n-1)) `TRACE(lvl, (", ")); \
`TRACE(lvl, ("{")); \
if (__i != (n-1)) `TRACE(lvl, (", ")) \
`TRACE(lvl, ("{")) \
for (integer __j = (m-1); __j >= 0; --__j) begin \
if (__j != (m-1)) `TRACE(lvl, (", "));\
`TRACE(lvl, (fmt, arr[__i][__j])); \
if (__j != (m-1)) `TRACE(lvl, (", "))\
`TRACE(lvl, (fmt, arr[__i][__j])) \
end \
`TRACE(lvl, ("}")); \
`TRACE(lvl, ("}")) \
end \
`TRACE(lvl, ("}"))
@ -239,10 +262,13 @@
`RESET_RELAY_EX (dst, src, 1, 0)
// size(x): 0 -> 0, 1 -> 1, 2 -> 2, 3 -> 2, 4 -> 2, 5 -> 2
`define TO_OUT_BUF_SIZE(s) `MIN(s, 2)
`define TO_OUT_BUF_SIZE(s) `MIN(s & 7, 2)
// reg(x): 0 -> 0, 1 -> 1, 2 -> 0, 3 -> 1, 4 -> 2, 5 -> 3
`define TO_OUT_BUF_REG(s) ((s < 2) ? s : (s - 2))
`define TO_OUT_BUF_REG(s) (((s & 7) < 2) ? (s & 7) : ((s & 7) - 2))
// lut(x): (x & 8) != 0
`define TO_OUT_BUF_LUTRAM(s) ((s & 8) != 0)
`define REPEAT(n,f,s) `_REPEAT_``n(f,s)
`define _REPEAT_0(f,s)

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -21,48 +21,67 @@
input wire scope_bus_in, \
output wire scope_bus_out,
`define SCOPE_IO_SWITCH(__count) \
wire scope_bus_in_w [__count]; \
wire scope_bus_out_w [__count]; \
`RESET_RELAY_EX(scope_reset_w, scope_reset, __count, 4); \
VX_scope_switch #( \
.N (__count) \
) scope_switch ( \
.clk (clk), \
.reset (scope_reset), \
.req_in (scope_bus_in), \
.rsp_out (scope_bus_out), \
.req_out (scope_bus_in_w), \
.rsp_in (scope_bus_out_w) \
);
`define SCOPE_IO_BIND(__i) \
.scope_reset (scope_reset_w[__i]), \
.scope_bus_in (scope_bus_in_w[__i]), \
.scope_bus_out (scope_bus_out_w[__i]),
`define SCOPE_IO_UNUSED() \
`UNUSED_VAR (scope_reset); \
`UNUSED_VAR (scope_bus_in); \
assign scope_bus_out = 0;
`define SCOPE_IO_UNUSED_W(__i) \
`define SCOPE_IO_UNUSED(__i) \
`UNUSED_VAR (scope_reset_w[__i]); \
`UNUSED_VAR (scope_bus_in_w[__i]); \
assign scope_bus_out_w[__i] = 0;
`define SCOPE_IO_SWITCH(__count) \
wire [__count-1:0] scope_bus_in_w; \
wire [__count-1:0] scope_bus_out_w; \
wire [__count-1:0] scope_reset_w = {__count{scope_reset}}; \
VX_scope_switch #( \
.N (__count) \
) scope_switch ( \
.clk (clk), \
.reset (scope_reset), \
.req_in (scope_bus_in), \
.rsp_out (scope_bus_out), \
.req_out (scope_bus_in_w), \
.rsp_in (scope_bus_out_w) \
)
// Instantiates one VX_scope_tap with explicit trigger/probe widths and wires
// it to element __idx of scope_reset_w, scope_bus_in_w and scope_bus_out_w.
// The instance name is formed by pasting __idx onto "scope_tap_", so each tap
// in a scope gets a distinct name; __idx must therefore be a single token
// (such as a literal index) that yields a valid identifier when pasted.
// clk is taken from the scope in which the macro is expanded.
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __htriggers_w, __probes_w, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
VX_scope_tap #( \
.SCOPE_ID (__id), \
.XTRIGGERW(__xtriggers_w), \
.HTRIGGERW(__htriggers_w), \
.PROBEW (__probes_w), \
.DEPTH (__depth) \
) scope_tap_``__idx ( \
.clk (clk), \
.reset (scope_reset_w[__idx]), \
.start (__start), \
.stop (__stop), \
.xtriggers(__xtriggers), \
.htriggers(__htriggers), \
.probes (__probes), \
.bus_in (scope_bus_in_w[__idx]), \
.bus_out(scope_bus_out_w[__idx]) \
)
`define SCOPE_TAP(__idx, __id, __xtriggers, __htriggers, __probes, __start, __stop, __depth) \
`SCOPE_TAP_EX(__idx, __id, $bits(__xtriggers), $bits(__htriggers), $bits(__probes), __xtriggers, __htriggers, __probes, __start, __stop, __depth)
`else
`define SCOPE_IO_DECL
`define SCOPE_IO_SWITCH(__count)
`define SCOPE_IO_BIND(__i)
`define SCOPE_IO_UNUSED_W(__i)
`define SCOPE_IO_UNUSED(__i)
`define SCOPE_IO_SWITCH(__count)
// Empty stubs for the `else branch (presumably taken when scope support is
// disabled - confirm against the enclosing `ifdef). Their formal argument
// lists match the real SCOPE_TAP / SCOPE_TAP_EX definitions so that a call
// written for the real macros also expands (to nothing) here.
`define SCOPE_TAP(__idx, __id, __xtriggers, __htriggers, __probes, __start, __stop, __depth)
`define SCOPE_TAP_EX(__idx, __id, __xtriggers_w, __htriggers_w, __probes_w, __xtriggers, __htriggers, __probes, __start, __stop, __depth)
`endif
`endif // VX_SCOPE_VH

View file

@ -24,14 +24,14 @@ module VX_socket import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
// DCRs
VX_dcr_bus_if.slave dcr_bus_if,
// Memory
VX_mem_bus_if.master mem_bus_if,
VX_mem_bus_if.master mem_bus_if [`L1_MEM_PORTS],
`ifdef GBAR_ENABLE
// Barrier
@ -49,14 +49,12 @@ module VX_socket import VX_gpu_pkg::*; #(
`ifdef GBAR_ENABLE
VX_gbar_bus_if per_core_gbar_bus_if[`SOCKET_SIZE]();
`RESET_RELAY (gbar_arb_reset, reset);
VX_gbar_arb #(
.NUM_REQS (`SOCKET_SIZE),
.OUT_BUF ((`SOCKET_SIZE > 1) ? 2 : 0)
) gbar_arb (
.clk (clk),
.reset (gbar_arb_reset),
.reset (reset),
.bus_in_if (per_core_gbar_bus_if),
.bus_out_if (gbar_bus_if)
);
@ -65,11 +63,13 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.lmem = 'x;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
cache_perf_t icache_perf, dcache_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.icache = icache_perf;
sysmem_perf_tmp.dcache = dcache_perf;
end
`endif
///////////////////////////////////////////////////////////////////////////
@ -82,12 +82,12 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_LINE_SIZE),
.TAG_WIDTH (ICACHE_MEM_TAG_WIDTH)
) icache_mem_bus_if();
) icache_mem_bus_if[1]();
`RESET_RELAY (icache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("%s-icache", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-icache", INSTANCE_ID))),
.NUM_UNITS (`NUM_ICACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -97,19 +97,22 @@ module VX_socket import VX_gpu_pkg::*; #(
.NUM_WAYS (`ICACHE_NUM_WAYS),
.WORD_SIZE (ICACHE_WORD_SIZE),
.NUM_REQS (1),
.MEM_PORTS (1),
.CRSQ_SIZE (`ICACHE_CRSQ_SIZE),
.MSHR_SIZE (`ICACHE_MSHR_SIZE),
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
.TAG_WIDTH (ICACHE_TAG_WIDTH),
.FLAGS_WIDTH (0),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (0),
.REPL_POLICY (`ICACHE_REPL_POLICY),
.NC_ENABLE (0),
.CORE_OUT_BUF (2),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (2)
) icache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.icache),
.cache_perf (icache_perf),
`endif
.clk (clk),
.reset (icache_reset),
@ -127,12 +130,12 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_LINE_SIZE),
.TAG_WIDTH (DCACHE_MEM_TAG_WIDTH)
) dcache_mem_bus_if();
) dcache_mem_bus_if[`L1_MEM_PORTS]();
`RESET_RELAY (dcache_reset, reset);
VX_cache_cluster #(
.INSTANCE_ID ($sformatf("%s-dcache", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-dcache", INSTANCE_ID))),
.NUM_UNITS (`NUM_DCACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (0),
@ -142,21 +145,24 @@ module VX_socket import VX_gpu_pkg::*; #(
.NUM_WAYS (`DCACHE_NUM_WAYS),
.WORD_SIZE (DCACHE_WORD_SIZE),
.NUM_REQS (DCACHE_NUM_REQS),
.MEM_PORTS (`L1_MEM_PORTS),
.CRSQ_SIZE (`DCACHE_CRSQ_SIZE),
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_WRITEBACK),
.DIRTY_BYTES (`DCACHE_DIRTYBYTES),
.REPL_POLICY (`DCACHE_REPL_POLICY),
.NC_ENABLE (1),
.CORE_OUT_BUF (2),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (2)
) dcache (
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.dcache),
.cache_perf (dcache_perf),
`endif
.clk (clk),
.reset (dcache_reset),
@ -166,51 +172,64 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
for (genvar i = 0; i < `L1_MEM_PORTS; ++i) begin : g_mem_bus_if
if (i == 0) begin : g_i0
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH)
) l1_mem_bus_if[2]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if[1]();
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[0], icache_mem_bus_if, L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_X (l1_mem_bus_if[1], dcache_mem_bus_if, L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
VX_mem_arb #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("R"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
VX_mem_arb #(
.NUM_INPUTS (2),
.NUM_OUTPUTS(1),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("P"), // prioritize the icache
.REQ_OUT_BUF(3),
.RSP_OUT_BUF(3)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (l1_mem_bus_if),
.bus_out_if (l1_mem_arb_bus_if)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, l1_mem_arb_bus_if[0]);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[0], l1_mem_arb_bus_if[0]);
end else begin : g_i
VX_mem_bus_if #(
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
) l1_mem_arb_bus_if();
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], l1_mem_arb_bus_if);
end
end
///////////////////////////////////////////////////////////////////////////
wire [`SOCKET_SIZE-1:0] per_core_busy;
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, (`SOCKET_SIZE > 1));
// Generate all cores
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : cores
for (genvar core_id = 0; core_id < `SOCKET_SIZE; ++core_id) begin : g_cores
`RESET_RELAY (core_reset, reset);
VX_dcr_bus_if core_dcr_bus_if();
`BUFFER_DCR_BUS_IF (core_dcr_bus_if, dcr_bus_if, 1'b1, (`SOCKET_SIZE > 1))
VX_core #(
.CORE_ID ((SOCKET_ID * `SOCKET_SIZE) + core_id),
.INSTANCE_ID ($sformatf("%s-core%0d", INSTANCE_ID, core_id))
.INSTANCE_ID (`SFORMATF(("%s-core%0d", INSTANCE_ID, core_id)))
) core (
`SCOPE_IO_BIND (scope_core + core_id)
@ -218,7 +237,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.reset (core_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.sysmem_perf (sysmem_perf_tmp),
`endif
.dcr_bus_if (core_dcr_bus_if),
@ -235,6 +254,6 @@ module VX_socket import VX_gpu_pkg::*; #(
);
end
`BUFFER_EX(busy, (| per_core_busy), 1'b1, (`SOCKET_SIZE > 1));
`BUFFER_EX(busy, (| per_core_busy), 1'b1, 1, (`SOCKET_SIZE > 1));
endmodule

View file

@ -166,6 +166,8 @@
`define VX_CSR_MPM_MEM_WRITES_H 12'hB99
`define VX_CSR_MPM_MEM_LT 12'hB1A // memory latency
`define VX_CSR_MPM_MEM_LT_H 12'hB9A
`define VX_CSR_MPM_MEM_BANK_ST 12'hB1E // bank conflicts
`define VX_CSR_MPM_MEM_BANK_ST_H 12'hB9E
// PERF: lmem
`define VX_CSR_MPM_LMEM_READS 12'hB1B // memory reads
`define VX_CSR_MPM_LMEM_READS_H 12'hB9B
@ -173,6 +175,9 @@
`define VX_CSR_MPM_LMEM_WRITES_H 12'hB9C
`define VX_CSR_MPM_LMEM_BANK_ST 12'hB1D // bank conflicts
`define VX_CSR_MPM_LMEM_BANK_ST_H 12'hB9D
// PERF: coalescer
`define VX_CSR_MPM_COALESCER_MISS 12'hB1F // coalescer misses
`define VX_CSR_MPM_COALESCER_MISS_H 12'hB9F
// Machine Performance-monitoring memory counters (class 3) ///////////////////
// <Add your own counters: use addresses hB03..hB1F, hB83..hB9F>
@ -184,6 +189,19 @@
`define VX_CSR_MIMPID 12'hF13
`define VX_CSR_MHARTID 12'hF14
// Vector CSRs
`define VX_CSR_VSTART 12'h008
`define VX_CSR_VXSAT 12'h009
`define VX_CSR_VXRM 12'h00A
`define VX_CSR_VCSR 12'h00F
`define VX_CSR_VL 12'hC20
`define VX_CSR_VTYPE 12'hC21
`define VX_CSR_VLENB 12'hC22
`define VX_CSR_VCYCLE 12'hC00
`define VX_CSR_VTIME 12'hC01
`define VX_CSR_VINSTRET 12'hC02
// GPGPU CSRs
`define VX_CSR_THREAD_ID 12'hCC0
@ -197,4 +215,10 @@
`define VX_CSR_NUM_CORES 12'hFC2
`define VX_CSR_LOCAL_MEM_BASE 12'hFC3
`define VX_MAT_MUL_SIZE 12'hFC4 // VX_MAT_MUL_SIZE = Matrix Size / TC Size
`define VX_TC_NUM 12'hFC5
`define VX_TC_SIZE 12'hFC6
`endif // VX_TYPES_VH

View file

@ -21,19 +21,19 @@ module Vortex import VX_gpu_pkg::*; (
input wire reset,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen,
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data,
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
output wire mem_req_valid [`VX_MEM_PORTS],
output wire mem_req_rw [`VX_MEM_PORTS],
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS],
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS],
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS],
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS],
input wire mem_req_ready [`VX_MEM_PORTS],
// Memory response
input wire mem_rsp_valid,
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data,
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
input wire mem_rsp_valid [`VX_MEM_PORTS],
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS],
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS],
output wire mem_rsp_ready [`VX_MEM_PORTS],
// DCR write request
input wire dcr_wr_valid,
@ -50,22 +50,25 @@ module Vortex import VX_gpu_pkg::*; (
`endif
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.lmem = 'x;
cache_perf_t l3_perf;
mem_perf_t mem_perf;
sysmem_perf_t sysmem_perf;
always @(*) begin
sysmem_perf = '0;
sysmem_perf.l3cache = l3_perf;
sysmem_perf.mem = mem_perf;
end
`endif
VX_mem_bus_if #(
.DATA_SIZE (`L2_LINE_SIZE),
.TAG_WIDTH (L2_MEM_TAG_WIDTH)
) per_cluster_mem_bus_if[`NUM_CLUSTERS]();
) per_cluster_mem_bus_if[`NUM_CLUSTERS * `L2_MEM_PORTS]();
VX_mem_bus_if #(
.DATA_SIZE (`L3_LINE_SIZE),
.TAG_WIDTH (L3_MEM_TAG_WIDTH)
) mem_bus_if();
) mem_bus_if[`L3_MEM_PORTS]();
`RESET_RELAY (l3_reset, reset);
@ -77,6 +80,7 @@ module Vortex import VX_gpu_pkg::*; (
.NUM_WAYS (`L3_NUM_WAYS),
.WORD_SIZE (L3_WORD_SIZE),
.NUM_REQS (L3_NUM_REQS),
.MEM_PORTS (`L3_MEM_PORTS),
.CRSQ_SIZE (`L3_CRSQ_SIZE),
.MSHR_SIZE (`L3_MSHR_SIZE),
.MRSQ_SIZE (`L3_MRSQ_SIZE),
@ -84,10 +88,12 @@ module Vortex import VX_gpu_pkg::*; (
.TAG_WIDTH (L2_MEM_TAG_WIDTH),
.WRITE_ENABLE (1),
.WRITEBACK (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_WRITEBACK),
.DIRTY_BYTES (`L3_DIRTYBYTES),
.REPL_POLICY (`L3_REPL_POLICY),
.UUID_WIDTH (`UUID_WIDTH),
.CORE_OUT_BUF (2),
.MEM_OUT_BUF (2),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.CORE_OUT_BUF (3),
.MEM_OUT_BUF (3),
.NC_ENABLE (1),
.PASSTHRU (!`L3_ENABLED)
) l3cache (
@ -95,31 +101,28 @@ module Vortex import VX_gpu_pkg::*; (
.reset (l3_reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_if.l3cache),
.cache_perf (l3_perf),
`endif
.core_bus_if (per_cluster_mem_bus_if),
.mem_bus_if (mem_bus_if)
);
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen= mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
for (genvar i = 0; i < `L3_MEM_PORTS; ++i) begin : g_mem_bus_if
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen;
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
`UNUSED_VAR (mem_bus_if[i].req_data.flags)
assign mem_bus_if[i].req_ready = mem_req_ready[i];
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
end
VX_dcr_bus_if dcr_bus_if();
assign dcr_bus_if.write_valid = dcr_wr_valid;
@ -129,16 +132,16 @@ module Vortex import VX_gpu_pkg::*; (
wire [`NUM_CLUSTERS-1:0] per_cluster_busy;
// Generate all clusters
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : clusters
for (genvar cluster_id = 0; cluster_id < `NUM_CLUSTERS; ++cluster_id) begin : g_clusters
`RESET_RELAY (cluster_reset, reset);
VX_dcr_bus_if cluster_dcr_bus_if();
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, (`NUM_CLUSTERS > 1));
`BUFFER_DCR_BUS_IF (cluster_dcr_bus_if, dcr_bus_if, 1'b1, (`NUM_CLUSTERS > 1))
VX_cluster #(
.CLUSTER_ID (cluster_id),
.INSTANCE_ID ($sformatf("cluster%0d", cluster_id))
.INSTANCE_ID (`SFORMATF(("cluster%0d", cluster_id)))
) cluster (
`SCOPE_IO_BIND (scope_cluster + cluster_id)
@ -146,59 +149,83 @@ module Vortex import VX_gpu_pkg::*; (
.reset (cluster_reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.sysmem_perf (sysmem_perf),
`endif
.dcr_bus_if (cluster_dcr_bus_if),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id]),
.mem_bus_if (per_cluster_mem_bus_if[cluster_id * `L2_MEM_PORTS +: `L2_MEM_PORTS]),
.busy (per_cluster_busy[cluster_id])
);
end
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, (`NUM_CLUSTERS > 1));
`BUFFER_EX(busy, (| per_cluster_busy), 1'b1, 1, (`NUM_CLUSTERS > 1));
`ifdef PERF_ENABLE
localparam MEM_PORTS_CTR_W = `CLOG2(`VX_MEM_PORTS+1);
wire [`VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire;
wire [`VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire;
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_perf_ctrs
assign mem_req_fire[i] = mem_req_valid[i] & mem_req_ready[i];
assign mem_rsp_fire[i] = mem_rsp_valid[i] & mem_rsp_ready[i];
assign mem_rd_req_fire[i] = mem_req_fire[i] & ~mem_req_rw[i];
assign mem_wr_req_fire[i] = mem_req_fire[i] & mem_req_rw[i];
end
wire [MEM_PORTS_CTR_W-1:0] perf_mem_reads_per_cycle;
wire [MEM_PORTS_CTR_W-1:0] perf_mem_writes_per_cycle;
wire [MEM_PORTS_CTR_W-1:0] perf_mem_rsps_per_cycle;
`POP_COUNT(perf_mem_reads_per_cycle, mem_rd_req_fire);
`POP_COUNT(perf_mem_writes_per_cycle, mem_wr_req_fire);
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
mem_perf_t mem_perf;
always @(posedge clk) begin
if (reset) begin
perf_mem_pending_reads <= '0;
end else begin
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
`PERF_CTR_BITS'($signed(2'(mem_req_fire && ~mem_bus_if.req_data.rw) - 2'(mem_rsp_fire)));
`PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle)));
end
end
wire mem_rd_req_fire = mem_req_fire && ~mem_bus_if.req_data.rw;
wire mem_wr_req_fire = mem_req_fire && mem_bus_if.req_data.rw;
always @(posedge clk) begin
if (reset) begin
mem_perf <= '0;
end else begin
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(mem_rd_req_fire);
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(mem_wr_req_fire);
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(perf_mem_reads_per_cycle);
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(perf_mem_writes_per_cycle);
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
end
end
assign mem_perf_if.mem = mem_perf;
`endif
// dump device configuration
initial begin
`TRACE(0, ("CONFIGS: num_threads=%0d, num_warps=%0d, num_cores=%0d, num_clusters=%0d, socket_size=%0d, local_mem_base=0x%0h, num_barriers=%0d\n",
`NUM_THREADS, `NUM_WARPS, `NUM_CORES, `NUM_CLUSTERS, `SOCKET_SIZE, `LMEM_BASE_ADDR, `NUM_BARRIERS))
end
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_req_rw)
`TRACE(1, ("%d: MEM Wr Req: addr=0x%0h, tag=0x%0h, byteen=0x%h data=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen, mem_req_data));
else
`TRACE(1, ("%d: MEM Rd Req: addr=0x%0h, tag=0x%0h, byteen=0x%h\n", $time, `TO_FULL_ADDR(mem_req_addr), mem_req_tag, mem_req_byteen));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: MEM Rd Rsp: tag=0x%0h, data=0x%h\n", $time, mem_rsp_tag, mem_rsp_data));
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_trace
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: MEM Wr Req[%0d]: addr=0x%0h, byteen=0x%h data=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: MEM Rd Req[%0d]: addr=0x%0h, byteen=0x%h, tag=0x%0h (#%0d)\n", $time, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: MEM Rd Rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n", $time, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
end
end
`endif

View file

@ -82,112 +82,26 @@ module Vortex_axi import VX_gpu_pkg::*; #(
// Status
output wire busy
);
`STATIC_ASSERT((AXI_DATA_WIDTH == `VX_MEM_DATA_WIDTH), ("invalid memory data size: current=%0d, expected=%0d", AXI_DATA_WIDTH, `VX_MEM_DATA_WIDTH))
`STATIC_ASSERT((AXI_ADDR_WIDTH >= `MEM_ADDR_WIDTH), ("invalid memory address size: current=%0d, expected=%0d", AXI_ADDR_WIDTH, `VX_MEM_ADDR_WIDTH))
//`STATIC_ASSERT((AXI_TID_WIDTH >= `VX_MEM_TAG_WIDTH), ("invalid memory tag size: current=%0d, expected=%0d", AXI_TID_WIDTH, `VX_MEM_TAG_WIDTH))
localparam DST_LDATAW = `CLOG2(AXI_DATA_WIDTH);
localparam SRC_LDATAW = `CLOG2(`VX_MEM_DATA_WIDTH);
localparam SUB_LDATAW = DST_LDATAW - SRC_LDATAW;
localparam VX_MEM_TAG_A_WIDTH = `VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0);
localparam VX_MEM_ADDR_A_WIDTH = `VX_MEM_ADDR_WIDTH - SUB_LDATAW;
wire mem_req_valid;
wire mem_req_rw;
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen;
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_req_valid [`VX_MEM_PORTS];
wire mem_req_rw [`VX_MEM_PORTS];
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS];
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS];
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS];
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS];
wire mem_req_ready [`VX_MEM_PORTS];
wire mem_rsp_valid;
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready;
wire mem_rsp_valid [`VX_MEM_PORTS];
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS];
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS];
wire mem_rsp_ready [`VX_MEM_PORTS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_awaddr_unqual [AXI_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_araddr_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_awid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_arid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_bid_unqual [AXI_NUM_BANKS];
wire [`VX_MEM_TAG_WIDTH-1:0] m_axi_rid_unqual [AXI_NUM_BANKS];
for (genvar i = 0; i < AXI_NUM_BANKS; ++i) begin
assign m_axi_awaddr[i] = `MEM_ADDR_WIDTH'(m_axi_awaddr_unqual[i]);
assign m_axi_araddr[i] = `MEM_ADDR_WIDTH'(m_axi_araddr_unqual[i]);
assign m_axi_awid[i] = AXI_TID_WIDTH'(m_axi_awid_unqual[i]);
assign m_axi_arid[i] = AXI_TID_WIDTH'(m_axi_arid_unqual[i]);
assign m_axi_rid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_rid[i]);
assign m_axi_bid_unqual[i] = `VX_MEM_TAG_WIDTH'(m_axi_bid[i]);
end
VX_axi_adapter #(
.DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.ADDR_WIDTH (`MEM_ADDR_WIDTH),
.TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.NUM_BANKS (AXI_NUM_BANKS),
.RSP_OUT_BUF((AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid),
.mem_req_rw (mem_req_rw),
.mem_req_byteen (mem_req_byteen),
.mem_req_addr (mem_req_addr),
.mem_req_data (mem_req_data),
.mem_req_tag (mem_req_tag),
.mem_req_ready (mem_req_ready),
.mem_rsp_valid (mem_rsp_valid),
.mem_rsp_data (mem_rsp_data),
.mem_rsp_tag (mem_rsp_tag),
.mem_rsp_ready (mem_rsp_ready),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr_unqual),
.m_axi_awid (m_axi_awid_unqual),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid_unqual),
.m_axi_bresp (m_axi_bresp),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr_unqual),
.m_axi_arid (m_axi_arid_unqual),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arregion (m_axi_arregion),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast) ,
.m_axi_rid (m_axi_rid_unqual),
.m_axi_rresp (m_axi_rresp)
);
`SCOPE_IO_SWITCH (1)
`SCOPE_IO_SWITCH (1);
Vortex vortex (
`SCOPE_IO_BIND (0)
@ -215,4 +129,133 @@ module Vortex_axi import VX_gpu_pkg::*; #(
.busy (busy)
);
wire mem_req_valid_a [`VX_MEM_PORTS];
wire mem_req_rw_a [`VX_MEM_PORTS];
wire [(AXI_DATA_WIDTH/8)-1:0] mem_req_byteen_a [`VX_MEM_PORTS];
wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a [`VX_MEM_PORTS];
wire [AXI_DATA_WIDTH-1:0] mem_req_data_a [`VX_MEM_PORTS];
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a [`VX_MEM_PORTS];
wire mem_req_ready_a [`VX_MEM_PORTS];
wire mem_rsp_valid_a [`VX_MEM_PORTS];
wire [AXI_DATA_WIDTH-1:0] mem_rsp_data_a [`VX_MEM_PORTS];
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a [`VX_MEM_PORTS];
wire mem_rsp_ready_a [`VX_MEM_PORTS];
// Adjust memory data width to match AXI interface
for (genvar i = 0; i < `VX_MEM_PORTS; i++) begin : g_mem_adapter
VX_mem_data_adapter #(
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
.DST_DATA_WIDTH (AXI_DATA_WIDTH),
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
.DST_ADDR_WIDTH (VX_MEM_ADDR_A_WIDTH),
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
.DST_TAG_WIDTH (VX_MEM_TAG_A_WIDTH),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) mem_data_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid_in (mem_req_valid[i]),
.mem_req_addr_in (mem_req_addr[i]),
.mem_req_rw_in (mem_req_rw[i]),
.mem_req_byteen_in (mem_req_byteen[i]),
.mem_req_data_in (mem_req_data[i]),
.mem_req_tag_in (mem_req_tag[i]),
.mem_req_ready_in (mem_req_ready[i]),
.mem_rsp_valid_in (mem_rsp_valid[i]),
.mem_rsp_data_in (mem_rsp_data[i]),
.mem_rsp_tag_in (mem_rsp_tag[i]),
.mem_rsp_ready_in (mem_rsp_ready[i]),
.mem_req_valid_out (mem_req_valid_a[i]),
.mem_req_addr_out (mem_req_addr_a[i]),
.mem_req_rw_out (mem_req_rw_a[i]),
.mem_req_byteen_out (mem_req_byteen_a[i]),
.mem_req_data_out (mem_req_data_a[i]),
.mem_req_tag_out (mem_req_tag_a[i]),
.mem_req_ready_out (mem_req_ready_a[i]),
.mem_rsp_valid_out (mem_rsp_valid_a[i]),
.mem_rsp_data_out (mem_rsp_data_a[i]),
.mem_rsp_tag_out (mem_rsp_tag_a[i]),
.mem_rsp_ready_out (mem_rsp_ready_a[i])
);
end
VX_axi_adapter #(
.DATA_WIDTH (AXI_DATA_WIDTH),
.ADDR_WIDTH_IN (VX_MEM_ADDR_A_WIDTH),
.ADDR_WIDTH_OUT (AXI_ADDR_WIDTH),
.TAG_WIDTH_IN (VX_MEM_TAG_A_WIDTH),
.TAG_WIDTH_OUT (AXI_TID_WIDTH),
.NUM_PORTS_IN (`VX_MEM_PORTS),
.NUM_BANKS_OUT (AXI_NUM_BANKS),
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
) axi_adapter (
.clk (clk),
.reset (reset),
.mem_req_valid (mem_req_valid_a),
.mem_req_rw (mem_req_rw_a),
.mem_req_byteen (mem_req_byteen_a),
.mem_req_addr (mem_req_addr_a),
.mem_req_data (mem_req_data_a),
.mem_req_tag (mem_req_tag_a),
.mem_req_ready (mem_req_ready_a),
.mem_rsp_valid (mem_rsp_valid_a),
.mem_rsp_data (mem_rsp_data_a),
.mem_rsp_tag (mem_rsp_tag_a),
.mem_rsp_ready (mem_rsp_ready_a),
.m_axi_awvalid (m_axi_awvalid),
.m_axi_awready (m_axi_awready),
.m_axi_awaddr (m_axi_awaddr),
.m_axi_awid (m_axi_awid),
.m_axi_awlen (m_axi_awlen),
.m_axi_awsize (m_axi_awsize),
.m_axi_awburst (m_axi_awburst),
.m_axi_awlock (m_axi_awlock),
.m_axi_awcache (m_axi_awcache),
.m_axi_awprot (m_axi_awprot),
.m_axi_awqos (m_axi_awqos),
.m_axi_awregion (m_axi_awregion),
.m_axi_wvalid (m_axi_wvalid),
.m_axi_wready (m_axi_wready),
.m_axi_wdata (m_axi_wdata),
.m_axi_wstrb (m_axi_wstrb),
.m_axi_wlast (m_axi_wlast),
.m_axi_bvalid (m_axi_bvalid),
.m_axi_bready (m_axi_bready),
.m_axi_bid (m_axi_bid),
.m_axi_bresp (m_axi_bresp),
.m_axi_arvalid (m_axi_arvalid),
.m_axi_arready (m_axi_arready),
.m_axi_araddr (m_axi_araddr),
.m_axi_arid (m_axi_arid),
.m_axi_arlen (m_axi_arlen),
.m_axi_arsize (m_axi_arsize),
.m_axi_arburst (m_axi_arburst),
.m_axi_arlock (m_axi_arlock),
.m_axi_arcache (m_axi_arcache),
.m_axi_arprot (m_axi_arprot),
.m_axi_arqos (m_axi_arqos),
.m_axi_arregion (m_axi_arregion),
.m_axi_rvalid (m_axi_rvalid),
.m_axi_rready (m_axi_rready),
.m_axi_rdata (m_axi_rdata),
.m_axi_rlast (m_axi_rlast),
.m_axi_rid (m_axi_rid),
.m_axi_rresp (m_axi_rresp)
);
endmodule

View file

@ -28,9 +28,19 @@
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//`include "platform_afu_top_config.vh"
`include "VX_define.vh"
`ifdef PLATFORM_PROVIDES_LOCAL_MEMORY
// Fallback values for the PLATFORM_PARAM_LOCAL_MEMORY_* macros. Each one is
// defined here only if it has not already been defined, and is derived from
// the PLATFORM_MEMORY_* macros.

// Address width: the full platform memory address width minus the bits needed
// to select a bank and minus log2 of the data size.
// NOTE(review): presumably PLATFORM_MEMORY_DATA_SIZE is in bytes, making this
// a per-bank, data-word-granular address width - confirm.
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH ((`PLATFORM_MEMORY_ADDR_WIDTH - $clog2(`PLATFORM_MEMORY_NUM_BANKS)) - $clog2(`PLATFORM_MEMORY_DATA_SIZE))
`endif
// Data width: PLATFORM_MEMORY_DATA_SIZE multiplied by 8.
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_DATA_WIDTH (`PLATFORM_MEMORY_DATA_SIZE * 8)
`endif
// Burst counter width: fixed default of 4 when not supplied.
`ifndef PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH
`define PLATFORM_PARAM_LOCAL_MEMORY_BURST_CNT_WIDTH 4
`endif
package local_mem_cfg_pkg;
@ -57,5 +67,3 @@ package local_mem_cfg_pkg;
typedef logic [LOCAL_MEM_DATA_N_BYTES-1:0] t_local_mem_byte_mask;
endpackage // local_mem_cfg_pkg
`endif // PLATFORM_PROVIDES_LOCAL_MEMORY

File diff suppressed because it is too large Load diff

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -17,9 +17,9 @@
`define AFU_ACCEL_NAME "vortex_afu"
`define AFU_ACCEL_UUID 128'h35F9452B_25C2_434C_93D5_6F8C60DB361C
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_READ 1
`define AFU_IMAGE_CMD_MEM_WRITE 2
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_CMD_RUN 3
`define AFU_IMAGE_CMD_DCR_WRITE 4
`define AFU_IMAGE_CMD_MAX_VALUE 4

View file

@ -14,22 +14,20 @@
`include "vortex_afu.vh"
module VX_afu_ctrl #(
parameter AXI_ADDR_WIDTH = 8,
parameter AXI_DATA_WIDTH = 32,
parameter AXI_NUM_BANKS = 1
parameter S_AXI_ADDR_WIDTH = 8,
parameter S_AXI_DATA_WIDTH = 32
) (
// axi4 lite slave signals
input wire clk,
input wire reset,
input wire clk_en,
input wire s_axi_awvalid,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_awaddr,
output wire s_axi_awready,
input wire s_axi_wvalid,
input wire [AXI_DATA_WIDTH-1:0] s_axi_wdata,
input wire [AXI_DATA_WIDTH/8-1:0] s_axi_wstrb,
input wire [S_AXI_DATA_WIDTH-1:0] s_axi_wdata,
input wire [S_AXI_DATA_WIDTH/8-1:0]s_axi_wstrb,
output wire s_axi_wready,
output wire s_axi_bvalid,
@ -37,11 +35,11 @@ module VX_afu_ctrl #(
input wire s_axi_bready,
input wire s_axi_arvalid,
input wire [AXI_ADDR_WIDTH-1:0] s_axi_araddr,
input wire [S_AXI_ADDR_WIDTH-1:0] s_axi_araddr,
output wire s_axi_arready,
output wire s_axi_rvalid,
output wire [AXI_DATA_WIDTH-1:0] s_axi_rdata,
output wire [S_AXI_DATA_WIDTH-1:0] s_axi_rdata,
output wire [1:0] s_axi_rresp,
input wire s_axi_rready,
@ -52,13 +50,13 @@ module VX_afu_ctrl #(
input wire ap_idle,
output wire interrupt,
output wire ap_ctrl_read,
`ifdef SCOPE
input wire scope_bus_in,
output wire scope_bus_out,
`endif
output wire [63:0] mem_base [AXI_NUM_BANKS],
output wire dcr_wr_valid,
output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
output wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
@ -110,39 +108,38 @@ module VX_afu_ctrl #(
ADDR_DEV_0 = 8'h10,
ADDR_DEV_1 = 8'h14,
//ADDR_DEV_CTRL = 8'h18,
ADDR_ISA_0 = 8'h1C,
ADDR_ISA_1 = 8'h20,
//ADDR_ISA_CTRL = 8'h24,
ADDR_ISA_0 = 8'h18,
ADDR_ISA_1 = 8'h1C,
ADDR_DCR_0 = 8'h28,
ADDR_DCR_1 = 8'h2C,
//ADDR_DCR_CTRL = 8'h30,
ADDR_DCR_0 = 8'h20,
ADDR_DCR_1 = 8'h24,
`ifdef SCOPE
ADDR_SCP_0 = 8'h34,
ADDR_SCP_1 = 8'h38,
//ADDR_SCP_CTRL = 8'h3C,
ADDR_SCP_0 = 8'h28,
ADDR_SCP_1 = 8'h2C,
`endif
ADDR_MEM_0 = 8'h40,
ADDR_MEM_1 = 8'h44,
//ADDR_MEM_CTRL = 8'h48,
ADDR_BITS = 8;
localparam
WSTATE_IDLE = 2'd0,
WSTATE_ADDR = 2'd0,
WSTATE_DATA = 2'd1,
WSTATE_RESP = 2'd2;
WSTATE_RESP = 2'd2,
WSTATE_WIDTH = 2;
localparam
RSTATE_IDLE = 2'd0,
RSTATE_DATA = 2'd1;
RSTATE_ADDR = 2'd0,
RSTATE_DATA = 2'd1,
RSTATE_RESP = 2'd2,
RSTATE_WIDTH = 2;
localparam MEMORY_BANK_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH - `CLOG2(`PLATFORM_MEMORY_NUM_BANKS);
// device caps
wire [63:0] dev_caps = {16'b0,
wire [63:0] dev_caps = {8'b0,
5'(MEMORY_BANK_ADDR_WIDTH-20),
3'(`CLOG2(`PLATFORM_MEMORY_NUM_BANKS)),
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
16'(`NUM_CORES * `NUM_CLUSTERS),
8'(`NUM_WARPS),
@ -153,16 +150,18 @@ module VX_afu_ctrl #(
2'(`CLOG2(`XLEN)-4),
30'(`MISA_STD)};
reg [1:0] wstate;
reg [WSTATE_WIDTH-1:0] wstate;
reg [ADDR_BITS-1:0] waddr;
wire [31:0] wmask;
wire s_axi_aw_fire;
wire s_axi_w_fire;
wire s_axi_b_fire;
reg [1:0] rstate;
logic [RSTATE_WIDTH-1:0] rstate;
reg [31:0] rdata;
wire [ADDR_BITS-1:0] raddr;
reg [ADDR_BITS-1:0] raddr;
wire s_axi_ar_fire;
wire s_axi_r_fire;
reg ap_reset_r;
reg ap_start_r;
@ -170,20 +169,23 @@ module VX_afu_ctrl #(
reg gie_r;
reg [1:0] ier_r;
reg [1:0] isr_r;
reg [63:0] mem_r [AXI_NUM_BANKS];
reg [31:0] dcra_r;
reg [31:0] dcrv_r;
reg dcr_wr_valid_r;
logic wready_stall;
logic rvalid_stall;
`ifdef SCOPE
reg [63:0] scope_bus_wdata;
reg [63:0] scope_bus_rdata;
reg [63:0] scope_bus_wdata, scope_bus_rdata;
reg [5:0] scope_bus_ctr;
reg cmd_scope_reading;
reg cmd_scope_writing;
reg cmd_scope_writing, cmd_scope_reading;
reg scope_bus_out_r;
reg scope_rdata_valid;
reg is_scope_waddr, is_scope_raddr;
always @(posedge clk) begin
if (reset) begin
@ -191,18 +193,33 @@ module VX_afu_ctrl #(
cmd_scope_writing <= 0;
scope_bus_ctr <= '0;
scope_bus_out_r <= 0;
end else if (clk_en) begin
is_scope_waddr <= 0;
is_scope_raddr <= 0;
scope_bus_rdata <= '0;
scope_rdata_valid <= 0;
end else begin
scope_bus_out_r <= 0;
if (s_axi_aw_fire) begin
is_scope_waddr <= (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|| (s_axi_awaddr[ADDR_BITS-1:0] == ADDR_SCP_1);
end
if (s_axi_ar_fire) begin
is_scope_raddr <= (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_0)
|| (s_axi_araddr[ADDR_BITS-1:0] == ADDR_SCP_1);
end
if (s_axi_w_fire && waddr == ADDR_SCP_0) begin
scope_bus_wdata[31:0] <= (s_axi_wdata & wmask) | (scope_bus_wdata[31:0] & ~wmask);
end
if (s_axi_w_fire && waddr == ADDR_SCP_1) begin
scope_bus_wdata[63:32] <= (s_axi_wdata & wmask) | (scope_bus_wdata[63:32] & ~wmask);
cmd_scope_writing <= 1;
scope_rdata_valid <= 0;
scope_bus_out_r <= 1;
scope_bus_ctr <= 63;
end
if (scope_bus_in) begin
cmd_scope_reading <= 1;
scope_bus_rdata <= '0;
scope_bus_ctr <= 63;
end
if (cmd_scope_reading) begin
@ -210,13 +227,16 @@ module VX_afu_ctrl #(
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_reading <= 0;
scope_rdata_valid <= 1;
scope_bus_ctr <= 0;
end
end
if (cmd_scope_writing) begin
scope_bus_out_r <= 1'(scope_bus_wdata >> scope_bus_ctr);
scope_bus_out_r <= scope_bus_wdata[scope_bus_ctr];
scope_bus_ctr <= scope_bus_ctr - 1;
if (scope_bus_ctr == 0) begin
cmd_scope_writing <= 0;
scope_bus_ctr <= 0;
end
end
end
@ -224,41 +244,50 @@ module VX_afu_ctrl #(
assign scope_bus_out = scope_bus_out_r;
assign wready_stall = is_scope_waddr && cmd_scope_writing;
assign rvalid_stall = is_scope_raddr && ~scope_rdata_valid;
`else
assign wready_stall = 0;
assign rvalid_stall = 0;
`endif
// AXI Write
// AXI Write Request
assign s_axi_awready = (wstate == WSTATE_ADDR);
assign s_axi_wready = (wstate == WSTATE_DATA) && ~wready_stall;
assign s_axi_awready = (wstate == WSTATE_IDLE);
assign s_axi_wready = (wstate == WSTATE_DATA);
// AXI Write Response
assign s_axi_bvalid = (wstate == WSTATE_RESP);
assign s_axi_bresp = 2'b00; // OKAY
assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
assign s_axi_w_fire = s_axi_wvalid && s_axi_wready;
for (genvar i = 0; i < 4; ++i) begin
for (genvar i = 0; i < 4; ++i) begin : g_wmask
assign wmask[8 * i +: 8] = {8{s_axi_wstrb[i]}};
end
assign s_axi_aw_fire = s_axi_awvalid && s_axi_awready;
assign s_axi_w_fire = s_axi_wvalid && s_axi_wready;
assign s_axi_b_fire = s_axi_bvalid && s_axi_bready;
// wstate
always @(posedge clk) begin
if (reset) begin
wstate <= WSTATE_IDLE;
end else if (clk_en) begin
wstate <= WSTATE_ADDR;
end else begin
case (wstate)
WSTATE_IDLE: wstate <= s_axi_awvalid ? WSTATE_DATA : WSTATE_IDLE;
WSTATE_DATA: wstate <= s_axi_wvalid ? WSTATE_RESP : WSTATE_DATA;
WSTATE_RESP: wstate <= s_axi_bready ? WSTATE_IDLE : WSTATE_RESP;
default: wstate <= WSTATE_IDLE;
WSTATE_ADDR: wstate <= s_axi_aw_fire ? WSTATE_DATA : WSTATE_ADDR;
WSTATE_DATA: wstate <= s_axi_w_fire ? WSTATE_RESP : WSTATE_DATA;
WSTATE_RESP: wstate <= s_axi_b_fire ? WSTATE_ADDR : WSTATE_RESP;
default: wstate <= WSTATE_ADDR;
endcase
end
end
// waddr
always @(posedge clk) begin
if (clk_en) begin
if (s_axi_aw_fire)
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
if (s_axi_aw_fire) begin
waddr <= s_axi_awaddr[ADDR_BITS-1:0];
end
end
@ -276,16 +305,13 @@ module VX_afu_ctrl #(
dcra_r <= '0;
dcrv_r <= '0;
dcr_wr_valid_r <= 0;
end else begin
dcr_wr_valid_r <= 0;
ap_reset_r <= 0;
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
mem_r[i] <= '0;
end
end else if (clk_en) begin
if (ap_ready)
ap_start_r <= auto_restart_r;
dcr_wr_valid_r <= 0;
if (s_axi_w_fire) begin
case (waddr)
ADDR_AP_CTRL: begin
@ -317,16 +343,7 @@ module VX_afu_ctrl #(
dcrv_r <= (s_axi_wdata & wmask) | (dcrv_r & ~wmask);
dcr_wr_valid_r <= 1;
end
default: begin
for (integer i = 0; i < AXI_NUM_BANKS; ++i) begin
if (waddr == (ADDR_MEM_0 + 8'(i) * 8'd12)) begin
mem_r[i][31:0] <= (s_axi_wdata & wmask) | (mem_r[i][31:0] & ~wmask);
end
if (waddr == (ADDR_MEM_1 + 8'(i) * 8'd12)) begin
mem_r[i][63:32] <= (s_axi_wdata & wmask) | (mem_r[i][63:32] & ~wmask);
end
end
end
default:;
endcase
if (ier_r[0] & ap_done)
@ -337,82 +354,87 @@ module VX_afu_ctrl #(
end
end
// AXI Read
// AXI Read Request
assign s_axi_arready = (rstate == RSTATE_ADDR);
assign s_axi_arready = (rstate == RSTATE_IDLE);
assign s_axi_rvalid = (rstate == RSTATE_DATA);
// AXI Read Response
assign s_axi_rvalid = (rstate == RSTATE_RESP);
assign s_axi_rdata = rdata;
assign s_axi_rresp = 2'b00; // OKAY
assign s_axi_ar_fire = s_axi_arvalid && s_axi_arready;
assign raddr = s_axi_araddr[ADDR_BITS-1:0];
assign s_axi_r_fire = s_axi_rvalid && s_axi_rready;
// rstate
always @(posedge clk) begin
if (reset) begin
rstate <= RSTATE_IDLE;
end else if (clk_en) begin
rstate <= RSTATE_ADDR;
end else begin
case (rstate)
RSTATE_IDLE: rstate <= s_axi_arvalid ? RSTATE_DATA : RSTATE_IDLE;
RSTATE_DATA: rstate <= (s_axi_rready & s_axi_rvalid) ? RSTATE_IDLE : RSTATE_DATA;
default: rstate <= RSTATE_IDLE;
RSTATE_ADDR: rstate <= s_axi_ar_fire ? RSTATE_DATA : RSTATE_ADDR;
RSTATE_DATA: rstate <= rvalid_stall ? RSTATE_DATA : RSTATE_RESP;
RSTATE_RESP: rstate <= s_axi_r_fire ? RSTATE_ADDR : RSTATE_RESP;
default: rstate <= RSTATE_ADDR;
endcase
end
end
// raddr
always @(posedge clk) begin
if (s_axi_ar_fire) begin
raddr <= s_axi_araddr[ADDR_BITS-1:0];
end
end
// rdata
always @(posedge clk) begin
if (clk_en) begin
if (s_axi_ar_fire) begin
rdata <= '0;
case (raddr)
ADDR_AP_CTRL: begin
rdata[0] <= ap_start_r;
rdata[1] <= ap_done;
rdata[2] <= ap_idle;
rdata[3] <= ap_ready;
rdata[7] <= auto_restart_r;
end
ADDR_GIE: begin
rdata <= 32'(gie_r);
end
ADDR_IER: begin
rdata <= 32'(ier_r);
end
ADDR_ISR: begin
rdata <= 32'(isr_r);
end
ADDR_DEV_0: begin
rdata <= dev_caps[31:0];
end
ADDR_DEV_1: begin
rdata <= dev_caps[63:32];
end
ADDR_ISA_0: begin
rdata <= isa_caps[31:0];
end
ADDR_ISA_1: begin
rdata <= isa_caps[63:32];
end
`ifdef SCOPE
ADDR_SCP_0: begin
rdata <= scope_bus_rdata[31:0];
end
ADDR_SCP_1: begin
rdata <= scope_bus_rdata[63:32];
end
`endif
default:;
endcase
rdata <= '0;
case (raddr)
ADDR_AP_CTRL: begin
rdata[0] <= ap_start_r;
rdata[1] <= ap_done;
rdata[2] <= ap_idle;
rdata[3] <= ap_ready;
rdata[7] <= auto_restart_r;
end
end
ADDR_GIE: begin
rdata <= 32'(gie_r);
end
ADDR_IER: begin
rdata <= 32'(ier_r);
end
ADDR_ISR: begin
rdata <= 32'(isr_r);
end
ADDR_DEV_0: begin
rdata <= dev_caps[31:0];
end
ADDR_DEV_1: begin
rdata <= dev_caps[63:32];
end
ADDR_ISA_0: begin
rdata <= isa_caps[31:0];
end
ADDR_ISA_1: begin
rdata <= isa_caps[63:32];
end
`ifdef SCOPE
ADDR_SCP_0: begin
rdata <= scope_bus_rdata[31:0];
end
ADDR_SCP_1: begin
rdata <= scope_bus_rdata[63:32];
end
`endif
default:;
endcase
end
assign ap_reset = ap_reset_r;
assign ap_start = ap_start_r;
assign interrupt = gie_r & (| isr_r);
assign mem_base = mem_r;
assign ap_ctrl_read = s_axi_r_fire && (raddr == ADDR_AP_CTRL);
assign dcr_wr_valid = dcr_wr_valid_r;
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);

View file

@ -10,68 +10,93 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Reference: https://www.xilinx.com/developer/articles/porting-rtl-designs-to-vitis-rtl-kernels.html
`include "vortex_afu.vh"
module VX_afu_wrap #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = `PLATFORM_MEMORY_DATA_SIZE * 8,
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
parameter C_M_AXI_MEM_NUM_BANKS = 1
`else
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
`endif
) (
// System signals
input wire ap_clk,
input wire ap_rst_n,
input wire clk,
input wire reset,
// AXI4 master interface
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
input wire s_axi_ctrl_wvalid,
output wire s_axi_ctrl_wready,
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
input wire s_axi_ctrl_arvalid,
output wire s_axi_ctrl_arready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
output wire s_axi_ctrl_rvalid,
input wire s_axi_ctrl_rready,
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
output wire [1:0] s_axi_ctrl_rresp,
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
output wire interrupt
);
localparam C_M_AXI_MEM_NUM_BANKS = `M_AXI_MEM_NUM_BANKS;
localparam M_AXI_MEM_ADDR_WIDTH = `PLATFORM_MEMORY_ADDR_WIDTH;
localparam STATE_IDLE = 0;
localparam STATE_RUN = 1;
// States of the AFU wrapper's control state machine (2-bit encoding).
typedef enum logic [1:0] {
// Waiting for ap_start; ap_idle is asserted in this state.
STATE_IDLE = 0,
// vx_reset is held until vx_reset_ctr reaches 0, then the FSM waits for
// vx_busy to go high before moving to STATE_RUN.
STATE_INIT = 1,
// Processor is executing; the FSM leaves this state when vx_busy drops.
STATE_RUN = 2,
// Execution finished; the FSM waits for ap_done_ack (ap_done together with
// ap_ctrl_read) before returning to STATE_IDLE.
STATE_DONE = 3
} state_e;
localparam PENDING_WR_SIZEW = 12; // max outstanding requests size
localparam NUM_MEM_BANKS_SIZEW = `CLOG2(C_M_AXI_MEM_NUM_BANKS+1);
wire m_axi_mem_awvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_awready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_awid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_awlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_wdata_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH/8-1:0] m_axi_mem_wstrb_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_wlast_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_bready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_bid_a [C_M_AXI_MEM_NUM_BANKS];
wire [1:0] m_axi_mem_bresp_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_arready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_ID_WIDTH-1:0] m_axi_mem_arid_a [C_M_AXI_MEM_NUM_BANKS];
wire [7:0] m_axi_mem_arlen_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rvalid_a [C_M_AXI_MEM_NUM_BANKS];
wire m_axi_mem_rready_a [C_M_AXI_MEM_NUM_BANKS];
wire [C_M_AXI_MEM_DATA_WIDTH-1:0] m_axi_mem_rdata_a [C_M_AXI_MEM_NUM_BANKS];
@ -80,30 +105,31 @@ module VX_afu_wrap #(
wire [1:0] m_axi_mem_rresp_a [C_M_AXI_MEM_NUM_BANKS];
// convert memory interface to array
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
wire reset = ~ap_rst_n;
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
`endif
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
reg [15:0] vx_pending_writes;
reg vx_busy_wait;
reg vx_running;
reg [PENDING_WR_SIZEW-1:0] vx_pending_writes;
reg vx_reset = 1; // asserted at initialization
wire vx_busy;
wire [63:0] mem_base [C_M_AXI_MEM_NUM_BANKS];
wire dcr_wr_valid;
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
reg state;
state_e state;
wire ap_reset;
wire ap_start;
wire ap_idle = ~vx_running;
wire ap_done = ~(state == STATE_RUN || vx_pending_writes != 0);
wire ap_ready = 1'b1;
wire ap_ctrl_read;
wire ap_idle = (state == STATE_IDLE);
wire ap_done = (state == STATE_DONE) && (vx_pending_writes == '0);
wire ap_ready = ap_done;
wire ap_done_ack = ap_done && ap_ctrl_read;
`ifdef SCOPE
wire scope_bus_in;
@ -111,108 +137,129 @@ module VX_afu_wrap #(
wire scope_reset = reset;
`endif
always @(posedge ap_clk) begin
always @(posedge clk) begin
if (reset || ap_reset) begin
state <= STATE_IDLE;
vx_busy_wait <= 0;
vx_running <= 0;
state <= STATE_IDLE;
vx_reset <= 1;
end else begin
case (state)
STATE_IDLE: begin
if (ap_start) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: STATE RUN\n", $time));
`TRACE(2, ("%t: AFU: Begin initialization\n", $time))
`endif
state <= STATE_RUN;
vx_running <= 0;
state <= STATE_INIT;
vx_reset_ctr <= (`RESET_DELAY-1);
vx_reset <= 1;
end
end
STATE_INIT: begin
if (vx_reset) begin
// wait for reset to complete
if (vx_reset_ctr == 0) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Initialization completed\n", $time))
`endif
vx_reset <= 0;
end
end else begin
// wait until processor goes busy
if (vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Begin execution\n", $time))
`endif
state <= STATE_RUN;
end
end
end
STATE_RUN: begin
if (vx_running) begin
if (vx_busy_wait) begin
// wait until processor goes busy
if (vx_busy) begin
vx_busy_wait <= 0;
end
end else begin
// wait until the processor is not busy
if (~vx_busy) begin
state <= STATE_IDLE;
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: End execution\n", $time));
`TRACE(2, ("%d: STATE IDLE\n", $time));
`endif
end
end
end else begin
// wait until the reset sequence is complete
if (vx_reset_ctr == (`RESET_DELAY-1)) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: AFU: Begin execution\n", $time));
`endif
vx_running <= 1;
vx_busy_wait <= 1;
end
// wait until the processor is not busy
if (~vx_busy) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Execution completed\n", $time))
`endif
state <= STATE_DONE;
end
end
STATE_DONE: begin
// wait for host's done acknowledgement
if (ap_done_ack) begin
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%t: AFU: Processor idle\n", $time))
`endif
state <= STATE_IDLE;
end
end
endcase
// ensure reset network initialization
if (vx_reset_ctr != '0) begin
vx_reset_ctr <= vx_reset_ctr - 1;
end
end
end
reg m_axi_mem_wfire;
reg m_axi_mem_bfire;
wire [C_M_AXI_MEM_NUM_BANKS-1:0] m_axi_wr_req_fire, m_axi_wr_rsp_fire;
wire [NUM_MEM_BANKS_SIZEW-1:0] cur_wr_reqs, cur_wr_rsps;
always @(*) begin
m_axi_mem_wfire = 0;
m_axi_mem_bfire = 0;
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
m_axi_mem_wfire |= m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i];
m_axi_mem_bfire |= m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
end
// One VX_axi_write_ack instance per memory bank. Each instance observes that
// bank's AW and W channel valid/ready handshakes and drives bit i of
// m_axi_wr_req_fire from its tx_ack output; the aw_ack, w_ack and tx_rdy
// outputs are left unconnected.
// NOTE(review): presumably tx_ack pulses once per write request after both
// the AW and W handshakes have completed - confirm against VX_axi_write_ack,
// which is not visible here.
// The resulting bit vector is pop-counted and used to update
// vx_pending_writes.
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_req_fire
VX_axi_write_ack axi_write_ack (
.clk (clk),
.reset (reset),
.awvalid(m_axi_mem_awvalid_a[i]),
.awready(m_axi_mem_awready_a[i]),
.wvalid (m_axi_mem_wvalid_a[i]),
.wready (m_axi_mem_wready_a[i]),
.tx_ack (m_axi_wr_req_fire[i]),
`UNUSED_PIN (aw_ack),
`UNUSED_PIN (w_ack),
`UNUSED_PIN (tx_rdy)
);
end
always @(posedge ap_clk) begin
if (reset || ap_reset) begin
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_m_axi_wr_rsp_fire
assign m_axi_wr_rsp_fire[i] = m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i];
end
`POP_COUNT(cur_wr_reqs, m_axi_wr_req_fire);
`POP_COUNT(cur_wr_rsps, m_axi_wr_rsp_fire);
wire signed [NUM_MEM_BANKS_SIZEW:0] reqs_sub = (NUM_MEM_BANKS_SIZEW+1)'(cur_wr_reqs) -
(NUM_MEM_BANKS_SIZEW+1)'(cur_wr_rsps);
always @(posedge clk) begin
if (reset) begin
vx_pending_writes <= '0;
end else begin
if (m_axi_mem_wfire && ~m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes + 1;
if (~m_axi_mem_wfire && m_axi_mem_bfire)
vx_pending_writes <= vx_pending_writes - 1;
end
end
always @(posedge ap_clk) begin
if (state == STATE_RUN) begin
vx_reset_ctr <= vx_reset_ctr + 1;
end else begin
vx_reset_ctr <= '0;
vx_pending_writes <= vx_pending_writes + PENDING_WR_SIZEW'(reqs_sub);
end
end
VX_afu_ctrl #(
.AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
.AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
.S_AXI_ADDR_WIDTH (C_S_AXI_CTRL_ADDR_WIDTH),
.S_AXI_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH)
) afu_ctrl (
.clk (ap_clk),
.reset (reset || ap_reset),
.clk_en (1'b1),
.clk (clk),
.reset (reset),
.s_axi_awvalid (s_axi_ctrl_awvalid),
.s_axi_awready (s_axi_ctrl_awready),
.s_axi_awaddr (s_axi_ctrl_awaddr),
.s_axi_wvalid (s_axi_ctrl_wvalid),
.s_axi_wready (s_axi_ctrl_wready),
.s_axi_wdata (s_axi_ctrl_wdata),
.s_axi_wstrb (s_axi_ctrl_wstrb),
.s_axi_arvalid (s_axi_ctrl_arvalid),
.s_axi_arready (s_axi_ctrl_arready),
.s_axi_araddr (s_axi_ctrl_araddr),
.s_axi_rvalid (s_axi_ctrl_rvalid),
.s_axi_rready (s_axi_ctrl_rready),
.s_axi_rdata (s_axi_ctrl_rdata),
.s_axi_rresp (s_axi_ctrl_rresp),
.s_axi_bvalid (s_axi_ctrl_bvalid),
.s_axi_bready (s_axi_ctrl_bready),
.s_axi_bresp (s_axi_ctrl_bresp),
@ -224,42 +271,42 @@ module VX_afu_wrap #(
.ap_idle (ap_idle),
.interrupt (interrupt),
.ap_ctrl_read (ap_ctrl_read),
`ifdef SCOPE
.scope_bus_in (scope_bus_out),
.scope_bus_out (scope_bus_in),
`endif
.mem_base (mem_base),
.dcr_wr_valid (dcr_wr_valid),
.dcr_wr_addr (dcr_wr_addr),
.dcr_wr_data (dcr_wr_data)
);
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [`MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_w [C_M_AXI_MEM_NUM_BANKS];
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_awaddr_u [C_M_AXI_MEM_NUM_BANKS];
wire [M_AXI_MEM_ADDR_WIDTH-1:0] m_axi_mem_araddr_u [C_M_AXI_MEM_NUM_BANKS];
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_w[i]) + C_M_AXI_MEM_ADDR_WIDTH'(mem_base[i]);
for (genvar i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin : g_addressing
assign m_axi_mem_awaddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_awaddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
assign m_axi_mem_araddr_a[i] = C_M_AXI_MEM_ADDR_WIDTH'(m_axi_mem_araddr_u[i]) + C_M_AXI_MEM_ADDR_WIDTH'(`PLATFORM_MEMORY_OFFSET);
end
`SCOPE_IO_SWITCH (2)
`SCOPE_IO_SWITCH (2);
Vortex_axi #(
.AXI_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.AXI_ADDR_WIDTH (`MEM_ADDR_WIDTH),
.AXI_ADDR_WIDTH (M_AXI_MEM_ADDR_WIDTH),
.AXI_TID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.AXI_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) vortex_axi (
`SCOPE_IO_BIND (1)
.clk (ap_clk),
.reset (reset || ap_reset || ~vx_running),
.clk (clk),
.reset (vx_reset),
.m_axi_awvalid (m_axi_mem_awvalid_a),
.m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_w),
.m_axi_awaddr (m_axi_mem_awaddr_u),
.m_axi_awid (m_axi_mem_awid_a),
.m_axi_awlen (m_axi_mem_awlen_a),
`UNUSED_PIN (m_axi_awsize),
@ -283,7 +330,7 @@ module VX_afu_wrap #(
.m_axi_arvalid (m_axi_mem_arvalid_a),
.m_axi_arready (m_axi_mem_arready_a),
.m_axi_araddr (m_axi_mem_araddr_w),
.m_axi_araddr (m_axi_mem_araddr_u),
.m_axi_arid (m_axi_mem_arid_a),
.m_axi_arlen (m_axi_mem_arlen_a),
`UNUSED_PIN (m_axi_arsize),
@ -310,38 +357,79 @@ module VX_afu_wrap #(
// SCOPE //////////////////////////////////////////////////////////////////////
`ifdef SCOPE
`ifdef DBG_SCOPE_AFU
`define TRIGGERS { \
reset, \
ap_start, \
ap_done, \
ap_idle, \
interrupt, \
vx_busy_wait, \
vx_busy, \
vx_running \
}
`define PROBES { \
vx_pending_writes \
}
VX_scope_tap #(
.SCOPE_ID (0),
.TRIGGERW ($bits(`TRIGGERS)),
.PROBEW ($bits(`PROBES))
) scope_tap (
.clk (clk),
.reset (scope_reset_w[0]),
.start (1'b0),
.stop (1'b0),
.triggers (`TRIGGERS),
.probes (`PROBES),
.bus_in (scope_bus_in_w[0]),
.bus_out (scope_bus_out_w[0])
);
wire m_axi_mem_awfire_0 = m_axi_mem_awvalid_a[0] & m_axi_mem_awready_a[0];
wire m_axi_mem_arfire_0 = m_axi_mem_arvalid_a[0] & m_axi_mem_arready_a[0];
wire m_axi_mem_wfire_0 = m_axi_mem_wvalid_a[0] & m_axi_mem_wready_a[0];
wire m_axi_mem_bfire_0 = m_axi_mem_bvalid_a[0] & m_axi_mem_bready_a[0];
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP (0, 0, {
ap_reset,
ap_start,
ap_done,
ap_idle,
interrupt,
vx_reset,
vx_busy,
state,
m_axi_mem_awvalid_a[0],
m_axi_mem_awready_a[0],
m_axi_mem_wvalid_a[0],
m_axi_mem_wready_a[0],
m_axi_mem_bvalid_a[0],
m_axi_mem_bready_a[0],
m_axi_mem_arvalid_a[0],
m_axi_mem_arready_a[0],
m_axi_mem_rvalid_a[0],
m_axi_mem_rready_a[0]
}, {
dcr_wr_valid,
m_axi_mem_awfire_0,
m_axi_mem_arfire_0,
m_axi_mem_wfire_0,
m_axi_mem_bfire_0
}, {
dcr_wr_addr,
dcr_wr_data,
vx_pending_writes,
m_axi_mem_awaddr_u[0],
m_axi_mem_awid_a[0],
m_axi_mem_bid_a[0],
m_axi_mem_araddr_u[0],
m_axi_mem_arid_a[0],
m_axi_mem_rid_a[0]
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED_W(0)
`SCOPE_IO_UNUSED(0)
`endif
`endif
// Optional debug instrumentation: instantiated only when both CHIPSCOPE and
// DBG_SCOPE_AFU are defined.
// NOTE(review): ila_afu is presumably a vendor-generated logic-analyzer core
// whose probe widths must match these concatenations - its definition is not
// visible here, confirm before adding or removing signals.
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_AFU
ila_afu ila_afu_inst (
.clk (clk),
// probe0: host control handshake signals, FSM state and interrupt line
.probe0 ({
ap_reset,
ap_start,
ap_done,
ap_idle,
state,
interrupt
}),
// probe1: outstanding-write counter, processor busy/reset and DCR write bus
.probe1 ({
vx_pending_writes,
vx_busy,
vx_reset,
dcr_wr_valid,
dcr_wr_addr,
dcr_wr_data
})
);
`endif
`endif
`ifdef SIMULATION
@ -352,7 +440,7 @@ module VX_afu_wrap #(
initial begin
$assertoff(0, vortex_axi);
end
always @(posedge ap_clk) begin
always @(posedge clk) begin
if (reset) begin
assert_delay_ctr <= '0;
assert_enabled <= 0;
@ -371,19 +459,22 @@ module VX_afu_wrap #(
`endif
`ifdef DBG_TRACE_AFU
always @(posedge ap_clk) begin
always @(posedge clk) begin
for (integer i = 0; i < C_M_AXI_MEM_NUM_BANKS; ++i) begin
if (m_axi_mem_awvalid_a[i] && m_axi_mem_awready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]));
`TRACE(2, ("%t: AXI Wr Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_awaddr_a[i], m_axi_mem_awid_a[i]))
end
if (m_axi_mem_wvalid_a[i] && m_axi_mem_wready_a[i]) begin
`TRACE(2, ("%d: AFU Wr Req [%0d]: data=0x%h\n", $time, i, m_axi_mem_wdata_a[i]));
`TRACE(2, ("%t: AXI Wr Req [%0d]: strb=0x%h, data=0x%h\n", $time, i, m_axi_mem_wstrb_a[i], m_axi_mem_wdata_a[i]))
end
if (m_axi_mem_bvalid_a[i] && m_axi_mem_bready_a[i]) begin
`TRACE(2, ("%t: AXI Wr Rsp [%0d]: id=0x%0h\n", $time, i, m_axi_mem_bid_a[i]))
end
if (m_axi_mem_arvalid_a[i] && m_axi_mem_arready_a[i]) begin
`TRACE(2, ("%d: AFU Rd Req [%0d]: addr=0x%0h, tag=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]));
`TRACE(2, ("%t: AXI Rd Req [%0d]: addr=0x%0h, id=0x%0h\n", $time, i, m_axi_mem_araddr_a[i], m_axi_mem_arid_a[i]))
end
if (m_axi_mem_rvalid_a[i] && m_axi_mem_rready_a[i]) begin
`TRACE(2, ("%d: AVS Rd Rsp [%0d]: data=0x%h, tag=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]));
`TRACE(2, ("%t: AXI Rd Rsp [%0d]: data=0x%h, id=0x%0h\n", $time, i, m_axi_mem_rdata_a[i], m_axi_mem_rid_a[i]))
end
end
end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -16,37 +16,50 @@
module vortex_afu #(
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
parameter C_M_AXI_MEM_ID_WIDTH = `M_AXI_MEM_ID_WIDTH,
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
parameter C_M_AXI_MEM_DATA_WIDTH = (`PLATFORM_MEMORY_DATA_SIZE * 8),
parameter C_M_AXI_MEM_ADDR_WIDTH = 64,
parameter C_M_AXI_MEM_DATA_WIDTH = `VX_MEM_DATA_WIDTH
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
parameter C_M_AXI_MEM_NUM_BANKS = 1
`else
parameter C_M_AXI_MEM_NUM_BANKS = `PLATFORM_MEMORY_NUM_BANKS
`endif
) (
// System signals
input wire ap_clk,
input wire ap_rst_n,
// AXI4 master interface
`REPEAT (`M_AXI_MEM_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, GEN_AXI_MEM, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, GEN_AXI_MEM, REPEAT_COMMA),
`endif
// AXI4-Lite slave interface
input wire s_axi_ctrl_awvalid,
output wire s_axi_ctrl_awready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_awaddr,
input wire s_axi_ctrl_wvalid,
output wire s_axi_ctrl_wready,
input wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_wdata,
input wire [C_S_AXI_CTRL_DATA_WIDTH/8-1:0] s_axi_ctrl_wstrb,
input wire s_axi_ctrl_arvalid,
input wire s_axi_ctrl_arvalid,
output wire s_axi_ctrl_arready,
input wire [C_S_AXI_CTRL_ADDR_WIDTH-1:0] s_axi_ctrl_araddr,
output wire s_axi_ctrl_rvalid,
input wire s_axi_ctrl_rready,
input wire s_axi_ctrl_rready,
output wire [C_S_AXI_CTRL_DATA_WIDTH-1:0] s_axi_ctrl_rdata,
output wire [1:0] s_axi_ctrl_rresp,
output wire s_axi_ctrl_bvalid,
output wire s_axi_ctrl_bvalid,
input wire s_axi_ctrl_bready,
output wire [1:0] s_axi_ctrl_bresp,
output wire interrupt
output wire interrupt
);
VX_afu_wrap #(
@ -54,32 +67,39 @@ module vortex_afu #(
.C_S_AXI_CTRL_DATA_WIDTH (C_S_AXI_CTRL_DATA_WIDTH),
.C_M_AXI_MEM_ID_WIDTH (C_M_AXI_MEM_ID_WIDTH),
.C_M_AXI_MEM_ADDR_WIDTH (C_M_AXI_MEM_ADDR_WIDTH),
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH)
.C_M_AXI_MEM_DATA_WIDTH (C_M_AXI_MEM_DATA_WIDTH),
.C_M_AXI_MEM_NUM_BANKS (C_M_AXI_MEM_NUM_BANKS)
) afu_wrap (
.ap_clk (ap_clk),
.ap_rst_n (ap_rst_n),
`REPEAT (`M_AXI_MEM_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
.clk (ap_clk),
.reset (~ap_rst_n),
`ifdef PLATFORM_MERGED_MEMORY_INTERFACE
`REPEAT (1, AXI_MEM_ARGS, REPEAT_COMMA),
`else
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_ARGS, REPEAT_COMMA),
`endif
.s_axi_ctrl_awvalid (s_axi_ctrl_awvalid),
.s_axi_ctrl_awready (s_axi_ctrl_awready),
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
.s_axi_ctrl_awaddr (s_axi_ctrl_awaddr),
.s_axi_ctrl_wvalid (s_axi_ctrl_wvalid),
.s_axi_ctrl_wready (s_axi_ctrl_wready),
.s_axi_ctrl_wdata (s_axi_ctrl_wdata),
.s_axi_ctrl_wstrb (s_axi_ctrl_wstrb),
.s_axi_ctrl_arvalid (s_axi_ctrl_arvalid),
.s_axi_ctrl_arready (s_axi_ctrl_arready),
.s_axi_ctrl_araddr (s_axi_ctrl_araddr),
.s_axi_ctrl_rvalid (s_axi_ctrl_rvalid),
.s_axi_ctrl_rready (s_axi_ctrl_rready),
.s_axi_ctrl_rdata (s_axi_ctrl_rdata),
.s_axi_ctrl_rresp (s_axi_ctrl_rresp),
.s_axi_ctrl_bvalid (s_axi_ctrl_bvalid),
.s_axi_ctrl_bready (s_axi_ctrl_bready),
.s_axi_ctrl_bresp (s_axi_ctrl_bresp),
.interrupt (interrupt)
);
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -14,12 +14,12 @@
`ifndef VORTEX_AFU_VH
`define VORTEX_AFU_VH
`ifndef M_AXI_MEM_NUM_BANKS
`define M_AXI_MEM_NUM_BANKS 1
`ifndef PLATFORM_MEMORY_OFFSET
`define PLATFORM_MEMORY_OFFSET 0
`endif
`ifndef M_AXI_MEM_ID_WIDTH
`define M_AXI_MEM_ID_WIDTH 32
`ifndef PLATFORM_MEMORY_ID_WIDTH
`define PLATFORM_MEMORY_ID_WIDTH 32
`endif
`define GEN_AXI_MEM(i) \

View file

@ -33,7 +33,7 @@ module VX_bank_flush #(
output wire flush_init,
output wire flush_valid,
output wire [`CS_LINE_SEL_BITS-1:0] flush_line,
output wire [NUM_WAYS-1:0] flush_way,
output wire [`CS_WAY_SEL_WIDTH-1:0] flush_way,
input wire flush_ready,
input wire mshr_empty,
input wire bank_empty
@ -48,20 +48,21 @@ module VX_bank_flush #(
localparam STATE_WAIT2 = 4;
localparam STATE_DONE = 5;
reg [2:0] state_r, state_n;
reg [2:0] state, state_n;
reg [CTR_WIDTH-1:0] counter_r;
reg [CTR_WIDTH-1:0] counter;
always @(*) begin
state_n = state_r;
case (state_r)
STATE_IDLE: begin
state_n = state;
case (state)
//STATE_IDLE:
default : begin
if (flush_begin) begin
state_n = STATE_WAIT1;
end
end
STATE_INIT: begin
if (counter_r == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
if (counter == ((2 ** `CS_LINE_SEL_BITS)-1)) begin
state_n = STATE_IDLE;
end
end
@ -72,7 +73,7 @@ module VX_bank_flush #(
end
end
STATE_FLUSH: begin
if (counter_r == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
if (counter == ((2 ** CTR_WIDTH)-1) && flush_ready) begin
state_n = (BANK_ID == 0) ? STATE_DONE : STATE_WAIT2;
end
end
@ -93,35 +94,30 @@ module VX_bank_flush #(
always @(posedge clk) begin
if (reset) begin
state_r <= STATE_INIT;
counter_r <= '0;
state <= STATE_INIT;
counter <= '0;
end else begin
state_r <= state_n;
if (state_r != STATE_IDLE) begin
if ((state_r == STATE_INIT)
|| ((state_r == STATE_FLUSH) && flush_ready)) begin
counter_r <= counter_r + CTR_WIDTH'(1);
state <= state_n;
if (state != STATE_IDLE) begin
if ((state == STATE_INIT)
|| ((state == STATE_FLUSH) && flush_ready)) begin
counter <= counter + CTR_WIDTH'(1);
end
end else begin
counter_r <= '0;
counter <= '0;
end
end
end
assign flush_end = (state_r == STATE_DONE);
assign flush_init = (state_r == STATE_INIT);
assign flush_valid = (state_r == STATE_FLUSH);
assign flush_line = counter_r[`CS_LINE_SEL_BITS-1:0];
assign flush_end = (state == STATE_DONE);
assign flush_init = (state == STATE_INIT);
assign flush_valid = (state == STATE_FLUSH);
assign flush_line = counter[`CS_LINE_SEL_BITS-1:0];
if (WRITEBACK && `CS_WAY_SEL_BITS > 0) begin
reg [NUM_WAYS-1:0] flush_way_r;
always @(*) begin
flush_way_r = '0;
flush_way_r[counter_r[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS]] = 1;
end
assign flush_way = flush_way_r;
end else begin
assign flush_way = {NUM_WAYS{1'b1}};
// Select the way index driven on flush_way.
// With writeback enabled and more than one way, the way index is taken from
// the counter bits located directly above the line-select bits.
// Otherwise flush_way is tied to zero.
if (WRITEBACK && (NUM_WAYS > 1)) begin : g_flush_way
assign flush_way = counter[`CS_LINE_SEL_BITS +: `CS_WAY_SEL_BITS];
end else begin : g_flush_way_all
assign flush_way = '0;
end
endmodule

View file

@ -19,23 +19,26 @@ module VX_cache import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
parameter CACHE_SIZE = 32768,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 1,
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = `XLEN/8,
parameter WORD_SIZE = 16,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
parameter CRSQ_SIZE = 4,
// Miss Reservation Queue (MSHR) size
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
parameter MRSQ_SIZE = 4,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
@ -48,17 +51,23 @@ module VX_cache import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// core request flags
parameter FLAGS_WIDTH = 0,
// Core response output register
parameter CORE_OUT_BUF = 0,
parameter CORE_OUT_BUF = 3,
// Memory request output register
parameter MEM_OUT_BUF = 0
parameter MEM_OUT_BUF = 3
) (
// PERF
`ifdef PERF_ENABLE
@ -69,34 +78,37 @@ module VX_cache import VX_gpu_pkg::*; #(
input wire reset,
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
VX_mem_bus_if.master mem_bus_if
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter: number of banks must be power of 2"))
`STATIC_ASSERT(WRITE_ENABLE || !WRITEBACK, ("invalid parameter: writeback requires write enable"))
`STATIC_ASSERT(WRITEBACK || !DIRTY_BYTES, ("invalid parameter: dirty bytes require writeback"))
// In writeback mode, memory fill response may issue a new memory request to handle evicted blocks.
// We need to ensure that the memory request queue never fills up to avoid deadlock.
`STATIC_ASSERT(!WRITEBACK || (MREQ_SIZE >= MSHR_SIZE), ("invalid parameter: writeback requires MREQ_SIZE >= MSHR_SIZE"))
`STATIC_ASSERT(NUM_BANKS >= MEM_PORTS, ("invalid parameter: number of banks must be greater or equal to number of memory ports"))
localparam REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS);
localparam WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS);
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WORD_WIDTH = WORD_SIZE * 8;
localparam WORD_SEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + 1;
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH);
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH;
localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH));
localparam MEM_RSP_DATAW = `CS_LINE_WIDTH + MEM_TAG_WIDTH;
localparam MEM_PORTS_SEL_BITS = `CLOG2(MEM_PORTS);
localparam MEM_PORTS_SEL_WIDTH = `UP(MEM_PORTS_SEL_BITS);
localparam MEM_ARB_SEL_BITS = `CLOG2(`CDIV(NUM_BANKS, MEM_PORTS));
localparam MEM_ARB_SEL_WIDTH = `UP(MEM_ARB_SEL_BITS);
localparam CORE_REQ_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
localparam REQ_XBAR_BUF = (NUM_REQS > 4) ? 2 : 0;
localparam REQ_XBAR_BUF = (NUM_REQS > 2) ? 2 : 0;
localparam CORE_RSP_BUF_ENABLE = (NUM_BANKS != 1) || (NUM_REQS != 1);
localparam MEM_REQ_BUF_ENABLE = (NUM_BANKS != 1);
`ifdef PERF_ENABLE
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
@ -110,6 +122,7 @@ module VX_cache import VX_gpu_pkg::*; #(
) core_bus2_if[NUM_REQS]();
wire [NUM_BANKS-1:0] per_bank_flush_begin;
wire [`UP(UUID_WIDTH)-1:0] flush_uuid;
wire [NUM_BANKS-1:0] per_bank_flush_end;
wire [NUM_BANKS-1:0] per_bank_core_req_fire;
@ -117,7 +130,9 @@ module VX_cache import VX_gpu_pkg::*; #(
VX_cache_flush #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (NUM_BANKS),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // bank xbar latency
.UUID_WIDTH(UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency
) flush_unit (
.clk (clk),
.reset (reset),
@ -125,92 +140,101 @@ module VX_cache import VX_gpu_pkg::*; #(
.core_bus_out_if (core_bus2_if),
.bank_req_fire (per_bank_core_req_fire),
.flush_begin (per_bank_flush_begin),
.flush_uuid (flush_uuid),
.flush_end (per_bank_flush_end)
);
///////////////////////////////////////////////////////////////////////////
// Memory response gather /////////////////////////////////////////////////
// Core response buffering
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if[MEM_PORTS]();
`RESET_RELAY_EX (core_rsp_reset, reset, NUM_REQS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [MEM_PORTS-1:0] mem_rsp_queue_valid;
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-1:0] mem_rsp_queue_data;
wire [MEM_PORTS-1:0] mem_rsp_queue_ready;
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
.SIZE (CORE_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (core_rsp_reset[i]),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
.DATAW (MEM_RSP_DATAW),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (mem_bus_tmp_if[i].rsp_valid),
.data_in (mem_bus_tmp_if[i].rsp_data),
.ready_in (mem_bus_tmp_if[i].rsp_ready),
.valid_out (mem_rsp_queue_valid[i]),
.data_out (mem_rsp_queue_data[i]),
.ready_out (mem_rsp_queue_ready[i])
);
end
///////////////////////////////////////////////////////////////////////////
wire [MEM_PORTS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] mem_rsp_queue_data_s;
wire [MEM_PORTS-1:0][BANK_SEL_WIDTH-1:0] mem_rsp_queue_sel;
// Memory request buffering
wire mem_req_valid_s;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire mem_req_rw_s;
wire [LINE_SIZE-1:0] mem_req_byteen_s;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire mem_req_flush_s;
wire mem_req_ready_s;
// Repack each memory port's queued response before it is routed to a bank.
// The low MEM_ARB_SEL_BITS of the tag field are dropped here; the remaining
// tag bits and the line data are concatenated as {data, tag}.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_data_s
// tag bits above the arbiter-select field
wire [BANK_MEM_TAG_WIDTH-1:0] mem_rsp_tag_s = mem_rsp_queue_data[i][MEM_TAG_WIDTH-1:MEM_ARB_SEL_BITS];
// line data occupies the bits above the full memory tag
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s = mem_rsp_queue_data[i][MEM_RSP_DATAW-1:MEM_TAG_WIDTH];
assign mem_rsp_queue_data_s[i] = {mem_rsp_data_s, mem_rsp_tag_s};
end
wire mem_bus_if_flush;
// Compute, for each memory port, the select value stored in mem_rsp_queue_sel[i].
// Three configurations are handled:
//  - multiple banks, NUM_BANKS != MEM_PORTS: the low MEM_ARB_SEL_BITS of the
//    queued response are combined with the port index through VX_bits_concat
//    (NOTE(review): presumably data_out = {left_in, right_in} - confirm against VX_bits_concat);
//  - multiple banks, NUM_BANKS == MEM_PORTS: the select is the port index itself;
//  - single bank: the select is constant zero.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_rsp_queue_sel
if (NUM_BANKS > 1) begin : g_multibanks
if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel
VX_bits_concat #(
.L (MEM_ARB_SEL_BITS),
.R (MEM_PORTS_SEL_BITS)
) mem_rsp_sel_concat (
// low bits of the queued response (the part of the tag stripped in mem_rsp_queue_data_s)
.left_in (mem_rsp_queue_data[i][MEM_ARB_SEL_BITS-1:0]),
// index of this memory port
.right_in (MEM_PORTS_SEL_WIDTH'(i)),
.data_out (mem_rsp_queue_sel[i])
);
end else begin : g_no_arb_sel
assign mem_rsp_queue_sel[i] = MEM_PORTS_SEL_WIDTH'(i);
end
end else begin : g_singlebank
assign mem_rsp_queue_sel[i] = 0;
end
end
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + 1),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
wire [NUM_BANKS-1:0] per_bank_mem_rsp_valid;
wire [NUM_BANKS-1:0][MEM_RSP_DATAW-MEM_ARB_SEL_BITS-1:0] per_bank_mem_rsp_pdata;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
VX_stream_omega #(
.NUM_INPUTS (MEM_PORTS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (MEM_RSP_DATAW-MEM_ARB_SEL_BITS),
.ARBITER ("R"),
.OUT_BUF (3)
) mem_rsp_xbar (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_s),
.ready_in (mem_req_ready_s),
.data_in ({mem_req_rw_s, mem_req_byteen_s, mem_req_addr_s, mem_req_data_s, mem_req_tag_s, mem_req_flush_s}),
.data_out ({mem_bus_if.req_data.rw, mem_bus_if.req_data.byteen, mem_bus_if.req_data.addr, mem_bus_if.req_data.data, mem_bus_if.req_data.tag, mem_bus_if_flush}),
.valid_out (mem_bus_if.req_valid),
.ready_out (mem_bus_if.req_ready)
.valid_in (mem_rsp_queue_valid),
.data_in (mem_rsp_queue_data_s),
.sel_in (mem_rsp_queue_sel),
.ready_in (mem_rsp_queue_ready),
.valid_out (per_bank_mem_rsp_valid),
.data_out (per_bank_mem_rsp_pdata),
`UNUSED_PIN (sel_out),
.ready_out (per_bank_mem_rsp_ready),
`UNUSED_PIN (collisions)
);
assign mem_bus_if.req_data.atype = mem_bus_if_flush ? `ADDR_TYPE_WIDTH'(1 << `ADDR_TYPE_FLUSH) : '0;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_rsp_data;
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_rsp_tag;
///////////////////////////////////////////////////////////////////////////
// Unpack each bank's packed memory response into its line data (upper bits)
// and its bank-level tag (lower bits).
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_rsp_data
assign {
per_bank_mem_rsp_data[i],
per_bank_mem_rsp_tag[i]
} = per_bank_mem_rsp_pdata[i];
end
// Memory response buffering
wire mem_rsp_valid_s;
wire [`CS_LINE_WIDTH-1:0] mem_rsp_data_s;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
VX_elastic_buffer #(
.DATAW (MEM_TAG_WIDTH + `CS_LINE_WIDTH),
.SIZE (MRSQ_SIZE),
.OUT_REG (MRSQ_SIZE > 2)
) mem_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (mem_bus_if.rsp_valid),
.ready_in (mem_bus_if.rsp_ready),
.data_in ({mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data}),
.data_out ({mem_rsp_tag_s, mem_rsp_data_s}),
.valid_out (mem_rsp_valid_s),
.ready_out (mem_rsp_ready_s)
);
///////////////////////////////////////////////////////////////////////////
// Core requests dispatch /////////////////////////////////////////////////
wire [NUM_BANKS-1:0] per_bank_core_req_valid;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_core_req_addr;
@ -220,7 +244,7 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
wire [NUM_BANKS-1:0] per_bank_core_req_flush;
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_core_req_flags;
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
@ -230,33 +254,21 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0] per_bank_core_rsp_ready;
wire [NUM_BANKS-1:0] per_bank_mem_req_valid;
wire [NUM_BANKS-1:0][`CS_MEM_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0][`CS_LINE_ADDR_WIDTH-1:0] per_bank_mem_req_addr;
wire [NUM_BANKS-1:0] per_bank_mem_req_rw;
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
wire [NUM_BANKS-1:0][MSHR_ADDR_WIDTH-1:0] per_bank_mem_req_id;
wire [NUM_BANKS-1:0] per_bank_mem_req_flush;
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_req_tag;
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags;
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
wire [NUM_BANKS-1:0] per_bank_mem_rsp_ready;
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
if (NUM_BANKS == 1) begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready;
end else begin
assign mem_rsp_ready_s = per_bank_mem_rsp_ready[`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s)];
end
// Bank requests dispatch
wire [NUM_REQS-1:0] core_req_valid;
wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr;
wire [NUM_REQS-1:0] core_req_rw;
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
wire [NUM_REQS-1:0] core_req_flush;
wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags;
wire [NUM_REQS-1:0] core_req_ready;
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
@ -266,35 +278,38 @@ module VX_cache import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][CORE_REQ_DATAW-1:0] core_req_data_in;
wire [NUM_BANKS-1:0][CORE_REQ_DATAW-1:0] core_req_data_out;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req
assign core_req_valid[i] = core_bus2_if[i].req_valid;
assign core_req_rw[i] = core_bus2_if[i].req_data.rw;
assign core_req_byteen[i] = core_bus2_if[i].req_data.byteen;
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
assign core_req_data[i] = core_bus2_if[i].req_data.data;
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
assign core_req_flush[i] = core_bus2_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
assign core_req_flags[i] = `UP(FLAGS_WIDTH)'(core_bus2_if[i].req_data.flags);
assign core_bus2_if[i].req_ready = core_req_ready[i];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (WORDS_PER_LINE > 1) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_wsel
if (WORDS_PER_LINE > 1) begin : g_wsel
assign core_req_wsel[i] = core_req_addr[i][0 +: WORD_SEL_BITS];
end else begin
end else begin : g_no_wsel
assign core_req_wsel[i] = '0;
end
end
// Extract the line address of each core request: the LINE_ADDR_WIDTH bits of
// the word address that sit above the word-select and bank-select bits.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_line_addr
assign core_req_line_addr[i] = core_req_addr[i][(BANK_SEL_BITS + WORD_SEL_BITS) +: LINE_ADDR_WIDTH];
end
if (NUM_BANKS > 1) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_bid
if (NUM_BANKS > 1) begin : g_multibanks
assign core_req_bid[i] = core_req_addr[i][WORD_SEL_BITS +: BANK_SEL_BITS];
end else begin : g_singlebank
assign core_req_bid[i] = '0;
end
end else begin
assign core_req_bid = '0;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_data_in
assign core_req_data_in[i] = {
core_req_line_addr[i],
core_req_rw[i],
@ -302,26 +317,26 @@ module VX_cache import VX_gpu_pkg::*; #(
core_req_byteen[i],
core_req_data[i],
core_req_tag[i],
core_req_flush[i]
core_req_flags[i]
};
end
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
`ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_collisions;
`endif
`RESET_RELAY (req_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (CORE_REQ_DATAW),
.PERF_CTR_BITS (`PERF_CTR_BITS),
.ARBITER ("F"),
.ARBITER ("R"),
.OUT_BUF (REQ_XBAR_BUF)
) req_xbar (
.clk (clk),
.reset (req_xbar_reset),
.reset (reset),
`ifdef PERF_ENABLE
.collisions(perf_collisions),
`else
@ -337,7 +352,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.ready_out (per_bank_core_req_ready)
);
for (genvar i = 0; i < NUM_BANKS; ++i) begin
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_req_data_out
assign {
per_bank_core_req_addr[i],
per_bank_core_req_rw[i],
@ -345,50 +360,42 @@ module VX_cache import VX_gpu_pkg::*; #(
per_bank_core_req_byteen[i],
per_bank_core_req_data[i],
per_bank_core_req_tag[i],
per_bank_core_req_flush[i]
per_bank_core_req_flags[i]
} = core_req_data_out[i];
end
// Banks access
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : banks
wire [`CS_LINE_ADDR_WIDTH-1:0] curr_bank_mem_req_addr;
wire curr_bank_mem_rsp_valid;
if (NUM_BANKS == 1) begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s;
end else begin
assign curr_bank_mem_rsp_valid = mem_rsp_valid_s && (`CS_MEM_TAG_TO_BANK_ID(mem_rsp_tag_s) == bank_id);
end
`RESET_RELAY (bank_reset, reset);
// Banks access ///////////////////////////////////////////////////////////
for (genvar bank_id = 0; bank_id < NUM_BANKS; ++bank_id) begin : g_banks
VX_cache_bank #(
.BANK_ID (bank_id),
.INSTANCE_ID ($sformatf("%s-bank%0d", INSTANCE_ID, bank_id)),
.INSTANCE_ID (`SFORMATF(("%s-bank%0d", INSTANCE_ID, bank_id))),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.DIRTY_BYTES (DIRTY_BYTES),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (CORE_REQ_BUF_ENABLE ? 0 : CORE_OUT_BUF),
.MEM_OUT_BUF (MEM_REQ_BUF_ENABLE ? 0 : MEM_OUT_BUF)
.FLAGS_WIDTH (FLAGS_WIDTH),
.CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)),
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF))
) bank (
.clk (clk),
.reset (bank_reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_read_misses (perf_read_miss_per_bank[bank_id]),
.perf_write_misses (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stalls (perf_mshr_stall_per_bank[bank_id]),
.perf_read_miss (perf_read_miss_per_bank[bank_id]),
.perf_write_miss (perf_write_miss_per_bank[bank_id]),
.perf_mshr_stall (perf_mshr_stall_per_bank[bank_id]),
`endif
// Core request
@ -400,7 +407,7 @@ module VX_cache import VX_gpu_pkg::*; #(
.core_req_data (per_bank_core_req_data[bank_id]),
.core_req_tag (per_bank_core_req_tag[bank_id]),
.core_req_idx (per_bank_core_req_idx[bank_id]),
.core_req_flush (per_bank_core_req_flush[bank_id]),
.core_req_flags (per_bank_core_req_flags[bank_id]),
.core_req_ready (per_bank_core_req_ready[bank_id]),
// Core response
@ -412,50 +419,49 @@ module VX_cache import VX_gpu_pkg::*; #(
// Memory request
.mem_req_valid (per_bank_mem_req_valid[bank_id]),
.mem_req_addr (curr_bank_mem_req_addr),
.mem_req_addr (per_bank_mem_req_addr[bank_id]),
.mem_req_rw (per_bank_mem_req_rw[bank_id]),
.mem_req_byteen (per_bank_mem_req_byteen[bank_id]),
.mem_req_data (per_bank_mem_req_data[bank_id]),
.mem_req_id (per_bank_mem_req_id[bank_id]),
.mem_req_flush (per_bank_mem_req_flush[bank_id]),
.mem_req_tag (per_bank_mem_req_tag[bank_id]),
.mem_req_flags (per_bank_mem_req_flags[bank_id]),
.mem_req_ready (per_bank_mem_req_ready[bank_id]),
// Memory response
.mem_rsp_valid (curr_bank_mem_rsp_valid),
.mem_rsp_data (mem_rsp_data_s),
.mem_rsp_id (`CS_MEM_TAG_TO_REQ_ID(mem_rsp_tag_s)),
.mem_rsp_valid (per_bank_mem_rsp_valid[bank_id]),
.mem_rsp_data (per_bank_mem_rsp_data[bank_id]),
.mem_rsp_tag (per_bank_mem_rsp_tag[bank_id]),
.mem_rsp_ready (per_bank_mem_rsp_ready[bank_id]),
// Flush request
.flush_begin (per_bank_flush_begin[bank_id]),
.flush_uuid (flush_uuid),
.flush_end (per_bank_flush_end[bank_id])
);
if (NUM_BANKS == 1) begin
assign per_bank_mem_req_addr[bank_id] = curr_bank_mem_req_addr;
end else begin
assign per_bank_mem_req_addr[bank_id] = `CS_LINE_TO_MEM_ADDR(curr_bank_mem_req_addr, bank_id);
end
end
// Bank responses gather
// Core responses gather //////////////////////////////////////////////////
wire [NUM_BANKS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_in;
wire [NUM_REQS-1:0][CORE_RSP_DATAW-1:0] core_rsp_data_out;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
wire [NUM_REQS-1:0] core_rsp_valid_s;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data_s;
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag_s;
wire [NUM_REQS-1:0] core_rsp_ready_s;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_core_rsp_data_in
assign core_rsp_data_in[i] = {per_bank_core_rsp_data[i], per_bank_core_rsp_tag[i]};
end
`RESET_RELAY (rsp_xbar_reset, reset);
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
.DATAW (CORE_RSP_DATAW),
.ARBITER ("F")
.ARBITER ("R")
) rsp_xbar (
.clk (clk),
.reset (rsp_xbar_reset),
.reset (reset),
`UNUSED_PIN (collisions),
.valid_in (per_bank_core_rsp_valid),
.data_in (core_rsp_data_in),
@ -467,113 +473,170 @@ module VX_cache import VX_gpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_data_s
assign {core_rsp_data_s[i], core_rsp_tag_s[i]} = core_rsp_data_out[i];
end
///////////////////////////////////////////////////////////////////////////
// One elastic buffer per core request port, placed between the gathered core
// responses (core_rsp_*_s) and the core_bus2_if response channel.
// The payload is {data, tag} on both sides of the buffer.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_rsp_buf
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + TAG_WIDTH),
// buffer depth comes from CORE_OUT_BUF only when CORE_RSP_BUF_ENABLE is set; otherwise SIZE is 0
// NOTE(review): SIZE = 0 presumably makes the buffer a pass-through - confirm in VX_elastic_buffer
.SIZE (CORE_RSP_BUF_ENABLE ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_s[i]),
.ready_in (core_rsp_ready_s[i]),
.data_in ({core_rsp_data_s[i], core_rsp_tag_s[i]}),
.data_out ({core_bus2_if[i].rsp_data.data, core_bus2_if[i].rsp_data.tag}),
.valid_out (core_bus2_if[i].rsp_valid),
.ready_out (core_bus2_if[i].rsp_ready)
);
end
wire mem_req_valid_p;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_p;
wire mem_req_rw_p;
wire [LINE_SIZE-1:0] mem_req_byteen_p;
wire [`CS_LINE_WIDTH-1:0] mem_req_data_p;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_p;
wire [MSHR_ADDR_WIDTH-1:0] mem_req_id_p;
wire mem_req_flush_p;
wire mem_req_ready_p;
// Memory request arbitration /////////////////////////////////////////////
// Memory request arbitration
wire [NUM_BANKS-1:0][(`CS_MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + 1)-1:0] data_in;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign data_in[i] = {
per_bank_mem_req_addr[i],
wire [NUM_BANKS-1:0][MEM_REQ_DATAW-1:0] per_bank_mem_req_pdata;
for (genvar i = 0; i < NUM_BANKS; ++i) begin : g_per_bank_mem_req_pdata
assign per_bank_mem_req_pdata[i] = {
per_bank_mem_req_rw[i],
per_bank_mem_req_byteen[i],
per_bank_mem_req_addr[i],
per_bank_mem_req_data[i],
per_bank_mem_req_id[i],
per_bank_mem_req_flush[i]
per_bank_mem_req_byteen[i],
per_bank_mem_req_flags[i],
per_bank_mem_req_tag[i]
};
end
wire [MEM_PORTS-1:0] mem_req_valid;
wire [MEM_PORTS-1:0][MEM_REQ_DATAW-1:0] mem_req_pdata;
wire [MEM_PORTS-1:0] mem_req_ready;
wire [MEM_PORTS-1:0][MEM_ARB_SEL_WIDTH-1:0] mem_req_sel_out;
VX_stream_arb #(
.NUM_INPUTS (NUM_BANKS),
.DATAW (`CS_MEM_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + MSHR_ADDR_WIDTH + 1),
.ARBITER ("F")
.NUM_OUTPUTS(MEM_PORTS),
.DATAW (MEM_REQ_DATAW),
.ARBITER ("R")
) mem_req_arb (
.clk (clk),
.reset (reset),
.valid_in (per_bank_mem_req_valid),
.data_in (per_bank_mem_req_pdata),
.ready_in (per_bank_mem_req_ready),
.data_in (data_in),
.data_out ({mem_req_addr_p, mem_req_rw_p, mem_req_byteen_p, mem_req_data_p, mem_req_id_p, mem_req_flush_p}),
.valid_out (mem_req_valid_p),
.ready_out (mem_req_ready_p),
`UNUSED_PIN (sel_out)
.valid_out (mem_req_valid),
.data_out (mem_req_pdata),
.ready_out (mem_req_ready),
.sel_out (mem_req_sel_out)
);
if (NUM_BANKS > 1) begin
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id = `CS_MEM_ADDR_TO_BANK_ID(mem_req_addr_p);
assign mem_req_tag_p = MEM_TAG_WIDTH'({mem_req_bank_id, mem_req_id_p});
end else begin
assign mem_req_tag_p = MEM_TAG_WIDTH'(mem_req_id_p);
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_req_buf
wire mem_req_rw;
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr;
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
wire [LINE_SIZE-1:0] mem_req_byteen;
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
wire [BANK_MEM_TAG_WIDTH-1:0] mem_req_tag;
// Memory request multi-port handling
assign {
mem_req_rw,
mem_req_addr,
mem_req_data,
mem_req_byteen,
mem_req_flags,
mem_req_tag
} = mem_req_pdata[i];
assign mem_req_valid_s = mem_req_valid_p;
assign mem_req_addr_s = mem_req_addr_p;
assign mem_req_tag_s = mem_req_tag_p;
assign mem_req_flush_s = mem_req_flush_p;
assign mem_req_ready_p = mem_req_ready_s;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_w;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_w;
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags_w;
if (WRITE_ENABLE != 0) begin
assign mem_req_rw_s = mem_req_rw_p;
assign mem_req_byteen_s = mem_req_byteen_p;
assign mem_req_data_s = mem_req_data_p;
end else begin
`UNUSED_VAR (mem_req_byteen_p)
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel
wire [`CS_BANK_SEL_BITS-1:0] mem_req_bank_id;
VX_bits_concat #(
.L (MEM_ARB_SEL_BITS),
.R (MEM_PORTS_SEL_BITS)
) bank_id_concat (
.left_in (mem_req_sel_out[i]),
.right_in (MEM_PORTS_SEL_WIDTH'(i)),
.data_out (mem_req_bank_id)
);
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, mem_req_bank_id});
assign mem_req_tag_w = {mem_req_tag, mem_req_sel_out[i]};
end else begin : g_no_arb_sel
`UNUSED_VAR (mem_req_sel_out)
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'({mem_req_addr, MEM_PORTS_SEL_WIDTH'(i)});
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
end
end else begin : g_mem_req_tag
`UNUSED_VAR (mem_req_sel_out)
assign mem_req_addr_w = `CS_MEM_ADDR_WIDTH'(mem_req_addr);
assign mem_req_tag_w = MEM_TAG_WIDTH'(mem_req_tag);
end
assign mem_req_rw_s = 0;
assign mem_req_byteen_s = {LINE_SIZE{1'b1}};
assign mem_req_data_s = '0;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid[i]),
.ready_in (mem_req_ready[i]),
.data_in ({mem_req_rw, mem_req_byteen, mem_req_addr_w, mem_req_data, mem_req_tag_w, mem_req_flags}),
.data_out ({mem_bus_tmp_if[i].req_data.rw, mem_bus_tmp_if[i].req_data.byteen, mem_bus_tmp_if[i].req_data.addr, mem_bus_tmp_if[i].req_data.data, mem_bus_tmp_if[i].req_data.tag, mem_req_flags_w}),
.valid_out (mem_bus_tmp_if[i].req_valid),
.ready_out (mem_bus_tmp_if[i].req_ready)
);
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
assign mem_bus_tmp_if[i].req_data.flags = mem_req_flags_w;
end else begin : g_no_mem_req_flags
assign mem_bus_tmp_if[i].req_data.flags = '0;
`UNUSED_VAR (mem_req_flags_w)
end
if (WRITE_ENABLE) begin : g_mem_bus_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end else begin : g_mem_bus_if_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end
end
`ifdef PERF_ENABLE
// per cycle: core_reads, core_writes
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
// per cycle: read misses, write misses, msrq stalls, pipeline stalls
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
`BUFFER(perf_core_reads_per_req, core_req_valid & core_req_ready & ~core_req_rw);
`BUFFER(perf_core_writes_per_req, core_req_valid & core_req_ready & core_req_rw);
// Per-request core response stall flag: the response is valid but the
// receiver on core_bus_if[i] is not ready to accept it.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
// Per-port memory request stall flag: a request is valid on mem_bus_if[i]
// but the memory side is not ready to accept it.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
// per cycle: core reads, core writes, core response stalls, read misses, write misses, mshr stalls, memory stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
wire [`CLOG2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign perf_crsp_stall_per_req[i] = core_bus2_if[i].rsp_valid && ~core_bus2_if[i].rsp_ready;
end
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
wire perf_mem_stall_per_cycle = mem_bus_if.req_valid && ~mem_bus_if.req_ready;
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;

View file

@ -47,19 +47,26 @@ module VX_cache_bank #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
// core request flags
parameter FLAGS_WIDTH = 0,
// Memory request output buffer
parameter MEM_OUT_BUF = 0,
// Core response output register
parameter CORE_OUT_REG = 0,
// Memory request output register
parameter MEM_OUT_REG = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE),
parameter MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH,
parameter REQ_SEL_WIDTH = `UP(`CS_REQ_SEL_BITS),
parameter WORD_SEL_WIDTH = `UP(`CS_WORD_SEL_BITS)
) (
@ -67,9 +74,9 @@ module VX_cache_bank #(
input wire reset,
`ifdef PERF_ENABLE
output wire perf_read_misses,
output wire perf_write_misses,
output wire perf_mshr_stalls,
output wire perf_read_miss,
output wire perf_write_miss,
output wire perf_mshr_stall,
`endif
// Core Request
@ -81,7 +88,7 @@ module VX_cache_bank #(
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
input wire core_req_flush, // flush enable
input wire [`UP(FLAGS_WIDTH)-1:0] core_req_flags,
output wire core_req_ready,
// Core Response
@ -97,18 +104,19 @@ module VX_cache_bank #(
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MSHR_ADDR_WIDTH-1:0] mem_req_id, // index of the head entry in the mshr
output wire mem_req_flush,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
output wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready,
// flush
input wire flush_begin,
input wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
output wire flush_end
);
@ -136,43 +144,45 @@ module VX_cache_bank #(
wire [MSHR_ADDR_WIDTH-1:0] replay_id;
wire replay_ready;
wire is_init_st0, is_init_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_init_st0;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire is_flush_st0, is_flush_st1;
wire [NUM_WAYS-1:0] flush_way_st0;
wire [`CS_WAY_SEL_WIDTH-1:0] flush_way_st0, evict_way_st0;
wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_st0, way_idx_st1;
wire [`CS_LINE_ADDR_WIDTH-1:0] addr_sel, addr_st0, addr_st1;
wire [`CS_LINE_SEL_BITS-1:0] line_sel_st0, line_sel_st1;
wire [`CS_LINE_SEL_BITS-1:0] line_idx_st0, line_idx_st1;
wire [`CS_TAG_SEL_BITS-1:0] line_tag_st0, line_tag_st1;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0, evict_tag_st1;
wire rw_sel, rw_st0, rw_st1;
wire [WORD_SEL_WIDTH-1:0] wsel_sel, wsel_st0, wsel_st1;
wire [WORD_SEL_WIDTH-1:0] word_idx_sel, word_idx_st0, word_idx_st1;
wire [WORD_SIZE-1:0] byteen_sel, byteen_st0, byteen_st1;
wire [REQ_SEL_WIDTH-1:0] req_idx_sel, req_idx_st0, req_idx_st1;
wire [TAG_WIDTH-1:0] tag_sel, tag_st0, tag_st1;
wire [`CS_WORD_WIDTH-1:0] read_data_st1;
wire [`CS_WORD_WIDTH-1:0] write_word_st0, write_word_st1;
wire [`CS_LINE_WIDTH-1:0] data_sel, data_st0, data_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0, mshr_id_st0, mshr_id_st1;
wire valid_sel, valid_st0, valid_st1;
wire is_creq_st0, is_creq_st1;
wire is_fill_st0, is_fill_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_id_st0, mshr_id_st1;
wire [MSHR_ADDR_WIDTH-1:0] replay_id_st0;
wire is_dirty_st0, is_dirty_st1;
wire is_replay_st0, is_replay_st1;
wire creq_flush_sel, creq_flush_st0, creq_flush_st1;
wire evict_dirty_st0, evict_dirty_st1;
wire [NUM_WAYS-1:0] way_sel_st0, way_sel_st1;
wire [NUM_WAYS-1:0] tag_matches_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_prev_st0, mshr_prev_st1;
wire is_hit_st0, is_hit_st1;
wire [`UP(FLAGS_WIDTH)-1:0] flags_sel, flags_st0, flags_st1;
wire mshr_pending_st0, mshr_pending_st1;
wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1;
wire mshr_empty;
wire flush_valid;
wire init_valid;
wire [`CS_LINE_SEL_BITS-1:0] flush_sel;
wire [NUM_WAYS-1:0] flush_way;
wire [`CS_WAY_SEL_WIDTH-1:0] flush_way;
wire flush_ready;
// ensure we have no pending memory request in the bank
wire no_pending_req = ~valid_st0 && ~valid_st1 && mreq_queue_empty;
// flush unit
VX_bank_flush #(
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
@ -194,11 +204,7 @@ module VX_cache_bank #(
.bank_empty (no_pending_req)
);
wire rdw_hazard1_sel;
wire rdw_hazard2_sel;
reg rdw_hazard3_st1;
wire pipe_stall = crsp_queue_stall || rdw_hazard3_st1;
wire pipe_stall = crsp_queue_stall;
// inputs arbitration:
// mshr replay has highest priority to maximize utilization since there is no miss.
@ -217,216 +223,217 @@ module VX_cache_bank #(
wire creq_enable = creq_grant && core_req_valid;
assign replay_ready = replay_grant
&& ~rdw_hazard1_sel
&& ~(!WRITEBACK && replay_rw && mreq_queue_alm_full) // needed for writethrough
&& ~pipe_stall;
assign mem_rsp_ready = fill_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
&& ~pipe_stall;
assign flush_ready = flush_grant
&& (!WRITEBACK || ~mreq_queue_alm_full) // needed for evictions
&& ~rdw_hazard2_sel
&& ~(WRITEBACK && mreq_queue_alm_full) // needed for writeback
&& ~pipe_stall;
assign core_req_ready = creq_grant
&& ~mreq_queue_alm_full
&& ~mshr_alm_full
&& ~mreq_queue_alm_full // needed for fill requests
&& ~mshr_alm_full // needed for mshr allocation
&& ~pipe_stall;
wire init_fire = init_valid;
wire replay_fire = replay_valid && replay_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
wire flush_fire = flush_valid && flush_ready;
wire flush_fire = flush_valid && flush_ready;
wire core_req_fire = core_req_valid && core_req_ready;
wire [MSHR_ADDR_WIDTH-1:0] mem_rsp_id = mem_rsp_tag[MSHR_ADDR_WIDTH-1:0];
// Memory response tag resized to the core tag width (TAG_WIDTH) so that it
// can be multiplexed with the other tag sources into tag_sel.
// The upper bits are kept aligned in both cases:
//  - pad: mem_rsp_tag occupies the upper bits, zeros fill the lower bits.
//  - cut: only the upper TAG_WIDTH bits of mem_rsp_tag are kept.
wire [TAG_WIDTH-1:0] mem_rsp_tag_s;
if (TAG_WIDTH > MEM_TAG_WIDTH) begin : g_mem_rsp_tag_s_pad
assign mem_rsp_tag_s = {mem_rsp_tag, (TAG_WIDTH-MEM_TAG_WIDTH)'(1'b0)};
end else begin : g_mem_rsp_tag_s_cut
assign mem_rsp_tag_s = mem_rsp_tag[MEM_TAG_WIDTH-1 -: TAG_WIDTH];
// marked unused since this branch does not consume every bit of mem_rsp_tag here
`UNUSED_VAR (mem_rsp_tag)
end
// Tag attached to flush requests entering the pipeline.
// With UUIDs enabled, flush_uuid is placed in the upper bits and the
// remaining lower bits are zero; without UUIDs the tag is all zeros.
wire [TAG_WIDTH-1:0] flush_tag;
if (UUID_WIDTH != 0) begin : g_flush_tag_uuid
assign flush_tag = {flush_uuid, (TAG_WIDTH-UUID_WIDTH)'(1'b0)};
end else begin : g_flush_tag_0
// flush_uuid carries no information when UUID_WIDTH is 0
`UNUSED_VAR (flush_uuid)
assign flush_tag = '0;
end
assign valid_sel = init_fire || replay_fire || mem_rsp_fire || flush_fire || core_req_fire;
assign rw_sel = replay_valid ? replay_rw : core_req_rw;
assign byteen_sel = replay_valid ? replay_byteen : core_req_byteen;
assign wsel_sel = replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = replay_valid ? replay_tag : core_req_tag;
assign creq_flush_sel = core_req_valid && core_req_flush;
assign addr_sel = (init_valid | flush_valid) ? `CS_LINE_ADDR_WIDTH'(flush_sel) :
(replay_valid ? replay_addr : (mem_rsp_valid ? mem_rsp_addr : core_req_addr));
assign word_idx_sel= replay_valid ? replay_wsel : core_req_wsel;
assign req_idx_sel = replay_valid ? replay_idx : core_req_idx;
assign tag_sel = (init_valid | flush_valid) ? (flush_valid ? flush_tag : '0) :
(replay_valid ? replay_tag : (mem_rsp_valid ? mem_rsp_tag_s : core_req_tag));
assign flags_sel = core_req_valid ? core_req_flags : '0;
if (WRITE_ENABLE) begin
assign data_sel[`CS_WORD_WIDTH-1:0] = replay_valid ? replay_data : (mem_rsp_valid ? mem_rsp_data[`CS_WORD_WIDTH-1:0] : core_req_data);
end else begin
assign data_sel[`CS_WORD_WIDTH-1:0] = mem_rsp_data[`CS_WORD_WIDTH-1:0];
if (WRITE_ENABLE) begin : g_data_sel
for (genvar i = 0; i < `CS_LINE_WIDTH; ++i) begin : g_i
if (i < `CS_WORD_WIDTH) begin : g_lo
assign data_sel[i] = replay_valid ? replay_data[i] : (mem_rsp_valid ? mem_rsp_data[i] : core_req_data[i]);
end else begin : g_hi
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
end
end
end else begin : g_data_sel_ro
assign data_sel = mem_rsp_data;
`UNUSED_VAR (core_req_data)
`UNUSED_VAR (replay_data)
end
for (genvar i = `CS_WORD_WIDTH; i < `CS_LINE_WIDTH; ++i) begin
assign data_sel[i] = mem_rsp_data[i]; // only the memory response fills the upper words of data_sel
if (UUID_WIDTH != 0) begin : g_req_uuid_sel
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin : g_req_uuid_sel_0
assign req_uuid_sel = '0;
end
if (UUID_WIDTH != 0) begin
assign req_uuid_sel = tag_sel[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_sel = 0;
end
wire is_init_sel = init_valid;
wire is_creq_sel = creq_enable || replay_enable;
wire is_fill_sel = fill_enable;
wire is_flush_sel = flush_enable;
wire is_replay_sel = replay_enable;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + NUM_WAYS + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_sel, init_valid, replay_enable, fill_enable, flush_enable, creq_enable, creq_flush_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, wsel_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush_st0, is_creq_st0, creq_flush_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, replay_id_st0})
.data_in ({valid_sel, is_init_sel, is_fill_sel, is_flush_sel, is_creq_sel, is_replay_sel, flags_sel, flush_way, addr_sel, data_sel, rw_sel, byteen_sel, word_idx_sel, req_idx_sel, tag_sel, replay_id}),
.data_out ({valid_st0, is_init_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, flags_st0, flush_way_st0, addr_st0, data_st0, rw_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, replay_id_st0})
);
if (UUID_WIDTH != 0) begin
if (UUID_WIDTH != 0) begin : g_req_uuid_st0
assign req_uuid_st0 = tag_st0[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_st0 = 0;
end else begin : g_req_uuid_st0_0
assign req_uuid_st0 = '0;
end
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_creq_rd_st0 = valid_st0 && is_creq_st0 && ~rw_st0;
wire do_creq_wr_st0 = valid_st0 && is_creq_st0 && rw_st0;
wire do_replay_rd_st0 = valid_st0 && is_replay_st0 && ~rw_st0;
wire do_replay_wr_st0 = valid_st0 && is_replay_st0 && rw_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
wire do_cache_rd_st0 = do_creq_rd_st0 || do_replay_rd_st0;
wire do_cache_wr_st0 = do_creq_wr_st0 || do_replay_wr_st0;
wire do_lookup_st0 = do_cache_rd_st0 || do_cache_wr_st0;
wire is_read_st0 = is_creq_st0 && ~rw_st0;
wire is_write_st0 = is_creq_st0 && rw_st0;
wire [`CS_WORD_WIDTH-1:0] write_data_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire do_init_st0 = valid_st0 && is_init_st0;
wire do_flush_st0 = valid_st0 && is_flush_st0;
wire do_read_st0 = valid_st0 && is_read_st0;
wire do_write_st0 = valid_st0 && is_write_st0;
wire do_fill_st0 = valid_st0 && is_fill_st0;
assign line_sel_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire [NUM_WAYS-1:0] evict_way_st0;
wire [`CS_TAG_SEL_BITS-1:0] evict_tag_st0;
wire do_read_st1 = valid_st1 && is_read_st1;
wire do_write_st1 = valid_st1 && is_write_st1;
assign line_idx_st0 = addr_st0[`CS_LINE_SEL_BITS-1:0];
assign line_tag_st0 = `CS_LINE_ADDR_TAG(addr_st0);
assign write_word_st0 = data_st0[`CS_WORD_WIDTH-1:0];
wire do_lookup_st0 = do_read_st0 || do_write_st0;
wire do_lookup_st1 = do_read_st1 || do_write_st1;
wire [`CS_WAY_SEL_WIDTH-1:0] victim_way_st0;
wire [NUM_WAYS-1:0] tag_matches_st0;
VX_cache_repl #(
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.REPL_POLICY (REPL_POLICY)
) cache_repl (
.clk (clk),
.reset (reset),
.stall (pipe_stall),
.init (do_init_st0),
.lookup_valid(do_lookup_st1 && ~pipe_stall),
.lookup_hit (is_hit_st1),
.lookup_line(line_idx_st1),
.lookup_way (way_idx_st1),
.repl_valid (do_fill_st0 && ~pipe_stall),
.repl_line (line_idx_st0),
.repl_way (victim_way_st0)
);
assign evict_way_st0 = is_fill_st0 ? victim_way_st0 : flush_way_st0;
VX_cache_tags #(
.INSTANCE_ID($sformatf("%s-tags", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH)
.WRITEBACK (WRITEBACK)
) cache_tags (
.clk (clk),
.reset (reset),
.req_uuid (req_uuid_st0),
.stall (pipe_stall),
// init/flush/fill/write/lookup
// inputs
.init (do_init_st0),
.flush (do_flush_st0),
.fill (do_fill_st0),
.write (do_cache_wr_st0),
.lookup (do_lookup_st0),
.line_addr (addr_st0),
.way_sel (flush_way_st0),
.tag_matches(tag_matches_st0),
// replacement
.evict_dirty(evict_dirty_st0),
.flush (do_flush_st0 && ~pipe_stall),
.fill (do_fill_st0 && ~pipe_stall),
.read (do_read_st0 && ~pipe_stall),
.write (do_write_st0 && ~pipe_stall),
.line_idx (line_idx_st0),
.line_tag (line_tag_st0),
.evict_way (evict_way_st0),
// outputs
.tag_matches(tag_matches_st0),
.evict_dirty(is_dirty_st0),
.evict_tag (evict_tag_st0)
);
wire [`CS_LINE_ADDR_WIDTH-1:0] addr2_st0;
wire [`CS_WAY_SEL_WIDTH-1:0] hit_idx_st0;
VX_onehot_encoder #(
.N (NUM_WAYS)
) way_idx_enc (
.data_in (tag_matches_st0),
.data_out (hit_idx_st0),
`UNUSED_PIN (valid_out)
);
wire is_flush2_st0 = WRITEBACK && is_flush_st0;
assign way_idx_st0 = is_creq_st0 ? hit_idx_st0 : evict_way_st0;
assign is_hit_st0 = (| tag_matches_st0);
assign mshr_id_st0 = is_creq_st0 ? mshr_alloc_id_st0 : replay_id_st0;
assign way_sel_st0 = (is_fill_st0 || is_flush2_st0) ? evict_way_st0 : tag_matches_st0;
assign addr2_st0 = (is_fill_st0 || is_flush2_st0) ? {evict_tag_st0, line_sel_st0} : addr_st0;
wire [MSHR_ADDR_WIDTH-1:0] mshr_alloc_id_st0;
assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0;
VX_pipe_register #(
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_WAYS + 1 + 1),
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1),
.RESETW (1)
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (~pipe_stall),
.data_in ({valid_st0, is_init_st0, is_replay_st0, is_fill_st0, is_flush2_st0, is_creq_st0, creq_flush_st0, rw_st0, addr2_st0, data_st0, byteen_st0, wsel_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_prev_st0, way_sel_st0, evict_dirty_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_init_st1, is_replay_st1, is_fill_st1, is_flush_st1, is_creq_st1, creq_flush_st1, rw_st1, addr_st1, data_st1, byteen_st1, wsel_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_prev_st1, way_sel_st1, evict_dirty_st1, mshr_pending_st1})
.data_in ({valid_st0, is_fill_st0, is_flush_st0, is_creq_st0, is_replay_st0, is_dirty_st0, is_hit_st0, rw_st0, flags_st0, way_idx_st0, evict_tag_st0, line_tag_st0, line_idx_st0, data_st0, byteen_st0, word_idx_st0, req_idx_st0, tag_st0, mshr_id_st0, mshr_previd_st0, mshr_pending_st0}),
.data_out ({valid_st1, is_fill_st1, is_flush_st1, is_creq_st1, is_replay_st1, is_dirty_st1, is_hit_st1, rw_st1, flags_st1, way_idx_st1, evict_tag_st1, line_tag_st1, line_idx_st1, data_st1, byteen_st1, word_idx_st1, req_idx_st1, tag_st1, mshr_id_st1, mshr_previd_st1, mshr_pending_st1})
);
// we have a tag hit
wire is_hit_st1 = (| way_sel_st1);
if (UUID_WIDTH != 0) begin
if (UUID_WIDTH != 0) begin : g_req_uuid_st1
assign req_uuid_st1 = tag_st1[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid_st1 = 0;
end else begin : g_req_uuid_st1_0
assign req_uuid_st1 = '0;
end
wire is_read_st1 = is_creq_st1 && ~rw_st1;
wire is_write_st1 = is_creq_st1 && rw_st1;
wire do_init_st1 = valid_st1 && is_init_st1;
wire do_fill_st1 = valid_st1 && is_fill_st1;
wire do_flush_st1 = valid_st1 && is_flush_st1;
wire do_creq_rd_st1 = valid_st1 && is_read_st1;
wire do_creq_wr_st1 = valid_st1 && is_write_st1;
wire do_replay_rd_st1 = valid_st1 && is_replay_st1 && ~rw_st1;
wire do_replay_wr_st1 = valid_st1 && is_replay_st1 && rw_st1;
wire do_read_hit_st1 = do_creq_rd_st1 && is_hit_st1;
wire do_read_miss_st1 = do_creq_rd_st1 && ~is_hit_st1;
wire do_write_hit_st1 = do_creq_wr_st1 && is_hit_st1;
wire do_write_miss_st1= do_creq_wr_st1 && ~is_hit_st1;
wire do_cache_rd_st1 = do_read_hit_st1 || do_replay_rd_st1;
wire do_cache_wr_st1 = do_write_hit_st1 || do_replay_wr_st1;
assign line_sel_st1 = addr_st1[`CS_LINE_SEL_BITS-1:0];
`UNUSED_VAR (do_write_miss_st1)
assign addr_st1 = {line_tag_st1, line_idx_st1};
// ensure mshr replay always gets a hit
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1) || is_hit_st1, ("missed mshr replay"));
`RUNTIME_ASSERT (~(valid_st1 && is_replay_st1 && ~is_hit_st1), ("%t: missed mshr replay", $time))
// both tag and data stores use BRAM with no read-during-write protection.
// we need to stall the pipeline to prevent read-after-write hazards.
assign rdw_hazard1_sel = do_fill_st0; // stall first replay following a fill
assign rdw_hazard2_sel = WRITEBACK && do_cache_wr_st0; // a writeback can evict any preceding write
always @(posedge clk) begin
// stall reads following writes to same line address
rdw_hazard3_st1 <= do_cache_rd_st0 && do_cache_wr_st1 && (line_sel_st0 == line_sel_st1)
&& ~rdw_hazard3_st1; // release pipeline stall
end
assign write_word_st1 = data_st1[`CS_WORD_WIDTH-1:0];
`UNUSED_VAR (data_st1)
wire [`CS_LINE_WIDTH-1:0] write_data_st1 = {`CS_WORDS_PER_LINE{data_st1[`CS_WORD_WIDTH-1:0]}};
wire [`CS_LINE_WIDTH-1:0] fill_data_st1 = data_st1;
wire [LINE_SIZE-1:0] write_byteen_st1;
wire [`CS_LINE_WIDTH-1:0] dirty_data_st1;
wire [LINE_SIZE-1:0] dirty_byteen_st1;
if (`CS_WORDS_PER_LINE > 1) begin
reg [LINE_SIZE-1:0] write_byteen_r;
always @(*) begin
write_byteen_r = '0;
write_byteen_r[wsel_st1 * WORD_SIZE +: WORD_SIZE] = byteen_st1;
end
assign write_byteen_st1 = write_byteen_r;
end else begin
assign write_byteen_st1 = byteen_st1;
end
wire[`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] read_data_st1;
wire [LINE_SIZE-1:0] evict_byteen_st1;
VX_cache_data #(
.INSTANCE_ID ($sformatf("%s-data", INSTANCE_ID)),
.BANK_ID (BANK_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
@ -434,56 +441,57 @@ module VX_cache_bank #(
.WORD_SIZE (WORD_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH)
.DIRTY_BYTES (DIRTY_BYTES)
) cache_data (
.clk (clk),
.reset (reset),
.req_uuid (req_uuid_st1),
.stall (pipe_stall),
.init (do_init_st1),
.read (do_cache_rd_st1),
.fill (do_fill_st1),
.flush (do_flush_st1),
.write (do_cache_wr_st1),
.way_sel (way_sel_st1),
.line_addr (addr_st1),
.wsel (wsel_st1),
.fill_data (fill_data_st1),
.write_data (write_data_st1),
.write_byteen(write_byteen_st1),
// inputs
.init (do_init_st0),
.fill (do_fill_st0 && ~pipe_stall),
.flush (do_flush_st0 && ~pipe_stall),
.read (do_read_st0 && ~pipe_stall),
.write (do_write_st0 && ~pipe_stall),
.evict_way (evict_way_st0),
.tag_matches(tag_matches_st0),
.line_idx (line_idx_st0),
.fill_data (data_st0),
.write_word (write_word_st0),
.word_idx (word_idx_st0),
.write_byteen(byteen_st0),
.way_idx_r (way_idx_st1),
// outputs
.read_data (read_data_st1),
.dirty_data (dirty_data_st1),
.dirty_byteen(dirty_byteen_st1)
.evict_byteen(evict_byteen_st1)
);
wire [MSHR_SIZE-1:0] mshr_lookup_pending_st0;
wire [MSHR_SIZE-1:0] mshr_lookup_rw_st0;
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~pipe_stall;
wire mshr_lookup_st0 = mshr_allocate_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~pipe_stall;
// only allocate MSHR entries for non-replay core requests
wire mshr_allocate_st0 = valid_st0 && is_creq_st0 && ~is_replay_st0;
wire mshr_finalize_st1 = valid_st1 && is_creq_st1 && ~is_replay_st1;
// release allocated mshr entry if we had a hit
wire mshr_release_st1;
if (WRITEBACK) begin
if (WRITEBACK) begin : g_mshr_release
assign mshr_release_st1 = is_hit_st1;
end else begin
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content
// this can happen when writes are sent late, when the fill was already in flight.
end else begin : g_mshr_release_ro
// we need to keep missed write requests in MSHR if there is already a pending entry to the same address.
// this ensures that missed write requests are replayed locally in case a pending fill arrives without the write content.
// this can happen when writes are sent to memory late, when a related fill was already in flight.
assign mshr_release_st1 = is_hit_st1 || (rw_st1 && ~mshr_pending_st1);
end
wire mshr_release_fire = mshr_finalize_st1 && mshr_release_st1 && ~pipe_stall;
wire [1:0] mshr_dequeue;
`POP_COUNT(mshr_dequeue, {replay_fire, mshr_release_fire});
VX_pending_size #(
.SIZE (MSHR_SIZE)
.SIZE (MSHR_SIZE),
.DECRW (2)
) mshr_pending_size (
.clk (clk),
.reset (reset),
.incr (core_req_fire),
.decr (replay_fire || (mshr_finalize_st1 && mshr_release_st1)),
.decr (mshr_dequeue),
.empty (mshr_empty),
`UNUSED_PIN (alm_empty),
.full (mshr_alm_full),
@ -492,11 +500,12 @@ module VX_cache_bank #(
);
VX_cache_mshr #(
.INSTANCE_ID ($sformatf("%s-mshr", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-mshr", INSTANCE_ID))),
.BANK_ID (BANK_ID),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.MSHR_SIZE (MSHR_SIZE),
.WRITEBACK (WRITEBACK),
.UUID_WIDTH (UUID_WIDTH),
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
) cache_mshr (
@ -504,7 +513,7 @@ module VX_cache_bank #(
.reset (reset),
.deq_req_uuid (req_uuid_sel),
.lkp_req_uuid (req_uuid_st0),
.alc_req_uuid (req_uuid_st0),
.fin_req_uuid (req_uuid_st1),
// memory fill
@ -521,37 +530,23 @@ module VX_cache_bank #(
.dequeue_ready (replay_ready),
// allocate
.allocate_valid (mshr_allocate_st0),
.allocate_valid (mshr_allocate_st0 && ~pipe_stall),
.allocate_addr (addr_st0),
.allocate_rw (rw_st0),
.allocate_data ({wsel_st0, byteen_st0, write_data_st0, tag_st0, req_idx_st0}),
.allocate_data ({word_idx_st0, byteen_st0, write_word_st0, tag_st0, req_idx_st0}),
.allocate_id (mshr_alloc_id_st0),
.allocate_prev (mshr_prev_st0),
.allocate_pending(mshr_pending_st0),
.allocate_previd(mshr_previd_st0),
`UNUSED_PIN (allocate_ready),
// lookup
.lookup_valid (mshr_lookup_st0),
.lookup_addr (addr_st0),
.lookup_pending (mshr_lookup_pending_st0),
.lookup_rw (mshr_lookup_rw_st0),
// finalize
.finalize_valid (mshr_finalize_st1),
.finalize_release(mshr_release_st1),
.finalize_pending(mshr_pending_st1),
.finalize_valid (mshr_finalize_st1 && ~pipe_stall),
.finalize_is_release(mshr_release_st1),
.finalize_is_pending(mshr_pending_st1),
.finalize_id (mshr_id_st1),
.finalize_prev (mshr_prev_st1)
.finalize_previd(mshr_previd_st1)
);
// check if there are pending requests to same line in the MSHR
wire [MSHR_SIZE-1:0] lookup_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign lookup_matches[i] = mshr_lookup_pending_st0[i]
&& (i != mshr_alloc_id_st0) // exclude current mshr id
&& (WRITEBACK || ~mshr_lookup_rw_st0[i]); // exclude write requests if writethrough
end
assign mshr_pending_st0 = (| lookup_matches);
// schedule core response
wire crsp_queue_valid, crsp_queue_ready;
@ -559,19 +554,19 @@ module VX_cache_bank #(
wire [REQ_SEL_WIDTH-1:0] crsp_queue_idx;
wire [TAG_WIDTH-1:0] crsp_queue_tag;
assign crsp_queue_valid = do_cache_rd_st1;
assign crsp_queue_valid = do_read_st1 && is_hit_st1;
assign crsp_queue_idx = req_idx_st1;
assign crsp_queue_data = read_data_st1;
assign crsp_queue_data = read_data_st1[word_idx_st1];
assign crsp_queue_tag = tag_st1;
VX_elastic_buffer #(
.DATAW (TAG_WIDTH + `CS_WORD_WIDTH + REQ_SEL_WIDTH),
.SIZE (CRSQ_SIZE),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
.OUT_REG (CORE_OUT_REG)
) core_rsp_queue (
.clk (clk),
.reset (reset),
.valid_in (crsp_queue_valid && ~rdw_hazard3_st1),
.valid_in (crsp_queue_valid),
.ready_in (crsp_queue_ready),
.data_in ({crsp_queue_tag, crsp_queue_data, crsp_queue_idx}),
.data_out ({core_rsp_tag, core_rsp_data, core_rsp_idx}),
@ -587,59 +582,93 @@ module VX_cache_bank #(
wire [`CS_LINE_WIDTH-1:0] mreq_queue_data;
wire [LINE_SIZE-1:0] mreq_queue_byteen;
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
wire [MSHR_ADDR_WIDTH-1:0] mreq_queue_id;
wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag;
wire mreq_queue_rw;
wire mreq_queue_flush;
wire [`UP(FLAGS_WIDTH)-1:0] mreq_queue_flags;
wire is_fill_or_flush_st1 = is_fill_st1 || is_flush_st1;
wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK);
wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
wire do_writeback_st1 = do_fill_or_flush_st1 && evict_dirty_st1;
wire do_writeback_st1 = do_fill_or_flush_st1 && is_dirty_st1;
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_addr_st1 = {evict_tag_st1, line_idx_st1};
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| dirty_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (evict_dirty_st1 == has_dirty_bytes), ("missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", evict_dirty_st1, has_dirty_bytes, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID)));
if (WRITE_ENABLE) begin : g_mreq_queue
if (WRITEBACK) begin : g_wb
if (DIRTY_BYTES) begin : g_dirty_bytes
// ensure dirty bytes match the tag info
wire has_dirty_bytes = (| evict_byteen_st1);
`RUNTIME_ASSERT (~do_fill_or_flush_st1 || (is_dirty_st1 == has_dirty_bytes), ("%t: missmatch dirty bytes: dirty_line=%b, dirty_bytes=%b, addr=0x%0h", $time, is_dirty_st1, has_dirty_bytes, `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID)))
end
// issue a fill request on a read/write miss
// issue a writeback on a dirty line eviction
assign mreq_queue_push = ((do_lookup_st1 && ~is_hit_st1 && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~pipe_stall;
assign mreq_queue_addr = is_fill_or_flush_st1 ? evict_addr_st1 : addr_st1;
assign mreq_queue_rw = is_fill_or_flush_st1;
assign mreq_queue_data = read_data_st1;
assign mreq_queue_byteen = is_fill_or_flush_st1 ? evict_byteen_st1 : '1;
`UNUSED_VAR (write_word_st1)
`UNUSED_VAR (byteen_st1)
end else begin : g_wt
wire [LINE_SIZE-1:0] line_byteen;
VX_demux #(
.DATAW (WORD_SIZE),
.N (`CS_WORDS_PER_LINE)
) byteen_demux (
.sel_in (word_idx_st1),
.data_in (byteen_st1),
.data_out (line_byteen)
);
// issue a fill request on a read miss
// issue a memory write on a write request
assign mreq_queue_push = ((do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1)
|| do_write_st1)
&& ~pipe_stall;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_rw = rw_st1;
assign mreq_queue_data = {`CS_WORDS_PER_LINE{write_word_st1}};
assign mreq_queue_byteen = rw_st1 ? line_byteen : '1;
`UNUSED_VAR (is_fill_or_flush_st1)
`UNUSED_VAR (do_writeback_st1)
`UNUSED_VAR (evict_addr_st1)
`UNUSED_VAR (evict_byteen_st1)
end
assign mreq_queue_push = (((do_read_miss_st1 || do_write_miss_st1) && ~mshr_pending_st1)
|| do_writeback_st1)
&& ~rdw_hazard3_st1;
end else begin
end else begin : g_mreq_queue_ro
// issue a fill request on a read miss
assign mreq_queue_push = (do_read_st1 && ~is_hit_st1 && ~mshr_pending_st1)
&& ~pipe_stall;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_rw = 0;
assign mreq_queue_data = '0;
assign mreq_queue_byteen = '1;
`UNUSED_VAR (do_writeback_st1)
assign mreq_queue_push = ((do_read_miss_st1 && ~mshr_pending_st1)
|| do_creq_wr_st1)
&& ~rdw_hazard3_st1;
`UNUSED_VAR (evict_addr_st1)
`UNUSED_VAR (evict_byteen_st1)
`UNUSED_VAR (write_word_st1)
`UNUSED_VAR (byteen_st1)
end
// Tag sent with each memory request.
// With UUIDs enabled it is the stage-1 request UUID concatenated above the
// MSHR id; otherwise it is the MSHR id alone.
if (UUID_WIDTH != 0) begin : g_mreq_queue_tag_uuid
assign mreq_queue_tag = {req_uuid_st1, mshr_id_st1};
end else begin : g_mreq_queue_tag
assign mreq_queue_tag = mshr_id_st1;
end
assign mreq_queue_pop = mem_req_valid && mem_req_ready;
assign mreq_queue_addr = addr_st1;
assign mreq_queue_id = mshr_id_st1;
assign mreq_queue_flush = creq_flush_st1;
if (WRITE_ENABLE) begin
assign mreq_queue_rw = WRITEBACK ? is_fill_or_flush_st1 : rw_st1;
assign mreq_queue_data = WRITEBACK ? dirty_data_st1 : write_data_st1;
assign mreq_queue_byteen = WRITEBACK ? dirty_byteen_st1 : write_byteen_st1;
end else begin
assign mreq_queue_rw = 0;
assign mreq_queue_data = 0;
assign mreq_queue_byteen = 0;
`UNUSED_VAR (dirty_data_st1)
`UNUSED_VAR (dirty_byteen_st1)
end
assign mreq_queue_flags = flags_st1;
VX_fifo_queue #(
.DATAW (1 + `CS_LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + 1),
.DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
.DEPTH (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-PIPELINE_STAGES),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
.ALM_FULL (MREQ_SIZE - PIPELINE_STAGES),
.OUT_REG (MEM_OUT_REG)
) mem_req_queue (
.clk (clk),
.reset (reset),
.push (mreq_queue_push),
.pop (mreq_queue_pop),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_id, mreq_queue_byteen, mreq_queue_data, mreq_queue_flush}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_id, mem_req_byteen, mem_req_data, mem_req_flush}),
.data_in ({mreq_queue_rw, mreq_queue_addr, mreq_queue_byteen, mreq_queue_data, mreq_queue_tag, mreq_queue_flags}),
.data_out ({mem_req_rw, mem_req_addr, mem_req_byteen, mem_req_data, mem_req_tag, mem_req_flags}),
.empty (mreq_queue_empty),
.alm_full (mreq_queue_alm_full),
`UNUSED_PIN (full),
@ -649,44 +678,101 @@ module VX_cache_bank #(
assign mem_req_valid = ~mreq_queue_empty;
`UNUSED_VAR (do_lookup_st0)
///////////////////////////////////////////////////////////////////////////////
`ifdef PERF_ENABLE
assign perf_read_misses = do_read_miss_st1;
assign perf_write_misses = do_write_miss_st1;
assign perf_mshr_stalls = mshr_alm_full;
assign perf_read_miss = do_read_st1 && ~is_hit_st1;
assign perf_write_miss = do_write_st1 && ~is_hit_st1;
assign perf_mshr_stall = mshr_alm_full;
`endif
`ifdef DBG_TRACE_CACHE
wire crsp_queue_fire = crsp_queue_valid && crsp_queue_ready;
wire input_stall = (replay_valid || mem_rsp_valid || core_req_valid || flush_valid)
&& ~(replay_fire || mem_rsp_fire || core_req_fire || flush_fire);
wire [`XLEN-1:0] mem_rsp_full_addr = `CS_BANK_TO_FULL_ADDR(mem_rsp_addr, BANK_ID);
wire [`XLEN-1:0] replay_full_addr = `CS_BANK_TO_FULL_ADDR(replay_addr, BANK_ID);
wire [`XLEN-1:0] core_req_full_addr = `CS_BANK_TO_FULL_ADDR(core_req_addr, BANK_ID);
wire [`XLEN-1:0] full_addr_st0 = `CS_BANK_TO_FULL_ADDR(addr_st0, BANK_ID);
wire [`XLEN-1:0] full_addr_st1 = `CS_BANK_TO_FULL_ADDR(addr_st1, BANK_ID);
wire [`XLEN-1:0] mreq_queue_full_addr = `CS_BANK_TO_FULL_ADDR(mreq_queue_addr, BANK_ID);
always @(posedge clk) begin
if (input_stall || pipe_stall) begin
`TRACE(3, ("%d: *** %s stall: crsq=%b, mreq=%b, mshr=%b, rdw1=%b, rdw2=%b, rdw3=%b\n", $time, INSTANCE_ID, crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full, rdw_hazard1_sel, rdw_hazard2_sel, rdw_hazard3_st1));
`TRACE(4, ("%t: *** %s stall: crsq=%b, mreq=%b, mshr=%b\n", $time, INSTANCE_ID,
crsp_queue_stall, mreq_queue_alm_full, mshr_alm_full))
end
if (mem_rsp_fire) begin
`TRACE(2, ("%d: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mem_rsp_addr, BANK_ID), mem_rsp_id, mem_rsp_data));
`TRACE(2, ("%t: %s fill-rsp: addr=0x%0h, mshr_id=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
mem_rsp_full_addr, mem_rsp_id, mem_rsp_data, req_uuid_sel))
end
if (replay_fire) begin
`TRACE(2, ("%d: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(replay_addr, BANK_ID), replay_tag, replay_idx, req_uuid_sel));
`TRACE(2, ("%t: %s mshr-pop: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID,
replay_full_addr, replay_tag, replay_idx, req_uuid_sel))
end
if (core_req_fire) begin
if (core_req_rw)
`TRACE(2, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel));
else
`TRACE(2, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(core_req_addr, BANK_ID), core_req_tag, core_req_idx, req_uuid_sel));
if (core_req_rw) begin
`TRACE(2, ("%t: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
core_req_full_addr, core_req_tag, core_req_idx, core_req_byteen, core_req_data, req_uuid_sel))
end else begin
`TRACE(2, ("%t: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID,
core_req_full_addr, core_req_tag, core_req_idx, req_uuid_sel))
end
end
if (do_init_st0) begin
`TRACE(3, ("%t: %s tags-init: addr=0x%0h, line=%0d\n", $time, INSTANCE_ID, full_addr_st0, line_idx_st0))
end
if (do_fill_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s tags-fill: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
end
if (do_flush_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s tags-flush: addr=0x%0h, way=%0d, line=%0d, dirty=%b (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, evict_way_st0, line_idx_st0, is_dirty_st0, req_uuid_st0))
end
if (do_lookup_st0 && ~pipe_stall) begin
if (is_hit_st0) begin
`TRACE(3, ("%t: %s tags-hit: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
end else begin
`TRACE(3, ("%t: %s tags-miss: addr=0x%0h, rw=%b, way=%0d, line=%0d, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, rw_st0, way_idx_st0, line_idx_st0, line_tag_st0, req_uuid_st0))
end
end
if (do_fill_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-fill: addr=0x%0h, way=%0d, line=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, way_idx_st0, line_idx_st0, data_st0, req_uuid_st0))
end
if (do_flush_st0 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-flush: addr=0x%0h, way=%0d, line=%0d (#%0d)\n", $time, INSTANCE_ID,
full_addr_st0, way_idx_st0, line_idx_st0, req_uuid_st0))
end
if (do_read_st1 && is_hit_st1 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-read: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, crsp_queue_data, req_uuid_st1))
end
if (do_write_st1 && is_hit_st1 && ~pipe_stall) begin
`TRACE(3, ("%t: %s data-write: addr=0x%0h, way=%0d, line=%0d, wsel=%0d, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st1, way_idx_st1, line_idx_st1, word_idx_st1, byteen_st1, write_word_st1, req_uuid_st1))
end
if (crsp_queue_fire) begin
`TRACE(2, ("%d: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(addr_st1, BANK_ID), crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1));
`TRACE(2, ("%t: %s core-rd-rsp: addr=0x%0h, tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
full_addr_st1, crsp_queue_tag, crsp_queue_idx, crsp_queue_data, req_uuid_st1))
end
if (mreq_queue_push) begin
if (do_creq_wr_st1 && !WRITEBACK)
`TRACE(2, ("%d: %s writethrough: addr=0x%0h, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data, req_uuid_st1));
else if (do_writeback_st1)
`TRACE(2, ("%d: %s writeback: addr=0x%0h, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_byteen, mreq_queue_data));
else
`TRACE(2, ("%d: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(mreq_queue_addr, BANK_ID), mreq_queue_id, req_uuid_st1));
if (!WRITEBACK && do_write_st1) begin
`TRACE(2, ("%t: %s writethrough: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
end else if (WRITEBACK && do_writeback_st1) begin
`TRACE(2, ("%t: %s writeback: addr=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID,
mreq_queue_full_addr, mreq_queue_byteen, mreq_queue_data, req_uuid_st1))
end else begin
`TRACE(2, ("%t: %s fill-req: addr=0x%0h, mshr_id=%0d (#%0d)\n", $time, INSTANCE_ID,
mreq_queue_full_addr, mshr_id_st1, req_uuid_st1))
end
end
end
`endif

View file

@ -15,10 +15,10 @@
module VX_cache_bypass #(
parameter NUM_REQS = 1,
parameter MEM_PORTS = 1,
parameter TAG_SEL_IDX = 0,
parameter PASSTHRU = 0,
parameter NC_ENABLE = 0,
parameter CACHE_ENABLE = 0,
parameter WORD_SIZE = 1,
parameter LINE_SIZE = 1,
@ -29,14 +29,11 @@ module VX_cache_bypass #(
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_TAG_IN_WIDTH = 1,
parameter MEM_TAG_OUT_WIDTH = 1,
parameter UUID_WIDTH = 0,
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0,
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
parameter MEM_OUT_BUF = 0
) (
input wire clk,
input wire reset,
@ -48,304 +45,222 @@ module VX_cache_bypass #(
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
// Memory request in
VX_mem_bus_if.slave mem_bus_in_if,
VX_mem_bus_if.slave mem_bus_in_if [MEM_PORTS],
// Memory request out
VX_mem_bus_if.master mem_bus_out_if
VX_mem_bus_if.master mem_bus_out_if [MEM_PORTS]
);
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam DIRECT_PASSTHRU = !CACHE_ENABLE && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == MEM_PORTS);
localparam CORE_DATA_WIDTH = WORD_SIZE * 8;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam MUX_DATAW = 1 + WORD_SIZE + CORE_ADDR_WIDTH + `ADDR_TYPE_WIDTH + CORE_DATA_WIDTH + CORE_TAG_WIDTH;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
localparam CORE_TAG_ID_WIDTH = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_WIDTH = `CLOG2(`CDIV(NUM_REQS, MEM_PORTS)) + CORE_TAG_ID_WIDTH;
localparam MEM_TAG_NC1_WIDTH = UUID_WIDTH + MEM_TAG_ID_WIDTH;
localparam MEM_TAG_NC2_WIDTH = MEM_TAG_NC1_WIDTH + WSEL_BITS;
localparam MEM_TAG_OUT_WIDTH = CACHE_ENABLE ? `MAX(MEM_TAG_IN_WIDTH, MEM_TAG_NC2_WIDTH) : MEM_TAG_NC2_WIDTH;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
// handle core requests ///////////////////////////////////////////////////
// handle non-cacheable core request switch ///////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH)
) core_bus_nc_switch_if[(CACHE_ENABLE ? 2 : 1) * NUM_REQS]();
wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_valids;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU != 0) begin
assign core_req_nc_idxs[i] = 1'b1;
end else if (NC_ENABLE) begin
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.atype[`ADDR_TYPE_IO];
end else begin
assign core_req_nc_idxs[i] = 1'b0;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_is_nc
if (CACHE_ENABLE) begin : g_cache
assign core_req_nc_sel[i] = ~core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
end else begin : g_no_cache
assign core_req_nc_sel[i] = 1'b0;
end
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
end
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P")
) core_req_nc_arb (
.clk (clk),
.reset (reset),
.requests (core_req_nc_valids),
.grant_index (core_req_nc_idx),
.grant_onehot (core_req_nc_sel),
.grant_valid (core_req_nc_valid),
.grant_ready (core_req_nc_ready)
VX_mem_switch #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS ((CACHE_ENABLE ? 2 : 1) * NUM_REQS),
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH),
.ARBITER ("R"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(CORE_OUT_BUF))
) core_bus_nc_switch (
.clk (clk),
.reset (reset),
.bus_sel (core_req_nc_sel),
.bus_in_if (core_bus_in_if),
.bus_out_if(core_bus_nc_switch_if)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
: core_bus_out_if[i].req_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH)
) core_bus_in_nc_if[NUM_REQS]();
// Unpack the flat core_bus_nc_switch_if array, one slice per request lane i:
//   index 0 * NUM_REQS + i  <-> core_bus_in_nc_if[i]
//   index 1 * NUM_REQS + i  <-> core_bus_out_if[i]   (only when CACHE_ENABLE)
// For each connection, req_valid/req_data and rsp_ready flow from the switch
// side to the target interface, while req_ready and rsp_valid/rsp_data flow
// back to the switch side.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_nc_switch_if
// slice 0: wired to core_bus_in_nc_if
assign core_bus_in_nc_if[i].req_valid = core_bus_nc_switch_if[0 * NUM_REQS + i].req_valid;
assign core_bus_in_nc_if[i].req_data = core_bus_nc_switch_if[0 * NUM_REQS + i].req_data;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].req_ready = core_bus_in_nc_if[i].req_ready;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_valid = core_bus_in_nc_if[i].rsp_valid;
assign core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_data = core_bus_in_nc_if[i].rsp_data;
assign core_bus_in_nc_if[i].rsp_ready = core_bus_nc_switch_if[0 * NUM_REQS + i].rsp_ready;
if (CACHE_ENABLE) begin : g_cache
// slice 1: wired to core_bus_out_if
assign core_bus_out_if[i].req_valid = core_bus_nc_switch_if[1 * NUM_REQS + i].req_valid;
assign core_bus_out_if[i].req_data = core_bus_nc_switch_if[1 * NUM_REQS + i].req_data;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].req_ready = core_bus_out_if[i].req_ready;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_valid = core_bus_out_if[i].rsp_valid;
assign core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_data = core_bus_out_if[i].rsp_data;
assign core_bus_out_if[i].rsp_ready = core_bus_nc_switch_if[1 * NUM_REQS + i].rsp_ready;
end else begin : g_no_cache
// no slice 1 exists without CACHE_ENABLE; core_bus_out_if[i] is handed to
// INIT_VX_MEM_BUS_IF instead (macro defined elsewhere; presumably ties the
// interface off to idle defaults - confirm against its definition)
`INIT_VX_MEM_BUS_IF (core_bus_out_if[i])
end
end
// handle memory requests /////////////////////////////////////////////////
wire mem_req_out_valid;
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`ADDR_TYPE_WIDTH-1:0] mem_req_out_atype;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (MEM_TAG_NC1_WIDTH)
) core_bus_nc_arb_if[MEM_PORTS]();
wire core_req_nc_sel_rw;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire [`ADDR_TYPE_WIDTH-1:0] core_req_nc_sel_atype;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
VX_mem_arb #(
.NUM_INPUTS (NUM_REQS),
.NUM_OUTPUTS(MEM_PORTS),
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_WIDTH),
.TAG_SEL_IDX(TAG_SEL_IDX),
.ARBITER (CACHE_ENABLE ? "P" : "R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(0)
) core_bus_nc_arb (
.clk (clk),
.reset (reset),
.bus_in_if (core_bus_in_nc_if),
.bus_out_if (core_bus_nc_arb_if)
);
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.atype,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.tag
};
end
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_NC2_WIDTH)
) mem_bus_out_nc_if[MEM_PORTS]();
assign {
core_req_nc_sel_rw,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_atype,
core_req_nc_sel_data,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_nc
wire core_req_nc_arb_rw;
wire [WORD_SIZE-1:0] core_req_nc_arb_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_arb_addr;
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_arb_flags;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_arb_data;
wire [MEM_TAG_NC1_WIDTH-1:0] core_req_nc_arb_tag;
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
assign {
core_req_nc_arb_rw,
core_req_nc_arb_addr,
core_req_nc_arb_data,
core_req_nc_arb_byteen,
core_req_nc_arb_flags,
core_req_nc_arb_tag
} = core_bus_nc_arb_if[i].req_data;
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_atype = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.atype : core_req_nc_sel_atype;
logic [MEM_ADDR_WIDTH-1:0] core_req_nc_arb_addr_w;
logic [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] core_req_nc_arb_byteen_w;
logic [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] core_req_nc_arb_data_w;
logic [CORE_DATA_WIDTH-1:0] core_rsp_nc_arb_data_w;
wire [MEM_TAG_NC2_WIDTH-1:0] core_req_nc_arb_tag_w;
wire [MEM_TAG_NC1_WIDTH-1:0] core_rsp_nc_arb_tag_w;
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
if (WORDS_PER_LINE > 1) begin
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
always @(*) begin
mem_req_byteen_in_r = '0;
mem_req_byteen_in_r[req_wsel] = core_req_nc_sel_byteen;
mem_req_data_in_r = 'x;
mem_req_data_in_r[req_wsel] = core_req_nc_sel_data;
end
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({req_wsel, core_req_in_id});
end
end else begin
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_in_id});
end
end
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
end else begin
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end
if (PASSTHRU != 0) begin
assign mem_req_out_tag = mem_req_tag_bypass;
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
end else begin
if (NC_ENABLE) begin
if (WORDS_PER_LINE > 1) begin : g_multi_word_line
wire [WSEL_BITS-1:0] rsp_wsel;
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_arb_addr[WSEL_BITS-1:0];
always @(*) begin
core_req_nc_arb_byteen_w = '0;
core_req_nc_arb_byteen_w[req_wsel] = core_req_nc_arb_byteen;
core_req_nc_arb_data_w = 'x;
core_req_nc_arb_data_w[req_wsel] = core_req_nc_arb_data;
end
VX_bits_insert #(
.N (MEM_TAG_OUT_WIDTH-1),
.S (1),
.N (MEM_TAG_NC1_WIDTH),
.S (WSEL_BITS),
.POS (TAG_SEL_IDX)
) mem_req_tag_in_nc_insert (
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
.ins_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_out_tag)
) wsel_insert (
.data_in (core_req_nc_arb_tag),
.ins_in (req_wsel),
.data_out (core_req_nc_arb_tag_w)
);
end else begin
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
VX_bits_remove #(
.N (MEM_TAG_NC2_WIDTH),
.S (WSEL_BITS),
.POS (TAG_SEL_IDX)
) wsel_remove (
.data_in (mem_bus_out_nc_if[i].rsp_data.tag),
.sel_out (rsp_wsel),
.data_out (core_rsp_nc_arb_tag_w)
);
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end else begin : g_single_word_line
assign core_req_nc_arb_addr_w = core_req_nc_arb_addr;
assign core_req_nc_arb_byteen_w = core_req_nc_arb_byteen;
assign core_req_nc_arb_data_w = core_req_nc_arb_data;
assign core_req_nc_arb_tag_w = MEM_TAG_NC2_WIDTH'(core_req_nc_arb_tag);
assign core_rsp_nc_arb_data_w = mem_bus_out_nc_if[i].rsp_data.data;
assign core_rsp_nc_arb_tag_w = MEM_TAG_NC1_WIDTH'(mem_bus_out_nc_if[i].rsp_data.tag);
end
assign mem_bus_out_nc_if[i].req_valid = core_bus_nc_arb_if[i].req_valid;
assign mem_bus_out_nc_if[i].req_data = {
core_req_nc_arb_rw,
core_req_nc_arb_addr_w,
core_req_nc_arb_data_w,
core_req_nc_arb_byteen_w,
core_req_nc_arb_flags,
core_req_nc_arb_tag_w
};
assign core_bus_nc_arb_if[i].req_ready = mem_bus_out_nc_if[i].req_ready;
assign core_bus_nc_arb_if[i].rsp_valid = mem_bus_out_nc_if[i].rsp_valid;
assign core_bus_nc_arb_if[i].rsp_data = {
core_rsp_nc_arb_data_w,
core_rsp_nc_arb_tag_w
};
assign mem_bus_out_nc_if[i].rsp_ready = core_bus_nc_arb_if[i].rsp_ready;
end
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_OUT_WIDTH)
) mem_bus_out_src_if[(CACHE_ENABLE ? 2 : 1) * MEM_PORTS]();
// Populate the flat mem_bus_out_src_if array, one entry pair per memory port i:
//   index 0 * MEM_PORTS + i  <- mem_bus_out_nc_if[i]  (tag width MEM_TAG_NC2_WIDTH)
//   index 1 * MEM_PORTS + i  <- mem_bus_in_if[i]      (tag width MEM_TAG_IN_WIDTH,
//                                                      only when CACHE_ENABLE)
// Both sources are attached through ASSIGN_VX_MEM_BUS_IF_EX with
// MEM_TAG_OUT_WIDTH as the destination tag width.
// NOTE(review): the macro is defined elsewhere; it presumably resizes the tag
// while keeping the UUID_WIDTH-wide UUID field in place - confirm.
// Without CACHE_ENABLE, mem_bus_in_if[i] is marked unused.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_out_src
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[0 * MEM_PORTS + i], mem_bus_out_nc_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_NC2_WIDTH, UUID_WIDTH);
if (CACHE_ENABLE) begin : g_cache
`ASSIGN_VX_MEM_BUS_IF_EX(mem_bus_out_src_if[1 * MEM_PORTS + i], mem_bus_in_if[i], MEM_TAG_OUT_WIDTH, MEM_TAG_IN_WIDTH, UUID_WIDTH);
end else begin : g_no_cache
`UNUSED_VX_MEM_BUS_IF(mem_bus_in_if[i])
end
end
assign mem_bus_in_if.req_ready = mem_req_out_ready;
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `ADDR_TYPE_WIDTH + `CS_LINE_WIDTH + MEM_TAG_OUT_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_out_valid),
.ready_in (mem_req_out_ready),
.data_in ({mem_req_out_rw, mem_req_out_byteen, mem_req_out_addr, mem_req_out_atype, mem_req_out_data, mem_req_out_tag}),
.data_out ({mem_bus_out_if.req_data.rw, mem_bus_out_if.req_data.byteen, mem_bus_out_if.req_data.addr, mem_bus_out_if.req_data.atype, mem_bus_out_if.req_data.data, mem_bus_out_if.req_data.tag}),
.valid_out (mem_bus_out_if.req_valid),
.ready_out (mem_bus_out_if.req_ready)
VX_mem_arb #(
.NUM_INPUTS ((CACHE_ENABLE ? 2 : 1) * MEM_PORTS),
.NUM_OUTPUTS(MEM_PORTS),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_OUT_WIDTH),
.ARBITER ("R"),
.REQ_OUT_BUF(DIRECT_PASSTHRU ? 0 : `TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.RSP_OUT_BUF(0)
) mem_bus_out_arb (
.clk (clk),
.reset (reset),
.bus_in_if (mem_bus_out_src_if),
.bus_out_if (mem_bus_out_if)
);
// handle core responses //////////////////////////////////////////////////
wire [NUM_REQS-1:0] core_rsp_in_valid;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
wire [NUM_REQS-1:0] core_rsp_in_ready;
wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
end else begin
if (NC_ENABLE) begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin
assign is_mem_rsp_nc = 1'b0;
end
end
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (TAG_SEL_IDX)
) mem_rsp_tag_in_nc_remove (
.data_in (mem_bus_out_if.rsp_data.tag),
.data_out (mem_rsp_tag_id_nc)
);
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
if (NUM_REQS > 1) begin
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin
assign rsp_idx = 1'b0;
end
reg [NUM_REQS-1:0] rsp_nc_valid_r;
always @(*) begin
rsp_nc_valid_r = '0;
rsp_nc_valid_r[rsp_idx] = is_mem_rsp_nc;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_valid[i] = core_bus_out_if[i].rsp_valid || rsp_nc_valid_r[i];
assign core_bus_out_if[i].rsp_ready = core_rsp_in_ready[i];
end
if (WORDS_PER_LINE > 1) begin
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
end
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
end
end
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
if (UUID_WIDTH != 0) begin
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
end else begin
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU) begin
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
end else if (NC_ENABLE) begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
end else begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_in_valid[i]),
.ready_in (core_rsp_in_ready[i]),
.data_in ({core_rsp_in_data[i], core_rsp_in_tag[i]}),
.data_out ({core_bus_in_if[i].rsp_data.data, core_bus_in_if[i].rsp_data.tag}),
.valid_out (core_bus_in_if[i].rsp_valid),
.ready_out (core_bus_in_if[i].rsp_ready)
);
end
// handle memory responses ////////////////////////////////////////////////
if (PASSTHRU != 0) begin
assign mem_bus_in_if.rsp_valid = 1'b0;
assign mem_bus_in_if.rsp_data.data = '0;
assign mem_bus_in_if.rsp_data.tag = '0;
end else if (NC_ENABLE) begin
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
end else begin
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
end
wire [NUM_REQS-1:0] core_rsp_out_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
end
assign mem_bus_out_if.rsp_ready = is_mem_rsp_nc ? (~core_rsp_out_valid[rsp_idx] && core_rsp_in_ready[rsp_idx]) : mem_bus_in_if.rsp_ready;
endmodule

View file

@ -23,23 +23,26 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Number of requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = 32768,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 16,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
parameter CRSQ_SIZE = 4,
// Miss Reservation Queue Size
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
parameter MRSQ_SIZE = 4,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
@ -52,20 +55,26 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// core request flags
parameter FLAGS_WIDTH = 0,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
parameter CORE_OUT_BUF = 3,
// Memory request output buffer
parameter MEM_OUT_BUF = 0
parameter MEM_OUT_BUF = 3
) (
input wire clk,
input wire reset,
@ -76,14 +85,16 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
`endif
VX_mem_bus_if.slave core_bus_if [NUM_INPUTS * NUM_REQS],
VX_mem_bus_if.master mem_bus_if
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
);
localparam NUM_CACHES = `UP(NUM_UNITS);
localparam PASSTHRU = (NUM_UNITS == 0);
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH);
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
`STATIC_ASSERT(NUM_INPUTS >= NUM_CACHES, ("invalid parameter"))
@ -95,16 +106,14 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) cache_mem_bus_if[NUM_CACHES]();
) cache_mem_bus_if[NUM_CACHES * MEM_PORTS]();
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_if[NUM_CACHES * NUM_REQS]();
`RESET_RELAY_EX (cache_arb_reset, reset, NUM_REQS, `MAX_FANOUT);
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_arb
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
@ -115,7 +124,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_WIDTH (ARB_TAG_WIDTH)
) arb_core_bus_tmp_if[NUM_CACHES]();
for (genvar j = 0; j < NUM_INPUTS; ++j) begin
for (genvar j = 0; j < NUM_INPUTS; ++j) begin : g_core_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (core_bus_tmp_if[j], core_bus_if[j * NUM_REQS + i]);
end
@ -127,40 +136,40 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0),
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : 0)
) cache_arb (
.RSP_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? CORE_OUT_BUF : 0)
) core_arb (
.clk (clk),
.reset (cache_arb_reset[i]),
.reset (reset),
.bus_in_if (core_bus_tmp_if),
.bus_out_if (arb_core_bus_tmp_if)
);
for (genvar k = 0; k < NUM_CACHES; ++k) begin
for (genvar k = 0; k < NUM_CACHES; ++k) begin : g_arb_core_bus_if
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_if[k * NUM_REQS + i], arb_core_bus_tmp_if[k]);
end
end
for (genvar i = 0; i < NUM_CACHES; ++i) begin : caches
`RESET_RELAY (cache_reset, reset);
for (genvar i = 0; i < NUM_CACHES; ++i) begin : g_cache_wrap
VX_cache_wrap #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, i)),
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, i))),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.FLAGS_WIDTH (FLAGS_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
@ -171,32 +180,48 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.cache_perf (perf_cache_unit[i]),
`endif
.clk (clk),
.reset (cache_reset),
.reset (reset),
.core_bus_if (arb_core_bus_if[i * NUM_REQS +: NUM_REQS]),
.mem_bus_if (cache_mem_bus_if[i])
.mem_bus_if (cache_mem_bus_if[i * MEM_PORTS +: MEM_PORTS])
);
end
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) arb_core_bus_tmp_if[NUM_CACHES]();
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (cache_mem_bus_if),
.bus_out_if (mem_bus_tmp_if)
);
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1))
) mem_bus_tmp_if[1]();
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_tmp_if[0]);
for (genvar j = 0; j < NUM_CACHES; ++j) begin : g_arb_core_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (arb_core_bus_tmp_if[j], cache_mem_bus_if[j * MEM_PORTS + i]);
end
VX_mem_arb #(
.NUM_INPUTS (NUM_CACHES),
.NUM_OUTPUTS (1),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? MEM_OUT_BUF : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
) mem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (arb_core_bus_tmp_if),
.bus_out_if (mem_bus_tmp_if)
);
if (WRITE_ENABLE) begin : g_we
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
end else begin : g_ro
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[0]);
end
end
endmodule

View file

@ -14,8 +14,6 @@
`include "VX_cache_define.vh"
module VX_cache_data #(
parameter `STRING INSTANCE_ID= "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
@ -31,169 +29,116 @@ module VX_cache_data #(
// Enable cache writeback
parameter WRITEBACK = 0,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
parameter DIRTY_BYTES = 0
) (
input wire clk,
input wire reset,
`IGNORE_UNUSED_BEGIN
input wire[`UP(UUID_WIDTH)-1:0] req_uuid,
`IGNORE_UNUSED_END
input wire stall,
// inputs
input wire init,
input wire read,
input wire fill,
input wire flush,
input wire read,
input wire write,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] wsel,
input wire [`CS_LINE_SEL_BITS-1:0] line_idx,
input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way,
input wire [NUM_WAYS-1:0] tag_matches,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] fill_data,
input wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] write_data,
input wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_byteen,
input wire [NUM_WAYS-1:0] way_sel,
output wire [`CS_WORD_WIDTH-1:0] read_data,
output wire [`CS_LINE_WIDTH-1:0] dirty_data,
output wire [LINE_SIZE-1:0] dirty_byteen
input wire [`CS_WORD_WIDTH-1:0] write_word,
input wire [WORD_SIZE-1:0] write_byteen,
input wire [`UP(`CS_WORD_SEL_BITS)-1:0] word_idx,
input wire [`CS_WAY_SEL_WIDTH-1:0] way_idx_r,
// outputs
output wire [`CS_LINE_WIDTH-1:0] read_data,
output wire [LINE_SIZE-1:0] evict_byteen
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_PARAM (WORD_SIZE)
`UNUSED_VAR (stall)
`UNUSED_VAR (line_addr)
`UNUSED_VAR (init)
`UNUSED_VAR (read)
`UNUSED_VAR (flush)
localparam BYTEENW = (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) ? (LINE_SIZE * NUM_WAYS) : 1;
wire [`CS_WORDS_PER_LINE-1:0][WORD_SIZE-1:0] write_mask;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin : g_write_mask
wire word_en = (`CS_WORDS_PER_LINE == 1) || (word_idx == i);
assign write_mask[i] = write_byteen & {WORD_SIZE{word_en}};
end
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
if (DIRTY_BYTES != 0) begin : g_dirty_bytes
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] byteen_rdata;
if (WRITEBACK) begin
if (DIRTY_BYTES) begin
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_rdata;
wire [NUM_WAYS-1:0][LINE_SIZE-1:0] bs_wdata;
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire [LINE_SIZE-1:0] wdata = write ? (bs_rdata[i] | write_byteen) : ((fill || flush) ? '0 : bs_rdata[i]);
assign bs_wdata[i] = init ? '0 : (way_sel[i] ? wdata : bs_rdata[i]);
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_byteen_store
wire [LINE_SIZE-1:0] byteen_wdata = {LINE_SIZE{write}}; // only asserted on writes
wire [LINE_SIZE-1:0] byteen_wren = {LINE_SIZE{init || fill || flush}} | write_mask;
wire byteen_write = ((fill || flush) && ((NUM_WAYS == 1) || (evict_way == i)))
|| (write && tag_matches[i])
|| init;
wire byteen_read = fill || flush;
VX_sp_ram #(
.DATAW (LINE_SIZE * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK)
.DATAW (LINE_SIZE),
.WRENW (LINE_SIZE),
.SIZE (`CS_LINES_PER_BANK),
.OUT_REG (1),
.RDW_MODE ("R")
) byteen_store (
.clk (clk),
.reset (reset),
.read (write || fill || flush),
.write (init || write || fill || flush),
.wren (1'b1),
.addr (line_sel),
.wdata (bs_wdata),
.rdata (bs_rdata)
.read (byteen_read),
.write (byteen_write),
.wren (byteen_wren),
.addr (line_idx),
.wdata (byteen_wdata),
.rdata (byteen_rdata[i])
);
assign dirty_byteen = bs_rdata[way_idx];
end else begin
assign dirty_byteen = {LINE_SIZE{1'b1}};
end
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] flipped_rdata;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign flipped_rdata[j][i] = line_rdata[i][j];
end
end
assign dirty_data = flipped_rdata[way_idx];
end else begin
assign dirty_byteen = '0;
assign dirty_data = '0;
assign evict_byteen = byteen_rdata[way_idx_r];
end else begin : g_no_dirty_bytes
`UNUSED_VAR (init)
`UNUSED_VAR (flush)
assign evict_byteen = '1; // update whole line
end
// order the data layout to perform ways multiplexing last.
// this allows converting the way index to binary in parallel with the BRAM read access and way selection.
wire [NUM_WAYS-1:0][`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_rdata;
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [BYTEENW-1:0] line_wren;
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_data_store
if (WRITE_ENABLE != 0 || (NUM_WAYS > 1)) begin
wire [`CS_WORDS_PER_LINE-1:0][NUM_WAYS-1:0][WORD_SIZE-1:0] wren_w;
for (genvar i = 0; i < `CS_WORDS_PER_LINE; ++i) begin
for (genvar j = 0; j < NUM_WAYS; ++j) begin
assign line_wdata[i][j] = (fill || !WRITE_ENABLE) ? fill_data[i] : write_data[i];
assign wren_w[i][j] = ((fill || !WRITE_ENABLE) ? {WORD_SIZE{1'b1}} : write_byteen[i])
& {WORD_SIZE{(way_sel[j] || (NUM_WAYS == 1))}};
end
localparam WRENW = WRITE_ENABLE ? LINE_SIZE : 1;
wire [`CS_WORDS_PER_LINE-1:0][`CS_WORD_WIDTH-1:0] line_wdata;
wire [WRENW-1:0] line_wren;
if (WRITE_ENABLE) begin : g_wren
assign line_wdata = fill ? fill_data : {`CS_WORDS_PER_LINE{write_word}};
assign line_wren = {LINE_SIZE{fill}} | write_mask;
end else begin : g_no_wren
`UNUSED_VAR (write_word)
`UNUSED_VAR (write_mask)
assign line_wdata = fill_data;
assign line_wren = 1'b1;
end
assign line_wren = wren_w;
end else begin
`UNUSED_VAR (write)
`UNUSED_VAR (write_byteen)
`UNUSED_VAR (write_data)
assign line_wdata = fill_data;
assign line_wren = fill;
wire line_write = (fill && ((NUM_WAYS == 1) || (evict_way == i)))
|| (write && tag_matches[i] && WRITE_ENABLE);
wire line_read = read || ((fill || flush) && WRITEBACK);
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.WRENW (WRENW),
.OUT_REG (1),
.RDW_MODE ("R")
) data_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),
.addr (line_idx),
.wdata (line_wdata),
.rdata (line_rdata[i])
);
end
VX_onehot_encoder #(
.N (NUM_WAYS)
) way_enc (
.data_in (way_sel),
.data_out (way_idx),
`UNUSED_PIN (valid_out)
);
wire line_read = (read && ~stall)
|| (WRITEBACK && (fill || flush));
wire line_write = write || fill;
VX_sp_ram #(
.DATAW (`CS_LINE_WIDTH * NUM_WAYS),
.SIZE (`CS_LINES_PER_BANK),
.WRENW (BYTEENW),
.NO_RWCHECK (1),
.RW_ASSERT (1)
) data_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (line_wren),
.addr (line_sel),
.wdata (line_wdata),
.rdata (line_rdata)
);
wire [NUM_WAYS-1:0][`CS_WORD_WIDTH-1:0] per_way_rdata;
if (`CS_WORDS_PER_LINE > 1) begin
assign per_way_rdata = line_rdata[wsel];
end else begin
`UNUSED_VAR (wsel)
assign per_way_rdata = line_rdata;
end
assign read_data = per_way_rdata[way_idx];
`ifdef DBG_TRACE_CACHE
always @(posedge clk) begin
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, fill_data));
end
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, byteen=%h, data=0x%h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, dirty_byteen, dirty_data));
end
if (read && ~stall) begin
`TRACE(3, ("%d: %s read: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, read_data, req_uuid));
end
if (write && ~stall) begin
`TRACE(3, ("%d: %s write: addr=0x%0h, way=%b, blk_addr=%0d, wsel=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), way_sel, line_sel, wsel, write_byteen[wsel], write_data[wsel], req_uuid));
end
end
`endif
assign read_data = line_rdata[way_idx_r];
endmodule

View file

@ -22,6 +22,7 @@
`define CS_LINE_WIDTH (8 * LINE_SIZE)
`define CS_BANK_SIZE (CACHE_SIZE / NUM_BANKS)
`define CS_WAY_SEL_BITS `CLOG2(NUM_WAYS)
`define CS_WAY_SEL_WIDTH `UP(`CS_WAY_SEL_BITS)
`define CS_LINES_PER_BANK (`CS_BANK_SIZE / (LINE_SIZE * NUM_WAYS))
`define CS_WORDS_PER_LINE (LINE_SIZE / WORD_SIZE)
@ -54,12 +55,7 @@
///////////////////////////////////////////////////////////////////////////////
`define CS_LINE_TO_MEM_ADDR(x, i) {x, `CS_BANK_SEL_BITS'(i)}
`define CS_MEM_ADDR_TO_BANK_ID(x) x[0 +: `CS_BANK_SEL_BITS]
`define CS_MEM_TAG_TO_REQ_ID(x) x[MSHR_ADDR_WIDTH-1:0]
`define CS_MEM_TAG_TO_BANK_ID(x) x[MSHR_ADDR_WIDTH +: `CS_BANK_SEL_BITS]
`define CS_LINE_TO_FULL_ADDR(x, i) {x, (`XLEN-$bits(x))'(i << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_BANK_TO_FULL_ADDR(x, b) {x, (`XLEN-$bits(x))'(b << (`XLEN-$bits(x)-`CS_BANK_SEL_BITS))}
`define CS_MEM_TO_FULL_ADDR(x) {x, (`XLEN-$bits(x))'(0)}
///////////////////////////////////////////////////////////////////////////////
@ -74,4 +70,10 @@
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
///////////////////////////////////////////////////////////////////////////////
`define CS_REPL_RANDOM 0
`define CS_REPL_FIFO 1
`define CS_REPL_PLRU 2
`endif // VX_CACHE_DEFINE_VH

View file

@ -18,6 +18,10 @@ module VX_cache_flush #(
parameter NUM_REQS = 4,
// Number of banks
parameter NUM_BANKS = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// Bank select latency
parameter BANK_SEL_LATENCY = 1
) (
@ -27,8 +31,11 @@ module VX_cache_flush #(
VX_mem_bus_if.master core_bus_out_if [NUM_REQS],
input wire [NUM_BANKS-1:0] bank_req_fire,
output wire [NUM_BANKS-1:0] flush_begin,
output wire [`UP(UUID_WIDTH)-1:0] flush_uuid,
input wire [NUM_BANKS-1:0] flush_end
);
`UNUSED_PARAM (TAG_WIDTH)
localparam STATE_IDLE = 0;
localparam STATE_WAIT1 = 1;
localparam STATE_FLUSH = 2;
@ -41,13 +48,13 @@ module VX_cache_flush #(
wire no_inflight_reqs;
if (BANK_SEL_LATENCY != 0) begin
if (BANK_SEL_LATENCY != 0) begin : g_bank_sel_latency
localparam NUM_REQS_W = `CLOG2(NUM_REQS+1);
localparam NUM_BANKS_W = `CLOG2(NUM_BANKS+1);
wire [NUM_REQS-1:0] core_bus_out_fire;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_fire
assign core_bus_out_fire[i] = core_bus_out_if[i].req_valid && core_bus_out_if[i].req_ready;
end
@ -74,7 +81,7 @@ module VX_cache_flush #(
`UNUSED_PIN (size)
);
end else begin
end else begin : g_no_bank_sel_latency
assign no_inflight_reqs = 0;
`UNUSED_VAR (bank_req_fire)
end
@ -82,28 +89,38 @@ module VX_cache_flush #(
reg [NUM_BANKS-1:0] flush_done, flush_done_n;
wire [NUM_REQS-1:0] flush_req_mask;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.atype[`ADDR_TYPE_FLUSH];
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_flush_req_mask
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH];
end
wire flush_req_enable = (| flush_req_mask);
reg [NUM_REQS-1:0] lock_released, lock_released_n;
reg [`UP(UUID_WIDTH)-1:0] flush_uuid_r, flush_uuid_n;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_req
wire input_enable = ~flush_req_enable || lock_released[i];
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && input_enable;
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_bus_out_if[i].req_ready && input_enable;
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_in_rsp
assign core_bus_in_if[i].rsp_valid = core_bus_out_if[i].rsp_valid;
assign core_bus_in_if[i].rsp_data = core_bus_out_if[i].rsp_data;
assign core_bus_out_if[i].rsp_ready = core_bus_in_if[i].rsp_ready;
end
reg [NUM_REQS-1:0][`UP(UUID_WIDTH)-1:0] core_bus_out_uuid;
wire [NUM_REQS-1:0] core_bus_out_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_uuid
if (UUID_WIDTH != 0) begin : g_uuid
assign core_bus_out_uuid[i] = core_bus_in_if[i].req_data.tag.uuid;
end else begin : g_no_uuid
assign core_bus_out_uuid[i] = 0;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_out_ready
assign core_bus_out_ready[i] = core_bus_out_if[i].req_ready;
end
@ -111,10 +128,17 @@ module VX_cache_flush #(
state_n = state;
flush_done_n = flush_done;
lock_released_n = lock_released;
flush_uuid_n = flush_uuid_r;
case (state)
STATE_IDLE: begin
//STATE_IDLE:
default: begin
if (flush_req_enable) begin
state_n = (BANK_SEL_LATENCY != 0) ? STATE_WAIT1 : STATE_FLUSH;
for (integer i = NUM_REQS-1; i >= 0; --i) begin
if (flush_req_mask[i]) begin
flush_uuid_n = core_bus_out_uuid[i];
end
end
end
end
STATE_WAIT1: begin
@ -158,8 +182,10 @@ module VX_cache_flush #(
flush_done <= flush_done_n;
lock_released <= lock_released_n;
end
flush_uuid_r <= flush_uuid_n;
end
assign flush_begin = {NUM_BANKS{state == STATE_FLUSH}};
assign flush_uuid = flush_uuid_r;
endmodule

View file

@ -24,36 +24,23 @@
// arrival and are dequeued in the same order.
// Each entry has a next pointer to the next entry pending for the same cache line.
//
// During the fill operation, the MSHR will release the MSHR entry at fill_id
// During the fill request, the MSHR will dequeue the MSHR entry at the fill_id location
// which represents the first request in the pending list that initiated the memory fill.
//
// The dequeue operation directly follows the fill operation and will release
// The dequeue response directly follows the fill request and will release
// all the subsequent entries linked to fill_id (pending the same cache line).
//
// During the allocation operation, the MSHR will allocate the next free slot
// During the allocation request, the MSHR will allocate the next free slot
// for the incoming core request. We return the allocated slot id as well as
// the slot id of the previous entry for the same cache line. This is used to
// link the new entry to the pending list during finalization.
// link the new entry to the pending list.
//
// The lookup operation is used to find all pending entries for a given cache line.
// This is used to by the cache bank to determine if a cache miss is already pending
// and therefore avoid issuing a memory fill request.
//
// The finalize operation is used to release the allocated MSHR entry if we had a hit.
// If we had a miss and finalize_pending is true, we link the allocated entry to
// its corresponding pending list (via finalize_prev).
// The finalize request is used to persist or release the currently allocated MSHR entry
// if we had a cache miss or a hit, respectively.
//
// Warning: This MSHR implementation is strongly coupled with the bank pipeline
// and as such changes to either module requires careful evaluation.
//
// This architecture implements three pipeline stages:
// - Arbitration: cache bank arbitration before entering pipeline.
// fill and dequeue operations are executed at this stage.
// - stage 0: cache bank tag access stage.
// allocate and lookup operations are executed at this stage.
// - stage 1: cache bank tdatag access stage.
// finalize operation is executed at this stage.
//
module VX_cache_mshr #(
parameter `STRING INSTANCE_ID= "",
@ -68,6 +55,9 @@ module VX_cache_mshr #(
parameter UUID_WIDTH = 0,
// MSHR parameters
parameter DATA_WIDTH = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
parameter MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE)
) (
input wire clk,
@ -75,7 +65,7 @@ module VX_cache_mshr #(
`IGNORE_UNUSED_BEGIN
input wire[`UP(UUID_WIDTH)-1:0] deq_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] lkp_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] alc_req_uuid,
input wire[`UP(UUID_WIDTH)-1:0] fin_req_uuid,
`IGNORE_UNUSED_END
@ -98,26 +88,21 @@ module VX_cache_mshr #(
input wire allocate_rw,
input wire [DATA_WIDTH-1:0] allocate_data,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_id,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_prev,
output wire allocate_pending,
output wire [MSHR_ADDR_WIDTH-1:0] allocate_previd,
output wire allocate_ready,
// lookup
input wire lookup_valid,
input wire [`CS_LINE_ADDR_WIDTH-1:0] lookup_addr,
output wire [MSHR_SIZE-1:0] lookup_pending,
output wire [MSHR_SIZE-1:0] lookup_rw,
// finalize
input wire finalize_valid,
input wire finalize_release,
input wire finalize_pending,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_prev
input wire finalize_is_release,
input wire finalize_is_pending,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_previd,
input wire [MSHR_ADDR_WIDTH-1:0] finalize_id
);
`UNUSED_PARAM (BANK_ID)
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [MSHR_SIZE-1:0];
reg [MSHR_ADDR_WIDTH-1:0] next_index [MSHR_SIZE-1:0];
reg [`CS_LINE_ADDR_WIDTH-1:0] addr_table [0:MSHR_SIZE-1];
reg [MSHR_ADDR_WIDTH-1:0] next_index [0:MSHR_SIZE-1];
reg [MSHR_SIZE-1:0] valid_table, valid_table_n;
reg [MSHR_SIZE-1:0] next_table, next_table_x, next_table_n;
@ -135,8 +120,8 @@ module VX_cache_mshr #(
wire dequeue_fire = dequeue_valid && dequeue_ready;
wire [MSHR_SIZE-1:0] addr_matches;
for (genvar i = 0; i < MSHR_SIZE; ++i) begin
assign addr_matches[i] = valid_table[i] && (addr_table[i] == lookup_addr);
for (genvar i = 0; i < MSHR_SIZE; ++i) begin : g_addr_matches
assign addr_matches[i] = valid_table[i] && (addr_table[i] == allocate_addr);
end
VX_lzc #(
@ -148,11 +133,13 @@ module VX_cache_mshr #(
.valid_out (allocate_rdy_n)
);
VX_onehot_encoder #(
// find matching tail-entry
VX_priority_encoder #(
.N (MSHR_SIZE)
) prev_sel (
.data_in (addr_matches & ~next_table_x),
.data_out (prev_idx),
.index_out (prev_idx),
`UNUSED_PIN (onehot_out),
`UNUSED_PIN (valid_out)
);
@ -171,17 +158,22 @@ module VX_cache_mshr #(
valid_table_n[dequeue_id] = 0;
if (next_table[dequeue_id]) begin
dequeue_id_n = next_index[dequeue_id];
end else if (finalize_valid && finalize_is_pending && (finalize_previd == dequeue_id)) begin
dequeue_id_n = finalize_id;
end else begin
dequeue_val_n = 0;
end
end
if (finalize_valid) begin
if (finalize_release) begin
if (finalize_is_release) begin
valid_table_n[finalize_id] = 0;
end
if (finalize_pending) begin
next_table_x[finalize_prev] = 1;
// Warning: this code allows 'finalize_is_pending' to be asserted regardless of hit/miss
// to reduce its propagation delay into the MSHR. This is safe because wrong updates
// to 'next_table_n' will be cleared during 'allocate_fire' below.
if (finalize_is_pending) begin
next_table_x[finalize_previd] = 1;
end
end
@ -204,12 +196,12 @@ module VX_cache_mshr #(
end
if (allocate_fire) begin
addr_table[allocate_id] <= allocate_addr;
addr_table[allocate_id] <= allocate_addr;
write_table[allocate_id] <= allocate_rw;
end
if (finalize_valid && finalize_pending) begin
next_index[finalize_prev] <= finalize_id;
if (finalize_valid && finalize_is_pending) begin
next_index[finalize_previd] <= finalize_id;
end
dequeue_id_r <= dequeue_id_n;
@ -217,20 +209,21 @@ module VX_cache_mshr #(
next_table <= next_table_n;
end
`RUNTIME_ASSERT((~allocate_fire || ~valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, lkp_req_uuid))
`RUNTIME_ASSERT(~(allocate_fire && valid_table[allocate_id_r]), ("%t: *** %s inuse allocation: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id_r, alc_req_uuid))
`RUNTIME_ASSERT((~finalize_valid || valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT(~(finalize_valid && ~valid_table[finalize_id]), ("%t: *** %s invalid release: addr=0x%0h, id=%0d (#%0d)", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(addr_table[finalize_id], BANK_ID), finalize_id, fin_req_uuid))
`RUNTIME_ASSERT((~fill_valid || valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
`RUNTIME_ASSERT(~(fill_valid && ~valid_table[fill_id]), ("%t: *** %s invalid fill: addr=0x%0h, id=%0d", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), fill_id))
VX_dp_ram #(
.DATAW (DATA_WIDTH),
.SIZE (MSHR_SIZE),
.LUTRAM (1)
) entries (
.DATAW (DATA_WIDTH),
.SIZE (MSHR_SIZE),
.RDW_MODE ("R"),
.RADDR_REG (1)
) mshr_store (
.clk (clk),
.reset (reset),
.read (1'b1),
@ -245,19 +238,20 @@ module VX_cache_mshr #(
assign fill_addr = addr_table[fill_id];
assign allocate_ready = allocate_rdy;
assign allocate_id = allocate_id_r;
assign allocate_prev = prev_idx;
assign allocate_id = allocate_id_r;
assign allocate_previd = prev_idx;
assign dequeue_valid = dequeue_val;
assign dequeue_addr = addr_table[dequeue_id_r];
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
if (WRITEBACK) begin : g_pending_wb
assign allocate_pending = |addr_matches;
end else begin : g_pending_wt
// exclude write requests if writethrough
assign allocate_pending = |(addr_matches & ~write_table);
end
// return pending entries for the given cache line
assign lookup_pending = addr_matches;
assign lookup_rw = write_table;
`UNUSED_VAR (lookup_valid)
assign dequeue_valid = dequeue_val;
assign dequeue_addr = addr_table[dequeue_id_r];
assign dequeue_rw = write_table[dequeue_id_r];
assign dequeue_id = dequeue_id_r;
`ifdef DBG_TRACE_CACHE
reg show_table;
@ -265,37 +259,42 @@ module VX_cache_mshr #(
if (reset) begin
show_table <= 0;
end else begin
show_table <= allocate_fire || lookup_valid || finalize_valid || fill_valid || dequeue_fire;
show_table <= allocate_fire || finalize_valid || fill_valid || dequeue_fire;
end
if (allocate_fire) begin
`TRACE(3, ("%t: %s allocate: addr=0x%0h, id=%0d, pending=%b, prev=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_id, allocate_pending, prev_idx, alc_req_uuid))
end
if (finalize_valid && finalize_is_release) begin
`TRACE(3, ("%t: %s release: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid))
end
if (finalize_valid && finalize_is_pending) begin
`TRACE(3, ("%t: %s finalize: id=%0d (#%0d)\n", $time, INSTANCE_ID, finalize_id, fin_req_uuid))
end
if (fill_valid) begin
`TRACE(3, ("%t: %s fill: addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id))
end
if (dequeue_fire) begin
`TRACE(3, ("%t: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_BANK_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid))
end
if (allocate_fire)
`TRACE(3, ("%d: %s allocate: addr=0x%0h, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(allocate_addr, BANK_ID), allocate_prev, allocate_id, lkp_req_uuid));
if (lookup_valid)
`TRACE(3, ("%d: %s lookup: addr=0x%0h, matches=%b (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(lookup_addr, BANK_ID), lookup_pending, lkp_req_uuid));
if (finalize_valid)
`TRACE(3, ("%d: %s finalize release=%b, pending=%b, prev=%0d, id=%0d (#%0d)\n", $time, INSTANCE_ID,
finalize_release, finalize_pending, finalize_prev, finalize_id, fin_req_uuid));
if (fill_valid)
`TRACE(3, ("%d: %s fill: addr=0x%0h, addr=0x%0h, id=%0d\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(addr_table[fill_id], BANK_ID), `CS_LINE_TO_FULL_ADDR(fill_addr, BANK_ID), fill_id));
if (dequeue_fire)
`TRACE(3, ("%d: %s dequeue: addr=0x%0h, id=%0d (#%0d)\n", $time, INSTANCE_ID,
`CS_LINE_TO_FULL_ADDR(dequeue_addr, BANK_ID), dequeue_id_r, deq_req_uuid));
if (show_table) begin
`TRACE(3, ("%d: %s table", $time, INSTANCE_ID));
`TRACE(3, ("%t: %s table", $time, INSTANCE_ID))
for (integer i = 0; i < MSHR_SIZE; ++i) begin
if (valid_table[i]) begin
`TRACE(3, (" %0d=0x%0h", i, `CS_LINE_TO_FULL_ADDR(addr_table[i], BANK_ID)));
if (write_table[i])
`TRACE(3, ("(w)"));
else
`TRACE(3, ("(r)"));
if (next_table[i])
`TRACE(3, ("->%0d", next_index[i]));
`TRACE(3, (" %0d=0x%0h", i, `CS_BANK_TO_FULL_ADDR(addr_table[i], BANK_ID)))
if (write_table[i]) begin
`TRACE(3, ("(w)"))
end else begin
`TRACE(3, ("(r)"))
end
if (next_table[i]) begin
`TRACE(3, ("->%0d", next_index[i]))
end
end
end
`TRACE(3, ("\n"));
`TRACE(3, ("\n"))
end
end
`endif

210
hw/rtl/cache/VX_cache_repl.sv vendored Normal file
View file

@ -0,0 +1,210 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
// Fast PLRU encoder and decoder utility
// Adapted from BaseJump STL: http://bjump.org/data_out.html
// PLRU tree decoder.
//
// Translates a way index into the per-node update pattern of a binary
// pseudo-LRU tree with NUM_WAYS-1 nodes stored in heap order (node 0 is the
// root; the parent of an odd node i is (i-1)/2 and of an even node i is (i-2)/2).
//
//   way_idx  : way index to decode.
//   lru_mask : bit i is set when node i lies on the path selected by way_idx
//              (the root is always selected; an odd child is selected when the
//              parent's way_idx bit is 0, an even child when it is 1).
//   lru_data : bit i is the inverted way_idx bit of node i's tree level.
//              It is NOT gated by lru_mask here; consumers are expected to
//              apply lru_mask as a per-bit write enable.
//
// When NUM_WAYS <= 1 there is no tree and both outputs are tied to zero.
module plru_decoder #(
    parameter NUM_WAYS      = 1,
    parameter WAY_IDX_BITS  = $clog2(NUM_WAYS),
    parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
) (
    input wire [WAY_IDX_WIDTH-1:0]    way_idx,
    output wire [`UP(NUM_WAYS-1)-1:0] lru_data,
    output wire [`UP(NUM_WAYS-1)-1:0] lru_mask
);
    if (NUM_WAYS <= 1) begin : g_no_dec
        // single way: nothing to decode
        `UNUSED_VAR (way_idx)
        assign lru_mask = '0;
        assign lru_data = '0;
    end else begin : g_dec
        wire [`UP(NUM_WAYS-1)-1:0] node_data;
        // each mask bit is derived from its parent's mask bit, which is a
        // dependency inside the same vector; the pragma pair below wraps
        // exactly this declaration as in the original code.
    `IGNORE_UNOPTFLAT_BEGIN
        wire [`UP(NUM_WAYS-1)-1:0] node_mask;
    `IGNORE_UNOPTFLAT_END
        for (genvar i = 0; i < NUM_WAYS-1; ++i) begin : g_i
            // way_idx bit examined at node i's own level;
            // the parent's level uses the next higher bit (NODE_BIT + 1).
            localparam NODE_BIT = WAY_IDX_BITS - $clog2(i+2);
            if (i == 0) begin : g_i_0
                // the root is part of every path
                assign node_mask[i] = 1'b1;
            end else if (i % 2 == 1) begin : g_i_odd
                // odd child: taken when the parent's bit is 0
                assign node_mask[i] = ~way_idx[NODE_BIT+1] & node_mask[(i-1)/2];
            end else begin : g_i_even
                // even child: taken when the parent's bit is 1
                assign node_mask[i] = way_idx[NODE_BIT+1] & node_mask[(i-2)/2];
            end
            assign node_data[i] = ~way_idx[NODE_BIT];
        end
        assign lru_mask = node_mask;
        assign lru_data = node_data;
    end
endmodule
// PLRU tree encoder.
//
// Walks a binary pseudo-LRU tree (NUM_WAYS-1 node bits in heap order, node 0
// is the root) from the root down and produces the index of the way the tree
// currently points to.
//
//   lru_in  : PLRU tree bits of one cache set.
//   way_idx : resulting way index; the MSB equals the root bit, and every
//             lower bit is the tree bit selected by all the bits above it.
//
// When NUM_WAYS <= 1 the output is tied to zero.
//
// Note: level i reads the slice lru_in[(2**i)-1 +: 2**i], so the highest index
// accessed is 2**WAY_IDX_BITS - 2. That stays inside lru_in only when NUM_WAYS
// is a power of two.
module plru_encoder #(
    parameter NUM_WAYS      = 1,
    parameter WAY_IDX_BITS  = $clog2(NUM_WAYS),
    parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
) (
    input wire [`UP(NUM_WAYS-1)-1:0] lru_in,
    output wire [WAY_IDX_WIDTH-1:0]  way_idx
);
    if (NUM_WAYS > 1) begin : g_enc
        wire [WAY_IDX_BITS-1:0] tmp;
        for (genvar i = 0; i < WAY_IDX_BITS; ++i) begin : g_i
            if (i == 0) begin : g_i_0
                // top bit of the index is the root node of the tree.
                // Indexed with WAY_IDX_BITS (the declared range of tmp) so the
                // select stays in range and matches the other levels below.
                assign tmp[WAY_IDX_BITS-1] = lru_in[0];
            end else begin : g_i_n
                // level i holds 2**i nodes starting at heap index (2**i)-1;
                // the i index bits already resolved pick one of them.
                VX_mux #(
                    .N (2**i)
                ) mux (
                    .data_in  (lru_in[((2**i)-1)+:(2**i)]),
                    .sel_in   (tmp[WAY_IDX_BITS-1-:i]),
                    .data_out (tmp[WAY_IDX_BITS-1-i])
                );
            end
        end
        assign way_idx = tmp;
    end else begin : g_no_enc
        // single way: nothing to encode
        `UNUSED_VAR (lru_in)
        assign way_idx = '0;
    end
endmodule
// Cache replacement-way selector.
//
// Produces the way to evict (repl_way) for the line addressed by repl_line,
// according to REPL_POLICY:
//   `CS_REPL_PLRU : per-line pseudo-LRU tree, updated on lookup hits.
//   `CS_REPL_FIFO : per-line way pointer, incremented on every replacement.
//   otherwise     : free-running counter shared by all lines ("random").
// With NUM_WAYS <= 1 no state is kept and repl_way is tied to zero.
//
//   init                      : forces a write of zero into the policy storage
//                               (PLRU/FIFO) at the address currently presented.
//   lookup_valid/lookup_hit   : qualify a PLRU update for lookup_line/lookup_way.
//   repl_valid/repl_line      : request a replacement way for the given line.
//   stall                     : holds the counter of the random policy.
module VX_cache_repl #(
    parameter CACHE_SIZE = 1024,
    // Size of line inside a bank in bytes
    parameter LINE_SIZE = 64,
    // Number of banks
    parameter NUM_BANKS = 1,
    // Number of associative ways
    parameter NUM_WAYS = 1,
    // replacement policy
    parameter REPL_POLICY = `CS_REPL_FIFO
) (
    input wire clk,
    input wire reset,
    input wire stall,
    input wire init,
    input wire lookup_valid,
    input wire lookup_hit,
    input wire [`CS_LINE_SEL_BITS-1:0] lookup_line,
    input wire [`CS_WAY_SEL_WIDTH-1:0] lookup_way,
    input wire repl_valid,
    input wire [`CS_LINE_SEL_BITS-1:0] repl_line,
    output wire [`CS_WAY_SEL_WIDTH-1:0] repl_way
);
    localparam WAY_SEL_WIDTH = `CS_WAY_SEL_WIDTH;

    // not every policy consumes these inputs
    `UNUSED_VAR (reset)
    `UNUSED_VAR (init)
    `UNUSED_VAR (stall)

    if (NUM_WAYS <= 1) begin : g_disable

        // direct-mapped: there is only one candidate way
        `UNUSED_VAR (clk)
        `UNUSED_VAR (lookup_valid)
        `UNUSED_VAR (lookup_hit)
        `UNUSED_VAR (lookup_line)
        `UNUSED_VAR (lookup_way)
        `UNUSED_VAR (repl_valid)
        `UNUSED_VAR (repl_line)
        assign repl_way = '0;

    end else begin : g_enable

        if (REPL_POLICY == `CS_REPL_PLRU) begin : g_plru
            // Pseudo Least Recently Used replacement policy
            localparam LRU_WIDTH = `UP(NUM_WAYS-1);

            wire [LRU_WIDTH-1:0] plru_rdata;
            wire [LRU_WIDTH-1:0] plru_wdata;
            wire [LRU_WIDTH-1:0] plru_wmask;

            // the tree is written on init (all bits cleared) and on lookup hits
            // (only the bits selected by the decoder mask are updated)
            wire                 plru_write = init || (lookup_valid && lookup_hit);
            wire [LRU_WIDTH-1:0] plru_wren  = init ? '1 : plru_wmask;
            wire [LRU_WIDTH-1:0] plru_wbits = init ? '0 : plru_wdata;

            // hit way -> tree update pattern
            plru_decoder #(
                .NUM_WAYS (NUM_WAYS)
            ) plru_dec (
                .way_idx  (lookup_way),
                .lru_data (plru_wdata),
                .lru_mask (plru_wmask)
            );

            // stored tree -> victim way
            plru_encoder #(
                .NUM_WAYS (NUM_WAYS)
            ) plru_enc (
                .lru_in  (plru_rdata),
                .way_idx (repl_way)
            );

            // one PLRU tree per cache line; written at lookup_line, read at repl_line
            VX_dp_ram #(
                .DATAW     (LRU_WIDTH),
                .SIZE      (`CS_LINES_PER_BANK),
                .WRENW     (LRU_WIDTH),
                .RDW_MODE  ("R"),
                .RADDR_REG (1)
            ) plru_store (
                .clk   (clk),
                .reset (1'b0),
                .read  (repl_valid),
                .write (plru_write),
                .wren  (plru_wren),
                .waddr (lookup_line),
                .raddr (repl_line),
                .wdata (plru_wbits),
                .rdata (plru_rdata)
            );

        end else if (REPL_POLICY == `CS_REPL_FIFO) begin : g_fifo
            // Fifo replacement policy
            `UNUSED_VAR (lookup_valid)
            `UNUSED_VAR (lookup_hit)
            `UNUSED_VAR (lookup_line)
            `UNUSED_VAR (lookup_way)

            wire [WAY_SEL_WIDTH-1:0] fifo_rdata;
            // next pointer: current pointer plus one, truncated to the way-index width
            wire [WAY_SEL_WIDTH-1:0] fifo_wdata = fifo_rdata + 1;
            // the pointer is cleared on init and advanced on every replacement
            wire                     fifo_write = init || repl_valid;
            wire [WAY_SEL_WIDTH-1:0] fifo_wbits = init ? '0 : fifo_wdata;

            // one way pointer per cache line, read and written at repl_line
            VX_sp_ram #(
                .DATAW     (WAY_SEL_WIDTH),
                .SIZE      (`CS_LINES_PER_BANK),
                .RDW_MODE  ("R"),
                .RADDR_REG (1)
            ) fifo_store (
                .clk   (clk),
                .reset (1'b0),
                .read  (repl_valid),
                .write (fifo_write),
                .wren  (1'b1),
                .addr  (repl_line),
                .wdata (fifo_wbits),
                .rdata (fifo_rdata)
            );

            assign repl_way = fifo_rdata;

        end else begin : g_random
            // Random replacement policy
            `UNUSED_VAR (lookup_valid)
            `UNUSED_VAR (lookup_hit)
            `UNUSED_VAR (lookup_line)
            `UNUSED_VAR (lookup_way)
            `UNUSED_VAR (repl_valid)
            `UNUSED_VAR (repl_line)

            // free-running counter, cleared on reset and held while stalled
            reg [WAY_SEL_WIDTH-1:0] victim_idx;

            always @(posedge clk) begin
                if (reset) begin
                    victim_idx <= '0;
                end else if (!stall) begin
                    victim_idx <= victim_idx + 1;
                end
            end

            assign repl_way = victim_idx;
        end

    end
endmodule

View file

@ -14,8 +14,6 @@
`include "VX_cache_define.vh"
module VX_cache_tags #(
parameter `STRING INSTANCE_ID = "",
parameter BANK_ID = 0,
// Size of cache in bytes
parameter CACHE_SIZE = 1024,
// Size of line inside a bank in bytes
@ -27,96 +25,61 @@ module VX_cache_tags #(
// Size of a word in bytes
parameter WORD_SIZE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
// Request debug identifier
parameter UUID_WIDTH = 0
parameter WRITEBACK = 0
) (
input wire clk,
input wire reset,
`IGNORE_UNUSED_BEGIN
input wire [`UP(UUID_WIDTH)-1:0] req_uuid,
`IGNORE_UNUSED_END
input wire stall,
// init/fill/lookup
// inputs
input wire init,
input wire flush,
input wire fill,
input wire read,
input wire write,
input wire lookup,
input wire [`CS_LINE_ADDR_WIDTH-1:0] line_addr,
input wire [NUM_WAYS-1:0] way_sel,
output wire [NUM_WAYS-1:0] tag_matches,
input wire [`CS_LINE_SEL_BITS-1:0] line_idx,
input wire [`CS_TAG_SEL_BITS-1:0] line_tag,
input wire [`CS_WAY_SEL_WIDTH-1:0] evict_way,
// eviction
// outputs
output wire [NUM_WAYS-1:0] tag_matches,
output wire evict_dirty,
output wire [NUM_WAYS-1:0] evict_way,
output wire [`CS_TAG_SEL_BITS-1:0] evict_tag
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (BANK_ID)
`UNUSED_VAR (lookup)
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [`CS_LINE_SEL_BITS-1:0] line_sel = line_addr[`CS_LINE_SEL_BITS-1:0];
wire [`CS_TAG_SEL_BITS-1:0] line_tag = `CS_LINE_ADDR_TAG(line_addr);
// valid, dirty, tag
localparam TAG_WIDTH = 1 + WRITEBACK + `CS_TAG_SEL_BITS;
wire [NUM_WAYS-1:0][`CS_TAG_SEL_BITS-1:0] read_tag;
wire [NUM_WAYS-1:0] read_valid;
wire [NUM_WAYS-1:0] read_dirty;
`UNUSED_VAR (read)
if (NUM_WAYS > 1) begin
reg [NUM_WAYS-1:0] evict_way_r;
// cyclic assignment of replacement way
always @(posedge clk) begin
if (reset) begin
evict_way_r <= 1;
end else if (~stall) begin // holding the value on stalls prevents filling different slots twice
evict_way_r <= {evict_way_r[NUM_WAYS-2:0], evict_way_r[NUM_WAYS-1]};
end
end
assign evict_way = fill ? evict_way_r : way_sel;
VX_onehot_mux #(
.DATAW (`CS_TAG_SEL_BITS),
.N (NUM_WAYS)
) evict_tag_sel (
.data_in (read_tag),
.sel_in (evict_way),
.data_out (evict_tag)
);
end else begin
`UNUSED_VAR (stall)
assign evict_way = 1'b1;
assign evict_tag = read_tag;
if (WRITEBACK) begin : g_evict_tag_wb
assign evict_dirty = read_dirty[evict_way];
assign evict_tag = read_tag[evict_way];
end else begin : g_evict_tag_wt
`UNUSED_VAR (read_dirty)
assign evict_dirty = 1'b0;
assign evict_tag = '0;
end
// fill and flush need to also read in writeback mode
wire fill_s = fill && (!WRITEBACK || ~stall);
wire flush_s = flush && (!WRITEBACK || ~stall);
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_store
wire way_en = (NUM_WAYS == 1) || (evict_way == i);
wire do_init = init; // init all ways
wire do_fill = fill && way_en;
wire do_flush = flush && (!WRITEBACK || way_en); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i]; // only write on tag hit
for (genvar i = 0; i < NUM_WAYS; ++i) begin
wire do_fill = fill_s && evict_way[i];
wire do_flush = flush_s && (!WRITEBACK || way_sel[i]); // flush the whole line in writethrough mode
wire do_write = WRITEBACK && write && tag_matches[i];
wire line_read = (WRITEBACK && (fill_s || flush_s));
wire line_write = init || do_fill || do_flush || do_write;
wire line_valid = ~(init || flush);
wire line_read = read || write || (WRITEBACK && (fill || flush));
wire line_write = do_init || do_fill || do_flush || do_write;
wire line_valid = fill || write;
wire [TAG_WIDTH-1:0] line_wdata;
wire [TAG_WIDTH-1:0] line_rdata;
if (WRITEBACK) begin
if (WRITEBACK) begin : g_wdata
assign line_wdata = {line_valid, write, line_tag};
assign {read_valid[i], read_dirty[i], read_tag[i]} = line_rdata;
end else begin
end else begin : g_wdata
assign line_wdata = {line_valid, line_tag};
assign {read_valid[i], read_tag[i]} = line_rdata;
assign read_dirty[i] = 1'b0;
@ -125,52 +88,22 @@ module VX_cache_tags #(
VX_sp_ram #(
.DATAW (TAG_WIDTH),
.SIZE (`CS_LINES_PER_BANK),
.NO_RWCHECK (1),
.RW_ASSERT (1)
.RDW_MODE ("W"),
.RADDR_REG (1)
) tag_store (
.clk (clk),
.reset (reset),
.read (line_read),
.write (line_write),
.wren (1'b1),
.addr (line_sel),
.addr (line_idx),
.wdata (line_wdata),
.rdata (line_rdata)
);
end
for (genvar i = 0; i < NUM_WAYS; ++i) begin
for (genvar i = 0; i < NUM_WAYS; ++i) begin : g_tag_matches
assign tag_matches[i] = read_valid[i] && (line_tag == read_tag[i]);
end
assign evict_dirty = | (read_dirty & evict_way);
// Debug-only tracing (compiled in when DBG_TRACE_CACHE is defined).
// On every rising clock edge, logs fill, init, flush and lookup hit/miss events.
`ifdef DBG_TRACE_CACHE
// Line address of the eviction candidate: evicted tag concatenated with the current line select.
wire [`CS_LINE_ADDR_WIDTH-1:0] evict_line_addr = {evict_tag, line_sel};
always @(posedge clk) begin
// Fill event: only reported when the pipeline is not stalled.
if (fill && ~stall) begin
`TRACE(3, ("%d: %s fill: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h, dirty=%b, evict_addr=0x%0h\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), evict_way, line_sel, line_tag, evict_dirty, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID)));
end
// Init event: reported regardless of stall.
if (init) begin
`TRACE(3, ("%d: %s init: addr=0x%0h, blk_addr=%0d\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel));
end
// Flush event: the reported address is the eviction candidate's line address.
if (flush && ~stall) begin
`TRACE(3, ("%d: %s flush: addr=0x%0h, way=%b, blk_addr=%0d, dirty=%b\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(evict_line_addr, BANK_ID), way_sel, line_sel, evict_dirty));
end
// Lookup event: any set bit in tag_matches is a hit; 'write' selects the write/read message.
if (lookup && ~stall) begin
if (tag_matches != 0) begin
if (write)
`TRACE(3, ("%d: %s write-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-hit: addr=0x%0h, way=%b, blk_addr=%0d, tag_id=0x%0h (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), tag_matches, line_sel, line_tag, req_uuid));
end else begin
if (write)
`TRACE(3, ("%d: %s write-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
else
`TRACE(3, ("%d: %s read-miss: addr=0x%0h, blk_addr=%0d, tag_id=0x%0h, (#%0d)\n", $time, INSTANCE_ID, `CS_LINE_TO_FULL_ADDR(line_addr, BANK_ID), line_sel, line_tag, req_uuid));
end
end
end
`endif
endmodule

View file

@ -19,8 +19,11 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
parameter CACHE_SIZE = 65536,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
@ -28,39 +31,39 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 16,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
parameter CRSQ_SIZE = 8,
// Miss Reserve Queue (MSHR) size
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
parameter MRSQ_SIZE = 8,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
parameter MREQ_SIZE = 8,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Enable cache writeback
parameter WRITEBACK = 0,
parameter WRITEBACK = 1,
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
parameter DIRTY_BYTES = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = 16,
parameter TAG_WIDTH = 32,
// Core response output buffer
parameter CORE_OUT_BUF = 2,
parameter CORE_OUT_BUF = 3,
// Memory request output buffer
parameter MEM_OUT_BUF = 2,
parameter MEM_OUT_BUF = 3,
parameter MEM_TAG_WIDTH = `CLOG2(MSHR_SIZE) + `CLOG2(NUM_BANKS)
parameter MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH)
) (
input wire clk,
input wire reset,
@ -71,35 +74,35 @@ module VX_cache_top import VX_gpu_pkg::*; #(
`endif
// Core request
input wire [NUM_REQS-1:0] core_req_valid,
input wire [NUM_REQS-1:0] core_req_rw,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] core_req_atype,
input wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_REQS-1:0] core_req_ready,
input wire core_req_valid [NUM_REQS],
input wire core_req_rw [NUM_REQS],
input wire[WORD_SIZE-1:0] core_req_byteen [NUM_REQS],
input wire[`CS_WORD_ADDR_WIDTH-1:0] core_req_addr [NUM_REQS],
input wire[`MEM_REQ_FLAGS_WIDTH-1:0] core_req_flags [NUM_REQS],
input wire[`CS_WORD_WIDTH-1:0] core_req_data [NUM_REQS],
input wire[TAG_WIDTH-1:0] core_req_tag [NUM_REQS],
output wire core_req_ready [NUM_REQS],
// Core response
output wire [NUM_REQS-1:0] core_rsp_valid,
output wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
input wire [NUM_REQS-1:0] core_rsp_ready,
output wire core_rsp_valid [NUM_REQS],
output wire[`CS_WORD_WIDTH-1:0] core_rsp_data [NUM_REQS],
output wire[TAG_WIDTH-1:0] core_rsp_tag [NUM_REQS],
input wire core_rsp_ready [NUM_REQS],
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
output wire mem_req_valid [MEM_PORTS],
output wire mem_req_rw [MEM_PORTS],
output wire [LINE_SIZE-1:0] mem_req_byteen [MEM_PORTS],
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr [MEM_PORTS],
output wire [`CS_LINE_WIDTH-1:0] mem_req_data [MEM_PORTS],
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag [MEM_PORTS],
input wire mem_req_ready [MEM_PORTS],
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
input wire mem_rsp_valid [MEM_PORTS],
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data [MEM_PORTS],
input wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag [MEM_PORTS],
output wire mem_rsp_ready [MEM_PORTS]
);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -109,7 +112,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_if();
) mem_bus_if[MEM_PORTS]();
// Core request
for (genvar i = 0; i < NUM_REQS; ++i) begin
@ -117,7 +120,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
assign core_bus_if[i].req_data.rw = core_req_rw[i];
assign core_bus_if[i].req_data.byteen = core_req_byteen[i];
assign core_bus_if[i].req_data.addr = core_req_addr[i];
assign core_bus_if[i].req_data.atype = core_req_atype[i];
assign core_bus_if[i].req_data.flags = core_req_flags[i];
assign core_bus_if[i].req_data.data = core_req_data[i];
assign core_bus_if[i].req_data.tag = core_req_tag[i];
assign core_req_ready[i] = core_bus_if[i].req_ready;
@ -125,29 +128,32 @@ module VX_cache_top import VX_gpu_pkg::*; #(
// Core response
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_valid[i] = core_bus_if[i].rsp_valid;
assign core_rsp_valid[i]= core_bus_if[i].rsp_valid;
assign core_rsp_data[i] = core_bus_if[i].rsp_data.data;
assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag;
assign core_rsp_tag[i] = core_bus_if[i].rsp_data.tag;
assign core_bus_if[i].rsp_ready = core_rsp_ready[i];
end
// Memory request
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
`UNUSED_VAR (mem_bus_if.req_data.atype)
// Per memory port: drive the top-level request outputs from the matching
// mem_bus_if[i] request fields; req_ready flows the other way, from the
// top-level input back into the interface.
for (genvar i = 0; i < MEM_PORTS; ++i) begin
assign mem_req_valid[i] = mem_bus_if[i].req_valid;
assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
assign mem_req_byteen[i]= mem_bus_if[i].req_data.byteen;
assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
assign mem_req_data[i] = mem_bus_if[i].req_data.data;
assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
assign mem_bus_if[i].req_ready = mem_req_ready[i];
end
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
// Per memory port: feed the top-level response inputs into mem_bus_if[i];
// rsp_ready is driven back out from the interface to the top-level output.
for (genvar i = 0; i < MEM_PORTS; ++i) begin
assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
end
VX_cache #(
VX_cache_wrap #(
.INSTANCE_ID (INSTANCE_ID),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -155,6 +161,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),

View file

@ -21,24 +21,26 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Number of memory ports
parameter MEM_PORTS = 1,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 1,
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 1,
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
parameter WORD_SIZE = 16,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
parameter CRSQ_SIZE = 4,
// Miss Reserve Queue (MSHR) size
parameter MSHR_SIZE = 8,
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
parameter MRSQ_SIZE = 4,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
@ -51,12 +53,18 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
// Enable dirty bytes on writeback
parameter DIRTY_BYTES = 0,
// Replacement policy
parameter REPL_POLICY = `CS_REPL_FIFO,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 1,
// core request flags
parameter FLAGS_WIDTH = 0,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 0,
@ -64,10 +72,10 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter PASSTHRU = 0,
// Core response output buffer
parameter CORE_OUT_BUF = 0,
parameter CORE_OUT_BUF = 3,
// Memory request output buffer
parameter MEM_OUT_BUF = 0
parameter MEM_OUT_BUF = 3
) (
input wire clk,
@ -79,19 +87,16 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
`endif
VX_mem_bus_if.slave core_bus_if [NUM_REQS],
VX_mem_bus_if.master mem_bus_if
VX_mem_bus_if.master mem_bus_if [MEM_PORTS]
);
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
localparam CACHE_MEM_TAG_WIDTH = `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, MEM_PORTS, UUID_WIDTH);
localparam BYPASS_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, MEM_PORTS, LINE_SIZE, WORD_SIZE, TAG_WIDTH);
localparam NC_TAG_WIDTH = `MAX(CACHE_MEM_TAG_WIDTH, BYPASS_TAG_WIDTH) + 1;
localparam MEM_TAG_WIDTH = PASSTHRU ? BYPASS_TAG_WIDTH : (NC_ENABLE ? NC_TAG_WIDTH : CACHE_MEM_TAG_WIDTH);
localparam BYPASS_ENABLE = (NC_ENABLE || PASSTHRU);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
@ -101,18 +106,21 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
) mem_bus_cache_if();
) mem_bus_cache_if[MEM_PORTS]();
if (NC_OR_BYPASS) begin
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH)
) mem_bus_tmp_if[MEM_PORTS]();
`RESET_RELAY (nc_bypass_reset, reset);
if (BYPASS_ENABLE) begin : g_bypass
VX_cache_bypass #(
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.TAG_SEL_IDX (TAG_SEL_IDX),
.PASSTHRU (PASSTHRU),
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
.CACHE_ENABLE (!PASSTHRU),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (LINE_SIZE),
@ -122,7 +130,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
@ -130,51 +137,35 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MEM_OUT_BUF (MEM_OUT_BUF)
) cache_bypass (
.clk (clk),
.reset (nc_bypass_reset),
.reset (reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if(core_bus_cache_if),
.mem_bus_in_if (mem_bus_cache_if),
.mem_bus_out_if (mem_bus_if)
.mem_bus_out_if (mem_bus_tmp_if)
);
end else begin
end else begin : g_no_bypass
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_tmp_if
`ASSIGN_VX_MEM_BUS_IF (mem_bus_tmp_if[i], mem_bus_cache_if[i]);
end
end
if (PASSTHRU != 0) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
`UNUSED_VAR (core_bus_cache_if[i].req_data)
assign core_bus_cache_if[i].req_ready = 0;
assign core_bus_cache_if[i].rsp_valid = 0;
assign core_bus_cache_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
// Connect each intermediate memory bus (mem_bus_tmp_if[i]) to the external
// memory port (mem_bus_if[i]). The connection macro is chosen by WRITE_ENABLE.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_if
if (WRITE_ENABLE) begin : g_we
// Write-enabled cache: full bus assignment.
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end else begin : g_ro
// Read-only cache: uses the "RO" variant of the macro.
// NOTE(review): presumably this ties off the write-related request fields; confirm against the macro definition.
`ASSIGN_VX_MEM_BUS_RO_IF (mem_bus_if[i], mem_bus_tmp_if[i]);
end
end
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
`UNUSED_VAR (mem_bus_cache_if.req_ready)
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
assign mem_bus_cache_if.rsp_ready = 0;
`ifdef PERF_ENABLE
assign cache_perf = '0;
`endif
end else begin
`RESET_RELAY (cache_reset, reset);
if (PASSTHRU == 0) begin : g_cache
VX_cache #(
.INSTANCE_ID (INSTANCE_ID),
@ -184,20 +175,23 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.NUM_REQS (NUM_REQS),
.MEM_PORTS (MEM_PORTS),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.REPL_POLICY (REPL_POLICY),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.WRITEBACK (WRITEBACK),
.DIRTY_BYTES (DIRTY_BYTES),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
.FLAGS_WIDTH (FLAGS_WIDTH),
.CORE_OUT_BUF (BYPASS_ENABLE ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (BYPASS_ENABLE ? 1 : MEM_OUT_BUF)
) cache (
.clk (clk),
.reset (cache_reset),
.reset (reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
@ -205,64 +199,105 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.mem_bus_if (mem_bus_cache_if)
);
end else begin : g_passthru
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_bus_cache_if
`UNUSED_VX_MEM_BUS_IF (core_bus_cache_if[i])
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_mem_bus_cache_if
`INIT_VX_MEM_BUS_IF (mem_bus_cache_if[i])
end
// Performance counters for the passthrough branch (compiled in when PERF_ENABLE is defined).
// Events are observed directly on core_bus_if / mem_bus_if and accumulated into cache_perf.
`ifdef PERF_ENABLE
// One event bit per core request lane / per memory port.
wire [NUM_REQS-1:0] perf_core_reads_per_req;
wire [NUM_REQS-1:0] perf_core_writes_per_req;
wire [NUM_REQS-1:0] perf_crsp_stall_per_req;
wire [MEM_PORTS-1:0] perf_mem_stall_per_port;
// Despite its label, this loop derives the read, write and response-stall bits for each lane.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_perf_crsp_stall_per_req
// A read/write is counted when the request handshake fires (valid && ready); rw selects which.
assign perf_core_reads_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && ~core_bus_if[i].req_data.rw;
assign perf_core_writes_per_req[i] = core_bus_if[i].req_valid && core_bus_if[i].req_ready && core_bus_if[i].req_data.rw;
// A core-response stall is a valid response that the core is not ready to accept.
assign perf_crsp_stall_per_req[i] = core_bus_if[i].rsp_valid && ~core_bus_if[i].rsp_ready;
end
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_perf_mem_stall_per_port
// A memory stall is a valid memory request that the port is not ready to accept.
assign perf_mem_stall_per_port[i] = mem_bus_if[i].req_valid && ~mem_bus_if[i].req_ready;
end
// per cycle: number of core reads, core writes, core-response stalls and memory-request stalls
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
wire [`CLOG2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
wire [`CLOG2(MEM_PORTS+1)-1:0] perf_mem_stall_per_cycle;
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_req);
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_req);
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
// Running totals, cleared on reset.
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
always @(posedge clk) begin
if (reset) begin
perf_core_reads <= '0;
perf_core_writes <= '0;
perf_mem_stalls <= '0;
perf_crsp_stalls <= '0;
end else begin
// Per-cycle counts are widened to the counter width before accumulation.
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
end
end
// Export the totals; fields with no counter in this branch (misses, bank and MSHR stalls) are tied to zero.
assign cache_perf.reads = perf_core_reads;
assign cache_perf.writes = perf_core_writes;
assign cache_perf.read_misses = '0;
assign cache_perf.write_misses = '0;
assign cache_perf.bank_stalls = '0;
assign cache_perf.mshr_stalls = '0;
assign cache_perf.mem_stalls = perf_mem_stalls;
assign cache_perf.crsp_stalls = perf_crsp_stalls;
`endif
end
`ifdef DBG_TRACE_CACHE
for (genvar i = 0; i < NUM_REQS; ++i) begin
wire [`UP(UUID_WIDTH)-1:0] core_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] core_rsp_uuid;
if (UUID_WIDTH != 0) begin
assign core_req_uuid = core_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign core_rsp_uuid = core_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign core_req_uuid = 0;
assign core_rsp_uuid = 0;
end
wire core_req_fire = core_bus_if[i].req_valid && core_bus_if[i].req_ready;
wire core_rsp_fire = core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_trace_core
always @(posedge clk) begin
if (core_req_fire) begin
if (core_bus_if[i].req_data.rw)
`TRACE(1, ("%d: %s core-wr-req: addr=0x%0h, tag=0x%0h, req_idx=%0d, byteen=%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_req_uuid));
else
`TRACE(1, ("%d: %s core-rd-req: addr=0x%0h, tag=0x%0h, req_idx=%0d (#%0d)\n", $time, INSTANCE_ID, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag, i, core_req_uuid));
if (core_bus_if[i].req_valid && core_bus_if[i].req_ready) begin
if (core_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s core-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.byteen, core_bus_if[i].req_data.data, core_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s core-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n", $time, INSTANCE_ID, i, `TO_FULL_ADDR(core_bus_if[i].req_data.addr), core_bus_if[i].req_data.tag.value, core_bus_if[i].req_data.tag.uuid))
end
end
if (core_rsp_fire) begin
`TRACE(1, ("%d: %s core-rd-rsp: tag=0x%0h, req_idx=%0d, data=0x%h (#%0d)\n", $time, INSTANCE_ID, core_bus_if[i].rsp_data.tag, i, core_bus_if[i].rsp_data.data, core_rsp_uuid));
if (core_bus_if[i].rsp_valid && core_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s core-rd-rsp[%0d]: tag=0x%0h, data=0x%h (#%0d)\n", $time, INSTANCE_ID, i, core_bus_if[i].rsp_data.tag.value, core_bus_if[i].rsp_data.data, core_bus_if[i].rsp_data.tag.uuid))
end
end
end
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign mem_req_uuid = 0;
assign mem_rsp_uuid = 0;
end
wire mem_req_fire = mem_bus_if.req_valid && mem_bus_if.req_ready;
wire mem_rsp_fire = mem_bus_if.rsp_valid && mem_bus_if.rsp_ready;
always @(posedge clk) begin
if (mem_req_fire) begin
if (mem_bus_if.req_data.rw)
`TRACE(1, ("%d: %s mem-wr-req: addr=0x%0h, tag=0x%0h, byteen=%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_bus_if.req_data.byteen, mem_bus_if.req_data.data, mem_req_uuid));
else
`TRACE(1, ("%d: %s mem-rd-req: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, `TO_FULL_ADDR(mem_bus_if.req_data.addr), mem_bus_if.req_data.tag, mem_req_uuid));
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s mem-rd-rsp: tag=0x%0h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, mem_bus_if.rsp_data.tag, mem_bus_if.rsp_data.data, mem_rsp_uuid));
// Per memory port trace: logs every request and response handshake
// (valid && ready) seen on mem_bus_if[i] at the rising clock edge.
for (genvar i = 0; i < MEM_PORTS; ++i) begin : g_trace_mem
always @(posedge clk) begin
// Request fired: rw selects the write (with byteen/data) or read message.
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(2, ("%t: %s mem-wr-req[%0d]: addr=0x%0h, tag=0x%0h, byteen=0x%h, data=0x%h (#%0d)\n",
$time, INSTANCE_ID, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, mem_bus_if[i].req_data.tag.uuid))
end else begin
`TRACE(2, ("%t: %s mem-rd-req[%0d]: addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, `TO_FULL_ADDR(mem_bus_if[i].req_data.addr), mem_bus_if[i].req_data.tag.value, mem_bus_if[i].req_data.tag.uuid))
end
end
// Response fired: logged as a read response.
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(2, ("%t: %s mem-rd-rsp[%0d]: data=0x%h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.data, mem_bus_if[i].rsp_data.tag.value, mem_bus_if[i].rsp_data.tag.uuid))
end
end
end
`endif

View file

@ -71,19 +71,19 @@ module VX_alu_int #(
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_imm = execute_if.data.op_args.alu.use_imm ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2_br = (execute_if.data.op_args.alu.use_imm && ~is_br_op) ? {NUM_LANES{`SEXT(`XLEN, execute_if.data.op_args.alu.imm)}} : alu_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_add_result
assign add_result[i] = alu_in1_PC[i] + alu_in2_imm[i];
assign add_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] + alu_in2_imm[i][31:0]));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_sub_result
wire [`XLEN:0] sub_in1 = {is_signed & alu_in1[i][`XLEN-1], alu_in1[i]};
wire [`XLEN:0] sub_in2 = {is_signed & alu_in2_br[i][`XLEN-1], alu_in2_br[i]};
assign sub_result[i] = sub_in1 - sub_in2;
assign sub_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] - alu_in2_imm[i][31:0]));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_shr_result
wire [`XLEN:0] shr_in1 = {is_signed && alu_in1[i][`XLEN-1], alu_in1[i]};
always @(*) begin
case (alu_op[1:0])
@ -102,7 +102,7 @@ module VX_alu_int #(
assign shr_result_w[i] = `XLEN'($signed(shr_res_w));
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_msc_result
always @(*) begin
case (alu_op[1:0])
2'b00: msc_result[i] = alu_in1[i] & alu_in2_imm[i]; // AND
@ -114,7 +114,7 @@ module VX_alu_int #(
assign msc_result_w[i] = `XLEN'($signed(alu_in1[i][31:0] << alu_in2_imm[i][4:0])); // SLLW
end
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_alu_result
wire [`XLEN-1:0] slt_br_result = `XLEN'({is_br_op && ~(| sub_result[i][`XLEN-1:0]), sub_result[i][`XLEN]});
wire [`XLEN-1:0] sub_slt_br_result = (is_sub_op && ~is_br_op) ? sub_result[i][`XLEN-1:0] : slt_br_result;
always @(*) begin
@ -141,9 +141,9 @@ module VX_alu_int #(
assign cbr_dest = add_result[0][1 +: `PC_BITS];
if (LANE_BITS != 0) begin
if (LANE_BITS != 0) begin : g_tid
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
end else begin : g_tid_0
assign tid = 0;
end
@ -185,7 +185,7 @@ module VX_alu_int #(
.data_out ({branch_ctl_if.valid, branch_ctl_if.wid, branch_ctl_if.taken, branch_ctl_if.dest})
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i];
end
@ -194,8 +194,8 @@ module VX_alu_int #(
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (br_enable) begin
`TRACE(1, ("%d: %s-branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid));
`TRACE(2, ("%t: %s branch: wid=%0d, PC=0x%0h, taken=%b, dest=0x%0h (#%0d)\n",
$time, INSTANCE_ID, br_wid, {commit_if.data.PC, 1'b0}, br_taken, {br_dest, 1'b0}, commit_if.data.uuid))
end
end
`endif

View file

@ -68,7 +68,7 @@ module VX_alu_muldiv #(
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_tmp
reg [`XLEN-1:0] mul_resultl, mul_resulth;
wire [`XLEN-1:0] mul_in1 = is_alu_w ? (execute_if.data.rs1_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs1_data[i];
wire [`XLEN-1:0] mul_in2 = is_alu_w ? (execute_if.data.rs2_data[i] & `XLEN'hFFFFFFFF) : execute_if.data.rs2_data[i];
@ -103,7 +103,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN:0] mul_in1;
wire [NUM_LANES-1:0][`XLEN:0] mul_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_in
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]} : {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]} : {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
end
@ -149,7 +149,7 @@ module VX_alu_muldiv #(
`else
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_multiplier
wire [`XLEN:0] mul_in1 = {is_signed_mul_a && execute_if.data.rs1_data[i][`XLEN-1], execute_if.data.rs1_data[i]};
wire [`XLEN:0] mul_in2 = {is_signed_mul_b && execute_if.data.rs2_data[i][`XLEN-1], execute_if.data.rs2_data[i]};
@ -184,7 +184,7 @@ module VX_alu_muldiv #(
`endif
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mul_result_out
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
(is_mul_w_out ? `XLEN'($signed(mul_result_tmp[i][31:0])) :
@ -219,7 +219,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_in1;
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_in
`ifdef XLEN_64
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
@ -234,7 +234,7 @@ module VX_alu_muldiv #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_in;
wire div_fire_in = div_valid_in && div_ready_in;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_in
reg [`XLEN-1:0] div_quotient, div_remainder;
always @(*) begin
dpi_idiv (div_fire_in, is_signed_op, div_in1[i], div_in2[i], div_quotient, div_remainder);
@ -306,7 +306,7 @@ module VX_alu_muldiv #(
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out, div_pid_out, div_sop_out, div_eop_out} = div_tag_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_div_result_out
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
(is_div_w_out ? `XLEN'($signed(div_quotient[i][31:0])) : div_quotient[i]);
@ -324,8 +324,8 @@ module VX_alu_muldiv #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (TAG_WIDTH + (NUM_LANES * `XLEN)),
.ARBITER ("F"),
.OUT_BUF (1)
.ARBITER ("P"),
.OUT_BUF (2)
) rsp_buf (
.clk (clk),
.reset (reset),

View file

@ -30,20 +30,24 @@ module VX_alu_unit #(
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
localparam NUM_LANES = `NUM_ALU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + `EXT_M_ENABLED;
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
localparam PE_COUNT = 1 + `EXT_M_ENABLED;
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
localparam PE_IDX_INT = 0;
localparam PE_IDX_MDV = PE_IDX_INT + `EXT_M_ENABLED;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (PARTIAL_BW ? 1 : 0)
.OUT_BUF (PARTIAL_BW ? 3 : 0)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -51,103 +55,62 @@ module VX_alu_unit #(
.execute_if (per_block_execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
`RESET_RELAY_EN (block_reset, reset,(BLOCK_SIZE > 1));
wire is_muldiv_op = `EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV);
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_alus
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) int_execute_if();
) pe_execute_if[PE_COUNT]();
VX_commit_if #(
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) int_commit_if();
) pe_commit_if[PE_COUNT]();
assign int_execute_if.valid = per_block_execute_if[block_idx].valid && ~is_muldiv_op;
assign int_execute_if.data = per_block_execute_if[block_idx].data;
reg [`UP(PE_SEL_BITS)-1:0] pe_select;
always @(*) begin
pe_select = PE_IDX_INT;
if (`EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV))
pe_select = PE_IDX_MDV;
end
VX_pe_switch #(
.PE_COUNT (PE_COUNT),
.NUM_LANES (NUM_LANES),
.ARBITER ("R"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (PARTIAL_BW ? 1 : 3)
) pe_switch (
.clk (clk),
.reset (reset),
.pe_sel (pe_select),
.execute_in_if (per_block_execute_if[block_idx]),
.commit_out_if (per_block_commit_if[block_idx]),
.execute_out_if (pe_execute_if),
.commit_in_if (pe_commit_if)
);
VX_alu_int #(
.INSTANCE_ID ($sformatf("%s-int%0d", INSTANCE_ID, block_idx)),
.INSTANCE_ID (`SFORMATF(("%s-int%0d", INSTANCE_ID, block_idx))),
.BLOCK_IDX (block_idx),
.NUM_LANES (NUM_LANES)
) alu_int (
.clk (clk),
.reset (block_reset),
.execute_if (int_execute_if),
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_INT]),
.branch_ctl_if (branch_ctl_if[block_idx]),
.commit_if (int_commit_if)
.commit_if (pe_commit_if[PE_IDX_INT])
);
`ifdef EXT_M_ENABLE
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) muldiv_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) muldiv_commit_if();
assign muldiv_execute_if.valid = per_block_execute_if[block_idx].valid && is_muldiv_op;
assign muldiv_execute_if.data = per_block_execute_if[block_idx].data;
VX_alu_muldiv #(
.INSTANCE_ID ($sformatf("%s-muldiv%0d", INSTANCE_ID, block_idx)),
.INSTANCE_ID (`SFORMATF(("%s-muldiv%0d", INSTANCE_ID, block_idx))),
.NUM_LANES (NUM_LANES)
) muldiv_unit (
.clk (clk),
.reset (block_reset),
.execute_if (muldiv_execute_if),
.commit_if (muldiv_commit_if)
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_MDV]),
.commit_if (pe_commit_if[PE_IDX_MDV])
);
`endif
assign per_block_execute_if[block_idx].ready =
`ifdef EXT_M_ENABLE
is_muldiv_op ? muldiv_execute_if.ready :
`endif
int_execute_if.ready;
// send response
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (PARTIAL_BW ? 1 : 3),
.ARBITER ("F")
) rsp_arb (
.clk (clk),
.reset (block_reset),
.valid_in ({
`ifdef EXT_M_ENABLE
muldiv_commit_if.valid,
`endif
int_commit_if.valid
}),
.ready_in ({
`ifdef EXT_M_ENABLE
muldiv_commit_if.ready,
`endif
int_commit_if.ready
}),
.data_in ({
`ifdef EXT_M_ENABLE
muldiv_commit_if.data,
`endif
int_commit_if.data
}),
.data_out (per_block_commit_if[block_idx].data),
.valid_out (per_block_commit_if[block_idx].valid),
.ready_out (per_block_commit_if[block_idx].ready),
`UNUSED_PIN (sel_out)
);
end
VX_gather_unit #(

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
module VX_commit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
@ -41,28 +41,26 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_arbs
wire [`NUM_EX_UNITS-1:0] valid_in;
wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
wire [`NUM_EX_UNITS-1:0] ready_in;
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_data_in
assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid;
assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data;
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (`NUM_EX_UNITS),
.DATAW (DATAW),
.ARBITER ("R"),
.ARBITER ("P"),
.OUT_BUF (1)
) commit_arb (
.clk (clk),
.reset (arb_reset),
.reset (reset),
.valid_in (valid_in),
.ready_in (ready_in),
.data_in (data_in),
@ -86,7 +84,7 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
assign commit_fire_any = (| per_issue_commit_fire);
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_size
wire [COMMIT_SIZEW-1:0] count;
`POP_COUNT(count, per_issue_commit_tmask[i]);
assign commit_size[i] = count;
@ -103,7 +101,7 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
.data_out ({commit_fire_any_r, commit_size_r})
);
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (COMMIT_SIZEW),
.DATAW_OUT (COMMIT_ALL_SIZEW),
.N (`ISSUE_WIDTH),
@ -162,7 +160,7 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
// Writeback
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback
assign writeback_if[i].valid = commit_arb_if[i].valid && commit_arb_if[i].data.wb;
assign writeback_if[i].data.uuid = commit_arb_if[i].data.uuid;
assign writeback_if[i].data.wis = wid_to_wis(commit_arb_if[i].data.wid);
@ -176,15 +174,15 @@ module VX_commit import VX_gpu_pkg::*, VX_trace_pkg::*; #(
end
`ifdef DBG_TRACE_PIPELINE
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_trace
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_j
always @(posedge clk) begin
if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}));
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}))
trace_ex_type(1, j);
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop));
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid));
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop))
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS)
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid))
end
end
end

View file

@ -28,7 +28,7 @@ module VX_core import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
input sysmem_perf_t sysmem_perf,
`endif
VX_dcr_bus_if.slave dcr_bus_if,
@ -65,44 +65,37 @@ module VX_core import VX_gpu_pkg::*; #(
) lsu_mem_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
VX_pipeline_perf_if pipeline_perf_if();
assign mem_perf_tmp_if.icache = mem_perf_if.icache;
assign mem_perf_tmp_if.dcache = mem_perf_if.dcache;
assign mem_perf_tmp_if.l2cache = mem_perf_if.l2cache;
assign mem_perf_tmp_if.l3cache = mem_perf_if.l3cache;
assign mem_perf_tmp_if.mem = mem_perf_if.mem;
lmem_perf_t lmem_perf;
coalescer_perf_t coalescer_perf;
pipeline_perf_t pipeline_perf;
sysmem_perf_t sysmem_perf_tmp;
always @(*) begin
sysmem_perf_tmp = sysmem_perf;
sysmem_perf_tmp.lmem = lmem_perf;
sysmem_perf_tmp.coalescer = coalescer_perf;
end
`endif
`RESET_RELAY (dcr_data_reset, reset);
`RESET_RELAY (schedule_reset, reset);
`RESET_RELAY (fetch_reset, reset);
`RESET_RELAY (decode_reset, reset);
`RESET_RELAY (issue_reset, reset);
`RESET_RELAY (execute_reset, reset);
`RESET_RELAY (commit_reset, reset);
base_dcrs_t base_dcrs;
VX_dcr_data dcr_data (
.clk (clk),
.reset (dcr_data_reset),
.reset (reset),
.dcr_bus_if (dcr_bus_if),
.base_dcrs (base_dcrs)
);
`SCOPE_IO_SWITCH (3)
`SCOPE_IO_SWITCH (3);
VX_schedule #(
.INSTANCE_ID ($sformatf("%s-schedule", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-schedule", INSTANCE_ID))),
.CORE_ID (CORE_ID)
) schedule (
.clk (clk),
.reset (schedule_reset),
.reset (reset),
`ifdef PERF_ENABLE
.sched_perf (pipeline_perf_if.sched),
.sched_perf (pipeline_perf.sched),
`endif
.base_dcrs (base_dcrs),
@ -123,36 +116,36 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_fetch #(
.INSTANCE_ID ($sformatf("%s-fetch", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-fetch", INSTANCE_ID)))
) fetch (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (fetch_reset),
.reset (reset),
.icache_bus_if (icache_bus_if),
.schedule_if (schedule_if),
.fetch_if (fetch_if)
);
VX_decode #(
.INSTANCE_ID ($sformatf("%s-decode", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-decode", INSTANCE_ID)))
) decode (
.clk (clk),
.reset (decode_reset),
.reset (reset),
.fetch_if (fetch_if),
.decode_if (decode_if),
.decode_sched_if(decode_sched_if)
);
VX_issue #(
.INSTANCE_ID ($sformatf("%s-issue", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-issue", INSTANCE_ID)))
) issue (
`SCOPE_IO_BIND (1)
.clk (clk),
.reset (issue_reset),
.reset (reset),
`ifdef PERF_ENABLE
.issue_perf (pipeline_perf_if.issue),
.issue_perf (pipeline_perf.issue),
`endif
.decode_if (decode_if),
@ -161,17 +154,17 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_execute #(
.INSTANCE_ID ($sformatf("%s-execute", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-execute", INSTANCE_ID))),
.CORE_ID (CORE_ID)
) execute (
`SCOPE_IO_BIND (2)
.clk (clk),
.reset (execute_reset),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_tmp_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf_tmp),
.pipeline_perf (pipeline_perf),
`endif
.base_dcrs (base_dcrs),
@ -189,10 +182,10 @@ module VX_core import VX_gpu_pkg::*; #(
);
VX_commit #(
.INSTANCE_ID ($sformatf("%s-commit", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-commit", INSTANCE_ID)))
) commit (
.clk (clk),
.reset (commit_reset),
.reset (reset),
.commit_if (commit_if),
@ -202,134 +195,19 @@ module VX_core import VX_gpu_pkg::*; #(
.commit_sched_if(commit_sched_if)
);
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
`ifdef LMEM_ENABLE
`RESET_RELAY (lmem_unit_reset, reset);
VX_lmem_unit #(
VX_mem_unit #(
.INSTANCE_ID (INSTANCE_ID)
) lmem_unit (
.clk (clk),
.reset (lmem_unit_reset),
) mem_unit (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.lmem),
.lmem_perf (lmem_perf),
.coalescer_perf(coalescer_perf),
`endif
.lsu_mem_in_if (lsu_mem_if),
.lsu_mem_out_if (lsu_dcache_if)
.lsu_mem_if (lsu_mem_if),
.dcache_bus_if (dcache_bus_if)
);
`else
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
`ASSIGN_VX_LSU_MEM_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if();
if (LSU_WORD_SIZE != DCACHE_WORD_SIZE) begin
`RESET_RELAY (mem_coalescer_reset, reset);
VX_mem_coalescer #(
.INSTANCE_ID ($sformatf("%s-coalescer%0d", INSTANCE_ID, i)),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE)
) mem_coalescer (
.clk (clk),
.reset (mem_coalescer_reset),
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_atype (lsu_dcache_if[i].req_data.atype),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if.req_valid),
.out_req_mask (dcache_coalesced_if.req_data.mask),
.out_req_rw (dcache_coalesced_if.req_data.rw),
.out_req_byteen (dcache_coalesced_if.req_data.byteen),
.out_req_addr (dcache_coalesced_if.req_data.addr),
.out_req_atype (dcache_coalesced_if.req_data.atype),
.out_req_data (dcache_coalesced_if.req_data.data),
.out_req_tag (dcache_coalesced_if.req_data.tag),
.out_req_ready (dcache_coalesced_if.req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if.rsp_valid),
.out_rsp_mask (dcache_coalesced_if.rsp_data.mask),
.out_rsp_data (dcache_coalesced_if.rsp_data.data),
.out_rsp_tag (dcache_coalesced_if.rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if.rsp_ready)
);
end else begin
`ASSIGN_VX_LSU_MEM_IF (dcache_coalesced_if, lsu_dcache_if[i]);
end
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
`RESET_RELAY (lsu_adapter_reset, reset);
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (lsu_adapter_reset),
.lsu_mem_if (dcache_coalesced_if),
.mem_bus_if (dcache_bus_tmp_if)
);
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
`ifdef PERF_ENABLE
wire [`CLOG2(LSU_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle;
@ -353,8 +231,8 @@ module VX_core import VX_gpu_pkg::*; #(
wire [LSU_NUM_REQS-1:0] perf_dcache_wr_req_fire, perf_dcache_wr_req_fire_r;
wire [LSU_NUM_REQS-1:0] perf_dcache_rsp_fire;
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_perf_dcache
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin : g_j
assign perf_dcache_rd_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && ~lsu_mem_if[i].req_data.rw;
assign perf_dcache_wr_req_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].req_valid && lsu_mem_if[i].req_data.mask[j] && lsu_mem_if[i].req_ready && lsu_mem_if[i].req_data.rw;
assign perf_dcache_rsp_fire[i * `NUM_LSU_LANES + j] = lsu_mem_if[i].rsp_valid && lsu_mem_if[i].rsp_data.mask[j] && lsu_mem_if[i].rsp_ready;
@ -400,12 +278,11 @@ module VX_core import VX_gpu_pkg::*; #(
end
end
assign pipeline_perf_if.ifetches = perf_ifetches;
assign pipeline_perf_if.loads = perf_loads;
assign pipeline_perf_if.stores = perf_stores;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf_if.ifetch_latency = perf_icache_lat;
assign pipeline_perf_if.load_latency = perf_dcache_lat;
assign pipeline_perf.ifetches = perf_ifetches;
assign pipeline_perf.loads = perf_loads;
assign pipeline_perf.stores = perf_stores;
assign pipeline_perf.ifetch_latency = perf_icache_lat;
assign pipeline_perf.load_latency = perf_dcache_lat;
`endif

View file

@ -32,7 +32,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][`ADDR_TYPE_WIDTH-1:0] dcache_req_atype,
output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] dcache_req_flags,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
@ -96,7 +96,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign dcache_req_rw[i] = dcache_bus_if[i].req_data.rw;
assign dcache_req_byteen[i] = dcache_bus_if[i].req_data.byteen;
assign dcache_req_addr[i] = dcache_bus_if[i].req_data.addr;
assign dcache_req_atype[i] = dcache_bus_if[i].req_data.atype;
assign dcache_req_flags[i] = dcache_bus_if[i].req_data.flags;
assign dcache_req_data[i] = dcache_bus_if[i].req_data.data;
assign dcache_req_tag[i] = dcache_bus_if[i].req_data.tag;
assign dcache_bus_if[i].req_ready = dcache_req_ready[i];
@ -119,7 +119,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_req_data = icache_bus_if.req_data.data;
assign icache_req_tag = icache_bus_if.req_data.tag;
assign icache_bus_if.req_ready = icache_req_ready;
`UNUSED_VAR (icache_bus_if.req_data.atype)
`UNUSED_VAR (icache_bus_if.req_data.flags)
assign icache_bus_if.rsp_valid = icache_rsp_valid;
assign icache_bus_if.rsp_data.tag = icache_rsp_tag;
@ -127,13 +127,13 @@ module VX_core_top import VX_gpu_pkg::*; #(
assign icache_rsp_ready = icache_bus_if.rsp_ready;
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.lmem = '0;
assign mem_perf_if.mem = '0;
// Perf counter struct handed to the core from this wrapper; every counter
// group is tied to zero so no field of the struct is left undriven.
sysmem_perf_t mem_perf;
assign mem_perf.icache = '0;
assign mem_perf.dcache = '0;
assign mem_perf.l2cache = '0;
assign mem_perf.l3cache = '0;
assign mem_perf.lmem = '0;
// sysmem_perf_t also carries a coalescer group; drive it as well
assign mem_perf.coalescer = '0;
assign mem_perf.mem = '0;
`endif
`ifdef SCOPE
@ -144,7 +144,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
`endif
VX_core #(
.INSTANCE_ID ($sformatf("core")),
.INSTANCE_ID (`SFORMATF(("core"))),
.CORE_ID (CORE_ID)
) core (
`SCOPE_IO_BIND (0)
@ -152,7 +152,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
// connect the zeroed perf struct declared in this wrapper (`mem_perf`);
// no signal named `sysmem_perf` is declared in the visible part of this module
.sysmem_perf (mem_perf),
`endif
.dcr_bus_if (dcr_bus_if),

View file

@ -41,8 +41,8 @@ import VX_fpu_pkg::*;
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
VX_commit_csr_if.slave commit_csr_if,
@ -83,7 +83,7 @@ import VX_fpu_pkg::*;
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_write
assign fpu_write_enable[i] = fpu_csr_if[i].write_enable;
assign fpu_write_wid[i] = fpu_csr_if[i].write_wid;
assign fpu_write_fflags[i] = fpu_csr_if[i].write_fflags;
@ -107,7 +107,7 @@ import VX_fpu_pkg::*;
end
end
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_csr_read_frm
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
end
@ -155,41 +155,41 @@ import VX_fpu_pkg::*;
// CSRs read //////////////////////////////////////////////////////////////
reg [`XLEN-1:0] read_data_ro_r;
reg [`XLEN-1:0] read_data_rw_r;
reg read_addr_valid_r;
reg [`XLEN-1:0] read_data_ro_w;
reg [`XLEN-1:0] read_data_rw_w;
reg read_addr_valid_w;
always @(*) begin
read_data_ro_r = '0;
read_data_rw_r = '0;
read_addr_valid_r = 1;
read_data_ro_w = '0;
read_data_rw_w = '0;
read_addr_valid_w = 1;
case (read_addr)
`VX_CSR_MVENDORID : read_data_ro_r = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_r = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_r = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_r = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`VX_CSR_MVENDORID : read_data_ro_w = `XLEN'(`VENDOR_ID);
`VX_CSR_MARCHID : read_data_ro_w = `XLEN'(`ARCHITECTURE_ID);
`VX_CSR_MIMPID : read_data_ro_w = `XLEN'(`IMPLEMENTATION_ID);
`VX_CSR_MISA : read_data_ro_w = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
`ifdef EXT_F_ENABLE
`VX_CSR_FFLAGS : read_data_rw_r = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_r = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_r = `XLEN'(fcsr[read_wid]);
`VX_CSR_FFLAGS : read_data_rw_w = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
`VX_CSR_FRM : read_data_rw_w = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
`VX_CSR_FCSR : read_data_rw_w = `XLEN'(fcsr[read_wid]);
`endif
`VX_CSR_MSCRATCH : read_data_rw_r = mscratch;
`VX_CSR_MSCRATCH : read_data_rw_w = mscratch;
`VX_CSR_WARP_ID : read_data_ro_r = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_r = `XLEN'(CORE_ID);
`VX_CSR_ACTIVE_THREADS: read_data_ro_r = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_r = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_r = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_r = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_r = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_r = `XLEN'(`LMEM_BASE_ADDR);
`VX_CSR_WARP_ID : read_data_ro_w = `XLEN'(read_wid);
`VX_CSR_CORE_ID : read_data_ro_w = `XLEN'(CORE_ID);
`VX_CSR_ACTIVE_THREADS: read_data_ro_w = `XLEN'(thread_masks[read_wid]);
`VX_CSR_ACTIVE_WARPS: read_data_ro_w = `XLEN'(active_warps);
`VX_CSR_NUM_THREADS: read_data_ro_w = `XLEN'(`NUM_THREADS);
`VX_CSR_NUM_WARPS : read_data_ro_w = `XLEN'(`NUM_WARPS);
`VX_CSR_NUM_CORES : read_data_ro_w = `XLEN'(`NUM_CORES * `NUM_CLUSTERS);
`VX_CSR_LOCAL_MEM_BASE: read_data_ro_w = `XLEN'(`LMEM_BASE_ADDR);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_r, cycles);
`CSR_READ_64(`VX_CSR_MCYCLE, read_data_ro_w, cycles);
`VX_CSR_MPM_RESERVED : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_r = 'x;
`VX_CSR_MPM_RESERVED : read_data_ro_w = 'x;
`VX_CSR_MPM_RESERVED_H : read_data_ro_w = 'x;
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_r, commit_csr_if.instret);
`CSR_READ_64(`VX_CSR_MINSTRET, read_data_ro_w, commit_csr_if.instret);
`VX_CSR_SATP,
`VX_CSR_MSTATUS,
@ -200,77 +200,79 @@ import VX_fpu_pkg::*;
`VX_CSR_MTVEC,
`VX_CSR_MEPC,
`VX_CSR_PMPCFG0,
`VX_CSR_PMPADDR0 : read_data_ro_r = `XLEN'(0);
`VX_CSR_PMPADDR0 : read_data_ro_w = `XLEN'(0);
default: begin
read_addr_valid_r = 0;
read_addr_valid_w = 0;
if ((read_addr >= `VX_CSR_MPM_USER && read_addr < (`VX_CSR_MPM_USER + 32))
|| (read_addr >= `VX_CSR_MPM_USER_H && read_addr < (`VX_CSR_MPM_USER_H + 32))) begin
read_addr_valid_r = 1;
read_addr_valid_w = 1;
`ifdef PERF_ENABLE
case (base_dcrs.mpm_class)
`VX_DCR_MPM_CLASS_CORE: begin
case (read_addr)
// PERF: pipeline
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_r, pipeline_perf_if.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_r, pipeline_perf_if.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_r, pipeline_perf_if.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_r, pipeline_perf_if.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_r, pipeline_perf_if.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_ALU]);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ID, read_data_ro_w, pipeline_perf.sched.idles);
`CSR_READ_64(`VX_CSR_MPM_SCHED_ST, read_data_ro_w, pipeline_perf.sched.stalls);
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
`ifdef EXT_F_ENABLE
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_FPU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
`else
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_r, `PERF_CTR_BITS'(0));
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
`endif
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_r, pipeline_perf_if.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_r, pipeline_perf_if.issue.sfu_uses[`SFU_WCTL]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_r, pipeline_perf_if.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_r, pipeline_perf_if.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_r, pipeline_perf_if.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_r, pipeline_perf_if.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_r, pipeline_perf_if.load_latency);
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
`CSR_READ_64(`VX_CSR_MPM_STORES, read_data_ro_w, pipeline_perf.stores);
`CSR_READ_64(`VX_CSR_MPM_IFETCH_LT, read_data_ro_w, pipeline_perf.ifetch_latency);
`CSR_READ_64(`VX_CSR_MPM_LOAD_LT, read_data_ro_w, pipeline_perf.load_latency);
default:;
endcase
end
`VX_DCR_MPM_CLASS_MEM: begin
case (read_addr)
// PERF: icache
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_r, mem_perf_if.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_r, mem_perf_if.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_r, mem_perf_if.icache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_READS, read_data_ro_w, sysmem_perf.icache.reads);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MISS_R, read_data_ro_w, sysmem_perf.icache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_ICACHE_MSHR_ST, read_data_ro_w, sysmem_perf.icache.mshr_stalls);
// PERF: dcache
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_r, mem_perf_if.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_r, mem_perf_if.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_r, mem_perf_if.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_r, mem_perf_if.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_r, mem_perf_if.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_r, mem_perf_if.dcache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_READS, read_data_ro_w, sysmem_perf.dcache.reads);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_WRITES, read_data_ro_w, sysmem_perf.dcache.writes);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_R, read_data_ro_w, sysmem_perf.dcache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MISS_W, read_data_ro_w, sysmem_perf.dcache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_BANK_ST, read_data_ro_w, sysmem_perf.dcache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_DCACHE_MSHR_ST, read_data_ro_w, sysmem_perf.dcache.mshr_stalls);
// PERF: lmem
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_r, mem_perf_if.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_r, mem_perf_if.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_r, mem_perf_if.lmem.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_LMEM_READS, read_data_ro_w, sysmem_perf.lmem.reads);
`CSR_READ_64(`VX_CSR_MPM_LMEM_WRITES, read_data_ro_w, sysmem_perf.lmem.writes);
`CSR_READ_64(`VX_CSR_MPM_LMEM_BANK_ST, read_data_ro_w, sysmem_perf.lmem.bank_stalls);
// PERF: l2cache
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_r, mem_perf_if.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_r, mem_perf_if.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_r, mem_perf_if.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_r, mem_perf_if.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l2cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_READS, read_data_ro_w, sysmem_perf.l2cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_WRITES, read_data_ro_w, sysmem_perf.l2cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_R, read_data_ro_w, sysmem_perf.l2cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MISS_W, read_data_ro_w, sysmem_perf.l2cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l2cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L2CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l2cache.mshr_stalls);
// PERF: l3cache
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_r, mem_perf_if.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_r, mem_perf_if.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_r, mem_perf_if.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_r, mem_perf_if.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_r, mem_perf_if.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_r, mem_perf_if.l3cache.mshr_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_READS, read_data_ro_w, sysmem_perf.l3cache.reads);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_WRITES, read_data_ro_w, sysmem_perf.l3cache.writes);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_R, read_data_ro_w, sysmem_perf.l3cache.read_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MISS_W, read_data_ro_w, sysmem_perf.l3cache.write_misses);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_BANK_ST, read_data_ro_w, sysmem_perf.l3cache.bank_stalls);
`CSR_READ_64(`VX_CSR_MPM_L3CACHE_MSHR_ST, read_data_ro_w, sysmem_perf.l3cache.mshr_stalls);
// PERF: memory
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_r, mem_perf_if.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_r, mem_perf_if.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_r, mem_perf_if.mem.latency);
`CSR_READ_64(`VX_CSR_MPM_MEM_READS, read_data_ro_w, sysmem_perf.mem.reads);
`CSR_READ_64(`VX_CSR_MPM_MEM_WRITES, read_data_ro_w, sysmem_perf.mem.writes);
`CSR_READ_64(`VX_CSR_MPM_MEM_LT, read_data_ro_w, sysmem_perf.mem.latency);
// PERF: coalescer
`CSR_READ_64(`VX_CSR_MPM_COALESCER_MISS, read_data_ro_w, sysmem_perf.coalescer.misses);
default:;
endcase
end
@ -282,16 +284,16 @@ import VX_fpu_pkg::*;
endcase
end
assign read_data_ro = read_data_ro_r;
assign read_data_rw = read_data_rw_r;
assign read_data_ro = read_data_ro_w;
assign read_data_rw = read_data_rw_w;
`UNUSED_VAR (base_dcrs)
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`RUNTIME_ASSERT(~read_enable || read_addr_valid_w, ("%t: *** invalid CSR read address: 0x%0h (#%0d)", $time, read_addr, read_uuid))
`ifdef PERF_ENABLE
`UNUSED_VAR (mem_perf_if.icache);
`UNUSED_VAR (mem_perf_if.lmem);
`UNUSED_VAR (sysmem_perf.icache);
`UNUSED_VAR (sysmem_perf.lmem);
`endif
endmodule

View file

@ -24,8 +24,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
input base_dcrs_t base_dcrs,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
`ifdef EXT_F_ENABLE
@ -66,7 +66,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
`UNUSED_VAR (rs1_data)
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_rs1_data
assign rs1_data[i] = execute_if.data.rs1_data[i];
end
@ -82,8 +82,8 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
.commit_csr_if (commit_csr_if),
@ -113,12 +113,15 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
// Per-lane thread identifiers.
//   wtid[i]: lane index widened to XLEN; when PID_BITS != 0 it is offset by
//            execute_if.data.pid * NUM_LANES.
//   gtid[i]: CORE_ID shifted left by (NW_BITS + NT_BITS), plus the warp id shifted
//            left by NT_BITS, plus wtid[i].
// NOTE(review): this text is a commit comparison; the unlabeled and labeled
// `begin` lines below appear to be the before/after forms of the same statement.
wire [NUM_LANES-1:0][`XLEN-1:0] wtid, gtid;
for (genvar i = 0; i < NUM_LANES; ++i) begin
if (PID_BITS != 0) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_wtid
if (PID_BITS != 0) begin : g_pid
assign wtid[i] = `XLEN'(execute_if.data.pid * NUM_LANES + i);
end else begin
end else begin : g_no_pid
assign wtid[i] = `XLEN'(i);
end
end
// gtid is produced in its own labeled loop rather than inside the wtid loop.
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_gtid
assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
end

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
module VX_dcr_data import VX_gpu_pkg::*; (
input wire clk,
input wire reset,
@ -50,9 +50,9 @@ module VX_dcr_data import VX_gpu_pkg::*, VX_trace_pkg::*; (
// Debug-only trace of base-DCR writes, compiled in when DBG_TRACE_PIPELINE is defined.
// One record is printed per cycle in which dcr_bus_if.write_valid is high:
// a prefix with $time, the decoded DCR address, then the written data.
// NOTE(review): this text is a commit comparison; each TRACE(1, ...) / TRACE(2, ...)
// pair below appears to be the before/after form of one statement.
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (dcr_bus_if.write_valid) begin
`TRACE(1, ("%d: base-dcr: state=", $time));
`TRACE(2, ("%t: base-dcr: state=", $time))
// NOTE(review): trace_base_dcr still passes level 1 while the surrounding
// post-change TRACE calls use level 2. If the first argument is a verbosity
// level, a level-1 run would print the state name without its prefix or
// trailing newline. Confirm whether this should be 2 as well.
trace_base_dcr(1, dcr_bus_if.write_addr);
`TRACE(1, (", data=0x%h\n", dcr_bus_if.write_data));
`TRACE(2, (", data=0x%h\n", dcr_bus_if.write_data))
end
end
`endif

View file

@ -15,19 +15,19 @@
// Operand-marking helpers used by the decoder.
//   USED_IREG(x): copies register field x into its decoded form and sets use_x = 1.
//   USED_FREG(x): same, available only when EXT_F_ENABLE is defined.
// With EXT_F_ENABLE the decoded index gains one leading bit: 1'b0 through USED_IREG
// and 1'b1 through USED_FREG. Without it the index is passed through unchanged.
// NOTE(review): this text is a commit comparison; the `x``_r` and `x``_v` lines
// appear to be the before/after names of the same target, not two assignments.
`ifdef EXT_F_ENABLE
`define USED_IREG(x) \
x``_r = {1'b0, ``x}; \
x``_v = {1'b0, ``x}; \
use_``x = 1
`define USED_FREG(x) \
x``_r = {1'b1, ``x}; \
x``_v = {1'b1, ``x}; \
use_``x = 1
`else
`define USED_IREG(x) \
x``_r = ``x; \
x``_v = ``x; \
use_``x = 1
`endif
module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
module VX_decode import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
@ -50,7 +50,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
op_args_t op_args;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v;
reg use_rd, use_rs1, use_rs2, use_rs3;
reg is_wstall;
@ -152,13 +152,13 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
always @(*) begin
ex_type = '0;
ex_type = 'x;
op_type = 'x;
op_args = 'x;
rd_r = '0;
rs1_r = '0;
rs2_r = '0;
rs3_r = '0;
rd_v = '0;
rs1_v = '0;
rs2_v = '0;
rs3_v = '0;
use_rd = 0;
use_rs1 = 0;
use_rs2 = 0;
@ -376,14 +376,16 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
`USED_IREG (rs2);
end
`ifdef EXT_F_ENABLE
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
`INST_FMADD, // 7'b1000011
`INST_FMSUB, // 7'b1000111
`INST_FNMSUB, // 7'b1001011
`INST_FNMADD: // 7'b1001111
begin
ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_type = `INST_OP_BITS'({2'b00, 1'b1, opcode[3]});
op_args.fpu.frm = func3;
op_args.fpu.fmt[0] = func2[0]; // float / double
op_args.fpu.fmt[1] = opcode[3] ^ opcode[2]; // SUB
use_rd = 1;
`USED_FREG (rd);
`USED_FREG (rs1);
@ -399,9 +401,10 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
case (func5)
5'b00000, // FADD
5'b00001, // FSUB
5'b00010, // FMUL
5'b00011: begin // FDIV
op_type = `INST_OP_BITS'(func5[1:0]);
5'b00010: // FMUL
begin
op_type = `INST_OP_BITS'({2'b00, 1'b0, func5[1]});
op_args.fpu.fmt[1] = func5[0]; // SUB
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
@ -430,6 +433,13 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
`USED_FREG (rs1);
end
`endif
5'b00011: begin
// FDIV
op_type = `INST_OP_BITS'(`INST_FPU_DIV);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
5'b01011: begin
// FSQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
@ -527,7 +537,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
end
// disable write to integer register r0
wire wb = use_rd && (rd_r != 0);
wire wb = use_rd && (rd_v != 0);
VX_elastic_buffer #(
.DATAW (DATAW),
@ -537,7 +547,7 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
.reset (reset),
.valid_in (fetch_if.valid),
.ready_in (fetch_if.ready),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_r, rs1_r, rs2_r, rs3_r}),
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v}),
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
.valid_out (decode_if.valid),
.ready_out (decode_if.ready)
@ -547,9 +557,10 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire fetch_fire = fetch_if.valid && fetch_if.ready;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.unlock = ~is_wstall;
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif
@ -557,14 +568,14 @@ module VX_decode import VX_gpu_pkg::*, VX_trace_pkg::*; #(
// Debug-only trace of decoded instructions, compiled in when DBG_TRACE_PIPELINE is defined.
// One record is printed per cycle in which decode_if.valid and decode_if.ready are
// both high. The record is assembled from several calls: header (time, instance, wid,
// PC, raw instr), execution unit type, operation, thread mask and register operands,
// operation arguments, and finally the uuid with a newline.
// The printed PC is {decode_if.data.PC, 1'd0}, i.e. the stored PC with a zero bit appended.
// NOTE(review): this text is a commit comparison; each pair of near-identical TRACE
// lines appears to be the before/after form of one statement ("%d" vs "%t" for $time,
// and with/without the trailing semicolon).
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && decode_if.ready) begin
`TRACE(1, ("%d: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr));
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, instr=0x%0h, ex=", $time, INSTANCE_ID, decode_if.data.wid, {decode_if.data.PC, 1'd0}, instr))
trace_ex_type(1, decode_if.data.ex_type);
`TRACE(1, (", op="));
`TRACE(1, (", op="))
trace_ex_op(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, opds=%b%b%b%b",
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3));
decode_if.data.tmask, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, use_rd, use_rs1, use_rs2, use_rs3))
trace_op_args(1, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args);
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid));
`TRACE(1, (" (#%0d)\n", decode_if.data.uuid))
end
end
`endif

View file

@ -33,7 +33,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_tids
assign tids[i] = `NT_WIDTH'(i);
end
@ -50,23 +50,19 @@ module VX_dispatch import VX_gpu_pkg::*; #(
`UNUSED_PIN (valid_out)
);
wire [`NUM_EX_UNITS-1:0] operands_reset;
assign operands_if.ready = operands_reset[operands_if.data.ex_type];
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
`RESET_RELAY (buffer_reset, reset);
wire [`NUM_EX_UNITS-1:0] operands_ready_in;
assign operands_if.ready = operands_ready_in[operands_if.data.ex_type];
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_buffers
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2),
.OUT_REG (2), // 2-cycle EB for area reduction
.LUTRAM (1)
.OUT_REG (1)
) buffer (
.clk (clk),
.reset (buffer_reset),
.reset (reset),
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
.ready_in (operands_reset[i]),
.ready_in (operands_ready_in[i]),
.data_in ({
operands_if.data.uuid,
operands_if.data.wis,
@ -92,7 +88,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_stalls
always @(posedge clk) begin
if (reset) begin
perf_stalls_r[i] <= '0;

View file

@ -49,13 +49,12 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
wire [`ISSUE_WIDTH-1:0] dispatch_ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_dispatch_data
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_data[i] = dispatch_if[i].data;
assign dispatch_if[i].ready = dispatch_ready[i];
end
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
wire [BLOCK_SIZE-1:0] block_ready;
wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
@ -66,30 +65,53 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire batch_done = (& block_done);
// Batch select logic: chooses which group of BLOCK_SIZE issue slots is served.
// batch_idx is a register when BATCH_COUNT != 1 and is tied to 0 otherwise.
// NOTE(review): this text is a commit comparison; adjacent near-duplicate lines
// (unlabeled vs labeled `begin`, and the two `end else` branches of the always
// block) appear to be the before/after forms of the same statement.
logic [BATCH_COUNT_W-1:0] batch_idx;
if (BATCH_COUNT != 1) begin
if (BATCH_COUNT != 1) begin : g_batch_idx
wire [BATCH_COUNT_W-1:0] batch_idx_n;
wire [BATCH_COUNT-1:0] valid_batches;
// valid_batches[i] is the OR-reduction of dispatch_valid over the BLOCK_SIZE
// slots that belong to batch i.
for (genvar i = 0; i < BATCH_COUNT; ++i) begin : g_valid_batches
assign valid_batches[i] = | dispatch_valid[i * BLOCK_SIZE +: BLOCK_SIZE];
end
// The arbiter proposes the next batch (batch_idx_n) among batches with pending
// work; batch_done is wired to grant_ready.
// NOTE(review): TYPE "P" is presumably a priority policy. Confirm against
// VX_generic_arbiter.
VX_generic_arbiter #(
.NUM_REQS (BATCH_COUNT),
.TYPE ("P")
) batch_sel (
.clk (clk),
.reset (reset),
.requests (valid_batches),
.grant_index (batch_idx_n),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (grant_valid),
.grant_ready (batch_done)
);
// The earlier form advanced batch_idx by batch_done each cycle; the later form
// loads the arbiter's choice, and only when batch_done is high.
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end else if (batch_done) begin
batch_idx <= batch_idx_n;
end
end
end else begin
end else begin : g_batch_idx_0
// Single batch: the index is constant and batch_done has no consumer here.
assign batch_idx = 0;
`UNUSED_VAR (batch_done)
end
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_issue_indices
assign issue_indices[block_idx] = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
end
wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
assign issue_indices[block_idx] = issue_idx;
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_blocks
wire [ISSUE_W-1:0] issue_idx = issue_indices[block_idx];
wire valid_p, ready_p;
if (`NUM_THREADS != NUM_LANES) begin
if (`NUM_THREADS > NUM_LANES) begin : g_partial_threads
reg [NUM_PACKETS-1:0] sent_mask_p;
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
wire dispatch_valid_r;
@ -102,7 +124,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire fire_eop = fire_p && is_last_p;
always @(posedge clk) begin
if (block_reset) begin
if (reset) begin
sent_mask_p <= '0;
is_first_p <= 1;
end else begin
@ -124,8 +146,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar j = 0; j < NUM_LANES; ++j) begin
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_per_packet_data
for (genvar j = 0; j < NUM_LANES; ++j) begin : g_j
localparam k = i * NUM_LANES + j;
assign per_packet_tmask[i][j] = dispatch_tmask[k];
assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
@ -135,10 +157,12 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
end
wire [NUM_PACKETS-1:0] packet_valids;
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_valids
assign packet_valids[i] = (| per_packet_tmask[i]);
end
wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_packet_ids
assign packet_ids[i] = PID_WIDTH'(i);
end
@ -187,13 +211,13 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_pid[block_idx] = start_p;
assign block_sop[block_idx] = is_first_p;
assign block_eop[block_idx] = is_last_p;
if (FANOUT_ENABLE) begin
if (FANOUT_ENABLE) begin : g_block_ready_fanout
assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
end else begin
end else begin : g_block_ready
assign block_ready[block_idx] = ready_p && block_enable;
end
assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
end else begin
assign block_done[block_idx] = fire_eop || ~dispatch_valid[issue_idx];
end else begin : g_full_threads
assign valid_p = dispatch_valid[issue_idx];
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
@ -203,29 +227,31 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
assign block_sop[block_idx] = 1'b1;
assign block_eop[block_idx] = 1'b1;
assign block_ready[block_idx] = ready_p;
assign block_done[block_idx] = ~valid_p || ready_p;
assign block_done[block_idx] = ready_p || ~valid_p;
end
wire [ISSUE_ISW_W-1:0] isw;
if (BATCH_COUNT != 1) begin
if (BLOCK_SIZE != 1) begin
if (BATCH_COUNT != 1) begin : g_isw_batch
if (BLOCK_SIZE != 1) begin : g_block
assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
end else begin
end else begin : g_no_block
assign isw = batch_idx;
end
end else begin
end else begin : g_isw
assign isw = block_idx;
end
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
logic [OUT_DATAW-1:0] execute_data, execute_data_w;
VX_elastic_buffer #(
.DATAW (OUT_DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) buf_out (
.clk (clk),
.reset (block_reset),
.reset (reset),
.valid_in (valid_p),
.ready_in (ready_p),
.data_in ({
@ -239,17 +265,27 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
block_pid[block_idx],
block_sop[block_idx],
block_eop[block_idx]}),
.data_out (execute_if[block_idx].data),
.data_out (execute_data),
.valid_out (execute_if[block_idx].valid),
.ready_out (execute_if[block_idx].ready)
);
// Select the payload forwarded to execute_if[block_idx].data.
// When NUM_THREADS equals NUM_LANES, the three low bits of the buffered payload are
// replaced by the constants 0, 1, 1 (default pid, sop, and eop); every other bit is
// passed through. Otherwise the buffered payload is forwarded unchanged.
if (`NUM_THREADS == NUM_LANES) begin : g_execute_data_w_full
assign execute_data_w = {execute_data[OUT_DATAW-1:3], 3'b011};
end else begin : g_execute_data_w_partial
assign execute_data_w = execute_data;
end
assign execute_if[block_idx].data = execute_data_w;
end
// Build the per-issue-slot ready vector returned to the dispatch inputs.
// All bits default to 0; for each block, the bit selected by issue_indices[...] is
// set when that block is ready and block_eop is high for it.
// NOTE(review): this text is a commit comparison; the `i` and `block_idx` loop
// header/body pairs appear to be the before/after forms of the same loop.
reg [`ISSUE_WIDTH-1:0] ready_in;
always @(*) begin
ready_in = 0;
for (integer i = 0; i < BLOCK_SIZE; ++i) begin
ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
for (integer block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
ready_in[issue_indices[block_idx]] = block_ready[block_idx] && block_eop[block_idx];
end
end
assign dispatch_ready = ready_in;

View file

@ -23,8 +23,8 @@ module VX_execute import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
input base_dcrs_t base_dcrs,
@ -51,41 +51,35 @@ module VX_execute import VX_gpu_pkg::*; #(
VX_fpu_csr_if fpu_csr_if[`NUM_FPU_BLOCKS]();
`endif
`RESET_RELAY (alu_reset, reset);
`RESET_RELAY (lsu_reset, reset);
`RESET_RELAY (sfu_reset, reset);
VX_alu_unit #(
.INSTANCE_ID ($sformatf("%s-alu", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-alu", INSTANCE_ID)))
) alu_unit (
.clk (clk),
.reset (alu_reset),
.reset (reset),
.dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.branch_ctl_if (branch_ctl_if)
);
`SCOPE_IO_SWITCH (1)
`SCOPE_IO_SWITCH (1);
VX_lsu_unit #(
.INSTANCE_ID ($sformatf("%s-lsu", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-lsu", INSTANCE_ID)))
) lsu_unit (
`SCOPE_IO_BIND (0)
.clk (clk),
.reset (lsu_reset),
.reset (reset),
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.lsu_mem_if (lsu_mem_if)
);
`ifdef EXT_F_ENABLE
`RESET_RELAY (fpu_reset, reset);
VX_fpu_unit #(
.INSTANCE_ID ($sformatf("%s-fpu", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-fpu", INSTANCE_ID)))
) fpu_unit (
.clk (clk),
.reset (fpu_reset),
.reset (reset),
.dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
.fpu_csr_if (fpu_csr_if)
@ -93,14 +87,14 @@ module VX_execute import VX_gpu_pkg::*; #(
`endif
VX_sfu_unit #(
.INSTANCE_ID ($sformatf("%s-sfu", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-sfu", INSTANCE_ID))),
.CORE_ID (CORE_ID)
) sfu_unit (
.clk (clk),
.reset (sfu_reset),
.reset (reset),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if (pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
.base_dcrs (base_dcrs),
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),

View file

@ -51,8 +51,9 @@ module VX_fetch import VX_gpu_pkg::*; #(
wire [`NUM_THREADS-1:0] rsp_tmask;
VX_dp_ram #(
.DATAW (`PC_BITS + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.DATAW (`PC_BITS + `NUM_THREADS),
.SIZE (`NUM_WARPS),
.RDW_MODE ("R"),
.LUTRAM (1)
) tag_store (
.clk (clk),
@ -71,7 +72,7 @@ module VX_fetch import VX_gpu_pkg::*; #(
// This resolves potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache requests.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`NUM_WARPS-1:0] pending_ibuf_full;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_reads
VX_pending_size #(
.SIZE (`IBUF_SIZE)
) pending_reads (
@ -116,9 +117,9 @@ module VX_fetch import VX_gpu_pkg::*; #(
.ready_out (icache_bus_if.req_ready)
);
assign icache_bus_if.req_data.atype = '0;
assign icache_bus_if.req_data.flags = '0;
assign icache_bus_if.req_data.rw = 0;
assign icache_bus_if.req_data.byteen = 4'b1111;
assign icache_bus_if.req_data.byteen = '1;
assign icache_bus_if.req_data.data = '0;
// Icache Response
@ -131,47 +132,59 @@ module VX_fetch import VX_gpu_pkg::*; #(
assign fetch_if.data.uuid = rsp_uuid;
assign icache_bus_if.rsp_ready = fetch_if.ready;
`ifdef SCOPE
`ifdef DBG_SCOPE_FETCH
`SCOPE_IO_SWITCH (1);
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
VX_scope_tap #(
.SCOPE_ID (1),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
ICACHE_TAG_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
(ICACHE_WORD_SIZE*8) + ICACHE_TAG_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
wire icache_bus_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
wire icache_bus_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 1, 6, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
`UUID_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
`UUID_WIDTH + (ICACHE_WORD_SIZE * 8)
), {
schedule_if.valid,
schedule_if.ready,
icache_bus_if.req_valid,
icache_bus_if.req_ready,
icache_bus_if.rsp_valid,
icache_bus_if.rsp_ready
}, {
schedule_fire,
icache_req_fire,
icache_rsp_fire
}),
.probes ({
icache_bus_req_fire,
icache_bus_rsp_fire
},{
schedule_if.data.uuid, schedule_if.data.wid, schedule_if.data.tmask, schedule_if.data.PC,
icache_bus_if.req_data.tag, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.data, icache_bus_if.rsp_data.tag
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
icache_bus_if.req_data.tag.uuid, icache_bus_if.req_data.byteen, icache_bus_if.req_data.addr,
icache_bus_if.rsp_data.tag.uuid, icache_bus_if.rsp_data.data
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED()
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_FETCH
ila_fetch ila_fetch_inst (
.clk (clk),
.probe0 ({schedule_if.valid, schedule_if.data, schedule_if.ready}),
.probe1 ({icache_bus_if.req_valid, icache_bus_if.req_data, icache_bus_if.req_ready}),
.probe2 ({icache_bus_if.rsp_valid, icache_bus_if.rsp_data, icache_bus_if.rsp_ready})
);
`endif
`endif
// Debug-only trace of fetch traffic, compiled in when DBG_TRACE_MEM is defined.
// A "req" record is printed when schedule_if completes a valid/ready handshake and
// an "rsp" record when fetch_if does. PCs are printed as {PC, 1'b0}, i.e. the stored
// PC with a zero bit appended.
// NOTE(review): this text is a commit comparison; the *_fire wires and the
// `if (*_fire)` lines appear to be the earlier form, replaced by the inline
// valid && ready conditions, and "%d" by "%t" for $time.
`ifdef DBG_TRACE_MEM
wire schedule_fire = schedule_if.valid && schedule_if.ready;
wire fetch_fire = fetch_if.valid && fetch_if.ready;
always @(posedge clk) begin
if (schedule_fire) begin
`TRACE(1, ("%d: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid));
if (schedule_if.valid && schedule_if.ready) begin
`TRACE(1, ("%t: %s req: wid=%0d, PC=0x%0h, tmask=%b (#%0d)\n", $time, INSTANCE_ID, schedule_if.data.wid, {schedule_if.data.PC, 1'b0}, schedule_if.data.tmask, schedule_if.data.uuid))
end
if (fetch_fire) begin
`TRACE(1, ("%d: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid));
if (fetch_if.valid && fetch_if.ready) begin
`TRACE(1, ("%t: %s rsp: wid=%0d, PC=0x%0h, tmask=%b, instr=0x%0h (#%0d)\n", $time, INSTANCE_ID, fetch_if.data.wid, {fetch_if.data.PC, 1'b0}, fetch_if.data.tmask, fetch_if.data.instr, fetch_if.data.uuid))
end
end
`endif

View file

@ -41,7 +41,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (PARTIAL_BW ? 1 : 0)
.OUT_BUF (PARTIAL_BW ? 3 : 0)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -53,12 +53,10 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_fpus
`UNUSED_VAR (per_block_execute_if[block_idx].data.tid)
`UNUSED_VAR (per_block_execute_if[block_idx].data.wb)
`RESET_RELAY_EN (block_reset, reset, (BLOCK_SIZE > 1));
// Store request info
wire fpu_req_valid, fpu_req_ready;
wire fpu_rsp_valid, fpu_rsp_ready;
@ -71,9 +69,9 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] fpu_rsp_tmask;
wire [`PC_BITS-1:0] fpu_rsp_PC;
wire [`NR_BITS-1:0] fpu_rsp_rd;
wire [PID_WIDTH-1:0] fpu_rsp_pid;
wire fpu_rsp_sop;
wire fpu_rsp_eop;
wire [PID_WIDTH-1:0] fpu_rsp_pid, fpu_rsp_pid_u;
wire fpu_rsp_sop, fpu_rsp_sop_u;
wire fpu_rsp_eop, fpu_rsp_eop_u;
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
wire mdata_full;
@ -89,17 +87,30 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.SIZE (`FPUQ_SIZE)
) tag_store (
.clk (clk),
.reset (block_reset),
.reset (reset),
.acquire_en (execute_fire),
.write_addr (fpu_req_tag),
.write_data ({per_block_execute_if[block_idx].data.uuid, per_block_execute_if[block_idx].data.wid, per_block_execute_if[block_idx].data.tmask, per_block_execute_if[block_idx].data.PC, per_block_execute_if[block_idx].data.rd, per_block_execute_if[block_idx].data.pid, per_block_execute_if[block_idx].data.sop, per_block_execute_if[block_idx].data.eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),
.read_data ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_pid_u, fpu_rsp_sop_u, fpu_rsp_eop_u}),
.read_addr (fpu_rsp_tag),
.release_en (fpu_rsp_fire),
.full (mdata_full),
`UNUSED_PIN (empty)
);
// Packet-tracking fields of the FPU response.
// With PID_BITS == 0 the values read back from the tag store are ignored and the
// response carries pid = 0, sop = 1, eop = 1. Otherwise the stored values are
// forwarded as-is.
if (PID_BITS == 0) begin : g_no_fpu_rsp_pid
`UNUSED_VAR (fpu_rsp_eop_u)
`UNUSED_VAR (fpu_rsp_sop_u)
`UNUSED_VAR (fpu_rsp_pid_u)
assign {fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop} = {PID_WIDTH'(0), 1'b1, 1'b1};
end else begin : g_fpu_rsp_pid
assign {fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop} = {fpu_rsp_pid_u, fpu_rsp_sop_u, fpu_rsp_eop_u};
end
// resolve dynamic FRM from CSR
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
@ -119,7 +130,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dpi (
.clk (clk),
.reset (block_reset),
.reset (reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -148,7 +159,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_fpnew (
.clk (clk),
.reset (block_reset),
.reset (reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -177,7 +188,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.OUT_BUF (PARTIAL_BW ? 1 : 3)
) fpu_dsp (
.clk (clk),
.reset (block_reset),
.reset (reset),
.valid_in (fpu_req_valid),
.mask_in (per_block_execute_if[block_idx].data.tmask),
@ -200,27 +211,38 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
`endif
// handle FPU response
// handle CSR update
// Exception-flag (fflags) handling for one FPU block.
// fpu_rsp_fflags_q is the flag value reported to the CSR interface:
//   - PID_BITS != 0: the OR of the current response's flags with flags accumulated
//     from earlier responses; the accumulator is cleared when a response with
//     fpu_rsp_eop fires.
//   - PID_BITS == 0: the current response's flags directly.
// NOTE(review): this text is a commit comparison; adjacent near-duplicate lines
// (block_reset vs reset, unlabeled vs labeled `begin`) appear to be the before/after
// forms of the same statement, and the direct fpu_csr_if[block_idx] assignments
// appear to be the earlier form of the fpu_csr_tmp_if + pipe-register path.
fflags_t fpu_rsp_fflags_q;
if (PID_BITS != 0) begin
if (PID_BITS != 0) begin : g_pid
fflags_t fpu_rsp_fflags_r;
always @(posedge clk) begin
if (block_reset) begin
if (reset) begin
fpu_rsp_fflags_r <= '0;
end else if (fpu_rsp_fire) begin
// Accumulate flags across fired responses; restart from zero after eop.
fpu_rsp_fflags_r <= fpu_rsp_eop ? '0 : (fpu_rsp_fflags_r | fpu_rsp_fflags);
end
end
assign fpu_rsp_fflags_q = fpu_rsp_fflags_r | fpu_rsp_fflags;
end else begin
end else begin : g_no_pid
assign fpu_rsp_fflags_q = fpu_rsp_fflags;
end
// A CSR write is requested only when a response with eop fires and reports flags.
assign fpu_csr_if[block_idx].write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_csr_if[block_idx].write_fflags = fpu_rsp_fflags_q;
VX_fpu_csr_if fpu_csr_tmp_if();
assign fpu_csr_tmp_if.write_enable = fpu_rsp_fire && fpu_rsp_eop && fpu_rsp_has_fflags;
`ASSIGN_BLOCKED_WID (fpu_csr_tmp_if.write_wid, fpu_rsp_wid, block_idx, `NUM_FPU_BLOCKS)
assign fpu_csr_tmp_if.write_fflags = fpu_rsp_fflags_q;
// Passes {write_enable, write_wid, write_fflags} through a register with enable tied high.
// NOTE(review): RESETW = 1 presumably resets only the leading write_enable bit.
// Confirm against VX_pipe_register.
VX_pipe_register #(
.DATAW (1 + `NW_WIDTH + $bits(fflags_t)),
.RESETW (1)
) fpu_csr_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({fpu_csr_tmp_if.write_enable, fpu_csr_tmp_if.write_wid, fpu_csr_tmp_if.write_fflags}),
.data_out ({fpu_csr_if[block_idx].write_enable, fpu_csr_if[block_idx].write_wid, fpu_csr_if[block_idx].write_fflags})
);
// send response
@ -229,7 +251,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
.SIZE (0)
) rsp_buf (
.clk (clk),
.reset (block_reset),
.reset (reset),
.valid_in (fpu_rsp_valid),
.ready_in (fpu_rsp_ready),
.data_in ({fpu_rsp_uuid, fpu_rsp_wid, fpu_rsp_tmask, fpu_rsp_PC, fpu_rsp_rd, fpu_rsp_result, fpu_rsp_pid, fpu_rsp_sop, fpu_rsp_eop}),

View file

@ -41,17 +41,17 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
wire [BLOCK_SIZE-1:0] commit_in_ready;
wire [BLOCK_SIZE-1:0][ISSUE_ISW_W-1:0] commit_in_isw;
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in
assign commit_in_valid[i] = commit_in_if[i].valid;
assign commit_in_data[i] = commit_in_if[i].data;
assign commit_in_if[i].ready = commit_in_ready[i];
if (BLOCK_SIZE != `ISSUE_WIDTH) begin
if (BLOCK_SIZE != 1) begin
if (BLOCK_SIZE != `ISSUE_WIDTH) begin : g_commit_in_isw_partial
if (BLOCK_SIZE != 1) begin : g_block
assign commit_in_isw[i] = {commit_in_data[i][DATA_WIS_OFF+BLOCK_SIZE_W +: (ISSUE_ISW_W-BLOCK_SIZE_W)], BLOCK_SIZE_W'(i)};
end else begin
end else begin : g_no_block
assign commit_in_isw[i] = commit_in_data[i][DATA_WIS_OFF +: ISSUE_ISW_W];
end
end else begin
end else begin : g_commit_in_isw_full
assign commit_in_isw[i] = BLOCK_SIZE_W'(i);
end
end
@ -70,11 +70,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
commit_out_data[commit_in_isw[i]] = commit_in_data[i];
end
end
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin
for (genvar i = 0; i < BLOCK_SIZE; ++i) begin : g_commit_in_ready
assign commit_in_ready[i] = commit_out_ready[commit_in_isw[i]];
end
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin: g_out_bufs
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_tmp_if();
@ -94,31 +95,31 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
.ready_out (commit_tmp_if.ready)
);
// Expand a NUM_LANES-wide commit packet into NUM_THREADS-wide mask and data vectors.
//   - PID_BITS != 0: lane j of the packet lands at thread index pid * NUM_LANES + j;
//     all other mask bits are 0 and all other data entries are left as 'x.
//   - PID_BITS == 0: the packet's mask and data are used directly.
// NOTE(review): this text is a commit comparison; the *_r and *_w names (and the
// unlabeled vs labeled `begin` lines) appear to be the before/after forms of the
// same statements, not two separate sets of signals.
logic [`NUM_THREADS-1:0] commit_tmask_r;
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_r;
if (PID_BITS != 0) begin
logic [`NUM_THREADS-1:0] commit_tmask_w;
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_w;
if (PID_BITS != 0) begin : g_commit_data_with_pid
always @(*) begin
commit_tmask_r = '0;
commit_data_r = 'x;
commit_tmask_w = '0;
commit_data_w = 'x;
for (integer j = 0; j < NUM_LANES; ++j) begin
commit_tmask_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
commit_data_r[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
commit_tmask_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.tmask[j];
commit_data_w[commit_tmp_if.data.pid * NUM_LANES + j] = commit_tmp_if.data.data[j];
end
end
end else begin
assign commit_tmask_r = commit_tmp_if.data.tmask;
assign commit_data_r = commit_tmp_if.data.data;
end else begin : g_commit_data_no_pid
assign commit_tmask_w = commit_tmp_if.data.tmask;
assign commit_data_w = commit_tmp_if.data.data;
end
assign commit_out_if[i].valid = commit_tmp_if.valid;
assign commit_out_if[i].data = {
commit_tmp_if.data.uuid,
commit_tmp_if.data.wid,
commit_tmask_r,
commit_tmask_w,
commit_tmp_if.data.PC,
commit_tmp_if.data.wb,
commit_tmp_if.data.rd,
commit_data_r,
commit_data_w,
1'b0, // PID
commit_tmp_if.data.sop,
commit_tmp_if.data.eop

View file

@ -35,11 +35,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_instr_bufs
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`IBUF_SIZE),
.OUT_REG (2) // 2-cycle EB for area reduction
.OUT_REG (1)
) instr_buf (
.clk (clk),
.reset (reset),

View file

@ -16,7 +16,6 @@
module VX_ipdom_stack #(
parameter WIDTH = 1,
parameter DEPTH = 1,
parameter OUT_REG = 0,
parameter ADDRW = `LOG2UP(DEPTH)
) (
input wire clk,
@ -31,76 +30,63 @@ module VX_ipdom_stack #(
output wire empty,
output wire full
);
reg slot_set [DEPTH-1:0];
reg [ADDRW-1:0] rd_ptr, wr_ptr;
reg [ADDRW-1:0] rd_ptr, rd_ptr_n, wr_ptr;
reg empty_r, full_r;
wire [WIDTH-1:0] d0, d1;
wire d_set_n = slot_set[rd_ptr];
wire d_set_r;
// Next read pointer of the stack.
//   push          : follow the current write pointer.
//   pop (no push) : step back by one only when d_set_r is set.
//   otherwise     : hold.
always @(*) begin
if (push) begin
rd_ptr_n = wr_ptr;
end else if (pop) begin
rd_ptr_n = rd_ptr - ADDRW'(d_set_r);
end else begin
rd_ptr_n = rd_ptr;
end
end
always @(posedge clk) begin
if (reset) begin
rd_ptr <= '0;
wr_ptr <= '0;
empty_r <= 1;
full_r <= 0;
rd_ptr <= '0;
end else begin
`ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
`ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
`ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!"));
`ASSERT(~push || ~full, ("%t: runtime error: writing to a full stack!", $time));
`ASSERT(~pop || ~empty, ("%t: runtime error: reading an empty stack!", $time));
`ASSERT(~push || ~pop, ("%t: runtime error: push and pop in same cycle not supported!", $time));
if (push) begin
rd_ptr <= wr_ptr;
wr_ptr <= wr_ptr + ADDRW'(1);
empty_r <= 0;
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
end else if (pop) begin
wr_ptr <= wr_ptr - ADDRW'(d_set_n);
rd_ptr <= rd_ptr - ADDRW'(d_set_n);
empty_r <= (rd_ptr == 0) && (d_set_n == 1);
wr_ptr <= wr_ptr - ADDRW'(d_set_r);
empty_r <= (rd_ptr == 0) && d_set_r;
full_r <= 0;
end
rd_ptr <= rd_ptr_n;
end
end
wire [WIDTH * 2:0] qout = push ? {1'b0, q1, q0} : {1'b1, d1, d0};
VX_dp_ram #(
.DATAW (WIDTH * 2),
.SIZE (DEPTH),
.OUT_REG (OUT_REG ? 1 : 0),
.LUTRAM (OUT_REG ? 0 : 1)
) store (
.DATAW (1 + WIDTH * 2),
.SIZE (DEPTH),
.OUT_REG (1),
.RDW_MODE ("R")
) ipdom_store (
.clk (clk),
.reset (reset),
.read (1'b1),
.write (push),
.write (push || pop),
.wren (1'b1),
.waddr (wr_ptr),
.wdata ({q1, q0}),
.raddr (rd_ptr),
.rdata ({d1, d0})
);
always @(posedge clk) begin
if (push) begin
slot_set[wr_ptr] <= 0;
end else if (pop) begin
slot_set[rd_ptr] <= 1;
end
end
wire d_set_r;
VX_pipe_register #(
.DATAW (1),
.DEPTH (OUT_REG)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in (d_set_n),
.data_out (d_set_r)
.waddr (push ? wr_ptr : rd_ptr),
.wdata (qout),
.raddr (rd_ptr_n),
.rdata ({d_set_r, d1, d0})
);
assign d = d_set_r ? d0 : d1;

View file

@ -29,16 +29,17 @@ module VX_issue import VX_gpu_pkg::*; #(
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
);
`STATIC_ASSERT ((`ISSUE_WIDTH <= `NUM_WARPS), ("invalid parameter"))
`ifdef PERF_ENABLE
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_issue_perf_units_uses
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_issue_perf_sfu_uses
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
end
`endif
@ -49,9 +50,9 @@ module VX_issue import VX_gpu_pkg::*; #(
wire [`ISSUE_WIDTH-1:0] decode_ready_in;
assign decode_if.ready = decode_ready_in[decode_isw];
`SCOPE_IO_SWITCH (`ISSUE_WIDTH)
`SCOPE_IO_SWITCH (`ISSUE_WIDTH);
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : issue_slices
for (genvar issue_id = 0; issue_id < `ISSUE_WIDTH; ++issue_id) begin : g_slices
VX_decode_if #(
.NUM_WARPS (PER_ISSUE_WARPS)
) per_issue_decode_if();
@ -76,15 +77,13 @@ module VX_issue import VX_gpu_pkg::*; #(
assign decode_if.ibuf_pop[issue_id * PER_ISSUE_WARPS +: PER_ISSUE_WARPS] = per_issue_decode_if.ibuf_pop;
`endif
`RESET_RELAY (slice_reset, reset);
VX_issue_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, issue_id)),
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, issue_id))),
.ISSUE_ID (issue_id)
) issue_slice (
`SCOPE_IO_BIND(issue_id)
.clk (clk),
.reset (slice_reset),
.reset (reset),
`ifdef PERF_ENABLE
.issue_perf (per_issue_perf[issue_id]),
`endif
@ -94,7 +93,7 @@ module VX_issue import VX_gpu_pkg::*; #(
);
// Assign transposed dispatch_if
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin : g_dispatch_if
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
end
end

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
module VX_issue_slice import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter ISSUE_ID = 0
) (
@ -36,16 +36,11 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
VX_scoreboard_if scoreboard_if();
VX_operands_if operands_if();
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
`RESET_RELAY (operands_reset, reset);
`RESET_RELAY (dispatch_reset, reset);
VX_ibuffer #(
.INSTANCE_ID ($sformatf("%s-ibuffer", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-ibuffer", INSTANCE_ID)))
) ibuffer (
.clk (clk),
.reset (ibuf_reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.ibf_stalls),
`endif
@ -54,10 +49,10 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
);
VX_scoreboard #(
.INSTANCE_ID ($sformatf("%s-scoreboard", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-scoreboard", INSTANCE_ID)))
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.scb_stalls),
.perf_units_uses(issue_perf.units_uses),
@ -69,10 +64,10 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
);
VX_operands #(
.INSTANCE_ID ($sformatf("%s-operands", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-operands", INSTANCE_ID)))
) operands (
.clk (clk),
.reset (operands_reset),
.reset (reset),
`ifdef PERF_ENABLE
.perf_stalls (issue_perf.opd_stalls),
`endif
@ -82,10 +77,10 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
);
VX_dispatch #(
.INSTANCE_ID ($sformatf("%s-dispatch", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-dispatch", INSTANCE_ID)))
) dispatch (
.clk (clk),
.reset (dispatch_reset),
.reset (reset),
`ifdef PERF_ENABLE
`UNUSED_PIN (perf_stalls),
`endif
@ -93,65 +88,90 @@ module VX_issue_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
.dispatch_if (dispatch_if)
);
`ifdef SCOPE
`ifdef DBG_SCOPE_ISSUE
wire operands_if_fire = operands_if.valid && operands_if.ready;
wire operands_if_not_ready = ~operands_if.ready;
wire writeback_if_valid = writeback_if.valid;
VX_scope_tap #(
.SCOPE_ID (2),
.TRIGGERW (4),
.PROBEW (`UUID_WIDTH + `NUM_THREADS + `EX_BITS + `INST_OP_BITS +
1 + `NR_BITS + (`NUM_THREADS * 3 * `XLEN) +
`UUID_WIDTH + `NUM_THREADS + `NR_BITS + (`NUM_THREADS*`XLEN) + 1)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers ({
reset,
operands_if_fire,
operands_if_not_ready,
writeback_if_valid
}),
.probes ({
`SCOPE_IO_SWITCH (1);
wire decode_fire = decode_if.valid && decode_if.ready;
wire operands_fire = operands_if.valid && operands_if.ready;
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 2, 4, 3, (
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS + (3 * `XLEN) +
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * `XLEN) + 1
), {
decode_if.valid,
decode_if.ready,
operands_if.valid,
operands_if.ready
}, {
decode_fire,
operands_fire,
writeback_if.valid // ack-free
}, {
decode_if.data.uuid,
decode_if.data.wid,
decode_if.data.tmask,
decode_if.data.PC,
decode_if.data.ex_type,
decode_if.data.op_type,
decode_if.data.wb,
decode_if.data.rd,
decode_if.data.rs1,
decode_if.data.rs2,
decode_if.data.rs3,
operands_if.data.uuid,
operands_if.data.wis,
operands_if.data.tmask,
operands_if.data.PC,
operands_if.data.ex_type,
operands_if.data.op_type,
operands_if.data.wb,
operands_if.data.rd,
operands_if.data.rs1_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data,
operands_if.data.rs1_data[0],
operands_if.data.rs2_data[0],
operands_if.data.rs3_data[0],
writeback_if.data.uuid,
writeback_if.data.wis,
writeback_if.data.tmask,
writeback_if.data.rd,
writeback_if.data.data,
writeback_if.data.eop
}),
.bus_in (scope_bus_in),
.bus_out (scope_bus_out)
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED()
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_ISSUE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({decode_if.valid, decode_if.data, decode_if.ready}),
.probe1 ({scoreboard_if.valid, scoreboard_if.data, scoreboard_if.ready}),
.probe2 ({operands_if.valid, operands_if.data, operands_if.ready}),
.probe3 ({writeback_if.valid, writeback_if.data})
);
`endif
`endif
`ifdef DBG_TRACE_PIPELINE
always @(posedge clk) begin
if (operands_if.valid && operands_if.ready) begin
`TRACE(1, ("%d: %s wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}));
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, wis_to_wid(operands_if.data.wis, ISSUE_ID), {operands_if.data.PC, 1'b0}))
trace_ex_type(1, operands_if.data.ex_type);
`TRACE(1, (", op="));
`TRACE(1, (", op="))
trace_ex_op(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS);
`TRACE(1, (", tmask=%b, wb=%b, rd=%0d, rs1_data=", operands_if.data.tmask, operands_if.data.wb, operands_if.data.rd))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs1_data, `NUM_THREADS)
`TRACE(1, (", rs2_data="))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs2_data, `NUM_THREADS)
`TRACE(1, (", rs3_data="))
`TRACE_ARRAY1D(1, "0x%0h", operands_if.data.rs3_data, `NUM_THREADS)
trace_op_args(1, operands_if.data.ex_type, operands_if.data.op_type, operands_if.data.op_args);
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid));
`TRACE(1, (" (#%0d)\n", operands_if.data.uuid))
end
end
`endif

View file

@ -80,7 +80,7 @@ module VX_issue_top import VX_gpu_pkg::*; #(
assign decode_if.data.rs3 = decode_rs3;
assign decode_ready = decode_if.ready;
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_writeback_if
assign writeback_if[i].valid = writeback_valid[i];
assign writeback_if[i].data.uuid = writeback_uuid[i];
assign writeback_if[i].data.wis = writeback_wis[i];
@ -92,7 +92,7 @@ module VX_issue_top import VX_gpu_pkg::*; #(
assign writeback_if[i].data.eop = writeback_eop[i];
end
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin : g_dispatch_if
assign dispatch_valid[i] = dispatch_if[i].valid;
assign dispatch_uuid[i] = dispatch_if[i].data.uuid;
assign dispatch_wis[i] = dispatch_if[i].data.wis;
@ -113,6 +113,13 @@ module VX_issue_top import VX_gpu_pkg::*; #(
issue_perf_t issue_perf = '0;
`endif
`ifdef SCOPE
wire [0:0] scope_reset_w = 1'b0;
wire [0:0] scope_bus_in_w = 1'b0;
wire [0:0] scope_bus_out_w;
`UNUSED_VAR (scope_bus_out_w)
`endif
VX_issue #(
.INSTANCE_ID (INSTANCE_ID)
) issue (

View file

@ -1,201 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_lmem_unit: per-LSU-block request splitter in front of the local memory.
//
// For each of the `NUM_LSU_BLOCKS LSU blocks, an incoming request is steered by
// the per-lane `ADDR_TYPE_LOCAL bit of req_data.atype:
//   - lanes not flagged local are buffered and forwarded on lsu_mem_out_if[i];
//   - lanes flagged local are forwarded to an internal interface
//     (lsu_switch_if[i]) that feeds VX_lsu_adapter and then VX_local_mem.
// Responses from the two paths are merged back onto lsu_mem_in_if[i] by a
// VX_stream_arb instance.
//
// Parameters:
//   INSTANCE_ID - name prefix; "<INSTANCE_ID>-lmem" is passed to VX_local_mem.
// Ports:
//   clk, reset     - clock and reset inputs.
//   cache_perf     - (PERF_ENABLE only) perf counters driven by VX_local_mem.
//   lsu_mem_in_if  - request/response interfaces from the LSU side (slave).
//   lsu_mem_out_if - request/response interfaces for non-local traffic (master).
module VX_lmem_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_in_if [`NUM_LSU_BLOCKS],
VX_lsu_mem_if.master lsu_mem_out_if [`NUM_LSU_BLOCKS]
);
// Local memory size (1 << `LMEM_LOG_SIZE) must be divisible by `MEM_BLOCK_SIZE.
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
// `LMEM_BASE_ADDR must be aligned to the local memory size.
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
// Request payload width; matches the concatenations used below:
// mask (one bit per lane) + rw (1) + per-lane {byteen, addr, atype, data} + tag.
localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `ADDR_TYPE_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
// Response payload width (width of rsp_data as passed to the response arbiter).
// NOTE(review): presumably mask + per-lane data + tag; the rsp_data layout is
// declared in VX_lsu_mem_if, which is not visible here - confirm.
localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
// Address width handed to VX_local_mem: log2(size in bytes) minus log2(word size).
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
// Internal per-block interface carrying the local-memory portion of the traffic.
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_switch_if[`NUM_LSU_BLOCKS]();
// Declares block_reset, indexed per LSU block below (block_reset[i]).
// NOTE(review): the macro is defined elsewhere; presumably it registers/fans out
// reset - confirm the meaning of the trailing argument (1).
`RESET_RELAY_EX (block_reset, reset, `NUM_LSU_BLOCKS, 1);
// Per-block request split and response merge.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
// One bit per lane: set when that lane's address type has `ADDR_TYPE_LOCAL set.
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
assign is_addr_local_mask[j] = lsu_mem_in_if[i].req_data.atype[j][`ADDR_TYPE_LOCAL];
end
// At least one active (masked-in) lane targets a non-local address.
wire is_addr_global = | (lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask);
// At least one active (masked-in) lane targets a local address.
wire is_addr_local = | (lsu_mem_in_if[i].req_data.mask & is_addr_local_mask);
wire req_global_ready;
wire req_local_ready;
// Non-local path: buffered (SIZE=2, OUT_REG=1) and forwarded to lsu_mem_out_if[i].
// Only the lane mask is filtered; all other fields are passed through unchanged.
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (2),
.OUT_REG (1)
) req_global_buf (
.clk (clk),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_global),
.data_in ({
lsu_mem_in_if[i].req_data.mask & ~is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_global_ready),
.valid_out (lsu_mem_out_if[i].req_valid),
.data_out ({
lsu_mem_out_if[i].req_data.mask,
lsu_mem_out_if[i].req_data.rw,
lsu_mem_out_if[i].req_data.byteen,
lsu_mem_out_if[i].req_data.addr,
lsu_mem_out_if[i].req_data.atype,
lsu_mem_out_if[i].req_data.data,
lsu_mem_out_if[i].req_data.tag
}),
.ready_out (lsu_mem_out_if[i].req_ready)
);
// Local path: forwarded to lsu_switch_if[i] with the mask restricted to local lanes.
// NOTE(review): SIZE=0 / OUT_REG=0 presumably makes this a pass-through - confirm
// against VX_elastic_buffer.
VX_elastic_buffer #(
.DATAW (REQ_DATAW),
.SIZE (0),
.OUT_REG (0)
) req_local_buf (
.clk (clk),
.reset (block_reset[i]),
.valid_in (lsu_mem_in_if[i].req_valid && is_addr_local),
.data_in ({
lsu_mem_in_if[i].req_data.mask & is_addr_local_mask,
lsu_mem_in_if[i].req_data.rw,
lsu_mem_in_if[i].req_data.byteen,
lsu_mem_in_if[i].req_data.addr,
lsu_mem_in_if[i].req_data.atype,
lsu_mem_in_if[i].req_data.data,
lsu_mem_in_if[i].req_data.tag
}),
.ready_in (req_local_ready),
.valid_out (lsu_switch_if[i].req_valid),
.data_out ({
lsu_switch_if[i].req_data.mask,
lsu_switch_if[i].req_data.rw,
lsu_switch_if[i].req_data.byteen,
lsu_switch_if[i].req_data.addr,
lsu_switch_if[i].req_data.atype,
lsu_switch_if[i].req_data.data,
lsu_switch_if[i].req_data.tag
}),
.ready_out (lsu_switch_if[i].req_ready)
);
// Upstream ready is asserted when either targeted path can accept the request.
// NOTE(review): if one request has both local and non-local active lanes, this
// signals ready as soon as ONE path is ready while both valid_in inputs are
// asserted - verify that the producer never mixes the two kinds in a single
// request, or that the non-ready path cannot lose its portion.
assign lsu_mem_in_if[i].req_ready = (req_global_ready && is_addr_global)
|| (req_local_ready && is_addr_local);
// Merge responses from the local path (lsu_switch_if) and the non-local path
// (lsu_mem_out_if) back onto the single upstream response channel.
// NOTE(review): ARBITER "R" is presumably round-robin - confirm in VX_stream_arb.
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.OUT_BUF (1)
) rsp_arb (
.clk (clk),
.reset (block_reset[i]),
.valid_in ({
lsu_switch_if[i].rsp_valid,
lsu_mem_out_if[i].rsp_valid
}),
.ready_in ({
lsu_switch_if[i].rsp_ready,
lsu_mem_out_if[i].rsp_ready
}),
.data_in ({
lsu_switch_if[i].rsp_data,
lsu_mem_out_if[i].rsp_data
}),
.data_out (lsu_mem_in_if[i].rsp_data),
.valid_out (lsu_mem_in_if[i].rsp_valid),
.ready_out (lsu_mem_in_if[i].rsp_ready),
`UNUSED_PIN (sel_out)
);
end
// Flat array of memory-bus interfaces connected to VX_local_mem.
// Entry (i * `NUM_LSU_LANES + j) is lane j of LSU block i (see the assignment loop
// below). NOTE(review): this assumes LSU_NUM_REQS covers
// `NUM_LSU_BLOCKS * `NUM_LSU_LANES entries - confirm in VX_gpu_pkg.
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_if[LSU_NUM_REQS]();
// Per block: adapt the local-path LSU interface to per-lane memory-bus interfaces.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin
// Per-lane buses for this block, produced by the adapter below.
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lmem_bus_tmp_if[`NUM_LSU_LANES]();
// Bridges lsu_switch_if[i] to the `NUM_LSU_LANES per-lane buses.
// NOTE(review): TAG_SEL_BITS excludes `UUID_WIDTH bits of the tag; the exact use
// of these bits is defined inside VX_lsu_adapter - confirm there.
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_BITS (LSU_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lsu_adapter (
.clk (clk),
.reset (block_reset[i]),
.lsu_mem_if (lsu_switch_if[i]),
.mem_bus_if (lmem_bus_tmp_if)
);
// Connect this block's per-lane buses into its slice of the flat lmem_bus_if array.
for (genvar j = 0; j < `NUM_LSU_LANES; ++j) begin
`ASSIGN_VX_MEM_BUS_IF (lmem_bus_if[i * `NUM_LSU_LANES + j], lmem_bus_tmp_if[j]);
end
end
// Separate reset signal (lmem_reset) for the local memory instance.
`RESET_RELAY (lmem_reset, reset);
// Local memory of (1 << `LMEM_LOG_SIZE) bytes with `LMEM_NUM_BANKS banks, serving
// all LSU_NUM_REQS request ports of lmem_bus_if.
VX_local_mem #(
.INSTANCE_ID($sformatf("%s-lmem", INSTANCE_ID)),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (LSU_NUM_REQS),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
.reset (lmem_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.mem_bus_if (lmem_bus_if)
);
endmodule

View file

@ -13,7 +13,7 @@
`include "VX_define.vh"
module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
module VX_lsu_slice import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
`SCOPE_IO_DECL
@ -59,25 +59,25 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire req_is_fence, rsp_is_fence;
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_full_addr
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_args.lsu.offset);
end
// address type calculation
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_flags
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
assign mem_req_flags[i][`MEM_REQ_FLAG_FLUSH] = req_is_fence;
assign mem_req_flags[i][`MEM_REQ_FLAG_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
`ifdef LMEM_ENABLE
// is local memory address
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
assign mem_req_flags[i][`MEM_REQ_FLAG_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
@ -102,8 +102,6 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
wire no_rsp_buf_valid, no_rsp_buf_ready;
@ -151,49 +149,49 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_addr
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_byteen_w
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_w;
always @(*) begin
mem_req_byteen_r = '0;
mem_req_byteen_w = '0;
case (`INST_LSU_WSIZE(execute_if.data.op_type))
0: begin // 8-bit
mem_req_byteen_r[req_align[i]] = 1'b1;
mem_req_byteen_w[req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_r[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen_w[{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
// 3: 64 bit
default : mem_req_byteen_r = {LSU_WORD_SIZE{1'b1}};
default : mem_req_byteen_w = {LSU_WORD_SIZE{1'b1}};
endcase
end
assign mem_req_byteen[i] = mem_req_byteen_r;
assign mem_req_byteen[i] = mem_req_byteen_w;
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_missalign
wire lsu_req_fire = execute_if.valid && execute_if.ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
("%t: misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
$time, execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid))
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_data
always @(*) begin
mem_req_data[i] = execute_if.data.rs2_data[i];
case (req_align[i])
@ -215,7 +213,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
if (PID_BITS != 0) begin
if (PID_BITS != 0) begin : g_pids
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
@ -271,10 +269,10 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("%t: allocator full!", $time))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("%t: oops! broken sop request!", $time))
`UNUSED_VAR (mem_rsp_sop)
end else begin
end else begin : g_no_pids
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
@ -300,7 +298,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire [NUM_LANES-1:0] lsu_mem_req_mask;
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] lsu_mem_req_atype;
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_mem_req_flags;
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
wire lsu_mem_req_ready;
@ -311,16 +309,14 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
wire [LSU_TAG_WIDTH-1:0] lsu_mem_rsp_tag;
wire lsu_mem_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("%s-scheduler", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-memsched", INSTANCE_ID))),
.CORE_REQS (NUM_LANES),
.MEM_CHANNELS(NUM_LANES),
.WORD_SIZE (LSU_WORD_SIZE),
.LINE_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
@ -330,7 +326,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
.CORE_OUT_BUF(0)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
.reset (reset),
// Input request
.core_req_valid (mem_req_valid),
@ -338,12 +334,12 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_atype (mem_req_atype),
.core_req_flags (mem_req_flags),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
`UNUSED_PIN (core_req_empty),
`UNUSED_PIN (core_req_sent),
`UNUSED_PIN (core_req_wr_notify),
// Output response
.core_rsp_valid (mem_rsp_valid),
@ -360,7 +356,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
.mem_req_mask (lsu_mem_req_mask),
.mem_req_byteen (lsu_mem_req_byteen),
.mem_req_addr (lsu_mem_req_addr),
.mem_req_atype (lsu_mem_req_atype),
.mem_req_flags (lsu_mem_req_flags),
.mem_req_data (lsu_mem_req_data),
.mem_req_tag (lsu_mem_req_tag),
.mem_req_ready (lsu_mem_req_ready),
@ -378,7 +374,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
assign lsu_mem_if.req_data.rw = lsu_mem_req_rw;
assign lsu_mem_if.req_data.byteen = lsu_mem_req_byteen;
assign lsu_mem_if.req_data.addr = lsu_mem_req_addr;
assign lsu_mem_if.req_data.atype = lsu_mem_req_atype;
assign lsu_mem_if.req_data.flags = lsu_mem_req_flags;
assign lsu_mem_if.req_data.data = lsu_mem_req_data;
assign lsu_mem_if.req_data.tag = lsu_mem_req_tag;
assign lsu_mem_req_ready = lsu_mem_if.req_ready;
@ -426,7 +422,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
for (genvar i = 0; i < NUM_LANES; i++) begin : g_rsp_data
`ifdef XLEN_64
wire [63:0] rsp_data64 = mem_rsp_data[i];
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
@ -483,6 +479,7 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
.valid_out (commit_no_rsp_if.valid),
.ready_out (commit_no_rsp_if.ready)
);
assign commit_no_rsp_if.data.rd = '0;
assign commit_no_rsp_if.data.wb = 1'b0;
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
@ -507,51 +504,74 @@ module VX_lsu_slice import VX_gpu_pkg::*, VX_trace_pkg::*; #(
`ifdef DBG_TRACE_MEM
always @(posedge clk) begin
if (execute_if.valid && fence_lock) begin
`TRACE(1, ("%d: *** %s fence wait\n", $time, INSTANCE_ID));
`TRACE(2, ("%t: *** %s fence wait\n", $time, INSTANCE_ID))
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
`TRACE(2, ("%t: %s Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
`TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES)
`TRACE(2, (", flags="))
`TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES)
`TRACE(2, (", byteen=0x%0h, data=", mem_req_byteen))
`TRACE_ARRAY1D(2, "0x%0h", mem_req_data, NUM_LANES)
`TRACE(2, (", sop=%b, eop=%b, tag=0x%0h (#%0d)\n", execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
end else begin
`TRACE(1, ("%d: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid));
`TRACE(2, ("%t: %s Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, INSTANCE_ID, execute_if.data.wid, {execute_if.data.PC, 1'b0}, mem_req_mask))
`TRACE_ARRAY1D(2, "0x%h", full_addr, NUM_LANES)
`TRACE(2, (", flags="))
`TRACE_ARRAY1D(2, "%b", mem_req_flags, NUM_LANES)
`TRACE(2, (", byteen=0x%0h, rd=%0d, sop=%b, eop=%b, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, execute_if.data.sop, execute_if.data.eop, mem_req_tag, execute_if.data.uuid))
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
`TRACE(2, ("%t: %s Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, INSTANCE_ID, rsp_wid, {rsp_pc, 1'b0}, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop))
`TRACE_ARRAY1D(2, "0x%0h", mem_rsp_data, NUM_LANES)
`TRACE(2, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid))
end
end
`endif
`ifdef SCOPE
`ifdef DBG_SCOPE_LSU
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (1 + NUM_LANES*(`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE*8) + `UUID_WIDTH + NUM_LANES*LSU_WORD_SIZE*8 + `UUID_WIDTH)
) scope_tap (
.clk (clk),
.reset (scope_reset),
.start (1'b0),
.stop (1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes ({mem_req_rw, full_addr, mem_req_byteen, mem_req_data, execute_if.data.uuid, rsp_data, rsp_uuid}),
.bus_in (scope_bus_in),
.bus_out(scope_bus_out)
`SCOPE_IO_SWITCH (1);
wire reset_negedge;
`NEG_EDGE (reset_negedge, reset);
`SCOPE_TAP_EX (0, 3, 4, 2, (
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH
), {
mem_req_valid,
mem_req_ready,
mem_rsp_valid,
mem_rsp_ready
}, {
mem_req_fire,
mem_rsp_fire
}, {
mem_req_rw,
full_addr,
mem_req_byteen,
mem_req_data,
execute_if.data.uuid,
rsp_data,
rsp_uuid
},
reset_negedge, 1'b0, 4096
);
`else
`SCOPE_IO_UNUSED()
`SCOPE_IO_UNUSED(0)
`endif
`endif
`ifdef CHIPSCOPE
`ifdef DBG_SCOPE_LSU
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({execute_if.valid, execute_if.data, execute_if.ready}),
.probe1 ({lsu_mem_if.req_valid, lsu_mem_if.req_data, lsu_mem_if.req_ready}),
.probe2 ({lsu_mem_if.rsp_valid, lsu_mem_if.rsp_data, lsu_mem_if.rsp_ready})
);
`endif
`endif
endmodule

View file

@ -31,9 +31,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES;
`ifdef SCOPE
`SCOPE_IO_SWITCH (BLOCK_SIZE);
`endif
VX_execute_if #(
.NUM_LANES (NUM_LANES)
@ -42,7 +40,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (1)
.OUT_BUF (3)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -54,16 +52,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : lsu_slices
`RESET_RELAY (slice_reset, reset);
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin : g_slices
VX_lsu_slice #(
.INSTANCE_ID ($sformatf("%s%0d", INSTANCE_ID, block_idx))
.INSTANCE_ID (`SFORMATF(("%s%0d", INSTANCE_ID, block_idx)))
) lsu_slice(
`SCOPE_IO_BIND (block_idx)
.clk (clk),
.reset (slice_reset),
.reset (reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.lsu_mem_if (lsu_mem_if[block_idx])

260
hw/rtl/core/VX_mem_unit.sv Normal file
View file

@ -0,0 +1,260 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_mem_unit
//
// Connects the per-block LSU memory interfaces (lsu_mem_if) to the data-cache
// request ports (dcache_bus_if). The path visible in this module is:
//
//   lsu_mem_if[i]
//     -> (LMEM_ENABLE only) VX_lmem_switch: splits into lsu_dcache_if[i] and lsu_lmem_if[i]
//          lsu_lmem_if[*] -> VX_lsu_mem_arb -> VX_lsu_adapter -> VX_local_mem
//     -> lsu_dcache_if[i]
//     -> VX_mem_coalescer (or a pass-through assignment) -> dcache_coalesced_if[i]
//     -> VX_lsu_adapter -> dcache_bus_if[i * DCACHE_CHANNELS + j]
//
// Parameters:
//   INSTANCE_ID - string prefix used to build the INSTANCE_ID of the
//                 local-memory and coalescer sub-instances.
//
// Ports:
//   clk, reset      - forwarded unchanged to every sub-instance.
//   lmem_perf       - (PERF_ENABLE) driven by VX_local_mem, or tied to '0 when
//                     LMEM_ENABLE is not defined.
//   coalescer_perf  - (PERF_ENABLE) its .misses field is fed from the sum of the
//                     per-block coalescer miss counters.
//   lsu_mem_if      - one slave-side LSU memory interface per LSU block.
//   dcache_bus_if   - DCACHE_NUM_REQS master-side memory bus ports.
module VX_mem_unit import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = ""
) (
input wire clk,
input wire reset,
`ifdef PERF_ENABLE
output lmem_perf_t lmem_perf,
output coalescer_perf_t coalescer_perf,
`endif
VX_lsu_mem_if.slave lsu_mem_if [`NUM_LSU_BLOCKS],
VX_mem_bus_if.master dcache_bus_if [DCACHE_NUM_REQS]
);
// Per-block LSU traffic headed for the data cache (after the optional
// local-memory switch, before the optional coalescer).
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_dcache_if[`NUM_LSU_BLOCKS]();
`ifdef LMEM_ENABLE
// Elaboration-time checks on the local-memory size and base address macros.
`STATIC_ASSERT(`IS_DIVISBLE((1 << `LMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`LMEM_BASE_ADDR % (1 << `LMEM_LOG_SIZE)), ("invalid parameter"))
// Local-memory address width expressed in LSU words rather than bytes.
localparam LMEM_ADDR_WIDTH = `LMEM_LOG_SIZE - `CLOG2(LSU_WORD_SIZE);
// Per-block LSU traffic headed for the local memory.
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH)
) lsu_lmem_if[`NUM_LSU_BLOCKS]();
// One switch per LSU block: each block's input interface fans out to a
// "global" (dcache) interface and a "local" (lmem) interface.
// NOTE(review): the address-decode rule lives inside VX_lmem_switch and is
// not visible here.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lmem_switches
VX_lmem_switch #(
.REQ0_OUT_BUF (1),
.REQ1_OUT_BUF (0),
.RSP_OUT_BUF (1),
.ARBITER ("P")
) lmem_switch (
.clk (clk),
.reset (reset),
.lsu_in_if (lsu_mem_if[i]),
.global_out_if(lsu_dcache_if[i]),
.local_out_if (lsu_lmem_if[i])
);
end
// Single arbitrated local-memory interface. Its tag is LMEM_TAG_WIDTH wide
// while the arbiter inputs use LSU_TAG_WIDTH.
// NOTE(review): presumably the arbiter widens the tag with the selected
// input index - confirm against VX_lsu_mem_arb / VX_gpu_pkg.
VX_lsu_mem_if #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_arb_if[1]();
// Arbitrate all blocks' local-memory interfaces down to one.
VX_lsu_mem_arb #(
.NUM_INPUTS (`NUM_LSU_BLOCKS),
.NUM_OUTPUTS(1),
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LSU_TAG_WIDTH),
.TAG_SEL_IDX(0),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(2)
) lmem_arb (
.clk (clk),
.reset (reset),
.bus_in_if (lsu_lmem_if),
.bus_out_if (lmem_arb_if)
);
// Per-lane memory bus ports feeding the local memory.
VX_mem_bus_if #(
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH)
) lmem_adapt_if[`NUM_LSU_LANES]();
// Convert the multi-lane LSU interface into NUM_LSU_LANES individual
// memory-bus interfaces.
VX_lsu_adapter #(
.NUM_LANES (`NUM_LSU_LANES),
.DATA_SIZE (LSU_WORD_SIZE),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (3),
.RSP_OUT_BUF (0)
) lmem_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (lmem_arb_if[0]),
.mem_bus_if (lmem_adapt_if)
);
// The local memory itself: (1 << LMEM_LOG_SIZE) in size, one request port
// per LSU lane.
VX_local_mem #(
.INSTANCE_ID(`SFORMATF(("%s-lmem", INSTANCE_ID))),
.SIZE (1 << `LMEM_LOG_SIZE),
.NUM_REQS (`NUM_LSU_LANES),
.NUM_BANKS (`LMEM_NUM_BANKS),
.WORD_SIZE (LSU_WORD_SIZE),
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (LMEM_TAG_WIDTH),
.OUT_BUF (3)
) local_mem (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.lmem_perf (lmem_perf),
`endif
.mem_bus_if (lmem_adapt_if)
);
`else
// No local memory: tie off its perf output and connect every LSU block
// straight to its dcache-bound interface.
`ifdef PERF_ENABLE
assign lmem_perf = '0;
`endif
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_dcache_if
`ASSIGN_VX_MEM_BUS_IF (lsu_dcache_if[i], lsu_mem_if[i]);
end
`endif
// Per-block interface on the dcache side of the coalescer stage, using
// dcache lane count, word size and tag width.
VX_lsu_mem_if #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
`ifdef PERF_ENABLE
// Sum the per-block coalescer miss counters into a single value for
// coalescer_perf.misses.
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
VX_reduce_tree #(
.DATAW_IN (`PERF_CTR_BITS),
.DATAW_OUT (`PERF_CTR_BITS),
.N (`NUM_LSU_BLOCKS),
.OP ("+")
) coalescer_reduce (
.data_in (per_block_coalescer_misses),
.data_out (coalescer_misses)
);
// NOTE(review): `BUFFER presumably registers the value before it reaches the
// output - confirm against the macro definition.
`BUFFER(coalescer_perf.misses, coalescer_misses);
`endif
// A coalescer is instantiated only when there is more than one LSU lane and
// the LSU word size differs from the dcache word size; otherwise the
// interfaces are connected directly.
if ((`NUM_LSU_LANES > 1) && (LSU_WORD_SIZE != DCACHE_WORD_SIZE)) begin : g_enabled
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_coalescers
VX_mem_coalescer #(
.INSTANCE_ID (`SFORMATF(("%s-coalescer%0d", INSTANCE_ID, i))),
.NUM_REQS (`NUM_LSU_LANES),
.DATA_IN_SIZE (LSU_WORD_SIZE),
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (LSU_ADDR_WIDTH),
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
.TAG_WIDTH (LSU_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.QUEUE_SIZE (`LSUQ_OUT_SIZE),
.PERF_CTR_BITS (`PERF_CTR_BITS)
) mem_coalescer (
.clk (clk),
.reset (reset),
`ifdef PERF_ENABLE
.misses (per_block_coalescer_misses[i]),
`else
`UNUSED_PIN (misses),
`endif
// Input request
.in_req_valid (lsu_dcache_if[i].req_valid),
.in_req_mask (lsu_dcache_if[i].req_data.mask),
.in_req_rw (lsu_dcache_if[i].req_data.rw),
.in_req_byteen (lsu_dcache_if[i].req_data.byteen),
.in_req_addr (lsu_dcache_if[i].req_data.addr),
.in_req_flags (lsu_dcache_if[i].req_data.flags),
.in_req_data (lsu_dcache_if[i].req_data.data),
.in_req_tag (lsu_dcache_if[i].req_data.tag),
.in_req_ready (lsu_dcache_if[i].req_ready),
// Input response
.in_rsp_valid (lsu_dcache_if[i].rsp_valid),
.in_rsp_mask (lsu_dcache_if[i].rsp_data.mask),
.in_rsp_data (lsu_dcache_if[i].rsp_data.data),
.in_rsp_tag (lsu_dcache_if[i].rsp_data.tag),
.in_rsp_ready (lsu_dcache_if[i].rsp_ready),
// Output request
.out_req_valid (dcache_coalesced_if[i].req_valid),
.out_req_mask (dcache_coalesced_if[i].req_data.mask),
.out_req_rw (dcache_coalesced_if[i].req_data.rw),
.out_req_byteen (dcache_coalesced_if[i].req_data.byteen),
.out_req_addr (dcache_coalesced_if[i].req_data.addr),
.out_req_flags (dcache_coalesced_if[i].req_data.flags),
.out_req_data (dcache_coalesced_if[i].req_data.data),
.out_req_tag (dcache_coalesced_if[i].req_data.tag),
.out_req_ready (dcache_coalesced_if[i].req_ready),
// Output response
.out_rsp_valid (dcache_coalesced_if[i].rsp_valid),
.out_rsp_mask (dcache_coalesced_if[i].rsp_data.mask),
.out_rsp_data (dcache_coalesced_if[i].rsp_data.data),
.out_rsp_tag (dcache_coalesced_if[i].rsp_data.tag),
.out_rsp_ready (dcache_coalesced_if[i].rsp_ready)
);
end
end else begin : g_passthru
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_coalesced_if
`ASSIGN_VX_MEM_BUS_IF (dcache_coalesced_if[i], lsu_dcache_if[i]);
`ifdef PERF_ENABLE
// No coalescer in this configuration, so the miss counter input is zero.
assign per_block_coalescer_misses[i] = '0;
`endif
end
end
// Split each block's multi-lane interface into DCACHE_CHANNELS memory-bus
// ports and map them onto the flat dcache_bus_if array.
for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_dcache_adapters
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_CHANNELS]();
VX_lsu_adapter #(
.NUM_LANES (DCACHE_CHANNELS),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (0),
.RSP_OUT_BUF (0)
) dcache_adapter (
.clk (clk),
.reset (reset),
.lsu_mem_if (dcache_coalesced_if[i]),
.mem_bus_if (dcache_bus_tmp_if)
);
// Block i, channel j lands on flat port index i * DCACHE_CHANNELS + j.
for (genvar j = 0; j < DCACHE_CHANNELS; ++j) begin : g_dcache_bus_if
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i * DCACHE_CHANNELS + j], dcache_bus_tmp_if[j]);
end
end
endmodule

View file

@ -0,0 +1,127 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_mem_unit_top
//
// Flat-port wrapper around VX_mem_unit. Every interface port of VX_mem_unit is
// exposed here as plain packed vectors so the unit can be instantiated without
// SystemVerilog interfaces. The wrapper only unpacks/repacks signals; it adds
// no logic of its own.
//
// Parameters:
//   INSTANCE_ID    - forwarded unchanged to VX_mem_unit.
//   LSU_WORD_WIDTH - LSU data word width in bits (LSU_WORD_SIZE * 8).
//
// Ports:
//   lsu_req_* / lsu_rsp_* - one entry per LSU block, mapped onto lsu_mem_if[i].
//   mem_req_* / mem_rsp_* - one entry per dcache request port, mapped onto
//                           mem_bus_if[i].
module VX_mem_unit_top import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = "",
    parameter LSU_WORD_WIDTH = LSU_WORD_SIZE * 8
) (
    // Clock
    input wire clk,
    input wire reset,

    // LSU memory request
    input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_valid,
    input wire [`NUM_LSU_BLOCKS-1:0] lsu_req_rw,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_req_mask,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_req_byteen,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_req_addr,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_req_flags,
    input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_req_data,
    input wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_req_tag,
    output wire [`NUM_LSU_BLOCKS-1:0] lsu_req_ready,

    // LSU memory response
    output wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_valid,
    output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_rsp_mask,
    output wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_rsp_data,
    output wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_rsp_tag,
    input wire [`NUM_LSU_BLOCKS-1:0] lsu_rsp_ready,

    // Memory request
    output wire [DCACHE_NUM_REQS-1:0] mem_req_valid,
    output wire [DCACHE_NUM_REQS-1:0] mem_req_rw,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] mem_req_addr,
    output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_req_data,
    output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_req_tag,
    input wire [DCACHE_NUM_REQS-1:0] mem_req_ready,

    // Memory response
    input wire [DCACHE_NUM_REQS-1:0] mem_rsp_valid,
    input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_rsp_data,
    input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_rsp_tag,
    output wire [DCACHE_NUM_REQS-1:0] mem_rsp_ready
);
    // Interface array handed to VX_mem_unit on the LSU side.
    VX_lsu_mem_if #(
        .NUM_LANES (`NUM_LSU_LANES),
        .DATA_SIZE (LSU_WORD_SIZE),
        .TAG_WIDTH (LSU_TAG_WIDTH)
    ) lsu_mem_if[`NUM_LSU_BLOCKS]();

    // LSU memory request
    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_mem_req
        assign lsu_mem_if[i].req_valid = lsu_req_valid[i];
        assign lsu_mem_if[i].req_data.rw = lsu_req_rw[i];
        assign lsu_mem_if[i].req_data.mask = lsu_req_mask[i];
        assign lsu_mem_if[i].req_data.byteen = lsu_req_byteen[i];
        assign lsu_mem_if[i].req_data.addr = lsu_req_addr[i];
        assign lsu_mem_if[i].req_data.flags = lsu_req_flags[i];
        assign lsu_mem_if[i].req_data.data = lsu_req_data[i];
        assign lsu_mem_if[i].req_data.tag = lsu_req_tag[i];
        assign lsu_req_ready[i] = lsu_mem_if[i].req_ready;
    end

    // LSU memory response
    for (genvar i = 0; i < `NUM_LSU_BLOCKS; ++i) begin : g_lsu_rsp
        assign lsu_rsp_valid[i] = lsu_mem_if[i].rsp_valid;
        assign lsu_rsp_mask[i] = lsu_mem_if[i].rsp_data.mask;
        assign lsu_rsp_data[i] = lsu_mem_if[i].rsp_data.data;
        assign lsu_rsp_tag[i] = lsu_mem_if[i].rsp_data.tag;
        assign lsu_mem_if[i].rsp_ready = lsu_rsp_ready[i];
    end

    // Interface array handed to VX_mem_unit on the dcache side.
    VX_mem_bus_if #(
        .DATA_SIZE (DCACHE_WORD_SIZE),
        .TAG_WIDTH (DCACHE_TAG_WIDTH)
    ) mem_bus_if[DCACHE_NUM_REQS]();

    // memory request
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_req
        assign mem_req_valid[i] = mem_bus_if[i].req_valid;
        assign mem_req_rw[i] = mem_bus_if[i].req_data.rw;
        assign mem_req_byteen[i] = mem_bus_if[i].req_data.byteen;
        assign mem_req_addr[i] = mem_bus_if[i].req_data.addr;
        assign mem_req_flags[i] = mem_bus_if[i].req_data.flags;
        assign mem_req_data[i] = mem_bus_if[i].req_data.data;
        assign mem_req_tag[i] = mem_bus_if[i].req_data.tag;
        assign mem_bus_if[i].req_ready = mem_req_ready[i];
    end

    // memory response
    for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin : g_mem_bus_rsp
        assign mem_bus_if[i].rsp_valid = mem_rsp_valid[i];
        assign mem_bus_if[i].rsp_data.tag = mem_rsp_tag[i];
        assign mem_bus_if[i].rsp_data.data = mem_rsp_data[i];
        assign mem_rsp_ready[i] = mem_bus_if[i].rsp_ready;
    end

`ifdef PERF_ENABLE
    // Sinks for VX_mem_unit's performance outputs. They are declared with the
    // exact port types (lmem_perf_t / coalescer_perf_t) and without an
    // initializer: VX_mem_unit is the only driver. This wrapper does not
    // export the counters, so both are explicitly marked unused.
    lmem_perf_t      lmem_perf;
    coalescer_perf_t coalescer_perf;
    `UNUSED_VAR (lmem_perf)
    `UNUSED_VAR (coalescer_perf)
`endif

    VX_mem_unit #(
        .INSTANCE_ID (INSTANCE_ID)
    ) mem_unit (
        .clk            (clk),
        .reset          (reset),
    `ifdef PERF_ENABLE
        .lmem_perf      (lmem_perf),
        .coalescer_perf (coalescer_perf),
    `endif
        .lsu_mem_if     (lsu_mem_if),
        .dcache_bus_if  (mem_bus_if)
    );

endmodule

View file

@ -23,7 +23,7 @@
module VX_operands import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_BANKS = 4,
parameter OUT_BUF = 4 // using 2-cycle EB for area reduction
parameter OUT_BUF = 3
) (
input wire clk,
input wire reset,
@ -37,15 +37,15 @@ module VX_operands import VX_gpu_pkg::*; #(
VX_operands_if.master operands_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_REGS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_REGS);
localparam NUM_SRC_OPDS = 3;
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS);
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
localparam DATAW = META_DATAW + NUM_SRC_REGS * REGS_DATAW;
localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW;
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
localparam XLEN_SIZE = `XLEN / 8;
@ -53,87 +53,80 @@ module VX_operands import VX_gpu_pkg::*; #(
`UNUSED_VAR (writeback_if.data.sop)
wire [NUM_SRC_REGS-1:0] src_valid;
wire [NUM_SRC_REGS-1:0] req_in_valid, req_in_ready;
wire [NUM_SRC_REGS-1:0][PER_BANK_ADDRW-1:0] req_in_data;
wire [NUM_SRC_REGS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_SRC_OPDS-1:0] src_valid;
wire [NUM_SRC_OPDS-1:0] req_valid_in, req_ready_in;
wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_data_in;
wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st1, gpr_rd_data_st2;
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st2;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
wire pipe_ready_in;
wire pipe_valid_st1, pipe_ready_st1;
wire pipe_valid_st2, pipe_ready_st2;
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
reg [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_n;
wire [NUM_SRC_REGS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] src_data_st1, src_data_st2;
reg [NUM_SRC_OPDS-1:0][(`NUM_THREADS * `XLEN)-1:0] src_data_st2, src_data_m_st2;
reg [NUM_SRC_REGS-1:0] data_fetched_n;
wire [NUM_SRC_REGS-1:0] data_fetched_st1;
reg [NUM_SRC_OPDS-1:0] data_fetched_st1;
reg has_collision_n;
wire has_collision_st1;
wire [NUM_SRC_REGS-1:0][`NR_BITS-1:0] src_regs = {scoreboard_if.data.rs3,
scoreboard_if.data.rs2,
scoreboard_if.data.rs1};
wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds;
assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, scoreboard_if.data.rs1};
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
if (ISSUE_WIS != 0) begin
assign req_in_data[i] = {src_regs[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin
assign req_in_data[i] = src_regs[i][`NR_BITS-1:BANK_SEL_BITS];
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_data_in
if (ISSUE_WIS != 0) begin : g_wis
assign req_data_in[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
end else begin : g_no_wis
assign req_data_in[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS];
end
if (NUM_BANKS != 1) begin
assign req_bank_idx[i] = src_regs[i][BANK_SEL_BITS-1:0];
end else begin
end
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_bank_idx
if (NUM_BANKS != 1) begin : g_multibanks
assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0];
end else begin : g_singlebank
assign req_bank_idx[i] = '0;
end
end
for (genvar i = 0; i < NUM_SRC_REGS; ++i) begin
assign src_valid[i] = (src_regs[i] != 0) && ~data_fetched_st1[i];
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_src_valid
assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i];
end
assign req_in_valid = {NUM_SRC_REGS{scoreboard_if.valid}} & src_valid;
assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
VX_stream_xbar #(
.NUM_INPUTS (NUM_SRC_REGS),
.NUM_INPUTS (NUM_SRC_OPDS),
.NUM_OUTPUTS (NUM_BANKS),
.DATAW (PER_BANK_ADDRW),
.ARBITER ("P"), // use priority arbiter
.PERF_CTR_BITS(`PERF_CTR_BITS),
.OUT_BUF (0) // no output buffering
) req_xbar (
.clk (clk),
.reset (reset),
`UNUSED_PIN(collisions),
.valid_in (req_in_valid),
.data_in (req_in_data),
.valid_in (req_valid_in),
.data_in (req_data_in),
.sel_in (req_bank_idx),
.ready_in (req_in_ready),
.ready_in (req_ready_in),
.valid_out (gpr_rd_valid),
.data_out (gpr_rd_addr),
.sel_out (gpr_rd_req_idx),
.ready_out (gpr_rd_ready)
);
wire pipe_in_ready = pipe_ready_st1 || ~pipe_valid_st1;
assign gpr_rd_ready = {NUM_BANKS{pipe_in_ready}};
assign scoreboard_if.ready = pipe_in_ready && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
assign gpr_rd_ready = {NUM_BANKS{pipe_ready_in}};
always @(*) begin
has_collision_n = 0;
for (integer i = 0; i < NUM_SRC_REGS; ++i) begin
for (integer j = 1; j < (NUM_SRC_REGS-i); ++j) begin
for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin
for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin
has_collision_n |= src_valid[i]
&& src_valid[j+i]
&& (req_bank_idx[i] == req_bank_idx[j+i]);
@ -141,14 +134,7 @@ module VX_operands import VX_gpu_pkg::*; #(
end
end
always @(*) begin
data_fetched_n = data_fetched_st1;
if (scoreboard_if.ready) begin
data_fetched_n = '0;
end else begin
data_fetched_n = data_fetched_st1 | req_in_ready;
end
end
wire [NUM_SRC_OPDS-1:0] req_fire_in = req_valid_in & req_ready_in;
assign pipe_data = {
scoreboard_if.data.wis,
@ -162,61 +148,74 @@ module VX_operands import VX_gpu_pkg::*; #(
scoreboard_if.data.uuid
};
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS + NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH)),
.RESETW (1 + NUM_SRC_REGS)
assign scoreboard_if.ready = pipe_ready_in && ~has_collision_n;
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
VX_pipe_buffer #(
.DATAW (NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH))
) pipe_reg1 (
.clk (clk),
.reset (reset),
.enable (pipe_in_ready),
.data_in ({scoreboard_if.valid, data_fetched_n, gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({pipe_valid_st1, data_fetched_st1, gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1})
.valid_in (scoreboard_if.valid),
.ready_in (pipe_ready_in),
.data_in ({gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
.data_out ({gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}),
.valid_out(pipe_valid_st1),
.ready_out(pipe_ready_st1)
);
assign pipe_ready_st1 = pipe_ready_st2 || ~pipe_valid_st2;
assign src_data_st1 = pipe_fire_st2 ? '0 : src_data_n;
always @(posedge clk) begin
if (reset || scoreboard_if.ready) begin
data_fetched_st1 <= 0;
end else begin
data_fetched_st1 <= data_fetched_st1 | req_fire_in;
end
end
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
`RESET_RELAY (pipe2_reset, reset); // needed for pipe_reg2's wide RESETW
VX_pipe_register #(
.DATAW (1 + NUM_SRC_REGS * REGS_DATAW + NUM_BANKS + NUM_BANKS * REGS_DATAW + META_DATAW + NUM_BANKS * REQ_SEL_WIDTH),
.RESETW (1 + NUM_SRC_REGS * REGS_DATAW)
VX_pipe_buffer #(
.DATAW (NUM_BANKS * (1 + REQ_SEL_WIDTH) + META_DATAW)
) pipe_reg2 (
.clk (clk),
.reset (pipe2_reset),
.enable (pipe_ready_st1),
.data_in ({pipe_valid2_st1, src_data_st1, gpr_rd_valid_st1, gpr_rd_data_st1, pipe_data_st1, gpr_rd_req_idx_st1}),
.data_out ({pipe_valid_st2, src_data_st2, gpr_rd_valid_st2, gpr_rd_data_st2, pipe_data_st2, gpr_rd_req_idx_st2})
.reset (reset),
.valid_in (pipe_valid2_st1),
.ready_in (pipe_ready_st1),
.data_in ({gpr_rd_valid_st1, gpr_rd_req_idx_st1, pipe_data_st1}),
.data_out ({gpr_rd_valid_st2, gpr_rd_req_idx_st2, pipe_data_st2}),
.valid_out(pipe_valid_st2),
.ready_out(pipe_ready_st2)
);
always @(*) begin
src_data_n = src_data_st2;
src_data_m_st2 = src_data_st2;
for (integer b = 0; b < NUM_BANKS; ++b) begin
if (gpr_rd_valid_st2[b]) begin
src_data_n[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
src_data_m_st2[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
end
end
end
always @(posedge clk) begin
if (reset || pipe_fire_st2) begin
src_data_st2 <= 0;
end else begin
src_data_st2 <= src_data_m_st2;
end
end
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF)),
.LUTRAM (1)
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (pipe_valid_st2),
.ready_in (pipe_ready_st2),
.data_in ({
pipe_data_st2,
src_data_n[0],
src_data_n[1],
src_data_n[2]
}),
.data_in ({pipe_data_st2, src_data_m_st2}),
.data_out ({
operands_if.data.wis,
operands_if.data.tmask,
@ -227,51 +226,39 @@ module VX_operands import VX_gpu_pkg::*; #(
operands_if.data.op_args,
operands_if.data.rd,
operands_if.data.uuid,
operands_if.data.rs1_data,
operands_if.data.rs3_data,
operands_if.data.rs2_data,
operands_if.data.rs3_data
operands_if.data.rs1_data
}),
.valid_out (operands_if.valid),
.ready_out (operands_if.ready)
);
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
if (ISSUE_WIS != 0) begin
if (ISSUE_WIS != 0) begin : g_gpr_wr_addr
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
end else begin
end else begin : g_gpr_wr_addr_no_wis
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
end
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
if (NUM_BANKS != 1) begin
if (NUM_BANKS != 1) begin : g_gpr_wr_bank_idx
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
end else begin
end else begin : g_gpr_wr_bank_idx_0
assign gpr_wr_bank_idx = '0;
end
`ifdef GPR_RESET
reg wr_enabled = 0;
always @(posedge clk) begin
if (reset) begin
wr_enabled <= 1;
end
end
`else
wire wr_enabled = 1;
`endif
for (genvar b = 0; b < NUM_BANKS; ++b) begin
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rams
wire gpr_wr_enabled;
if (BANK_SEL_BITS != 0) begin
assign gpr_wr_enabled = wr_enabled
&& writeback_if.valid
if (BANK_SEL_BITS != 0) begin : g_gpr_wr_enabled_multibanks
assign gpr_wr_enabled = writeback_if.valid
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
end else begin
assign gpr_wr_enabled = wr_enabled && writeback_if.valid;
end else begin : g_gpr_wr_enabled
assign gpr_wr_enabled = writeback_if.valid;
end
wire [BYTEENW-1:0] wren;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_wren
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
end
@ -282,7 +269,8 @@ module VX_operands import VX_gpu_pkg::*; #(
`ifdef GPR_RESET
.RESET_RAM (1),
`endif
.NO_RWCHECK (1)
.OUT_REG (1),
.RDW_MODE ("R")
) gpr_ram (
.clk (clk),
.reset (reset),
@ -292,7 +280,7 @@ module VX_operands import VX_gpu_pkg::*; #(
.waddr (gpr_wr_addr),
.wdata (writeback_if.data.data),
.raddr (gpr_rd_addr_st1[b]),
.rdata (gpr_rd_data_st1[b])
.rdata (gpr_rd_data_st2[b])
);
end
@ -302,7 +290,7 @@ module VX_operands import VX_gpu_pkg::*; #(
if (reset) begin
collisions_r <= '0;
end else begin
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_in_ready && has_collision_n);
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_ready_in && has_collision_n);
end
end
assign perf_stalls = collisions_r;

View file

@ -0,0 +1,93 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// VX_pe_switch
//
// Fans a single execute stream out to PE_COUNT processing elements and merges
// their commit streams back into one.
//
//   request path  : execute_in_if -> VX_stream_switch (steered by pe_sel) -> execute_out_if[*]
//   response path : commit_in_if[*] -> VX_stream_arb -> commit_out_if
//
// Parameters:
//   PE_COUNT     - number of processing elements served.
//   NUM_LANES    - lane count used to size the request/response payloads.
//   REQ_OUT_BUF  - OUT_BUF setting passed to the request switch.
//   RSP_OUT_BUF  - OUT_BUF setting passed to the response arbiter.
//   ARBITER      - arbitration policy string passed to the response arbiter.
//   PE_SEL_BITS  - width of the PE selector, derived from PE_COUNT.
module VX_pe_switch import VX_gpu_pkg::*; #(
    parameter PE_COUNT        = 0,
    parameter NUM_LANES       = 0,
    parameter REQ_OUT_BUF     = 0,
    parameter RSP_OUT_BUF     = 0,
    parameter `STRING ARBITER = "R",
    parameter PE_SEL_BITS     = `CLOG2(PE_COUNT)
) (
    input wire                        clk,
    input wire                        reset,
    input wire [`UP(PE_SEL_BITS)-1:0] pe_sel,
    VX_execute_if.slave               execute_in_if,
    VX_commit_if.master               commit_out_if,
    VX_execute_if.master              execute_out_if[PE_COUNT],
    VX_commit_if.slave                commit_in_if[PE_COUNT]
);
    // Packet-id width: number of NUM_LANES-wide slices per NUM_THREADS.
    localparam PID_BITS  = `CLOG2(`NUM_THREADS / NUM_LANES);
    localparam PID_WIDTH = `UP(PID_BITS);

    // Flattened payload widths of the execute (request) and commit (response)
    // data structs, spelled out field by field.
    localparam REQ_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `INST_ALU_BITS + $bits(op_args_t) + 1 + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
    localparam RSP_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;

    // ------------------------------------------------------------------
    // Request path
    // ------------------------------------------------------------------

    wire [PE_COUNT-1:0]                pe_req_valid;
    wire [PE_COUNT-1:0][REQ_DATAW-1:0] pe_req_data;
    wire [PE_COUNT-1:0]                pe_req_ready;

    VX_stream_switch #(
        .NUM_INPUTS  (1),
        .NUM_OUTPUTS (PE_COUNT),
        .DATAW       (REQ_DATAW),
        .OUT_BUF     (REQ_OUT_BUF)
    ) req_switch (
        .clk       (clk),
        .reset     (reset),
        .sel_in    (pe_sel),
        .valid_in  (execute_in_if.valid),
        .data_in   (execute_in_if.data),
        .ready_in  (execute_in_if.ready),
        .valid_out (pe_req_valid),
        .data_out  (pe_req_data),
        .ready_out (pe_req_ready)
    );

    // Unpack the switch's packed outputs onto the per-PE execute interfaces.
    for (genvar pe = 0; pe < PE_COUNT; ++pe) begin : g_execute_out_if
        assign pe_req_ready[pe]         = execute_out_if[pe].ready;
        assign execute_out_if[pe].valid = pe_req_valid[pe];
        assign execute_out_if[pe].data  = pe_req_data[pe];
    end

    // ------------------------------------------------------------------
    // Response path
    // ------------------------------------------------------------------

    wire [PE_COUNT-1:0]                pe_rsp_valid;
    wire [PE_COUNT-1:0][RSP_DATAW-1:0] pe_rsp_data;
    wire [PE_COUNT-1:0]                pe_rsp_ready;

    // Pack the per-PE commit interfaces into vectors for the arbiter.
    for (genvar pe = 0; pe < PE_COUNT; ++pe) begin : g_commit_in_if
        assign commit_in_if[pe].ready = pe_rsp_ready[pe];
        assign pe_rsp_valid[pe]       = commit_in_if[pe].valid;
        assign pe_rsp_data[pe]        = commit_in_if[pe].data;
    end

    VX_stream_arb #(
        .NUM_INPUTS (PE_COUNT),
        .DATAW      (RSP_DATAW),
        .OUT_BUF    (RSP_OUT_BUF),
        .ARBITER    (ARBITER)
    ) rsp_arb (
        .clk       (clk),
        .reset     (reset),
        .valid_in  (pe_rsp_valid),
        .data_in   (pe_rsp_data),
        .ready_in  (pe_rsp_ready),
        .valid_out (commit_out_if.valid),
        .data_out  (commit_out_if.data),
        .ready_out (commit_out_if.ready),
        `UNUSED_PIN (sel_out)
    );

endmodule

View file

@ -68,8 +68,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
reg [`PERF_CTR_BITS-1:0] cycles;
reg [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] issued_instrs;
wire schedule_fire = schedule_valid && schedule_ready;
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
@ -78,7 +76,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
wire [`NUM_ALU_BLOCKS-1:0][`PC_BITS-1:0] branch_dest;
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin : g_branch_init
assign branch_valid[i] = branch_ctl_if[i].valid;
assign branch_wid[i] = branch_ctl_if[i].wid;
assign branch_taken[i] = branch_ctl_if[i].taken;
@ -113,6 +111,16 @@ module VX_schedule import VX_gpu_pkg::*; #(
barrier_stalls_n= barrier_stalls;
warp_pcs_n = warp_pcs;
// decode unlock
if (decode_sched_if.valid && decode_sched_if.unlock) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// wspawn handling
if (wspawn.valid && is_single_warp) begin
active_warps_n |= wspawn.wmask;
@ -170,10 +178,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
end
`ifdef GBAR_ENABLE
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_data.id)) begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
barrier_masks_n[gbar_bus_if.rsp_data.id] = '0; // reset barrier mask
stalled_warps_n = '0; // unlock all warps
end
`endif
@ -188,16 +197,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
end
// decode unlock
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
stalled_warps_n[decode_sched_if.wid] = 0;
end
// CSR unlock
if (sched_csr_if.unlock_warp) begin
stalled_warps_n[sched_csr_if.unlock_wid] = 0;
end
// stall the warp until decode stage
if (schedule_fire) begin
stalled_warps_n[schedule_wid] = 1;
@ -223,7 +222,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
active_warps <= '0;
thread_masks <= '0;
barrier_stalls <= '0;
issued_instrs <= '0;
cycles <= '0;
wspawn.valid <= 0;
@ -268,10 +266,6 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
`endif
if (schedule_if_fire) begin
issued_instrs[schedule_if.data.wid] <= issued_instrs[schedule_if.data.wid] + `UUID_WIDTH'(1);
end
if (busy) begin
cycles <= cycles + 1;
end
@ -281,21 +275,19 @@ module VX_schedule import VX_gpu_pkg::*; #(
// barrier handling
`ifdef GBAR_ENABLE
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_id = gbar_req_id;
assign gbar_bus_if.req_size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
assign gbar_bus_if.req_valid = gbar_req_valid;
assign gbar_bus_if.req_data.id = gbar_req_id;
assign gbar_bus_if.req_data.size_m1 = gbar_req_size_m1;
assign gbar_bus_if.req_data.core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
`endif
// split/join handling
`RESET_RELAY (split_join_reset, reset);
VX_split_join #(
.INSTANCE_ID ($sformatf("%s-splitjoin", INSTANCE_ID))
.INSTANCE_ID (`SFORMATF(("%s-splitjoin", INSTANCE_ID)))
) split_join (
.clk (clk),
.reset (split_join_reset),
.reset (reset),
.valid (warp_ctl_if.valid),
.wid (warp_ctl_if.wid),
.split (warp_ctl_if.split),
@ -324,7 +316,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
);
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `PC_BITS)-1:0] schedule_data;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_schedule_data
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
end
@ -333,67 +325,50 @@ module VX_schedule import VX_gpu_pkg::*; #(
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-5:0]
};
`ifndef NDEBUG
localparam GNW_WIDTH = `LOG2UP(`NUM_CLUSTERS * `NUM_CORES * `NUM_WARPS);
reg [`UUID_WIDTH-1:0] instr_uuid;
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(schedule_wid);
`ifdef SV_DPI
always @(posedge clk) begin
if (reset) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(1, 32'd0));
end else if (schedule_fire) begin
instr_uuid <= `UUID_WIDTH'(dpi_uuid_gen(0, 32'(g_wid)));
end
end
wire [`UUID_WIDTH-1:0] instr_uuid;
`ifdef UUID_ENABLE
VX_uuid_gen #(
.CORE_ID (CORE_ID),
.UUID_WIDTH (`UUID_WIDTH)
) uuid_gen (
.clk (clk),
.reset (reset),
.incr (schedule_fire),
.wid (schedule_wid),
.uuid (instr_uuid)
);
`else
wire [GNW_WIDTH+16-1:0] w_uuid = {g_wid, 16'(schedule_pc)};
always @(*) begin
instr_uuid = `UUID_WIDTH'(w_uuid);
end
`endif
`else
wire [`UUID_WIDTH-1:0] instr_uuid = '0;
assign instr_uuid = '0;
`endif
VX_elastic_buffer #(
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH)
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH + `UUID_WIDTH),
.SIZE (2), // need to buffer out ready_in
.OUT_REG (1) // should be registered for BRAM acces in fetch unit
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (schedule_valid),
.ready_in (schedule_ready),
.data_in ({schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid}),
.data_in ({schedule_tmask, schedule_pc, schedule_wid, instr_uuid}),
.data_out ({schedule_if.data.tmask, schedule_if.data.PC, schedule_if.data.wid, schedule_if.data.uuid}),
.valid_out (schedule_if.valid),
.ready_out (schedule_if.ready)
);
assign schedule_if.data.uuid = instr_uuid;
// Track pending instructions per warp
reg [`NUM_WARPS-1:0] per_warp_incr;
always @(*) begin
per_warp_incr = 0;
if (schedule_if_fire) begin
per_warp_incr[schedule_if.data.wid] = 1;
end
end
wire [`NUM_WARPS-1:0] pending_warp_empty;
wire [`NUM_WARPS-1:0] pending_warp_alm_empty;
`RESET_RELAY_EX (pending_instr_reset, reset, `NUM_WARPS, `MAX_FANOUT);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_pending_sizes
VX_pending_size #(
.SIZE (4096),
.ALM_EMPTY (1)
) counter (
.clk (clk),
.reset (pending_instr_reset[i]),
.incr (per_warp_incr[i]),
.reset (reset),
.incr (schedule_if_fire && (schedule_if.data.wid == `NW_WIDTH'(i))),
.decr (commit_sched_if.committed_warps[i]),
.empty (pending_warp_empty[i]),
.alm_empty (pending_warp_alm_empty[i]),
@ -407,7 +382,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire no_pending_instr = (& pending_warp_empty);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1);
`BUFFER_EX(busy, (active_warps != 0 || ~no_pending_instr), 1'b1, 1, 1);
// export CSRs
assign sched_csr_if.cycles = cycles;
@ -422,7 +397,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
timeout_ctr <= '0;
timeout_enable <= 0;
end else begin
if (decode_sched_if.valid && ~decode_sched_if.is_wstall) begin
if (decode_sched_if.valid && decode_sched_if.unlock) begin
timeout_enable <= 1;
end
if (timeout_enable && active_warps !=0 && active_warps == stalled_warps) begin

View file

@ -30,6 +30,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
VX_scoreboard_if.master scoreboard_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam NUM_SRC_OPDS = 3;
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + 1;
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
@ -42,7 +44,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (`NUM_EX_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
@ -51,7 +53,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_units_per_cycle)
);
VX_reduce #(
VX_reduce_tree #(
.DATAW_IN (`NUM_SFU_UNITS),
.N (PER_ISSUE_WARPS),
.OP ("|")
@ -60,17 +62,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.data_out (perf_sfu_per_cycle)
);
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_units_per_cycle_r, perf_units_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
`BUFFER_EX(perf_sfu_per_cycle_r, perf_sfu_per_cycle, 1'b1, 0, `CDIV(PER_ISSUE_WARPS, `MAX_FANOUT));
wire [PER_ISSUE_WARPS-1:0] stg_valid_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stg_valid_in
assign stg_valid_in[w] = staging_if[w].valid;
end
wire perf_stall_per_cycle = (|stg_valid_in) && ~(|(stg_valid_in & operands_ready));
always @(posedge clk) begin
always @(posedge clk) begin : g_perf_stalls
if (reset) begin
perf_stalls <= '0;
end else begin
@ -78,7 +80,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_units_uses
always @(posedge clk) begin
if (reset) begin
perf_units_uses[i] <= '0;
@ -88,7 +90,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
end
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_perf_sfu_uses
always @(posedge clk) begin
if (reset) begin
perf_sfu_uses[i] <= '0;
@ -99,10 +101,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end
`endif
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (1)
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_stanging_bufs
VX_pipe_buffer #(
.DATAW (DATAW)
) stanging_buf (
.clk (clk),
.reset (reset),
@ -115,10 +116,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
);
end
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
reg [`NUM_REGS-1:0] inuse_regs;
reg [3:0] operands_busy, operands_busy_n;
reg [NUM_OPDS-1:0] operands_busy, operands_busy_n;
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
@ -128,6 +129,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
&& writeback_if.data.eop;
wire [NUM_OPDS-1:0][`NR_BITS-1:0] ibuf_opds, stg_opds;
assign ibuf_opds = {ibuffer_if[w].data.rs3, ibuffer_if[w].data.rs2, ibuffer_if[w].data.rs1, ibuffer_if[w].data.rd};
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
`ifdef PERF_ENABLE
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
@ -135,86 +140,36 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
always @(*) begin
perf_inuse_units_per_cycle[w] = '0;
perf_inuse_sfu_per_cycle[w] = '0;
if (staging_if[w].valid) begin
if (operands_busy[0]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rd]] = 1;
if (inuse_units[staging_if[w].data.rd] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rd]] = 1;
end
end
if (operands_busy[1]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs1]] = 1;
if (inuse_units[staging_if[w].data.rs1] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs1]] = 1;
end
end
if (operands_busy[2]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs2]] = 1;
if (inuse_units[staging_if[w].data.rs2] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs2]] = 1;
end
end
if (operands_busy[3]) begin
perf_inuse_units_per_cycle[w][inuse_units[staging_if[w].data.rs3]] = 1;
if (inuse_units[staging_if[w].data.rs3] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[staging_if[w].data.rs3]] = 1;
for (integer i = 0; i < NUM_OPDS; ++i) begin
if (staging_if[w].valid && operands_busy[i]) begin
perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1;
if (inuse_units[stg_opds[i]] == `EX_SFU) begin
perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1;
end
end
end
end
`endif
always @(*) begin
operands_busy_n = operands_busy;
if (ibuffer_fire) begin
operands_busy_n = {
inuse_regs[ibuffer_if[w].data.rs3],
inuse_regs[ibuffer_if[w].data.rs2],
inuse_regs[ibuffer_if[w].data.rs1],
inuse_regs[ibuffer_if[w].data.rd]
};
end
if (writeback_fire) begin
for (genvar i = 0; i < NUM_OPDS; ++i) begin : g_operands_busy_n
always @(*) begin
operands_busy_n[i] = operands_busy[i];
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 0;
end
end else begin
if (writeback_if.data.rd == staging_if[w].data.rd) begin
operands_busy_n[0] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs1) begin
operands_busy_n[1] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs2) begin
operands_busy_n[2] = 0;
end
if (writeback_if.data.rd == staging_if[w].data.rs3) begin
operands_busy_n[3] = 0;
operands_busy_n[i] = inuse_regs[ibuf_opds[i]];
if (staging_fire && staging_if[w].data.wb && staging_if[w].data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 1;
end
end
end
if (staging_fire && staging_if[w].data.wb) begin
if (staging_if[w].data.rd == ibuffer_if[w].data.rd) begin
operands_busy_n[0] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs1) begin
operands_busy_n[1] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs2) begin
operands_busy_n[2] = 1;
end
if (staging_if[w].data.rd == ibuffer_if[w].data.rs3) begin
operands_busy_n[3] = 1;
if (writeback_fire) begin
if (ibuffer_fire) begin
if (writeback_if.data.rd == ibuf_opds[i]) begin
operands_busy_n[i] = 0;
end
end else begin
if (writeback_if.data.rd == stg_opds[i]) begin
operands_busy_n[i] = 0;
end
end
end
end
end
@ -230,8 +185,10 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs[staging_if[w].data.rd] <= 1;
end
end
operands_busy <= operands_busy_n;
operands_ready[w] <= ~(| operands_busy_n);
`ifdef PERF_ENABLE
if (staging_fire && staging_if[w].data.wb) begin
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
@ -251,9 +208,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
end else begin
if (staging_if[w].valid && ~staging_if[w].ready) begin
`ifdef DBG_TRACE_PIPELINE
`TRACE(3, ("%d: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
`TRACE(4, ("%t: *** %s-stall: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)\n",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
operands_busy, staging_if[w].data.uuid))
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_fire) begin
@ -265,11 +222,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
operands_busy, staging_if[w].data.uuid));
operands_busy, staging_if[w].data.uuid))
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if.data.rd] != 0,
("%t: *** %s invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid));
$time, INSTANCE_ID, w, {writeback_if.data.PC, 1'b0}, writeback_if.data.tmask, writeback_if.data.rd, writeback_if.data.uuid))
`endif
end
@ -278,23 +235,20 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
wire [PER_ISSUE_WARPS-1:0][DATAW-1:0] arb_data_in;
wire [PER_ISSUE_WARPS-1:0] arb_ready_in;
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_arb_data_in
assign arb_valid_in[w] = staging_if[w].valid && operands_ready[w];
assign arb_data_in[w] = staging_if[w].data;
assign staging_if[w].ready = arb_ready_in[w] && operands_ready[w];
end
`RESET_RELAY (arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (PER_ISSUE_WARPS),
.DATAW (DATAW),
.ARBITER ("F"),
.LUTRAM (1),
.OUT_BUF (4) // using 2-cycle EB for area reduction
.ARBITER ("C"),
.OUT_BUF (3)
) out_arb (
.clk (clk),
.reset (arb_reset),
.reset (reset),
.valid_in (arb_valid_in),
.ready_in (arb_ready_in),
.data_in (arb_data_in),

View file

@ -21,8 +21,8 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
input wire reset,
`ifdef PERF_ENABLE
VX_mem_perf_if.slave mem_perf_if,
VX_pipeline_perf_if.slave pipeline_perf_if,
input sysmem_perf_t sysmem_perf,
input pipeline_perf_t pipeline_perf,
`endif
input base_dcrs_t base_dcrs,
@ -41,24 +41,25 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_warp_ctl_if.master warp_ctl_if
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `PC_BITS + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1;
localparam RSP_ARB_IDX_WCTL = 0;
localparam RSP_ARB_IDX_CSRS = 1;
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PE_COUNT = 2;
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
localparam PE_IDX_WCTL = 0;
localparam PE_IDX_CSRS = 1;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) per_block_execute_if[BLOCK_SIZE]();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) per_block_commit_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (1)
.OUT_BUF (3)
) dispatch_unit (
.clk (clk),
.reset (reset),
@ -66,65 +67,62 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.execute_if (per_block_execute_if)
);
wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in;
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
// Warp control block
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) wctl_execute_if();
) pe_execute_if[PE_COUNT]();
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) wctl_commit_if();
) pe_commit_if[PE_COUNT]();
assign wctl_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_WCTL(per_block_execute_if[0].data.op_type);
assign wctl_execute_if.data = per_block_execute_if[0].data;
reg [PE_SEL_BITS-1:0] pe_select;
always @(*) begin
pe_select = PE_IDX_WCTL;
if (`INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type))
pe_select = PE_IDX_CSRS;
end
`RESET_RELAY (wctl_reset, reset);
VX_pe_switch #(
.PE_COUNT (PE_COUNT),
.NUM_LANES (NUM_LANES),
.ARBITER ("R"),
.REQ_OUT_BUF(0),
.RSP_OUT_BUF(3)
) pe_switch (
.clk (clk),
.reset (reset),
.pe_sel (pe_select),
.execute_in_if (per_block_execute_if[0]),
.commit_out_if (per_block_commit_if[0]),
.execute_out_if (pe_execute_if),
.commit_in_if (pe_commit_if)
);
VX_wctl_unit #(
.INSTANCE_ID ($sformatf("%s-wctl", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-wctl", INSTANCE_ID))),
.NUM_LANES (NUM_LANES)
) wctl_unit (
.clk (clk),
.reset (wctl_reset),
.execute_if (wctl_execute_if),
.reset (reset),
.execute_if (pe_execute_if[PE_IDX_WCTL]),
.warp_ctl_if(warp_ctl_if),
.commit_if (wctl_commit_if)
.commit_if (pe_commit_if[PE_IDX_WCTL])
);
assign rsp_arb_valid_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_WCTL] = wctl_commit_if.data;
assign wctl_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_WCTL];
// CSR unit
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) csr_execute_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) csr_commit_if();
assign csr_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type);
assign csr_execute_if.data = per_block_execute_if[0].data;
`RESET_RELAY (csr_reset, reset);
VX_csr_unit #(
.INSTANCE_ID ($sformatf("%s-csr", INSTANCE_ID)),
.INSTANCE_ID (`SFORMATF(("%s-csr", INSTANCE_ID))),
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) csr_unit (
.clk (clk),
.reset (csr_reset),
.reset (reset),
.base_dcrs (base_dcrs),
.execute_if (csr_execute_if),
.execute_if (pe_execute_if[PE_IDX_CSRS]),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
.sysmem_perf (sysmem_perf),
.pipeline_perf (pipeline_perf),
`endif
`ifdef EXT_F_ENABLE
@ -133,47 +131,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.sched_csr_if (sched_csr_if),
.commit_csr_if (commit_csr_if),
.commit_if (csr_commit_if)
);
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
assign csr_commit_if.ready = rsp_arb_ready_in[RSP_ARB_IDX_CSRS];
// can accept new request?
reg sfu_req_ready;
always @(*) begin
case (per_block_execute_if[0].data.op_type)
`INST_SFU_CSRRW,
`INST_SFU_CSRRS,
`INST_SFU_CSRRC: sfu_req_ready = csr_execute_if.ready;
default: sfu_req_ready = wctl_execute_if.ready;
endcase
end
assign per_block_execute_if[0].ready = sfu_req_ready;
// response arbitration
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) arb_commit_if[BLOCK_SIZE]();
VX_stream_arb #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("R"),
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),
.data_out (arb_commit_if[0].data),
.valid_out (arb_commit_if[0].valid),
.ready_out (arb_commit_if[0].ready),
`UNUSED_PIN (sel_out)
.commit_if (pe_commit_if[PE_IDX_CSRS])
);
VX_gather_unit #(
@ -181,9 +139,9 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.NUM_LANES (NUM_LANES),
.OUT_BUF (3)
) gather_unit (
.clk (clk),
.reset (reset),
.commit_in_if (arb_commit_if),
.clk (clk),
.reset (reset),
.commit_in_if (per_block_commit_if),
.commit_out_if (commit_if)
);

View file

@ -45,16 +45,13 @@ module VX_split_join import VX_gpu_pkg::*; #(
wire ipdom_push = valid && split.valid && split.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin_is_dvg;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (ipdom_reset, reset);
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_ipdom_stacks
VX_ipdom_stack #(
.WIDTH (`NUM_THREADS+`PC_BITS),
.DEPTH (`DV_STACK_SIZE)
) ipdom_stack (
.clk (clk),
.reset (ipdom_reset),
.reset (reset),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),

View file

@ -1,399 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`ifndef VX_TRACE_PKG_VH
`define VX_TRACE_PKG_VH
`include "VX_define.vh"
package VX_trace_pkg;
`ifdef SIMULATION
`ifdef SV_DPI
import "DPI-C" function void dpi_trace(input int level, input string format /*verilator sformat*/);
`endif
import VX_gpu_pkg::*;
// Emits the short name of the execution unit selected by ex_type:
// "ALU", "LSU", "FPU" or "SFU"; any other encoding is printed as "?".
//   level   - trace verbosity level forwarded to the TRACE macro
//   ex_type - execution unit selector
task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
    if (ex_type == `EX_ALU) begin
        `TRACE(level, ("ALU"));
    end else if (ex_type == `EX_LSU) begin
        `TRACE(level, ("LSU"));
    end else if (ex_type == `EX_FPU) begin
        `TRACE(level, ("FPU"));
    end else if (ex_type == `EX_SFU) begin
        `TRACE(level, ("SFU"));
    end else begin
        // unrecognized (or unknown-valued) selector
        `TRACE(level, ("?"));
    end
endtask
// Emits the assembly mnemonic of an instruction to the trace output.
//   level   - trace verbosity level forwarded to the TRACE macro
//   ex_type - execution unit selector (ALU / LSU / FPU / SFU)
//   op_type - unit-specific opcode; truncated to the unit's opcode width
//   op_args - decoded instruction arguments; only the member matching
//             ex_type is inspected (alu, lsu, fpu, wctl or csr)
// Any combination that is not recognized is printed as "?".
//
// NOTE(review): TRACE is a statement macro defined outside this file. Every
// TRACE call that sits in an if/else branch is wrapped in begin/end so that
// the else can never re-associate with an if hidden inside the macro
// expansion - confirm against the macro definition in use.
task trace_ex_op(input int level,
                 input [`EX_BITS-1:0] ex_type,
                 input [`INST_OP_BITS-1:0] op_type,
                 input VX_gpu_pkg::op_args_t op_args
);
    case (ex_type)
    `EX_ALU: begin
        case (op_args.alu.xtype)
        `ALU_TYPE_ARITH: begin
            if (op_args.alu.is_w) begin
                // word (W-suffixed) variants
                if (op_args.alu.use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADDIW"));
                    `INST_ALU_SLL: `TRACE(level, ("SLLIW"));
                    `INST_ALU_SRL: `TRACE(level, ("SRLIW"));
                    `INST_ALU_SRA: `TRACE(level, ("SRAIW"));
                    default:       `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD: `TRACE(level, ("ADDW"));
                    `INST_ALU_SUB: `TRACE(level, ("SUBW"));
                    `INST_ALU_SLL: `TRACE(level, ("SLLW"));
                    `INST_ALU_SRL: `TRACE(level, ("SRLW"));
                    `INST_ALU_SRA: `TRACE(level, ("SRAW"));
                    default:       `TRACE(level, ("?"));
                    endcase
                end
            end else begin
                if (op_args.alu.use_imm) begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD:   `TRACE(level, ("ADDI"));
                    `INST_ALU_SLL:   `TRACE(level, ("SLLI"));
                    `INST_ALU_SRL:   `TRACE(level, ("SRLI"));
                    `INST_ALU_SRA:   `TRACE(level, ("SRAI"));
                    `INST_ALU_SLT:   `TRACE(level, ("SLTI"));
                    `INST_ALU_SLTU:  `TRACE(level, ("SLTIU"));
                    `INST_ALU_XOR:   `TRACE(level, ("XORI"));
                    `INST_ALU_OR:    `TRACE(level, ("ORI"));
                    `INST_ALU_AND:   `TRACE(level, ("ANDI"));
                    `INST_ALU_LUI:   `TRACE(level, ("LUI"));
                    `INST_ALU_AUIPC: `TRACE(level, ("AUIPC"));
                    default:         `TRACE(level, ("?"));
                    endcase
                end else begin
                    case (`INST_ALU_BITS'(op_type))
                    `INST_ALU_ADD:  `TRACE(level, ("ADD"));
                    `INST_ALU_SUB:  `TRACE(level, ("SUB"));
                    `INST_ALU_SLL:  `TRACE(level, ("SLL"));
                    `INST_ALU_SRL:  `TRACE(level, ("SRL"));
                    `INST_ALU_SRA:  `TRACE(level, ("SRA"));
                    `INST_ALU_SLT:  `TRACE(level, ("SLT"));
                    `INST_ALU_SLTU: `TRACE(level, ("SLTU"));
                    `INST_ALU_XOR:  `TRACE(level, ("XOR"));
                    `INST_ALU_OR:   `TRACE(level, ("OR"));
                    `INST_ALU_AND:  `TRACE(level, ("AND"));
                    `INST_ALU_CZEQ: `TRACE(level, ("CZERO.EQZ"));
                    `INST_ALU_CZNE: `TRACE(level, ("CZERO.NEZ"));
                    default:        `TRACE(level, ("?"));
                    endcase
                end
            end
        end
        `ALU_TYPE_BRANCH: begin
            case (`INST_BR_BITS'(op_type))
            `INST_BR_EQ:     `TRACE(level, ("BEQ"));
            `INST_BR_NE:     `TRACE(level, ("BNE"));
            `INST_BR_LT:     `TRACE(level, ("BLT"));
            `INST_BR_GE:     `TRACE(level, ("BGE"));
            `INST_BR_LTU:    `TRACE(level, ("BLTU"));
            `INST_BR_GEU:    `TRACE(level, ("BGEU"));
            `INST_BR_JAL:    `TRACE(level, ("JAL"));
            `INST_BR_JALR:   `TRACE(level, ("JALR"));
            `INST_BR_ECALL:  `TRACE(level, ("ECALL"));
            `INST_BR_EBREAK: `TRACE(level, ("EBREAK"));
            `INST_BR_URET:   `TRACE(level, ("URET"));
            `INST_BR_SRET:   `TRACE(level, ("SRET"));
            `INST_BR_MRET:   `TRACE(level, ("MRET"));
            default:         `TRACE(level, ("?"));
            endcase
        end
        `ALU_TYPE_MULDIV: begin
            if (op_args.alu.is_w) begin
                case (`INST_M_BITS'(op_type))
                `INST_M_MUL:  `TRACE(level, ("MULW"));
                `INST_M_DIV:  `TRACE(level, ("DIVW"));
                `INST_M_DIVU: `TRACE(level, ("DIVUW"));
                `INST_M_REM:  `TRACE(level, ("REMW"));
                `INST_M_REMU: `TRACE(level, ("REMUW"));
                default:      `TRACE(level, ("?"));
                endcase
            end else begin
                case (`INST_M_BITS'(op_type))
                `INST_M_MUL:    `TRACE(level, ("MUL"));
                `INST_M_MULH:   `TRACE(level, ("MULH"));
                `INST_M_MULHSU: `TRACE(level, ("MULHSU"));
                `INST_M_MULHU:  `TRACE(level, ("MULHU"));
                `INST_M_DIV:    `TRACE(level, ("DIV"));
                `INST_M_DIVU:   `TRACE(level, ("DIVU"));
                `INST_M_REM:    `TRACE(level, ("REM"));
                `INST_M_REMU:   `TRACE(level, ("REMU"));
                default:        `TRACE(level, ("?"));
                endcase
            end
        end
        default: `TRACE(level, ("?"));
        endcase
    end
    `EX_LSU: begin
        if (op_args.lsu.is_float) begin
            // floating-point loads/stores reuse the integer LSU opcodes
            case (`INST_LSU_BITS'(op_type))
            `INST_LSU_LW: `TRACE(level, ("FLW"));
            `INST_LSU_LD: `TRACE(level, ("FLD"));
            `INST_LSU_SW: `TRACE(level, ("FSW"));
            `INST_LSU_SD: `TRACE(level, ("FSD"));
            default:      `TRACE(level, ("?"));
            endcase
        end else begin
            case (`INST_LSU_BITS'(op_type))
            `INST_LSU_LB:    `TRACE(level, ("LB"));
            `INST_LSU_LH:    `TRACE(level, ("LH"));
            `INST_LSU_LW:    `TRACE(level, ("LW"));
            `INST_LSU_LD:    `TRACE(level, ("LD"));
            `INST_LSU_LBU:   `TRACE(level, ("LBU"));
            `INST_LSU_LHU:   `TRACE(level, ("LHU"));
            `INST_LSU_LWU:   `TRACE(level, ("LWU"));
            `INST_LSU_SB:    `TRACE(level, ("SB"));
            `INST_LSU_SH:    `TRACE(level, ("SH"));
            `INST_LSU_SW:    `TRACE(level, ("SW"));
            `INST_LSU_SD:    `TRACE(level, ("SD"));
            `INST_LSU_FENCE: `TRACE(level, ("FENCE"));
            default:         `TRACE(level, ("?"));
            endcase
        end
    end
    `EX_FPU: begin
        // fmt[0] selects the ".D" mnemonic over ".S"; for the integer
        // conversions fmt[1] selects the "L"/"LU" form over "W"/"WU".
        case (`INST_FPU_BITS'(op_type))
        `INST_FPU_ADD: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FADD.D"));
            end else begin
                `TRACE(level, ("FADD.S"));
            end
        end
        `INST_FPU_SUB: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FSUB.D"));
            end else begin
                `TRACE(level, ("FSUB.S"));
            end
        end
        `INST_FPU_MUL: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FMUL.D"));
            end else begin
                `TRACE(level, ("FMUL.S"));
            end
        end
        `INST_FPU_DIV: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FDIV.D"));
            end else begin
                `TRACE(level, ("FDIV.S"));
            end
        end
        `INST_FPU_SQRT: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FSQRT.D"));
            end else begin
                `TRACE(level, ("FSQRT.S"));
            end
        end
        `INST_FPU_MADD: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FMADD.D"));
            end else begin
                `TRACE(level, ("FMADD.S"));
            end
        end
        `INST_FPU_MSUB: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FMSUB.D"));
            end else begin
                `TRACE(level, ("FMSUB.S"));
            end
        end
        `INST_FPU_NMADD: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FNMADD.D"));
            end else begin
                `TRACE(level, ("FNMADD.S"));
            end
        end
        `INST_FPU_NMSUB: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FNMSUB.D"));
            end else begin
                `TRACE(level, ("FNMSUB.S"));
            end
        end
        `INST_FPU_CMP: begin
            // the low two frm bits pick the comparison
            if (op_args.fpu.fmt[0]) begin
                case (op_args.fpu.frm[1:0])
                0:       `TRACE(level, ("FLE.D"));
                1:       `TRACE(level, ("FLT.D"));
                2:       `TRACE(level, ("FEQ.D"));
                default: `TRACE(level, ("?"));
                endcase
            end else begin
                case (op_args.fpu.frm[1:0])
                0:       `TRACE(level, ("FLE.S"));
                1:       `TRACE(level, ("FLT.S"));
                2:       `TRACE(level, ("FEQ.S"));
                default: `TRACE(level, ("?"));
                endcase
            end
        end
        `INST_FPU_F2F: begin
            if (op_args.fpu.fmt[0]) begin
                `TRACE(level, ("FCVT.D.S"));
            end else begin
                `TRACE(level, ("FCVT.S.D"));
            end
        end
        `INST_FPU_F2I: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.L.D"));
                end else begin
                    `TRACE(level, ("FCVT.W.D"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.L.S"));
                end else begin
                    `TRACE(level, ("FCVT.W.S"));
                end
            end
        end
        `INST_FPU_F2U: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.LU.D"));
                end else begin
                    `TRACE(level, ("FCVT.WU.D"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.LU.S"));
                end else begin
                    `TRACE(level, ("FCVT.WU.S"));
                end
            end
        end
        `INST_FPU_I2F: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.D.L"));
                end else begin
                    `TRACE(level, ("FCVT.D.W"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.S.L"));
                end else begin
                    `TRACE(level, ("FCVT.S.W"));
                end
            end
        end
        `INST_FPU_U2F: begin
            if (op_args.fpu.fmt[0]) begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.D.LU"));
                end else begin
                    `TRACE(level, ("FCVT.D.WU"));
                end
            end else begin
                if (op_args.fpu.fmt[1]) begin
                    `TRACE(level, ("FCVT.S.LU"));
                end else begin
                    `TRACE(level, ("FCVT.S.WU"));
                end
            end
        end
        `INST_FPU_MISC: begin
            // frm is reused as a sub-opcode for the miscellaneous group
            if (op_args.fpu.fmt[0]) begin
                case (op_args.fpu.frm)
                0:       `TRACE(level, ("FSGNJ.D"));
                1:       `TRACE(level, ("FSGNJN.D"));
                2:       `TRACE(level, ("FSGNJX.D"));
                3:       `TRACE(level, ("FCLASS.D"));
                4:       `TRACE(level, ("FMV.X.D"));
                5:       `TRACE(level, ("FMV.D.X"));
                6:       `TRACE(level, ("FMIN.D"));
                7:       `TRACE(level, ("FMAX.D"));
                // matches the "?" fallback used by every other case here
                default: `TRACE(level, ("?"));
                endcase
            end else begin
                case (op_args.fpu.frm)
                0:       `TRACE(level, ("FSGNJ.S"));
                1:       `TRACE(level, ("FSGNJN.S"));
                2:       `TRACE(level, ("FSGNJX.S"));
                3:       `TRACE(level, ("FCLASS.S"));
                4:       `TRACE(level, ("FMV.X.S"));
                5:       `TRACE(level, ("FMV.S.X"));
                6:       `TRACE(level, ("FMIN.S"));
                7:       `TRACE(level, ("FMAX.S"));
                default: `TRACE(level, ("?"));
                endcase
            end
        end
        default: `TRACE(level, ("?"));
        endcase
    end
    `EX_SFU: begin
        case (`INST_SFU_BITS'(op_type))
        `INST_SFU_TMC:    `TRACE(level, ("TMC"));
        `INST_SFU_WSPAWN: `TRACE(level, ("WSPAWN"));
        `INST_SFU_SPLIT: begin
            if (op_args.wctl.is_neg) begin
                `TRACE(level, ("SPLIT.N"));
            end else begin
                `TRACE(level, ("SPLIT"));
            end
        end
        `INST_SFU_JOIN:   `TRACE(level, ("JOIN"));
        `INST_SFU_BAR:    `TRACE(level, ("BAR"));
        `INST_SFU_PRED: begin
            if (op_args.wctl.is_neg) begin
                `TRACE(level, ("PRED.N"));
            end else begin
                `TRACE(level, ("PRED"));
            end
        end
        `INST_SFU_CSRRW: begin
            if (op_args.csr.use_imm) begin
                `TRACE(level, ("CSRRWI"));
            end else begin
                `TRACE(level, ("CSRRW"));
            end
        end
        `INST_SFU_CSRRS: begin
            if (op_args.csr.use_imm) begin
                `TRACE(level, ("CSRRSI"));
            end else begin
                `TRACE(level, ("CSRRS"));
            end
        end
        `INST_SFU_CSRRC: begin
            if (op_args.csr.use_imm) begin
                `TRACE(level, ("CSRRCI"));
            end else begin
                `TRACE(level, ("CSRRC"));
            end
        end
        default: `TRACE(level, ("?"));
        endcase
    end
    default: `TRACE(level, ("?"));
    endcase
endtask
// Emits the decoded argument fields of an instruction, formatted as a
// ", name=value" list so it can be appended to an already-started trace line.
//   level   - trace verbosity level forwarded to the TRACE macro
//   ex_type - execution unit selector; decides which op_args member is printed
//   op_type - unit-specific opcode; only consulted for SFU instructions
//   op_args - decoded instruction arguments
// SFU instructions that are not CSR accesses, and unrecognized ex_type
// values, print nothing.
task trace_op_args(input int level,
                   input [`EX_BITS-1:0] ex_type,
                   input [`INST_OP_BITS-1:0] op_type,
                   input VX_gpu_pkg::op_args_t op_args
);
    if (ex_type == `EX_ALU) begin
        `TRACE(level, (", use_PC=%b, use_imm=%b, imm=0x%0h", op_args.alu.use_PC, op_args.alu.use_imm, op_args.alu.imm));
    end else if (ex_type == `EX_LSU) begin
        `TRACE(level, (", offset=0x%0h", op_args.lsu.offset));
    end else if (ex_type == `EX_FPU) begin
        `TRACE(level, (", fmt=0x%0h, frm=0x%0h", op_args.fpu.fmt, op_args.fpu.frm));
    end else if (ex_type == `EX_SFU) begin
        // only the CSR instructions carry printable arguments
        if (`INST_SFU_IS_CSR(op_type)) begin
            `TRACE(level, (", addr=0x%0h, use_imm=%b, imm=0x%0h", op_args.csr.addr, op_args.csr.use_imm, op_args.csr.imm));
        end
    end
endtask
// Emits the symbolic name of a base device-configuration register address
// ("STARTUP_ADDR0", "STARTUP_ADDR1", "STARTUP_ARG0", "STARTUP_ARG1",
// "MPM_CLASS"); any other address is printed as "?".
//   level - trace verbosity level forwarded to the TRACE macro
//   addr  - DCR address to translate
task trace_base_dcr(input int level, input [`VX_DCR_ADDR_WIDTH-1:0] addr);
    if (addr == `VX_DCR_BASE_STARTUP_ADDR0) begin
        `TRACE(level, ("STARTUP_ADDR0"));
    end else if (addr == `VX_DCR_BASE_STARTUP_ADDR1) begin
        `TRACE(level, ("STARTUP_ADDR1"));
    end else if (addr == `VX_DCR_BASE_STARTUP_ARG0) begin
        `TRACE(level, ("STARTUP_ARG0"));
    end else if (addr == `VX_DCR_BASE_STARTUP_ARG1) begin
        `TRACE(level, ("STARTUP_ARG1"));
    end else if (addr == `VX_DCR_BASE_MPM_CLASS) begin
        `TRACE(level, ("MPM_CLASS"));
    end else begin
        // address outside the known base DCR set
        `TRACE(level, ("?"));
    end
endtask
`endif
endpackage
`endif // VX_TRACE_PKG_VH

View file

@ -0,0 +1,44 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
// Per-warp instruction UUID generator.
//
// Keeps one 32-bit counter per warp. The uuid output is combinational: its
// low 32 bits are the current count of the warp selected by wid, and the
// remaining upper bits are a global warp id built from CORE_ID and wid.
// When incr is asserted on a clock edge, the selected warp's counter advances.
//
// The counter array itself is not reset; a per-warp "initialized" flag is
// cleared by reset instead, and a warp whose flag is clear reads as count 0
// and restarts at 1 on its first increment.
//
// Parameters:
//   CORE_ID    - core index folded into the upper uuid bits
//   UUID_WIDTH - total uuid width; the upper part is UUID_WIDTH - 32 bits wide
// Ports:
//   clk, reset - clock and synchronous reset (clears the initialized flags)
//   incr       - advance the counter of warp wid on this clock edge
//   wid        - warp selecting both the counter read and the counter written
//   uuid       - {global warp id, current count of warp wid}
module VX_uuid_gen import VX_gpu_pkg::*; #(
    parameter CORE_ID    = 0,
    parameter UUID_WIDTH = 48
) (
    input wire clk,
    input wire reset,
    input wire incr,
    input wire [`NW_WIDTH-1:0] wid,
    output wire [UUID_WIDTH-1:0] uuid
);
    localparam GNW_WIDTH = UUID_WIDTH - 32;

    reg [31:0] uuid_cntrs [0:`NUM_WARPS-1];
    reg [`NUM_WARPS-1:0] has_uuid_cntrs;

    // view of the selected warp's counter: an uninitialized warp reads as 0
    // and steps to 1, an initialized one steps by one
    wire        cntr_is_init = has_uuid_cntrs[wid];
    wire [31:0] cntr_value   = cntr_is_init ? uuid_cntrs[wid] : 32'd0;
    wire [31:0] cntr_next    = cntr_is_init ? (uuid_cntrs[wid] + 32'd1) : 32'd1;

    // initialized flags: cleared on reset, set on the first increment
    always @(posedge clk) begin
        if (reset) begin
            has_uuid_cntrs <= '0;
        end else if (incr) begin
            has_uuid_cntrs[wid] <= 1'b1;
        end
    end

    // counter storage: written on every increment, independent of reset
    always @(posedge clk) begin
        if (incr) begin
            uuid_cntrs[wid] <= cntr_next;
        end
    end

    // global warp id: core index shifted above the local warp index
    wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid);

    assign uuid = {g_wid, cntr_value};

endmodule

View file

@ -50,9 +50,9 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
wire [`UP(LANE_BITS)-1:0] tid;
if (LANE_BITS != 0) begin
if (LANE_BITS != 0) begin : g_tid
assign tid = execute_if.data.tid[0 +: LANE_BITS];
end else begin
end else begin : g_no_tid
assign tid = 0;
end
@ -63,7 +63,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
wire not_pred = execute_if.data.op_args.wctl.is_neg;
wire [NUM_LANES-1:0] taken;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_taken
assign taken[i] = (execute_if.data.rs1_data[i][0] ^ not_pred);
end
@ -131,7 +131,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
// wspawn
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_wspawn_wmask
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
end
assign wspawn.valid = is_wspawn;
@ -162,7 +162,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign warp_ctl_if.sjoin = sjoin_r;
assign warp_ctl_if.barrier = barrier_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit_if
assign commit_if.data.data[i] = `XLEN'(dvstack_ptr);
end

View file

@ -1,17 +1,17 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Modified port of cast module from fpnew Library
// Modified port of cast module from fpnew Library
// reference: https://github.com/pulp-platform/fpnew
`include "VX_fpu_define.vh"
@ -22,7 +22,8 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
parameter LATENCY = 1,
parameter INT_WIDTH = 32,
parameter MAN_BITS = 23,
parameter EXP_BITS = 8
parameter EXP_BITS = 8,
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
@ -35,10 +36,10 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
input wire is_signed,
input wire [31:0] dataa,
output wire [31:0] result,
output wire [31:0] result,
output wire [`FP_FLAGS_BITS-1:0] fflags
);
);
// Constants
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
@ -55,11 +56,11 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
localparam FMT_SHIFT_COMPENSATION = S_MAN_WIDTH - 1 - MAN_BITS;
localparam NUM_FP_STICKY = 2 * S_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
localparam NUM_INT_STICKY = 2 * S_MAN_WIDTH - INT_WIDTH; // removed int and R
// Input processing
fclass_t fclass;
VX_fp_classifier #(
fclass_t fclass;
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_classifier (
@ -69,9 +70,9 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
);
wire [S_MAN_WIDTH-1:0] input_mant;
wire [S_EXP_WIDTH-1:0] input_exp;
wire [S_EXP_WIDTH-1:0] input_exp;
wire input_sign;
wire i2f_sign = dataa[INT_WIDTH-1];
wire f2i_sign = dataa[INT_WIDTH-1] && is_signed;
wire [S_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa) : dataa;
@ -81,7 +82,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
assign input_sign = is_itof ? f2i_sign : i2f_sign;
// Pipeline stage0
wire is_itof_s0;
wire is_signed_s0;
wire [2:0] rnd_mode_s0;
@ -92,7 +93,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH),
.DEPTH (LATENCY > 2)
.DEPTH (LATENCY > 1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
@ -100,7 +101,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_in ({is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
);
// Normalization
wire [LZC_RESULT_WIDTH-1:0] renorm_shamt_s0; // renormalization shift amount
@ -113,12 +114,12 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_out (renorm_shamt_s0),
.valid_out (mant_is_nonzero_s0)
);
wire mant_is_zero_s0 = ~mant_is_nonzero_s0;
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
wire [S_MAN_WIDTH-1:0] input_mant_n_s0; // normalized input mantissa
wire [S_EXP_WIDTH-1:0] input_exp_n_s0; // unbiased true exponent
// Realign input mantissa, append zeroes if destination is wider
assign input_mant_n_s0 = encoded_mant_s0 << renorm_shamt_s0;
@ -140,7 +141,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH),
.DEPTH (LATENCY > 1)
.DEPTH (LATENCY > 2)
) pipe_reg1 (
.clk (clk),
.reset (reset),
@ -169,30 +170,30 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
wire of_before_round_s1 = overflow;
// Pipeline stage2
wire is_itof_s2;
wire is_signed_s2;
wire [2:0] rnd_mode_s2;
fclass_t fclass_s2;
fclass_t fclass_s2;
wire mant_is_zero_s2;
wire input_sign_s2;
wire [2*S_MAN_WIDTH:0] destination_mant_s2;
wire [EXP_BITS-1:0] final_exp_s2;
wire of_before_round_s2;
VX_pipe_register #(
.DATAW (1 + 1 + `INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1),
.DEPTH (LATENCY > 3)
.DEPTH (LATENCY > 0)
) pipe_reg2 (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
);
);
// Rounding and classification
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
@ -237,20 +238,20 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
wire is_itof_s3;
wire is_signed_s3;
fclass_t fclass_s3;
fclass_t fclass_s3;
wire mant_is_zero_s3;
wire input_sign_s3;
wire rounded_sign_s3;
wire [INT_WIDTH-1:0] rounded_abs_s3;
wire of_before_round_s3;
wire of_before_round_s3;
wire f2i_round_has_sticky_s3;
wire i2f_round_has_sticky_s3;
`UNUSED_VAR (fclass_s3)
`UNUSED_VAR (fclass_s3)
VX_pipe_register #(
.DATAW (1 + 1 + $bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1),
.DEPTH (LATENCY > 4)
.DEPTH (LATENCY > 3)
) pipe_reg3 (
.clk (clk),
.reset (reset),
@ -258,7 +259,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
.data_in ({is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
.data_out ({is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
);
// Assemble regular result, nan box short ones. Int zeroes need to be detected
wire [INT_WIDTH-1:0] fmt_result_s3 = mant_is_zero_s3 ? 0 : {rounded_sign_s3, rounded_abs_s3[EXP_BITS+MAN_BITS-1:0]};
@ -278,18 +279,18 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
f2i_special_result_s3[INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
f2i_special_result_s3[INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
end
end
end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
wire f2i_result_is_special_s3 = fclass_s3.is_nan
wire f2i_result_is_special_s3 = fclass_s3.is_nan
| fclass_s3.is_inf
| of_before_round_s3
| (input_sign_s3 & ~is_signed_s3 & ~rounded_int_res_zero_s3);
fflags_t f2i_special_status_s3;
fflags_t i2f_status_s3, f2i_status_s3;
fflags_t tmp_fflags_s3;
// All integer special cases are invalid
assign f2i_special_status_s3 = {1'b1, 4'h0};
@ -306,7 +307,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (32 + `FP_FLAGS_BITS),
.DEPTH (LATENCY > 0)
.DEPTH (OUT_REG)
) pipe_reg4 (
.clk (clk),
.reset (reset),

View file

@ -1,17 +1,17 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Modified port of noncomp module from fpnew Library
// Modified port of noncomp module from fpnew Library
// reference: https://github.com/pulp-platform/fpnew
`include "VX_fpu_define.vh"
@ -19,9 +19,10 @@
`ifdef FPU_DSP
module VX_fncp_unit import VX_fpu_pkg::*; #(
parameter LATENCY = 2,
parameter LATENCY = 1,
parameter EXP_BITS = 8,
parameter MAN_BITS = 23
parameter MAN_BITS = 23,
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
@ -33,10 +34,10 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
input wire [31:0] dataa,
input wire [31:0] datab,
output wire [31:0] result,
output wire [31:0] result,
output wire [`FP_FLAGS_BITS-1:0] fflags
);
);
localparam NEG_INF = 32'h00000001,
NEG_NORM = 32'h00000002,
NEG_SUBNORM = 32'h00000004,
@ -55,15 +56,15 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
wire a_smaller, ab_equal;
// Setup
assign a_sign = dataa[31];
assign a_sign = dataa[31];
assign a_exponent = dataa[30:23];
assign a_mantissa = dataa[22:0];
assign b_sign = datab[31];
assign b_sign = datab[31];
assign b_exponent = datab[30:23];
assign b_mantissa = datab[22:0];
VX_fp_classifier #(
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_a (
@ -72,7 +73,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
.clss_o (a_fclass)
);
VX_fp_classifier #(
VX_fp_classifier #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class_b (
@ -82,7 +83,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
);
assign a_smaller = (dataa < datab) ^ (a_sign || b_sign);
assign ab_equal = (dataa == datab)
assign ab_equal = (dataa == datab)
|| (a_fclass.is_zero && b_fclass.is_zero); // +0 == -0
// Pipeline stage0
@ -101,54 +102,54 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (4 + 2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1),
.DEPTH (LATENCY > 1)
.DEPTH (LATENCY > 0)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({op_mod, dataa, datab, a_sign, b_sign, a_exponent, a_mantissa, a_fclass, b_fclass, a_smaller, ab_equal}),
.data_out ({op_mod_s0, dataa_s0, datab_s0, a_sign_s0, b_sign_s0, a_exponent_s0, a_mantissa_s0, a_fclass_s0, b_fclass_s0, a_smaller_s0, ab_equal_s0})
);
);
// FCLASS
reg [31:0] fclass_mask_s0; // generate a 10-bit mask for integer reg
always @(*) begin
always @(*) begin
if (a_fclass_s0.is_normal) begin
fclass_mask_s0 = a_sign_s0 ? NEG_NORM : POS_NORM;
end
end
else if (a_fclass_s0.is_inf) begin
fclass_mask_s0 = a_sign_s0 ? NEG_INF : POS_INF;
end
end
else if (a_fclass_s0.is_zero) begin
fclass_mask_s0 = a_sign_s0 ? NEG_ZERO : POS_ZERO;
end
end
else if (a_fclass_s0.is_subnormal) begin
fclass_mask_s0 = a_sign_s0 ? NEG_SUBNORM : POS_SUBNORM;
end
end
else if (a_fclass_s0.is_nan) begin
fclass_mask_s0 = {22'h0, a_fclass_s0.is_quiet, a_fclass_s0.is_signaling, 8'h0};
end
else begin
end
else begin
fclass_mask_s0 = QUT_NAN;
end
end
// Min/Max
// Min/Max
reg [31:0] fminmax_res_s0;
always @(*) begin
if (a_fclass_s0.is_nan && b_fclass_s0.is_nan)
fminmax_res_s0 = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
else if (a_fclass_s0.is_nan)
else if (a_fclass_s0.is_nan)
fminmax_res_s0 = datab_s0;
else if (b_fclass_s0.is_nan)
else if (b_fclass_s0.is_nan)
fminmax_res_s0 = dataa_s0;
else begin
else begin
// FMIN, FMAX
fminmax_res_s0 = (op_mod_s0[0] ^ a_smaller_s0) ? dataa_s0 : datab_s0;
end
end
// Sign injection
// Sign injection
reg [31:0] fsgnj_res_s0; // result of sign injection
always @(*) begin
case (op_mod_s0[1:0])
@ -158,12 +159,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
endcase
end
// Comparison
// Comparison
reg fcmp_res_s0; // result of comparison
reg fcmp_fflags_NV_s0; // comparison fflags
always @(*) begin
case (op_mod_s0[1:0])
0: begin // LE
0: begin // LE
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
fcmp_res_s0 = 0;
fcmp_fflags_NV_s0 = 1;
@ -179,12 +180,12 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
end else begin
fcmp_res_s0 = (a_smaller_s0 & ~ab_equal_s0);
fcmp_fflags_NV_s0 = 0;
end
end
end
2: begin // EQ
if (a_fclass_s0.is_nan || b_fclass_s0.is_nan) begin
fcmp_res_s0 = 0;
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
fcmp_fflags_NV_s0 = a_fclass_s0.is_signaling | b_fclass_s0.is_signaling;
end else begin
fcmp_res_s0 = ab_equal_s0;
fcmp_fflags_NV_s0 = 0;
@ -192,7 +193,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
end
default: begin
fcmp_res_s0 = 'x;
fcmp_fflags_NV_s0 = 'x;
fcmp_fflags_NV_s0 = 'x;
end
endcase
end
@ -216,7 +217,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
// FMV
result_s0 = dataa_s0;
fflags_NV_s0 = 0;
end
end
6,7: begin
// MIN/MAX
result_s0 = fminmax_res_s0;
@ -229,7 +230,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
VX_pipe_register #(
.DATAW (32 + 1),
.DEPTH (LATENCY > 0)
.DEPTH (OUT_REG)
) pipe_reg1 (
.clk (clk),
.reset (reset),

View file

@ -46,56 +46,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
`UNUSED_VAR (frm)
localparam DATAW = 32 + `INST_FRM_BITS + 1 + 1;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
fflags_t [NUM_LANES-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][31:0] pe_data_in;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: `INST_FRM_BITS] = frm;
assign data_in[i][32 + `INST_FRM_BITS +: 1] = is_itof;
assign data_in[i][32 + `INST_FRM_BITS + 1 +: 1] = is_signed;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FCVT),
.DATA_IN_WIDTH(32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
.OUT_BUF (2)
) pe_serializer (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.data_in (dataa),
.data_in (data_in),
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fcvt_units
VX_fcvt_unit #(
.LATENCY (`LATENCY_FCVT)
.LATENCY (`LATENCY_FCVT),
.OUT_REG (1)
) fcvt_unit (
.clk (clk),
.reset (reset),
.enable (pe_enable),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.frm (pe_data_in[0][32 +: `INST_FRM_BITS]),
.is_itof (pe_data_in[0][32 + `INST_FRM_BITS +: 1]),
.is_signed (pe_data_in[0][32 + `INST_FRM_BITS + 1 +: 1]),
.dataa (pe_data_in[i][0 +: 32]),
.result (pe_data_out[i][0 +: 32]),
.fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS])

View file

@ -44,31 +44,33 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
output wire valid_out,
input wire ready_out
);
`UNUSED_VAR (frm)
localparam DATAW = 2 * 32 + `INST_FRM_BITS;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
wire [NUM_LANES-1:0][2*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][2*32-1:0] pe_data_in;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = dataa[i];
assign data_in[i][32 +: 32] = datab[i];
assign data_in[i][64 +: `INST_FRM_BITS] = frm;
end
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FDIV),
.DATA_IN_WIDTH(2*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG (0),
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
.OUT_BUF (2)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -77,15 +79,17 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
for (genvar i = 0; i < NUM_LANES; ++i) begin
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
@ -94,7 +98,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
acl_fdiv fdiv (
.clk (clk),
.areset (1'b0),
@ -112,7 +116,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`elsif VIVADO
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
wire [3:0] tuser;
xil_fdiv fdiv (
.aclk (clk),
@ -134,7 +138,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
`else
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fdivs
reg [63:0] r;
`UNUSED_VAR (r)
fflags_t f;
@ -143,9 +147,9 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
dpi_fdiv (
pe_enable,
int'(0),
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
frm,
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
pe_data_in[0][64 +: `INST_FRM_BITS], // frm
r,
f
);

View file

@ -76,7 +76,6 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
reg is_div, is_fcmp, is_itof, is_utof, is_ftoi, is_ftou, is_f2f;
reg dst_fmt, int_fmt;
reg [NUM_LANES-1:0][63:0] operands [3];
@ -88,7 +87,8 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
end
`UNUSED_VAR (fmt)
wire f_fmt = fmt[0];
wire i_fmt = fmt[1];
always @(*) begin
is_fadd = 0;
@ -106,25 +106,11 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
is_ftou = 0;
is_f2f = 0;
dst_fmt = 0;
int_fmt = 0;
`ifdef FLEN_64
dst_fmt = fmt[0];
`endif
`ifdef XLEN_64
int_fmt = fmt[1];
`endif
case (op_type)
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end
`INST_FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = ~i_fmt; is_fsub = i_fmt; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = ~i_fmt; is_fmsub = i_fmt; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = ~i_fmt; is_fnmsub = i_fmt; end
`INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
@ -138,7 +124,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
end
generate
begin : fma
begin : g_fma
reg [NUM_LANES-1:0][`XLEN-1:0] result_fma;
reg [NUM_LANES-1:0][63:0] result_fadd;
@ -164,13 +150,13 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
dpi_fsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
dpi_fmul (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
dpi_fmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
dpi_fmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
dpi_fnmadd (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
dpi_fnmsub (fma_fire, int'(dst_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
dpi_fadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fadd[i], fflags_fadd[i]);
dpi_fsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fsub[i], fflags_fsub[i]);
dpi_fmul (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fmul[i], fflags_fmul[i]);
dpi_fmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmadd[i], fflags_fmadd[i]);
dpi_fmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fmsub[i], fflags_fmsub[i]);
dpi_fnmadd (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmadd[i], fflags_fnmadd[i]);
dpi_fnmsub (fma_fire, int'(f_fmt), operands[0][i], operands[1][i], operands[2][i], frm, result_fnmsub[i], fflags_fnmsub[i]);
result_fma[i] = is_fadd ? result_fadd[i][`XLEN-1:0] :
is_fsub ? result_fsub[i][`XLEN-1:0] :
@ -214,7 +200,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : fdiv
begin : g_fdiv
reg [NUM_LANES-1:0][`XLEN-1:0] result_fdiv_r;
reg [NUM_LANES-1:0][63:0] result_fdiv;
@ -226,7 +212,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fdiv (fdiv_fire, int'(dst_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
dpi_fdiv (fdiv_fire, int'(f_fmt), operands[0][i], operands[1][i], frm, result_fdiv[i], fflags_fdiv[i]);
result_fdiv_r[i] = result_fdiv[i][`XLEN-1:0];
end
end
@ -253,7 +239,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : fsqrt
begin : g_fsqrt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fsqrt_r;
reg [NUM_LANES-1:0][63:0] result_fsqrt;
@ -265,7 +251,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fsqrt (fsqrt_fire, int'(dst_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
dpi_fsqrt (fsqrt_fire, int'(f_fmt), operands[0][i], frm, result_fsqrt[i], fflags_fsqrt[i]);
result_fsqrt_r[i] = result_fsqrt[i][`XLEN-1:0];
end
end
@ -292,7 +278,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : fcvt
begin : g_fcvt
reg [NUM_LANES-1:0][`XLEN-1:0] result_fcvt;
reg [NUM_LANES-1:0][63:0] result_itof;
@ -313,11 +299,11 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_itof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
dpi_utof (fcvt_fire, int'(dst_fmt), int'(int_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
dpi_ftoi (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
dpi_ftou (fcvt_fire, int'(int_fmt), int'(dst_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
dpi_f2f (fcvt_fire, int'(dst_fmt), operands[0][i], result_f2f[i]);
dpi_itof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_itof[i], fflags_itof[i]);
dpi_utof (fcvt_fire, int'(f_fmt), int'(i_fmt), operands[0][i], frm, result_utof[i], fflags_utof[i]);
dpi_ftoi (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftoi[i], fflags_ftoi[i]);
dpi_ftou (fcvt_fire, int'(i_fmt), int'(f_fmt), operands[0][i], frm, result_ftou[i], fflags_ftou[i]);
dpi_f2f (fcvt_fire, int'(f_fmt), operands[0][i], result_f2f[i]);
result_fcvt[i] = is_itof ? result_itof[i][`XLEN-1:0] :
is_utof ? result_utof[i][`XLEN-1:0] :
@ -356,7 +342,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
endgenerate
generate
begin : fncp
begin : g_fncp
reg [NUM_LANES-1:0][`XLEN-1:0] result_fncp;
reg [NUM_LANES-1:0][63:0] result_fclss;
@ -384,17 +370,17 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
always @(*) begin
for (integer i = 0; i < NUM_LANES; ++i) begin
dpi_fclss (fncp_fire, int'(dst_fmt), operands[0][i], result_fclss[i]);
dpi_fle (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
dpi_flt (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
dpi_feq (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
dpi_fmin (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
dpi_fmax (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
dpi_fsgnj (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
dpi_fsgnjn (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
dpi_fsgnjx (fncp_fire, int'(dst_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
result_fmvx[i] = dst_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
result_fmvf[i] = dst_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
dpi_fclss (fncp_fire, int'(f_fmt), operands[0][i], result_fclss[i]);
dpi_fle (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fle[i], fflags_fle[i]);
dpi_flt (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_flt[i], fflags_flt[i]);
dpi_feq (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_feq[i], fflags_feq[i]);
dpi_fmin (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmin[i], fflags_fmin[i]);
dpi_fmax (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fmax[i], fflags_fmax[i]);
dpi_fsgnj (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnj[i]);
dpi_fsgnjn (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjn[i]);
dpi_fsgnjx (fncp_fire, int'(f_fmt), operands[0][i], operands[1][i], result_fsgnjx[i]);
result_fmvx[i] = f_fmt ? operands[0][i] : 64'($signed(operands[0][i][31:0])); // sign-extension
result_fmvf[i] = f_fmt ? operands[0][i] : (operands[0][i] | 64'hffffffff00000000); // nan-boxing
end
end
@ -444,7 +430,7 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.ARBITER ("P"),
.OUT_BUF (0)
) div_sqrt_arb (
.clk (clk),
@ -463,14 +449,14 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
wire [NUM_FPC-1:0][RSP_DATAW-1:0] per_core_data_out;
for (genvar i = 0; i < NUM_FPC; ++i) begin
for (genvar i = 0; i < NUM_FPC; ++i) begin : g_per_core_data_out
assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
end
VX_stream_arb #(
.NUM_INPUTS (NUM_FPC),
.DATAW (RSP_DATAW),
.ARBITER ("F"),
.ARBITER ("R"),
.OUT_BUF (OUT_BUF)
) rsp_arb (
.clk (clk),

View file

@ -51,68 +51,39 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
localparam FPU_DIVSQRT = 1;
localparam FPU_CVT = 2;
localparam FPU_NCP = 3;
localparam NUM_FPC = 4;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
localparam NUM_FPCORES = 4;
localparam FPCORES_BITS = `LOG2UP(NUM_FPCORES);
localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + 3 * (NUM_LANES * 32);
localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH;
`UNUSED_VAR (fmt)
wire [NUM_FPC-1:0] per_core_ready_in;
wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
wire [NUM_FPC-1:0][TAG_WIDTH-1:0] per_core_tag_out;
wire [NUM_FPC-1:0] per_core_ready_out;
wire [NUM_FPC-1:0] per_core_valid_out;
wire [NUM_FPC-1:0] per_core_has_fflags;
fflags_t [NUM_FPC-1:0] per_core_fflags;
wire [NUM_FPCORES-1:0] per_core_valid_in;
wire [NUM_FPCORES-1:0][REQ_DATAW-1:0] per_core_data_in;
wire [NUM_FPCORES-1:0] per_core_ready_in;
wire div_ready_in, sqrt_ready_in;
wire [NUM_LANES-1:0][31:0] div_result, sqrt_result;
wire [TAG_WIDTH-1:0] div_tag_out, sqrt_tag_out;
wire div_ready_out, sqrt_ready_out;
wire div_valid_out, sqrt_valid_out;
wire div_has_fflags, sqrt_has_fflags;
fflags_t div_fflags, sqrt_fflags;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0] per_core_mask_in;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_in;
wire [NUM_FPCORES-1:0][`INST_FPU_BITS-1:0] per_core_op_type;
wire [NUM_FPCORES-1:0][`INST_FMT_BITS-1:0] per_core_fmt;
wire [NUM_FPCORES-1:0][`INST_FRM_BITS-1:0] per_core_frm;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_dataa;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datab;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datac;
reg [FPC_BITS-1:0] core_select;
reg is_madd, is_sub, is_neg, is_div, is_itof, is_signed;
always @(*) begin
is_madd = 0;
is_sub = 0;
is_neg = 0;
is_div = 0;
is_itof = 0;
is_signed = 0;
case (op_type)
`INST_FPU_ADD: begin core_select = FPU_FMA; end
`INST_FPU_SUB: begin core_select = FPU_FMA; is_sub = 1; end
`INST_FPU_MUL: begin core_select = FPU_FMA; is_neg = 1; end
`INST_FPU_MADD: begin core_select = FPU_FMA; is_madd = 1; end
`INST_FPU_MSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; end
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_madd = 1; is_neg = 1; end
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_madd = 1; is_sub = 1; is_neg = 1; end
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
`INST_FPU_F2I: begin core_select = FPU_CVT; is_signed = 1; end
`INST_FPU_F2U: begin core_select = FPU_CVT; end
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
`INST_FPU_U2F: begin core_select = FPU_CVT; is_itof = 1; end
default: begin core_select = FPU_NCP; end
endcase
end
`RESET_RELAY (fma_reset, reset);
`RESET_RELAY (div_reset, reset);
`RESET_RELAY (sqrt_reset, reset);
`RESET_RELAY (cvt_reset, reset);
`RESET_RELAY (ncp_reset, reset);
wire [NUM_FPCORES-1:0] per_core_valid_out;
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_result;
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_out;
wire [NUM_FPCORES-1:0] per_core_has_fflags;
fflags_t [NUM_FPCORES-1:0] per_core_fflags;
wire [NUM_FPCORES-1:0] per_core_ready_out;
wire [NUM_LANES-1:0][31:0] dataa_s;
wire [NUM_LANES-1:0][31:0] datab_s;
wire [NUM_LANES-1:0][31:0] datac_s;
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data
assign dataa_s[i] = dataa[i][31:0];
assign datab_s[i] = datab[i][31:0];
assign datac_s[i] = datac[i][31:0];
@ -122,23 +93,60 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_VAR (datab)
`UNUSED_VAR (datac)
// Decode fpu core type
// The target core index is taken directly from op_type[3:2]; no case/lookup is involved.
// NOTE(review): this presumes the INST_FPU_* opcode encoding places the core index
// (FPU_FMA / FPU_DIVSQRT / FPU_CVT / FPU_NCP) in bits [3:2] - confirm against the opcode definitions.
wire [FPCORES_BITS-1:0] core_select = op_type[3:2];
// Request fan-out: a single incoming request stream is handed to VX_stream_switch with
// core_select as the selector, producing NUM_FPCORES per-core valid/data/ready triplets.
// NOTE(review): exact routing/back-pressure semantics are those of VX_stream_switch (defined elsewhere).
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_OUTPUTS (NUM_FPCORES)
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (core_select),
.valid_in (valid_in),
.ready_in (ready_in),
// Request bundle layout (MSB -> LSB): mask, tag, fmt, frm, dataa, datab, datac, op_type.
// op_type is the last concatenation operand, so it occupies the least-significant bits of the bundle.
// Any consumer that unpacks per_core_data_in must use this same field order.
.data_in ({mask_in, tag_in, fmt, frm, dataa_s, datab_s, datac_s, op_type}),
.data_out (per_core_data_in),
.valid_out (per_core_valid_in),
.ready_out (per_core_ready_in)
);
// Unpack each per-core request bundle into its named fields.
// The left-hand field order (mask, tag, fmt, frm, dataa, datab, datac, op_type) must stay
// identical to the concatenation driven into req_switch.data_in; the two are only linked by position.
for (genvar i = 0; i < NUM_FPCORES; ++i) begin : g_per_core_data_in
assign {
per_core_mask_in[i],
per_core_tag_in[i],
per_core_fmt[i],
per_core_frm[i],
per_core_dataa[i],
per_core_datab[i],
per_core_datac[i],
per_core_op_type[i]
} = per_core_data_in[i];
end
// FMA core ///////////////////////////////////////////////////////////////
wire is_madd = per_core_op_type[FPU_FMA][1];
wire is_neg = per_core_op_type[FPU_FMA][0];
wire is_sub = per_core_fmt[FPU_FMA][1];
VX_fpu_fma #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH)
) fpu_fma (
.clk (clk),
.reset (fma_reset),
.valid_in (valid_in && (core_select == FPU_FMA)),
.reset (reset),
.valid_in (per_core_valid_in[FPU_FMA]),
.ready_in (per_core_ready_in[FPU_FMA]),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.mask_in (per_core_mask_in[FPU_FMA]),
.tag_in (per_core_tag_in[FPU_FMA]),
.frm (per_core_frm[FPU_FMA]),
.is_madd (is_madd),
.is_sub (is_sub),
.is_neg (is_neg),
.dataa (dataa_s),
.datab (datab_s),
.datac (datac_s),
.dataa (per_core_dataa[FPU_FMA]),
.datab (per_core_datab[FPU_FMA]),
.datac (per_core_datac[FPU_FMA]),
.has_fflags (per_core_has_fflags[FPU_FMA]),
.fflags (per_core_fflags[FPU_FMA]),
.result (per_core_result[FPU_FMA]),
@ -147,25 +155,99 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.valid_out (per_core_valid_out[FPU_FMA])
);
// Div/Sqrt cores /////////////////////////////////////////////////////////
wire [1:0] div_sqrt_valid_in;
wire [1:0][REQ_DATAW-1:0] div_sqrt_data_in;
wire [1:0] div_sqrt_ready_in;
wire [1:0][NUM_LANES-1:0] div_sqrt_mask_in;
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_in;
wire [1:0][`INST_FPU_BITS-1:0] div_sqrt_op_type;
wire [1:0][`INST_FMT_BITS-1:0] div_sqrt_fmt;
wire [1:0][`INST_FRM_BITS-1:0] div_sqrt_frm;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_dataa;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datab;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datac;
wire [1:0] div_sqrt_valid_out;
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_result;
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_out;
wire [1:0] div_sqrt_has_fflags;
fflags_t [1:0] div_sqrt_fflags;
wire [1:0] div_sqrt_ready_out;
wire div_sqrt_valid_tmp_in;
wire [REQ_DATAW-1:0] div_sqrt_data_tmp_in;
wire div_sqrt_ready_tmp_in;
// Stage the FPU_DIVSQRT request stream through an elastic buffer before it is split
// between the divider and the square-root unit.
// NOTE(review): buffering depth/latency is whatever VX_elastic_buffer defaults to - not visible here.
VX_elastic_buffer #(
.DATAW (REQ_DATAW)
) div_sqrt_req_buffer (
.clk (clk),
.reset (reset),
.valid_in (per_core_valid_in[FPU_DIVSQRT]),
.ready_in (per_core_ready_in[FPU_DIVSQRT]),
.data_in (per_core_data_in[FPU_DIVSQRT]),
.data_out (div_sqrt_data_tmp_in),
.valid_out (div_sqrt_valid_tmp_in),
.ready_out (div_sqrt_ready_tmp_in)
);
// Bit 0 of the buffered bundle is op_type[0], because op_type is packed in the
// least-significant position of the request bundle built for req_switch.
// NOTE(review): relies on the DIV/SQRT opcodes differing in op_type[0] - confirm against the opcode definitions.
wire is_sqrt = div_sqrt_data_tmp_in[0]; // op_type[0]
// Second-level split of the buffered stream, selected by is_sqrt.
// Output index 0 is wired to fpu_div and index 1 to fpu_sqrt by the instances that follow.
VX_stream_switch #(
.DATAW (REQ_DATAW),
.NUM_OUTPUTS (2)
) div_sqrt_req_switch (
.clk (clk),
.reset (reset),
.sel_in (is_sqrt),
.valid_in (div_sqrt_valid_tmp_in),
.ready_in (div_sqrt_ready_tmp_in),
.data_in (div_sqrt_data_tmp_in),
.data_out (div_sqrt_data_in),
.valid_out (div_sqrt_valid_in),
.ready_out (div_sqrt_ready_in)
);
// Unpack the two div/sqrt request bundles (index 0 and index 1 of div_sqrt_data_in) into named fields.
// Field order matches the request bundle layout: mask, tag, fmt, frm, dataa, datab, datac, op_type.
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_data_in
assign {
div_sqrt_mask_in[i],
div_sqrt_tag_in[i],
div_sqrt_fmt[i],
div_sqrt_frm[i],
div_sqrt_dataa[i],
div_sqrt_datab[i],
div_sqrt_datac[i],
div_sqrt_op_type[i]
} = div_sqrt_data_in[i];
end
// The unpacked op_type, fmt and datac fields are not connected to fpu_div or fpu_sqrt.
// datab is only partially used: div_sqrt_datab[0] feeds fpu_div, while fpu_sqrt takes no datab operand.
// NOTE(review): UNUSED_VAR is presumably a lint-suppression macro - confirm in the project's define headers.
`UNUSED_VAR (div_sqrt_op_type)
`UNUSED_VAR (div_sqrt_fmt)
`UNUSED_VAR (div_sqrt_datab)
`UNUSED_VAR (div_sqrt_datac)
VX_fpu_div #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH)
) fpu_div (
.clk (clk),
.reset (div_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && is_div),
.ready_in (div_ready_in),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.has_fflags (div_has_fflags),
.fflags (div_fflags),
.result (div_result),
.tag_out (div_tag_out),
.valid_out (div_valid_out),
.ready_out (div_ready_out)
.reset (reset),
.valid_in (div_sqrt_valid_in[0]),
.ready_in (div_sqrt_ready_in[0]),
.mask_in (div_sqrt_mask_in[0]),
.tag_in (div_sqrt_tag_in[0]),
.frm (div_sqrt_frm[0]),
.dataa (div_sqrt_dataa[0]),
.datab (div_sqrt_datab[0]),
.has_fflags (div_sqrt_has_fflags[0]),
.fflags (div_sqrt_fflags[0]),
.result (div_sqrt_result[0]),
.tag_out (div_sqrt_tag_out[0]),
.valid_out (div_sqrt_valid_out[0]),
.ready_out (div_sqrt_ready_out[0])
);
// Square-root instance (VX_fpu_sqrt).
// NOTE(review): a diff hunk header sits inside the parameter list, and the port
// list carries two consecutive sets of connections with no comma between them.
// It looks like both sides of a diff - confirm which set is current.
VX_fpu_sqrt #(
@ -173,92 +255,42 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
.TAG_WIDTH (TAG_WIDTH)
) fpu_sqrt (
.clk (clk),
// First set: module-level request signals, gated by
// core_select == FPU_DIVSQRT and ~is_div, with a dedicated sqrt_reset.
.reset (sqrt_reset),
.valid_in (valid_in && (core_select == FPU_DIVSQRT) && ~is_div),
.ready_in (sqrt_ready_in),
.mask_in (mask_in),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa_s),
.has_fflags (sqrt_has_fflags),
.fflags (sqrt_fflags),
.result (sqrt_result),
.tag_out (sqrt_tag_out),
.valid_out (sqrt_valid_out),
.ready_out (sqrt_ready_out)
// Second set: index 1 of the div_sqrt_* arrays, with the plain reset.
.reset (reset),
.valid_in (div_sqrt_valid_in[1]),
.ready_in (div_sqrt_ready_in[1]),
.mask_in (div_sqrt_mask_in[1]),
.tag_in (div_sqrt_tag_in[1]),
.frm (div_sqrt_frm[1]),
.dataa (div_sqrt_dataa[1]),
.has_fflags (div_sqrt_has_fflags[1]),
.fflags (div_sqrt_fflags[1]),
.result (div_sqrt_result[1]),
.tag_out (div_sqrt_tag_out[1]),
.valid_out (div_sqrt_valid_out[1]),
.ready_out (div_sqrt_ready_out[1])
);
// Conversion instance (VX_fpu_cvt) wired to the module-level request signals
// and gated by core_select == FPU_CVT.
// cvt_ret_int_in (= ~is_itof) is prepended to the tag on the way in and
// recovered as cvt_ret_int_out on the way out, hence TAG_WIDTH+1.
// NOTE(review): SOURCE also holds another fpu_cvt instantiation that uses
// per_core_* signals; this one looks like the other side of a diff - confirm.
wire cvt_ret_int_in = ~is_itof;
wire cvt_ret_int_out;
VX_fpu_cvt #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+1)
) fpu_cvt (
.clk (clk),
.reset (cvt_reset),
.valid_in (valid_in && (core_select == FPU_CVT)),
.ready_in (per_core_ready_in[FPU_CVT]),
.mask_in (mask_in),
.tag_in ({cvt_ret_int_in, tag_in}),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (dataa_s),
.has_fflags (per_core_has_fflags[FPU_CVT]),
.fflags (per_core_fflags[FPU_CVT]),
.result (per_core_result[FPU_CVT]),
.tag_out ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
.valid_out (per_core_valid_out[FPU_CVT]),
.ready_out (per_core_ready_out[FPU_CVT])
);
// NCP instance (VX_fpu_ncp) wired to the module-level request signals and
// gated by core_select == FPU_NCP.
// Two flags travel with the tag (hence TAG_WIDTH+2):
//   ncp_ret_int_in  - set for INST_FPU_CMP, or when the IS_CLASS / IS_MVXW
//                     macros match (op_type, frm)
//   ncp_ret_sext_in - set when the IS_MVXW macro matches
// NOTE(review): SOURCE also holds another fpu_ncp instantiation that uses
// per_core_* signals; this one looks like the other side of a diff - confirm.
wire ncp_ret_int_in = (op_type == `INST_FPU_CMP)
|| `INST_FPU_IS_CLASS(op_type, frm)
|| `INST_FPU_IS_MVXW(op_type, frm);
wire ncp_ret_int_out;
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
wire ncp_ret_sext_out;
VX_fpu_ncp #(
.NUM_LANES (NUM_LANES),
.TAG_WIDTH (TAG_WIDTH+2)
) fpu_ncp (
.clk (clk),
.reset (ncp_reset),
.valid_in (valid_in && (core_select == FPU_NCP)),
.ready_in (per_core_ready_in[FPU_NCP]),
.mask_in (mask_in),
.tag_in ({ncp_ret_sext_in, ncp_ret_int_in, tag_in}),
.op_type (op_type),
.frm (frm),
.dataa (dataa_s),
.datab (datab_s),
.result (per_core_result[FPU_NCP]),
.has_fflags (per_core_has_fflags[FPU_NCP]),
.fflags (per_core_fflags[FPU_NCP]),
.tag_out ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
.valid_out (per_core_valid_out[FPU_NCP]),
.ready_out (per_core_ready_out[FPU_NCP])
);
///////////////////////////////////////////////////////////////////////////
// Ready for the FPU_DIVSQRT slot taken from the divider or the sqrt unit
// according to is_div.
// NOTE(review): this assign uses div_ready_in / sqrt_ready_in, which appear only
// in the first port set of fpu_div / fpu_sqrt; it presumably belongs to that
// same diff side - confirm.
assign per_core_ready_in[FPU_DIVSQRT] = is_div ? div_ready_in : sqrt_ready_in;
// Pack each unit's response (index 0 and 1 of the div_sqrt_* arrays) into one
// RSP_DATAW-wide word: {result, has_fflags, fflags, tag_out}.
wire [1:0][RSP_DATAW-1:0] div_sqrt_arb_data_in;
for (genvar i = 0; i < 2; ++i) begin : g_div_sqrt_arb_data_in
assign div_sqrt_arb_data_in[i] = {
div_sqrt_result[i],
div_sqrt_has_fflags[i],
div_sqrt_fflags[i],
div_sqrt_tag_out[i]
};
end
// Two-input VX_stream_arb that merges the divider and sqrt responses into the
// FPU_DIVSQRT slot of the per-core response arrays.
// NOTE(review): ARBITER, the instance name and the valid_in / ready_in / data_in
// connections each appear twice, and a hunk header cuts the data_out
// concatenation. These look like both sides of a diff - confirm which applies.
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_DATAW),
.ARBITER ("R"),
.ARBITER ("P"),
.OUT_BUF (0)
) div_sqrt_arb (
) div_sqrt_rsp_arb (
.clk (clk),
.reset (reset),
// Variant A: inputs concatenated inline from the separate sqrt_* / div_* signals.
.valid_in ({sqrt_valid_out, div_valid_out}),
.ready_in ({sqrt_ready_out, div_ready_out}),
.data_in ({{sqrt_result, sqrt_has_fflags, sqrt_fflags, sqrt_tag_out},
{div_result, div_has_fflags, div_fflags, div_tag_out}}),
// Variant B: inputs taken from the 2-entry div_sqrt_* arrays.
.valid_in (div_sqrt_valid_out),
.ready_in (div_sqrt_ready_out),
.data_in (div_sqrt_arb_data_in),
.data_out ({
per_core_result[FPU_DIVSQRT],
per_core_has_fflags[FPU_DIVSQRT],
@ -270,12 +302,73 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
// CVT core ///////////////////////////////////////////////////////////////

// Decode of the FPU_CVT op_type: bit 1 selects int-to-float, bit 0 (inverted)
// selects a signed conversion.
wire is_itof;
wire is_signed;
assign is_itof   = per_core_op_type[FPU_CVT][1];
assign is_signed = ~per_core_op_type[FPU_CVT][0];

// "Result is an integer" flag; it rides along with the tag through the core.
wire cvt_ret_int_in;
wire cvt_ret_int_out;
assign cvt_ret_int_in = ~is_itof;

VX_fpu_cvt #(
    .TAG_WIDTH (1+TAG_WIDTH),
    .NUM_LANES (NUM_LANES)
) fpu_cvt (
    .clk        (clk),
    .reset      (reset),
    // request
    .valid_in   (per_core_valid_in[FPU_CVT]),
    .mask_in    (per_core_mask_in[FPU_CVT]),
    .tag_in     ({cvt_ret_int_in, per_core_tag_in[FPU_CVT]}),
    .is_itof    (is_itof),
    .is_signed  (is_signed),
    .frm        (per_core_frm[FPU_CVT]),
    .dataa      (per_core_dataa[FPU_CVT]),
    .ready_in   (per_core_ready_in[FPU_CVT]),
    // response
    .valid_out  (per_core_valid_out[FPU_CVT]),
    .tag_out    ({cvt_ret_int_out, per_core_tag_out[FPU_CVT]}),
    .result     (per_core_result[FPU_CVT]),
    .has_fflags (per_core_has_fflags[FPU_CVT]),
    .fflags     (per_core_fflags[FPU_CVT]),
    .ready_out  (per_core_ready_out[FPU_CVT])
);
// NCP core ///////////////////////////////////////////////////////////////

// Flags carried through the core alongside the tag:
//   ncp_ret_int_*  - set for INST_FPU_CMP, or when the IS_CLASS / IS_MVXW
//                    macros match the FPU_NCP (op_type, frm) pair
//   ncp_ret_sext_* - set when the IS_MVXW macro matches
wire ncp_ret_int_in;
wire ncp_ret_int_out;
wire ncp_ret_sext_in;
wire ncp_ret_sext_out;

assign ncp_ret_int_in = (per_core_op_type[FPU_NCP] == `INST_FPU_CMP)
                     || `INST_FPU_IS_CLASS(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP])
                     || `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);

assign ncp_ret_sext_in = `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);

VX_fpu_ncp #(
    .TAG_WIDTH (TAG_WIDTH+2),
    .NUM_LANES (NUM_LANES)
) fpu_ncp (
    .clk        (clk),
    .reset      (reset),
    // request
    .valid_in   (per_core_valid_in[FPU_NCP]),
    .mask_in    (per_core_mask_in[FPU_NCP]),
    .tag_in     ({ncp_ret_sext_in, ncp_ret_int_in, per_core_tag_in[FPU_NCP]}),
    .op_type    (per_core_op_type[FPU_NCP]),
    .frm        (per_core_frm[FPU_NCP]),
    .dataa      (per_core_dataa[FPU_NCP]),
    .datab      (per_core_datab[FPU_NCP]),
    .ready_in   (per_core_ready_in[FPU_NCP]),
    // response
    .valid_out  (per_core_valid_out[FPU_NCP]),
    .tag_out    ({ncp_ret_sext_out, ncp_ret_int_out, per_core_tag_out[FPU_NCP]}),
    .result     (per_core_result[FPU_NCP]),
    .has_fflags (per_core_has_fflags[FPU_NCP]),
    .fflags     (per_core_fflags[FPU_NCP]),
    .ready_out  (per_core_ready_out[FPU_NCP])
);
///////////////////////////////////////////////////////////////////////////
// Per-core response words (RSP_DATAW + 2 bits each), followed by the VX_stream_arb
// instance rsp_arb that takes one input per core.
// Bits [RSP_DATAW+1:2] of each word carry {result, has_fflags, ...}; the rest of
// that assignment and the low two bits are cut off by a diff hunk header.
// NOTE(review): the declaration, loop header, NUM_INPUTS and ARBITER lines each
// appear twice (NUM_FPC vs NUM_FPCORES, "F" vs "R"). These look like both sides
// of a diff - confirm which applies.
reg [NUM_FPC-1:0][RSP_DATAW+2-1:0] per_core_data_out;
reg [NUM_FPCORES-1:0][RSP_DATAW+2-1:0] per_core_data_out;
always @(*) begin
for (integer i = 0; i < NUM_FPC; ++i) begin
for (integer i = 0; i < NUM_FPCORES; ++i) begin
per_core_data_out[i][RSP_DATAW+1:2] = {
per_core_result[i],
per_core_has_fflags[i],
@ -294,9 +387,9 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_VAR (op_ret_int_out)
VX_stream_arb #(
.NUM_INPUTS (NUM_FPC),
.NUM_INPUTS (NUM_FPCORES),
.DATAW (RSP_DATAW + 2),
.ARBITER ("F"),
.ARBITER ("R"),
.OUT_BUF (OUT_BUF)
) rsp_arb (
.clk (clk),
@ -310,25 +403,22 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
`UNUSED_PIN (sel_out)
);
// Per-lane formatting of the final result.
// With FPU_RV64F defined, the 32-bit result_s[i] is widened to XLEN according
// to op_ret_int_out:
//   2'b11   - sign-extended
//   2'b01   - upper 32 bits zero
//   default - upper 32 bits all ones (presumably NaN-boxing of a single-precision
//             value - confirm)
// Without FPU_RV64F, result_s[i] is passed through unchanged.
// NOTE(review): the loop header, the reg declaration, the case arms and the final
// assign each appear twice (unlabeled vs g_result, result_r vs result_w). These
// look like both sides of a diff - confirm which applies.
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
`ifdef FPU_RV64F
reg [`XLEN-1:0] result_r;
reg [`XLEN-1:0] result_w;
always @(*) begin
case (op_ret_int_out)
2'b11: result_r = `XLEN'($signed(result_s[i]));
2'b01: result_r = {32'h00000000, result_s[i]};
default: result_r = {32'hffffffff, result_s[i]};
2'b11: result_w = `XLEN'($signed(result_s[i]));
2'b01: result_w = {32'h00000000, result_s[i]};
default: result_w = {32'hffffffff, result_s[i]};
endcase
end
assign result[i] = result_r;
assign result[i] = result_w;
`else
assign result[i] = result_s[i];
`endif
end
// can accept new request?
// The module-level ready_in mirrors the ready of the core picked by core_select.
assign ready_in = per_core_ready_in[core_select];
endmodule
`endif

View file

@ -49,26 +49,27 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
input wire ready_out,
output wire valid_out
);
// Local declarations for the FMA datapath.
// DATAW is three 32-bit operands plus INST_FRM_BITS bits for the rounding mode.
// NOTE(review): data_in and pe_data_in are each declared twice, once 3*32 bits
// wide and once DATAW bits wide, and UNUSED_VAR(frm) sits beside code that packs
// frm into data_in. These look like both sides of a diff - confirm which applies.
`UNUSED_VAR (frm)
localparam DATAW = 3 * 32 + `INST_FRM_BITS;
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
wire [NUM_LANES-1:0][3*32-1:0] data_in;
wire [NUM_LANES-1:0] mask_out;
// Per-lane output word holds a 32-bit result plus FP_FLAGS_BITS flag bits.
wire [NUM_LANES-1:0][(`FP_FLAGS_BITS+32)-1:0] data_out;
wire [NUM_LANES-1:0][`FP_FLAGS_BITS-1:0] fflags_out;
wire pe_enable;
wire [NUM_PES-1:0][3*32-1:0] pe_data_in;
wire [NUM_PES-1:0][DATAW-1:0] pe_data_in;
wire [NUM_PES-1:0][(`FP_FLAGS_BITS+32)-1:0] pe_data_out;
// Operands selected per lane; they are packed into data_in further down.
reg [NUM_LANES-1:0][31:0] a, b, c;
// Per-lane operand selection into a[i], b[i], c[i] from is_madd / is_neg / is_sub.
// Sign changes are made by flipping bit 31 of the 32-bit operand.
// NOTE(review): several assignments appear twice, as a ternary form and as an
// XOR-on-sign-bit form. The two forms are equivalent if is_neg / is_sub are
// 1-bit (assumed - confirm). A hunk header hides the MUL branch body. These
// look like both sides of a diff - confirm which applies.
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_select
always @(*) begin
if (is_madd) begin
// MADD / MSUB / NMADD / NMSUB
// a: sign flipped when is_neg; c: sign flipped when is_neg ^ is_sub
a[i] = is_neg ? {~dataa[i][31], dataa[i][30:0]} : dataa[i];
a[i] = {is_neg ^ dataa[i][31], dataa[i][30:0]};
b[i] = datab[i];
c[i] = (is_neg ^ is_sub) ? {~datac[i][31], datac[i][30:0]} : datac[i];
c[i] = {is_neg ^ is_sub ^ datac[i][31], datac[i][30:0]};
end else begin
if (is_neg) begin
// MUL
@ -77,29 +78,30 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
c[i] = '0;
end else begin
// ADD / SUB
// One operand is the constant 1.0f, the other is dataa; c is datab with
// its sign flipped when is_sub. The two variants swap which of a / b
// holds the constant.
a[i] = 32'h3f800000; // 1.0f
b[i] = dataa[i];
c[i] = is_sub ? {~datab[i][31], datab[i][30:0]} : datab[i];
a[i] = dataa[i];
b[i] = 32'h3f800000; // 1.0f
c[i] = {is_sub ^ datab[i][31], datab[i][30:0]};
end
end
end
end
// Pack the selected operands into the per-lane input word:
//   [31:0] = a, [63:32] = b, [95:64] = c, then INST_FRM_BITS bits of frm from bit 96.
// The same frm value is written into every lane's word.
// NOTE(review): the loop header appears twice (unlabeled vs g_data_in); looks
// like both sides of a diff - confirm which applies.
for (genvar i = 0; i < NUM_LANES; ++i) begin
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
assign data_in[i][0 +: 32] = a[i];
assign data_in[i][32 +: 32] = b[i];
assign data_in[i][64 +: 32] = c[i];
assign data_in[i][96 +: `INST_FRM_BITS] = frm;
end
// VX_pe_serializer instance connecting NUM_LANES lanes to NUM_PES processing
// elements, with LATENCY_FMA as the latency parameter.
// The lane mask is concatenated with the tag on the way in and split off again
// on the way out, hence TAG_WIDTH = NUM_LANES + TAG_WIDTH.
// NOTE(review): DATA_IN_WIDTH, DATA_OUT_WIDTH, PE_REG, OUT_BUF and the
// pe_data_in / pe_data_out connections each appear twice with different values;
// in one pair the port names are swapped relative to the local wires. A hunk
// header also hides some port connections. These look like both sides of a
// diff - confirm which applies.
VX_pe_serializer #(
.NUM_LANES (NUM_LANES),
.NUM_PES (NUM_PES),
.LATENCY (`LATENCY_FMA),
.DATA_IN_WIDTH(3*32),
.DATA_OUT_WIDTH(`FP_FLAGS_BITS + 32),
.DATA_IN_WIDTH (DATAW),
.DATA_OUT_WIDTH (`FP_FLAGS_BITS + 32),
.TAG_WIDTH (NUM_LANES + TAG_WIDTH),
.PE_REG ((NUM_LANES != NUM_PES) ? 1 : 0), // must be registered for DSPs
.OUT_BUF (((NUM_LANES / NUM_PES) > 2) ? 1 : 0)
.PE_REG (0),
.OUT_BUF (2)
) pe_serializer (
.clk (clk),
.reset (reset),
@ -108,15 +110,17 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
.tag_in ({mask_in, tag_in}),
.ready_in (ready_in),
.pe_enable (pe_enable),
.pe_data_in (pe_data_in),
.pe_data_out(pe_data_out),
.pe_data_out(pe_data_in),
.pe_data_in (pe_data_out),
.valid_out (valid_out),
.data_out (data_out),
.tag_out ({mask_out, tag_out}),
.ready_out (ready_out)
);
// Split each lane's output word: bits [31:0] are the result, and the next
// FP_FLAGS_BITS bits (from bit 32) are that lane's exception flags.
// NOTE(review): the loop header appears twice (unlabeled vs g_result) with an
// UNUSED_VAR(pe_data_in) line between them; looks like both sides of a diff -
// confirm which applies.
for (genvar i = 0; i < NUM_LANES; ++i) begin
`UNUSED_VAR (pe_data_in)
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_result
assign result[i] = data_out[i][0 +: 32];
assign fflags_out[i] = data_out[i][32 +: `FP_FLAGS_BITS];
end
@ -125,7 +129,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
// Per-PE FMA back end, chosen at compile time:
//   QUARTUS - acl_fmadd instance
//   VIVADO  - xil_fma instance
//   else    - dpi_fmadd call
// NOTE(review): each variant is cut short by a diff hunk header, and loop headers
// and dpi_fmadd arguments appear twice (unlabeled vs g_fmas, frm passed directly
// vs read from bits [96 +: INST_FRM_BITS] of pe_data_in[0]). These look like
// both sides of a diff - confirm which applies.
`ifdef QUARTUS
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas
acl_fmadd fmadd (
.clk (clk),
.areset (1'b0),
@ -143,7 +147,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
`elsif VIVADO
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas
wire [2:0] tuser;
xil_fma fma (
@ -168,7 +172,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
`else
for (genvar i = 0; i < NUM_PES; ++i) begin
for (genvar i = 0; i < NUM_PES; ++i) begin : g_fmas
reg [63:0] r;
`UNUSED_VAR (r)
fflags_t f;
@ -177,10 +181,10 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
dpi_fmadd (
pe_enable,
int'(0),
// Each 32-bit operand is passed as a 64-bit value with the upper 32 bits set to ones.
{32'hffffffff, pe_data_in[i][0 +: 32]},
{32'hffffffff, pe_data_in[i][32 +: 32]},
{32'hffffffff, pe_data_in[i][64 +: 32]},
frm,
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
{32'hffffffff, pe_data_in[i][64 +: 32]}, // c
pe_data_in[0][96 +: `INST_FRM_BITS], // frm
r,
f
);

Some files were not shown because too many files have changed in this diff Show more