mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
cache subsystem refactoring
This commit is contained in:
parent
59497e52df
commit
dd40e9c754
15 changed files with 680 additions and 846 deletions
361
ci/regression.sh
361
ci/regression.sh
|
@ -21,31 +21,31 @@ rm -f blackbox.*.cache
|
|||
|
||||
unittest()
|
||||
{
|
||||
make -C tests/unittest run
|
||||
make -C hw/unittest > /dev/null
|
||||
make -C tests/unittest run
|
||||
make -C hw/unittest > /dev/null
|
||||
}
|
||||
|
||||
isa()
|
||||
{
|
||||
echo "begin isa tests..."
|
||||
echo "begin isa tests..."
|
||||
|
||||
make -C tests/riscv/isa run-simx
|
||||
make -C tests/riscv/isa run-rtlsim
|
||||
make -C tests/riscv/isa run-simx
|
||||
make -C tests/riscv/isa run-rtlsim
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DDPI_DISABLE" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim
|
||||
make -C sim/rtlsim clean && CONFIGS="-DDPI_DISABLE" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-32f
|
||||
|
||||
if [ "$XLEN" == "64" ]
|
||||
then
|
||||
if [ "$XLEN" == "64" ]
|
||||
then
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-64f
|
||||
|
||||
|
@ -57,214 +57,216 @@ then
|
|||
|
||||
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-rtlsim-64fx
|
||||
fi
|
||||
fi
|
||||
|
||||
# restore default prebuilt configuration
|
||||
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
|
||||
# restore default prebuilt configuration
|
||||
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
|
||||
|
||||
echo "isa tests done!"
|
||||
echo "isa tests done!"
|
||||
}
|
||||
|
||||
regression()
|
||||
{
|
||||
echo "begin regression tests..."
|
||||
echo "begin regression tests..."
|
||||
|
||||
make -C tests/kernel run-simx
|
||||
make -C tests/kernel run-rtlsim
|
||||
make -C tests/kernel run-simx
|
||||
make -C tests/kernel run-rtlsim
|
||||
|
||||
make -C tests/regression run-simx
|
||||
make -C tests/regression run-rtlsim
|
||||
make -C tests/regression run-simx
|
||||
make -C tests/regression run-rtlsim
|
||||
|
||||
# test FPU hardware implementations
|
||||
CONFIGS="-DFPU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DFPU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
# test FPU hardware implementations
|
||||
CONFIGS="-DFPU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DFPU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
|
||||
# test local barrier
|
||||
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t19"
|
||||
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t19"
|
||||
# test local barrier
|
||||
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t19"
|
||||
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t19"
|
||||
|
||||
# test global barrier
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t20" --cores=2
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t20" --cores=2
|
||||
# test global barrier
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t20" --cores=2
|
||||
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t20" --cores=2
|
||||
|
||||
# test FPU core
|
||||
# test FPU core
|
||||
|
||||
echo "regression tests done!"
|
||||
echo "regression tests done!"
|
||||
}
|
||||
|
||||
opencl()
|
||||
{
|
||||
echo "begin opencl tests..."
|
||||
echo "begin opencl tests..."
|
||||
|
||||
make -C tests/opencl run-simx
|
||||
make -C tests/opencl run-rtlsim
|
||||
make -C tests/opencl run-simx
|
||||
make -C tests/opencl run-rtlsim
|
||||
|
||||
echo "opencl tests done!"
|
||||
echo "opencl tests done!"
|
||||
}
|
||||
|
||||
cluster()
|
||||
{
|
||||
echo "begin clustering tests..."
|
||||
echo "begin clustering tests..."
|
||||
|
||||
# warp/threads configurations
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
|
||||
# warp/threads configurations
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
|
||||
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
|
||||
|
||||
# cores clustering
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
|
||||
# cores clustering
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
|
||||
|
||||
# L2/L3
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
|
||||
# L2/L3
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
|
||||
|
||||
echo "clustering tests done!"
|
||||
echo "clustering tests done!"
|
||||
}
|
||||
|
||||
debug()
|
||||
{
|
||||
echo "begin debugging tests..."
|
||||
echo "begin debugging tests..."
|
||||
|
||||
# test CSV trace generation
|
||||
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
|
||||
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-simx-32im > run_simx.log
|
||||
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
|
||||
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
|
||||
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
||||
diff trace_rtlsim.csv trace_simx.csv
|
||||
# restore default prebuilt configuration
|
||||
make -C sim/simx clean && make -C sim/simx > /dev/null
|
||||
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
|
||||
# test CSV trace generation
|
||||
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
|
||||
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
|
||||
make -C tests/riscv/isa run-simx-32im > run_simx.log
|
||||
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
|
||||
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
|
||||
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
|
||||
diff trace_rtlsim.csv trace_simx.csv
|
||||
# restore default prebuilt configuration
|
||||
make -C sim/simx clean && make -C sim/simx > /dev/null
|
||||
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
|
||||
|
||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
|
||||
echo "debugging tests done!"
|
||||
echo "debugging tests done!"
|
||||
}
|
||||
|
||||
config()
|
||||
{
|
||||
echo "begin configuration tests..."
|
||||
echo "begin configuration tests..."
|
||||
|
||||
# disable DPI
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
|
||||
# disable DPI
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
|
||||
|
||||
# issue width
|
||||
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
# issue width
|
||||
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
|
||||
# dispatch size
|
||||
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
# dispatch size
|
||||
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
|
||||
|
||||
# FPU scaling
|
||||
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
# FPU scaling
|
||||
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
|
||||
# custom program startup address
|
||||
make -C tests/regression/dogfood clean-all
|
||||
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
make -C tests/regression/dogfood clean-all
|
||||
make -C tests/regression/dogfood
|
||||
# custom program startup address
|
||||
make -C tests/regression/dogfood clean-all
|
||||
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
|
||||
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
|
||||
make -C tests/regression/dogfood clean-all
|
||||
make -C tests/regression/dogfood
|
||||
|
||||
# disabling M extension
|
||||
CONFIGS="-DEXT_M_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
# disabling M extension
|
||||
CONFIGS="-DEXT_M_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
|
||||
# disabling F extension
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf=1
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf=1
|
||||
# disabling F extension
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf=1
|
||||
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf=1
|
||||
|
||||
# disable shared memory
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf=1
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf=1
|
||||
# disable shared memory
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --perf=1
|
||||
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=demo --perf=1
|
||||
|
||||
# disable L1 cache
|
||||
CONFIGS="-DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
# disable L1 cache
|
||||
CONFIGS="-DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
|
||||
# multiple L1 caches per cluster
|
||||
CONFIGS="-DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --cores=8 --warps=1 --threads=2
|
||||
# multiple L1 caches per cluster
|
||||
CONFIGS="-DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --cores=8 --warps=1 --threads=2
|
||||
|
||||
# test AXI bus
|
||||
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
|
||||
# test AXI bus
|
||||
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
|
||||
|
||||
# adjust l1 block size to match l2
|
||||
CONFIGS="-DL1_LINE_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
|
||||
# reduce l1 line size
|
||||
CONFIGS="-DL1_LINE_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
|
||||
CONFIGS="-DL1_LINE_SIZE=4 -DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
|
||||
|
||||
# test cache banking
|
||||
CONFIGS="-DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemm
|
||||
# test cache banking
|
||||
CONFIGS="-DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
|
||||
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
|
||||
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemm
|
||||
|
||||
# test 128-bit MEM block
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
# test 128-bit MEM block
|
||||
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
|
||||
# test single-bank DRAM
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
# test single-bank DRAM
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
|
||||
# test 27-bit DRAM address
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
# test 27-bit DRAM address
|
||||
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
|
||||
|
||||
echo "configuration tests done!"
|
||||
echo "configuration tests done!"
|
||||
}
|
||||
|
||||
stress0()
|
||||
{
|
||||
echo "begin stress0 tests..."
|
||||
echo "begin stress0 tests..."
|
||||
|
||||
# test verilator reset values
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
|
||||
# test verilator reset values
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
|
||||
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
|
||||
|
||||
echo "stress0 tests done!"
|
||||
echo "stress0 tests done!"
|
||||
}
|
||||
|
||||
stress1()
|
||||
{
|
||||
echo "begin stress1 tests..."
|
||||
echo "begin stress1 tests..."
|
||||
|
||||
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
|
||||
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
|
||||
|
||||
echo "stress1 tests done!"
|
||||
echo "stress1 tests done!"
|
||||
}
|
||||
|
||||
synthesis()
|
||||
{
|
||||
echo "begin synthesis tests..."
|
||||
echo "begin synthesis tests..."
|
||||
|
||||
PREFIX=build_base make -C hw/syn/yosys clean
|
||||
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
|
||||
PREFIX=build_base make -C hw/syn/yosys clean
|
||||
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
|
||||
|
||||
echo "synthesis tests done!"
|
||||
echo "synthesis tests done!"
|
||||
}
|
||||
|
||||
show_usage()
|
||||
|
@ -277,45 +279,56 @@ start=$SECONDS
|
|||
|
||||
while [ "$1" != "" ]; do
|
||||
case $1 in
|
||||
--unittest ) unittest
|
||||
--unittest )
|
||||
unittest
|
||||
;;
|
||||
--isa ) isa
|
||||
;;
|
||||
--regression ) regression
|
||||
--regression )
|
||||
regression
|
||||
;;
|
||||
--opencl ) opencl
|
||||
--opencl )
|
||||
opencl
|
||||
;;
|
||||
--cluster ) cluster
|
||||
--cluster )
|
||||
cluster
|
||||
;;
|
||||
--debug ) debug
|
||||
--debug )
|
||||
debug
|
||||
;;
|
||||
--config ) config
|
||||
--config )
|
||||
config
|
||||
;;
|
||||
--stress0 ) stress0
|
||||
--stress0 )
|
||||
stress0
|
||||
;;
|
||||
--stress1 ) stress1
|
||||
--stress1 )
|
||||
stress1
|
||||
;;
|
||||
--stress ) stress0
|
||||
stress1
|
||||
--stress )
|
||||
stress0
|
||||
stress1
|
||||
;;
|
||||
--synthesis ) synthesis
|
||||
--synthesis )
|
||||
synthesis
|
||||
;;
|
||||
--all ) unittest
|
||||
isa
|
||||
regression
|
||||
opencl
|
||||
cluster
|
||||
debug
|
||||
config
|
||||
stress0
|
||||
stress1
|
||||
synthesis
|
||||
isa
|
||||
regression
|
||||
opencl
|
||||
cluster
|
||||
debug
|
||||
config
|
||||
stress0
|
||||
stress1
|
||||
synthesis
|
||||
;;
|
||||
-h | --help ) show_usage
|
||||
exit
|
||||
-h | --help )
|
||||
show_usage
|
||||
exit
|
||||
;;
|
||||
* ) show_usage
|
||||
exit 1
|
||||
* ) show_usage
|
||||
exit 1
|
||||
esac
|
||||
shift
|
||||
done
|
||||
|
|
|
@ -129,23 +129,15 @@
|
|||
`endif
|
||||
|
||||
`ifndef L1_LINE_SIZE
|
||||
`ifdef L1_DISABLE
|
||||
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 4 : `MEM_BLOCK_SIZE)
|
||||
`else
|
||||
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 16 : `MEM_BLOCK_SIZE)
|
||||
`endif
|
||||
`define L1_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`endif
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`ifndef L2_LINE_SIZE
|
||||
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L2_LINE_SIZE `L1_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
`ifndef L3_LINE_SIZE
|
||||
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
|
||||
`else
|
||||
`define L3_LINE_SIZE `L2_LINE_SIZE
|
||||
`endif
|
||||
|
||||
`ifdef XLEN_64
|
||||
|
|
|
@ -243,31 +243,18 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// non-cacheable tag bits
|
||||
`define NC_TAG_BITS 1
|
||||
|
||||
// cache address type bits
|
||||
`ifdef SM_ENABLE
|
||||
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
|
||||
`else
|
||||
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
|
||||
`endif
|
||||
|
||||
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
|
||||
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
|
||||
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
|
||||
(`CLOG2(mshr_size) + `CLOG2(num_banks))
|
||||
|
||||
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
|
||||
|
||||
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
|
||||
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
|
||||
|
||||
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
|
||||
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
|
||||
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -278,16 +265,13 @@
|
|||
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
|
||||
|
||||
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
|
||||
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
|
@ -99,7 +99,7 @@ package VX_gpu_pkg;
|
|||
`ifdef ICACHE_ENABLE
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
|
||||
`else
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
|
||||
`endif
|
||||
|
||||
////////////////////////// Dcache Parameters //////////////////////////////
|
||||
|
@ -112,31 +112,26 @@ package VX_gpu_pkg;
|
|||
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
|
||||
|
||||
// Input request size
|
||||
localparam DCACHE_NUM_REQS = `MAX(`DCACHE_NUM_BANKS, `SMEM_NUM_BANKS);
|
||||
|
||||
// Memory request size
|
||||
localparam LSU_MEM_REQS = `NUM_LSU_LANES;
|
||||
localparam DCACHE_NUM_REQS = `UP((`NUM_LSU_LANES * (`XLEN / 8)) / DCACHE_WORD_SIZE);
|
||||
|
||||
// Batch select bits
|
||||
localparam DCACHE_NUM_BATCHES = ((LSU_MEM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
|
||||
localparam DCACHE_NUM_BATCHES = ((DCACHE_NUM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
|
||||
localparam DCACHE_BATCH_SEL_BITS = `CLOG2(DCACHE_NUM_BATCHES);
|
||||
|
||||
// Core request tag Id bits
|
||||
localparam LSUQ_TAG_BITS = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS);
|
||||
localparam DCACHE_TAG_ID_BITS = (LSUQ_TAG_BITS + `CACHE_ADDR_TYPE_BITS);
|
||||
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS);
|
||||
|
||||
// Core request tag bits
|
||||
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
|
||||
localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
|
||||
|
||||
// Memory request data bits
|
||||
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
|
||||
|
||||
// Memory request tag bits
|
||||
`ifdef DCACHE_ENABLE
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`else
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L1 Parameters /////////////////////////////
|
||||
|
@ -165,7 +160,7 @@ package VX_gpu_pkg;
|
|||
`ifdef L2_ENABLE
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
`else
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/////////////////////////////// L3 Parameters /////////////////////////////
|
||||
|
@ -186,7 +181,7 @@ package VX_gpu_pkg;
|
|||
`ifdef L3_ENABLE
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
`else
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
|
||||
`endif
|
||||
|
||||
/* verilator lint_on UNUSED */
|
||||
|
|
|
@ -74,7 +74,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
||||
.DATA_SIZE (ICACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (ICACHE_TAG_WIDTH)
|
||||
) per_core_icache_bus_if[`SOCKET_SIZE]();
|
||||
|
||||
|
@ -120,7 +120,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
|
@ -134,7 +134,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
|
||||
.NUM_UNITS (`NUM_DCACHES),
|
||||
.NUM_INPUTS (`SOCKET_SIZE),
|
||||
.TAG_SEL_IDX (1),
|
||||
.TAG_SEL_IDX (0),
|
||||
.CACHE_SIZE (`DCACHE_SIZE),
|
||||
.LINE_SIZE (DCACHE_LINE_SIZE),
|
||||
.NUM_BANKS (`DCACHE_NUM_BANKS),
|
||||
|
@ -145,7 +145,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
|
||||
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.NC_ENABLE (1),
|
||||
|
@ -182,7 +182,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.NUM_INPUTS (2),
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.TAG_SEL_IDX (0),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF (2),
|
||||
.RSP_OUT_BUF (2)
|
||||
|
|
234
hw/rtl/cache/VX_cache_bypass.sv
vendored
234
hw/rtl/cache/VX_cache_bypass.sv
vendored
|
@ -15,17 +15,17 @@
|
|||
|
||||
module VX_cache_bypass #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter NC_TAG_BIT = 0,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
parameter NC_ENABLE = 0,
|
||||
parameter PASSTHRU = 0,
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
parameter WORD_SIZE = 1,
|
||||
parameter LINE_SIZE = 1,
|
||||
|
||||
parameter CORE_ADDR_WIDTH = 1,
|
||||
|
||||
parameter CORE_TAG_IN_WIDTH = 1,
|
||||
parameter CORE_TAG_WIDTH = 1,
|
||||
|
||||
parameter MEM_ADDR_WIDTH = 1,
|
||||
parameter MEM_TAG_IN_WIDTH = 1,
|
||||
|
@ -36,8 +36,7 @@ module VX_cache_bypass #(
|
|||
parameter CORE_OUT_BUF = 0,
|
||||
parameter MEM_OUT_BUF = 0,
|
||||
|
||||
parameter CORE_DATA_WIDTH = WORD_SIZE * 8,
|
||||
parameter CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - NC_ENABLE
|
||||
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -57,38 +56,39 @@ module VX_cache_bypass #(
|
|||
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
|
||||
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
|
||||
localparam MUX_DATAW = CORE_TAG_IN_WIDTH + CORE_DATA_WIDTH + WORD_SIZE + CORE_ADDR_WIDTH + 1;
|
||||
localparam MUX_DATAW = CORE_TAG_WIDTH + CORE_DATA_WIDTH + WORD_SIZE + CORE_ADDR_WIDTH + 1;
|
||||
|
||||
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
|
||||
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
|
||||
|
||||
localparam CORE_TAG_ID_BITS = CORE_TAG_IN_WIDTH - UUID_WIDTH;
|
||||
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
|
||||
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
|
||||
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
|
||||
|
||||
localparam MEM_TAG_OUT_NC_WIDTH = MEM_TAG_OUT_WIDTH - 1 + NC_ENABLE;
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
|
||||
|
||||
// core request handling
|
||||
// handle core requests ///////////////////////////////////////////////////
|
||||
|
||||
wire core_req_nc_valid;
|
||||
wire [NUM_REQS-1:0] core_req_nc_valids;
|
||||
wire [NUM_REQS-1:0] core_req_nc_idxs;
|
||||
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
|
||||
wire [NUM_REQS-1:0] core_req_nc_sel;
|
||||
wire core_req_nc_valid;
|
||||
wire [NUM_REQS-1:0] core_req_nc_sel;
|
||||
wire core_req_nc_ready;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (PASSTHRU != 0) begin
|
||||
assign core_req_nc_idxs[i] = 1'b1;
|
||||
end else if (NC_ENABLE) begin
|
||||
wire [MEM_ADDRW-1:0] block_addr = core_bus_in_if[i].req_data.addr[CORE_ADDR_WIDTH-1 -: MEM_ADDRW];
|
||||
assign core_req_nc_idxs[i] = (block_addr >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
|
||||
end else begin
|
||||
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.tag[NC_TAG_BIT];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_nc_idxs[i] = 1'b0;
|
||||
end
|
||||
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
|
||||
end
|
||||
|
||||
wire core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
|
||||
|
||||
VX_generic_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.TYPE (PASSTHRU ? "R" : "P"),
|
||||
|
@ -105,69 +105,58 @@ module VX_cache_bypass #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
|
||||
assign core_bus_out_if[i].req_data.rw = core_bus_in_if[i].req_data.rw;
|
||||
assign core_bus_out_if[i].req_data.addr = core_bus_in_if[i].req_data.addr;
|
||||
assign core_bus_out_if[i].req_data.byteen = core_bus_in_if[i].req_data.byteen;
|
||||
assign core_bus_out_if[i].req_data.data = core_bus_in_if[i].req_data.data;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (CORE_TAG_IN_WIDTH),
|
||||
.S (NC_ENABLE),
|
||||
.POS (NC_TAG_BIT)
|
||||
) core_req_tag_nc_remove (
|
||||
.data_in (core_bus_in_if[i].req_data.tag),
|
||||
.data_out (core_bus_out_if[i].req_data.tag)
|
||||
);
|
||||
|
||||
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
|
||||
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
|
||||
: core_bus_out_if[i].req_ready;
|
||||
end
|
||||
|
||||
// memory request handling
|
||||
// handle memory requests /////////////////////////////////////////////////
|
||||
|
||||
wire mem_req_out_valid;
|
||||
wire mem_req_out_rw;
|
||||
wire [LINE_SIZE-1:0] mem_req_out_byteen;
|
||||
wire mem_req_out_valid;
|
||||
wire mem_req_out_rw;
|
||||
wire [LINE_SIZE-1:0] mem_req_out_byteen;
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
|
||||
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
|
||||
wire mem_req_out_ready;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
|
||||
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
|
||||
wire mem_req_out_ready;
|
||||
|
||||
wire [CORE_TAG_IN_WIDTH-1:0] core_req_nc_sel_tag;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
|
||||
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
|
||||
wire core_req_nc_sel_rw;
|
||||
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
|
||||
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
|
||||
wire core_req_nc_sel_rw;
|
||||
|
||||
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_req_nc_mux_in[i] = {
|
||||
core_bus_in_if[i].req_data.tag,
|
||||
core_bus_in_if[i].req_data.data,
|
||||
core_bus_in_if[i].req_data.byteen,
|
||||
core_bus_in_if[i].req_data.addr,
|
||||
core_bus_in_if[i].req_data.rw
|
||||
core_bus_in_if[i].req_data.rw,
|
||||
core_bus_in_if[i].req_data.byteen,
|
||||
core_bus_in_if[i].req_data.addr,
|
||||
core_bus_in_if[i].req_data.data,
|
||||
core_bus_in_if[i].req_data.tag
|
||||
};
|
||||
end
|
||||
|
||||
assign {
|
||||
core_req_nc_sel_tag,
|
||||
core_req_nc_sel_data,
|
||||
core_req_nc_sel_byteen,
|
||||
core_req_nc_sel_addr,
|
||||
core_req_nc_sel_rw
|
||||
core_req_nc_sel_rw,
|
||||
core_req_nc_sel_byteen,
|
||||
core_req_nc_sel_addr,
|
||||
core_req_nc_sel_data,
|
||||
core_req_nc_sel_tag
|
||||
} = core_req_nc_mux_in[core_req_nc_idx];
|
||||
|
||||
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
|
||||
|
||||
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
|
||||
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
|
||||
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
|
||||
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
|
||||
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
|
||||
|
||||
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
|
||||
|
||||
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin
|
||||
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
|
||||
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
|
||||
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
|
||||
|
||||
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
|
||||
|
@ -181,7 +170,7 @@ module VX_cache_bypass #(
|
|||
end
|
||||
|
||||
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
|
||||
end else begin
|
||||
|
@ -189,7 +178,7 @@ module VX_cache_bypass #(
|
|||
end
|
||||
end else begin
|
||||
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
|
||||
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
|
||||
end else begin
|
||||
|
@ -197,7 +186,7 @@ module VX_cache_bypass #(
|
|||
end
|
||||
end
|
||||
|
||||
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_req_tag_bypass;
|
||||
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
|
||||
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
|
||||
|
@ -205,29 +194,24 @@ module VX_cache_bypass #(
|
|||
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
|
||||
end
|
||||
|
||||
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_bypass_nc;
|
||||
wire [(MEM_TAG_IN_WIDTH + 1)-1:0] mem_req_tag_in_nc;
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_OUT_NC_WIDTH),
|
||||
.S (NC_ENABLE ? 0 : 1),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_req_tag_bypass_nc_insert (
|
||||
.data_in (mem_req_tag_bypass),
|
||||
.sel_in (1'b0),
|
||||
.data_out (mem_req_tag_bypass_nc)
|
||||
);
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_IN_WIDTH),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_req_tag_in_nc_insert (
|
||||
.data_in (mem_bus_in_if.req_data.tag),
|
||||
.sel_in (1'b0),
|
||||
.data_out (mem_req_tag_in_nc)
|
||||
);
|
||||
|
||||
assign mem_req_out_tag = mem_bus_in_if.req_valid ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : mem_req_tag_bypass_nc;
|
||||
if (PASSTHRU != 0) begin
|
||||
assign mem_req_out_tag = mem_req_tag_bypass;
|
||||
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
|
||||
end else begin
|
||||
if (NC_ENABLE) begin
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_OUT_WIDTH-1),
|
||||
.S (1),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) mem_req_tag_in_nc_insert (
|
||||
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
|
||||
.sel_in (~mem_bus_in_if.req_valid),
|
||||
.data_out (mem_req_out_tag)
|
||||
);
|
||||
end else begin
|
||||
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
|
||||
end
|
||||
end
|
||||
|
||||
assign mem_bus_in_if.req_ready = mem_req_out_ready;
|
||||
|
||||
|
@ -246,48 +230,38 @@ module VX_cache_bypass #(
|
|||
.ready_out (mem_bus_out_if.req_ready)
|
||||
);
|
||||
|
||||
// core response handling
|
||||
// handle core responses //////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_in_valid;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_in_tag;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
|
||||
wire [NUM_REQS-1:0] core_rsp_in_ready;
|
||||
|
||||
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_in_nc;
|
||||
|
||||
wire is_mem_rsp_nc;
|
||||
if (PASSTHRU != 0) begin
|
||||
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
|
||||
end else begin
|
||||
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[NC_TAG_BIT];
|
||||
if (NC_ENABLE) begin
|
||||
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
|
||||
end else begin
|
||||
assign is_mem_rsp_nc = 1'b0;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_bits_insert #(
|
||||
.N (CORE_TAG_OUT_WIDTH),
|
||||
.S (NC_ENABLE),
|
||||
.POS (NC_TAG_BIT)
|
||||
) core_rsp_tag_in_nc_insert (
|
||||
.data_in (core_bus_out_if[i].rsp_data.tag),
|
||||
.sel_in ('0),
|
||||
.data_out (core_rsp_tag_in_nc[i])
|
||||
);
|
||||
end
|
||||
|
||||
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_rsp_tag_in_nc;
|
||||
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_OUT_WIDTH),
|
||||
.S (NC_ENABLE ? 0 : 1),
|
||||
.POS (NC_TAG_BIT)
|
||||
.S (NC_ENABLE),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) mem_rsp_tag_in_nc_remove (
|
||||
.data_in (mem_bus_out_if.rsp_data.tag),
|
||||
.data_out (mem_rsp_tag_in_nc)
|
||||
.data_out (mem_rsp_tag_id_nc)
|
||||
);
|
||||
|
||||
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
|
||||
if (NUM_REQS > 1) begin
|
||||
assign rsp_idx = mem_rsp_tag_in_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
|
||||
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
|
||||
end else begin
|
||||
assign rsp_idx = 1'b0;
|
||||
end
|
||||
|
@ -304,7 +278,7 @@ module VX_cache_bypass #(
|
|||
end
|
||||
|
||||
if (WORDS_PER_LINE > 1) begin
|
||||
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_in_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
|
||||
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
|
||||
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
|
||||
|
@ -313,19 +287,28 @@ module VX_cache_bypass #(
|
|||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
|
||||
end else begin
|
||||
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_rsp_tag_in_nc[i] : {mem_rsp_tag_in_nc[MEM_TAG_OUT_NC_WIDTH-1 -: UUID_WIDTH], mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0]};
|
||||
if (PASSTHRU) begin
|
||||
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
|
||||
end else if (NC_ENABLE) begin
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
|
||||
end else begin
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_rsp_tag_in_nc[i] : mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0];
|
||||
end
|
||||
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`CS_WORD_WIDTH + CORE_TAG_IN_WIDTH),
|
||||
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
|
||||
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
|
||||
) core_rsp_buf (
|
||||
|
@ -340,26 +323,23 @@ module VX_cache_bypass #(
|
|||
);
|
||||
end
|
||||
|
||||
// memory response handling
|
||||
// handle memory responses ////////////////////////////////////////////////
|
||||
|
||||
if (PASSTHRU != 0) begin
|
||||
assign mem_bus_in_if.rsp_valid = 1'b0;
|
||||
assign mem_bus_in_if.rsp_data.data = '0;
|
||||
assign mem_bus_in_if.rsp_data.tag = '0;
|
||||
end else if (NC_ENABLE) begin
|
||||
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
|
||||
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
|
||||
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
|
||||
end else begin
|
||||
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[NC_TAG_BIT];
|
||||
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
|
||||
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
|
||||
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
|
||||
end
|
||||
|
||||
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_IN_WIDTH + 1),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_rsp_tag_out_remove (
|
||||
.data_in (mem_bus_out_if.rsp_data.tag[(MEM_TAG_IN_WIDTH + 1)-1:0]),
|
||||
.data_out (mem_bus_in_if.rsp_data.tag)
|
||||
);
|
||||
|
||||
|
||||
wire [NUM_REQS-1:0] core_rsp_out_valid;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
|
||||
end
|
||||
|
|
6
hw/rtl/cache/VX_cache_cluster.sv
vendored
6
hw/rtl/cache/VX_cache_cluster.sv
vendored
|
@ -75,8 +75,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
localparam NUM_CACHES = `UP(NUM_UNITS);
|
||||
localparam PASSTHRU = (NUM_UNITS == 0);
|
||||
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
|
||||
|
||||
|
@ -155,6 +154,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (ARB_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
|
||||
.NC_ENABLE (NC_ENABLE),
|
||||
|
@ -181,7 +181,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.NUM_INPUTS (NUM_CACHES),
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (1), // Skip 0 for NC flag
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.ARBITER ("R"),
|
||||
.REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)
|
||||
|
|
116
hw/rtl/cache/VX_cache_wrap.sv
vendored
116
hw/rtl/cache/VX_cache_wrap.sv
vendored
|
@ -16,9 +16,12 @@
|
|||
module VX_cache_wrap import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 4096,
|
||||
// Size of line inside a bank in bytes
|
||||
|
@ -49,7 +52,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_TAG_BIT = 0,
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
// Force bypass for all requests
|
||||
|
@ -74,48 +76,46 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if.master mem_bus_if
|
||||
);
|
||||
|
||||
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter: NUM_BANKS=%d, NUM_REQS=%d", NUM_BANKS, NUM_REQS))
|
||||
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
|
||||
|
||||
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
|
||||
localparam CORE_TAG_X_WIDTH = TAG_WIDTH - NC_ENABLE;
|
||||
localparam MEM_TAG_X_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
|
||||
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH)) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
|
||||
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
|
||||
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
|
||||
|
||||
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
|
||||
|
||||
localparam NC_BYPASS = (NC_ENABLE || PASSTHRU);
|
||||
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (CORE_TAG_X_WIDTH)
|
||||
) core_bus_bypass_if[NUM_REQS]();
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) core_bus_cache_if[NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_X_WIDTH)
|
||||
) mem_bus_bypass_if();
|
||||
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
|
||||
) mem_bus_cache_if();
|
||||
|
||||
if (NC_BYPASS) begin
|
||||
if (NC_OR_BYPASS) begin
|
||||
|
||||
`RESET_RELAY (nc_bypass_reset, reset);
|
||||
|
||||
VX_cache_bypass #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.NC_TAG_BIT (NC_TAG_BIT),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
|
||||
.NC_ENABLE (NC_ENABLE),
|
||||
.PASSTHRU (PASSTHRU),
|
||||
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
|
||||
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
|
||||
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
|
||||
.CORE_TAG_IN_WIDTH (TAG_WIDTH),
|
||||
.CORE_TAG_WIDTH (TAG_WIDTH),
|
||||
|
||||
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
|
||||
.MEM_TAG_IN_WIDTH (MEM_TAG_X_WIDTH),
|
||||
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
|
||||
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
|
||||
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
|
@ -127,68 +127,40 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.reset (nc_bypass_reset),
|
||||
|
||||
.core_bus_in_if (core_bus_if),
|
||||
.core_bus_out_if(core_bus_bypass_if),
|
||||
.core_bus_out_if(core_bus_cache_if),
|
||||
|
||||
.mem_bus_in_if (mem_bus_bypass_if),
|
||||
.mem_bus_in_if (mem_bus_cache_if),
|
||||
.mem_bus_out_if (mem_bus_if)
|
||||
);
|
||||
end else begin
|
||||
|
||||
end else begin
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (core_bus_bypass_if[i], core_bus_if[i]);
|
||||
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
|
||||
end
|
||||
|
||||
assign mem_bus_if.req_valid = mem_bus_bypass_if.req_valid;
|
||||
assign mem_bus_if.req_data.addr = mem_bus_bypass_if.req_data.addr;
|
||||
assign mem_bus_if.req_data.rw = mem_bus_bypass_if.req_data.rw;
|
||||
assign mem_bus_if.req_data.byteen = mem_bus_bypass_if.req_data.byteen;
|
||||
assign mem_bus_if.req_data.data = mem_bus_bypass_if.req_data.data;
|
||||
assign mem_bus_bypass_if.req_ready = mem_bus_if.req_ready;
|
||||
|
||||
// Add explicit NC=0 flag to the memory request tag
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (MEM_TAG_WIDTH-1),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_req_tag_insert (
|
||||
.data_in (mem_bus_bypass_if.req_data.tag),
|
||||
.sel_in (1'b0),
|
||||
.data_out (mem_bus_if.req_data.tag)
|
||||
);
|
||||
|
||||
assign mem_bus_bypass_if.rsp_valid = mem_bus_if.rsp_valid;
|
||||
assign mem_bus_bypass_if.rsp_data.data = mem_bus_if.rsp_data.data;
|
||||
assign mem_bus_if.rsp_ready = mem_bus_bypass_if.rsp_ready;
|
||||
|
||||
// Remove NC flag from the memory response tag
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (MEM_TAG_WIDTH),
|
||||
.POS (NC_TAG_BIT)
|
||||
) mem_rsp_tag_remove (
|
||||
.data_in (mem_bus_if.rsp_data.tag),
|
||||
.data_out (mem_bus_bypass_if.rsp_data.tag)
|
||||
);
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
|
||||
end
|
||||
|
||||
if (PASSTHRU != 0) begin
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
`UNUSED_VAR (core_bus_bypass_if[i].req_valid)
|
||||
`UNUSED_VAR (core_bus_bypass_if[i].req_data)
|
||||
assign core_bus_bypass_if[i].req_ready = 0;
|
||||
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
|
||||
`UNUSED_VAR (core_bus_cache_if[i].req_data)
|
||||
assign core_bus_cache_if[i].req_ready = 0;
|
||||
|
||||
assign core_bus_bypass_if[i].rsp_valid = 0;
|
||||
assign core_bus_bypass_if[i].rsp_data = '0;
|
||||
`UNUSED_VAR (core_bus_bypass_if[i].rsp_ready)
|
||||
assign core_bus_cache_if[i].rsp_valid = 0;
|
||||
assign core_bus_cache_if[i].rsp_data = '0;
|
||||
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
|
||||
end
|
||||
|
||||
assign mem_bus_bypass_if.req_valid = 0;
|
||||
assign mem_bus_bypass_if.req_data = '0;
|
||||
`UNUSED_VAR (mem_bus_bypass_if.req_ready)
|
||||
assign mem_bus_cache_if.req_valid = 0;
|
||||
assign mem_bus_cache_if.req_data = '0;
|
||||
`UNUSED_VAR (mem_bus_cache_if.req_ready)
|
||||
|
||||
`UNUSED_VAR (mem_bus_bypass_if.rsp_valid)
|
||||
`UNUSED_VAR (mem_bus_bypass_if.rsp_data)
|
||||
assign mem_bus_bypass_if.rsp_ready = 0;
|
||||
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
|
||||
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
|
||||
assign mem_bus_cache_if.rsp_ready = 0;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
assign cache_perf = '0;
|
||||
|
@ -212,17 +184,17 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (CORE_TAG_X_WIDTH),
|
||||
.CORE_OUT_BUF (NC_BYPASS ? 1 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF (NC_BYPASS ? 1 : MEM_OUT_BUF)
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
|
||||
) cache (
|
||||
.clk (clk),
|
||||
.reset (cache_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (cache_perf),
|
||||
`endif
|
||||
.core_bus_if (core_bus_bypass_if),
|
||||
.mem_bus_if (mem_bus_bypass_if)
|
||||
.core_bus_if (core_bus_cache_if),
|
||||
.mem_bus_if (mem_bus_cache_if)
|
||||
);
|
||||
|
||||
end
|
||||
|
@ -260,7 +232,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
|
||||
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
|
||||
|
||||
if ((UUID_WIDTH != 0) && (NC_BYPASS != 0)) begin
|
||||
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
|
||||
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
|
|
|
@ -72,9 +72,9 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
|
||||
) dcache_sm_bus_if[DCACHE_NUM_REQS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_mem_perf_if mem_perf_tmp_if();
|
||||
|
@ -191,7 +191,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
.pipeline_perf_if(pipeline_perf_if),
|
||||
`endif
|
||||
|
||||
.dcache_bus_if (dcache_bus_tmp_if),
|
||||
.dcache_bus_if (dcache_sm_bus_if),
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_dispatch_if(fpu_dispatch_if),
|
||||
|
@ -246,14 +246,14 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.cache_perf (mem_perf_tmp_if.smem),
|
||||
`endif
|
||||
.dcache_bus_in_if (dcache_bus_tmp_if),
|
||||
.dcache_bus_in_if (dcache_sm_bus_if),
|
||||
.dcache_bus_out_if (dcache_bus_if)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_sm_bus_if[i]);
|
||||
end
|
||||
|
||||
`endif
|
||||
|
|
|
@ -33,12 +33,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_req_tag,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
|
||||
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
|
||||
|
||||
input wire [DCACHE_NUM_REQS-1:0] dcache_rsp_valid,
|
||||
input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_rsp_data,
|
||||
input wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_rsp_tag,
|
||||
input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_rsp_tag,
|
||||
output wire [DCACHE_NUM_REQS-1:0] dcache_rsp_ready,
|
||||
|
||||
output wire icache_req_valid,
|
||||
|
@ -92,7 +92,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) dcache_bus_if[DCACHE_NUM_REQS]();
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
|
|
|
@ -30,16 +30,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
// outputs
|
||||
VX_commit_if.master commit_if [`ISSUE_WIDTH]
|
||||
);
|
||||
localparam WORD_SIZE = `XLEN / 8;
|
||||
localparam ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(WORD_SIZE);
|
||||
localparam BLOCK_SIZE = 1;
|
||||
localparam NUM_LANES = `NUM_LSU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_SIZE);
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
|
||||
localparam REQ_ASHIFT = `CLOG2(DCACHE_WORD_SIZE);
|
||||
localparam CACHE_TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + LSUQ_TAG_BITS;
|
||||
localparam REQ_ASHIFT = `CLOG2(WORD_SIZE);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
|
@ -72,23 +71,14 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
`UNUSED_VAR (execute_if[0].data.rs3_data)
|
||||
`UNUSED_VAR (execute_if[0].data.tid)
|
||||
|
||||
`ifdef SM_ENABLE
|
||||
`STATIC_ASSERT(`IS_DIVISBLE((1 << `SMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter"))
|
||||
localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT);
|
||||
localparam SMEM_END_B = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
|
||||
`endif
|
||||
|
||||
// tag_id = wid + PC + tmask + rd + op_type + align + is_dup + pid + pkt_addr
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + NUM_LANES + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + `LSU_DUP_ENABLED + PID_WIDTH + LSUQ_SIZEW;
|
||||
|
||||
// tag = uuid + addr_type + tag_id
|
||||
localparam TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + TAG_ID_WIDTH;
|
||||
// tag = uuid + tag_id
|
||||
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
|
||||
|
||||
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
|
||||
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type;
|
||||
|
||||
// full address calculation
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
|
||||
|
@ -113,21 +103,6 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
assign lsu_is_dup = 0;
|
||||
`endif
|
||||
|
||||
// detect address type
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [MEM_ADDRW-1:0] full_addr_b = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
|
||||
// is non-cacheable I/O address
|
||||
wire is_addr_io = (full_addr_b >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
|
||||
`ifdef SM_ENABLE
|
||||
// is shared memory address
|
||||
wire is_addr_sm = (full_addr_b >= SMEM_START_B) && (full_addr_b < SMEM_END_B);
|
||||
assign lsu_addr_type[i] = {is_addr_io, is_addr_sm};
|
||||
`else
|
||||
assign lsu_addr_type[i] = is_addr_io;
|
||||
`endif
|
||||
end
|
||||
|
||||
wire mem_req_empty;
|
||||
wire st_rsp_ready;
|
||||
wire lsu_valid, lsu_ready;
|
||||
|
@ -145,8 +120,8 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0] mem_req_mask;
|
||||
wire mem_req_rw;
|
||||
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
|
||||
reg [NUM_LANES-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen;
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
|
||||
reg [NUM_LANES-1:0][WORD_SIZE-1:0] mem_req_byteen;
|
||||
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
|
||||
wire [TAG_WIDTH-1:0] mem_req_tag;
|
||||
wire mem_req_ready;
|
||||
|
||||
|
@ -202,7 +177,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
|
||||
end
|
||||
`endif
|
||||
default : mem_req_byteen[i] = {DCACHE_WORD_SIZE{1'b1}};
|
||||
default : mem_req_byteen[i] = {WORD_SIZE{1'b1}};
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
@ -306,7 +281,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
assign mem_req_tag = {
|
||||
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
||||
execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
, lsu_is_dup
|
||||
`endif
|
||||
|
@ -314,28 +289,27 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [DCACHE_NUM_REQS-1:0] cache_req_valid;
|
||||
wire [DCACHE_NUM_REQS-1:0] cache_req_rw;
|
||||
wire [DCACHE_NUM_REQS-1:0][(`XLEN/8)-1:0] cache_req_byteen;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] cache_req_byteen;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
|
||||
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_req_data;
|
||||
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_req_tag;
|
||||
wire [DCACHE_NUM_REQS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_req_data;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] cache_req_tag;
|
||||
wire [DCACHE_NUM_REQS-1:0] cache_req_ready;
|
||||
wire [DCACHE_NUM_REQS-1:0] cache_rsp_valid;
|
||||
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_rsp_data;
|
||||
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_rsp_tag;
|
||||
wire [DCACHE_NUM_REQS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_rsp_data;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] cache_rsp_tag;
|
||||
wire [DCACHE_NUM_REQS-1:0] cache_rsp_ready;
|
||||
|
||||
`RESET_RELAY (mem_scheduler_reset, reset);
|
||||
|
||||
VX_mem_scheduler #(
|
||||
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched", CORE_ID)),
|
||||
.CORE_REQS (LSU_MEM_REQS),
|
||||
.MEM_CHANNELS(DCACHE_NUM_REQS),
|
||||
.ADDR_WIDTH (DCACHE_ADDR_WIDTH),
|
||||
.WORD_SIZE (DCACHE_WORD_SIZE),
|
||||
.CORE_REQS (`NUM_LSU_LANES),
|
||||
.MEM_CHANNELS(DCACHE_NUM_REQS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.LINE_SIZE (DCACHE_WORD_SIZE),
|
||||
.ADDR_WIDTH (ADDR_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.QUEUE_SIZE (`LSUQ_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.TAG_ID_WIDTH(TAG_ID_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.RSP_PARTIAL (1),
|
||||
.MEM_OUT_BUF (2)
|
||||
|
@ -349,7 +323,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
.core_req_mask (mem_req_mask),
|
||||
.core_req_byteen(mem_req_byteen),
|
||||
.core_req_addr (mem_req_addr),
|
||||
.core_req_data (mem_req_data),
|
||||
.core_req_data (mem_req_data),
|
||||
.core_req_tag (mem_req_tag),
|
||||
.core_req_ready (mem_req_ready),
|
||||
.core_req_empty (mem_req_empty),
|
||||
|
@ -386,63 +360,16 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
|
||||
assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
|
||||
assign cache_bus_if[i].req_data.data = cache_req_data[i];
|
||||
assign cache_bus_if[i].req_data.tag = cache_req_tag[i];
|
||||
assign cache_req_ready[i] = cache_bus_if[i].req_ready;
|
||||
|
||||
assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
|
||||
assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
|
||||
assign cache_rsp_tag[i] = cache_bus_if[i].rsp_data.tag;
|
||||
assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
|
||||
end
|
||||
|
||||
// cache tag formatting: <uuid, tag, type>
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
wire [`UUID_WIDTH-1:0] cache_req_uuid, cache_rsp_uuid;
|
||||
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type, cache_rsp_type;
|
||||
wire [`CLOG2(`LSUQ_SIZE)-1:0] cache_req_tag_x, cache_rsp_tag_x;
|
||||
if (DCACHE_NUM_BATCHES > 1) begin
|
||||
|
||||
wire [DCACHE_NUM_BATCHES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_b, cache_rsp_type_b;
|
||||
wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_bi, cache_rsp_type_bi;
|
||||
wire [DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
|
||||
|
||||
assign {cache_req_uuid, cache_req_type, cache_req_bid, cache_req_tag_x} = cache_req_tag[i];
|
||||
assign cache_req_type_bi = cache_req_type_b[cache_req_bid];
|
||||
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_bid, cache_req_tag_x, cache_req_type_bi};
|
||||
|
||||
assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag_x, cache_rsp_type_bi} = cache_bus_if[i].rsp_data.tag;
|
||||
assign cache_rsp_type_b = {DCACHE_NUM_BATCHES{cache_rsp_type_bi}};
|
||||
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_bid, cache_rsp_tag_x};
|
||||
|
||||
for (genvar j = 0; j < DCACHE_NUM_BATCHES; ++j) begin
|
||||
localparam k = j * DCACHE_NUM_REQS + i;
|
||||
if (k < NUM_LANES) begin
|
||||
assign cache_req_type_b[j] = cache_req_type[k];
|
||||
assign cache_rsp_type[k] = cache_rsp_type_b[j];
|
||||
end else begin
|
||||
assign cache_req_type_b[j] = '0;
|
||||
`UNUSED_VAR (cache_rsp_type_b[j])
|
||||
end
|
||||
end
|
||||
|
||||
end else begin
|
||||
|
||||
assign {cache_req_uuid, cache_req_type, cache_req_tag_x} = cache_req_tag[i];
|
||||
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_tag_x, cache_req_type[i]};
|
||||
|
||||
assign {cache_rsp_uuid, cache_rsp_tag_x, cache_rsp_type[i]} = cache_bus_if[i].rsp_data.tag;
|
||||
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_tag_x};
|
||||
|
||||
for (genvar j = 0; j < DCACHE_NUM_REQS; ++j) begin
|
||||
if (i != j) begin
|
||||
`UNUSED_VAR (cache_req_type[j])
|
||||
assign cache_rsp_type[j] = '0;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [`UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] rsp_addr_type;
|
||||
wire [`NW_WIDTH-1:0] rsp_wid;
|
||||
wire [NUM_LANES-1:0] rsp_tmask_uq;
|
||||
wire [`XLEN-1:0] rsp_pc;
|
||||
|
@ -457,12 +384,11 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
assign {
|
||||
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
||||
rsp_uuid, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
|
||||
`ifdef LSU_DUP_ENABLE
|
||||
, rsp_is_dup
|
||||
`endif
|
||||
} = mem_rsp_tag;
|
||||
`UNUSED_VAR (rsp_addr_type)
|
||||
`UNUSED_VAR (rsp_op_type)
|
||||
|
||||
// load response formatting
|
||||
|
@ -626,17 +552,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
|
|||
if (mem_req_rw) begin
|
||||
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
|
||||
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
|
||||
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
|
||||
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
|
||||
`TRACE(1, (", data="));
|
||||
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, data=", mem_req_tag, mem_req_byteen));
|
||||
`TRACE_ARRAY1D(1, mem_req_data, NUM_LANES);
|
||||
`TRACE(1, (", is_dup=%b (#%0d)\n", lsu_is_dup, execute_if[0].data.uuid));
|
||||
end else begin
|
||||
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
|
||||
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
|
||||
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
|
||||
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
|
||||
`TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
|
||||
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, rd=%0d, is_dup=%b (#%0d)\n", mem_req_tag, mem_req_byteen, execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
|
||||
end
|
||||
end
|
||||
if (mem_rsp_fire) begin
|
||||
|
|
|
@ -26,21 +26,53 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
|
||||
VX_mem_bus_if.master dcache_bus_out_if [DCACHE_NUM_REQS]
|
||||
);
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`STATIC_ASSERT(`IS_DIVISBLE((1 << `SMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter"))
|
||||
|
||||
localparam SMEM_ADDR_WIDTH = `SMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
|
||||
localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT);
|
||||
localparam SMEM_END_B = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
|
||||
|
||||
wire [DCACHE_NUM_REQS-1:0] smem_req_valid;
|
||||
wire [DCACHE_NUM_REQS-1:0] smem_req_rw;
|
||||
wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag;
|
||||
wire [DCACHE_NUM_REQS-1:0] smem_req_ready;
|
||||
wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data;
|
||||
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag;
|
||||
wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready;
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) smem_bus_if[DCACHE_NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
|
||||
|
||||
`RESET_RELAY (switch_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
|
||||
wire [MEM_ADDRW-1:0] block_addr = dcache_bus_in_if[i].req_data.addr[DCACHE_ADDR_WIDTH-1 -: MEM_ADDRW];
|
||||
wire bus_sel = (block_addr >= SMEM_START_B) && (block_addr < SMEM_END_B);
|
||||
|
||||
VX_smem_switch #(
|
||||
.NUM_REQS (2),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (2),
|
||||
.RSP_OUT_BUF (2)
|
||||
) smem_switch (
|
||||
.clk (clk),
|
||||
.reset (switch_reset),
|
||||
.bus_sel (bus_sel),
|
||||
.bus_in_if (dcache_bus_in_if[i]),
|
||||
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
|
||||
);
|
||||
|
||||
// output bus[0] goes to the dcache
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2 + 0]);
|
||||
|
||||
// output bus[1] goes to the local memory
|
||||
`ASSIGN_VX_MEM_BUS_IF (smem_bus_if[i], switch_out_bus_if[i * 2 + 1]);
|
||||
end
|
||||
|
||||
`RESET_RELAY (smem_reset, reset);
|
||||
|
||||
|
@ -52,7 +84,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
|
|||
.WORD_SIZE (DCACHE_WORD_SIZE),
|
||||
.ADDR_WIDTH (SMEM_ADDR_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH)
|
||||
) shared_mem (
|
||||
.clk (clk),
|
||||
.reset (smem_reset),
|
||||
|
@ -60,65 +92,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
|
|||
`ifdef PERF_ENABLE
|
||||
.cache_perf (cache_perf),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
.req_valid (smem_req_valid),
|
||||
.req_rw (smem_req_rw),
|
||||
.req_byteen (smem_req_byteen),
|
||||
.req_addr (smem_req_addr),
|
||||
.req_data (smem_req_data),
|
||||
.req_tag (smem_req_tag),
|
||||
.req_ready (smem_req_ready),
|
||||
|
||||
// Core response
|
||||
.rsp_valid (smem_rsp_valid),
|
||||
.rsp_data (smem_rsp_data),
|
||||
.rsp_tag (smem_rsp_tag),
|
||||
.rsp_ready (smem_rsp_ready)
|
||||
.mem_bus_if (smem_bus_if)
|
||||
);
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
|
||||
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
|
||||
|
||||
`RESET_RELAY (switch_reset, reset);
|
||||
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
|
||||
assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid;
|
||||
assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw;
|
||||
assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen;
|
||||
assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data;
|
||||
assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag;
|
||||
assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i];
|
||||
|
||||
assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i];
|
||||
assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i];
|
||||
assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i];
|
||||
assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready;
|
||||
|
||||
assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0];
|
||||
|
||||
VX_smem_switch #(
|
||||
.NUM_REQS (2),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.TAG_SEL_IDX (0),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (2),
|
||||
.RSP_OUT_BUF (2)
|
||||
) smem_switch (
|
||||
.clk (clk),
|
||||
.reset (switch_reset),
|
||||
.bus_in_if (dcache_bus_in_if[i]),
|
||||
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
|
||||
);
|
||||
end
|
||||
|
||||
// this bus goes to the dcache
|
||||
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
|
||||
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2]);
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -13,30 +13,33 @@
|
|||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
`TRACING_OFF
|
||||
//`TRACING_OFF
|
||||
module VX_mem_scheduler #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter CORE_REQS = 1,
|
||||
parameter MEM_CHANNELS = 1,
|
||||
parameter ADDR_WIDTH = 32,
|
||||
parameter WORD_SIZE = 4,
|
||||
parameter LINE_SIZE = 4,
|
||||
parameter TAG_WIDTH = 8,
|
||||
parameter TAG_ID_WIDTH = 8, // lower section of the request tag contains the tag identifier
|
||||
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
|
||||
parameter QUEUE_SIZE = 8,
|
||||
parameter RSP_PARTIAL = 0,
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
parameter MEM_OUT_BUF = 0,
|
||||
parameter CORE_REQS = 1,
|
||||
parameter MEM_CHANNELS = 1,
|
||||
parameter WORD_SIZE = 4,
|
||||
parameter LINE_SIZE = 4,
|
||||
parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE),
|
||||
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
|
||||
parameter TAG_WIDTH = 8,
|
||||
parameter QUEUE_SIZE = 8,
|
||||
parameter RSP_PARTIAL = 0,
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
parameter MEM_OUT_BUF = 0,
|
||||
|
||||
parameter WORD_WIDTH = WORD_SIZE * 8,
|
||||
parameter LINE_WIDTH = LINE_SIZE * 8,
|
||||
parameter MEM_REQS = (CORE_REQS * WORD_SIZE) / LINE_SIZE,
|
||||
parameter NUM_BATCHES = (MEM_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
|
||||
parameter QUEUE_ADDRW = `CLOG2(QUEUE_SIZE),
|
||||
parameter BATCH_SEL_BITS = `CLOG2(NUM_BATCHES),
|
||||
parameter MEM_TAG_ID = TAG_WIDTH - TAG_ID_WIDTH,
|
||||
parameter MEM_TAGW = MEM_TAG_ID + QUEUE_ADDRW + BATCH_SEL_BITS
|
||||
parameter WORD_WIDTH = WORD_SIZE * 8,
|
||||
parameter LINE_WIDTH = LINE_SIZE * 8,
|
||||
parameter PER_LINE_REQS = LINE_SIZE / WORD_SIZE,
|
||||
parameter MERGED_REQS = CORE_REQS / PER_LINE_REQS,
|
||||
parameter NUM_BATCHES = (MERGED_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
|
||||
parameter QUEUE_ADDRW = `CLOG2(QUEUE_SIZE),
|
||||
parameter BATCH_SEL_BITS= `CLOG2(NUM_BATCHES),
|
||||
parameter TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH,
|
||||
parameter MEM_TAG_ID = TAG_WIDTH - TAG_ID_WIDTH,
|
||||
parameter MEM_ADDR_WIDTH= ADDR_WIDTH - `CLOG2(PER_LINE_REQS),
|
||||
parameter REQQ_TAG_WIDTH= MEM_TAG_ID + QUEUE_ADDRW,
|
||||
parameter MEM_TAG_WIDTH = REQQ_TAG_WIDTH + BATCH_SEL_BITS
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -66,40 +69,39 @@ module VX_mem_scheduler #(
|
|||
output wire [MEM_CHANNELS-1:0] mem_req_valid,
|
||||
output wire [MEM_CHANNELS-1:0] mem_req_rw,
|
||||
output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [MEM_CHANNELS-1:0][ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_CHANNELS-1:0][MEM_TAGW-1:0]mem_req_tag,
|
||||
output wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire [MEM_CHANNELS-1:0] mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire [MEM_CHANNELS-1:0] mem_rsp_valid,
|
||||
input wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_CHANNELS-1:0][MEM_TAGW-1:0] mem_rsp_tag,
|
||||
input wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire [MEM_CHANNELS-1:0] mem_rsp_ready
|
||||
);
|
||||
|
||||
localparam REQQ_TAG_WIDTH = MEM_TAG_ID + QUEUE_ADDRW;
|
||||
localparam BATCH_SEL_WIDTH = `UP(BATCH_SEL_BITS);
|
||||
localparam STALL_TIMEOUT = 10000000;
|
||||
|
||||
`STATIC_ASSERT ((WORD_SIZE == LINE_SIZE), ("invalid parameter"))
|
||||
`STATIC_ASSERT (`IS_DIVISBLE(CORE_REQS * WORD_SIZE, LINE_SIZE), ("invalid parameter"))
|
||||
`STATIC_ASSERT ((MEM_TAG_ID >= UUID_WIDTH), ("invalid parameter"))
|
||||
`STATIC_ASSERT ((0 == RSP_PARTIAL) || (1 == RSP_PARTIAL), ("invalid parameter"))
|
||||
`RUNTIME_ASSERT ((~core_req_valid || core_req_mask != 0), ("invalid request mask"));
|
||||
|
||||
wire [MEM_CHANNELS-1:0] mem_req_valid_s;
|
||||
wire [MEM_CHANNELS-1:0] mem_req_mask_s;
|
||||
wire [MEM_CHANNELS-1:0] mem_req_rw_s;
|
||||
wire mem_req_rw_s;
|
||||
wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s;
|
||||
wire [MEM_CHANNELS-1:0][ADDR_WIDTH-1:0] mem_req_addr_s;
|
||||
wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
|
||||
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s;
|
||||
wire [MEM_TAGW-1:0] mem_req_tag_s;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
|
||||
wire [MEM_CHANNELS-1:0] mem_req_ready_s;
|
||||
|
||||
wire mem_rsp_valid_s;
|
||||
wire [MEM_CHANNELS-1:0] mem_rsp_mask_s;
|
||||
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data_s;
|
||||
wire [MEM_TAGW-1:0] mem_rsp_tag_s;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
|
||||
wire mem_rsp_ready_s;
|
||||
wire mem_rsp_fire_s;
|
||||
|
||||
|
@ -124,11 +126,28 @@ module VX_mem_scheduler #(
|
|||
wire crsp_valid;
|
||||
wire [CORE_REQS-1:0] crsp_mask;
|
||||
wire [CORE_REQS-1:0][WORD_WIDTH-1:0] crsp_data;
|
||||
wire [TAG_WIDTH-1:0] crsp_tag;
|
||||
wire [REQQ_TAG_WIDTH-1:0] crsp_tag;
|
||||
wire crsp_sop;
|
||||
wire crsp_eop;
|
||||
wire crsp_ready;
|
||||
|
||||
wire reqq_valid_s;
|
||||
wire [MERGED_REQS-1:0] reqq_mask_s;
|
||||
wire reqq_rw_s;
|
||||
wire [MERGED_REQS-1:0][LINE_SIZE-1:0] reqq_byteen_s;
|
||||
wire [MERGED_REQS-1:0][MEM_ADDR_WIDTH-1:0] reqq_addr_s;
|
||||
wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] reqq_data_s;
|
||||
wire [REQQ_TAG_WIDTH-1:0] reqq_tag_s;
|
||||
wire reqq_ready_s;
|
||||
|
||||
wire crsp_valid_s;
|
||||
wire [MERGED_REQS-1:0] crsp_mask_s;
|
||||
wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] crsp_data_s;
|
||||
wire [REQQ_TAG_WIDTH-1:0] crsp_tag_s;
|
||||
wire crsp_sop_s;
|
||||
wire crsp_eop_s;
|
||||
wire crsp_ready_s;
|
||||
|
||||
// Request queue //////////////////////////////////////////////////////////
|
||||
|
||||
wire req_sent_all;
|
||||
|
@ -170,11 +189,9 @@ module VX_mem_scheduler #(
|
|||
|
||||
// Index buffer ///////////////////////////////////////////////////////////
|
||||
|
||||
wire rsp_complete;
|
||||
|
||||
assign ibuf_push = core_req_valid && core_req_ready && ~core_req_rw;
|
||||
assign ibuf_pop = crsp_valid && crsp_ready && rsp_complete;
|
||||
assign ibuf_raddr = mem_rsp_tag_s[BATCH_SEL_BITS +: QUEUE_ADDRW];
|
||||
assign ibuf_pop = crsp_valid && crsp_ready && crsp_eop;
|
||||
assign ibuf_raddr = crsp_tag[QUEUE_ADDRW-1:0];
|
||||
assign ibuf_din = core_req_tag[TAG_ID_WIDTH-1:0];
|
||||
|
||||
VX_index_buffer #(
|
||||
|
@ -195,12 +212,31 @@ module VX_mem_scheduler #(
|
|||
|
||||
`UNUSED_VAR (ibuf_empty)
|
||||
|
||||
wire [QUEUE_ADDRW-1:0] ibuf_waddr_s = reqq_tag_s[QUEUE_ADDRW-1:0];
|
||||
wire [QUEUE_ADDRW-1:0] ibuf_raddr_s = crsp_tag_s[QUEUE_ADDRW-1:0];
|
||||
|
||||
assign reqq_valid_s = reqq_valid;
|
||||
assign reqq_mask_s = reqq_mask;
|
||||
assign reqq_rw_s = reqq_rw;
|
||||
assign reqq_byteen_s= reqq_byteen;
|
||||
assign reqq_addr_s = reqq_addr;
|
||||
assign reqq_data_s = reqq_data;
|
||||
assign reqq_tag_s = reqq_tag;
|
||||
assign reqq_ready = reqq_ready_s;
|
||||
|
||||
assign crsp_valid = crsp_valid_s;
|
||||
assign crsp_mask = crsp_mask_s;
|
||||
assign crsp_data = crsp_data_s;
|
||||
assign crsp_tag = crsp_tag_s;
|
||||
assign crsp_sop = crsp_sop_s;
|
||||
assign crsp_eop = crsp_eop_s;
|
||||
assign crsp_ready_s = crsp_ready;
|
||||
|
||||
// Handle memory requests /////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_mask_b;
|
||||
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_rw_b;
|
||||
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_b;
|
||||
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][ADDR_WIDTH-1:0] mem_req_addr_b;
|
||||
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_b;
|
||||
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_b;
|
||||
|
||||
wire [BATCH_SEL_WIDTH-1:0] req_batch_idx;
|
||||
|
@ -208,15 +244,13 @@ module VX_mem_scheduler #(
|
|||
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
|
||||
for (genvar j = 0; j < MEM_CHANNELS; ++j) begin
|
||||
localparam r = i * MEM_CHANNELS + j;
|
||||
if (r < CORE_REQS) begin
|
||||
assign mem_req_mask_b[i][j] = reqq_mask[r];
|
||||
assign mem_req_rw_b[i][j] = reqq_rw;
|
||||
assign mem_req_byteen_b[i][j] = reqq_byteen[r];
|
||||
assign mem_req_addr_b[i][j] = reqq_addr[r];
|
||||
assign mem_req_data_b[i][j] = reqq_data[r];
|
||||
if (r < MERGED_REQS) begin
|
||||
assign mem_req_mask_b[i][j] = reqq_mask_s[r];
|
||||
assign mem_req_byteen_b[i][j] = reqq_byteen_s[r];
|
||||
assign mem_req_addr_b[i][j] = reqq_addr_s[r];
|
||||
assign mem_req_data_b[i][j] = reqq_data_s[r];
|
||||
end else begin
|
||||
assign mem_req_mask_b[i][j] = 0;
|
||||
assign mem_req_rw_b[i][j] = '0;
|
||||
assign mem_req_byteen_b[i][j] = '0;
|
||||
assign mem_req_addr_b[i][j] = '0;
|
||||
assign mem_req_data_b[i][j] = '0;
|
||||
|
@ -225,22 +259,20 @@ module VX_mem_scheduler #(
|
|||
end
|
||||
|
||||
assign mem_req_mask_s = mem_req_mask_b[req_batch_idx];
|
||||
assign mem_req_rw_s = mem_req_rw_b[req_batch_idx];
|
||||
assign mem_req_rw_s = reqq_rw_s;
|
||||
assign mem_req_byteen_s = mem_req_byteen_b[req_batch_idx];
|
||||
assign mem_req_addr_s = mem_req_addr_b[req_batch_idx];
|
||||
assign mem_req_data_s = mem_req_data_b[req_batch_idx];
|
||||
|
||||
reg [MEM_CHANNELS-1:0] batch_sent_mask;
|
||||
|
||||
wire [MEM_CHANNELS-1:0] batch_sent_mask_n = batch_sent_mask | mem_req_ready_s;
|
||||
|
||||
reg [MEM_CHANNELS-1:0] batch_sent_mask;
|
||||
wire [MEM_CHANNELS-1:0] batch_sent_mask_n = batch_sent_mask | mem_req_ready_s;
|
||||
wire batch_sent_all = (mem_req_mask_s & ~batch_sent_mask_n) == 0;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
batch_sent_mask <= '0;
|
||||
end else begin
|
||||
if (reqq_valid) begin
|
||||
if (reqq_valid_s) begin
|
||||
if (batch_sent_all) begin
|
||||
batch_sent_mask <= '0;
|
||||
end else begin
|
||||
|
@ -256,7 +288,7 @@ module VX_mem_scheduler #(
|
|||
if (reset) begin
|
||||
req_batch_idx_r <= '0;
|
||||
end else begin
|
||||
if (reqq_valid && batch_sent_all) begin
|
||||
if (reqq_valid_s && batch_sent_all) begin
|
||||
if (req_sent_all) begin
|
||||
req_batch_idx_r <= '0;
|
||||
end else begin
|
||||
|
@ -288,22 +320,22 @@ module VX_mem_scheduler #(
|
|||
|
||||
assign req_batch_idx = req_batch_idx_r;
|
||||
assign req_sent_all = batch_sent_all && (req_batch_idx_r == req_batch_idx_last);
|
||||
assign mem_req_tag_s = {reqq_tag, req_batch_idx};
|
||||
assign mem_req_tag_s = {reqq_tag_s, req_batch_idx};
|
||||
|
||||
end else begin
|
||||
|
||||
assign req_batch_idx = '0;
|
||||
assign req_sent_all = batch_sent_all;
|
||||
assign mem_req_tag_s = reqq_tag;
|
||||
assign mem_req_tag_s = reqq_tag_s;
|
||||
|
||||
end
|
||||
|
||||
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid}} & mem_req_mask_s & ~batch_sent_mask;
|
||||
assign reqq_ready = req_sent_all;
|
||||
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid_s}} & mem_req_mask_s & ~batch_sent_mask;
|
||||
assign reqq_ready_s = req_sent_all;
|
||||
|
||||
for (genvar i = 0; i < MEM_CHANNELS; ++i) begin
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + ADDR_WIDTH + LINE_WIDTH + MEM_TAGW),
|
||||
.DATAW (1 + LINE_SIZE + MEM_ADDR_WIDTH + LINE_WIDTH + MEM_TAG_WIDTH),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
|
@ -311,8 +343,8 @@ module VX_mem_scheduler #(
|
|||
.reset (reset),
|
||||
.valid_in (mem_req_valid_s[i]),
|
||||
.ready_in (mem_req_ready_s[i]),
|
||||
.data_in ({mem_req_rw_s[i], mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_data_s[i], mem_req_tag_s}),
|
||||
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_data[i], mem_req_tag[i]}),
|
||||
.data_in ({mem_req_rw_s, mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_data_s[i], mem_req_tag_s}),
|
||||
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_data[i], mem_req_tag[i]}),
|
||||
.valid_out (mem_req_valid[i]),
|
||||
.ready_out (mem_req_ready[i])
|
||||
);
|
||||
|
@ -320,16 +352,20 @@ module VX_mem_scheduler #(
|
|||
|
||||
// Handle memory responses ////////////////////////////////////////////////
|
||||
|
||||
reg [QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask;
|
||||
wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask;
|
||||
reg [QUEUE_SIZE-1:0] pending_req_valids;
|
||||
reg [QUEUE_SIZE-1:0][MERGED_REQS-1:0] rsp_rem_mask;
|
||||
wire [MERGED_REQS-1:0] rsp_rem_mask_n, curr_mask;
|
||||
wire [BATCH_SEL_WIDTH-1:0] rsp_batch_idx;
|
||||
|
||||
wire reqq_fire_s = reqq_valid_s && reqq_ready_s;
|
||||
wire reqq_rd_start_s = reqq_fire_s && ~reqq_rw_s && ~pending_req_valids[ibuf_waddr_s];
|
||||
|
||||
// Select memory response
|
||||
VX_mem_rsp_sel #(
|
||||
.NUM_REQS (MEM_CHANNELS),
|
||||
.DATA_WIDTH (LINE_WIDTH),
|
||||
.TAG_WIDTH (MEM_TAGW),
|
||||
.TAG_SEL_BITS (MEM_TAGW - MEM_TAG_ID),
|
||||
.TAG_WIDTH (MEM_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (MEM_TAG_WIDTH - MEM_TAG_ID),
|
||||
.OUT_BUF (2)
|
||||
) mem_rsp_sel (
|
||||
.clk (clk),
|
||||
|
@ -345,13 +381,15 @@ module VX_mem_scheduler #(
|
|||
.rsp_ready_out (mem_rsp_ready_s)
|
||||
);
|
||||
|
||||
for (genvar r = 0; r < CORE_REQS; ++r) begin
|
||||
for (genvar r = 0; r < MERGED_REQS; ++r) begin
|
||||
localparam i = r / MEM_CHANNELS;
|
||||
localparam j = r % MEM_CHANNELS;
|
||||
assign curr_mask[r] = (BATCH_SEL_WIDTH'(i) == rsp_batch_idx) && mem_rsp_mask_s[j];
|
||||
end
|
||||
|
||||
assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr] & ~curr_mask;
|
||||
assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr_s] & ~curr_mask;
|
||||
|
||||
wire rsp_complete = ~(| rsp_rem_mask_n);
|
||||
|
||||
if (NUM_BATCHES > 1) begin
|
||||
assign rsp_batch_idx = mem_rsp_tag_s[BATCH_SEL_BITS-1:0];
|
||||
|
@ -359,14 +397,22 @@ module VX_mem_scheduler #(
|
|||
assign rsp_batch_idx = '0;
|
||||
end
|
||||
|
||||
assign rsp_complete = ~(| rsp_rem_mask_n);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (ibuf_push) begin
|
||||
rsp_rem_mask[ibuf_waddr] <= core_req_mask;
|
||||
if (reset) begin
|
||||
pending_req_valids <= '0;
|
||||
end else begin
|
||||
if (reqq_rd_start_s) begin
|
||||
pending_req_valids[ibuf_waddr_s] <= 1;
|
||||
end
|
||||
if (mem_rsp_fire_s && rsp_complete) begin
|
||||
pending_req_valids[ibuf_raddr_s] <= 0;
|
||||
end
|
||||
end
|
||||
if (reqq_rd_start_s) begin
|
||||
rsp_rem_mask[ibuf_waddr_s] <= reqq_mask_s;
|
||||
end
|
||||
if (mem_rsp_fire_s) begin
|
||||
rsp_rem_mask[ibuf_raddr] <= rsp_rem_mask_n;
|
||||
rsp_rem_mask[ibuf_raddr_s] <= rsp_rem_mask_n;
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -377,74 +423,78 @@ module VX_mem_scheduler #(
|
|||
reg [QUEUE_SIZE-1:0] rsp_sop_r;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (ibuf_push) begin
|
||||
rsp_sop_r[ibuf_waddr] <= 1;
|
||||
if (reqq_rd_start_s) begin
|
||||
rsp_sop_r[ibuf_waddr_s] <= 1;
|
||||
end
|
||||
if (mem_rsp_fire_s) begin
|
||||
rsp_sop_r[ibuf_raddr] <= 0;
|
||||
rsp_sop_r[ibuf_raddr_s] <= 0;
|
||||
end
|
||||
end
|
||||
|
||||
assign mem_rsp_ready_s = crsp_ready;
|
||||
assign mem_rsp_ready_s = crsp_ready_s;
|
||||
|
||||
assign crsp_valid = mem_rsp_valid_s;
|
||||
assign crsp_valid_s = mem_rsp_valid_s;
|
||||
|
||||
assign crsp_mask = curr_mask;
|
||||
assign crsp_sop = rsp_sop_r[ibuf_raddr];
|
||||
assign crsp_mask_s = curr_mask;
|
||||
assign crsp_sop_s = rsp_sop_r[ibuf_raddr_s];
|
||||
|
||||
for (genvar r = 0; r < CORE_REQS; ++r) begin
|
||||
for (genvar r = 0; r < MERGED_REQS; ++r) begin
|
||||
localparam j = r % MEM_CHANNELS;
|
||||
assign crsp_data[r] = mem_rsp_data_s[j];
|
||||
assign crsp_data_s[r] = mem_rsp_data_s[j];
|
||||
end
|
||||
|
||||
end else begin
|
||||
|
||||
reg [NUM_BATCHES*MEM_CHANNELS*WORD_WIDTH-1:0] rsp_store [QUEUE_SIZE-1:0];
|
||||
reg [NUM_BATCHES*MEM_CHANNELS*WORD_WIDTH-1:0] rsp_store_n;
|
||||
reg [CORE_REQS-1:0] rsp_orig_mask [QUEUE_SIZE-1:0];
|
||||
reg [NUM_BATCHES*MEM_CHANNELS*LINE_WIDTH-1:0] rsp_store [QUEUE_SIZE-1:0];
|
||||
reg [NUM_BATCHES*MEM_CHANNELS*LINE_WIDTH-1:0] rsp_store_n;
|
||||
reg [MERGED_REQS-1:0] rsp_orig_mask [QUEUE_SIZE-1:0];
|
||||
|
||||
always @(*) begin
|
||||
rsp_store_n = rsp_store[ibuf_raddr];
|
||||
rsp_store_n = rsp_store[ibuf_raddr_s];
|
||||
for (integer i = 0; i < MEM_CHANNELS; ++i) begin
|
||||
if ((MEM_CHANNELS == 1) || mem_rsp_mask_s[i]) begin
|
||||
rsp_store_n[(rsp_batch_idx * MEM_CHANNELS + i) * WORD_WIDTH +: WORD_WIDTH] = mem_rsp_data_s[i];
|
||||
rsp_store_n[(rsp_batch_idx * MEM_CHANNELS + i) * LINE_WIDTH +: LINE_WIDTH] = mem_rsp_data_s[i];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (ibuf_push) begin
|
||||
rsp_orig_mask[ibuf_waddr] <= core_req_mask;
|
||||
if (reqq_rd_start_s) begin
|
||||
rsp_orig_mask[ibuf_waddr_s] <= core_req_mask;
|
||||
end
|
||||
if (mem_rsp_valid_s) begin
|
||||
rsp_store[ibuf_raddr] <= rsp_store_n;
|
||||
rsp_store[ibuf_raddr_s] <= rsp_store_n;
|
||||
end
|
||||
end
|
||||
|
||||
assign mem_rsp_ready_s = crsp_ready || ~rsp_complete;
|
||||
assign mem_rsp_ready_s = crsp_ready_s || ~rsp_complete;
|
||||
|
||||
assign crsp_valid = mem_rsp_valid_s && rsp_complete;
|
||||
assign crsp_valid_s = mem_rsp_valid_s && rsp_complete;
|
||||
|
||||
assign crsp_mask = rsp_orig_mask[ibuf_raddr];
|
||||
assign crsp_sop = 1'b1;
|
||||
assign crsp_mask_s = rsp_orig_mask[ibuf_raddr_s];
|
||||
assign crsp_sop_s = 1'b1;
|
||||
|
||||
for (genvar r = 0; r < CORE_REQS; ++r) begin
|
||||
for (genvar r = 0; r < MERGED_REQS; ++r) begin
|
||||
localparam i = r / MEM_CHANNELS;
|
||||
localparam j = r % MEM_CHANNELS;
|
||||
assign crsp_data[r] = rsp_store_n[(i * MEM_CHANNELS + j) * LINE_WIDTH +: WORD_WIDTH];
|
||||
assign crsp_data_s[r] = rsp_store_n[(i * MEM_CHANNELS + j) * LINE_WIDTH +: LINE_WIDTH];
|
||||
end
|
||||
end
|
||||
|
||||
if (MEM_TAG_ID != 0) begin
|
||||
assign crsp_tag = {mem_rsp_tag_s[MEM_TAGW-1 -: MEM_TAG_ID], ibuf_dout};
|
||||
end else begin
|
||||
assign crsp_tag = ibuf_dout;
|
||||
end
|
||||
assign crsp_tag_s = mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: REQQ_TAG_WIDTH];
|
||||
|
||||
assign crsp_eop = ibuf_pop;
|
||||
assign crsp_eop_s = rsp_complete;
|
||||
|
||||
// Send response to caller
|
||||
|
||||
wire [TAG_WIDTH-1:0] crsp_tag_2;
|
||||
|
||||
if (MEM_TAG_ID != 0) begin
|
||||
assign crsp_tag_2 = {crsp_tag[MEM_TAG_WIDTH-1 -: MEM_TAG_ID], ibuf_dout};
|
||||
end else begin
|
||||
assign crsp_tag_2 = ibuf_dout;
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (CORE_REQS + 1 + 1 + (CORE_REQS * WORD_WIDTH) + TAG_WIDTH),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(CORE_OUT_BUF)),
|
||||
|
@ -454,7 +504,7 @@ module VX_mem_scheduler #(
|
|||
.reset (reset),
|
||||
.valid_in (crsp_valid),
|
||||
.ready_in (crsp_ready),
|
||||
.data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}),
|
||||
.data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag_2}),
|
||||
.data_out ({core_rsp_mask, core_rsp_sop, core_rsp_eop, core_rsp_data, core_rsp_tag}),
|
||||
.valid_out (core_rsp_valid),
|
||||
.ready_out (core_rsp_ready)
|
||||
|
@ -469,8 +519,8 @@ module VX_mem_scheduler #(
|
|||
if (UUID_WIDTH != 0) begin
|
||||
assign req_dbg_uuid = core_req_tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign rsp_dbg_uuid = core_rsp_tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign mem_req_dbg_uuid = reqq_tag[REQQ_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign mem_rsp_dbg_uuid = mem_rsp_tag_s[MEM_TAGW-1 -: UUID_WIDTH];
|
||||
assign mem_req_dbg_uuid = reqq_tag_s[REQQ_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign mem_rsp_dbg_uuid = mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign req_dbg_uuid = '0;
|
||||
assign rsp_dbg_uuid = '0;
|
||||
|
@ -483,30 +533,18 @@ module VX_mem_scheduler #(
|
|||
`UNUSED_VAR (mem_req_dbg_uuid)
|
||||
`UNUSED_VAR (mem_rsp_dbg_uuid)
|
||||
|
||||
reg [(`UP(UUID_WIDTH) + TAG_ID_WIDTH + 64)-1:0] pending_reqs [QUEUE_SIZE-1:0];
|
||||
reg [QUEUE_SIZE-1:0] pending_req_valids;
|
||||
reg [(`UP(UUID_WIDTH) + TAG_ID_WIDTH + 64)-1:0] pending_reqs_time [QUEUE_SIZE-1:0];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
pending_req_valids <= '0;
|
||||
end else begin
|
||||
if (ibuf_push) begin
|
||||
pending_req_valids[ibuf_waddr] <= 1'b1;
|
||||
end
|
||||
if (ibuf_pop) begin
|
||||
pending_req_valids[ibuf_raddr] <= 1'b0;
|
||||
end
|
||||
end
|
||||
|
||||
if (ibuf_push) begin
|
||||
pending_reqs[ibuf_waddr] <= {req_dbg_uuid, ibuf_din, $time};
|
||||
if (reqq_rd_start_s) begin
|
||||
pending_reqs_time[ibuf_waddr_s] <= {req_dbg_uuid, ibuf_din, $time};
|
||||
end
|
||||
|
||||
for (integer i = 0; i < QUEUE_SIZE; ++i) begin
|
||||
if (pending_req_valids[i]) begin
|
||||
`ASSERT(($time - pending_reqs[i][0 +: 64]) < STALL_TIMEOUT,
|
||||
`ASSERT(($time - pending_reqs_time[i][63:0]) < STALL_TIMEOUT,
|
||||
("%t: *** %s response timeout: remaining=%b, tag=0x%0h (#%0d)",
|
||||
$time, INSTANCE_ID, rsp_rem_mask[i], pending_reqs[i][64 +: TAG_ID_WIDTH], pending_reqs[i][64+TAG_ID_WIDTH +: `UP(UUID_WIDTH)]));
|
||||
$time, INSTANCE_ID, rsp_rem_mask[i], pending_reqs_time[i][64 +: TAG_ID_WIDTH], pending_reqs_time[i][64+TAG_ID_WIDTH +: `UP(UUID_WIDTH)]));
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -548,15 +586,15 @@ module VX_mem_scheduler #(
|
|||
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
|
||||
`TRACE_ARRAY1D(1, mem_req_addr_s, MEM_CHANNELS);
|
||||
end
|
||||
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr, req_batch_idx, mem_req_dbg_uuid));
|
||||
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr_s, req_batch_idx, mem_req_dbg_uuid));
|
||||
end
|
||||
if (mem_rsp_fire_s) begin
|
||||
`TRACE(1, ("%d: %s-mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s));
|
||||
`TRACE_ARRAY1D(1, mem_rsp_data_s, MEM_CHANNELS);
|
||||
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr, rsp_batch_idx, mem_rsp_dbg_uuid));
|
||||
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr_s, rsp_batch_idx, mem_rsp_dbg_uuid));
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
||||
//`TRACING_ON
|
||||
|
|
|
@ -43,20 +43,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
output cache_perf_t cache_perf,
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
input wire [NUM_REQS-1:0] req_valid,
|
||||
input wire [NUM_REQS-1:0] req_rw,
|
||||
input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] req_addr,
|
||||
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] req_byteen,
|
||||
input wire [NUM_REQS-1:0][WORD_SIZE*8-1:0] req_data,
|
||||
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] req_tag,
|
||||
output wire [NUM_REQS-1:0] req_ready,
|
||||
|
||||
// Core response
|
||||
output wire [NUM_REQS-1:0] rsp_valid,
|
||||
output wire [NUM_REQS-1:0][WORD_SIZE*8-1:0] rsp_data,
|
||||
output wire [NUM_REQS-1:0][TAG_WIDTH-1:0] rsp_tag,
|
||||
input wire [NUM_REQS-1:0] rsp_ready
|
||||
VX_mem_bus_if.slave mem_bus_if [NUM_REQS]
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
`UNUSED_PARAM (UUID_WIDTH)
|
||||
|
@ -79,7 +66,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
|
||||
if (NUM_BANKS > 1) begin
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign req_bank_idx[i] = req_addr[i][0 +: BANK_SEL_BITS];
|
||||
assign req_bank_idx[i] = mem_bus_if[i].req_data.addr[0 +: BANK_SEL_BITS];
|
||||
end
|
||||
end else begin
|
||||
assign req_bank_idx = 0;
|
||||
|
@ -89,7 +76,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [NUM_REQS-1:0][BANK_ADDR_WIDTH-1:0] req_bank_addr;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign req_bank_addr[i] = req_addr[i][BANK_SEL_BITS +: BANK_ADDR_WIDTH];
|
||||
assign req_bank_addr[i] = mem_bus_if[i].req_data.addr[BANK_SEL_BITS +: BANK_ADDR_WIDTH];
|
||||
end
|
||||
|
||||
// bank requests dispatch
|
||||
|
@ -102,22 +89,27 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_req_tag;
|
||||
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_req_idx;
|
||||
wire [NUM_BANKS-1:0] per_bank_req_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_all;
|
||||
|
||||
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
|
||||
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||
wire [NUM_REQS-1:0] req_valid_in;
|
||||
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
|
||||
wire [NUM_REQS-1:0] req_ready_in;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_collisions;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign req_valid_in[i] = mem_bus_if[i].req_valid;
|
||||
assign req_data_in[i] = {
|
||||
req_rw[i],
|
||||
mem_bus_if[i].req_data.rw,
|
||||
req_bank_addr[i],
|
||||
req_byteen[i],
|
||||
req_data[i],
|
||||
req_tag[i]};
|
||||
end
|
||||
mem_bus_if[i].req_data.byteen,
|
||||
mem_bus_if[i].req_data.data,
|
||||
mem_bus_if[i].req_data.tag};
|
||||
assign mem_bus_if[i].req_ready = req_ready_in[i];
|
||||
end
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
|
@ -133,12 +125,12 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
`else
|
||||
`UNUSED_PIN (collisions),
|
||||
`endif
|
||||
.valid_in (req_valid),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in),
|
||||
.sel_in (req_bank_idx),
|
||||
.ready_in (req_ready),
|
||||
.ready_in (req_ready_in),
|
||||
.valid_out (per_bank_req_valid),
|
||||
.data_out (req_data_out),
|
||||
.data_out (per_bank_req_data_all),
|
||||
.sel_out (per_bank_req_idx),
|
||||
.ready_out (per_bank_req_ready)
|
||||
);
|
||||
|
@ -149,7 +141,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
per_bank_req_addr[i],
|
||||
per_bank_req_byteen[i],
|
||||
per_bank_req_data[i],
|
||||
per_bank_req_tag[i]} = req_data_out[i];
|
||||
per_bank_req_tag[i]} = per_bank_req_data_all[i];
|
||||
end
|
||||
|
||||
// banks access
|
||||
|
@ -197,13 +189,16 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
|
||||
// bank responses gather
|
||||
|
||||
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] rsp_data_in;
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out;
|
||||
|
||||
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_all;
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
assign rsp_data_in[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
|
||||
assign per_bank_rsp_data_all[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
|
||||
end
|
||||
|
||||
wire [NUM_REQS-1:0] rsp_valid_out;
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out;
|
||||
wire [NUM_REQS-1:0] rsp_ready_out;
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_BANKS),
|
||||
.NUM_OUTPUTS (NUM_REQS),
|
||||
|
@ -215,16 +210,18 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
`UNUSED_PIN (collisions),
|
||||
.sel_in (per_bank_rsp_idx),
|
||||
.valid_in (per_bank_rsp_valid),
|
||||
.data_in (per_bank_rsp_data_all),
|
||||
.ready_in (per_bank_rsp_ready),
|
||||
.data_in (rsp_data_in),
|
||||
.valid_out (rsp_valid_out),
|
||||
.data_out (rsp_data_out),
|
||||
.valid_out (rsp_valid),
|
||||
.ready_out (rsp_ready),
|
||||
.ready_out (rsp_ready_out),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign {rsp_data[i], rsp_tag[i]} = rsp_data_out[i];
|
||||
assign mem_bus_if[i].rsp_valid = rsp_valid_out[i];
|
||||
assign mem_bus_if[i].rsp_data = rsp_data_out[i];
|
||||
assign rsp_ready_out[i] = mem_bus_if[i].rsp_ready;
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
@ -277,8 +274,8 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
if (UUID_WIDTH != 0) begin
|
||||
assign req_uuid[i] = req_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign rsp_uuid[i] = rsp_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign req_uuid[i] = mem_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
assign rsp_uuid[i] = mem_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
|
||||
end else begin
|
||||
assign req_uuid[i] = 0;
|
||||
assign rsp_uuid[i] = 0;
|
||||
|
@ -297,25 +294,27 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
|
|||
assign per_bank_rsp_uuid[i] = 0;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
for (integer i = 0; i < NUM_REQS; ++i) begin
|
||||
if (req_valid[i] && req_ready[i]) begin
|
||||
if (req_rw[i]) begin
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
`TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, req_addr[i], req_tag[i], req_byteen[i], req_data[i], req_uuid[i]));
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i]));
|
||||
end else begin
|
||||
`TRACE(1, ("%d: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, req_addr[i], req_tag[i], req_uuid[i]));
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, req_uuid[i]));
|
||||
end
|
||||
end
|
||||
if (rsp_valid[i] && rsp_ready[i]) begin
|
||||
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
|
||||
`TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%0h (#%0d)\n",
|
||||
$time, INSTANCE_ID, i, rsp_tag[i], rsp_data[i], rsp_uuid[i]));
|
||||
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data[i], rsp_uuid[i]));
|
||||
end
|
||||
end
|
||||
|
||||
for (integer i = 0; i < NUM_BANKS; ++i) begin
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_BANKS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
|
||||
if (per_bank_req_rw[i]) begin
|
||||
`TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
|
||||
|
|
|
@ -13,54 +13,34 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_smem_switch #(
|
||||
module VX_smem_switch import VX_gpu_pkg::*; #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
parameter REQ_OUT_BUF = 0,
|
||||
parameter RSP_OUT_BUF = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
parameter `STRING ARBITER = "R",
|
||||
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire [`UP(LOG_NUM_REQS)-1:0] bus_sel,
|
||||
VX_mem_bus_if.slave bus_in_if,
|
||||
VX_mem_bus_if.master bus_out_if [NUM_REQS]
|
||||
);
|
||||
localparam ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE));
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE);
|
||||
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS);
|
||||
localparam TAG_OUT_WIDTH = TAG_WIDTH - LOG_NUM_REQS;
|
||||
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam RSP_DATAW = TAG_OUT_WIDTH + DATA_WIDTH;
|
||||
localparam ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE));
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE);
|
||||
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;
|
||||
|
||||
// handle requests ////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0] req_valid_out;
|
||||
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||
wire [NUM_REQS-1:0] req_ready_out;
|
||||
|
||||
wire [REQ_DATAW-1:0] req_data_in;
|
||||
wire [TAG_OUT_WIDTH-1:0] req_tag_in;
|
||||
wire [`UP(LOG_NUM_REQS)-1:0] req_sel_in;
|
||||
|
||||
VX_bits_remove #(
|
||||
.N (TAG_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_remove (
|
||||
.data_in (bus_in_if.req_data.tag),
|
||||
.data_out (req_tag_in)
|
||||
);
|
||||
|
||||
if (NUM_REQS > 1) begin
|
||||
assign req_sel_in = bus_in_if.req_data.tag[TAG_SEL_IDX +: LOG_NUM_REQS];
|
||||
end else begin
|
||||
assign req_sel_in = '0;
|
||||
end
|
||||
|
||||
assign req_data_in = {req_tag_in, bus_in_if.req_data.addr, bus_in_if.req_data.rw, bus_in_if.req_data.byteen, bus_in_if.req_data.data};
|
||||
|
||||
VX_stream_switch #(
|
||||
.NUM_OUTPUTS (NUM_REQS),
|
||||
.DATAW (REQ_DATAW),
|
||||
|
@ -68,63 +48,48 @@ module VX_smem_switch #(
|
|||
) req_switch (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (req_sel_in),
|
||||
.valid_in (bus_in_if.req_valid),
|
||||
.sel_in (bus_sel),
|
||||
.valid_in (bus_in_if.req_valid),
|
||||
.data_in (bus_in_if.req_data),
|
||||
.ready_in (bus_in_if.req_ready),
|
||||
.data_in (req_data_in),
|
||||
.data_out (req_data_out),
|
||||
.valid_out (req_valid_out),
|
||||
.data_out (req_data_out),
|
||||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign bus_out_if[i].req_valid = req_valid_out[i];
|
||||
assign {bus_out_if[i].req_data.tag, bus_out_if[i].req_data.addr, bus_out_if[i].req_data.rw, bus_out_if[i].req_data.byteen, bus_out_if[i].req_data.data} = req_data_out[i];
|
||||
assign bus_out_if[i].req_data = req_data_out[i];
|
||||
assign req_ready_out[i] = bus_out_if[i].req_ready;
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
// handle responses ///////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0] rsp_valid_out;
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out;
|
||||
wire [NUM_REQS-1:0] rsp_ready_out;
|
||||
wire [RSP_DATAW-1:0] rsp_data_in;
|
||||
wire [TAG_OUT_WIDTH-1:0] rsp_tag_in;
|
||||
wire [`UP(LOG_NUM_REQS)-1:0] rsp_sel_in;
|
||||
wire [NUM_REQS-1:0] rsp_valid_in;
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_in;
|
||||
wire [NUM_REQS-1:0] rsp_ready_in;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
assign rsp_valid_out[i] = bus_out_if[i].rsp_valid;
|
||||
assign rsp_data_out[i] = {bus_out_if[i].rsp_data.tag, bus_out_if[i].rsp_data.data};
|
||||
assign bus_out_if[i].rsp_ready = rsp_ready_out[i];
|
||||
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
|
||||
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
|
||||
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
|
||||
end
|
||||
|
||||
VX_stream_arb #(
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.DATAW (RSP_DATAW),
|
||||
.DATAW (RSP_DATAW),
|
||||
.ARBITER (ARBITER),
|
||||
.OUT_BUF (RSP_OUT_BUF)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (rsp_valid_out),
|
||||
.ready_in (rsp_ready_out),
|
||||
.data_in (rsp_data_out),
|
||||
.data_out (rsp_data_in),
|
||||
.sel_out (rsp_sel_in),
|
||||
.valid_in (rsp_valid_in),
|
||||
.data_in (rsp_data_in),
|
||||
.ready_in (rsp_ready_in),
|
||||
.valid_out (bus_in_if.rsp_valid),
|
||||
.ready_out (bus_in_if.rsp_ready)
|
||||
.data_out (bus_in_if.rsp_data),
|
||||
.ready_out (bus_in_if.rsp_ready),
|
||||
`UNUSED_PIN (sel_out)
|
||||
);
|
||||
|
||||
VX_bits_insert #(
|
||||
.N (TAG_OUT_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_insert (
|
||||
.data_in (rsp_tag_in),
|
||||
.sel_in (rsp_sel_in),
|
||||
.data_out (bus_in_if.rsp_data.tag)
|
||||
);
|
||||
|
||||
assign {rsp_tag_in, bus_in_if.rsp_data.data} = rsp_data_in;
|
||||
|
||||
endmodule
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue