cache subsystem refactoring

This commit is contained in:
Blaise Tine 2024-02-29 00:08:14 -08:00
parent 59497e52df
commit dd40e9c754
15 changed files with 680 additions and 846 deletions

View file

@ -21,31 +21,31 @@ rm -f blackbox.*.cache
# unittest: run the unit-test make targets.
#   - `make -C tests/unittest run` invokes the `run` target in tests/unittest.
#   - `make -C hw/unittest` builds the default target in hw/unittest with its
#     stdout discarded (stderr is still shown).
# NOTE(review): each command is listed twice in this view; this looks like the
# old and new sides of a rendered diff rather than intentional repetition —
# confirm against the real script.
unittest()
{
make -C tests/unittest run
make -C hw/unittest > /dev/null
make -C tests/unittest run
make -C hw/unittest > /dev/null
}
isa()
{
echo "begin isa tests..."
echo "begin isa tests..."
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
make -C tests/riscv/isa run-simx
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DDPI_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DDPI_DISABLE" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DPI" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-32f
if [ "$XLEN" == "64" ]
then
if [ "$XLEN" == "64" ]
then
make -C sim/rtlsim clean && CONFIGS="-DFPU_FPNEW" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64f
@ -57,214 +57,216 @@ then
make -C sim/rtlsim clean && CONFIGS="-DFPU_DSP" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-rtlsim-64fx
fi
fi
# restore default prebuilt configuration
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
# restore default prebuilt configuration
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
echo "isa tests done!"
echo "isa tests done!"
}
# regression: run the kernel and regression test suites on both drivers
# (run-simx and run-rtlsim targets), then a series of ./ci/blackbox.sh runs of
# the `dogfood` app under different CONFIGS / argument combinations.
# CONFIGS is passed through the environment of each individual command only.
# NOTE(review): every statement appears twice in this view; this looks like the
# old and new sides of a rendered diff — confirm against the real script.
regression()
{
echo "begin regression tests..."
echo "begin regression tests..."
make -C tests/kernel run-simx
make -C tests/kernel run-rtlsim
make -C tests/kernel run-simx
make -C tests/kernel run-rtlsim
make -C tests/regression run-simx
make -C tests/regression run-rtlsim
make -C tests/regression run-simx
make -C tests/regression run-rtlsim
# test FPU hardware implementations
CONFIGS="-DFPU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
# test FPU hardware implementations
CONFIGS="-DFPU_DPI" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_DSP" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t19"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t19"
# test local barrier
./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t19"
./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t19"
# test global barrier
# (these are the only runs here that pass --cores=2 and -DGBAR_ENABLE)
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t20" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t20" --cores=2
# test global barrier
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=simx --app=dogfood --args="-n1 -t20" --cores=2
CONFIGS="-DGBAR_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=dogfood --args="-n1 -t20" --cores=2
# NOTE(review): the "test FPU core" heading below has no commands under it in
# this view.
# test FPU core
# test FPU core
echo "regression tests done!"
echo "regression tests done!"
}
# opencl: run the tests/opencl suite through its run-simx and run-rtlsim make
# targets, bracketed by begin/done banner messages.
# NOTE(review): every statement appears twice in this view; this looks like the
# old and new sides of a rendered diff — confirm against the real script.
opencl()
{
echo "begin opencl tests..."
echo "begin opencl tests..."
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
make -C tests/opencl run-simx
make -C tests/opencl run-rtlsim
echo "opencl tests done!"
echo "opencl tests done!"
}
# cluster: run ./ci/blackbox.sh over a matrix of --cores / --warps / --threads /
# --clusters values, with and without --l2cache / --l3cache, on the rtlsim and
# simx drivers. Most runs use the `diverge` app; one L2+L3 run uses `io_addr`.
# NOTE(review): every group appears twice in this view; this looks like the old
# and new sides of a rendered diff — confirm against the real script.
cluster()
{
echo "begin clustering tests..."
echo "begin clustering tests..."
# warp/threads configurations
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
# warp/threads configurations
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=2 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=diverge
./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=1 --threads=1 --app=diverge
./ci/blackbox.sh --driver=simx --cores=1 --warps=8 --threads=16 --app=diverge
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
# cores clustering
./ci/blackbox.sh --driver=rtlsim --cores=1 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=1 --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --app=diverge --args="-n1"
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
# L2/L3
./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=2 --l2cache --app=diverge --args="-n1"
./ci/blackbox.sh --driver=simx --cores=4 --clusters=4 --l2cache --l3cache --app=diverge --args="-n1"
echo "clustering tests done!"
echo "clustering tests done!"
}
# debug: exercise the trace and debug paths.
#   1. Rebuild sim/simx and sim/rtlsim with DEBUG=3 (rtlsim additionally with
#      CONFIGS="-DGPR_RESET").
#   2. Run the 32im ISA tests on both simulators, capturing stdout to
#      run_simx.log / run_rtlsim.log in the current directory.
#   3. Convert both logs to CSV with ./ci/trace_csv.py and `diff` the results.
#   4. Rebuild both simulators without DEBUG/CONFIGS to restore the defaults.
#   5. Run ./ci/blackbox.sh with --perf=1, --debug=1 and --scope variants.
# NOTE(review): every statement appears twice in this view; this looks like the
# old and new sides of a rendered diff — confirm against the real script.
debug()
{
echo "begin debugging tests..."
echo "begin debugging tests..."
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
# diff exits non-zero when the two CSV traces differ
diff trace_rtlsim.csv trace_simx.csv
# restore default prebuilt configuration
make -C sim/simx clean && make -C sim/simx > /dev/null
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
# test CSV trace generation
make -C sim/simx clean && DEBUG=3 make -C sim/simx > /dev/null
make -C sim/rtlsim clean && DEBUG=3 CONFIGS="-DGPR_RESET" make -C sim/rtlsim > /dev/null
make -C tests/riscv/isa run-simx-32im > run_simx.log
make -C tests/riscv/isa run-rtlsim-32im > run_rtlsim.log
./ci/trace_csv.py -trtlsim run_rtlsim.log -otrace_rtlsim.csv
./ci/trace_csv.py -tsimx run_simx.log -otrace_simx.csv
diff trace_rtlsim.csv trace_simx.csv
# restore default prebuilt configuration
make -C sim/simx clean && make -C sim/simx > /dev/null
make -C sim/rtlsim clean && make -C sim/rtlsim > /dev/null
# blackbox runs with the --perf, --debug and --scope options
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug=1 --app=demo --args="-n1"
./ci/blackbox.sh --driver=opae --cores=1 --scope --app=basic --args="-t0 -n1"
echo "debugging tests done!"
echo "debugging tests done!"
}
# config: run ./ci/blackbox.sh under a range of build-time configurations.
# Each run sets CONFIGS (a string of -D defines) or AXI_BUS in the environment
# of that single command; nothing is exported to later commands.
# Groups are introduced by the existing one-line headings below.
# NOTE(review): most groups appear twice in this view and some pairs differ
# (e.g. the "disable shared memory" and "disable L1 cache" groups); this looks
# like the old and new sides of a rendered diff — confirm against the real
# script before treating any group as the current one.
config()
{
echo "begin configuration tests..."
echo "begin configuration tests..."
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# disable DPI
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
CONFIGS="-DDPI_DISABLE -DFPU_FPNEW" ./ci/blackbox.sh --driver=opae --app=dogfood
# issue width
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
# issue width
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DISSUE_WIDTH=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DISSUE_WIDTH=2" ./ci/blackbox.sh --driver=simx --app=diverge
# dispatch size
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
# dispatch size
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=1 -DNUM_ALU_LANES=1" ./ci/blackbox.sh --driver=simx --app=diverge
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_ALU_LANES=2" ./ci/blackbox.sh --driver=simx --app=diverge
# FPU scaling
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
# FPU scaling
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=2 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DNUM_ALU_BLOCK=4 -DNUM_FPU_LANES=4" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
# custom program startup address
# (dogfood is rebuilt with STARTUP_ADDR set, tested, then cleaned and rebuilt
# without it)
make -C tests/regression/dogfood clean-all
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-all
make -C tests/regression/dogfood
# custom program startup address
make -C tests/regression/dogfood clean-all
STARTUP_ADDR=0x40000000 make -C tests/regression/dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=simx --app=dogfood
CONFIGS="-DSTARTUP_ADDR=0x40000000" ./ci/blackbox.sh --driver=rtlsim --app=dogfood
make -C tests/regression/dogfood clean-all
make -C tests/regression/dogfood
# disabling M extension
CONFIGS="-DEXT_M_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
# disabling M extension
CONFIGS="-DEXT_M_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
# disabling F extension
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf=1
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf=1
# disabling F extension
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf=1
CONFIGS="-DEXT_F_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf=1
# disable shared memory
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf=1
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf=1
# disable shared memory
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo --perf=1
CONFIGS="-DSM_DISABLE" ./ci/blackbox.sh --driver=simx --cores=1 --app=demo --perf=1
# disable L1 cache
CONFIGS="-DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
# disable L1 cache
CONFIGS="-DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DL1_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DICACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
# multiple L1 caches per cluster
CONFIGS="-DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --cores=8 --warps=1 --threads=2
# multiple L1 caches per cluster
CONFIGS="-DNUM_DCACHES=2 -DNUM_ICACHES=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm --cores=8 --warps=1 --threads=2
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
# test AXI bus
AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
# adjust l1 block size to match l2
CONFIGS="-DL1_LINE_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
# reduce l1 line size
CONFIGS="-DL1_LINE_SIZE=16" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
CONFIGS="-DL1_LINE_SIZE=4 -DL1_DISABLE -DSM_DISABLE" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
# test cache banking
CONFIGS="-DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemm
# test cache banking
CONFIGS="-DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --app=sgemm
CONFIGS="-DSMEM_NUM_BANKS=2 -DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=sgemm
CONFIGS="-DDCACHE_NUM_BANKS=2" ./ci/blackbox.sh --driver=simx --cores=1 --app=sgemm
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test 128-bit MEM block
CONFIGS="-DMEM_BLOCK_SIZE=16" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test single-bank DRAM
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_BANKS=1" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
# test 27-bit DRAM address
CONFIGS="-DPLATFORM_PARAM_LOCAL_MEMORY_ADDR_WIDTH=27" ./ci/blackbox.sh --driver=opae --cores=1 --app=demo
echo "configuration tests done!"
echo "configuration tests done!"
}
# stress0: run ./ci/blackbox.sh on the opae driver with
# CONFIGS="-DVERILATOR_RESET_VALUE=1" for the dogfood, io_addr and printf apps;
# the first two use 2 cores x 2 clusters with both --l2cache and --l3cache.
# NOTE(review): every statement appears twice in this view; this looks like the
# old and new sides of a rendered diff — confirm against the real script.
stress0()
{
echo "begin stress0 tests..."
echo "begin stress0 tests..."
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
# test verilator reset values
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=dogfood
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr
CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=opae --app=printf
echo "stress0 tests done!"
echo "stress0 tests done!"
}
# stress1: run the sgemm app on the rtlsim driver with --args="-n128" and
# --l2cache, bracketed by begin/done banner messages.
# NOTE(review): every statement appears twice in this view; this looks like the
# old and new sides of a rendered diff — confirm against the real script.
stress1()
{
echo "begin stress1 tests..."
echo "begin stress1 tests..."
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
./ci/blackbox.sh --driver=rtlsim --app=sgemm --args="-n128" --l2cache
echo "stress1 tests done!"
echo "stress1 tests done!"
}
# synthesis: in hw/syn/yosys, run the `clean` target and then the `elaborate`
# target, both with PREFIX=build_base in the environment; the elaborate step
# also passes CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE".
# NOTE(review): every statement appears twice in this view; this looks like the
# old and new sides of a rendered diff — confirm against the real script.
synthesis()
{
echo "begin synthesis tests..."
echo "begin synthesis tests..."
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
PREFIX=build_base make -C hw/syn/yosys clean
PREFIX=build_base CONFIGS="-DDPI_DISABLE -DEXT_F_DISABLE" make -C hw/syn/yosys elaborate
echo "synthesis tests done!"
echo "synthesis tests done!"
}
show_usage()
@ -277,45 +279,56 @@ start=$SECONDS
# Command-line dispatch: consume the positional arguments one at a time
# (`shift` at the bottom of the loop) and call the test-group function named by
# each flag. --stress runs stress0 then stress1; --all runs every group in
# sequence; -h/--help prints usage and exits; any other argument prints usage
# and exits with status 1.
# NOTE(review): as listed here, several patterns are immediately followed by a
# second copy of the same pattern with no `;;` in between (e.g. `--unittest )`),
# which is not valid shell. This looks like the old one-line form and the new
# multi-line form of each arm interleaved by a rendered diff — confirm against
# the real script.
while [ "$1" != "" ]; do
case $1 in
--unittest ) unittest
--unittest )
unittest
;;
--isa ) isa
;;
--regression ) regression
--regression )
regression
;;
--opencl ) opencl
--opencl )
opencl
;;
--cluster ) cluster
--cluster )
cluster
;;
--debug ) debug
--debug )
debug
;;
--config ) config
--config )
config
;;
--stress0 ) stress0
--stress0 )
stress0
;;
--stress1 ) stress1
--stress1 )
stress1
;;
--stress ) stress0
stress1
--stress )
stress0
stress1
;;
--synthesis ) synthesis
--synthesis )
synthesis
;;
--all ) unittest
isa
regression
opencl
cluster
debug
config
stress0
stress1
synthesis
isa
regression
opencl
cluster
debug
config
stress0
stress1
synthesis
;;
-h | --help ) show_usage
exit
-h | --help )
show_usage
exit
;;
* ) show_usage
exit 1
* ) show_usage
exit 1
esac
shift
done

View file

@ -129,23 +129,15 @@
`endif
`ifndef L1_LINE_SIZE
`ifdef L1_DISABLE
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 4 : `MEM_BLOCK_SIZE)
`else
`define L1_LINE_SIZE ((`L2_ENABLED || `L3_ENABLED) ? 16 : `MEM_BLOCK_SIZE)
`endif
`define L1_LINE_SIZE `MEM_BLOCK_SIZE
`endif
`ifdef L2_ENABLE
`ifndef L2_LINE_SIZE
`define L2_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L2_LINE_SIZE `L1_LINE_SIZE
`endif
`ifdef L3_ENABLE
`ifndef L3_LINE_SIZE
`define L3_LINE_SIZE `MEM_BLOCK_SIZE
`else
`define L3_LINE_SIZE `L2_LINE_SIZE
`endif
`ifdef XLEN_64

View file

@ -243,31 +243,18 @@
///////////////////////////////////////////////////////////////////////////////
// non-cacheable tag bits
`define NC_TAG_BITS 1
// cache address type bits
`ifdef SM_ENABLE
`define CACHE_ADDR_TYPE_BITS (`NC_TAG_BITS + 1)
`else
`define CACHE_ADDR_TYPE_BITS `NC_TAG_BITS
`endif
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
`define ARB_SEL_BITS(I, O) ((I > O) ? `CLOG2((I + O - 1) / O) : 0)
///////////////////////////////////////////////////////////////////////////////
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks) + `NC_TAG_BITS)
(`CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) + `NC_TAG_BITS)
`define CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width) \
`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width))
(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width)) + 1)
///////////////////////////////////////////////////////////////////////////////
@ -278,16 +265,13 @@
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
`define CACHE_CLUSTER_MEM_TAG_WIDTH(mshr_size, num_banks, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), num_caches)
`define CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG((`CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)) + `NC_TAG_BITS), num_caches)
`define CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, tag_width, num_inputs, num_caches) \
`CACHE_CLUSTER_MEM_ARB_TAG(`MAX(`CACHE_MEM_TAG_WIDTH(mshr_size, num_banks), `CACHE_NC_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches))), num_caches)
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches)), num_caches)
///////////////////////////////////////////////////////////////////////////////

View file

@ -99,7 +99,7 @@ package VX_gpu_pkg;
`ifdef ICACHE_ENABLE
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_MEM_TAG_WIDTH(`ICACHE_MSHR_SIZE, 1, `NUM_ICACHES);
`else
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
localparam ICACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(1, ICACHE_LINE_SIZE, ICACHE_WORD_SIZE, ICACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_ICACHES);
`endif
////////////////////////// Dcache Parameters //////////////////////////////
@ -112,31 +112,26 @@ package VX_gpu_pkg;
localparam DCACHE_LINE_SIZE = `L1_LINE_SIZE;
// Input request size
localparam DCACHE_NUM_REQS = `MAX(`DCACHE_NUM_BANKS, `SMEM_NUM_BANKS);
// Memory request size
localparam LSU_MEM_REQS = `NUM_LSU_LANES;
localparam DCACHE_NUM_REQS = `UP((`NUM_LSU_LANES * (`XLEN / 8)) / DCACHE_WORD_SIZE);
// Batch select bits
localparam DCACHE_NUM_BATCHES = ((LSU_MEM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
localparam DCACHE_NUM_BATCHES = ((DCACHE_NUM_REQS + DCACHE_NUM_REQS - 1) / DCACHE_NUM_REQS);
localparam DCACHE_BATCH_SEL_BITS = `CLOG2(DCACHE_NUM_BATCHES);
// Core request tag Id bits
localparam LSUQ_TAG_BITS = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS);
localparam DCACHE_TAG_ID_BITS = (LSUQ_TAG_BITS + `CACHE_ADDR_TYPE_BITS);
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_SIZE) + DCACHE_BATCH_SEL_BITS);
// Core request tag bits
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
localparam DCACHE_NOSM_TAG_WIDTH = (DCACHE_TAG_WIDTH - `SM_ENABLED);
// Memory request data bits
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
// Memory request tag bits
`ifdef DCACHE_ENABLE
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_MEM_TAG_WIDTH(`DCACHE_MSHR_SIZE, `DCACHE_NUM_BANKS, DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`else
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_NC_BYPASS_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_NOSM_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
localparam DCACHE_MEM_TAG_WIDTH = `CACHE_CLUSTER_BYPASS_MEM_TAG_WIDTH(DCACHE_NUM_REQS, DCACHE_LINE_SIZE, DCACHE_WORD_SIZE, DCACHE_TAG_WIDTH, `SOCKET_SIZE, `NUM_DCACHES);
`endif
/////////////////////////////// L1 Parameters /////////////////////////////
@ -165,7 +160,7 @@ package VX_gpu_pkg;
`ifdef L2_ENABLE
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L2_MSHR_SIZE, `L2_NUM_BANKS, L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`else
localparam L2_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
localparam L2_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L2_NUM_REQS, `L2_LINE_SIZE, L2_WORD_SIZE, L2_TAG_WIDTH);
`endif
/////////////////////////////// L3 Parameters /////////////////////////////
@ -186,7 +181,7 @@ package VX_gpu_pkg;
`ifdef L3_ENABLE
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_MEM_TAG_WIDTH(`L3_MSHR_SIZE, `L3_NUM_BANKS, L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`else
localparam L3_MEM_TAG_WIDTH = `CACHE_NC_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
localparam L3_MEM_TAG_WIDTH = `CACHE_BYPASS_TAG_WIDTH(L3_NUM_REQS, `L3_LINE_SIZE, L3_WORD_SIZE, L3_TAG_WIDTH);
`endif
/* verilator lint_on UNUSED */

View file

@ -74,7 +74,7 @@ module VX_socket import VX_gpu_pkg::*; #(
///////////////////////////////////////////////////////////////////////////
VX_mem_bus_if #(
.DATA_SIZE (ICACHE_WORD_SIZE),
.DATA_SIZE (ICACHE_WORD_SIZE),
.TAG_WIDTH (ICACHE_TAG_WIDTH)
) per_core_icache_bus_if[`SOCKET_SIZE]();
@ -120,7 +120,7 @@ module VX_socket import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) per_core_dcache_bus_if[`SOCKET_SIZE * DCACHE_NUM_REQS]();
VX_mem_bus_if #(
@ -134,7 +134,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.INSTANCE_ID ($sformatf("socket%0d-dcache", SOCKET_ID)),
.NUM_UNITS (`NUM_DCACHES),
.NUM_INPUTS (`SOCKET_SIZE),
.TAG_SEL_IDX (1),
.TAG_SEL_IDX (0),
.CACHE_SIZE (`DCACHE_SIZE),
.LINE_SIZE (DCACHE_LINE_SIZE),
.NUM_BANKS (`DCACHE_NUM_BANKS),
@ -145,7 +145,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.MSHR_SIZE (`DCACHE_MSHR_SIZE),
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
.MREQ_SIZE (`DCACHE_MREQ_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.WRITE_ENABLE (1),
.NC_ENABLE (1),
@ -182,7 +182,7 @@ module VX_socket import VX_gpu_pkg::*; #(
.NUM_INPUTS (2),
.DATA_SIZE (`L1_LINE_SIZE),
.TAG_WIDTH (L1_MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.TAG_SEL_IDX (0),
.ARBITER ("R"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)

View file

@ -15,17 +15,17 @@
module VX_cache_bypass #(
parameter NUM_REQS = 1,
parameter NC_TAG_BIT = 0,
parameter TAG_SEL_IDX = 0,
parameter NC_ENABLE = 0,
parameter PASSTHRU = 0,
parameter NC_ENABLE = 0,
parameter WORD_SIZE = 1,
parameter LINE_SIZE = 1,
parameter CORE_ADDR_WIDTH = 1,
parameter CORE_TAG_IN_WIDTH = 1,
parameter CORE_TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = 1,
parameter MEM_TAG_IN_WIDTH = 1,
@ -36,8 +36,7 @@ module VX_cache_bypass #(
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0,
parameter CORE_DATA_WIDTH = WORD_SIZE * 8,
parameter CORE_TAG_OUT_WIDTH = CORE_TAG_IN_WIDTH - NC_ENABLE
parameter CORE_DATA_WIDTH = WORD_SIZE * 8
) (
input wire clk,
input wire reset,
@ -57,38 +56,39 @@ module VX_cache_bypass #(
localparam DIRECT_PASSTHRU = PASSTHRU && (`CS_WORD_SEL_BITS == 0) && (NUM_REQS == 1);
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
localparam MUX_DATAW = CORE_TAG_IN_WIDTH + CORE_DATA_WIDTH + WORD_SIZE + CORE_ADDR_WIDTH + 1;
localparam MUX_DATAW = CORE_TAG_WIDTH + CORE_DATA_WIDTH + WORD_SIZE + CORE_ADDR_WIDTH + 1;
localparam WORDS_PER_LINE = LINE_SIZE / WORD_SIZE;
localparam WSEL_BITS = `CLOG2(WORDS_PER_LINE);
localparam CORE_TAG_ID_BITS = CORE_TAG_IN_WIDTH - UUID_WIDTH;
localparam CORE_TAG_ID_BITS = CORE_TAG_WIDTH - UUID_WIDTH;
localparam MEM_TAG_ID_BITS = REQ_SEL_BITS + WSEL_BITS + CORE_TAG_ID_BITS;
localparam MEM_TAG_BYPASS_BITS = UUID_WIDTH + MEM_TAG_ID_BITS;
localparam MEM_TAG_OUT_NC_WIDTH = MEM_TAG_OUT_WIDTH - 1 + NC_ENABLE;
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
// core request handling
// handle core requests ///////////////////////////////////////////////////
wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_valids;
wire [NUM_REQS-1:0] core_req_nc_idxs;
wire [`UP(REQ_SEL_BITS)-1:0] core_req_nc_idx;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_valid;
wire [NUM_REQS-1:0] core_req_nc_sel;
wire core_req_nc_ready;
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (PASSTHRU != 0) begin
assign core_req_nc_idxs[i] = 1'b1;
end else if (NC_ENABLE) begin
wire [MEM_ADDRW-1:0] block_addr = core_bus_in_if[i].req_data.addr[CORE_ADDR_WIDTH-1 -: MEM_ADDRW];
assign core_req_nc_idxs[i] = (block_addr >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
end else begin
assign core_req_nc_idxs[i] = core_bus_in_if[i].req_data.tag[NC_TAG_BIT];
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_idxs[i] = 1'b0;
end
assign core_req_nc_valids[i] = core_bus_in_if[i].req_valid && core_req_nc_idxs[i];
end
wire core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
VX_generic_arbiter #(
.NUM_REQS (NUM_REQS),
.TYPE (PASSTHRU ? "R" : "P"),
@ -105,69 +105,58 @@ module VX_cache_bypass #(
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_bus_out_if[i].req_valid = core_bus_in_if[i].req_valid && ~core_req_nc_idxs[i];
assign core_bus_out_if[i].req_data.rw = core_bus_in_if[i].req_data.rw;
assign core_bus_out_if[i].req_data.addr = core_bus_in_if[i].req_data.addr;
assign core_bus_out_if[i].req_data.byteen = core_bus_in_if[i].req_data.byteen;
assign core_bus_out_if[i].req_data.data = core_bus_in_if[i].req_data.data;
VX_bits_remove #(
.N (CORE_TAG_IN_WIDTH),
.S (NC_ENABLE),
.POS (NC_TAG_BIT)
) core_req_tag_nc_remove (
.data_in (core_bus_in_if[i].req_data.tag),
.data_out (core_bus_out_if[i].req_data.tag)
);
assign core_bus_out_if[i].req_data = core_bus_in_if[i].req_data;
assign core_bus_in_if[i].req_ready = core_req_nc_valids[i] ? (core_req_nc_ready && core_req_nc_sel[i])
: core_bus_out_if[i].req_ready;
end
// memory request handling
// handle memory requests /////////////////////////////////////////////////
wire mem_req_out_valid;
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire mem_req_out_valid;
wire mem_req_out_rw;
wire [LINE_SIZE-1:0] mem_req_out_byteen;
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_out_addr;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
wire [`CS_LINE_WIDTH-1:0] mem_req_out_data;
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_out_tag;
wire mem_req_out_ready;
wire [CORE_TAG_IN_WIDTH-1:0] core_req_nc_sel_tag;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire core_req_nc_sel_rw;
wire [CORE_TAG_WIDTH-1:0] core_req_nc_sel_tag;
wire [CORE_DATA_WIDTH-1:0] core_req_nc_sel_data;
wire [WORD_SIZE-1:0] core_req_nc_sel_byteen;
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_sel_addr;
wire core_req_nc_sel_rw;
wire [NUM_REQS-1:0][MUX_DATAW-1:0] core_req_nc_mux_in;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_req_nc_mux_in[i] = {
core_bus_in_if[i].req_data.tag,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.rw
core_bus_in_if[i].req_data.rw,
core_bus_in_if[i].req_data.byteen,
core_bus_in_if[i].req_data.addr,
core_bus_in_if[i].req_data.data,
core_bus_in_if[i].req_data.tag
};
end
assign {
core_req_nc_sel_tag,
core_req_nc_sel_data,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_rw
core_req_nc_sel_rw,
core_req_nc_sel_byteen,
core_req_nc_sel_addr,
core_req_nc_sel_data,
core_req_nc_sel_tag
} = core_req_nc_mux_in[core_req_nc_idx];
assign core_req_nc_ready = ~mem_bus_in_if.req_valid && mem_req_out_ready;
assign mem_req_out_valid = mem_bus_in_if.req_valid || core_req_nc_valid;
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
assign mem_req_out_rw = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.rw : core_req_nc_sel_rw;
assign mem_req_out_addr = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.addr : core_req_nc_sel_addr[WSEL_BITS +: MEM_ADDR_WIDTH];
wire [MEM_TAG_ID_BITS-1:0] mem_req_tag_id_bypass;
wire [CORE_TAG_ID_BITS-1:0] core_req_in_id = core_req_nc_sel_tag[CORE_TAG_ID_BITS-1:0];
if (WORDS_PER_LINE > 1) begin
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][WORD_SIZE-1:0] mem_req_byteen_in_r;
reg [WORDS_PER_LINE-1:0][CORE_DATA_WIDTH-1:0] mem_req_data_in_r;
wire [WSEL_BITS-1:0] req_wsel = core_req_nc_sel_addr[WSEL_BITS-1:0];
@ -181,7 +170,7 @@ module VX_cache_bypass #(
end
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : mem_req_byteen_in_r;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : mem_req_data_in_r;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, req_wsel, core_req_in_id});
end else begin
@ -189,7 +178,7 @@ module VX_cache_bypass #(
end
end else begin
assign mem_req_out_byteen = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.byteen : core_req_nc_sel_byteen;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
assign mem_req_out_data = mem_bus_in_if.req_valid ? mem_bus_in_if.req_data.data : core_req_nc_sel_data;
if (NUM_REQS > 1) begin
assign mem_req_tag_id_bypass = MEM_TAG_ID_BITS'({core_req_nc_idx, core_req_in_id});
end else begin
@ -197,7 +186,7 @@ module VX_cache_bypass #(
end
end
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_req_tag_bypass;
wire [MEM_TAG_BYPASS_BITS-1:0] mem_req_tag_bypass;
if (UUID_WIDTH != 0) begin
assign mem_req_tag_bypass = {core_req_nc_sel_tag[CORE_TAG_ID_BITS +: UUID_WIDTH], mem_req_tag_id_bypass};
@ -205,29 +194,24 @@ module VX_cache_bypass #(
assign mem_req_tag_bypass = mem_req_tag_id_bypass;
end
wire [MEM_TAG_OUT_WIDTH-1:0] mem_req_tag_bypass_nc;
wire [(MEM_TAG_IN_WIDTH + 1)-1:0] mem_req_tag_in_nc;
VX_bits_insert #(
.N (MEM_TAG_OUT_NC_WIDTH),
.S (NC_ENABLE ? 0 : 1),
.POS (NC_TAG_BIT)
) mem_req_tag_bypass_nc_insert (
.data_in (mem_req_tag_bypass),
.sel_in (1'b0),
.data_out (mem_req_tag_bypass_nc)
);
VX_bits_insert #(
.N (MEM_TAG_IN_WIDTH),
.POS (NC_TAG_BIT)
) mem_req_tag_in_nc_insert (
.data_in (mem_bus_in_if.req_data.tag),
.sel_in (1'b0),
.data_out (mem_req_tag_in_nc)
);
assign mem_req_out_tag = mem_bus_in_if.req_valid ? MEM_TAG_OUT_WIDTH'(mem_req_tag_in_nc) : mem_req_tag_bypass_nc;
// Select the tag that accompanies the outgoing memory request.
// Three elaboration-time configurations are handled:
//   PASSTHRU  : the bypass tag is used unconditionally.
//   NC_ENABLE : cache-side and bypass tags share the port and are told apart
//               by one extra bit placed at TAG_SEL_IDX.
//   otherwise : the cache-side tag is forwarded unchanged.
if (PASSTHRU != 0) begin
assign mem_req_out_tag = mem_req_tag_bypass;
// the cache-side request tag is not consumed in this configuration
`UNUSED_VAR (mem_bus_in_if.req_data.tag)
end else begin
if (NC_ENABLE) begin
// NOTE(review): VX_bits_insert is defined elsewhere; from its use here it
// presumably inserts S bits of sel_in at bit position POS of data_in,
// widening the (MEM_TAG_OUT_WIDTH-1)-bit input to MEM_TAG_OUT_WIDTH - confirm.
VX_bits_insert #(
.N (MEM_TAG_OUT_WIDTH-1),
.S (1),
.POS (TAG_SEL_IDX)
) mem_req_tag_in_nc_insert (
// a valid cache-side request takes priority over the bypass request;
// both candidates are cast to the same (MEM_TAG_OUT_WIDTH-1)-bit width first
.data_in (mem_bus_in_if.req_valid ? (MEM_TAG_OUT_WIDTH-1)'(mem_bus_in_if.req_data.tag) : (MEM_TAG_OUT_WIDTH-1)'(mem_req_tag_bypass)),
// inserted flag is 1 when the request does NOT come from the cache side;
// the response path tests tag[TAG_SEL_IDX] to recognise bypass responses
.sel_in (~mem_bus_in_if.req_valid),
.data_out (mem_req_out_tag)
);
end else begin
assign mem_req_out_tag = mem_bus_in_if.req_data.tag;
end
end
assign mem_bus_in_if.req_ready = mem_req_out_ready;
@ -246,48 +230,38 @@ module VX_cache_bypass #(
.ready_out (mem_bus_out_if.req_ready)
);
// core response handling
// handle core responses //////////////////////////////////////////////////
wire [NUM_REQS-1:0] core_rsp_in_valid;
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_in_data;
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_in_tag;
wire [NUM_REQS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_in_tag;
wire [NUM_REQS-1:0] core_rsp_in_ready;
wire [NUM_REQS-1:0][CORE_TAG_IN_WIDTH-1:0] core_rsp_tag_in_nc;
wire is_mem_rsp_nc;
if (PASSTHRU != 0) begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid;
end else begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[NC_TAG_BIT];
if (NC_ENABLE) begin
assign is_mem_rsp_nc = mem_bus_out_if.rsp_valid && mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
end else begin
assign is_mem_rsp_nc = 1'b0;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_bits_insert #(
.N (CORE_TAG_OUT_WIDTH),
.S (NC_ENABLE),
.POS (NC_TAG_BIT)
) core_rsp_tag_in_nc_insert (
.data_in (core_bus_out_if[i].rsp_data.tag),
.sel_in ('0),
.data_out (core_rsp_tag_in_nc[i])
);
end
wire [MEM_TAG_OUT_NC_WIDTH-1:0] mem_rsp_tag_in_nc;
wire [(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1:0] mem_rsp_tag_id_nc;
VX_bits_remove #(
.N (MEM_TAG_OUT_WIDTH),
.S (NC_ENABLE ? 0 : 1),
.POS (NC_TAG_BIT)
.S (NC_ENABLE),
.POS (TAG_SEL_IDX)
) mem_rsp_tag_in_nc_remove (
.data_in (mem_bus_out_if.rsp_data.tag),
.data_out (mem_rsp_tag_in_nc)
.data_out (mem_rsp_tag_id_nc)
);
wire [`UP(REQ_SEL_BITS)-1:0] rsp_idx;
if (NUM_REQS > 1) begin
assign rsp_idx = mem_rsp_tag_in_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
assign rsp_idx = mem_rsp_tag_id_nc[(CORE_TAG_ID_BITS + WSEL_BITS) +: REQ_SEL_BITS];
end else begin
assign rsp_idx = 1'b0;
end
@ -304,7 +278,7 @@ module VX_cache_bypass #(
end
if (WORDS_PER_LINE > 1) begin
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_in_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
wire [WSEL_BITS-1:0] rsp_wsel = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS +: WSEL_BITS];
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ?
core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data[rsp_wsel * CORE_DATA_WIDTH +: CORE_DATA_WIDTH];
@ -313,19 +287,28 @@ module VX_cache_bypass #(
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_in_data[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.data : mem_bus_out_if.rsp_data.data;
end
end
end
// Core-side tag rebuilt from a bypass memory response tag (mem_rsp_tag_id_nc).
// Only the lowest CORE_TAG_ID_BITS bits and, when UUID_WIDTH is non-zero, the
// topmost UUID_WIDTH bits are kept. The bits in between are not copied here;
// they are read separately as the word-select and request-index fields
// (rsp_wsel / rsp_idx).
wire [(CORE_TAG_ID_BITS + UUID_WIDTH)-1:0] mem_rsp_tag_in_nc2;
if (UUID_WIDTH != 0) begin
// {uuid taken from the MSB end, core tag id taken from the LSB end}
assign mem_rsp_tag_in_nc2 = {mem_rsp_tag_id_nc[(MEM_TAG_OUT_WIDTH - NC_ENABLE)-1 -: UUID_WIDTH], mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0]};
end else begin
// no uuid field: the tag is just the core tag id
assign mem_rsp_tag_in_nc2 = mem_rsp_tag_id_nc[CORE_TAG_ID_BITS-1:0];
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (UUID_WIDTH != 0) begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_rsp_tag_in_nc[i] : {mem_rsp_tag_in_nc[MEM_TAG_OUT_NC_WIDTH-1 -: UUID_WIDTH], mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0]};
if (PASSTHRU) begin
assign core_rsp_in_tag[i] = mem_rsp_tag_in_nc2;
end else if (NC_ENABLE) begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_bus_out_if[i].rsp_data.tag : mem_rsp_tag_in_nc2;
end else begin
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_valid ? core_rsp_tag_in_nc[i] : mem_rsp_tag_in_nc[CORE_TAG_ID_BITS-1:0];
end
assign core_rsp_in_tag[i] = core_bus_out_if[i].rsp_data.tag;
end
end
for (genvar i = 0; i < NUM_REQS; ++i) begin
VX_elastic_buffer #(
.DATAW (`CS_WORD_WIDTH + CORE_TAG_IN_WIDTH),
.DATAW (`CS_WORD_WIDTH + CORE_TAG_WIDTH),
.SIZE ((!DIRECT_PASSTHRU) ? `TO_OUT_BUF_SIZE(CORE_OUT_BUF) : 0),
.OUT_REG (`TO_OUT_BUF_REG(CORE_OUT_BUF))
) core_rsp_buf (
@ -340,26 +323,23 @@ module VX_cache_bypass #(
);
end
// memory response handling
// handle memory responses ////////////////////////////////////////////////
if (PASSTHRU != 0) begin
assign mem_bus_in_if.rsp_valid = 1'b0;
assign mem_bus_in_if.rsp_data.data = '0;
assign mem_bus_in_if.rsp_data.tag = '0;
end else if (NC_ENABLE) begin
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[TAG_SEL_IDX];
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc[MEM_TAG_IN_WIDTH-1:0];
end else begin
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid && ~mem_bus_out_if.rsp_data.tag[NC_TAG_BIT];
assign mem_bus_in_if.rsp_valid = mem_bus_out_if.rsp_valid;
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
assign mem_bus_in_if.rsp_data.tag = mem_rsp_tag_id_nc;
end
assign mem_bus_in_if.rsp_data.data = mem_bus_out_if.rsp_data.data;
VX_bits_remove #(
.N (MEM_TAG_IN_WIDTH + 1),
.POS (NC_TAG_BIT)
) mem_rsp_tag_out_remove (
.data_in (mem_bus_out_if.rsp_data.tag[(MEM_TAG_IN_WIDTH + 1)-1:0]),
.data_out (mem_bus_in_if.rsp_data.tag)
);
wire [NUM_REQS-1:0] core_rsp_out_valid;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign core_rsp_out_valid[i] = core_bus_out_if[i].rsp_valid;
end

View file

@ -75,8 +75,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
localparam NUM_CACHES = `UP(NUM_UNITS);
localparam PASSTHRU = (NUM_UNITS == 0);
localparam ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES);
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
@ -155,6 +154,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (ARB_TAG_WIDTH),
.TAG_SEL_IDX (TAG_SEL_IDX),
.CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
.NC_ENABLE (NC_ENABLE),
@ -181,7 +181,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
.NUM_INPUTS (NUM_CACHES),
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_IDX (1), // Skip 0 for NC flag
.TAG_SEL_IDX (TAG_SEL_IDX),
.ARBITER ("R"),
.REQ_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0),
.RSP_OUT_BUF ((NUM_CACHES > 1) ? 2 : 0)

View file

@ -16,9 +16,12 @@
module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter TAG_SEL_IDX = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 4096,
// Size of line inside a bank in bytes
@ -49,7 +52,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
parameter TAG_WIDTH = UUID_WIDTH + 1,
// enable bypass for non-cacheable addresses
parameter NC_TAG_BIT = 0,
parameter NC_ENABLE = 0,
// Force bypass for all requests
@ -74,48 +76,46 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
VX_mem_bus_if.master mem_bus_if
);
`STATIC_ASSERT(NUM_BANKS <= NUM_REQS, ("invalid parameter: NUM_BANKS=%d, NUM_REQS=%d", NUM_BANKS, NUM_REQS))
`STATIC_ASSERT(NUM_BANKS == (1 << `CLOG2(NUM_BANKS)), ("invalid parameter"))
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CORE_TAG_X_WIDTH = TAG_WIDTH - NC_ENABLE;
localparam MEM_TAG_X_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam MSHR_ADDR_WIDTH = `LOG2UP(MSHR_SIZE);
localparam CACHE_MEM_TAG_WIDTH = MSHR_ADDR_WIDTH + `CS_BANK_SEL_BITS;
localparam MEM_TAG_WIDTH = PASSTHRU ? `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS));
localparam NC_BYPASS = (NC_ENABLE || PASSTHRU);
localparam NC_OR_BYPASS = (NC_ENABLE || PASSTHRU);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (CORE_TAG_X_WIDTH)
) core_bus_bypass_if[NUM_REQS]();
.TAG_WIDTH (TAG_WIDTH)
) core_bus_cache_if[NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_X_WIDTH)
) mem_bus_bypass_if();
.TAG_WIDTH (CACHE_MEM_TAG_WIDTH)
) mem_bus_cache_if();
if (NC_BYPASS) begin
if (NC_OR_BYPASS) begin
`RESET_RELAY (nc_bypass_reset, reset);
VX_cache_bypass #(
.NUM_REQS (NUM_REQS),
.NC_TAG_BIT (NC_TAG_BIT),
.TAG_SEL_IDX (TAG_SEL_IDX),
.NC_ENABLE (NC_ENABLE),
.PASSTHRU (PASSTHRU),
.NC_ENABLE (PASSTHRU ? 0 : NC_ENABLE),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (LINE_SIZE),
.CORE_ADDR_WIDTH (`CS_WORD_ADDR_WIDTH),
.CORE_TAG_IN_WIDTH (TAG_WIDTH),
.CORE_TAG_WIDTH (TAG_WIDTH),
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
.MEM_TAG_IN_WIDTH (MEM_TAG_X_WIDTH),
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
.MEM_TAG_OUT_WIDTH (MEM_TAG_WIDTH),
.UUID_WIDTH (UUID_WIDTH),
@ -127,68 +127,40 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.reset (nc_bypass_reset),
.core_bus_in_if (core_bus_if),
.core_bus_out_if(core_bus_bypass_if),
.core_bus_out_if(core_bus_cache_if),
.mem_bus_in_if (mem_bus_bypass_if),
.mem_bus_in_if (mem_bus_cache_if),
.mem_bus_out_if (mem_bus_if)
);
end else begin
end else begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (core_bus_bypass_if[i], core_bus_if[i]);
`ASSIGN_VX_MEM_BUS_IF (core_bus_cache_if[i], core_bus_if[i]);
end
assign mem_bus_if.req_valid = mem_bus_bypass_if.req_valid;
assign mem_bus_if.req_data.addr = mem_bus_bypass_if.req_data.addr;
assign mem_bus_if.req_data.rw = mem_bus_bypass_if.req_data.rw;
assign mem_bus_if.req_data.byteen = mem_bus_bypass_if.req_data.byteen;
assign mem_bus_if.req_data.data = mem_bus_bypass_if.req_data.data;
assign mem_bus_bypass_if.req_ready = mem_bus_if.req_ready;
// Add explicit NC=0 flag to the memory request tag
VX_bits_insert #(
.N (MEM_TAG_WIDTH-1),
.POS (NC_TAG_BIT)
) mem_req_tag_insert (
.data_in (mem_bus_bypass_if.req_data.tag),
.sel_in (1'b0),
.data_out (mem_bus_if.req_data.tag)
);
assign mem_bus_bypass_if.rsp_valid = mem_bus_if.rsp_valid;
assign mem_bus_bypass_if.rsp_data.data = mem_bus_if.rsp_data.data;
assign mem_bus_if.rsp_ready = mem_bus_bypass_if.rsp_ready;
// Remove NC flag from the memory response tag
VX_bits_remove #(
.N (MEM_TAG_WIDTH),
.POS (NC_TAG_BIT)
) mem_rsp_tag_remove (
.data_in (mem_bus_if.rsp_data.tag),
.data_out (mem_bus_bypass_if.rsp_data.tag)
);
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if, mem_bus_cache_if);
end
if (PASSTHRU != 0) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
`UNUSED_VAR (core_bus_bypass_if[i].req_valid)
`UNUSED_VAR (core_bus_bypass_if[i].req_data)
assign core_bus_bypass_if[i].req_ready = 0;
`UNUSED_VAR (core_bus_cache_if[i].req_valid)
`UNUSED_VAR (core_bus_cache_if[i].req_data)
assign core_bus_cache_if[i].req_ready = 0;
assign core_bus_bypass_if[i].rsp_valid = 0;
assign core_bus_bypass_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_bypass_if[i].rsp_ready)
assign core_bus_cache_if[i].rsp_valid = 0;
assign core_bus_cache_if[i].rsp_data = '0;
`UNUSED_VAR (core_bus_cache_if[i].rsp_ready)
end
assign mem_bus_bypass_if.req_valid = 0;
assign mem_bus_bypass_if.req_data = '0;
`UNUSED_VAR (mem_bus_bypass_if.req_ready)
assign mem_bus_cache_if.req_valid = 0;
assign mem_bus_cache_if.req_data = '0;
`UNUSED_VAR (mem_bus_cache_if.req_ready)
`UNUSED_VAR (mem_bus_bypass_if.rsp_valid)
`UNUSED_VAR (mem_bus_bypass_if.rsp_data)
assign mem_bus_bypass_if.rsp_ready = 0;
`UNUSED_VAR (mem_bus_cache_if.rsp_valid)
`UNUSED_VAR (mem_bus_cache_if.rsp_data)
assign mem_bus_cache_if.rsp_ready = 0;
`ifdef PERF_ENABLE
assign cache_perf = '0;
@ -212,17 +184,17 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (CORE_TAG_X_WIDTH),
.CORE_OUT_BUF (NC_BYPASS ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (NC_BYPASS ? 1 : MEM_OUT_BUF)
.TAG_WIDTH (TAG_WIDTH),
.CORE_OUT_BUF (NC_OR_BYPASS ? 1 : CORE_OUT_BUF),
.MEM_OUT_BUF (NC_OR_BYPASS ? 1 : MEM_OUT_BUF)
) cache (
.clk (clk),
.reset (cache_reset),
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.core_bus_if (core_bus_bypass_if),
.mem_bus_if (mem_bus_bypass_if)
.core_bus_if (core_bus_cache_if),
.mem_bus_if (mem_bus_cache_if)
);
end
@ -260,7 +232,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
wire [`UP(UUID_WIDTH)-1:0] mem_req_uuid;
wire [`UP(UUID_WIDTH)-1:0] mem_rsp_uuid;
if ((UUID_WIDTH != 0) && (NC_BYPASS != 0)) begin
if ((UUID_WIDTH != 0) && (NC_OR_BYPASS != 0)) begin
assign mem_req_uuid = mem_bus_if.req_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_uuid = mem_bus_if.rsp_data.tag[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin

View file

@ -72,9 +72,9 @@ module VX_core import VX_gpu_pkg::*; #(
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_tmp_if[DCACHE_NUM_REQS]();
) dcache_sm_bus_if[DCACHE_NUM_REQS]();
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_tmp_if();
@ -191,7 +191,7 @@ module VX_core import VX_gpu_pkg::*; #(
.pipeline_perf_if(pipeline_perf_if),
`endif
.dcache_bus_if (dcache_bus_tmp_if),
.dcache_bus_if (dcache_sm_bus_if),
`ifdef EXT_F_ENABLE
.fpu_dispatch_if(fpu_dispatch_if),
@ -246,14 +246,14 @@ module VX_core import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.cache_perf (mem_perf_tmp_if.smem),
`endif
.dcache_bus_in_if (dcache_bus_tmp_if),
.dcache_bus_in_if (dcache_sm_bus_if),
.dcache_bus_out_if (dcache_bus_if)
);
`else
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_bus_tmp_if[i]);
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_if[i], dcache_sm_bus_if[i]);
end
`endif

View file

@ -33,12 +33,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_req_tag,
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
input wire [DCACHE_NUM_REQS-1:0] dcache_rsp_valid,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_rsp_data,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] dcache_rsp_tag,
input wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_rsp_tag,
output wire [DCACHE_NUM_REQS-1:0] dcache_rsp_ready,
output wire icache_req_valid,
@ -92,7 +92,7 @@ module VX_core_top import VX_gpu_pkg::*; #(
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) dcache_bus_if[DCACHE_NUM_REQS]();
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin

View file

@ -30,16 +30,15 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
// outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH]
);
localparam WORD_SIZE = `XLEN / 8;
localparam ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(WORD_SIZE);
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
localparam REQ_ASHIFT = `CLOG2(DCACHE_WORD_SIZE);
localparam CACHE_TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + LSUQ_TAG_BITS;
localparam REQ_ASHIFT = `CLOG2(WORD_SIZE);
VX_execute_if #(
.NUM_LANES (NUM_LANES)
@ -72,23 +71,14 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
`UNUSED_VAR (execute_if[0].data.rs3_data)
`UNUSED_VAR (execute_if[0].data.tid)
`ifdef SM_ENABLE
`STATIC_ASSERT(`IS_DIVISBLE((1 << `SMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter"))
localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT);
localparam SMEM_END_B = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
`endif
// tag_id = wid + PC + tmask + rd + op_type + align + is_dup + pid + pkt_addr
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + NUM_LANES + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + `LSU_DUP_ENABLED + PID_WIDTH + LSUQ_SIZEW;
// tag = uuid + addr_type + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + (NUM_LANES * `CACHE_ADDR_TYPE_BITS) + TAG_ID_WIDTH;
// tag = uuid + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % `MEM_BLOCK_SIZE), ("invalid parameter"))
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type;
// full address calculation
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
@ -113,21 +103,6 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
assign lsu_is_dup = 0;
`endif
// detect address type
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] full_addr_b = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is non-cacheable I/O address
wire is_addr_io = (full_addr_b >= MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT));
`ifdef SM_ENABLE
// is shared memory address
wire is_addr_sm = (full_addr_b >= SMEM_START_B) && (full_addr_b < SMEM_END_B);
assign lsu_addr_type[i] = {is_addr_io, is_addr_sm};
`else
assign lsu_addr_type[i] = is_addr_io;
`endif
end
wire mem_req_empty;
wire st_rsp_ready;
wire lsu_valid, lsu_ready;
@ -145,8 +120,8 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
reg [NUM_LANES-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
reg [NUM_LANES-1:0][WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
@ -202,7 +177,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {DCACHE_WORD_SIZE{1'b1}};
default : mem_req_byteen[i] = {WORD_SIZE{1'b1}};
endcase
end
end
@ -306,7 +281,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
end
assign mem_req_tag = {
execute_if[0].data.uuid, lsu_addr_type, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.tmask, execute_if[0].data.PC, execute_if[0].data.rd, execute_if[0].data.op_type, req_align, execute_if[0].data.pid, pkt_waddr
`ifdef LSU_DUP_ENABLE
, lsu_is_dup
`endif
@ -314,28 +289,27 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
wire [DCACHE_NUM_REQS-1:0] cache_req_valid;
wire [DCACHE_NUM_REQS-1:0] cache_req_rw;
wire [DCACHE_NUM_REQS-1:0][(`XLEN/8)-1:0] cache_req_byteen;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] cache_req_byteen;
wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_req_data;
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_NUM_REQS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_req_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_NUM_REQS-1:0] cache_req_ready;
wire [DCACHE_NUM_REQS-1:0] cache_rsp_valid;
wire [DCACHE_NUM_REQS-1:0][`XLEN-1:0] cache_rsp_data;
wire [DCACHE_NUM_REQS-1:0][CACHE_TAG_WIDTH-1:0] cache_rsp_tag;
wire [DCACHE_NUM_REQS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_rsp_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] cache_rsp_tag;
wire [DCACHE_NUM_REQS-1:0] cache_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched", CORE_ID)),
.CORE_REQS (LSU_MEM_REQS),
.MEM_CHANNELS(DCACHE_NUM_REQS),
.ADDR_WIDTH (DCACHE_ADDR_WIDTH),
.WORD_SIZE (DCACHE_WORD_SIZE),
.CORE_REQS (`NUM_LSU_LANES),
.MEM_CHANNELS(DCACHE_NUM_REQS),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (ADDR_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.QUEUE_SIZE (`LSUQ_SIZE),
.TAG_WIDTH (TAG_WIDTH),
.TAG_ID_WIDTH(TAG_ID_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.RSP_PARTIAL (1),
.MEM_OUT_BUF (2)
@ -349,7 +323,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_data (mem_req_data),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
.core_req_empty (mem_req_empty),
@ -386,63 +360,16 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
assign cache_bus_if[i].req_data.data = cache_req_data[i];
assign cache_bus_if[i].req_data.tag = cache_req_tag[i];
assign cache_req_ready[i] = cache_bus_if[i].req_ready;
assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
assign cache_rsp_tag[i] = cache_bus_if[i].rsp_data.tag;
assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
end
// cache tag formatting: <uuid, tag, type>
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
wire [`UUID_WIDTH-1:0] cache_req_uuid, cache_rsp_uuid;
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type, cache_rsp_type;
wire [`CLOG2(`LSUQ_SIZE)-1:0] cache_req_tag_x, cache_rsp_tag_x;
if (DCACHE_NUM_BATCHES > 1) begin
wire [DCACHE_NUM_BATCHES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_b, cache_rsp_type_b;
wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_bi, cache_rsp_type_bi;
wire [DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
assign {cache_req_uuid, cache_req_type, cache_req_bid, cache_req_tag_x} = cache_req_tag[i];
assign cache_req_type_bi = cache_req_type_b[cache_req_bid];
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_bid, cache_req_tag_x, cache_req_type_bi};
assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag_x, cache_rsp_type_bi} = cache_bus_if[i].rsp_data.tag;
assign cache_rsp_type_b = {DCACHE_NUM_BATCHES{cache_rsp_type_bi}};
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_bid, cache_rsp_tag_x};
for (genvar j = 0; j < DCACHE_NUM_BATCHES; ++j) begin
localparam k = j * DCACHE_NUM_REQS + i;
if (k < NUM_LANES) begin
assign cache_req_type_b[j] = cache_req_type[k];
assign cache_rsp_type[k] = cache_rsp_type_b[j];
end else begin
assign cache_req_type_b[j] = '0;
`UNUSED_VAR (cache_rsp_type_b[j])
end
end
end else begin
assign {cache_req_uuid, cache_req_type, cache_req_tag_x} = cache_req_tag[i];
assign cache_bus_if[i].req_data.tag = {cache_req_uuid, cache_req_tag_x, cache_req_type[i]};
assign {cache_rsp_uuid, cache_rsp_tag_x, cache_rsp_type[i]} = cache_bus_if[i].rsp_data.tag;
assign cache_rsp_tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_tag_x};
for (genvar j = 0; j < DCACHE_NUM_REQS; ++j) begin
if (i != j) begin
`UNUSED_VAR (cache_req_type[j])
assign cache_rsp_type[j] = '0;
end
end
end
end
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [NUM_LANES-1:0][`CACHE_ADDR_TYPE_BITS-1:0] rsp_addr_type;
wire [`NW_WIDTH-1:0] rsp_wid;
wire [NUM_LANES-1:0] rsp_tmask_uq;
wire [`XLEN-1:0] rsp_pc;
@ -457,12 +384,11 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
`endif
assign {
rsp_uuid, rsp_addr_type, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
rsp_uuid, rsp_wid, rsp_tmask_uq, rsp_pc, rsp_rd, rsp_op_type, rsp_align, rsp_pid, pkt_raddr
`ifdef LSU_DUP_ENABLE
, rsp_is_dup
`endif
} = mem_rsp_tag;
`UNUSED_VAR (rsp_addr_type)
`UNUSED_VAR (rsp_op_type)
// load response formatting
@ -626,17 +552,13 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
`TRACE(1, (", data="));
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, data=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, mem_req_data, NUM_LANES);
`TRACE(1, (", is_dup=%b (#%0d)\n", lsu_is_dup, execute_if[0].data.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, NUM_LANES);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, lsu_addr_type, NUM_LANES);
`TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, rd=%0d, is_dup=%b (#%0d)\n", mem_req_tag, mem_req_byteen, execute_if[0].data.rd, lsu_is_dup, execute_if[0].data.uuid));
end
end
if (mem_rsp_fire) begin

View file

@ -26,21 +26,53 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
VX_mem_bus_if.slave dcache_bus_in_if [DCACHE_NUM_REQS],
VX_mem_bus_if.master dcache_bus_out_if [DCACHE_NUM_REQS]
);
`UNUSED_PARAM (CORE_ID)
// Elaboration-time checks on the shared-memory window:
//  - its size (1 << SMEM_LOG_SIZE bytes) must be divisible by MEM_BLOCK_SIZE
//  - its base address must be a multiple of its size
`STATIC_ASSERT(`IS_DIVISBLE((1 << `SMEM_LOG_SIZE), `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(0 == (`SMEM_BASE_ADDR % (1 << `SMEM_LOG_SIZE)), ("invalid parameter"))
// width of an address inside the shared memory, in units of DCACHE_WORD_SIZE bytes
localparam SMEM_ADDR_WIDTH = `SMEM_LOG_SIZE - `CLOG2(DCACHE_WORD_SIZE);
// number of byte-offset bits within one memory block
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
// width of a block-granular address (XLEN-bit byte address without the block offset)
localparam MEM_ADDRW = `XLEN - MEM_ASHIFT;
// block address of the first shared-memory block
localparam SMEM_START_B = MEM_ADDRW'(`XLEN'(`SMEM_BASE_ADDR) >> MEM_ASHIFT);
// block address one past the last shared-memory block (used as an exclusive bound)
localparam SMEM_END_B = MEM_ADDRW'((`XLEN'(`SMEM_BASE_ADDR) + (1 << `SMEM_LOG_SIZE)) >> MEM_ASHIFT);
wire [DCACHE_NUM_REQS-1:0] smem_req_valid;
wire [DCACHE_NUM_REQS-1:0] smem_req_rw;
wire [DCACHE_NUM_REQS-1:0][SMEM_ADDR_WIDTH-1:0] smem_req_addr;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] smem_req_byteen;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_req_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_req_tag;
wire [DCACHE_NUM_REQS-1:0] smem_req_ready;
wire [DCACHE_NUM_REQS-1:0] smem_rsp_valid;
wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] smem_rsp_data;
wire [DCACHE_NUM_REQS-1:0][DCACHE_NOSM_TAG_WIDTH-1:0] smem_rsp_tag;
wire [DCACHE_NUM_REQS-1:0] smem_rsp_ready;
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) smem_bus_if[DCACHE_NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
`RESET_RELAY (switch_reset, reset);
// For each dcache request port, instantiate a two-way switch that routes the
// request either to the dcache or to the local (shared) memory, depending on
// whether the request address falls inside [SMEM_START_B, SMEM_END_B).
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
// block-granular address: the top MEM_ADDRW bits of the request address
// NOTE(review): assumes these bits line up with the block addresses used for
// SMEM_START_B / SMEM_END_B - confirm against DCACHE_ADDR_WIDTH's definition
wire [MEM_ADDRW-1:0] block_addr = dcache_bus_in_if[i].req_data.addr[DCACHE_ADDR_WIDTH-1 -: MEM_ADDRW];
// 1 when the address lies inside the shared-memory window (end bound exclusive)
wire bus_sel = (block_addr >= SMEM_START_B) && (block_addr < SMEM_END_B);
VX_smem_switch #(
.NUM_REQS (2),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.ARBITER ("P"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
) smem_switch (
.clk (clk),
.reset (switch_reset),
// presumably bus_sel is the index of the output bus that receives the request
// (0 = dcache, 1 = local memory, matching the assignments below) - verify in VX_smem_switch
.bus_sel (bus_sel),
.bus_in_if (dcache_bus_in_if[i]),
// each port owns two consecutive entries of switch_out_bus_if
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
);
// output bus[0] goes to the dcache
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2 + 0]);
// output bus[1] goes to the local memory
`ASSIGN_VX_MEM_BUS_IF (smem_bus_if[i], switch_out_bus_if[i * 2 + 1]);
end
`RESET_RELAY (smem_reset, reset);
@ -52,7 +84,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
.WORD_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (SMEM_ADDR_WIDTH),
.UUID_WIDTH (`UUID_WIDTH),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
.TAG_WIDTH (DCACHE_TAG_WIDTH)
) shared_mem (
.clk (clk),
.reset (smem_reset),
@ -60,65 +92,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
// Core request
.req_valid (smem_req_valid),
.req_rw (smem_req_rw),
.req_byteen (smem_req_byteen),
.req_addr (smem_req_addr),
.req_data (smem_req_data),
.req_tag (smem_req_tag),
.req_ready (smem_req_ready),
// Core response
.rsp_valid (smem_rsp_valid),
.rsp_data (smem_rsp_data),
.rsp_tag (smem_rsp_tag),
.rsp_ready (smem_rsp_ready)
.mem_bus_if (smem_bus_if)
);
VX_mem_bus_if #(
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_NOSM_TAG_WIDTH)
) switch_out_bus_if[2 * DCACHE_NUM_REQS]();
`RESET_RELAY (switch_reset, reset);
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
assign smem_req_valid[i] = switch_out_bus_if[i * 2 + 1].req_valid;
assign smem_req_rw[i] = switch_out_bus_if[i * 2 + 1].req_data.rw;
assign smem_req_byteen[i] = switch_out_bus_if[i * 2 + 1].req_data.byteen;
assign smem_req_data[i] = switch_out_bus_if[i * 2 + 1].req_data.data;
assign smem_req_tag[i] = switch_out_bus_if[i * 2 + 1].req_data.tag;
assign switch_out_bus_if[i * 2 + 1].req_ready = smem_req_ready[i];
assign switch_out_bus_if[i * 2 + 1].rsp_valid = smem_rsp_valid[i];
assign switch_out_bus_if[i * 2 + 1].rsp_data.data = smem_rsp_data[i];
assign switch_out_bus_if[i * 2 + 1].rsp_data.tag = smem_rsp_tag[i];
assign smem_rsp_ready[i] = switch_out_bus_if[i * 2 + 1].rsp_ready;
assign smem_req_addr[i] = switch_out_bus_if[i * 2 + 1].req_data.addr[SMEM_ADDR_WIDTH-1:0];
VX_smem_switch #(
.NUM_REQS (2),
.DATA_SIZE (DCACHE_WORD_SIZE),
.TAG_WIDTH (DCACHE_TAG_WIDTH),
.TAG_SEL_IDX (0),
.ARBITER ("P"),
.REQ_OUT_BUF (2),
.RSP_OUT_BUF (2)
) smem_switch (
.clk (clk),
.reset (switch_reset),
.bus_in_if (dcache_bus_in_if[i]),
.bus_out_if (switch_out_bus_if[i * 2 +: 2])
);
end
// this bus goes to the dcache
for (genvar i = 0; i < DCACHE_NUM_REQS; ++i) begin
`ASSIGN_VX_MEM_BUS_IF (dcache_bus_out_if[i], switch_out_bus_if[i * 2]);
end
endmodule

View file

@ -13,30 +13,33 @@
`include "VX_platform.vh"
`TRACING_OFF
//`TRACING_OFF
// VX_mem_scheduler parameter list.
// NOTE(review): this listing is an unmarked diff rendering; the two back-to-back
// runs of user parameters and of derived parameters below look like the
// pre-change and post-change versions of the same list — confirm against the
// checked-in file which run is live.
module VX_mem_scheduler #(
parameter `STRING INSTANCE_ID = "",
parameter CORE_REQS = 1,
parameter MEM_CHANNELS = 1,
parameter ADDR_WIDTH = 32,
parameter WORD_SIZE = 4,
parameter LINE_SIZE = 4,
parameter TAG_WIDTH = 8,
parameter TAG_ID_WIDTH = 8, // lower section of the request tag contains the tag identifier
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
parameter QUEUE_SIZE = 8,
parameter RSP_PARTIAL = 0,
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0,
parameter CORE_REQS = 1,
parameter MEM_CHANNELS = 1,
parameter WORD_SIZE = 4,
parameter LINE_SIZE = 4,
// default address width is 32 bits minus the byte-offset bits of one word
parameter ADDR_WIDTH = 32 - `CLOG2(WORD_SIZE),
parameter UUID_WIDTH = 0, // upper section of the request tag contains the UUID
parameter TAG_WIDTH = 8,
parameter QUEUE_SIZE = 8,
parameter RSP_PARTIAL = 0,
parameter CORE_OUT_BUF = 0,
parameter MEM_OUT_BUF = 0,
// derived values (first run)
parameter WORD_WIDTH = WORD_SIZE * 8,
parameter LINE_WIDTH = LINE_SIZE * 8,
parameter MEM_REQS = (CORE_REQS * WORD_SIZE) / LINE_SIZE,
parameter NUM_BATCHES = (MEM_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
parameter QUEUE_ADDRW = `CLOG2(QUEUE_SIZE),
parameter BATCH_SEL_BITS = `CLOG2(NUM_BATCHES),
parameter MEM_TAG_ID = TAG_WIDTH - TAG_ID_WIDTH,
parameter MEM_TAGW = MEM_TAG_ID + QUEUE_ADDRW + BATCH_SEL_BITS
// derived values (second run)
parameter WORD_WIDTH = WORD_SIZE * 8,
parameter LINE_WIDTH = LINE_SIZE * 8,
// number of words that fit in one memory line
parameter PER_LINE_REQS = LINE_SIZE / WORD_SIZE,
// number of line-sized requests after grouping core requests by line
parameter MERGED_REQS = CORE_REQS / PER_LINE_REQS,
// ceil(MERGED_REQS / MEM_CHANNELS): how many rounds are needed to issue all merged requests
parameter NUM_BATCHES = (MERGED_REQS + MEM_CHANNELS - 1) / MEM_CHANNELS,
parameter QUEUE_ADDRW = `CLOG2(QUEUE_SIZE),
parameter BATCH_SEL_BITS= `CLOG2(NUM_BATCHES),
// with the default below, MEM_TAG_ID evaluates to UUID_WIDTH
parameter TAG_ID_WIDTH = TAG_WIDTH - UUID_WIDTH,
parameter MEM_TAG_ID = TAG_WIDTH - TAG_ID_WIDTH,
// line address drops the word-within-line select bits
parameter MEM_ADDR_WIDTH= ADDR_WIDTH - `CLOG2(PER_LINE_REQS),
// tag carried through the request queue: {MEM_TAG_ID bits, queue slot address}
parameter REQQ_TAG_WIDTH= MEM_TAG_ID + QUEUE_ADDRW,
// tag sent to memory: request-queue tag with the batch index appended
parameter MEM_TAG_WIDTH = REQQ_TAG_WIDTH + BATCH_SEL_BITS
) (
input wire clk,
input wire reset,
@ -66,40 +69,39 @@ module VX_mem_scheduler #(
output wire [MEM_CHANNELS-1:0] mem_req_valid,
output wire [MEM_CHANNELS-1:0] mem_req_rw,
output wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen,
output wire [MEM_CHANNELS-1:0][ADDR_WIDTH-1:0] mem_req_addr,
output wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_CHANNELS-1:0][MEM_TAGW-1:0]mem_req_tag,
output wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_req_tag,
input wire [MEM_CHANNELS-1:0] mem_req_ready,
// Memory response
input wire [MEM_CHANNELS-1:0] mem_rsp_valid,
input wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_CHANNELS-1:0][MEM_TAGW-1:0] mem_rsp_tag,
input wire [MEM_CHANNELS-1:0][MEM_TAG_WIDTH-1:0] mem_rsp_tag,
output wire [MEM_CHANNELS-1:0] mem_rsp_ready
);
localparam REQQ_TAG_WIDTH = MEM_TAG_ID + QUEUE_ADDRW;
localparam BATCH_SEL_WIDTH = `UP(BATCH_SEL_BITS);
localparam STALL_TIMEOUT = 10000000;
`STATIC_ASSERT ((WORD_SIZE == LINE_SIZE), ("invalid parameter"))
`STATIC_ASSERT (`IS_DIVISBLE(CORE_REQS * WORD_SIZE, LINE_SIZE), ("invalid parameter"))
`STATIC_ASSERT ((MEM_TAG_ID >= UUID_WIDTH), ("invalid parameter"))
`STATIC_ASSERT ((0 == RSP_PARTIAL) || (1 == RSP_PARTIAL), ("invalid parameter"))
`RUNTIME_ASSERT ((~core_req_valid || core_req_mask != 0), ("invalid request mask"));
wire [MEM_CHANNELS-1:0] mem_req_valid_s;
wire [MEM_CHANNELS-1:0] mem_req_mask_s;
wire [MEM_CHANNELS-1:0] mem_req_rw_s;
wire mem_req_rw_s;
wire [MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_s;
wire [MEM_CHANNELS-1:0][ADDR_WIDTH-1:0] mem_req_addr_s;
wire [MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_s;
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_s;
wire [MEM_TAGW-1:0] mem_req_tag_s;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_s;
wire [MEM_CHANNELS-1:0] mem_req_ready_s;
wire mem_rsp_valid_s;
wire [MEM_CHANNELS-1:0] mem_rsp_mask_s;
wire [MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_rsp_data_s;
wire [MEM_TAGW-1:0] mem_rsp_tag_s;
wire [MEM_TAG_WIDTH-1:0] mem_rsp_tag_s;
wire mem_rsp_ready_s;
wire mem_rsp_fire_s;
@ -124,11 +126,28 @@ module VX_mem_scheduler #(
wire crsp_valid;
wire [CORE_REQS-1:0] crsp_mask;
wire [CORE_REQS-1:0][WORD_WIDTH-1:0] crsp_data;
wire [TAG_WIDTH-1:0] crsp_tag;
wire [REQQ_TAG_WIDTH-1:0] crsp_tag;
wire crsp_sop;
wire crsp_eop;
wire crsp_ready;
wire reqq_valid_s;
wire [MERGED_REQS-1:0] reqq_mask_s;
wire reqq_rw_s;
wire [MERGED_REQS-1:0][LINE_SIZE-1:0] reqq_byteen_s;
wire [MERGED_REQS-1:0][MEM_ADDR_WIDTH-1:0] reqq_addr_s;
wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] reqq_data_s;
wire [REQQ_TAG_WIDTH-1:0] reqq_tag_s;
wire reqq_ready_s;
wire crsp_valid_s;
wire [MERGED_REQS-1:0] crsp_mask_s;
wire [MERGED_REQS-1:0][LINE_WIDTH-1:0] crsp_data_s;
wire [REQQ_TAG_WIDTH-1:0] crsp_tag_s;
wire crsp_sop_s;
wire crsp_eop_s;
wire crsp_ready_s;
// Request queue //////////////////////////////////////////////////////////
wire req_sent_all;
@ -170,11 +189,9 @@ module VX_mem_scheduler #(
// Index buffer ///////////////////////////////////////////////////////////
wire rsp_complete;
assign ibuf_push = core_req_valid && core_req_ready && ~core_req_rw;
assign ibuf_pop = crsp_valid && crsp_ready && rsp_complete;
assign ibuf_raddr = mem_rsp_tag_s[BATCH_SEL_BITS +: QUEUE_ADDRW];
assign ibuf_pop = crsp_valid && crsp_ready && crsp_eop;
assign ibuf_raddr = crsp_tag[QUEUE_ADDRW-1:0];
assign ibuf_din = core_req_tag[TAG_ID_WIDTH-1:0];
VX_index_buffer #(
@ -195,12 +212,31 @@ module VX_mem_scheduler #(
`UNUSED_VAR (ibuf_empty)
wire [QUEUE_ADDRW-1:0] ibuf_waddr_s = reqq_tag_s[QUEUE_ADDRW-1:0];
wire [QUEUE_ADDRW-1:0] ibuf_raddr_s = crsp_tag_s[QUEUE_ADDRW-1:0];
assign reqq_valid_s = reqq_valid;
assign reqq_mask_s = reqq_mask;
assign reqq_rw_s = reqq_rw;
assign reqq_byteen_s= reqq_byteen;
assign reqq_addr_s = reqq_addr;
assign reqq_data_s = reqq_data;
assign reqq_tag_s = reqq_tag;
assign reqq_ready = reqq_ready_s;
assign crsp_valid = crsp_valid_s;
assign crsp_mask = crsp_mask_s;
assign crsp_data = crsp_data_s;
assign crsp_tag = crsp_tag_s;
assign crsp_sop = crsp_sop_s;
assign crsp_eop = crsp_eop_s;
assign crsp_ready_s = crsp_ready;
// Handle memory requests /////////////////////////////////////////////////
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_mask_b;
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0] mem_req_rw_b;
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_SIZE-1:0] mem_req_byteen_b;
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][ADDR_WIDTH-1:0] mem_req_addr_b;
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][MEM_ADDR_WIDTH-1:0] mem_req_addr_b;
wire [NUM_BATCHES-1:0][MEM_CHANNELS-1:0][LINE_WIDTH-1:0] mem_req_data_b;
wire [BATCH_SEL_WIDTH-1:0] req_batch_idx;
@ -208,15 +244,13 @@ module VX_mem_scheduler #(
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
for (genvar j = 0; j < MEM_CHANNELS; ++j) begin
localparam r = i * MEM_CHANNELS + j;
if (r < CORE_REQS) begin
assign mem_req_mask_b[i][j] = reqq_mask[r];
assign mem_req_rw_b[i][j] = reqq_rw;
assign mem_req_byteen_b[i][j] = reqq_byteen[r];
assign mem_req_addr_b[i][j] = reqq_addr[r];
assign mem_req_data_b[i][j] = reqq_data[r];
if (r < MERGED_REQS) begin
assign mem_req_mask_b[i][j] = reqq_mask_s[r];
assign mem_req_byteen_b[i][j] = reqq_byteen_s[r];
assign mem_req_addr_b[i][j] = reqq_addr_s[r];
assign mem_req_data_b[i][j] = reqq_data_s[r];
end else begin
assign mem_req_mask_b[i][j] = 0;
assign mem_req_rw_b[i][j] = '0;
assign mem_req_byteen_b[i][j] = '0;
assign mem_req_addr_b[i][j] = '0;
assign mem_req_data_b[i][j] = '0;
@ -225,22 +259,20 @@ module VX_mem_scheduler #(
end
assign mem_req_mask_s = mem_req_mask_b[req_batch_idx];
assign mem_req_rw_s = mem_req_rw_b[req_batch_idx];
assign mem_req_rw_s = reqq_rw_s;
assign mem_req_byteen_s = mem_req_byteen_b[req_batch_idx];
assign mem_req_addr_s = mem_req_addr_b[req_batch_idx];
assign mem_req_data_s = mem_req_data_b[req_batch_idx];
reg [MEM_CHANNELS-1:0] batch_sent_mask;
wire [MEM_CHANNELS-1:0] batch_sent_mask_n = batch_sent_mask | mem_req_ready_s;
reg [MEM_CHANNELS-1:0] batch_sent_mask;
wire [MEM_CHANNELS-1:0] batch_sent_mask_n = batch_sent_mask | mem_req_ready_s;
wire batch_sent_all = (mem_req_mask_s & ~batch_sent_mask_n) == 0;
always @(posedge clk) begin
if (reset) begin
batch_sent_mask <= '0;
end else begin
if (reqq_valid) begin
if (reqq_valid_s) begin
if (batch_sent_all) begin
batch_sent_mask <= '0;
end else begin
@ -256,7 +288,7 @@ module VX_mem_scheduler #(
if (reset) begin
req_batch_idx_r <= '0;
end else begin
if (reqq_valid && batch_sent_all) begin
if (reqq_valid_s && batch_sent_all) begin
if (req_sent_all) begin
req_batch_idx_r <= '0;
end else begin
@ -288,22 +320,22 @@ module VX_mem_scheduler #(
assign req_batch_idx = req_batch_idx_r;
assign req_sent_all = batch_sent_all && (req_batch_idx_r == req_batch_idx_last);
assign mem_req_tag_s = {reqq_tag, req_batch_idx};
assign mem_req_tag_s = {reqq_tag_s, req_batch_idx};
end else begin
assign req_batch_idx = '0;
assign req_sent_all = batch_sent_all;
assign mem_req_tag_s = reqq_tag;
assign mem_req_tag_s = reqq_tag_s;
end
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid}} & mem_req_mask_s & ~batch_sent_mask;
assign reqq_ready = req_sent_all;
assign mem_req_valid_s = {MEM_CHANNELS{reqq_valid_s}} & mem_req_mask_s & ~batch_sent_mask;
assign reqq_ready_s = req_sent_all;
for (genvar i = 0; i < MEM_CHANNELS; ++i) begin
VX_elastic_buffer #(
.DATAW (1 + LINE_SIZE + ADDR_WIDTH + LINE_WIDTH + MEM_TAGW),
.DATAW (1 + LINE_SIZE + MEM_ADDR_WIDTH + LINE_WIDTH + MEM_TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(MEM_OUT_BUF)),
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
) mem_req_buf (
@ -311,8 +343,8 @@ module VX_mem_scheduler #(
.reset (reset),
.valid_in (mem_req_valid_s[i]),
.ready_in (mem_req_ready_s[i]),
.data_in ({mem_req_rw_s[i], mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_data_s[i], mem_req_tag_s}),
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_data[i], mem_req_tag[i]}),
.data_in ({mem_req_rw_s, mem_req_byteen_s[i], mem_req_addr_s[i], mem_req_data_s[i], mem_req_tag_s}),
.data_out ({mem_req_rw[i], mem_req_byteen[i], mem_req_addr[i], mem_req_data[i], mem_req_tag[i]}),
.valid_out (mem_req_valid[i]),
.ready_out (mem_req_ready[i])
);
@ -320,16 +352,20 @@ module VX_mem_scheduler #(
// Handle memory responses ////////////////////////////////////////////////
reg [QUEUE_SIZE-1:0][CORE_REQS-1:0] rsp_rem_mask;
wire [CORE_REQS-1:0] rsp_rem_mask_n, curr_mask;
reg [QUEUE_SIZE-1:0] pending_req_valids;
reg [QUEUE_SIZE-1:0][MERGED_REQS-1:0] rsp_rem_mask;
wire [MERGED_REQS-1:0] rsp_rem_mask_n, curr_mask;
wire [BATCH_SEL_WIDTH-1:0] rsp_batch_idx;
wire reqq_fire_s = reqq_valid_s && reqq_ready_s;
wire reqq_rd_start_s = reqq_fire_s && ~reqq_rw_s && ~pending_req_valids[ibuf_waddr_s];
// Select memory response
VX_mem_rsp_sel #(
.NUM_REQS (MEM_CHANNELS),
.DATA_WIDTH (LINE_WIDTH),
.TAG_WIDTH (MEM_TAGW),
.TAG_SEL_BITS (MEM_TAGW - MEM_TAG_ID),
.TAG_WIDTH (MEM_TAG_WIDTH),
.TAG_SEL_BITS (MEM_TAG_WIDTH - MEM_TAG_ID),
.OUT_BUF (2)
) mem_rsp_sel (
.clk (clk),
@ -345,13 +381,15 @@ module VX_mem_scheduler #(
.rsp_ready_out (mem_rsp_ready_s)
);
for (genvar r = 0; r < CORE_REQS; ++r) begin
for (genvar r = 0; r < MERGED_REQS; ++r) begin
localparam i = r / MEM_CHANNELS;
localparam j = r % MEM_CHANNELS;
assign curr_mask[r] = (BATCH_SEL_WIDTH'(i) == rsp_batch_idx) && mem_rsp_mask_s[j];
end
assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr] & ~curr_mask;
assign rsp_rem_mask_n = rsp_rem_mask[ibuf_raddr_s] & ~curr_mask;
wire rsp_complete = ~(| rsp_rem_mask_n);
if (NUM_BATCHES > 1) begin
assign rsp_batch_idx = mem_rsp_tag_s[BATCH_SEL_BITS-1:0];
@ -359,14 +397,22 @@ module VX_mem_scheduler #(
assign rsp_batch_idx = '0;
end
assign rsp_complete = ~(| rsp_rem_mask_n);
// Per-slot response tracking (partial-response path).
// NOTE(review): unmarked diff rendering — the `ibuf_push` / `ibuf_waddr` /
// `ibuf_raddr` lines appear to be the pre-change versions of the `_s` lines
// next to them; confirm against the checked-in file.
always @(posedge clk) begin
if (ibuf_push) begin
rsp_rem_mask[ibuf_waddr] <= core_req_mask;
// pending_req_valids is the only state cleared by reset in this block
if (reset) begin
pending_req_valids <= '0;
end else begin
// mark the slot busy when a read leaves the request queue for a free slot
if (reqq_rd_start_s) begin
pending_req_valids[ibuf_waddr_s] <= 1;
end
// release the slot once the response that empties its remaining mask is accepted
if (mem_rsp_fire_s && rsp_complete) begin
pending_req_valids[ibuf_raddr_s] <= 0;
end
end
// seed the remaining-response mask with the request mask of the starting read
if (reqq_rd_start_s) begin
rsp_rem_mask[ibuf_waddr_s] <= reqq_mask_s;
end
// on every accepted response, store the mask with the just-answered lanes cleared
if (mem_rsp_fire_s) begin
rsp_rem_mask[ibuf_raddr] <= rsp_rem_mask_n;
rsp_rem_mask[ibuf_raddr_s] <= rsp_rem_mask_n;
end
end
@ -377,74 +423,78 @@ module VX_mem_scheduler #(
reg [QUEUE_SIZE-1:0] rsp_sop_r;
always @(posedge clk) begin
if (ibuf_push) begin
rsp_sop_r[ibuf_waddr] <= 1;
if (reqq_rd_start_s) begin
rsp_sop_r[ibuf_waddr_s] <= 1;
end
if (mem_rsp_fire_s) begin
rsp_sop_r[ibuf_raddr] <= 0;
rsp_sop_r[ibuf_raddr_s] <= 0;
end
end
assign mem_rsp_ready_s = crsp_ready;
assign mem_rsp_ready_s = crsp_ready_s;
assign crsp_valid = mem_rsp_valid_s;
assign crsp_valid_s = mem_rsp_valid_s;
assign crsp_mask = curr_mask;
assign crsp_sop = rsp_sop_r[ibuf_raddr];
assign crsp_mask_s = curr_mask;
assign crsp_sop_s = rsp_sop_r[ibuf_raddr_s];
for (genvar r = 0; r < CORE_REQS; ++r) begin
for (genvar r = 0; r < MERGED_REQS; ++r) begin
localparam j = r % MEM_CHANNELS;
assign crsp_data[r] = mem_rsp_data_s[j];
assign crsp_data_s[r] = mem_rsp_data_s[j];
end
end else begin
reg [NUM_BATCHES*MEM_CHANNELS*WORD_WIDTH-1:0] rsp_store [QUEUE_SIZE-1:0];
reg [NUM_BATCHES*MEM_CHANNELS*WORD_WIDTH-1:0] rsp_store_n;
reg [CORE_REQS-1:0] rsp_orig_mask [QUEUE_SIZE-1:0];
reg [NUM_BATCHES*MEM_CHANNELS*LINE_WIDTH-1:0] rsp_store [QUEUE_SIZE-1:0];
reg [NUM_BATCHES*MEM_CHANNELS*LINE_WIDTH-1:0] rsp_store_n;
reg [MERGED_REQS-1:0] rsp_orig_mask [QUEUE_SIZE-1:0];
always @(*) begin
rsp_store_n = rsp_store[ibuf_raddr];
rsp_store_n = rsp_store[ibuf_raddr_s];
for (integer i = 0; i < MEM_CHANNELS; ++i) begin
if ((MEM_CHANNELS == 1) || mem_rsp_mask_s[i]) begin
rsp_store_n[(rsp_batch_idx * MEM_CHANNELS + i) * WORD_WIDTH +: WORD_WIDTH] = mem_rsp_data_s[i];
rsp_store_n[(rsp_batch_idx * MEM_CHANNELS + i) * LINE_WIDTH +: LINE_WIDTH] = mem_rsp_data_s[i];
end
end
end
// Full-response bookkeeping: remember each slot's request mask and accumulate
// returned line data until the whole response can be emitted.
// NOTE(review): unmarked diff rendering — the `ibuf_push` / `ibuf_waddr` /
// `ibuf_raddr` lines appear to be the pre-change versions of the `_s` lines
// next to them and are kept untouched.
always @(posedge clk) begin
if (ibuf_push) begin
rsp_orig_mask[ibuf_waddr] <= core_req_mask;
if (reqq_rd_start_s) begin
// The write is triggered and indexed from the request-queue output
// (reqq_rd_start_s / ibuf_waddr_s), so the stored mask must be the queue-side
// mask as well. core_req_mask is the core-side input and is CORE_REQS wide,
// while rsp_orig_mask entries are MERGED_REQS wide. This matches the
// partial-response branch, which stores reqq_mask_s into rsp_rem_mask.
rsp_orig_mask[ibuf_waddr_s] <= reqq_mask_s;
end
// merge the incoming response data into the slot's accumulated store
if (mem_rsp_valid_s) begin
rsp_store[ibuf_raddr] <= rsp_store_n;
rsp_store[ibuf_raddr_s] <= rsp_store_n;
end
end
assign mem_rsp_ready_s = crsp_ready || ~rsp_complete;
assign mem_rsp_ready_s = crsp_ready_s || ~rsp_complete;
assign crsp_valid = mem_rsp_valid_s && rsp_complete;
assign crsp_valid_s = mem_rsp_valid_s && rsp_complete;
assign crsp_mask = rsp_orig_mask[ibuf_raddr];
assign crsp_sop = 1'b1;
assign crsp_mask_s = rsp_orig_mask[ibuf_raddr_s];
assign crsp_sop_s = 1'b1;
for (genvar r = 0; r < CORE_REQS; ++r) begin
for (genvar r = 0; r < MERGED_REQS; ++r) begin
localparam i = r / MEM_CHANNELS;
localparam j = r % MEM_CHANNELS;
assign crsp_data[r] = rsp_store_n[(i * MEM_CHANNELS + j) * LINE_WIDTH +: WORD_WIDTH];
assign crsp_data_s[r] = rsp_store_n[(i * MEM_CHANNELS + j) * LINE_WIDTH +: LINE_WIDTH];
end
end
if (MEM_TAG_ID != 0) begin
assign crsp_tag = {mem_rsp_tag_s[MEM_TAGW-1 -: MEM_TAG_ID], ibuf_dout};
end else begin
assign crsp_tag = ibuf_dout;
end
assign crsp_tag_s = mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: REQQ_TAG_WIDTH];
assign crsp_eop = ibuf_pop;
assign crsp_eop_s = rsp_complete;
// Send response to caller
// Rebuild the caller-visible tag: the upper MEM_TAG_ID bits travel with the
// memory tag, the lower TAG_ID_WIDTH bits are read back from the index buffer.
wire [TAG_WIDTH-1:0] crsp_tag_2;
if (MEM_TAG_ID != 0) begin
// crsp_tag is REQQ_TAG_WIDTH bits wide (batch index already stripped), so the
// upper field starts at REQQ_TAG_WIDTH-1. Slicing from MEM_TAG_WIDTH-1 would
// run past the vector by BATCH_SEL_BITS whenever NUM_BATCHES > 1.
assign crsp_tag_2 = {crsp_tag[REQQ_TAG_WIDTH-1 -: MEM_TAG_ID], ibuf_dout};
end else begin
assign crsp_tag_2 = ibuf_dout;
end
VX_elastic_buffer #(
.DATAW (CORE_REQS + 1 + 1 + (CORE_REQS * WORD_WIDTH) + TAG_WIDTH),
.SIZE (`TO_OUT_BUF_SIZE(CORE_OUT_BUF)),
@ -454,7 +504,7 @@ module VX_mem_scheduler #(
.reset (reset),
.valid_in (crsp_valid),
.ready_in (crsp_ready),
.data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag}),
.data_in ({crsp_mask, crsp_sop, crsp_eop, crsp_data, crsp_tag_2}),
.data_out ({core_rsp_mask, core_rsp_sop, core_rsp_eop, core_rsp_data, core_rsp_tag}),
.valid_out (core_rsp_valid),
.ready_out (core_rsp_ready)
@ -469,8 +519,8 @@ module VX_mem_scheduler #(
if (UUID_WIDTH != 0) begin
assign req_dbg_uuid = core_req_tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign rsp_dbg_uuid = core_rsp_tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_req_dbg_uuid = reqq_tag[REQQ_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_dbg_uuid = mem_rsp_tag_s[MEM_TAGW-1 -: UUID_WIDTH];
assign mem_req_dbg_uuid = reqq_tag_s[REQQ_TAG_WIDTH-1 -: UUID_WIDTH];
assign mem_rsp_dbg_uuid = mem_rsp_tag_s[MEM_TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_dbg_uuid = '0;
assign rsp_dbg_uuid = '0;
@ -483,30 +533,18 @@ module VX_mem_scheduler #(
`UNUSED_VAR (mem_req_dbg_uuid)
`UNUSED_VAR (mem_rsp_dbg_uuid)
reg [(`UP(UUID_WIDTH) + TAG_ID_WIDTH + 64)-1:0] pending_reqs [QUEUE_SIZE-1:0];
reg [QUEUE_SIZE-1:0] pending_req_valids;
reg [(`UP(UUID_WIDTH) + TAG_ID_WIDTH + 64)-1:0] pending_reqs_time [QUEUE_SIZE-1:0];
// Debug-only stall watchdog: records when each queue slot started a read and
// asserts if a slot stays pending for STALL_TIMEOUT time units or more.
// NOTE(review): unmarked diff rendering — the reset/ibuf_push/ibuf_pop handling
// of pending_req_valids and the `pending_reqs` lines appear to be the
// pre-change versions; confirm against the checked-in file.
always @(posedge clk) begin
if (reset) begin
pending_req_valids <= '0;
end else begin
if (ibuf_push) begin
pending_req_valids[ibuf_waddr] <= 1'b1;
end
if (ibuf_pop) begin
pending_req_valids[ibuf_raddr] <= 1'b0;
end
end
if (ibuf_push) begin
pending_reqs[ibuf_waddr] <= {req_dbg_uuid, ibuf_din, $time};
// record {uuid, tag id, start time} for the slot whose read is starting
// NOTE(review): req_dbg_uuid and ibuf_din are taken from core_req_tag (core
// input side) while the trigger and index come from the request-queue output;
// if the queue adds latency the logged uuid/tag may belong to a different
// request — TODO confirm
if (reqq_rd_start_s) begin
pending_reqs_time[ibuf_waddr_s] <= {req_dbg_uuid, ibuf_din, $time};
end
// check every pending slot each cycle; the low 64 bits hold the start time
for (integer i = 0; i < QUEUE_SIZE; ++i) begin
if (pending_req_valids[i]) begin
`ASSERT(($time - pending_reqs[i][0 +: 64]) < STALL_TIMEOUT,
`ASSERT(($time - pending_reqs_time[i][63:0]) < STALL_TIMEOUT,
("%t: *** %s response timeout: remaining=%b, tag=0x%0h (#%0d)",
$time, INSTANCE_ID, rsp_rem_mask[i], pending_reqs[i][64 +: TAG_ID_WIDTH], pending_reqs[i][64+TAG_ID_WIDTH +: `UP(UUID_WIDTH)]));
$time, INSTANCE_ID, rsp_rem_mask[i], pending_reqs_time[i][64 +: TAG_ID_WIDTH], pending_reqs_time[i][64+TAG_ID_WIDTH +: `UP(UUID_WIDTH)]));
end
end
end
@ -548,15 +586,15 @@ module VX_mem_scheduler #(
`TRACE(1, ("%d: %s-mem-req-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire_s));
`TRACE_ARRAY1D(1, mem_req_addr_s, MEM_CHANNELS);
end
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr, req_batch_idx, mem_req_dbg_uuid));
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_waddr_s, req_batch_idx, mem_req_dbg_uuid));
end
if (mem_rsp_fire_s) begin
`TRACE(1, ("%d: %s-mem-rsp: valid=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s));
`TRACE_ARRAY1D(1, mem_rsp_data_s, MEM_CHANNELS);
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr, rsp_batch_idx, mem_rsp_dbg_uuid));
`TRACE(1, (", ibuf_idx=%0d, batch_idx=%0d (#%0d)\n", ibuf_raddr_s, rsp_batch_idx, mem_rsp_dbg_uuid));
end
end
`endif
endmodule
`TRACING_ON
//`TRACING_ON

View file

@ -43,20 +43,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
output cache_perf_t cache_perf,
`endif
// Core request
input wire [NUM_REQS-1:0] req_valid,
input wire [NUM_REQS-1:0] req_rw,
input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] req_addr,
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] req_byteen,
input wire [NUM_REQS-1:0][WORD_SIZE*8-1:0] req_data,
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] req_tag,
output wire [NUM_REQS-1:0] req_ready,
// Core response
output wire [NUM_REQS-1:0] rsp_valid,
output wire [NUM_REQS-1:0][WORD_SIZE*8-1:0] rsp_data,
output wire [NUM_REQS-1:0][TAG_WIDTH-1:0] rsp_tag,
input wire [NUM_REQS-1:0] rsp_ready
VX_mem_bus_if.slave mem_bus_if [NUM_REQS]
);
`UNUSED_SPARAM (INSTANCE_ID)
`UNUSED_PARAM (UUID_WIDTH)
@ -79,7 +66,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
if (NUM_BANKS > 1) begin
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign req_bank_idx[i] = req_addr[i][0 +: BANK_SEL_BITS];
assign req_bank_idx[i] = mem_bus_if[i].req_data.addr[0 +: BANK_SEL_BITS];
end
end else begin
assign req_bank_idx = 0;
@ -89,7 +76,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
wire [NUM_REQS-1:0][BANK_ADDR_WIDTH-1:0] req_bank_addr;
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign req_bank_addr[i] = req_addr[i][BANK_SEL_BITS +: BANK_ADDR_WIDTH];
assign req_bank_addr[i] = mem_bus_if[i].req_data.addr[BANK_SEL_BITS +: BANK_ADDR_WIDTH];
end
// bank requests dispatch
@ -102,22 +89,27 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_req_tag;
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_req_idx;
wire [NUM_BANKS-1:0] per_bank_req_ready;
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] per_bank_req_data_all;
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
wire [NUM_BANKS-1:0][REQ_DATAW-1:0] req_data_out;
wire [NUM_REQS-1:0] req_valid_in;
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_in;
wire [NUM_REQS-1:0] req_ready_in;
`ifdef PERF_ENABLE
wire [`PERF_CTR_BITS-1:0] perf_collisions;
`endif
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign req_valid_in[i] = mem_bus_if[i].req_valid;
assign req_data_in[i] = {
req_rw[i],
mem_bus_if[i].req_data.rw,
req_bank_addr[i],
req_byteen[i],
req_data[i],
req_tag[i]};
end
mem_bus_if[i].req_data.byteen,
mem_bus_if[i].req_data.data,
mem_bus_if[i].req_data.tag};
assign mem_bus_if[i].req_ready = req_ready_in[i];
end
VX_stream_xbar #(
.NUM_INPUTS (NUM_REQS),
@ -133,12 +125,12 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
`else
`UNUSED_PIN (collisions),
`endif
.valid_in (req_valid),
.valid_in (req_valid_in),
.data_in (req_data_in),
.sel_in (req_bank_idx),
.ready_in (req_ready),
.ready_in (req_ready_in),
.valid_out (per_bank_req_valid),
.data_out (req_data_out),
.data_out (per_bank_req_data_all),
.sel_out (per_bank_req_idx),
.ready_out (per_bank_req_ready)
);
@ -149,7 +141,7 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
per_bank_req_addr[i],
per_bank_req_byteen[i],
per_bank_req_data[i],
per_bank_req_tag[i]} = req_data_out[i];
per_bank_req_tag[i]} = per_bank_req_data_all[i];
end
// banks access
@ -197,13 +189,16 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
// bank responses gather
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] rsp_data_in;
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out;
wire [NUM_BANKS-1:0][RSP_DATAW-1:0] per_bank_rsp_data_all;
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign rsp_data_in[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
assign per_bank_rsp_data_all[i] = {per_bank_rsp_data[i], per_bank_rsp_tag[i]};
end
wire [NUM_REQS-1:0] rsp_valid_out;
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out;
wire [NUM_REQS-1:0] rsp_ready_out;
VX_stream_xbar #(
.NUM_INPUTS (NUM_BANKS),
.NUM_OUTPUTS (NUM_REQS),
@ -215,16 +210,18 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
`UNUSED_PIN (collisions),
.sel_in (per_bank_rsp_idx),
.valid_in (per_bank_rsp_valid),
.data_in (per_bank_rsp_data_all),
.ready_in (per_bank_rsp_ready),
.data_in (rsp_data_in),
.valid_out (rsp_valid_out),
.data_out (rsp_data_out),
.valid_out (rsp_valid),
.ready_out (rsp_ready),
.ready_out (rsp_ready_out),
`UNUSED_PIN (sel_out)
);
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign {rsp_data[i], rsp_tag[i]} = rsp_data_out[i];
assign mem_bus_if[i].rsp_valid = rsp_valid_out[i];
assign mem_bus_if[i].rsp_data = rsp_data_out[i];
assign rsp_ready_out[i] = mem_bus_if[i].rsp_ready;
end
`ifdef PERF_ENABLE
@ -277,8 +274,8 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (UUID_WIDTH != 0) begin
assign req_uuid[i] = req_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
assign rsp_uuid[i] = rsp_tag[i][TAG_WIDTH-1 -: UUID_WIDTH];
assign req_uuid[i] = mem_bus_if[i].req_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
assign rsp_uuid[i] = mem_bus_if[i].rsp_data.tag[TAG_WIDTH-1 -: UUID_WIDTH];
end else begin
assign req_uuid[i] = 0;
assign rsp_uuid[i] = 0;
@ -297,25 +294,27 @@ module VX_shared_mem import VX_gpu_pkg::*; #(
assign per_bank_rsp_uuid[i] = 0;
end
end
// Per-request tracing: logs every accepted core request (write or read) and
// every accepted core response on the mem_bus_if ports.
// NOTE(review): unmarked diff rendering — the lines using req_valid/req_rw/
// req_addr/rsp_valid/rsp_tag and the `integer i` loops appear to be the
// pre-change versions of the mem_bus_if lines next to them and are kept untouched.
always @(posedge clk) begin
for (integer i = 0; i < NUM_REQS; ++i) begin
if (req_valid[i] && req_ready[i]) begin
if (req_rw[i]) begin
// one always block per request port, so interface-array elements are indexed
// with a genvar
for (genvar i = 0; i < NUM_REQS; ++i) begin
always @(posedge clk) begin
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
if (mem_bus_if[i].req_data.rw) begin
`TRACE(1, ("%d: %s wr-req: req_idx=%0d, addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, req_addr[i], req_tag[i], req_byteen[i], req_data[i], req_uuid[i]));
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, mem_bus_if[i].req_data.byteen, mem_bus_if[i].req_data.data, req_uuid[i]));
end else begin
`TRACE(1, ("%d: %s rd-req: req_idx=%0d, addr=0x%0h, tag=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, req_addr[i], req_tag[i], req_uuid[i]));
$time, INSTANCE_ID, i, mem_bus_if[i].req_data.addr, mem_bus_if[i].req_data.tag, req_uuid[i]));
end
end
// The response trace prints the whole data word of port i. The port is already
// selected by mem_bus_if[i], so the data field takes no further [i] index;
// that would print a single bit.
if (rsp_valid[i] && rsp_ready[i]) begin
if (mem_bus_if[i].rsp_valid && mem_bus_if[i].rsp_ready) begin
`TRACE(1, ("%d: %s rd-rsp: req_idx=%0d, tag=0x%0h, data=0x%0h (#%0d)\n",
$time, INSTANCE_ID, i, rsp_tag[i], rsp_data[i], rsp_uuid[i]));
$time, INSTANCE_ID, i, mem_bus_if[i].rsp_data.tag, mem_bus_if[i].rsp_data.data, rsp_uuid[i]));
end
end
for (integer i = 0; i < NUM_BANKS; ++i) begin
end
for (genvar i = 0; i < NUM_BANKS; ++i) begin
always @(posedge clk) begin
if (per_bank_req_valid[i] && per_bank_req_ready[i]) begin
if (per_bank_req_rw[i]) begin
`TRACE(2, ("%d: %s-bank%0d wr-req: addr=0x%0h, tag=0x%0h, byteen=%b, data=0x%0h (#%0d)\n",

View file

@ -13,54 +13,34 @@
`include "VX_define.vh"
// VX_smem_switch: connects one slave memory bus (bus_in_if) to NUM_REQS master
// buses (bus_out_if). Requests go through a VX_stream_switch instance and
// responses come back through a VX_stream_arb instance.
// NOTE(review): this listing is an unmarked diff rendering. Near-duplicate
// neighbouring lines (two module headers, two ARBITER parameters, two sets of
// localparams, duplicated port connections) appear to be the pre-change and
// post-change versions of the same code; the VX_bits_remove / VX_bits_insert
// instances and the TAG_SEL_IDX-based select look like pre-change code —
// confirm against the checked-in file.
module VX_smem_switch #(
module VX_smem_switch import VX_gpu_pkg::*; #(
parameter NUM_REQS = 1,
parameter DATA_SIZE = 1,
parameter TAG_WIDTH = 1,
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
parameter TAG_SEL_IDX = 0,
parameter REQ_OUT_BUF = 0,
parameter RSP_OUT_BUF = 0,
parameter `STRING ARBITER = "R"
parameter `STRING ARBITER = "R",
// number of bits needed to index NUM_REQS outputs; sizes the bus_sel port
parameter LOG_NUM_REQS = `CLOG2(NUM_REQS)
) (
input wire clk,
input wire reset,
// output-bus select for the incoming request; drives sel_in of req_switch
input wire [`UP(LOG_NUM_REQS)-1:0] bus_sel,
VX_mem_bus_if.slave bus_in_if,
VX_mem_bus_if.master bus_out_if [NUM_REQS]
);
// variant that removes LOG_NUM_REQS select bits from the tag (TAG_OUT_WIDTH)
localparam ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE));
localparam DATA_WIDTH = (8 * DATA_SIZE);
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS);
localparam TAG_OUT_WIDTH = TAG_WIDTH - LOG_NUM_REQS;
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
localparam RSP_DATAW = TAG_OUT_WIDTH + DATA_WIDTH;
// variant that forwards the full TAG_WIDTH tag unchanged
localparam ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE));
localparam DATA_WIDTH = (8 * DATA_SIZE);
// request payload width: tag + address + rw bit + byte enables + data
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
// response payload width: tag + data
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;
// handle requests ////////////////////////////////////////////////////////
wire [NUM_REQS-1:0] req_valid_out;
wire [NUM_REQS-1:0][REQ_DATAW-1:0] req_data_out;
wire [NUM_REQS-1:0] req_ready_out;
wire [REQ_DATAW-1:0] req_data_in;
wire [TAG_OUT_WIDTH-1:0] req_tag_in;
wire [`UP(LOG_NUM_REQS)-1:0] req_sel_in;
// tag-select variant: strip LOG_NUM_REQS bits at TAG_SEL_IDX out of the request tag
VX_bits_remove #(
.N (TAG_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_remove (
.data_in (bus_in_if.req_data.tag),
.data_out (req_tag_in)
);
// tag-select variant: the stripped tag bits pick the output bus
if (NUM_REQS > 1) begin
assign req_sel_in = bus_in_if.req_data.tag[TAG_SEL_IDX +: LOG_NUM_REQS];
end else begin
assign req_sel_in = '0;
end
assign req_data_in = {req_tag_in, bus_in_if.req_data.addr, bus_in_if.req_data.rw, bus_in_if.req_data.byteen, bus_in_if.req_data.data};
// request path: one input stream, NUM_REQS output streams
VX_stream_switch #(
.NUM_OUTPUTS (NUM_REQS),
.DATAW (REQ_DATAW),
@ -68,63 +48,48 @@ module VX_smem_switch #(
) req_switch (
.clk (clk),
.reset (reset),
.sel_in (req_sel_in),
.valid_in (bus_in_if.req_valid),
.sel_in (bus_sel),
.valid_in (bus_in_if.req_valid),
.data_in (bus_in_if.req_data),
.ready_in (bus_in_if.req_ready),
.data_in (req_data_in),
.data_out (req_data_out),
.valid_out (req_valid_out),
.data_out (req_data_out),
.ready_out (req_ready_out)
);
// fan the switch outputs out onto the individual master buses
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign bus_out_if[i].req_valid = req_valid_out[i];
assign {bus_out_if[i].req_data.tag, bus_out_if[i].req_data.addr, bus_out_if[i].req_data.rw, bus_out_if[i].req_data.byteen, bus_out_if[i].req_data.data} = req_data_out[i];
assign bus_out_if[i].req_data = req_data_out[i];
assign req_ready_out[i] = bus_out_if[i].req_ready;
end
///////////////////////////////////////////////////////////////////////
// handle responses ///////////////////////////////////////////////////////
wire [NUM_REQS-1:0] rsp_valid_out;
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out;
wire [NUM_REQS-1:0] rsp_ready_out;
wire [RSP_DATAW-1:0] rsp_data_in;
wire [TAG_OUT_WIDTH-1:0] rsp_tag_in;
wire [`UP(LOG_NUM_REQS)-1:0] rsp_sel_in;
wire [NUM_REQS-1:0] rsp_valid_in;
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_in;
wire [NUM_REQS-1:0] rsp_ready_in;
// collect the per-bus response handshakes into packed vectors for the arbiter
for (genvar i = 0; i < NUM_REQS; ++i) begin
assign rsp_valid_out[i] = bus_out_if[i].rsp_valid;
assign rsp_data_out[i] = {bus_out_if[i].rsp_data.tag, bus_out_if[i].rsp_data.data};
assign bus_out_if[i].rsp_ready = rsp_ready_out[i];
assign rsp_valid_in[i] = bus_out_if[i].rsp_valid;
assign rsp_data_in[i] = bus_out_if[i].rsp_data;
assign bus_out_if[i].rsp_ready = rsp_ready_in[i];
end
// response path: NUM_REQS input streams onto the single slave bus,
// using the ARBITER policy string and RSP_OUT_BUF output buffering setting
VX_stream_arb #(
VX_stream_arb #(
.NUM_INPUTS (NUM_REQS),
.DATAW (RSP_DATAW),
.DATAW (RSP_DATAW),
.ARBITER (ARBITER),
.OUT_BUF (RSP_OUT_BUF)
) rsp_arb (
.clk (clk),
.reset (reset),
.valid_in (rsp_valid_out),
.ready_in (rsp_ready_out),
.data_in (rsp_data_out),
.data_out (rsp_data_in),
.sel_out (rsp_sel_in),
.valid_in (rsp_valid_in),
.data_in (rsp_data_in),
.ready_in (rsp_ready_in),
.valid_out (bus_in_if.rsp_valid),
.ready_out (bus_in_if.rsp_ready)
.data_out (bus_in_if.rsp_data),
.ready_out (bus_in_if.rsp_ready),
`UNUSED_PIN (sel_out)
);
// tag-select variant: re-insert the winning input index into the response tag
VX_bits_insert #(
.N (TAG_OUT_WIDTH),
.S (LOG_NUM_REQS),
.POS (TAG_SEL_IDX)
) bits_insert (
.data_in (rsp_tag_in),
.sel_in (rsp_sel_in),
.data_out (bus_in_if.rsp_data.tag)
);
assign {rsp_tag_in, bus_in_if.rsp_data.data} = rsp_data_in;
endmodule